one lazy ai
Some checks failed
Deploy to Web Server flyer-crawler.projectium.com / deploy (push) Has been cancelled

This commit is contained in:
2025-12-02 22:07:46 -08:00
parent 6d6eba777d
commit 8949a4e24b
2 changed files with 24 additions and 62 deletions

View File

@@ -283,6 +283,11 @@ jobs:
exit 1
fi
# Install poppler-utils, which provides the `pdftocairo` command-line tool
# required by the background worker for PDF-to-image conversion.
echo "Installing system dependency: poppler-utils..."
sudo apt-get update && sudo apt-get install -y poppler-utils
echo "Installing production dependencies and restarting server..."
cd /var/www/flyer-crawler.projectium.com
npm install --omit=dev # Install only production dependencies

View File

@@ -3,9 +3,8 @@ import { Queue, Worker, Job } from 'bullmq';
import IORedis from 'ioredis'; // Correctly imported
import path from 'path';
import fs from 'fs/promises';
// Use pdfjs-dist for PDF parsing and sharp for image processing.
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
import sharp from 'sharp';
import { exec } from 'child_process';
import { promisify } from 'util';
import { logger } from './logger.server';
import * as aiService from './aiService.server';
@@ -18,6 +17,8 @@ export const connection = new IORedis(process.env.REDIS_URL || 'redis://127.0.0.
password: process.env.REDIS_PASSWORD, // Add the password from environment variables
});
// Promise-returning wrapper around child_process.exec so shell commands can be awaited.
const execAsync = promisify(exec);
export const flyerQueue = new Queue<FlyerJobData>('flyer-processing', {
connection,
defaultJobOptions: {
@@ -95,71 +96,27 @@ export const flyerWorker = new Worker<FlyerJobData>(
if (fileExt === '.pdf') {
await job.updateProgress({ message: 'Converting PDF to images...' });
// Load the PDF document using pdfjs-dist
const data = new Uint8Array(await fs.readFile(filePath));
const pdfDocument = await pdfjs.getDocument({ data }).promise;
const outputDir = path.dirname(filePath);
const outputFilePrefix = path.join(outputDir, path.basename(filePath, '.pdf'));
for (let i = 1; i <= pdfDocument.numPages; i++) {
const page = await pdfDocument.getPage(i);
const viewport = page.getViewport({ scale: 1.5 }); // ~150 DPI
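// Use the pdftocairo command-line tool (from poppler-utils) for server-side PDF conversion.
// -jpeg outputs JPEG files.
// -r 150 sets the resolution to 150 DPI.
// The final argument is the output file prefix; pdftocairo appends a page-number suffix
// and the `.jpg` extension for each page it writes.
// NOTE(review): the command is run through a shell with filePath interpolated inside double
// quotes — confirm filePath can never contain shell metacharacters, or pass arguments as an array.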
const command = `pdftocairo -jpeg -r 150 "${filePath}" "${outputFilePrefix}"`;
await execAsync(command);
// Create a fake canvas and context to render the PDF into raw pixel data.
// This is a common pattern for using pdf.js on the server. We must provide
// stubs for the methods the rendering engine expects to find.
const canvasAndContext = {
canvas: {
width: viewport.width,
height: viewport.height,
getContext: () => canvasAndContext.context,
},
context: {
canvas: null as any, // This will be set below
getImageData: () => ({ data: new Uint8ClampedArray(viewport.width * viewport.height * 4) }), // The most important part for capturing data
// Add a comprehensive set of stub methods that pdf.js might call during rendering.
// These don't need to do anything; they just need to exist to prevent "is not a function" errors.
save: () => {},
restore: () => {},
getTransform: () => ({ a: 1, b: 0, c: 0, d: 1, e: 0, f: 0 }), // Return a default identity matrix
transform: () => {},
setTransform: () => {},
scale: () => {},
rotate: () => {},
translate: () => {},
beginPath: () => {},
moveTo: () => {},
lineTo: () => {},
closePath: () => {},
stroke: () => {},
fill: () => {},
clip: () => {},
fillRect: () => {},
strokeRect: () => {},
clearRect: () => {},
drawImage: () => {},
createPattern: () => ({}),
createLinearGradient: () => ({ addColorStop: () => {} }),
createRadialGradient: () => ({ addColorStop: () => {} }),
},
};
canvasAndContext.context.canvas = canvasAndContext.canvas;
const renderContext = {
canvasContext: canvasAndContext.context as any,
viewport,
canvas: canvasAndContext.canvas as any, // Add the canvas object to the render context
};
const renderTask = page.render(renderContext);
await renderTask.promise;
// After conversion, find the generated image files.
const filesInDir = await fs.readdir(outputDir);
const generatedImages = filesInDir
.filter(f => f.startsWith(path.basename(outputFilePrefix)) && f.endsWith('.jpg'))
.sort(); // Sort to ensure page order.
const rawPixelData = canvasAndContext.context.getImageData().data;
const imageFileName = `${path.basename(filePath, '.pdf')}_page_${i}.jpeg`;
const imageOutputPath = path.join(outputDir, imageFileName);
await sharp(rawPixelData, { raw: { width: viewport.width, height: viewport.height, channels: 4 } }).jpeg().toFile(imageOutputPath);
imagePaths.push({ path: imageOutputPath, mimetype: 'image/jpeg' });
for (const img of generatedImages) {
imagePaths.push({ path: path.join(outputDir, img), mimetype: 'image/jpeg' });
}
logger.info(`[Worker] Converted PDF to ${imagePaths.length} images.`);
} else {
imagePaths.push({ path: filePath, mimetype: `image/${fileExt.slice(1)}` });