Move AI flyer processing to background BullMQ jobs, using Redis for storage
All checks were successful
Deploy to Web Server flyer-crawler.projectium.com / deploy (push) Successful in 7m9s

This commit is contained in:
2025-12-02 17:15:10 -08:00
parent 12f4a1be64
commit d028511d38
3 changed files with 35 additions and 21 deletions

7
package-lock.json generated
View File

@@ -32,7 +32,6 @@
"passport-google-oauth20": "^2.0.0",
"passport-jwt": "^4.0.1",
"passport-local": "^1.0.0",
"pdf-poppler": "^0.2.3",
"pdfjs-dist": "^5.4.394",
"pg": "^8.16.3",
"react": "^19.2.0",
@@ -13295,12 +13294,6 @@
"resolved": "https://registry.npmjs.org/pause/-/pause-0.0.1.tgz",
"integrity": "sha512-KG8UEiEVkR3wGEb4m5yZkVCzigAD+cVEJck2CzYZO37ZGJfctvVptVO192MwrtPhzONn6go8ylnOdMhKqi4nfg=="
},
"node_modules/pdf-poppler": {
"version": "0.2.3",
"resolved": "https://registry.npmjs.org/pdf-poppler/-/pdf-poppler-0.2.3.tgz",
"integrity": "sha512-nUczP3M/W4c8/3F6il0LmkxkF33qTKQyxeBmUnPbQLxxhtBX42zfpZqnLysomvMdb756qVR7n5kvNr+LzisXQw==",
"license": "ISC"
},
"node_modules/pdfjs-dist": {
"version": "5.4.394",
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.4.394.tgz",

View File

@@ -46,7 +46,6 @@
"passport-google-oauth20": "^2.0.0",
"passport-jwt": "^4.0.1",
"passport-local": "^1.0.0",
"pdf-poppler": "^0.2.3",
"pdfjs-dist": "^5.4.394",
"pg": "^8.16.3",
"react": "^19.2.0",

View File

@@ -3,7 +3,9 @@ import { Queue, Worker, Job } from 'bullmq';
import IORedis from 'ioredis'; // Correctly imported
import path from 'path';
import fs from 'fs/promises';
import Poppler from 'pdf-poppler';
// Use pdfjs-dist for PDF parsing and sharp for image processing.
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
import sharp from 'sharp';
import { logger } from './logger.server';
import * as aiService from './aiService.server';
@@ -92,19 +94,39 @@ export const flyerWorker = new Worker<FlyerJobData>(
const fileExt = path.extname(filePath).toLowerCase();
if (fileExt === '.pdf') {
const poppler = new Poppler();
const outputDir = path.dirname(filePath);
const outputFilePrefix = path.basename(filePath, '.pdf');
await poppler.pdfToCairo(filePath, path.join(outputDir, outputFilePrefix), {
jpegFile: true,
resolution: 150,
});
await job.updateProgress({ message: 'Converting PDF to images...' });
// Load the PDF document using pdfjs-dist
const data = new Uint8Array(await fs.readFile(filePath));
const pdfDocument = await pdfjs.getDocument({ data }).promise;
const files = await fs.readdir(outputDir);
const generatedImages = files.filter(f => f.startsWith(outputFilePrefix) && f.endsWith('.jpg'));
generatedImages.sort(); // Ensure pages are in order
for (const img of generatedImages) {
imagePaths.push({ path: path.join(outputDir, img), mimetype: 'image/jpeg' });
const outputDir = path.dirname(filePath);
for (let i = 1; i <= pdfDocument.numPages; i++) {
const page = await pdfDocument.getPage(i);
const viewport = page.getViewport({ scale: 1.5 }); // ~150 DPI
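// Build stand-in canvas and context objects to hand to pdf.js, so that a page can be rendered on the server without a real canvas.
// NOTE(review): this stub context defines no drawing methods, and getImageData() returns a newly allocated, zero-filled buffer on every call, so it cannot hold rendered pixels. Confirm the generated images are not blank; a real canvas implementation is probably required here.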
const canvas = {
width: viewport.width,
height: viewport.height,
getContext: () => context,
};
const context = {
canvas: canvas, // The context needs a back-reference to its canvas.
getImageData: () => ({ data: new Uint8ClampedArray(viewport.width * viewport.height * 4) }),
};
const renderContext = { canvasContext: context as any, viewport, canvas: canvas as any };
const renderTask = page.render(renderContext);
await renderTask.promise;
const rawPixelData = context.getImageData().data;
const imageFileName = `${path.basename(filePath, '.pdf')}_page_${i}.jpeg`;
const imageOutputPath = path.join(outputDir, imageFileName);
await sharp(rawPixelData, { raw: { width: viewport.width, height: viewport.height, channels: 4 } }).jpeg().toFile(imageOutputPath);
imagePaths.push({ path: imageOutputPath, mimetype: 'image/jpeg' });
}
logger.info(`[Worker] Converted PDF to ${imagePaths.length} images.`);
} else {