Move AI flyer processing to background BullMQ jobs, using Redis for storage
All checks were successful
Deploy to Web Server flyer-crawler.projectium.com / deploy (push) Successful in 7m9s
All checks were successful
Deploy to Web Server flyer-crawler.projectium.com / deploy (push) Successful in 7m9s
This commit is contained in:
@@ -3,7 +3,9 @@ import { Queue, Worker, Job } from 'bullmq';
|
||||
import IORedis from 'ioredis'; // Correctly imported
|
||||
import path from 'path';
|
||||
import fs from 'fs/promises';
|
||||
import Poppler from 'pdf-poppler';
|
||||
// Use pdfjs-dist for PDF parsing and sharp for image processing.
|
||||
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
|
||||
import sharp from 'sharp';
|
||||
|
||||
import { logger } from './logger.server';
|
||||
import * as aiService from './aiService.server';
|
||||
@@ -92,19 +94,39 @@ export const flyerWorker = new Worker<FlyerJobData>(
|
||||
const fileExt = path.extname(filePath).toLowerCase();
|
||||
|
||||
if (fileExt === '.pdf') {
|
||||
const poppler = new Poppler();
|
||||
const outputDir = path.dirname(filePath);
|
||||
const outputFilePrefix = path.basename(filePath, '.pdf');
|
||||
await poppler.pdfToCairo(filePath, path.join(outputDir, outputFilePrefix), {
|
||||
jpegFile: true,
|
||||
resolution: 150,
|
||||
});
|
||||
await job.updateProgress({ message: 'Converting PDF to images...' });
|
||||
|
||||
// Load the PDF document using pdfjs-dist
|
||||
const data = new Uint8Array(await fs.readFile(filePath));
|
||||
const pdfDocument = await pdfjs.getDocument({ data }).promise;
|
||||
|
||||
const files = await fs.readdir(outputDir);
|
||||
const generatedImages = files.filter(f => f.startsWith(outputFilePrefix) && f.endsWith('.jpg'));
|
||||
generatedImages.sort(); // Ensure pages are in order
|
||||
for (const img of generatedImages) {
|
||||
imagePaths.push({ path: path.join(outputDir, img), mimetype: 'image/jpeg' });
|
||||
const outputDir = path.dirname(filePath);
|
||||
|
||||
for (let i = 1; i <= pdfDocument.numPages; i++) {
|
||||
const page = await pdfDocument.getPage(i);
|
||||
const viewport = page.getViewport({ scale: 1.5 }); // ~150 DPI
|
||||
|
||||
// Create a fake canvas and context to render the PDF into raw pixel data.
|
||||
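// It is intended to let pdf.js run on the server without a real canvas.
// NOTE(review): this stub defines no drawing operations, and getImageData() below allocates a
// fresh zero-filled buffer on every call, so the pixels read back after render() cannot contain
// the rendered page. Confirm the output images are not blank; a real canvas implementation is
// probably needed here.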
|
||||
const canvas = {
|
||||
width: viewport.width,
|
||||
height: viewport.height,
|
||||
getContext: () => context,
|
||||
};
|
||||
const context = {
|
||||
canvas: canvas, // The context needs a back-reference to its canvas.
|
||||
getImageData: () => ({ data: new Uint8ClampedArray(viewport.width * viewport.height * 4) }),
|
||||
};
|
||||
|
||||
const renderContext = { canvasContext: context as any, viewport, canvas: canvas as any };
|
||||
const renderTask = page.render(renderContext);
|
||||
await renderTask.promise;
|
||||
|
||||
const rawPixelData = context.getImageData().data;
|
||||
const imageFileName = `${path.basename(filePath, '.pdf')}_page_${i}.jpeg`;
|
||||
const imageOutputPath = path.join(outputDir, imageFileName);
|
||||
await sharp(rawPixelData, { raw: { width: viewport.width, height: viewport.height, channels: 4 } }).jpeg().toFile(imageOutputPath);
|
||||
imagePaths.push({ path: imageOutputPath, mimetype: 'image/jpeg' });
|
||||
}
|
||||
logger.info(`[Worker] Converted PDF to ${imagePaths.length} images.`);
|
||||
} else {
|
||||
|
||||
Reference in New Issue
Block a user