one lazy ai

2025-12-02 22:07:46 -08:00
parent 6d6eba777d
commit 8949a4e24b
2 changed files with 24 additions and 62 deletions
--- a/.gitea/workflows/deploy.yml
+++ b/.gitea/workflows/deploy.yml
@@ -283,6 +283,11 @@ jobs:
            exit 1
          fi

+          # Install poppler-utils, which provides the `pdftocairo` command-line tool
+          # required by the background worker for PDF-to-image conversion.
+          echo "Installing system dependency: poppler-utils..."
+          sudo apt-get update && sudo apt-get install -y poppler-utils
+
          echo "Installing production dependencies and restarting server..."
          cd /var/www/flyer-crawler.projectium.com
          npm install --omit=dev # Install only production dependencies
--- a/src/services/queueService.server.ts
+++ b/src/services/queueService.server.ts
@@ -3,9 +3,8 @@ import { Queue, Worker, Job } from 'bullmq';
 import IORedis from 'ioredis'; // Correctly imported
 import path from 'path';
 import fs from 'fs/promises';
-// Use pdfjs-dist for PDF parsing and sharp for image processing.
-import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
-import sharp from 'sharp';
+import { exec } from 'child_process';
+import { promisify } from 'util';

 import { logger } from './logger.server';
 import * as aiService from './aiService.server';
@@ -18,6 +17,8 @@ export const connection = new IORedis(process.env.REDIS_URL || 'redis://127.0.0.
  password: process.env.REDIS_PASSWORD, // Add the password from environment variables
 });

+const execAsync = promisify(exec);
+
 export const flyerQueue = new Queue<FlyerJobData>('flyer-processing', {
  connection,
  defaultJobOptions: {
@@ -95,71 +96,27 @@ export const flyerWorker = new Worker<FlyerJobData>(

      if (fileExt === '.pdf') {
        await job.updateProgress({ message: 'Converting PDF to images...' });
-        
-        // Load the PDF document using pdfjs-dist
-        const data = new Uint8Array(await fs.readFile(filePath));
-        const pdfDocument = await pdfjs.getDocument({ data }).promise;

        const outputDir = path.dirname(filePath);
+        const outputFilePrefix = path.join(outputDir, path.basename(filePath, '.pdf'));

-        for (let i = 1; i <= pdfDocument.numPages; i++) {
-          const page = await pdfDocument.getPage(i);
-          const viewport = page.getViewport({ scale: 1.5 }); // ~150 DPI
+        // Use the pdftocairo command-line tool for robust, server-side PDF conversion.
+        // -jpeg flag outputs JPEG files.
+        // -r 150 sets the resolution to 150 DPI.
+        // The final argument is the output file prefix; pdftocairo appends "-<page number>.jpg" to it for each page.
+        const command = `pdftocairo -jpeg -r 150 "${filePath}" "${outputFilePrefix}"`;
+        await execAsync(command);

-          // Create a fake canvas and context to render the PDF into raw pixel data.
-          // This is a common pattern for using pdf.js on the server. We must provide
-          // stubs for the methods the rendering engine expects to find.
-          const canvasAndContext = {
-            canvas: {
-              width: viewport.width,
-              height: viewport.height,
-              getContext: () => canvasAndContext.context,
-            },
-            context: {
-              canvas: null as any, // This will be set below
-              getImageData: () => ({ data: new Uint8ClampedArray(viewport.width * viewport.height * 4) }), // The most important part for capturing data
-              // Add a comprehensive set of stub methods that pdf.js might call during rendering.
-              // These don't need to do anything; they just need to exist to prevent "is not a function" errors.
-              save: () => {},
-              restore: () => {},
-              getTransform: () => ({ a: 1, b: 0, c: 0, d: 1, e: 0, f: 0 }), // Return a default identity matrix
-              transform: () => {},
-              setTransform: () => {},
-              scale: () => {},
-              rotate: () => {},
-              translate: () => {},
-              beginPath: () => {},
-              moveTo: () => {},
-              lineTo: () => {},
-              closePath: () => {},
-              stroke: () => {},
-              fill: () => {},
-              clip: () => {},
-              fillRect: () => {},
-              strokeRect: () => {},
-              clearRect: () => {},
-              drawImage: () => {},
-              createPattern: () => ({}),
-              createLinearGradient: () => ({ addColorStop: () => {} }),
-              createRadialGradient: () => ({ addColorStop: () => {} }),
-            },
-          };
-          canvasAndContext.context.canvas = canvasAndContext.canvas;
-          
-          const renderContext = {
-            canvasContext: canvasAndContext.context as any,
-            viewport,
-            canvas: canvasAndContext.canvas as any, // Add the canvas object to the render context
-          };
-          const renderTask = page.render(renderContext);
-          await renderTask.promise;
+        // After conversion, find the generated image files.
+        const filesInDir = await fs.readdir(outputDir);
+        const generatedImages = filesInDir
+          .filter(f => f.startsWith(path.basename(outputFilePrefix)) && f.endsWith('.jpg'))
+          .sort(); // Sort to ensure page order.

-          const rawPixelData = canvasAndContext.context.getImageData().data;
-          const imageFileName = `${path.basename(filePath, '.pdf')}_page_${i}.jpeg`;
-          const imageOutputPath = path.join(outputDir, imageFileName);
-          await sharp(rawPixelData, { raw: { width: viewport.width, height: viewport.height, channels: 4 } }).jpeg().toFile(imageOutputPath);
-          imagePaths.push({ path: imageOutputPath, mimetype: 'image/jpeg' });
+        for (const img of generatedImages) {
+          imagePaths.push({ path: path.join(outputDir, img), mimetype: 'image/jpeg' });
        }
+
        logger.info(`[Worker] Converted PDF to ${imagePaths.length} images.`);
      } else {
        imagePaths.push({ path: filePath, mimetype: `image/${fileExt.slice(1)}` });