one lazy ai
Some checks failed
Deploy to Web Server flyer-crawler.projectium.com / deploy (push) Has been cancelled
Some checks failed
Deploy to Web Server flyer-crawler.projectium.com / deploy (push) Has been cancelled
This commit is contained in:
@@ -283,6 +283,11 @@ jobs:
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Install poppler-utils, which provides the `pdftocairo` command-line tool
|
||||
# required by the background worker for PDF-to-image conversion.
|
||||
echo "Installing system dependency: poppler-utils..."
|
||||
sudo apt-get update && sudo apt-get install -y poppler-utils
|
||||
|
||||
echo "Installing production dependencies and restarting server..."
|
||||
cd /var/www/flyer-crawler.projectium.com
|
||||
npm install --omit=dev # Install only production dependencies
|
||||
|
||||
@@ -3,9 +3,8 @@ import { Queue, Worker, Job } from 'bullmq';
|
||||
import IORedis from 'ioredis'; // Correctly imported
|
||||
import path from 'path';
|
||||
import fs from 'fs/promises';
|
||||
// Use pdfjs-dist for PDF parsing and sharp for image processing.
|
||||
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
|
||||
import sharp from 'sharp';
|
||||
import { exec } from 'child_process';
|
||||
import { promisify } from 'util';
|
||||
|
||||
import { logger } from './logger.server';
|
||||
import * as aiService from './aiService.server';
|
||||
@@ -18,6 +17,8 @@ export const connection = new IORedis(process.env.REDIS_URL || 'redis://127.0.0.
|
||||
password: process.env.REDIS_PASSWORD, // Add the password from environment variables
|
||||
});
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
|
||||
export const flyerQueue = new Queue<FlyerJobData>('flyer-processing', {
|
||||
connection,
|
||||
defaultJobOptions: {
|
||||
@@ -95,71 +96,27 @@ export const flyerWorker = new Worker<FlyerJobData>(
|
||||
|
||||
if (fileExt === '.pdf') {
|
||||
await job.updateProgress({ message: 'Converting PDF to images...' });
|
||||
|
||||
// Load the PDF document using pdfjs-dist
|
||||
const data = new Uint8Array(await fs.readFile(filePath));
|
||||
const pdfDocument = await pdfjs.getDocument({ data }).promise;
|
||||
|
||||
const outputDir = path.dirname(filePath);
|
||||
const outputFilePrefix = path.join(outputDir, path.basename(filePath, '.pdf'));
|
||||
|
||||
for (let i = 1; i <= pdfDocument.numPages; i++) {
|
||||
const page = await pdfDocument.getPage(i);
|
||||
const viewport = page.getViewport({ scale: 1.5 }); // ~150 DPI
|
||||
// Use the pdftocairo command-line tool for robust, server-side PDF conversion.
|
||||
// -jpeg flag outputs JPEG files.
|
||||
// -r 150 sets the resolution to 150 DPI.
|
||||
// The final argument is the output file prefix. For each page, pdftocairo appends "-<page number>.jpg" to it, zero-padding the page number to the width of the last page number.
|
||||
const command = `pdftocairo -jpeg -r 150 "${filePath}" "${outputFilePrefix}"`;
|
||||
await execAsync(command);
|
||||
|
||||
// Create a fake canvas and context to render the PDF into raw pixel data.
|
||||
// This is a common pattern for using pdf.js on the server. We must provide
|
||||
// stubs for the methods the rendering engine expects to find.
|
||||
const canvasAndContext = {
|
||||
canvas: {
|
||||
width: viewport.width,
|
||||
height: viewport.height,
|
||||
getContext: () => canvasAndContext.context,
|
||||
},
|
||||
context: {
|
||||
canvas: null as any, // This will be set below
|
||||
getImageData: () => ({ data: new Uint8ClampedArray(viewport.width * viewport.height * 4) }), // The most important part for capturing data
|
||||
// Add a comprehensive set of stub methods that pdf.js might call during rendering.
|
||||
// These don't need to do anything; they just need to exist to prevent "is not a function" errors.
|
||||
save: () => {},
|
||||
restore: () => {},
|
||||
getTransform: () => ({ a: 1, b: 0, c: 0, d: 1, e: 0, f: 0 }), // Return a default identity matrix
|
||||
transform: () => {},
|
||||
setTransform: () => {},
|
||||
scale: () => {},
|
||||
rotate: () => {},
|
||||
translate: () => {},
|
||||
beginPath: () => {},
|
||||
moveTo: () => {},
|
||||
lineTo: () => {},
|
||||
closePath: () => {},
|
||||
stroke: () => {},
|
||||
fill: () => {},
|
||||
clip: () => {},
|
||||
fillRect: () => {},
|
||||
strokeRect: () => {},
|
||||
clearRect: () => {},
|
||||
drawImage: () => {},
|
||||
createPattern: () => ({}),
|
||||
createLinearGradient: () => ({ addColorStop: () => {} }),
|
||||
createRadialGradient: () => ({ addColorStop: () => {} }),
|
||||
},
|
||||
};
|
||||
canvasAndContext.context.canvas = canvasAndContext.canvas;
|
||||
|
||||
const renderContext = {
|
||||
canvasContext: canvasAndContext.context as any,
|
||||
viewport,
|
||||
canvas: canvasAndContext.canvas as any, // Add the canvas object to the render context
|
||||
};
|
||||
const renderTask = page.render(renderContext);
|
||||
await renderTask.promise;
|
||||
// After conversion, find the generated image files.
|
||||
const filesInDir = await fs.readdir(outputDir);
|
||||
const generatedImages = filesInDir
|
||||
.filter(f => f.startsWith(path.basename(outputFilePrefix)) && f.endsWith('.jpg'))
|
||||
.sort(); // Sort to ensure page order.
|
||||
|
||||
const rawPixelData = canvasAndContext.context.getImageData().data;
|
||||
const imageFileName = `${path.basename(filePath, '.pdf')}_page_${i}.jpeg`;
|
||||
const imageOutputPath = path.join(outputDir, imageFileName);
|
||||
await sharp(rawPixelData, { raw: { width: viewport.width, height: viewport.height, channels: 4 } }).jpeg().toFile(imageOutputPath);
|
||||
imagePaths.push({ path: imageOutputPath, mimetype: 'image/jpeg' });
|
||||
for (const img of generatedImages) {
|
||||
imagePaths.push({ path: path.join(outputDir, img), mimetype: 'image/jpeg' });
|
||||
}
|
||||
|
||||
logger.info(`[Worker] Converted PDF to ${imagePaths.length} images.`);
|
||||
} else {
|
||||
imagePaths.push({ path: filePath, mimetype: `image/${fileExt.slice(1)}` });
|
||||
|
||||
Reference in New Issue
Block a user