one lazy ai
Some checks failed
Deploy to Web Server flyer-crawler.projectium.com / deploy (push) Has been cancelled
Some checks failed
Deploy to Web Server flyer-crawler.projectium.com / deploy (push) Has been cancelled
This commit is contained in:
@@ -283,6 +283,11 @@ jobs:
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Install poppler-utils, which provides the `pdftocairo` command-line tool
|
||||||
|
# required by the background worker for PDF-to-image conversion.
|
||||||
|
echo "Installing system dependency: poppler-utils..."
|
||||||
|
sudo apt-get update && sudo apt-get install -y poppler-utils
|
||||||
|
|
||||||
echo "Installing production dependencies and restarting server..."
|
echo "Installing production dependencies and restarting server..."
|
||||||
cd /var/www/flyer-crawler.projectium.com
|
cd /var/www/flyer-crawler.projectium.com
|
||||||
npm install --omit=dev # Install only production dependencies
|
npm install --omit=dev # Install only production dependencies
|
||||||
|
|||||||
@@ -3,9 +3,8 @@ import { Queue, Worker, Job } from 'bullmq';
|
|||||||
import IORedis from 'ioredis'; // Correctly imported
|
import IORedis from 'ioredis'; // Correctly imported
|
||||||
import path from 'path';
|
import path from 'path';
|
||||||
import fs from 'fs/promises';
|
import fs from 'fs/promises';
|
||||||
// Use pdfjs-dist for PDF parsing and sharp for image processing.
|
import { exec } from 'child_process';
|
||||||
import * as pdfjs from 'pdfjs-dist/legacy/build/pdf.mjs';
|
import { promisify } from 'util';
|
||||||
import sharp from 'sharp';
|
|
||||||
|
|
||||||
import { logger } from './logger.server';
|
import { logger } from './logger.server';
|
||||||
import * as aiService from './aiService.server';
|
import * as aiService from './aiService.server';
|
||||||
@@ -18,6 +17,8 @@ export const connection = new IORedis(process.env.REDIS_URL || 'redis://127.0.0.
|
|||||||
password: process.env.REDIS_PASSWORD, // Add the password from environment variables
|
password: process.env.REDIS_PASSWORD, // Add the password from environment variables
|
||||||
});
|
});
|
||||||
|
|
||||||
|
const execAsync = promisify(exec);
|
||||||
|
|
||||||
export const flyerQueue = new Queue<FlyerJobData>('flyer-processing', {
|
export const flyerQueue = new Queue<FlyerJobData>('flyer-processing', {
|
||||||
connection,
|
connection,
|
||||||
defaultJobOptions: {
|
defaultJobOptions: {
|
||||||
@@ -95,71 +96,27 @@ export const flyerWorker = new Worker<FlyerJobData>(
|
|||||||
|
|
||||||
if (fileExt === '.pdf') {
|
if (fileExt === '.pdf') {
|
||||||
await job.updateProgress({ message: 'Converting PDF to images...' });
|
await job.updateProgress({ message: 'Converting PDF to images...' });
|
||||||
|
|
||||||
// Load the PDF document using pdfjs-dist
|
|
||||||
const data = new Uint8Array(await fs.readFile(filePath));
|
|
||||||
const pdfDocument = await pdfjs.getDocument({ data }).promise;
|
|
||||||
|
|
||||||
const outputDir = path.dirname(filePath);
|
const outputDir = path.dirname(filePath);
|
||||||
|
const outputFilePrefix = path.join(outputDir, path.basename(filePath, '.pdf'));
|
||||||
|
|
||||||
for (let i = 1; i <= pdfDocument.numPages; i++) {
|
// Use the pdftocairo command-line tool for robust, server-side PDF conversion.
|
||||||
const page = await pdfDocument.getPage(i);
|
// -jpeg flag outputs JPEG files.
|
||||||
const viewport = page.getViewport({ scale: 1.5 }); // ~150 DPI
|
// -r 150 sets the resolution to 150 DPI.
|
||||||
|
// The final argument is the output file prefix. pdftocairo will append page numbers.
|
||||||
|
const command = `pdftocairo -jpeg -r 150 "${filePath}" "${outputFilePrefix}"`;
|
||||||
|
await execAsync(command);
|
||||||
|
|
||||||
// Create a fake canvas and context to render the PDF into raw pixel data.
|
// After conversion, find the generated image files.
|
||||||
// This is a common pattern for using pdf.js on the server. We must provide
|
const filesInDir = await fs.readdir(outputDir);
|
||||||
// stubs for the methods the rendering engine expects to find.
|
const generatedImages = filesInDir
|
||||||
const canvasAndContext = {
|
.filter(f => f.startsWith(path.basename(outputFilePrefix)) && f.endsWith('.jpg'))
|
||||||
canvas: {
|
.sort(); // Sort to ensure page order.
|
||||||
width: viewport.width,
|
|
||||||
height: viewport.height,
|
|
||||||
getContext: () => canvasAndContext.context,
|
|
||||||
},
|
|
||||||
context: {
|
|
||||||
canvas: null as any, // This will be set below
|
|
||||||
getImageData: () => ({ data: new Uint8ClampedArray(viewport.width * viewport.height * 4) }), // The most important part for capturing data
|
|
||||||
// Add a comprehensive set of stub methods that pdf.js might call during rendering.
|
|
||||||
// These don't need to do anything; they just need to exist to prevent "is not a function" errors.
|
|
||||||
save: () => {},
|
|
||||||
restore: () => {},
|
|
||||||
getTransform: () => ({ a: 1, b: 0, c: 0, d: 1, e: 0, f: 0 }), // Return a default identity matrix
|
|
||||||
transform: () => {},
|
|
||||||
setTransform: () => {},
|
|
||||||
scale: () => {},
|
|
||||||
rotate: () => {},
|
|
||||||
translate: () => {},
|
|
||||||
beginPath: () => {},
|
|
||||||
moveTo: () => {},
|
|
||||||
lineTo: () => {},
|
|
||||||
closePath: () => {},
|
|
||||||
stroke: () => {},
|
|
||||||
fill: () => {},
|
|
||||||
clip: () => {},
|
|
||||||
fillRect: () => {},
|
|
||||||
strokeRect: () => {},
|
|
||||||
clearRect: () => {},
|
|
||||||
drawImage: () => {},
|
|
||||||
createPattern: () => ({}),
|
|
||||||
createLinearGradient: () => ({ addColorStop: () => {} }),
|
|
||||||
createRadialGradient: () => ({ addColorStop: () => {} }),
|
|
||||||
},
|
|
||||||
};
|
|
||||||
canvasAndContext.context.canvas = canvasAndContext.canvas;
|
|
||||||
|
|
||||||
const renderContext = {
|
|
||||||
canvasContext: canvasAndContext.context as any,
|
|
||||||
viewport,
|
|
||||||
canvas: canvasAndContext.canvas as any, // Add the canvas object to the render context
|
|
||||||
};
|
|
||||||
const renderTask = page.render(renderContext);
|
|
||||||
await renderTask.promise;
|
|
||||||
|
|
||||||
const rawPixelData = canvasAndContext.context.getImageData().data;
|
for (const img of generatedImages) {
|
||||||
const imageFileName = `${path.basename(filePath, '.pdf')}_page_${i}.jpeg`;
|
imagePaths.push({ path: path.join(outputDir, img), mimetype: 'image/jpeg' });
|
||||||
const imageOutputPath = path.join(outputDir, imageFileName);
|
|
||||||
await sharp(rawPixelData, { raw: { width: viewport.width, height: viewport.height, channels: 4 } }).jpeg().toFile(imageOutputPath);
|
|
||||||
imagePaths.push({ path: imageOutputPath, mimetype: 'image/jpeg' });
|
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info(`[Worker] Converted PDF to ${imagePaths.length} images.`);
|
logger.info(`[Worker] Converted PDF to ${imagePaths.length} images.`);
|
||||||
} else {
|
} else {
|
||||||
imagePaths.push({ path: filePath, mimetype: `image/${fileExt.slice(1)}` });
|
imagePaths.push({ path: filePath, mimetype: `image/${fileExt.slice(1)}` });
|
||||||
|
|||||||
Reference in New Issue
Block a user