Files
flyer-crawler.projectium.com/src/utils/pdfConverter.ts
Torben Sorensen 1d0bd630b2
Some checks failed
Deploy to Web Server flyer-crawler.projectium.com / deploy (push) Failing after 48s
test, more id fixes, and naming all files
2025-11-25 05:59:56 -08:00

93 lines
3.2 KiB
TypeScript

// src/utils/pdfConverter.ts
import * as pdfjsLib from 'pdfjs-dist';
import type { PDFDocumentProxy, PDFPageProxy, PageViewport } from 'pdfjs-dist';
/**
 * Renders a single PDF page to an off-screen canvas and returns it as a JPEG File object.
 *
 * The output name is the original name with a trailing `.pdf` extension (any letter
 * case) removed and `_page_<pageNumber>.jpeg` appended. The suffix is appended even
 * when the original name has no `.pdf` extension, so every page always gets a
 * distinct name that matches its `image/jpeg` type.
 *
 * @param pdfPage The PDF page object from pdf.js.
 * @param pageNumber The page number (1-based); used in the file name and in error messages.
 * @param originalFileName The name of the original PDF file.
 * @param scale The scale at which to render the page.
 * @returns A promise that resolves to an image File object of type `image/jpeg`.
 * @throws Error if no 2D canvas context is available or the canvas cannot be encoded to a blob.
 */
const renderPageToImageFile = async (
  pdfPage: PDFPageProxy,
  pageNumber: number,
  originalFileName: string,
  scale: number
): Promise<File> => {
  const viewport = pdfPage.getViewport({ scale });
  const canvas = document.createElement('canvas');
  const context = canvas.getContext('2d');
  if (!context) {
    throw new Error('Could not get canvas context');
  }
  canvas.height = viewport.height;
  canvas.width = viewport.width;

  try {
    await pdfPage.render({ canvas, canvasContext: context, viewport: viewport as PageViewport }).promise;

    // canvas.toBlob is callback-based; wrap it so it can be awaited.
    const blob = await new Promise<Blob | null>((resolve) => {
      canvas.toBlob(resolve, 'image/jpeg', 0.9);
    });
    if (!blob) {
      throw new Error(`Failed to convert page ${pageNumber} of PDF to blob.`);
    }

    // Strip the extension first and then append the suffix unconditionally; a single
    // replace() would leave names without a ".pdf" suffix completely unchanged.
    const baseName = originalFileName.replace(/\.pdf$/i, '');
    return new File([blob], `${baseName}_page_${pageNumber}.jpeg`, { type: 'image/jpeg' });
  } finally {
    // The blob (if any) is already detached from the canvas, so shrink the scratch
    // canvas to let the browser drop its bitmap instead of holding it until GC.
    canvas.width = 0;
    canvas.height = 0;
  }
};
/**
 * Loads a pdf.js document from an in-memory PDF file.
 * @param pdfFile The PDF file whose bytes are handed to pdf.js.
 * @returns A promise that resolves to the pdf.js document object.
 */
const getPdfDocument = async (pdfFile: File): Promise<PDFDocumentProxy> => {
  // Read the whole file into memory, then let pdf.js parse it.
  const fileBytes = await pdfFile.arrayBuffer();
  const loadingTask = pdfjsLib.getDocument(fileBytes);
  return loadingTask.promise;
};
/**
 * Converts all pages of a PDF file into an array of image File objects.
 *
 * Pages are rendered in parallel. The returned array is ordered by page number
 * regardless of the order in which the individual pages finish rendering.
 *
 * @param pdfFile The PDF file to convert.
 * @param onProgress Optional callback invoked each time a page finishes rendering.
 *   Its first argument is the number of pages completed so far, so it only ever
 *   increases (1, 2, ..., totalPages) even when pages finish out of order.
 * @returns A promise that resolves to an object containing the array of image files and the total page count.
 * @throws Rejects with the first error raised while loading the document or rendering a page.
 */
export const convertPdfToImageFiles = async (
  pdfFile: File,
  onProgress?: (currentPage: number, totalPages: number) => void
): Promise<{ imageFiles: File[]; pageCount: number }> => {
  const pdf = await getPdfDocument(pdfFile);
  try {
    const pageCount = pdf.numPages;
    const scale = 1.5;
    let completedPages = 0;

    // One rendering task per page. Promise.all preserves input order, so the
    // result is in page order and always holds exactly `pageCount` entries.
    const imageFiles = await Promise.all(
      Array.from({ length: pageCount }, async (_, index) => {
        const pageNumber = index + 1;
        const page = await pdf.getPage(pageNumber);
        const imageFile = await renderPageToImageFile(page, pageNumber, pdfFile.name, scale);
        // Report a running count rather than the page number: parallel tasks can
        // finish in any order, and progress must never appear to move backwards.
        completedPages += 1;
        onProgress?.(completedPages, pageCount);
        return imageFile;
      })
    );

    return { imageFiles, pageCount };
  } finally {
    // The image blobs are independent of the document, so release the pdf.js
    // document on success and on failure alike.
    await pdf.destroy();
  }
};