/**
 * Minimal shape of the pdf.js page object used by this module.
 * pdf.js is loaded from a CDN, so its types are not available at build time;
 * only the members this file actually calls are declared here.
 */
interface PDFPageProxy {
  getViewport: (options: { scale: number }) => { width: number; height: number };
  render: (options: {
    canvasContext: CanvasRenderingContext2D;
    viewport: { width: number; height: number };
  }) => { promise: Promise<void> };
}

/** Minimal shape of the pdf.js document object used by this module. */
interface PDFDocumentProxy {
  numPages: number;
  getPage: (pageNumber: number) => Promise<PDFPageProxy>;
}

/** Minimal shape of the global pdf.js entry point used by this module. */
interface PdfJsLib {
  getDocument: (data: ArrayBuffer) => { promise: Promise<PDFDocumentProxy> };
}

/**
 * pdf.js is expected to be exposed as a global by the script tag in index.html.
 * Declared as possibly undefined so the runtime guard in `getPdfDocument`
 * is type-checked instead of being suppressed.
 */
declare const pdfjsLib: PdfJsLib | undefined;

/** Matches a trailing `.pdf` extension, case-insensitively. */
const PDF_EXTENSION_PATTERN = /\.pdf$/i;

/** Scale factor passed to `getViewport` when rendering each page. */
const RENDER_SCALE = 1.5;

/** Quality argument passed to `canvas.toBlob` for JPEG encoding. */
const JPEG_QUALITY = 0.9;

/**
 * Renders a single PDF page to a canvas and returns it as a JPEG File object.
 *
 * @param pdfPage The PDF page object from pdf.js.
 * @param pageNumber The page number (1-based); used in the file name and error message.
 * @param originalFileName The name of the original PDF file.
 * @param scale The scale at which to render the page.
 * @returns A promise that resolves to an image File named `<base>_page_<n>.jpeg`.
 * @throws Error if a 2D canvas context cannot be obtained or the canvas
 *   cannot be converted to a blob.
 */
const renderPageToImageFile = async (
  pdfPage: PDFPageProxy,
  pageNumber: number,
  originalFileName: string,
  scale: number
): Promise<File> => {
  const viewport = pdfPage.getViewport({ scale });
  const canvas = document.createElement('canvas');
  const context = canvas.getContext('2d');
  canvas.height = viewport.height;
  canvas.width = viewport.width;

  if (!context) {
    throw new Error('Could not get canvas context');
  }

  await pdfPage.render({ canvasContext: context, viewport }).promise;

  // Wrap the callback-based canvas.toBlob so it can be awaited.
  const blob = await new Promise<Blob | null>((resolve) => {
    canvas.toBlob(resolve, 'image/jpeg', JPEG_QUALITY);
  });

  if (!blob) {
    throw new Error(`Failed to convert page ${pageNumber} of PDF to blob.`);
  }

  // Strip a trailing ".pdf" if present, then always append the page suffix.
  // Replacing the extension directly would leave names without a ".pdf"
  // extension unchanged, giving every page the same file name.
  const baseName = originalFileName.replace(PDF_EXTENSION_PATTERN, '');
  const newFileName = `${baseName}_page_${pageNumber}.jpeg`;

  return new File([blob], newFileName, { type: 'image/jpeg' });
};

/**
 * Loads a PDF document from a File object using the global pdf.js library.
 *
 * @param pdfFile The PDF file.
 * @returns A promise that resolves to the pdf.js document object.
 * @throws Error if the global `pdfjsLib` is not defined.
 */
const getPdfDocument = async (pdfFile: File): Promise<PDFDocumentProxy> => {
  // `typeof` on an undeclared global is safe at runtime (no ReferenceError).
  if (typeof pdfjsLib === 'undefined') {
    throw new Error('pdf.js library is not loaded. Please check the script tag in index.html.');
  }

  const arrayBuffer = await pdfFile.arrayBuffer();
  return pdfjsLib.getDocument(arrayBuffer).promise;
};

/**
 * Converts all pages of a PDF file into an array of image File objects.
 * Pages are rendered concurrently; the returned array is in page order.
 *
 * @param pdfFile The PDF file to convert.
 * @param onProgress Optional callback invoked each time a page finishes.
 *   The first argument is the number of pages completed so far (1..totalPages,
 *   strictly increasing even when pages finish out of order); the second is
 *   the total page count.
 * @returns A promise that resolves to an object containing the array of image
 *   files and the total page count.
 * @throws Error if pdf.js is not loaded or any page fails to render.
 */
export const convertPdfToImageFiles = async (
  pdfFile: File,
  onProgress?: (currentPage: number, totalPages: number) => void
): Promise<{ imageFiles: File[], pageCount: number }> => {
  const pdf = await getPdfDocument(pdfFile);
  const pageCount = pdf.numPages;

  // Counts finished pages so progress stays monotonic regardless of the
  // order in which the concurrent render tasks complete.
  let completedPages = 0;

  // Promise.all preserves input order, so imageFiles[i] is page i + 1.
  const imageFiles = await Promise.all(
    Array.from({ length: pageCount }, async (_, index) => {
      const pageNumber = index + 1;
      const page = await pdf.getPage(pageNumber);
      const imageFile = await renderPageToImageFile(page, pageNumber, pdfFile.name, RENDER_SCALE);

      completedPages += 1;
      onProgress?.(completedPages, pageCount);

      return imageFile;
    })
  );

  return { imageFiles, pageCount };
};