Compare commits

4 Commits

| Author | SHA1 | Date |
|---|---|---|
| | d4557e13fb | |
| | 3e41130c69 | |
| | d9034563d6 | |
| | 5836a75157 | |
package-lock.json (generated, 4 lines changed)
```diff
@@ -1,12 +1,12 @@
 {
   "name": "flyer-crawler",
-  "version": "0.7.0",
+  "version": "0.7.2",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "flyer-crawler",
-      "version": "0.7.0",
+      "version": "0.7.2",
       "dependencies": {
         "@bull-board/api": "^6.14.2",
         "@bull-board/express": "^6.14.2",
```
package.json

```diff
@@ -1,7 +1,7 @@
 {
   "name": "flyer-crawler",
   "private": true,
-  "version": "0.7.0",
+  "version": "0.7.2",
   "type": "module",
   "scripts": {
     "dev": "concurrently \"npm:start:dev\" \"vite\"",
```
```diff
@@ -43,7 +43,7 @@ const ExtractedFlyerItemSchema = z.object({
 });

 export const AiFlyerDataSchema = z.object({
-  store_name: requiredString('Store name cannot be empty'),
+  store_name: z.string().nullable(),
   valid_from: z.string().nullable(),
   valid_to: z.string().nullable(),
   store_address: z.string().nullable(),
```
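The substantive change above is that `store_name` may now legitimately be `null`, so a batch that lacks cover-page metadata no longer fails schema validation outright. A minimal zod sketch of the before/after behavior (the project's `requiredString` helper is approximated here with `z.string().min(1, ...)`, which is an assumption):

```typescript
import { z } from 'zod';

// Before (approximation of the project's requiredString helper):
// parsing fails when the store name is null or empty.
const strictSchema = z.object({
  store_name: z.string().min(1, 'Store name cannot be empty'),
});

// After: null is accepted, so per-batch results without
// cover-page metadata pass validation.
const lenientSchema = z.object({
  store_name: z.string().nullable(),
});

console.log(strictSchema.safeParse({ store_name: null }).success);  // false
console.log(lenientSchema.safeParse({ store_name: null }).success); // true
```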
```diff
@@ -507,7 +507,7 @@ export class AIService {
     userProfileAddress?: string,
     logger: Logger = this.logger,
   ): Promise<{
-    store_name: string;
+    store_name: string | null;
     valid_from: string | null;
     valid_to: string | null;
     store_address: string | null;
```
```diff
@@ -127,4 +127,98 @@ describe('FlyerAiProcessor', () => {
     expect(result.needsReview).toBe(true);
     expect(logger.warn).toHaveBeenCalledWith(expect.any(Object), expect.stringContaining('contains no items. The flyer will be saved with an item_count of 0. Flagging for review.'));
   });
+
+  describe('Batching Logic', () => {
+    it('should process images in batches and merge the results correctly', async () => {
+      // Arrange
+      const jobData = createMockJobData({});
+      // 5 images, with BATCH_SIZE = 4, should result in 2 batches.
+      const imagePaths = [
+        { path: 'page1.jpg', mimetype: 'image/jpeg' },
+        { path: 'page2.jpg', mimetype: 'image/jpeg' },
+        { path: 'page3.jpg', mimetype: 'image/jpeg' },
+        { path: 'page4.jpg', mimetype: 'image/jpeg' },
+        { path: 'page5.jpg', mimetype: 'image/jpeg' },
+      ];
+
+      const mockAiResponseBatch1 = {
+        store_name: 'Batch 1 Store',
+        valid_from: '2025-01-01',
+        valid_to: '2025-01-07',
+        store_address: '123 Batch St',
+        items: [
+          { item: 'Item A', price_display: '$1', price_in_cents: 100, quantity: '1', category_name: 'Cat A', master_item_id: 1 },
+          { item: 'Item B', price_display: '$2', price_in_cents: 200, quantity: '1', category_name: 'Cat B', master_item_id: 2 },
+        ],
+      };
+
+      const mockAiResponseBatch2 = {
+        store_name: 'Batch 2 Store', // This should be ignored in the merge
+        valid_from: null,
+        valid_to: null,
+        store_address: null,
+        items: [
+          { item: 'Item C', price_display: '$3', price_in_cents: 300, quantity: '1', category_name: 'Cat C', master_item_id: 3 },
+        ],
+      };
+
+      // Mock the AI service to return different results for each batch call
+      vi.mocked(mockAiService.extractCoreDataFromFlyerImage)
+        .mockResolvedValueOnce(mockAiResponseBatch1)
+        .mockResolvedValueOnce(mockAiResponseBatch2);
+
+      // Act
+      const result = await service.extractAndValidateData(imagePaths, jobData, logger);
+
+      // Assert
+      // 1. AI service was called twice (for 2 batches)
+      expect(mockAiService.extractCoreDataFromFlyerImage).toHaveBeenCalledTimes(2);
+
+      // 2. Check the arguments for each call
+      expect(mockAiService.extractCoreDataFromFlyerImage).toHaveBeenNthCalledWith(1, imagePaths.slice(0, 4), [], undefined, undefined, logger);
+      expect(mockAiService.extractCoreDataFromFlyerImage).toHaveBeenNthCalledWith(2, imagePaths.slice(4, 5), [], undefined, undefined, logger);
+
+      // 3. Check the merged data
+      expect(result.data.store_name).toBe('Batch 1 Store'); // Metadata from the first batch
+      expect(result.data.valid_from).toBe('2025-01-01');
+      expect(result.data.valid_to).toBe('2025-01-07');
+      expect(result.data.store_address).toBe('123 Batch St');
+
+      // 4. Check that items from both batches are merged
+      expect(result.data.items).toHaveLength(3);
+      expect(result.data.items).toEqual(expect.arrayContaining([
+        expect.objectContaining({ item: 'Item A' }),
+        expect.objectContaining({ item: 'Item B' }),
+        expect.objectContaining({ item: 'Item C' }),
+      ]));
+
+      // 5. Check that the job is not flagged for review
+      expect(result.needsReview).toBe(false);
+    });
+
+    it('should fill in missing metadata from subsequent batches', async () => {
+      // Arrange
+      const jobData = createMockJobData({});
+      const imagePaths = [
+        { path: 'page1.jpg', mimetype: 'image/jpeg' }, { path: 'page2.jpg', mimetype: 'image/jpeg' }, { path: 'page3.jpg', mimetype: 'image/jpeg' }, { path: 'page4.jpg', mimetype: 'image/jpeg' }, { path: 'page5.jpg', mimetype: 'image/jpeg' },
+      ];
+
+      const mockAiResponseBatch1 = { store_name: null, valid_from: '2025-01-01', valid_to: '2025-01-07', store_address: null, items: [{ item: 'Item A', price_display: '$1', price_in_cents: 100, quantity: '1', category_name: 'Cat A', master_item_id: 1 }] };
+      const mockAiResponseBatch2 = { store_name: 'Batch 2 Store', valid_from: '2025-01-02', valid_to: null, store_address: '456 Subsequent St', items: [{ item: 'Item C', price_display: '$3', price_in_cents: 300, quantity: '1', category_name: 'Cat C', master_item_id: 3 }] };
+
+      vi.mocked(mockAiService.extractCoreDataFromFlyerImage)
+        .mockResolvedValueOnce(mockAiResponseBatch1)
+        .mockResolvedValueOnce(mockAiResponseBatch2);
+
+      // Act
+      const result = await service.extractAndValidateData(imagePaths, jobData, logger);
+
+      // Assert
+      expect(result.data.store_name).toBe('Batch 2 Store'); // Filled from batch 2
+      expect(result.data.valid_from).toBe('2025-01-01'); // Kept from batch 1
+      expect(result.data.valid_to).toBe('2025-01-07'); // Kept from batch 1
+      expect(result.data.store_address).toBe('456 Subsequent St'); // Filled from batch 2
+      expect(result.data.items).toHaveLength(2);
+    });
+  });
 });
```
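The expected call counts in these tests are plain ceiling division: with `BATCH_SIZE = 4`, five pages yield `Math.ceil(5 / 4) = 2` batches, one slice of four and one of one. A standalone sketch of that slicing (the `chunk` helper name is illustrative, not from the codebase):

```typescript
// Illustrative helper: split an array into slices of at most `size` elements.
function chunk<T>(items: T[], size: number): T[][] {
  const out: T[][] = [];
  for (let i = 0; i < items.length; i += size) {
    out.push(items.slice(i, i + size));
  }
  return out;
}

const pages = ['page1.jpg', 'page2.jpg', 'page3.jpg', 'page4.jpg', 'page5.jpg'];
// Two batches, pages.slice(0, 4) and pages.slice(4, 5), matching the
// toHaveBeenNthCalledWith assertions above.
console.log(chunk(pages, 4).map((b) => b.length)); // [ 4, 1 ]
```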
```diff
@@ -94,19 +94,64 @@ export class FlyerAiProcessor {
     jobData: FlyerJobData,
     logger: Logger,
   ): Promise<AiProcessorResult> {
-    logger.info(`Starting AI data extraction.`);
+    logger.info(`Starting AI data extraction for ${imagePaths.length} pages.`);
     const { submitterIp, userProfileAddress } = jobData;
     const masterItems = await this.personalizationRepo.getAllMasterItems(logger);
     logger.debug(`Retrieved ${masterItems.length} master items for AI matching.`);

-    const extractedData = await this.ai.extractCoreDataFromFlyerImage(
-      imagePaths,
-      masterItems,
-      submitterIp,
-      userProfileAddress,
-      logger,
-    );
+    // BATCHING LOGIC: Process images in chunks to avoid hitting AI payload/token limits.
+    const BATCH_SIZE = 4;
+    const batches = [];
+    for (let i = 0; i < imagePaths.length; i += BATCH_SIZE) {
+      batches.push(imagePaths.slice(i, i + BATCH_SIZE));
+    }

-    return this._validateAiData(extractedData, logger);
+    // Initialize container for merged data
+    const mergedData: ValidatedAiDataType = {
+      store_name: null,
+      valid_from: null,
+      valid_to: null,
+      store_address: null,
+      items: [],
+    };
+
+    logger.info(`Processing ${imagePaths.length} pages in ${batches.length} batches (Batch Size: ${BATCH_SIZE}).`);
+
+    for (const [index, batch] of batches.entries()) {
+      logger.info(`Processing batch ${index + 1}/${batches.length} (${batch.length} pages)...`);
+
+      // The AI service handles rate limiting internally (e.g., max 5 RPM).
+      // Processing these sequentially ensures we respect that limit.
+      const batchResult = await this.ai.extractCoreDataFromFlyerImage(
+        batch,
+        masterItems,
+        submitterIp,
+        userProfileAddress,
+        logger,
+      );
+
+      // MERGE LOGIC:
+      // 1. Metadata (Store Name, Dates): Prioritize the first batch (usually the cover page).
+      //    If subsequent batches have data and the current is null, fill it in.
+      if (index === 0) {
+        mergedData.store_name = batchResult.store_name;
+        mergedData.valid_from = batchResult.valid_from;
+        mergedData.valid_to = batchResult.valid_to;
+        mergedData.store_address = batchResult.store_address;
+      } else {
+        if (!mergedData.store_name && batchResult.store_name) mergedData.store_name = batchResult.store_name;
+        if (!mergedData.valid_from && batchResult.valid_from) mergedData.valid_from = batchResult.valid_from;
+        if (!mergedData.valid_to && batchResult.valid_to) mergedData.valid_to = batchResult.valid_to;
+        if (!mergedData.store_address && batchResult.store_address) mergedData.store_address = batchResult.store_address;
+      }
+
+      // 2. Items: Append all found items to the master list.
+      mergedData.items.push(...batchResult.items);
+    }
+
+    logger.info(`Batch processing complete. Total items extracted: ${mergedData.items.length}`);
+
+    // Validate the final merged dataset
+    return this._validateAiData(mergedData, logger);
   }
 }
```
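One design note on the merge above: the fill-in branch uses falsy checks (`!mergedData.store_name`), so an empty string from an earlier batch would also be overwritten. A compact near-equivalent using nullish assignment is sketched below; the `FlyerMeta` type name is illustrative, and `??=` differs from the committed code only in treating `''` as already present:

```typescript
type FlyerMeta = {
  store_name: string | null;
  valid_from: string | null;
  valid_to: string | null;
  store_address: string | null;
};

// Sketch of the non-first-batch fill-in: assign only when the merged
// field is still null/undefined. Unlike the `!x && y` checks in the
// diff, `??=` would keep an empty-string value from an earlier batch.
function fillMissingMeta(merged: FlyerMeta, batch: FlyerMeta): void {
  merged.store_name ??= batch.store_name;
  merged.valid_from ??= batch.valid_from;
  merged.valid_to ??= batch.valid_to;
  merged.store_address ??= batch.store_address;
}
```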
src/services/workers.server.ts

```diff
@@ -1,3 +1,4 @@
 // src/services/workers.server.ts
 import { Worker, Job, UnrecoverableError } from 'bullmq';
 import fsPromises from 'node:fs/promises';
+import { exec } from 'child_process';
```
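The diff shows only the new `exec` import; its call site is not part of this comparison. If the worker shells out to an external tool, a promisified wrapper keeps the call awaitable inside an async BullMQ processor. A sketch under that assumption (the command string is a placeholder, not from the repository):

```typescript
import { exec } from 'child_process';
import { promisify } from 'node:util';

// exec carries a custom promisify implementation, so this resolves
// to { stdout, stderr } instead of taking a callback.
const execAsync = promisify(exec);

async function runTool(): Promise<string> {
  // Placeholder command: substitute whatever binary the worker invokes.
  const { stdout, stderr } = await execAsync('some-cli --version');
  if (stderr) {
    console.warn(`tool stderr: ${stderr.trim()}`);
  }
  return stdout.trim();
}
```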