Files
flyer-crawler.projectium.com/src/services/flyerAiProcessor.server.ts
Torben Sorensen 0010396780
Some checks failed
Deploy to Test Environment / deploy-to-test (push) Failing after 41s
flyer upload (anon) issues
2025-12-31 02:08:37 -08:00

140 lines
5.4 KiB
TypeScript

// src/services/flyerAiProcessor.server.ts
import { z } from 'zod';
import type { Logger } from 'pino';
import type { AIService } from './aiService.server';
import type { PersonalizationRepository } from './db/personalization.db';
import { AiDataValidationError } from './processingErrors';
import type { FlyerJobData } from '../types/job-data';
import {
AiFlyerDataSchema,
ExtractedFlyerItemSchema,
requiredString,
} from '../types/ai'; // Import consolidated schemas and helper
/**
 * Static type of flyer data that has passed `AiFlyerDataSchema` validation,
 * inferred directly from the schema so the two cannot drift apart.
 */
export type ValidatedAiDataType = z.infer<typeof AiFlyerDataSchema>;
/**
 * Result of extracting and validating flyer data with the AI service.
 */
export interface AiProcessorResult {
/** The extracted flyer data after it has passed schema validation. */
data: ValidatedAiDataType;
/**
 * True when the data is structurally valid but a quality check failed
 * (missing/blank store name, or no items), so it should be reviewed manually.
 */
needsReview: boolean;
}
/**
 * Encapsulates the interaction with the AI service used to extract structured
 * data from flyer images, and the validation of what the AI returns.
 *
 * Page images are sent to the AI in fixed-size batches, the per-batch results
 * are merged into a single dataset, and that dataset is validated against
 * `AiFlyerDataSchema`.
 */
export class FlyerAiProcessor {
  /** Maximum number of page images sent to the AI service in one request. */
  private static readonly BATCH_SIZE = 4;

  constructor(
    private readonly ai: AIService,
    private readonly personalizationRepo: PersonalizationRepository,
  ) {}

  /**
   * Reports whether a value should be treated as "missing": any falsy value,
   * or a string that is empty once surrounding whitespace is trimmed.
   */
  private static isBlank(value: unknown): boolean {
    if (typeof value === 'string') return value.trim() === '';
    return !value;
  }

  /** Splits `entries` into consecutive chunks of at most `size` elements. */
  private static chunk<T>(entries: readonly T[], size: number): T[][] {
    const chunks: T[][] = [];
    for (let start = 0; start < entries.length; start += size) {
      chunks.push(entries.slice(start, start + size));
    }
    return chunks;
  }

  /**
   * Chooses the value a merged metadata field should hold after seeing one more batch.
   *
   * The first batch always wins outright. For later batches the existing value is
   * kept unless it is blank and the batch supplies something to replace it with.
   */
  private static mergeField<T>(current: T, candidate: T, isFirstBatch: boolean): T {
    if (isFirstBatch) return candidate;
    // Use the same notion of "missing" as the validator so that a whitespace-only
    // value from an earlier batch does not block a real value from a later one.
    return FlyerAiProcessor.isBlank(current) && candidate ? candidate : current;
  }

  /**
   * Validates the raw data from the AI against the Zod schema, then runs
   * semantic quality checks on the structurally valid result.
   *
   * @param extractedData - The (merged) data returned by the AI service.
   * @param logger - Logger used to report validation failures and quality warnings.
   * @returns The validated data and whether it should be flagged for manual review.
   * @throws AiDataValidationError when the data does not match `AiFlyerDataSchema`.
   */
  private _validateAiData(extractedData: unknown, logger: Logger): AiProcessorResult {
    const validationResult = AiFlyerDataSchema.safeParse(extractedData);
    if (!validationResult.success) {
      const errors = validationResult.error.flatten();
      logger.error({ errors, rawData: extractedData }, 'AI response failed validation.');
      throw new AiDataValidationError(
        'AI response validation failed. The returned data structure is incorrect.',
        errors,
        extractedData,
      );
    }

    // Structural validation passed; the checks below never fail the job,
    // they only flag the flyer for review.
    const { store_name, items } = validationResult.data;
    let needsReview = false;

    // 1. Store name: the data transformer assigns a fallback name when it is missing.
    if (FlyerAiProcessor.isBlank(store_name)) {
      logger.warn({ rawData: extractedData }, 'AI response is missing a store name. The transformer will use a fallback. Flagging for review.');
      needsReview = true;
    }

    // 2. Items: an admin can review a flyer with 0 items.
    // Computed once so the log line below cannot dereference a missing array.
    const itemCount = items?.length ?? 0;
    if (itemCount === 0) {
      logger.warn({ rawData: extractedData }, 'AI response contains no items. The flyer will be saved with an item_count of 0. Flagging for review.');
      needsReview = true;
    }

    logger.info(`AI extracted ${itemCount} items.`);
    return { data: validationResult.data, needsReview };
  }

  /**
   * Calls the AI service to extract structured data from the flyer images and
   * validates the merged response.
   *
   * @param imagePaths - The flyer page images to process, in page order.
   * @param jobData - Job payload; `submitterIp` and `userProfileAddress` are forwarded to the AI service.
   * @param logger - Logger used for progress and diagnostic output.
   * @returns The validated, merged flyer data and its review flag.
   * @throws AiDataValidationError when the merged data does not match `AiFlyerDataSchema`.
   */
  public async extractAndValidateData(
    imagePaths: { path: string; mimetype: string }[],
    jobData: FlyerJobData,
    logger: Logger,
  ): Promise<AiProcessorResult> {
    logger.info(`Starting AI data extraction for ${imagePaths.length} pages.`);
    const { submitterIp, userProfileAddress } = jobData;

    const masterItems = await this.personalizationRepo.getAllMasterItems(logger);
    logger.debug(`Retrieved ${masterItems.length} master items for AI matching.`);

    // Process images in chunks to avoid hitting AI payload/token limits.
    const batches = FlyerAiProcessor.chunk(imagePaths, FlyerAiProcessor.BATCH_SIZE);

    // Container for the data merged across all batches.
    const mergedData: ValidatedAiDataType = {
      store_name: null,
      valid_from: null,
      valid_to: null,
      store_address: null,
      items: [],
    };

    logger.info(`Processing ${imagePaths.length} pages in ${batches.length} batches (Batch Size: ${FlyerAiProcessor.BATCH_SIZE}).`);

    for (const [index, batch] of batches.entries()) {
      logger.info(`Processing batch ${index + 1}/${batches.length} (${batch.length} pages)...`);

      // The AI service handles rate limiting internally (e.g., max 5 RPM).
      // Awaiting each batch before starting the next keeps requests sequential.
      const batchResult = await this.ai.extractCoreDataFromFlyerImage(
        batch,
        masterItems,
        submitterIp,
        userProfileAddress,
        logger,
      );

      // Metadata: the first batch (usually the cover page) takes priority;
      // later batches only fill in fields that are still blank.
      const isFirstBatch = index === 0;
      mergedData.store_name = FlyerAiProcessor.mergeField(mergedData.store_name, batchResult.store_name, isFirstBatch);
      mergedData.valid_from = FlyerAiProcessor.mergeField(mergedData.valid_from, batchResult.valid_from, isFirstBatch);
      mergedData.valid_to = FlyerAiProcessor.mergeField(mergedData.valid_to, batchResult.valid_to, isFirstBatch);
      mergedData.store_address = FlyerAiProcessor.mergeField(mergedData.store_address, batchResult.store_address, isFirstBatch);

      // Items: append everything found in this batch to the merged list.
      mergedData.items.push(...batchResult.items);
    }

    logger.info(`Batch processing complete. Total items extracted: ${mergedData.items.length}`);

    // Validate the final merged dataset.
    return this._validateAiData(mergedData, logger);
  }
}