// src/services/flyerAiProcessor.server.ts

import { z } from 'zod';

import type { Logger } from 'pino';

import type { AIService } from './aiService.server';
import type { PersonalizationRepository } from './db/personalization.db';
import { AiDataValidationError } from './processingErrors';
import type { FlyerJobData } from '../types/job-data';
// Import consolidated schemas and helper.
import {
  AiFlyerDataSchema,
  ExtractedFlyerItemSchema,
  requiredString,
} from '../types/ai';
export type ValidatedAiDataType = z.infer<typeof AiFlyerDataSchema>;
|
|
|
|
export interface AiProcessorResult {
|
|
data: ValidatedAiDataType;
|
|
needsReview: boolean;
|
|
}
|
|
|
|
/**
|
|
* This class encapsulates the logic for interacting with the AI service
|
|
* to extract and validate data from flyer images.
|
|
*/
|
|
export class FlyerAiProcessor {
|
|
constructor(
|
|
private ai: AIService,
|
|
private personalizationRepo: PersonalizationRepository,
|
|
) {}
|
|
|
|
/**
|
|
* Validates the raw data from the AI against the Zod schema.
|
|
*/
|
|
private _validateAiData(
|
|
extractedData: unknown,
|
|
logger: Logger,
|
|
): AiProcessorResult {
|
|
const validationResult = AiFlyerDataSchema.safeParse(extractedData);
|
|
if (!validationResult.success) {
|
|
const errors = validationResult.error.flatten();
|
|
logger.error({ errors, rawData: extractedData }, 'AI response failed validation.');
|
|
throw new AiDataValidationError(
|
|
'AI response validation failed. The returned data structure is incorrect.',
|
|
errors,
|
|
extractedData,
|
|
);
|
|
}
|
|
|
|
// --- Data Quality Checks ---
|
|
// After structural validation, perform semantic quality checks to flag low-quality
|
|
// extractions for manual review.
|
|
const { store_name, items, valid_from, valid_to } = validationResult.data;
|
|
const qualityIssues: string[] = [];
|
|
|
|
// 1. Check for a store name.
|
|
if (!store_name || store_name.trim() === '') {
|
|
qualityIssues.push('Missing store name');
|
|
}
|
|
|
|
// 2. Check that items were extracted.
|
|
if (!items || items.length === 0) {
|
|
qualityIssues.push('No items were extracted');
|
|
} else {
|
|
// 3. If items exist, check their quality (e.g., missing prices).
|
|
// The threshold is configurable via an environment variable, defaulting to 0.5 (50%).
|
|
const priceQualityThreshold = parseFloat(process.env.AI_PRICE_QUALITY_THRESHOLD || '0.5');
|
|
|
|
const itemsWithPrice = items.filter(
|
|
(item) => item.price_in_cents != null && item.price_in_cents > 0,
|
|
).length;
|
|
const priceQualityRatio = itemsWithPrice / items.length;
|
|
|
|
if (priceQualityRatio < priceQualityThreshold) {
|
|
// If the ratio of items with a valid price is below the threshold, flag for review.
|
|
qualityIssues.push(
|
|
`Low price quality (${(priceQualityRatio * 100).toFixed(0)}% of items have a price)`,
|
|
);
|
|
}
|
|
}
|
|
|
|
// 4. Check for flyer validity dates.
|
|
if (!valid_from && !valid_to) {
|
|
qualityIssues.push('Missing both valid_from and valid_to dates');
|
|
}
|
|
|
|
const needsReview = qualityIssues.length > 0;
|
|
if (needsReview) {
|
|
logger.warn(
|
|
{ rawData: extractedData, qualityIssues },
|
|
`AI response has quality issues. Flagging for review. Issues: ${qualityIssues.join(', ')}`,
|
|
);
|
|
}
|
|
|
|
logger.info(`AI extracted ${validationResult.data.items.length} items. Needs Review: ${needsReview}`);
|
|
return { data: validationResult.data, needsReview };
|
|
}
|
|
|
|
/**
|
|
* Calls the AI service to extract structured data from the flyer images and validates the response.
|
|
*/
|
|
public async extractAndValidateData(
|
|
imagePaths: { path: string; mimetype: string }[],
|
|
jobData: FlyerJobData,
|
|
logger: Logger,
|
|
): Promise<AiProcessorResult> {
|
|
logger.info(`Starting AI data extraction for ${imagePaths.length} pages.`);
|
|
const { submitterIp, userProfileAddress } = jobData;
|
|
const masterItems = await this.personalizationRepo.getAllMasterItems(logger);
|
|
logger.debug(`Retrieved ${masterItems.length} master items for AI matching.`);
|
|
|
|
// BATCHING LOGIC: Process images in chunks to avoid hitting AI payload/token limits.
|
|
const BATCH_SIZE = 4;
|
|
const batches = [];
|
|
for (let i = 0; i < imagePaths.length; i += BATCH_SIZE) {
|
|
batches.push(imagePaths.slice(i, i + BATCH_SIZE));
|
|
}
|
|
|
|
// Initialize container for merged data
|
|
const mergedData: ValidatedAiDataType = {
|
|
store_name: null,
|
|
valid_from: null,
|
|
valid_to: null,
|
|
store_address: null,
|
|
items: [],
|
|
};
|
|
|
|
logger.info(`Processing ${imagePaths.length} pages in ${batches.length} batches (Batch Size: ${BATCH_SIZE}).`);
|
|
|
|
for (const [index, batch] of batches.entries()) {
|
|
logger.info(`Processing batch ${index + 1}/${batches.length} (${batch.length} pages)...`);
|
|
|
|
// The AI service handles rate limiting internally (e.g., max 5 RPM).
|
|
// Processing these sequentially ensures we respect that limit.
|
|
const batchResult = await this.ai.extractCoreDataFromFlyerImage(
|
|
batch,
|
|
masterItems,
|
|
submitterIp,
|
|
userProfileAddress,
|
|
logger,
|
|
);
|
|
|
|
// MERGE LOGIC:
|
|
// 1. Metadata (Store Name, Dates): Prioritize the first batch (usually the cover page).
|
|
// If subsequent batches have data and the current is null, fill it in.
|
|
if (index === 0) {
|
|
mergedData.store_name = batchResult.store_name;
|
|
mergedData.valid_from = batchResult.valid_from;
|
|
mergedData.valid_to = batchResult.valid_to;
|
|
mergedData.store_address = batchResult.store_address;
|
|
} else {
|
|
if (!mergedData.store_name && batchResult.store_name) mergedData.store_name = batchResult.store_name;
|
|
if (!mergedData.valid_from && batchResult.valid_from) mergedData.valid_from = batchResult.valid_from;
|
|
if (!mergedData.valid_to && batchResult.valid_to) mergedData.valid_to = batchResult.valid_to;
|
|
if (!mergedData.store_address && batchResult.store_address) mergedData.store_address = batchResult.store_address;
|
|
}
|
|
|
|
// 2. Items: Append all found items to the master list.
|
|
mergedData.items.push(...(batchResult.items || []));
|
|
}
|
|
|
|
logger.info(`Batch processing complete. Total items extracted: ${mergedData.items.length}`);
|
|
|
|
// Validate the final merged dataset
|
|
return this._validateAiData(mergedData, logger);
|
|
}
|
|
} |