Files
flyer-crawler.projectium.com/src/services/flyerAiProcessor.server.ts
Torben Sorensen c24103d9a0
All checks were successful
Deploy to Test Environment / deploy-to-test (push) Successful in 16m42s
frontend direct testing result and fixes
2026-01-18 13:57:47 -08:00

211 lines
7.8 KiB
TypeScript

// src/services/flyerAiProcessor.server.ts
import { z } from 'zod';
import type { Logger } from 'pino';
import type { AIService } from './aiService.server';
import type { PersonalizationRepository } from './db/personalization.db';
import { AiDataValidationError } from './processingErrors';
import type { FlyerJobData } from '../types/job-data';
import { AiFlyerDataSchema } from '../types/ai'; // Import consolidated schemas and helper
/** The flyer payload shape after it has passed Zod validation against AiFlyerDataSchema. */
export type ValidatedAiDataType = z.infer<typeof AiFlyerDataSchema>;

/**
 * Result of an AI extraction run: the schema-validated data plus a flag
 * indicating whether semantic quality checks found issues that require
 * manual review.
 */
export interface AiProcessorResult {
  // Data that has passed AiFlyerDataSchema.safeParse.
  data: ValidatedAiDataType;
  // True when quality checks (missing store name, no items, low price
  // coverage, missing validity dates) flagged the extraction for review.
  needsReview: boolean;
}

/**
 * Type definition for the extractAndValidateData method signature.
 * Used for dependency injection in tests.
 */
export type ExtractAndValidateDataFn = (
  imagePaths: { path: string; mimetype: string }[],
  jobData: FlyerJobData,
  logger: Logger,
) => Promise<AiProcessorResult>;
/**
* This class encapsulates the logic for interacting with the AI service
* to extract and validate data from flyer images.
*/
/**
 * This class encapsulates the logic for interacting with the AI service
 * to extract and validate data from flyer images.
 */
export class FlyerAiProcessor {
  /** Optional test-injected override for extractAndValidateData. */
  private extractFn: ExtractAndValidateDataFn | null = null;

  // Unique ID for this instance (for debugging multiple instance issues).
  private readonly instanceId = Math.random().toString(36).substring(7);

  constructor(
    private readonly ai: AIService,
    private readonly personalizationRepo: PersonalizationRepository,
  ) {}

  /**
   * Allows replacing the extractAndValidateData implementation at runtime.
   * This is primarily used for testing to inject mock implementations.
   * @internal
   */
  _setExtractAndValidateData(fn: ExtractAndValidateDataFn | null): void {
    // FIX: removed leftover console.error debug output; this is a silent test hook.
    this.extractFn = fn;
  }

  /**
   * Validates the raw data from the AI against the Zod schema, then runs
   * semantic quality checks (store name, item count, price coverage,
   * validity dates) to decide whether the extraction needs manual review.
   *
   * @param extractedData Raw, untrusted payload returned by the AI service.
   * @param logger Request-scoped logger.
   * @returns The validated data plus a `needsReview` flag.
   * @throws {AiDataValidationError} If the data fails structural validation.
   */
  private _validateAiData(extractedData: unknown, logger: Logger): AiProcessorResult {
    const validationResult = AiFlyerDataSchema.safeParse(extractedData);
    if (!validationResult.success) {
      const errors = validationResult.error.flatten();
      logger.error({ errors, rawData: extractedData }, 'AI response failed validation.');
      throw new AiDataValidationError(
        'AI response validation failed. The returned data structure is incorrect.',
        errors,
        extractedData,
      );
    }

    // --- Data Quality Checks ---
    // After structural validation, perform semantic quality checks to flag
    // low-quality extractions for manual review.
    const { store_name, items, valid_from, valid_to } = validationResult.data;
    const qualityIssues: string[] = [];

    // 1. Check for a store name.
    if (!store_name || store_name.trim() === '') {
      qualityIssues.push('Missing store name');
    }

    // 2. Check that items were extracted.
    if (!items || items.length === 0) {
      qualityIssues.push('No items were extracted');
    } else {
      // 3. If items exist, check their quality (e.g., missing prices).
      // The threshold is configurable via AI_PRICE_QUALITY_THRESHOLD and
      // defaults to 0.5 (50%). FIX: a malformed env var used to parse to NaN,
      // making the `<` comparison below always false and silently disabling
      // this check — fall back to the default when parsing fails.
      const parsedThreshold = parseFloat(process.env.AI_PRICE_QUALITY_THRESHOLD || '0.5');
      const priceQualityThreshold = Number.isFinite(parsedThreshold) ? parsedThreshold : 0.5;
      const itemsWithPrice = items.filter(
        (item) => item.price_in_cents != null && item.price_in_cents > 0,
      ).length;
      const priceQualityRatio = itemsWithPrice / items.length;
      if (priceQualityRatio < priceQualityThreshold) {
        // If the ratio of items with a valid price is below the threshold, flag for review.
        qualityIssues.push(
          `Low price quality (${(priceQualityRatio * 100).toFixed(0)}% of items have a price)`,
        );
      }
    }

    // 4. Check for flyer validity dates.
    if (!valid_from && !valid_to) {
      qualityIssues.push('Missing both valid_from and valid_to dates');
    }

    const needsReview = qualityIssues.length > 0;
    if (needsReview) {
      logger.warn(
        { rawData: extractedData, qualityIssues },
        `AI response has quality issues. Flagging for review. Issues: ${qualityIssues.join(', ')}`,
      );
    }
    logger.info(
      `AI extracted ${validationResult.data.items.length} items. Needs Review: ${needsReview}`,
    );
    return { data: validationResult.data, needsReview };
  }

  /**
   * Calls the AI service to extract structured data from the flyer images and
   * validates the response. Pages are processed sequentially in fixed-size
   * batches to stay within AI payload/token limits; per-batch results are
   * merged (first-batch metadata wins, items are concatenated) and the merged
   * dataset is validated once at the end.
   *
   * @param imagePaths Paths and MIME types of the flyer page images.
   * @param jobData Job metadata (submitter IP, optional user profile address).
   * @param logger Request-scoped logger.
   * @returns The merged, validated data plus a `needsReview` flag.
   * @throws {AiDataValidationError} If the merged data fails validation.
   */
  public async extractAndValidateData(
    imagePaths: { path: string; mimetype: string }[],
    jobData: FlyerJobData,
    logger: Logger,
  ): Promise<AiProcessorResult> {
    // If a mock function is injected (for testing), use it instead of the real
    // implementation. FIX: leftover [WORKER DEBUG] console.error statements
    // replaced with structured logger.debug calls.
    if (this.extractFn) {
      logger.debug(
        `FlyerAiProcessor[${this.instanceId}]: using injected extractAndValidateData override`,
      );
      return this.extractFn(imagePaths, jobData, logger);
    }

    logger.info(`Starting AI data extraction for ${imagePaths.length} pages.`);
    const { submitterIp, userProfileAddress } = jobData;
    const { items: masterItems } = await this.personalizationRepo.getAllMasterItems(logger);
    logger.debug(`Retrieved ${masterItems.length} master items for AI matching.`);

    // BATCHING LOGIC: Process images in chunks to avoid hitting AI payload/token limits.
    const BATCH_SIZE = 4;
    // FIX: explicit annotation — a bare `const batches = []` is implicitly any[] under strict mode.
    const batches: { path: string; mimetype: string }[][] = [];
    for (let i = 0; i < imagePaths.length; i += BATCH_SIZE) {
      batches.push(imagePaths.slice(i, i + BATCH_SIZE));
    }

    // Container for data merged across all batches.
    const mergedData: ValidatedAiDataType = {
      store_name: null,
      valid_from: null,
      valid_to: null,
      store_address: null,
      items: [],
    };

    logger.info(
      `Processing ${imagePaths.length} pages in ${batches.length} batches (Batch Size: ${BATCH_SIZE}).`,
    );

    for (const [index, batch] of batches.entries()) {
      logger.info(`Processing batch ${index + 1}/${batches.length} (${batch.length} pages)...`);
      // The AI service handles rate limiting internally (e.g., max 5 RPM).
      // Processing these sequentially ensures we respect that limit.
      const batchResult = await this.ai.extractCoreDataFromFlyerImage(
        batch,
        masterItems,
        submitterIp,
        userProfileAddress,
        logger,
      );

      // MERGE LOGIC:
      // 1. Metadata (store name, dates, address): prioritize the first batch
      //    (usually the cover page); later batches only fill in missing values.
      if (index === 0) {
        mergedData.store_name = batchResult.store_name;
        mergedData.valid_from = batchResult.valid_from;
        mergedData.valid_to = batchResult.valid_to;
        mergedData.store_address = batchResult.store_address;
      } else {
        if (!mergedData.store_name && batchResult.store_name)
          mergedData.store_name = batchResult.store_name;
        if (!mergedData.valid_from && batchResult.valid_from)
          mergedData.valid_from = batchResult.valid_from;
        if (!mergedData.valid_to && batchResult.valid_to)
          mergedData.valid_to = batchResult.valid_to;
        if (!mergedData.store_address && batchResult.store_address)
          mergedData.store_address = batchResult.store_address;
      }

      // 2. Items: Append all found items to the master list.
      mergedData.items.push(...(batchResult.items || []));
    }

    logger.info(`Batch processing complete. Total items extracted: ${mergedData.items.length}`);
    // FIX: removed console.error dump of the full merged payload (pretty-printed
    // JSON to stderr in production); use the structured logger at debug level instead.
    logger.debug({ mergedData }, 'Merged AI data from all batches.');

    // Validate the final merged dataset.
    return this._validateAiData(mergedData, logger);
  }
}