Files
flyer-crawler.projectium.com/sql/migrations/003_receipt_scanning_enhancements.sql
Torben Sorensen 11aeac5edd
Some checks failed
Deploy to Test Environment / deploy-to-test (push) Failing after 1m10s
whoa - so much - new features (UPC,etc) - Sentry for app logging! so much more !
2026-01-11 19:07:02 -08:00

170 lines
8.8 KiB
SQL

-- sql/migrations/003_receipt_scanning_enhancements.sql
-- ============================================================================
-- RECEIPT SCANNING ENHANCEMENTS MIGRATION
-- ============================================================================
-- Purpose:
-- This migration adds enhancements to the existing receipt scanning tables:
-- 1. Enhancements to receipts table for better OCR processing
-- 2. Enhancements to receipt_items for better item matching
-- 3. receipt_processing_log for tracking OCR/AI processing attempts
--
-- Existing tables:
-- - receipts (lines 932-948 in master_schema_rollup.sql)
-- - receipt_items (lines 951-966 in master_schema_rollup.sql)
-- ============================================================================
-- 1. Enhancements to receipts table
-- Add store detection confidence
ALTER TABLE public.receipts
ADD COLUMN IF NOT EXISTS store_confidence NUMERIC(5,4);
COMMENT ON COLUMN public.receipts.store_confidence IS 'Confidence score for store detection (0.0-1.0).';
-- Add OCR provider used
ALTER TABLE public.receipts
ADD COLUMN IF NOT EXISTS ocr_provider TEXT;
COMMENT ON COLUMN public.receipts.ocr_provider IS 'Which OCR service processed this receipt: tesseract, openai, anthropic.';
-- Add error details for failed processing
ALTER TABLE public.receipts
ADD COLUMN IF NOT EXISTS error_details JSONB;
COMMENT ON COLUMN public.receipts.error_details IS 'Detailed error information if processing failed.';
-- Add retry count for failed processing
ALTER TABLE public.receipts
ADD COLUMN IF NOT EXISTS retry_count INTEGER DEFAULT 0;
COMMENT ON COLUMN public.receipts.retry_count IS 'Number of processing retry attempts.';
-- Add extracted text confidence
ALTER TABLE public.receipts
ADD COLUMN IF NOT EXISTS ocr_confidence NUMERIC(5,4);
COMMENT ON COLUMN public.receipts.ocr_confidence IS 'Overall OCR text extraction confidence score.';
-- Add currency detection
ALTER TABLE public.receipts
ADD COLUMN IF NOT EXISTS currency TEXT DEFAULT 'CAD';
COMMENT ON COLUMN public.receipts.currency IS 'Detected currency: CAD, USD, etc.';
-- New indexes for receipt processing
CREATE INDEX IF NOT EXISTS idx_receipts_status_retry ON public.receipts(status, retry_count)
WHERE status IN ('pending', 'failed') AND retry_count < 3;
-- 2. Enhancements to receipt_items table
-- Add line number from receipt for ordering
ALTER TABLE public.receipt_items
ADD COLUMN IF NOT EXISTS line_number INTEGER;
COMMENT ON COLUMN public.receipt_items.line_number IS 'Original line number on the receipt for display ordering.';
-- Add match confidence score
ALTER TABLE public.receipt_items
ADD COLUMN IF NOT EXISTS match_confidence NUMERIC(5,4);
COMMENT ON COLUMN public.receipt_items.match_confidence IS 'Confidence score for item matching (0.0-1.0).';
-- Add is_discount flag for discount/coupon lines
ALTER TABLE public.receipt_items
ADD COLUMN IF NOT EXISTS is_discount BOOLEAN DEFAULT FALSE;
COMMENT ON COLUMN public.receipt_items.is_discount IS 'Whether this line is a discount/coupon (negative price).';
-- Add unit_price if per-unit pricing detected
ALTER TABLE public.receipt_items
ADD COLUMN IF NOT EXISTS unit_price_cents INTEGER;
COMMENT ON COLUMN public.receipt_items.unit_price_cents IS 'Per-unit price if detected (e.g., price per kg).';
-- Add unit type if detected
ALTER TABLE public.receipt_items
ADD COLUMN IF NOT EXISTS unit_type TEXT;
COMMENT ON COLUMN public.receipt_items.unit_type IS 'Unit type if detected: kg, lb, each, etc.';
-- Add added_to_pantry flag
ALTER TABLE public.receipt_items
ADD COLUMN IF NOT EXISTS added_to_pantry BOOLEAN DEFAULT FALSE;
COMMENT ON COLUMN public.receipt_items.added_to_pantry IS 'Whether this item has been added to user pantry.';
-- Add pantry_item_id link
ALTER TABLE public.receipt_items
ADD COLUMN IF NOT EXISTS pantry_item_id BIGINT REFERENCES public.pantry_items(pantry_item_id) ON DELETE SET NULL;
COMMENT ON COLUMN public.receipt_items.pantry_item_id IS 'Link to pantry_items if this receipt item was added to pantry.';
-- New indexes for receipt_items
CREATE INDEX IF NOT EXISTS idx_receipt_items_status ON public.receipt_items(status);
CREATE INDEX IF NOT EXISTS idx_receipt_items_added_to_pantry ON public.receipt_items(receipt_id, added_to_pantry)
WHERE added_to_pantry = FALSE;
CREATE INDEX IF NOT EXISTS idx_receipt_items_pantry_item_id ON public.receipt_items(pantry_item_id)
WHERE pantry_item_id IS NOT NULL;
-- 3. Receipt Processing Log - track OCR/AI processing attempts
-- Useful for debugging, monitoring costs, and improving processing
CREATE TABLE IF NOT EXISTS public.receipt_processing_log (
log_id BIGINT PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
receipt_id BIGINT NOT NULL REFERENCES public.receipts(receipt_id) ON DELETE CASCADE,
processing_step TEXT NOT NULL,
status TEXT NOT NULL,
provider TEXT,
duration_ms INTEGER,
tokens_used INTEGER,
cost_cents INTEGER,
input_data JSONB,
output_data JSONB,
error_message TEXT,
created_at TIMESTAMPTZ DEFAULT now() NOT NULL,
-- Validate processing step
CONSTRAINT receipt_processing_log_step_check CHECK (processing_step IN (
'upload', 'ocr_extraction', 'text_parsing', 'store_detection',
'item_extraction', 'item_matching', 'price_parsing', 'finalization'
)),
-- Validate status
CONSTRAINT receipt_processing_log_status_check CHECK (status IN ('started', 'completed', 'failed', 'skipped')),
-- Validate provider if specified
CONSTRAINT receipt_processing_log_provider_check CHECK (provider IS NULL OR provider IN (
'tesseract', 'openai', 'anthropic', 'google_vision', 'aws_textract', 'internal'
))
);
COMMENT ON TABLE public.receipt_processing_log IS 'Detailed log of each processing step for receipts, useful for debugging and cost tracking.';
COMMENT ON COLUMN public.receipt_processing_log.processing_step IS 'Which processing step this log entry is for.';
COMMENT ON COLUMN public.receipt_processing_log.status IS 'Status of this step: started, completed, failed, skipped.';
COMMENT ON COLUMN public.receipt_processing_log.provider IS 'External service used: tesseract, openai, anthropic, etc.';
COMMENT ON COLUMN public.receipt_processing_log.duration_ms IS 'How long this step took in milliseconds.';
COMMENT ON COLUMN public.receipt_processing_log.tokens_used IS 'Number of API tokens used (for LLM providers).';
COMMENT ON COLUMN public.receipt_processing_log.cost_cents IS 'Estimated cost in cents for this processing step.';
COMMENT ON COLUMN public.receipt_processing_log.input_data IS 'Input data sent to the processing step (for debugging).';
COMMENT ON COLUMN public.receipt_processing_log.output_data IS 'Output data received from the processing step.';
-- Indexes for receipt_processing_log
CREATE INDEX IF NOT EXISTS idx_receipt_processing_log_receipt_id ON public.receipt_processing_log(receipt_id);
CREATE INDEX IF NOT EXISTS idx_receipt_processing_log_step_status ON public.receipt_processing_log(processing_step, status);
CREATE INDEX IF NOT EXISTS idx_receipt_processing_log_created_at ON public.receipt_processing_log(created_at DESC);
-- 4. Store-specific receipt patterns - help identify stores from receipt text
CREATE TABLE IF NOT EXISTS public.store_receipt_patterns (
pattern_id BIGINT PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
store_id BIGINT NOT NULL REFERENCES public.stores(store_id) ON DELETE CASCADE,
pattern_type TEXT NOT NULL,
pattern_value TEXT NOT NULL,
priority INTEGER DEFAULT 0,
is_active BOOLEAN DEFAULT TRUE,
created_at TIMESTAMPTZ DEFAULT now() NOT NULL,
updated_at TIMESTAMPTZ DEFAULT now() NOT NULL,
-- Validate pattern type
CONSTRAINT store_receipt_patterns_type_check CHECK (pattern_type IN (
'header_regex', 'footer_regex', 'phone_number', 'address_fragment', 'store_number_format'
)),
-- Validate pattern is not empty
CONSTRAINT store_receipt_patterns_value_check CHECK (TRIM(pattern_value) <> ''),
-- Unique constraint per store/type/value
UNIQUE(store_id, pattern_type, pattern_value)
);
COMMENT ON TABLE public.store_receipt_patterns IS 'Patterns to help identify stores from receipt text and format.';
COMMENT ON COLUMN public.store_receipt_patterns.pattern_type IS 'Type of pattern: header_regex, footer_regex, phone_number, etc.';
COMMENT ON COLUMN public.store_receipt_patterns.pattern_value IS 'The actual pattern (regex or literal text).';
COMMENT ON COLUMN public.store_receipt_patterns.priority IS 'Higher priority patterns are checked first.';
COMMENT ON COLUMN public.store_receipt_patterns.is_active IS 'Whether this pattern is currently in use.';
-- Indexes for store_receipt_patterns
CREATE INDEX IF NOT EXISTS idx_store_receipt_patterns_store_id ON public.store_receipt_patterns(store_id);
CREATE INDEX IF NOT EXISTS idx_store_receipt_patterns_active ON public.store_receipt_patterns(pattern_type, is_active, priority DESC)
WHERE is_active = TRUE;