diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 3b2acf83..3ecbb8fb 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -119,7 +119,9 @@ "mcp__localerrors__get_issue", "mcp__localerrors__get_event", "mcp__localerrors__list_teams", - "WebSearch" + "WebSearch", + "Bash(for trigger in update_price_history_on_flyer_item_insert update_recipe_rating_aggregates log_new_recipe log_new_flyer)", + "Bash(do echo \"=== $trigger ===\")" ] }, "enabledMcpjsonServers": [ diff --git a/docs/adr/0050-postgresql-function-observability.md b/docs/adr/0050-postgresql-function-observability.md index aab976d4..d26df29b 100644 --- a/docs/adr/0050-postgresql-function-observability.md +++ b/docs/adr/0050-postgresql-function-observability.md @@ -2,7 +2,7 @@ **Date**: 2026-01-11 -**Status**: Proposed +**Status**: Accepted (Fully Implemented) **Related**: [ADR-015](0015-application-performance-monitoring-and-error-tracking.md), [ADR-004](0004-standardized-application-wide-structured-logging.md) diff --git a/docs/adr/0052-granular-debug-logging-strategy.md b/docs/adr/0052-granular-debug-logging-strategy.md index 8699e940..7eb9a8e9 100644 --- a/docs/adr/0052-granular-debug-logging-strategy.md +++ b/docs/adr/0052-granular-debug-logging-strategy.md @@ -2,7 +2,9 @@ **Date**: 2026-01-11 -**Status**: Proposed +**Status**: Accepted (Fully Implemented) + +**Related**: [ADR-004](0004-standardized-application-wide-structured-logging.md) ## Context @@ -17,7 +19,9 @@ We will adopt a namespace-based debug filter pattern, similar to the `debug` npm ## Implementation -In `src/services/logger.server.ts`: +### Core Implementation (Completed 2026-01-11) + +Implemented in [src/services/logger.server.ts:140-150](src/services/logger.server.ts#L140-L150): ```typescript const debugModules = (process.env.DEBUG_MODULES || '').split(',').map((s) => s.trim()); @@ -33,10 +37,100 @@ export const createScopedLogger = (moduleName: string) => { }; ``` +### Adopted Services 
(Completed 2026-01-26) + +Services currently using `createScopedLogger`: + +- `ai-service` - AI/Gemini integration ([src/services/aiService.server.ts:1020](src/services/aiService.server.ts#L1020)) +- `flyer-processing-service` - Flyer upload and processing ([src/services/flyerProcessingService.server.ts:20](src/services/flyerProcessingService.server.ts#L20)) + ## Usage -To debug only AI and Database interactions: +### Enable Debug Logging for Specific Modules + +To debug only AI and flyer processing: ```bash -DEBUG_MODULES=ai-service,db-repo npm run dev +DEBUG_MODULES=ai-service,flyer-processing-service npm run dev ``` + +### Enable All Debug Logging + +Use wildcard to enable debug logging for all modules: + +```bash +DEBUG_MODULES=* npm run dev +``` + +### Common Module Names + +| Module Name | Purpose | File | +| -------------------------- | ---------------------------------------- | ----------------------------------------------- | +| `ai-service` | AI/Gemini API interactions | `src/services/aiService.server.ts` | +| `flyer-processing-service` | Flyer upload, validation, and processing | `src/services/flyerProcessingService.server.ts` | + +## Best Practices + +1. **Use Scoped Loggers for Long-Running Services**: Services with complex workflows or external API calls should use `createScopedLogger` to allow targeted debugging. + +2. **Use Child Loggers for Contextual Data**: Even within scoped loggers, create child loggers with job/request-specific context: + + ```typescript + const logger = createScopedLogger('my-service'); + + async function processJob(job: Job) { + const jobLogger = logger.child({ jobId: job.id, jobName: job.name }); + jobLogger.debug('Starting job processing'); + } + ``` + +3. **Module Naming Convention**: Use kebab-case suffixed with `-service` or `-worker` (e.g., `ai-service`, `email-worker`). + +4. 
**Production Usage**: `DEBUG_MODULES` can be set in production for temporary debugging, but should not be used continuously due to increased log volume. + +## Examples + +### Development Debugging + +Debug AI service issues during development: + +```bash +# Dev container +DEBUG_MODULES=ai-service npm run dev + +# Or via PM2 +DEBUG_MODULES=ai-service pm2 restart flyer-crawler-api-dev +``` + +### Production Troubleshooting + +Temporarily enable debug logging for a specific subsystem: + +```bash +# SSH into production server +ssh root@projectium.com + +# Set environment variable and restart +DEBUG_MODULES=ai-service pm2 restart flyer-crawler-api + +# View logs +pm2 logs flyer-crawler-api --lines 100 + +# Disable debug logging +pm2 unset DEBUG_MODULES flyer-crawler-api +pm2 restart flyer-crawler-api +``` + +## Consequences + +**Positive**: + +- Developers can inspect detailed logs for specific subsystems without log flooding +- Production debugging becomes more targeted and efficient +- No performance impact when debug logging is disabled +- Compatible with existing Pino logging infrastructure + +**Negative**: + +- Requires developers to know module names (mitigated by documentation above) +- Not all services have adopted scoped loggers yet (gradual migration) diff --git a/docs/adr/0053-worker-health-checks.md b/docs/adr/0053-worker-health-checks.md index 13034e6d..6918978e 100644 --- a/docs/adr/0053-worker-health-checks.md +++ b/docs/adr/0053-worker-health-checks.md @@ -2,7 +2,14 @@ **Date**: 2026-01-11 -**Status**: Proposed +**Status**: Accepted (Fully Implemented) + +**Implementation Status**: + +- ✅ BullMQ worker stall configuration (complete) +- ✅ Basic health endpoints (/live, /ready, /redis, etc.) +- ✅ /health/queues endpoint (complete) +- ✅ Worker heartbeat mechanism (complete) ## Context @@ -60,3 +67,76 @@ The `/health/queues` endpoint will: **Negative**: - Requires configuring external monitoring to poll the new endpoint. 
+ +## Implementation Notes + +### Completed (2026-01-11) + +1. **BullMQ Stall Configuration** - `src/config/workerOptions.ts` + - All workers use `defaultWorkerOptions` with: + - `stalledInterval: 30000` (30s) + - `maxStalledCount: 3` + - `lockDuration: 30000` (30s) + - Applied to all 9 workers: flyer, email, analytics, cleanup, weekly-analytics, token-cleanup, receipt, expiry-alert, barcode + +2. **Basic Health Endpoints** - `src/routes/health.routes.ts` + - `/health/live` - Liveness probe + - `/health/ready` - Readiness probe (checks DB, Redis, storage) + - `/health/startup` - Startup probe + - `/health/redis` - Redis connectivity + - `/health/db-pool` - Database connection pool status + +### Implementation Completed (2026-01-26) + +1. **`/health/queues` Endpoint** ✅ + - Added route to `src/routes/health.routes.ts:511-674` + - Iterates through all 9 queues from `src/services/queues.server.ts` + - Fetches job counts using BullMQ Queue API: `getJobCounts()` + - Returns structured response including both queue metrics and worker heartbeats: + + ```typescript + { + status: 'healthy' | 'unhealthy', + timestamp: string, + queues: { + [queueName]: { + waiting: number, + active: number, + failed: number, + delayed: number + } + }, + workers: { + [workerName]: { + alive: boolean, + lastSeen?: string, + pid?: number, + host?: string + } + } + } + ``` + + - Returns 200 OK if all healthy, 503 if any queue/worker unavailable + - Full OpenAPI documentation included + +2. **Worker Heartbeat Mechanism** ✅ + - Added `updateWorkerHeartbeat()` and `startWorkerHeartbeat()` in `src/services/workers.server.ts:100-149` + - Key pattern: `worker:heartbeat:` + - Stores: `{ timestamp: ISO8601, pid: number, host: string }` + - Updates every 30s with 90s TTL + - Integrated with `/health/queues` endpoint (checks if heartbeat < 60s old) + - Heartbeat intervals properly cleaned up in `closeWorkers()` and `gracefulShutdown()` + +3. 
**Comprehensive Tests** ✅ + - Added 5 test cases in `src/routes/health.routes.test.ts:623-858` + - Tests cover: healthy state, queue failures, stale heartbeats, missing heartbeats, Redis errors + - All tests follow existing patterns with proper mocking + +### Future Enhancements (Not Implemented) + +1. **Queue Depth Alerting** (Low Priority) + - Add configurable thresholds per queue type + - Return 500 if `waiting` count exceeds threshold for extended period + - Consider using Redis for storing threshold breach timestamps + - **Estimate**: 1-2 hours diff --git a/docs/adr/0023-database-normalization-and-referential-integrity.md b/docs/adr/0055-database-normalization-and-referential-integrity.md similarity index 99% rename from docs/adr/0023-database-normalization-and-referential-integrity.md rename to docs/adr/0055-database-normalization-and-referential-integrity.md index 652e7b03..6520204a 100644 --- a/docs/adr/0023-database-normalization-and-referential-integrity.md +++ b/docs/adr/0055-database-normalization-and-referential-integrity.md @@ -1,4 +1,4 @@ -# ADR-023: Database Normalization and Referential Integrity +# ADR-055: Database Normalization and Referential Integrity **Date:** 2026-01-19 **Status:** Accepted diff --git a/docs/adr/adr-implementation-tracker.md b/docs/adr/adr-implementation-tracker.md index dfe0710d..c330c9fe 100644 --- a/docs/adr/adr-implementation-tracker.md +++ b/docs/adr/adr-implementation-tracker.md @@ -15,8 +15,8 @@ This document tracks the implementation status and estimated effort for all Arch | Status | Count | | ---------------------------- | ----- | -| Accepted (Fully Implemented) | 30 | -| Partially Implemented | 2 | +| Accepted (Fully Implemented) | 36 | +| Partially Implemented | 3 | | Proposed (Not Started) | 16 | --- @@ -62,25 +62,30 @@ This document tracks the implementation status and estimated effort for all Arch | [ADR-029](./0029-secret-rotation-and-key-management.md) | Secret Rotation | Proposed | L | Infrastructure 
changes needed | | [ADR-032](./0032-rate-limiting-strategy.md) | Rate Limiting | Accepted | - | Fully implemented | | [ADR-033](./0033-file-upload-and-storage-strategy.md) | File Upload & Storage | Accepted | - | Fully implemented | +| [ADR-048](./0048-authentication-strategy.md) | Authentication | Partial | M | JWT done, OAuth pending | ### Category 5: Observability & Monitoring -| ADR | Title | Status | Effort | Notes | -| -------------------------------------------------------------------------- | --------------------------- | -------- | ------ | --------------------------------- | -| [ADR-004](./0004-standardized-application-wide-structured-logging.md) | Structured Logging | Accepted | - | Fully implemented | -| [ADR-015](./0015-application-performance-monitoring-and-error-tracking.md) | APM & Error Tracking | Proposed | M | Third-party integration | -| [ADR-050](./0050-postgresql-function-observability.md) | PostgreSQL Fn Observability | Proposed | M | Depends on ADR-015 implementation | +| ADR | Title | Status | Effort | Notes | +| -------------------------------------------------------------------------- | --------------------------- | -------- | ------ | ----------------------- | +| [ADR-004](./0004-standardized-application-wide-structured-logging.md) | Structured Logging | Accepted | - | Fully implemented | +| [ADR-015](./0015-application-performance-monitoring-and-error-tracking.md) | APM & Error Tracking | Proposed | M | Third-party integration | +| [ADR-050](./0050-postgresql-function-observability.md) | PostgreSQL Fn Observability | Accepted | - | Fully implemented | +| [ADR-051](./0051-asynchronous-context-propagation.md) | Context Propagation | Accepted | - | Fully implemented | +| [ADR-052](./0052-granular-debug-logging-strategy.md) | Granular Debug Logging | Accepted | - | Fully implemented | ### Category 6: Deployment & Operations -| ADR | Title | Status | Effort | Notes | -| -------------------------------------------------------------- | 
----------------- | -------- | ------ | -------------------------- | -| [ADR-006](./0006-background-job-processing-and-task-queues.md) | Background Jobs | Accepted | - | Fully implemented | -| [ADR-014](./0014-containerization-and-deployment-strategy.md) | Containerization | Partial | M | Docker done, K8s pending | -| [ADR-017](./0017-ci-cd-and-branching-strategy.md) | CI/CD & Branching | Accepted | - | Fully implemented | -| [ADR-024](./0024-feature-flagging-strategy.md) | Feature Flags | Proposed | M | New service/library needed | -| [ADR-037](./0037-scheduled-jobs-and-cron-pattern.md) | Scheduled Jobs | Accepted | - | Fully implemented | -| [ADR-038](./0038-graceful-shutdown-pattern.md) | Graceful Shutdown | Accepted | - | Fully implemented | +| ADR | Title | Status | Effort | Notes | +| -------------------------------------------------------------- | ------------------ | -------- | ------ | -------------------------- | +| [ADR-006](./0006-background-job-processing-and-task-queues.md) | Background Jobs | Accepted | - | Fully implemented | +| [ADR-014](./0014-containerization-and-deployment-strategy.md) | Containerization | Partial | M | Docker done, K8s pending | +| [ADR-017](./0017-ci-cd-and-branching-strategy.md) | CI/CD & Branching | Accepted | - | Fully implemented | +| [ADR-024](./0024-feature-flagging-strategy.md) | Feature Flags | Proposed | M | New service/library needed | +| [ADR-037](./0037-scheduled-jobs-and-cron-pattern.md) | Scheduled Jobs | Accepted | - | Fully implemented | +| [ADR-038](./0038-graceful-shutdown-pattern.md) | Graceful Shutdown | Accepted | - | Fully implemented | +| [ADR-053](./0053-worker-health-checks.md) | Worker Health | Accepted | - | Fully implemented | +| [ADR-054](./0054-bugsink-gitea-issue-sync.md) | Bugsink-Gitea Sync | Proposed | L | Automated issue creation | ### Category 7: Frontend / User Interface @@ -99,22 +104,24 @@ This document tracks the implementation status and estimated effort for all Arch | 
[ADR-010](./0010-testing-strategy-and-standards.md) | Testing Strategy | Accepted | - | Fully implemented | | [ADR-021](./0021-code-formatting-and-linting-unification.md) | Formatting & Linting | Accepted | - | Fully implemented | | [ADR-027](./0027-standardized-naming-convention-for-ai-and-database-types.md) | Naming Conventions | Accepted | - | Fully implemented | +| [ADR-040](./0040-testing-economics-and-priorities.md) | Testing Economics | Accepted | - | Fully implemented | | [ADR-045](./0045-test-data-factories-and-fixtures.md) | Test Data Factories | Accepted | - | Fully implemented | | [ADR-047](./0047-project-file-and-folder-organization.md) | Project Organization | Proposed | XL | Major reorganization | ### Category 9: Architecture Patterns -| ADR | Title | Status | Effort | Notes | -| -------------------------------------------------------- | --------------------- | -------- | ------ | ----------------- | -| [ADR-034](./0034-repository-pattern-standards.md) | Repository Pattern | Accepted | - | Fully implemented | -| [ADR-035](./0035-service-layer-architecture.md) | Service Layer | Accepted | - | Fully implemented | -| [ADR-036](./0036-event-bus-and-pub-sub-pattern.md) | Event Bus | Accepted | - | Fully implemented | -| [ADR-039](./0039-dependency-injection-pattern.md) | Dependency Injection | Accepted | - | Fully implemented | -| [ADR-041](./0041-ai-gemini-integration-architecture.md) | AI/Gemini Integration | Accepted | - | Fully implemented | -| [ADR-042](./0042-email-and-notification-architecture.md) | Email & Notifications | Accepted | - | Fully implemented | -| [ADR-043](./0043-express-middleware-pipeline.md) | Middleware Pipeline | Accepted | - | Fully implemented | -| [ADR-046](./0046-image-processing-pipeline.md) | Image Processing | Accepted | - | Fully implemented | -| [ADR-049](./0049-gamification-and-achievement-system.md) | Gamification System | Accepted | - | Fully implemented | +| ADR | Title | Status | Effort | Notes | +| 
--------------------------------------------------------------------- | --------------------- | -------- | ------ | ------------------------- | +| [ADR-034](./0034-repository-pattern-standards.md) | Repository Pattern | Accepted | - | Fully implemented | +| [ADR-035](./0035-service-layer-architecture.md) | Service Layer | Accepted | - | Fully implemented | +| [ADR-036](./0036-event-bus-and-pub-sub-pattern.md) | Event Bus | Accepted | - | Fully implemented | +| [ADR-039](./0039-dependency-injection-pattern.md) | Dependency Injection | Accepted | - | Fully implemented | +| [ADR-041](./0041-ai-gemini-integration-architecture.md) | AI/Gemini Integration | Accepted | - | Fully implemented | +| [ADR-042](./0042-email-and-notification-architecture.md) | Email & Notifications | Accepted | - | Fully implemented | +| [ADR-043](./0043-express-middleware-pipeline.md) | Middleware Pipeline | Accepted | - | Fully implemented | +| [ADR-046](./0046-image-processing-pipeline.md) | Image Processing | Accepted | - | Fully implemented | +| [ADR-049](./0049-gamification-and-achievement-system.md) | Gamification System | Accepted | - | Fully implemented | +| [ADR-055](./0055-database-normalization-and-referential-integrity.md) | DB Normalization | Accepted | M | API uses IDs, not strings | --- @@ -122,38 +129,48 @@ This document tracks the implementation status and estimated effort for all Arch These ADRs are proposed but not yet implemented, ordered by suggested implementation priority: -| Priority | ADR | Title | Effort | Rationale | -| -------- | ------- | --------------------------- | ------ | ------------------------------------------------- | -| 1 | ADR-015 | APM & Error Tracking | M | Production visibility, debugging | -| 1b | ADR-050 | PostgreSQL Fn Observability | M | Database function visibility (depends on ADR-015) | -| 2 | ADR-024 | Feature Flags | M | Safer deployments, A/B testing | -| 3 | ADR-023 | Schema Migrations v2 | L | Database evolution support | -| 4 | ADR-029 | 
Secret Rotation | L | Security improvement | -| 5 | ADR-008 | API Versioning | L | Future API evolution | -| 6 | ADR-030 | Circuit Breaker | L | Resilience improvement | -| 7 | ADR-022 | Real-time Notifications | XL | Major feature enhancement | -| 8 | ADR-011 | Authorization & RBAC | XL | Advanced permission system | -| 9 | ADR-025 | i18n & l10n | XL | Multi-language support | -| 10 | ADR-031 | Data Retention & Privacy | XL | Compliance requirements | +| Priority | ADR | Title | Effort | Rationale | +| -------- | ------- | ------------------------ | ------ | ------------------------------------ | +| 1 | ADR-015 | APM & Error Tracking | M | Production visibility, debugging | +| 2 | ADR-024 | Feature Flags | M | Safer deployments, A/B testing | +| 3 | ADR-054 | Bugsink-Gitea Sync | L | Automated issue tracking from errors | +| 4 | ADR-023 | Schema Migrations v2 | L | Database evolution support | +| 5 | ADR-029 | Secret Rotation | L | Security improvement | +| 6 | ADR-008 | API Versioning | L | Future API evolution | +| 7 | ADR-030 | Circuit Breaker | L | Resilience improvement | +| 8 | ADR-022 | Real-time Notifications | XL | Major feature enhancement | +| 9 | ADR-011 | Authorization & RBAC | XL | Advanced permission system | +| 10 | ADR-025 | i18n & l10n | XL | Multi-language support | +| 11 | ADR-031 | Data Retention & Privacy | XL | Compliance requirements | --- ## Recent Implementation History -| Date | ADR | Change | -| ---------- | ------- | ---------------------------------------------------------------------- | -| 2026-01-11 | ADR-050 | Created - PostgreSQL function observability with fn_log() and Logstash | -| 2026-01-11 | ADR-018 | Implemented - OpenAPI/Swagger documentation at /docs/api-docs | -| 2026-01-11 | ADR-049 | Created - Gamification system, achievements, and testing requirements | -| 2026-01-09 | ADR-047 | Created - Project file/folder organization with migration plan | -| 2026-01-09 | ADR-041 | Created - AI/Gemini integration with model fallback 
and rate limiting | -| 2026-01-09 | ADR-042 | Created - Email and notification architecture with BullMQ queuing | -| 2026-01-09 | ADR-043 | Created - Express middleware pipeline ordering and patterns | -| 2026-01-09 | ADR-044 | Created - Frontend feature-based folder organization | -| 2026-01-09 | ADR-045 | Created - Test data factory pattern for mock generation | -| 2026-01-09 | ADR-046 | Created - Image processing pipeline with Sharp and EXIF stripping | -| 2026-01-09 | ADR-026 | Fully implemented - client-side structured logger | -| 2026-01-09 | ADR-028 | Fully implemented - all routes, middleware, and tests updated | +| Date | ADR | Change | +| ---------- | ------- | ------------------------------------------------------------------------------------ | +| 2026-01-26 | ADR-053 | Marked as fully implemented - /health/queues endpoint and worker heartbeats complete | +| 2026-01-26 | ADR-050 | Marked as fully implemented - PostgreSQL function observability complete | +| 2026-01-26 | ADR-055 | Created (renumbered from duplicate ADR-023) - Database normalization | +| 2026-01-26 | ADR-054 | Added to tracker - Bugsink to Gitea issue synchronization | +| 2026-01-26 | ADR-053 | Added to tracker - Worker health checks and monitoring | +| 2026-01-26 | ADR-052 | Added to tracker - Granular debug logging strategy | +| 2026-01-26 | ADR-051 | Added to tracker - Asynchronous context propagation | +| 2026-01-26 | ADR-048 | Added to tracker - Authentication strategy | +| 2026-01-26 | ADR-040 | Added to tracker - Testing economics and priorities | +| 2026-01-17 | ADR-054 | Created - Bugsink-Gitea sync worker proposal | +| 2026-01-11 | ADR-050 | Created - PostgreSQL function observability with fn_log() and Logstash | +| 2026-01-11 | ADR-018 | Implemented - OpenAPI/Swagger documentation at /docs/api-docs | +| 2026-01-11 | ADR-049 | Created - Gamification system, achievements, and testing requirements | +| 2026-01-09 | ADR-047 | Created - Project file/folder organization with 
migration plan | +| 2026-01-09 | ADR-041 | Created - AI/Gemini integration with model fallback and rate limiting | +| 2026-01-09 | ADR-042 | Created - Email and notification architecture with BullMQ queuing | +| 2026-01-09 | ADR-043 | Created - Express middleware pipeline ordering and patterns | +| 2026-01-09 | ADR-044 | Created - Frontend feature-based folder organization | +| 2026-01-09 | ADR-045 | Created - Test data factory pattern for mock generation | +| 2026-01-09 | ADR-046 | Created - Image processing pipeline with Sharp and EXIF stripping | +| 2026-01-09 | ADR-026 | Fully implemented - client-side structured logger | +| 2026-01-09 | ADR-028 | Fully implemented - all routes, middleware, and tests updated | --- diff --git a/docs/adr/index.md b/docs/adr/index.md index 943fb073..0bb6d3ca 100644 --- a/docs/adr/index.md +++ b/docs/adr/index.md @@ -21,7 +21,7 @@ This directory contains a log of the architectural decisions made for the Flyer **[ADR-003](./0003-standardized-input-validation-using-middleware.md)**: Standardized Input Validation using Middleware (Accepted) **[ADR-008](./0008-api-versioning-strategy.md)**: API Versioning Strategy (Proposed) -**[ADR-018](./0018-api-documentation-strategy.md)**: API Documentation Strategy (Proposed) +**[ADR-018](./0018-api-documentation-strategy.md)**: API Documentation Strategy (Accepted) **[ADR-022](./0022-real-time-notification-system.md)**: Real-time Notification System (Proposed) **[ADR-028](./0028-api-response-standardization.md)**: API Response Standardization and Envelope Pattern (Implemented) @@ -39,6 +39,9 @@ This directory contains a log of the architectural decisions made for the Flyer **[ADR-004](./0004-standardized-application-wide-structured-logging.md)**: Standardized Application-Wide Structured Logging (Accepted) **[ADR-015](./0015-application-performance-monitoring-and-error-tracking.md)**: Application Performance Monitoring (APM) and Error Tracking (Proposed) 
+**[ADR-050](./0050-postgresql-function-observability.md)**: PostgreSQL Function Observability (Accepted)
+**[ADR-051](./0051-asynchronous-context-propagation.md)**: Asynchronous Context Propagation (Accepted)
+**[ADR-052](./0052-granular-debug-logging-strategy.md)**: Granular Debug Logging Strategy (Accepted)
Development Workflow & Quality @@ -76,3 +81,5 @@ This directory contains a log of the architectural decisions made for the Flyer **[ADR-042](./0042-email-and-notification-architecture.md)**: Email and Notification Architecture (Accepted) **[ADR-043](./0043-express-middleware-pipeline.md)**: Express Middleware Pipeline Architecture (Accepted) **[ADR-046](./0046-image-processing-pipeline.md)**: Image Processing Pipeline (Accepted) +**[ADR-049](./0049-gamification-and-achievement-system.md)**: Gamification and Achievement System (Accepted) +**[ADR-055](./0055-database-normalization-and-referential-integrity.md)**: Database Normalization and Referential Integrity (Accepted) diff --git a/src/routes/health.routes.test.ts b/src/routes/health.routes.test.ts index f2cb5524..749bd464 100644 --- a/src/routes/health.routes.test.ts +++ b/src/routes/health.routes.test.ts @@ -619,4 +619,240 @@ describe('Health Routes (/api/health)', () => { expect(response.body.error.details.database.message).toBe('Database connection failed'); }); }); + + // ============================================================================= + // QUEUE HEALTH MONITORING (ADR-053) + // ============================================================================= + + describe('GET /queues', () => { + // Mock the queues module + beforeEach(async () => { + vi.resetModules(); + // Re-import after mocks are set up + }); + + it('should return 200 OK with queue metrics and worker heartbeats when all healthy', async () => { + // Arrange: Mock queue getJobCounts() and Redis heartbeats + const mockQueues = await import('../services/queues.server'); + const mockQueue = { + getJobCounts: vi.fn().mockResolvedValue({ + waiting: 5, + active: 2, + failed: 1, + delayed: 0, + }), + }; + + // Mock all queues + vi.spyOn(mockQueues, 'flyerQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'emailQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'analyticsQueue', 
'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'weeklyAnalyticsQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'cleanupQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'tokenCleanupQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'receiptQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'expiryAlertQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'barcodeQueue', 'get').mockReturnValue(mockQueue as never); + + // Mock Redis heartbeat responses (all healthy, last seen < 60s ago) + const recentTimestamp = new Date(Date.now() - 10000).toISOString(); // 10 seconds ago + const heartbeatValue = JSON.stringify({ + timestamp: recentTimestamp, + pid: 1234, + host: 'test-host', + }); + + mockedRedisConnection.get = vi.fn().mockResolvedValue(heartbeatValue); + + // Act + const response = await supertest(app).get('/api/health/queues'); + + // Assert + expect(response.status).toBe(200); + expect(response.body.success).toBe(true); + expect(response.body.data.status).toBe('healthy'); + expect(response.body.data.queues).toBeDefined(); + expect(response.body.data.workers).toBeDefined(); + + // Verify queue metrics structure + expect(response.body.data.queues['flyer-processing']).toEqual({ + waiting: 5, + active: 2, + failed: 1, + delayed: 0, + }); + + // Verify worker heartbeat structure + expect(response.body.data.workers['flyer-processing']).toEqual({ + alive: true, + lastSeen: recentTimestamp, + pid: 1234, + host: 'test-host', + }); + }); + + it('should return 503 when a queue is unavailable', async () => { + // Arrange: Mock one queue to fail + const mockQueues = await import('../services/queues.server'); + const healthyQueue = { + getJobCounts: vi.fn().mockResolvedValue({ + waiting: 0, + active: 0, + failed: 0, + delayed: 0, + }), + }; + const failingQueue = { + getJobCounts: vi.fn().mockRejectedValue(new Error('Redis connection 
lost')), + }; + + vi.spyOn(mockQueues, 'flyerQueue', 'get').mockReturnValue(failingQueue as never); + vi.spyOn(mockQueues, 'emailQueue', 'get').mockReturnValue(healthyQueue as never); + vi.spyOn(mockQueues, 'analyticsQueue', 'get').mockReturnValue(healthyQueue as never); + vi.spyOn(mockQueues, 'weeklyAnalyticsQueue', 'get').mockReturnValue(healthyQueue as never); + vi.spyOn(mockQueues, 'cleanupQueue', 'get').mockReturnValue(healthyQueue as never); + vi.spyOn(mockQueues, 'tokenCleanupQueue', 'get').mockReturnValue(healthyQueue as never); + vi.spyOn(mockQueues, 'receiptQueue', 'get').mockReturnValue(healthyQueue as never); + vi.spyOn(mockQueues, 'expiryAlertQueue', 'get').mockReturnValue(healthyQueue as never); + vi.spyOn(mockQueues, 'barcodeQueue', 'get').mockReturnValue(healthyQueue as never); + + mockedRedisConnection.get = vi.fn().mockResolvedValue(null); + + // Act + const response = await supertest(app).get('/api/health/queues'); + + // Assert + expect(response.status).toBe(503); + expect(response.body.success).toBe(false); + expect(response.body.error.message).toBe('One or more queues or workers unavailable'); + expect(response.body.error.details.status).toBe('unhealthy'); + expect(response.body.error.details.queues['flyer-processing']).toEqual({ + error: 'Redis connection lost', + }); + }); + + it('should return 503 when a worker heartbeat is stale', async () => { + // Arrange: Mock queues as healthy but one worker heartbeat as stale + const mockQueues = await import('../services/queues.server'); + const mockQueue = { + getJobCounts: vi.fn().mockResolvedValue({ + waiting: 0, + active: 0, + failed: 0, + delayed: 0, + }), + }; + + vi.spyOn(mockQueues, 'flyerQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'emailQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'analyticsQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'weeklyAnalyticsQueue', 'get').mockReturnValue(mockQueue as never); 
+ vi.spyOn(mockQueues, 'cleanupQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'tokenCleanupQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'receiptQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'expiryAlertQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'barcodeQueue', 'get').mockReturnValue(mockQueue as never); + + // Mock heartbeat - one worker is stale (> 60s ago) + const staleTimestamp = new Date(Date.now() - 120000).toISOString(); // 120 seconds ago + const staleHeartbeat = JSON.stringify({ + timestamp: staleTimestamp, + pid: 1234, + host: 'test-host', + }); + + // First call returns stale heartbeat for flyer-processing, rest return null (no heartbeat) + let callCount = 0; + mockedRedisConnection.get = vi.fn().mockImplementation(() => { + callCount++; + return Promise.resolve(callCount === 1 ? staleHeartbeat : null); + }); + + // Act + const response = await supertest(app).get('/api/health/queues'); + + // Assert + expect(response.status).toBe(503); + expect(response.body.success).toBe(false); + expect(response.body.error.details.status).toBe('unhealthy'); + expect(response.body.error.details.workers['flyer-processing']).toEqual({ alive: false }); + }); + + it('should return 503 when worker heartbeat is missing', async () => { + // Arrange: Mock queues as healthy but no worker heartbeats in Redis + const mockQueues = await import('../services/queues.server'); + const mockQueue = { + getJobCounts: vi.fn().mockResolvedValue({ + waiting: 0, + active: 0, + failed: 0, + delayed: 0, + }), + }; + + vi.spyOn(mockQueues, 'flyerQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'emailQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'analyticsQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'weeklyAnalyticsQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 
'cleanupQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'tokenCleanupQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'receiptQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'expiryAlertQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'barcodeQueue', 'get').mockReturnValue(mockQueue as never); + + // Mock Redis to return null (no heartbeat found) + mockedRedisConnection.get = vi.fn().mockResolvedValue(null); + + // Act + const response = await supertest(app).get('/api/health/queues'); + + // Assert + expect(response.status).toBe(503); + expect(response.body.success).toBe(false); + expect(response.body.error.details.status).toBe('unhealthy'); + expect(response.body.error.details.workers['flyer-processing']).toEqual({ alive: false }); + }); + + it('should handle Redis connection errors gracefully', async () => { + // Arrange: Mock queues to succeed but Redis get() to fail + const mockQueues = await import('../services/queues.server'); + const mockQueue = { + getJobCounts: vi.fn().mockResolvedValue({ + waiting: 0, + active: 0, + failed: 0, + delayed: 0, + }), + }; + + vi.spyOn(mockQueues, 'flyerQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'emailQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'analyticsQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'weeklyAnalyticsQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'cleanupQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'tokenCleanupQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'receiptQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'expiryAlertQueue', 'get').mockReturnValue(mockQueue as never); + vi.spyOn(mockQueues, 'barcodeQueue', 'get').mockReturnValue(mockQueue as never); + + // Mock Redis get() to throw error + 
mockedRedisConnection.get = vi.fn().mockRejectedValue(new Error('Redis connection lost')); + + // Act + const response = await supertest(app).get('/api/health/queues'); + + // Assert: Should still return queue metrics but mark workers as unhealthy + expect(response.status).toBe(503); + expect(response.body.error.details.queues['flyer-processing']).toEqual({ + waiting: 0, + active: 0, + failed: 0, + delayed: 0, + }); + expect(response.body.error.details.workers['flyer-processing']).toEqual({ + alive: false, + error: 'Redis connection lost', + }); + }); + }); }); diff --git a/src/routes/health.routes.ts b/src/routes/health.routes.ts index bffe0213..146ff5f8 100644 --- a/src/routes/health.routes.ts +++ b/src/routes/health.routes.ts @@ -15,6 +15,17 @@ import fs from 'node:fs/promises'; import { getSimpleWeekAndYear } from '../utils/dateUtils'; import { validateRequest } from '../middleware/validation.middleware'; import { sendSuccess, sendError, ErrorCode } from '../utils/apiResponse'; +import { + flyerQueue, + emailQueue, + analyticsQueue, + weeklyAnalyticsQueue, + cleanupQueue, + tokenCleanupQueue, + receiptQueue, + expiryAlertQueue, + barcodeQueue, +} from '../services/queues.server'; const router = Router(); @@ -442,4 +453,224 @@ router.get( }, ); +// ============================================================================= +// QUEUE HEALTH MONITORING (ADR-053) +// ============================================================================= + +/** + * @openapi + * /health/queues: + * get: + * summary: Queue health and metrics with worker heartbeats + * description: | + * Returns job counts for all BullMQ queues and worker heartbeat status. + * Use this endpoint to monitor queue depths and detect stuck/frozen workers. + * Implements ADR-053: Worker Health Checks and Stalled Job Monitoring. 
+ * tags: + * - Health + * responses: + * 200: + * description: Queue metrics and worker heartbeats retrieved successfully + * content: + * application/json: + * schema: + * type: object + * properties: + * success: + * type: boolean + * example: true + * data: + * type: object + * properties: + * status: + * type: string + * enum: [healthy, unhealthy] + * timestamp: + * type: string + * format: date-time + * queues: + * type: object + * additionalProperties: + * type: object + * properties: + * waiting: + * type: integer + * active: + * type: integer + * failed: + * type: integer + * delayed: + * type: integer + * workers: + * type: object + * additionalProperties: + * type: object + * properties: + * alive: + * type: boolean + * lastSeen: + * type: string + * format: date-time + * pid: + * type: integer + * host: + * type: string + * 503: + * description: Redis unavailable or workers not responding + * content: + * application/json: + * schema: + * $ref: '#/components/schemas/ErrorResponse' + */ +router.get( + '/queues', + validateRequest(emptySchema), + async (req: Request, res: Response, next: NextFunction) => { + try { + // Define all queues to monitor + const queues = [ + { name: 'flyer-processing', queue: flyerQueue }, + { name: 'email-sending', queue: emailQueue }, + { name: 'analytics-reporting', queue: analyticsQueue }, + { name: 'weekly-analytics-reporting', queue: weeklyAnalyticsQueue }, + { name: 'file-cleanup', queue: cleanupQueue }, + { name: 'token-cleanup', queue: tokenCleanupQueue }, + { name: 'receipt-processing', queue: receiptQueue }, + { name: 'expiry-alerts', queue: expiryAlertQueue }, + { name: 'barcode-detection', queue: barcodeQueue }, + ]; + + // Fetch job counts for all queues in parallel + const queueMetrics = await Promise.all( + queues.map(async ({ name, queue }) => { + try { + const counts = await queue.getJobCounts(); + return { + name, + counts: { + waiting: counts.waiting || 0, + active: counts.active || 0, + failed: counts.failed 
|| 0, + delayed: counts.delayed || 0, + }, + }; + } catch (error) { + // If individual queue fails, return error state + return { + name, + error: error instanceof Error ? error.message : 'Unknown error', + }; + } + }), + ); + + // Fetch worker heartbeats in parallel + const workerNames = queues.map((q) => q.name); + const workerHeartbeats = await Promise.all( + workerNames.map(async (name) => { + try { + const key = `worker:heartbeat:${name}`; + const value = await redisConnection.get(key); + + if (!value) { + return { name, alive: false }; + } + + const heartbeat = JSON.parse(value) as { + timestamp: string; + pid: number; + host: string; + }; + const lastSeenMs = new Date(heartbeat.timestamp).getTime(); + const nowMs = Date.now(); + const ageSeconds = (nowMs - lastSeenMs) / 1000; + + // Consider alive if last heartbeat < 60 seconds ago + const alive = ageSeconds < 60; + + return { + name, + alive, + lastSeen: heartbeat.timestamp, + pid: heartbeat.pid, + host: heartbeat.host, + }; + } catch (error) { + // If heartbeat check fails, mark as unknown + return { + name, + alive: false, + error: error instanceof Error ? 
error.message : 'Unknown error',
+            };
+          }
+        }),
+      );
+
+      // Build response objects
+      const queuesData: Record<
+        string,
+        { waiting: number; active: number; failed: number; delayed: number } | { error: string }
+      > = {};
+      const workersData: Record<
+        string,
+        | { alive: boolean; lastSeen?: string; pid?: number; host?: string }
+        | { alive: boolean; error: string }
+      > = {};
+      let hasErrors = false;
+
+      for (const metric of queueMetrics) {
+        if ('error' in metric) {
+          queuesData[metric.name] = { error: metric.error };
+          hasErrors = true;
+        } else {
+          queuesData[metric.name] = metric.counts;
+        }
+      }
+
+      for (const heartbeat of workerHeartbeats) {
+        if ('error' in heartbeat) {
+          workersData[heartbeat.name] = { alive: false, error: heartbeat.error }; hasErrors = true;
+        } else if (!heartbeat.alive) {
+          workersData[heartbeat.name] = { alive: false };
+          hasErrors = true;
+        } else {
+          workersData[heartbeat.name] = {
+            alive: heartbeat.alive,
+            lastSeen: heartbeat.lastSeen,
+            pid: heartbeat.pid,
+            host: heartbeat.host,
+          };
+        }
+      }
+
+      const response = {
+        status: hasErrors ?
('unhealthy' as const) : ('healthy' as const), + timestamp: new Date().toISOString(), + queues: queuesData, + workers: workersData, + }; + + if (hasErrors) { + return sendError( + res, + ErrorCode.SERVICE_UNAVAILABLE, + 'One or more queues or workers unavailable', + 503, + response, + ); + } + + return sendSuccess(res, response); + } catch (error: unknown) { + // Redis connection error or other unexpected failure + if (error instanceof Error) { + return next(error); + } + const message = + (error as { message?: string })?.message || 'Failed to retrieve queue metrics'; + return next(new Error(message)); + } + }, +); + export default router; diff --git a/src/services/workers.server.ts b/src/services/workers.server.ts index 2be7cf59..ed7aaf7a 100644 --- a/src/services/workers.server.ts +++ b/src/services/workers.server.ts @@ -3,6 +3,7 @@ import { Worker, Job } from 'bullmq'; import fsPromises from 'node:fs/promises'; import { exec } from 'child_process'; import { promisify } from 'util'; +import os from 'os'; import { logger } from './logger.server'; import { connection } from './redis.server'; @@ -91,6 +92,45 @@ const createWorkerProcessor = (processor: (job: Job) => Promise) => }; }; +/** + * Updates the worker heartbeat in Redis. + * Stores timestamp, PID, and hostname to detect frozen/hung workers. + * TTL is 90s, so if heartbeat isn't updated for 90s, the key expires. + * Implements ADR-053: Worker Health Checks. + */ +const updateWorkerHeartbeat = async (workerName: string) => { + const key = `worker:heartbeat:${workerName}`; + const value = JSON.stringify({ + timestamp: new Date().toISOString(), + pid: process.pid, + host: os.hostname(), + }); + + try { + await connection.set(key, value, 'EX', 90); + } catch (error) { + logger.error({ err: error, workerName }, `Failed to update heartbeat for worker ${workerName}`); + } +}; + +/** + * Starts periodic heartbeat updates for a worker. + * Updates every 30 seconds with 90s TTL. 
+ */ +const startWorkerHeartbeat = (worker: Worker) => { + // Initial heartbeat + updateWorkerHeartbeat(worker.name); + + // Periodic heartbeat updates + const heartbeatInterval = setInterval(() => { + updateWorkerHeartbeat(worker.name); + }, 30000); // 30 seconds + + // Store interval on worker for cleanup + (worker as unknown as { heartbeatInterval?: NodeJS.Timeout }).heartbeatInterval = + heartbeatInterval; +}; + const attachWorkerEventListeners = (worker: Worker) => { worker.on('completed', (job: Job, returnValue: unknown) => { logger.info({ returnValue }, `[${worker.name}] Job ${job.id} completed successfully.`); @@ -102,6 +142,9 @@ const attachWorkerEventListeners = (worker: Worker) => { `[${worker.name}] Job ${job?.id} has ultimately failed after all attempts.`, ); }); + + // Start heartbeat monitoring for this worker + startWorkerHeartbeat(worker); }; export const flyerWorker = new Worker( @@ -219,17 +262,28 @@ const SHUTDOWN_TIMEOUT = 30000; // 30 seconds * without exiting the process. */ export const closeWorkers = async () => { - await Promise.all([ - flyerWorker.close(), - emailWorker.close(), - analyticsWorker.close(), - cleanupWorker.close(), - weeklyAnalyticsWorker.close(), - tokenCleanupWorker.close(), - receiptWorker.close(), - expiryAlertWorker.close(), - barcodeWorker.close(), - ]); + // Clear heartbeat intervals + const workers = [ + flyerWorker, + emailWorker, + analyticsWorker, + cleanupWorker, + weeklyAnalyticsWorker, + tokenCleanupWorker, + receiptWorker, + expiryAlertWorker, + barcodeWorker, + ]; + + workers.forEach((worker) => { + const interval = (worker as unknown as { heartbeatInterval?: NodeJS.Timeout }) + .heartbeatInterval; + if (interval) { + clearInterval(interval); + } + }); + + await Promise.all(workers.map((w) => w.close())); }; export const gracefulShutdown = async (signal: string) => {