Compare commits

...

13 Commits

Author SHA1 Message Date
Gitea Actions
e86ce51b6c ci: Bump version to 0.12.14 [skip ci] 2026-01-26 17:52:02 +05:00
840a7a62d3 adr work
Some checks failed
Deploy to Test Environment / deploy-to-test (push) Failing after 1m15s
2026-01-26 04:51:10 -08:00
5720820d95 adr-053 done 2026-01-26 04:51:09 -08:00
Gitea Actions
e5cdb54308 ci: Bump version to 0.12.13 [skip ci] 2026-01-24 02:48:50 +05:00
a3f212ff81 Primary Issue: TZ Environment Variable Breaking Tests
All checks were successful
Deploy to Test Environment / deploy-to-test (push) Successful in 18m47s
2026-01-23 13:40:48 -08:00
Gitea Actions
de263f74b0 ci: Bump version to 0.12.12 [skip ci] 2026-01-24 00:30:16 +05:00
a71e41302b no TZ in tests - who knew?
All checks were successful
Deploy to Test Environment / deploy-to-test (push) Successful in 18m35s
2026-01-23 11:28:45 -08:00
Gitea Actions
3575803252 ci: Bump version to 0.12.11 [skip ci] 2026-01-23 12:40:09 +05:00
d03900cefe set PST as common time zone for log matching ease
All checks were successful
Deploy to Test Environment / deploy-to-test (push) Successful in 19m4s
2026-01-22 23:38:45 -08:00
Gitea Actions
6d49639845 ci: Bump version to 0.12.10 [skip ci] 2026-01-23 10:59:29 +05:00
d4543cf4b9 Bugsink Fixes
All checks were successful
Deploy to Test Environment / deploy-to-test (push) Successful in 19m12s
2026-01-22 21:55:18 -08:00
Gitea Actions
4f08238698 ci: Bump version to 0.12.9 [skip ci] 2026-01-23 10:49:32 +05:00
38b35f87aa Bugsink Fixes
Some checks failed
Deploy to Test Environment / deploy-to-test (push) Has been cancelled
2026-01-22 21:48:32 -08:00
30 changed files with 1670 additions and 203 deletions

View File

@@ -117,7 +117,11 @@
"Bash(git -C \"C:\\\\Users\\\\games3\\\\.claude\\\\plugins\\\\marketplaces\\\\claude-plugins-official\" fetch --dry-run -v)",
"mcp__localerrors__get_project",
"mcp__localerrors__get_issue",
"mcp__localerrors__get_event"
"mcp__localerrors__get_event",
"mcp__localerrors__list_teams",
"WebSearch",
"Bash(for trigger in update_price_history_on_flyer_item_insert update_recipe_rating_aggregates log_new_recipe log_new_flyer)",
"Bash(do echo \"=== $trigger ===\")"
]
},
"enabledMcpjsonServers": [

View File

@@ -94,11 +94,18 @@ WORKER_LOCK_DURATION=120000
# Error Tracking (ADR-015)
# ===================
# Sentry-compatible error tracking via Bugsink (self-hosted)
# DSNs are created in Bugsink UI at http://localhost:8000 (dev) or your production URL
# Backend DSN - for Express/Node.js errors
SENTRY_DSN=
# Frontend DSN - for React/browser errors (uses VITE_ prefix)
VITE_SENTRY_DSN=
# DSNs are created in Bugsink UI at https://localhost:8443 (dev) or your production URL
#
# Dev container projects:
# - Project 1: Backend API (Dev) - receives Pino, PostgreSQL errors
# - Project 2: Frontend (Dev) - receives browser errors via Sentry SDK
# - Project 4: Infrastructure (Dev) - receives Redis, NGINX, Vite errors
#
# Backend DSN - for Express/Node.js errors (internal container URL)
SENTRY_DSN=http://<key>@localhost:8000/1
# Frontend DSN - for React/browser errors (uses nginx proxy for browser access)
# Note: The HTTPS-served frontend cannot post to http://localhost:8000 (blocked as mixed content), so we use the nginx proxy at /bugsink-api/
VITE_SENTRY_DSN=https://<key>@localhost/bugsink-api/2
# Environment name for error grouping (defaults to NODE_ENV)
SENTRY_ENVIRONMENT=development
VITE_SENTRY_ENVIRONMENT=development

View File

@@ -123,23 +123,30 @@ The dev container now matches production by using PM2 for process management.
### Log Aggregation (ADR-050)
All logs flow to Bugsink via Logstash:
All logs flow to Bugsink via Logstash with 3-project routing:
| Source | Log Location | Status |
| ----------------- | --------------------------------- | ------ |
| Backend (Pino) | `/var/log/pm2/api-*.log` | Active |
| Worker (Pino) | `/var/log/pm2/worker-*.log` | Active |
| Vite | `/var/log/pm2/vite-*.log` | Active |
| PostgreSQL | `/var/log/postgresql/*.log` | Active |
| Redis | `/var/log/redis/redis-server.log` | Active |
| NGINX | `/var/log/nginx/*.log` | Active |
| Frontend (Sentry) | Browser -> Bugsink SDK | Active |
| Source | Log Location | Bugsink Project |
| ----------------- | --------------------------------- | ------------------ |
| Backend (Pino) | `/var/log/pm2/api-*.log` | Backend API (1) |
| Worker (Pino) | `/var/log/pm2/worker-*.log` | Backend API (1) |
| PostgreSQL | `/var/log/postgresql/*.log` | Backend API (1) |
| Vite | `/var/log/pm2/vite-*.log` | Infrastructure (4) |
| Redis | `/var/log/redis/redis-server.log` | Infrastructure (4) |
| NGINX | `/var/log/nginx/*.log` | Infrastructure (4) |
| Frontend (Sentry) | Browser -> nginx proxy | Frontend (2) |
**Bugsink Projects (Dev Container)**:
- Project 1: Backend API (Dev) - Application errors
- Project 2: Frontend (Dev) - Browser errors via nginx proxy
- Project 4: Infrastructure (Dev) - Redis, NGINX, Vite errors
**Key Files**:
- `ecosystem.dev.config.cjs` - PM2 development configuration
- `scripts/dev-entrypoint.sh` - Container startup script
- `docker/logstash/bugsink.conf` - Logstash pipeline configuration
- `docker/nginx/dev.conf` - NGINX config with Bugsink API proxy
**Full Dev Container Guide**: See [docs/development/DEV-CONTAINER.md](docs/development/DEV-CONTAINER.md)
@@ -215,6 +222,7 @@ Common issues with solutions:
4. **Filename collisions** - Multer predictable names → Use `${Date.now()}-${Math.round(Math.random() * 1e9)}`
5. **Response format mismatches** - API format changes → Log response bodies, update assertions
6. **External service failures** - PM2/Redis unavailable → try/catch with graceful degradation
7. **TZ environment variable breaks async hooks** - `TZ=America/Los_Angeles` causes `RangeError: Invalid triggerAsyncId value: NaN` → Tests now explicitly set `TZ=` (empty) in package.json scripts
**Full Details**: See test issues section at end of this document or [docs/development/TESTING.md](docs/development/TESTING.md)
@@ -370,3 +378,28 @@ API formats change: `data.jobId` vs `data.job.id`, nested vs flat, string vs num
PM2/Redis health checks fail when unavailable.
**Solution**: try/catch with graceful degradation or mock
### 7. TZ Environment Variable Breaking Async Hooks
**Problem**: When `TZ=America/Los_Angeles` (or other timezone values) is set in the environment, Node.js async_hooks module can produce `RangeError: Invalid triggerAsyncId value: NaN`. This breaks React Testing Library's `render()` function which uses async hooks internally.
**Root Cause**: Setting `TZ` to certain timezone values interferes with Node.js's internal async tracking mechanism, causing invalid async IDs to be generated.
**Symptoms**:
```text
RangeError: Invalid triggerAsyncId value: NaN
process.env.NODE_ENV.queueSeveralMicrotasks node_modules/react/cjs/react.development.js:751:15
process.env.NODE_ENV.exports.act node_modules/react/cjs/react.development.js:886:11
node_modules/@testing-library/react/dist/act-compat.js:46:25
renderRoot node_modules/@testing-library/react/dist/pure.js:189:26
```
**Solution**: Explicitly unset `TZ` in all test scripts by adding `TZ=` (empty value) to cross-env:
```json
"test:unit": "cross-env NODE_ENV=test TZ= tsx ..."
"test:integration": "cross-env NODE_ENV=test TZ= tsx ..."
```
**Context**: This issue was introduced in commit `d03900c` which added `TZ: 'America/Los_Angeles'` to PM2 ecosystem configs for consistent log timestamps in production/dev environments. Tests must explicitly override this to prevent the async hooks error.

View File

@@ -174,6 +174,21 @@ BUGSINK = {\n\
}\n\
\n\
ALLOWED_HOSTS = deduce_allowed_hosts(BUGSINK["BASE_URL"])\n\
# Also allow 127.0.0.1 access (both localhost and 127.0.0.1 should work)\n\
if "127.0.0.1" not in ALLOWED_HOSTS:\n\
ALLOWED_HOSTS.append("127.0.0.1")\n\
if "localhost" not in ALLOWED_HOSTS:\n\
ALLOWED_HOSTS.append("localhost")\n\
\n\
# CSRF Trusted Origins (Django 4.0+ requires full origin for HTTPS POST requests)\n\
# This fixes "CSRF verification failed" errors when accessing Bugsink via HTTPS\n\
# Both localhost and 127.0.0.1 must be trusted to support different access patterns\n\
CSRF_TRUSTED_ORIGINS = [\n\
"https://localhost:8443",\n\
"https://127.0.0.1:8443",\n\
"http://localhost:8000",\n\
"http://127.0.0.1:8000",\n\
]\n\
\n\
# Console email backend for dev\n\
EMAIL_BACKEND = "bugsink.email_backends.QuietConsoleEmailBackend"\n\

View File

@@ -57,6 +57,8 @@ services:
- '8000:8000' # Bugsink error tracking HTTP (ADR-015)
- '8443:8443' # Bugsink error tracking HTTPS (ADR-015)
environment:
# Timezone: PST (America/Los_Angeles) for consistent log timestamps
- TZ=America/Los_Angeles
# Core settings
- NODE_ENV=development
# Database - use service name for Docker networking
@@ -122,6 +124,10 @@ services:
ports:
- '5432:5432'
environment:
# Timezone: PST (America/Los_Angeles) for consistent log timestamps
TZ: America/Los_Angeles
# PostgreSQL timezone setting (used by log_timezone and timezone parameters)
PGTZ: America/Los_Angeles
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres
POSTGRES_DB: flyer_crawler_dev
@@ -142,6 +148,8 @@ services:
postgres
-c config_file=/var/lib/postgresql/data/postgresql.conf
-c hba_file=/var/lib/postgresql/data/pg_hba.conf
-c timezone=America/Los_Angeles
-c log_timezone=America/Los_Angeles
-c log_min_messages=notice
-c client_min_messages=notice
-c logging_collector=on
@@ -175,6 +183,9 @@ services:
user: root
ports:
- '6379:6379'
environment:
# Timezone: PST (America/Los_Angeles) for consistent log timestamps
TZ: America/Los_Angeles
volumes:
- redis_data:/data
# Create log volume for Logstash access (ADR-050)

View File

@@ -12,9 +12,18 @@
# - NGINX logs (/var/log/nginx/*.log) - Access and error logs
# - Redis logs (/var/log/redis/*.log) - Via shared volume (ADR-050)
#
# Bugsink Projects:
# - Project 1: Backend API (Dev) - Pino errors, PostgreSQL errors
# Bugsink Projects (3-project architecture):
# - Project 1: Backend API (Dev) - Pino/PM2 app errors, PostgreSQL errors
# DSN Key: cea01396c56246adb5878fa5ee6b1d22
# - Project 2: Frontend (Dev) - Configured via Sentry SDK in browser
# DSN Key: d92663cb73cf4145b677b84029e4b762
# - Project 4: Infrastructure (Dev) - Redis, NGINX, PM2 operational logs
# DSN Key: 14e8791da3d347fa98073261b596cab9
#
# Routing Logic:
# - Backend logs (type: pm2_api, pm2_worker, pino, postgres) -> Project 1
# - Infrastructure logs (type: redis, nginx_error, nginx_access) -> Project 4
# - Vite errors (type: pm2_vite with errors) -> Project 4 (build tooling)
#
# Related Documentation:
# - docs/adr/0050-postgresql-function-observability.md
@@ -112,7 +121,8 @@ input {
# ============================================================================
# Captures PostgreSQL log output including fn_log() structured JSON messages.
# PostgreSQL is configured to write logs to /var/log/postgresql/ (shared volume).
# Log format: "2026-01-22 00:00:00 UTC [5724] postgres@flyer_crawler_dev LOG: message"
# Log format: "2026-01-22 14:30:00 PST [5724] postgres@flyer_crawler_dev LOG: message"
# Note: Timestamps use the America/Los_Angeles timezone (PST or PDT depending on daylight saving) as configured in compose.dev.yml
file {
path => "/var/log/postgresql/*.log"
type => "postgres"
@@ -217,10 +227,11 @@ filter {
# PostgreSQL Log Processing (ADR-050)
# ============================================================================
# PostgreSQL log format in dev container:
# "2026-01-22 00:00:00 UTC [5724] postgres@flyer_crawler_dev LOG: message"
# "2026-01-22 07:06:03 UTC [19851] postgres@flyer_crawler_dev ERROR: column "id" does not exist"
# "2026-01-22 14:30:00 PST [5724] postgres@flyer_crawler_dev LOG: message"
# "2026-01-22 15:06:03 PST [19851] postgres@flyer_crawler_dev ERROR: column "id" does not exist"
# Note: Timestamps are in PST (America/Los_Angeles) timezone
if [type] == "postgres" {
# Parse PostgreSQL log prefix with UTC timezone
# Parse PostgreSQL log prefix with timezone (PST in dev, may vary in prod)
grok {
match => { "message" => "%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{TIME} %{WORD:pg_timezone} \[%{POSINT:pg_pid}\] %{DATA:pg_user}@%{DATA:pg_database} %{WORD:pg_level}: ?%{GREEDYDATA:pg_message}" }
tag_on_failure => ["_postgres_grok_failure"]
@@ -344,26 +355,56 @@ filter {
}
# ============================================================================
# Generate Sentry Event ID for all errors
# Generate Sentry Event ID and Ensure Required Fields for all errors
# ============================================================================
# CRITICAL: sentry_level MUST be set for all errors before output.
# Bugsink's PostgreSQL schema limits level to varchar(7), so valid values are:
# fatal, error, warning, info, debug (all <= 7 chars)
# If sentry_level is not set, the literal "%{sentry_level}" (16 chars) is sent,
# causing PostgreSQL insertion failures.
# ============================================================================
if "error" in [tags] {
# Use Ruby for robust field handling - handles all edge cases
ruby {
code => '
require "securerandom"
event.set("sentry_event_id", SecureRandom.hex(16))
'
}
# Ensure error_message has a fallback value
if ![error_message] {
mutate { add_field => { "error_message" => "%{message}" } }
# Generate unique event ID for Sentry
event.set("sentry_event_id", SecureRandom.hex(16))
# =====================================================================
# CRITICAL: Validate and set sentry_level
# =====================================================================
# Valid Sentry levels (max 7 chars for Bugsink PostgreSQL schema):
# fatal, error, warning, info, debug
# Default to "error" if missing, empty, or invalid.
# =====================================================================
valid_levels = ["fatal", "error", "warning", "info", "debug"]
current_level = event.get("sentry_level")
if current_level.nil? || current_level.to_s.strip.empty? || !valid_levels.include?(current_level.to_s.downcase)
event.set("sentry_level", "error")
else
# Normalize to lowercase
event.set("sentry_level", current_level.to_s.downcase)
end
# =====================================================================
# Ensure error_message has a fallback value
# =====================================================================
error_msg = event.get("error_message")
if error_msg.nil? || error_msg.to_s.strip.empty?
fallback_msg = event.get("message") || event.get("msg") || "Unknown error"
event.set("error_message", fallback_msg.to_s)
end
'
}
}
}
output {
# ============================================================================
# Forward Errors to Bugsink (Backend API Project)
# Forward Errors to Bugsink (Project Routing)
# ============================================================================
# Bugsink uses Sentry-compatible API. Events must include:
# - event_id: 32 hex characters (UUID without dashes)
@@ -373,9 +414,50 @@ output {
# - platform: "node" for backend, "javascript" for frontend
#
# Authentication via X-Sentry-Auth header with project's public key.
# Dev container DSN: http://cea01396c56246adb5878fa5ee6b1d22@localhost:8000/1
#
# Project Routing:
# - Project 1 (Backend): Pino app logs, PostgreSQL errors
# - Project 4 (Infrastructure): Redis, NGINX, Vite build errors
# ============================================================================
if "error" in [tags] {
# ============================================================================
# Infrastructure Errors -> Project 4
# ============================================================================
# Redis warnings/errors, NGINX errors, and Vite build errors go to
# the Infrastructure project for separation from application code errors.
if "error" in [tags] and ([type] == "redis" or [type] == "nginx_error" or [type] == "nginx_access" or [type] == "pm2_vite") {
http {
url => "http://localhost:8000/api/4/store/"
http_method => "post"
format => "json"
headers => {
"X-Sentry-Auth" => "Sentry sentry_key=14e8791da3d347fa98073261b596cab9, sentry_version=7"
"Content-Type" => "application/json"
}
mapping => {
"event_id" => "%{sentry_event_id}"
"timestamp" => "%{@timestamp}"
"level" => "%{sentry_level}"
"platform" => "other"
"logger" => "%{type}"
"message" => "%{error_message}"
"extra" => {
"hostname" => "%{[host][name]}"
"source_type" => "%{type}"
"tags" => "%{tags}"
"original_message" => "%{message}"
"project" => "infrastructure"
}
}
}
}
# ============================================================================
# Backend Application Errors -> Project 1
# ============================================================================
# Pino application logs (API, Worker), PostgreSQL function errors, and
# native PostgreSQL errors go to the Backend API project.
else if "error" in [tags] and ([type] in ["pm2_api", "pm2_worker", "pino", "postgres"]) {
http {
url => "http://localhost:8000/api/1/store/"
http_method => "post"
@@ -384,7 +466,6 @@ output {
"X-Sentry-Auth" => "Sentry sentry_key=cea01396c56246adb5878fa5ee6b1d22, sentry_version=7"
"Content-Type" => "application/json"
}
# Transform event to Sentry format using regular fields (not @metadata)
mapping => {
"event_id" => "%{sentry_event_id}"
"timestamp" => "%{@timestamp}"
@@ -397,6 +478,38 @@ output {
"source_type" => "%{type}"
"tags" => "%{tags}"
"original_message" => "%{message}"
"project" => "backend"
}
}
}
}
# ============================================================================
# Fallback: Any other errors -> Project 1
# ============================================================================
# Catch-all for any errors that don't match specific routing rules.
else if "error" in [tags] {
http {
url => "http://localhost:8000/api/1/store/"
http_method => "post"
format => "json"
headers => {
"X-Sentry-Auth" => "Sentry sentry_key=cea01396c56246adb5878fa5ee6b1d22, sentry_version=7"
"Content-Type" => "application/json"
}
mapping => {
"event_id" => "%{sentry_event_id}"
"timestamp" => "%{@timestamp}"
"level" => "%{sentry_level}"
"platform" => "node"
"logger" => "%{type}"
"message" => "%{error_message}"
"extra" => {
"hostname" => "%{[host][name]}"
"source_type" => "%{type}"
"tags" => "%{tags}"
"original_message" => "%{message}"
"project" => "backend-fallback"
}
}
}

View File

@@ -60,6 +60,37 @@ server {
proxy_set_header X-Forwarded-Proto $scheme;
}
# ============================================================================
# Bugsink Sentry API Proxy (for frontend error reporting)
# ============================================================================
# The frontend Sentry SDK cannot reach localhost:8000 directly from the browser
# because port 8000 is only accessible within the container network.
# This proxy allows the browser to send errors to https://localhost/bugsink-api/
# which NGINX forwards to the Bugsink container on port 8000.
#
# Frontend DSN format: https://<key>@localhost/bugsink-api/<project_id>
# Example: https://<key>@localhost/bugsink-api/2 for Frontend (Dev) project
#
# The Sentry SDK sends POST requests to /bugsink-api/<project>/store/
# This proxy rewrites the /bugsink-api/ prefix to /api/ and forwards the request to the Bugsink container on port 8000
# ============================================================================
location /bugsink-api/ {
proxy_pass http://localhost:8000/api/;
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
# Allow large error payloads with stack traces
client_max_body_size 10M;
# Timeouts for error reporting (should be fast)
proxy_connect_timeout 10s;
proxy_send_timeout 30s;
proxy_read_timeout 30s;
}
# Proxy WebSocket connections for real-time notifications
location /ws {
proxy_pass http://localhost:3001;

View File

@@ -2,6 +2,10 @@
# This file is mounted into the PostgreSQL container to enable structured logging
# from database functions via fn_log()
# Timezone: PST (America/Los_Angeles) for consistent log timestamps
timezone = 'America/Los_Angeles'
log_timezone = 'America/Los_Angeles'
# Enable logging to files for Logstash pickup
logging_collector = on
log_destination = 'stderr'

View File

@@ -28,9 +28,28 @@ The `.env.local` file uses `localhost` while `compose.dev.yml` uses `127.0.0.1`.
## HTTPS Setup
- Self-signed certificates auto-generated with mkcert on container startup
- CSRF Protection: Django configured with `SECURE_PROXY_SSL_HEADER` to trust `X-Forwarded-Proto` from nginx
- CSRF Protection: Django configured with `CSRF_TRUSTED_ORIGINS` for both `localhost` and `127.0.0.1` (see below)
- HTTPS proxy: nginx on port 8443 proxies to Bugsink on port 8000
- HTTPS is for UI access only - Sentry SDK uses HTTP directly
### CSRF Configuration
Django 4.0+ requires `CSRF_TRUSTED_ORIGINS` for HTTPS POST requests. The Bugsink configuration (`Dockerfile.dev`) includes:
```python
CSRF_TRUSTED_ORIGINS = [
"https://localhost:8443",
"https://127.0.0.1:8443",
"http://localhost:8000",
"http://127.0.0.1:8000",
]
SECURE_PROXY_SSL_HEADER = ("HTTP_X_FORWARDED_PROTO", "https")
```
**Both hostnames are required** because browsers treat `localhost` and `127.0.0.1` as different origins.
If you get "CSRF verification failed" errors, see [BUGSINK-SETUP.md](tools/BUGSINK-SETUP.md#csrf-verification-failed) for troubleshooting.
## Isolation Benefits
- Dev errors stay local, don't pollute production/test dashboards

View File

@@ -2,7 +2,7 @@
**Date**: 2026-01-11
**Status**: Proposed
**Status**: Accepted (Fully Implemented)
**Related**: [ADR-015](0015-application-performance-monitoring-and-error-tracking.md), [ADR-004](0004-standardized-application-wide-structured-logging.md)

View File

@@ -2,7 +2,9 @@
**Date**: 2026-01-11
**Status**: Proposed
**Status**: Accepted (Fully Implemented)
**Related**: [ADR-004](0004-standardized-application-wide-structured-logging.md)
## Context
@@ -17,7 +19,9 @@ We will adopt a namespace-based debug filter pattern, similar to the `debug` npm
## Implementation
In `src/services/logger.server.ts`:
### Core Implementation (Completed 2026-01-11)
Implemented in [src/services/logger.server.ts:140-150](src/services/logger.server.ts#L140-L150):
```typescript
const debugModules = (process.env.DEBUG_MODULES || '').split(',').map((s) => s.trim());
@@ -33,10 +37,100 @@ export const createScopedLogger = (moduleName: string) => {
};
```
### Adopted Services (Completed 2026-01-26)
Services currently using `createScopedLogger`:
- `ai-service` - AI/Gemini integration ([src/services/aiService.server.ts:1020](src/services/aiService.server.ts#L1020))
- `flyer-processing-service` - Flyer upload and processing ([src/services/flyerProcessingService.server.ts:20](src/services/flyerProcessingService.server.ts#L20))
## Usage
To debug only AI and Database interactions:
### Enable Debug Logging for Specific Modules
To debug only AI and flyer processing:
```bash
DEBUG_MODULES=ai-service,db-repo npm run dev
DEBUG_MODULES=ai-service,flyer-processing-service npm run dev
```
### Enable All Debug Logging
Use wildcard to enable debug logging for all modules:
```bash
DEBUG_MODULES=* npm run dev
```
### Common Module Names
| Module Name | Purpose | File |
| -------------------------- | ---------------------------------------- | ----------------------------------------------- |
| `ai-service` | AI/Gemini API interactions | `src/services/aiService.server.ts` |
| `flyer-processing-service` | Flyer upload, validation, and processing | `src/services/flyerProcessingService.server.ts` |
## Best Practices
1. **Use Scoped Loggers for Long-Running Services**: Services with complex workflows or external API calls should use `createScopedLogger` to allow targeted debugging.
2. **Use Child Loggers for Contextual Data**: Even within scoped loggers, create child loggers with job/request-specific context:
```typescript
const logger = createScopedLogger('my-service');
async function processJob(job: Job) {
const jobLogger = logger.child({ jobId: job.id, jobName: job.name });
jobLogger.debug('Starting job processing');
}
```
3. **Module Naming Convention**: Use kebab-case suffixed with `-service` or `-worker` (e.g., `ai-service`, `email-worker`).
4. **Production Usage**: `DEBUG_MODULES` can be set in production for temporary debugging, but should not be used continuously due to increased log volume.
## Examples
### Development Debugging
Debug AI service issues during development:
```bash
# Dev container
DEBUG_MODULES=ai-service npm run dev
# Or via PM2
DEBUG_MODULES=ai-service pm2 restart flyer-crawler-api-dev
```
### Production Troubleshooting
Temporarily enable debug logging for a specific subsystem:
```bash
# SSH into production server
ssh root@projectium.com
# Set environment variable and restart
DEBUG_MODULES=ai-service pm2 restart flyer-crawler-api
# View logs
pm2 logs flyer-crawler-api --lines 100
# Disable debug logging
pm2 unset DEBUG_MODULES flyer-crawler-api
pm2 restart flyer-crawler-api
```
## Consequences
**Positive**:
- Developers can inspect detailed logs for specific subsystems without log flooding
- Production debugging becomes more targeted and efficient
- No performance impact when debug logging is disabled
- Compatible with existing Pino logging infrastructure
**Negative**:
- Requires developers to know module names (mitigated by documentation above)
- Not all services have adopted scoped loggers yet (gradual migration)

View File

@@ -2,7 +2,14 @@
**Date**: 2026-01-11
**Status**: Proposed
**Status**: Accepted (Fully Implemented)
**Implementation Status**:
- ✅ BullMQ worker stall configuration (complete)
- ✅ Basic health endpoints (/live, /ready, /redis, etc.)
- ✅ /health/queues endpoint (complete)
- ✅ Worker heartbeat mechanism (complete)
## Context
@@ -60,3 +67,76 @@ The `/health/queues` endpoint will:
**Negative**:
- Requires configuring external monitoring to poll the new endpoint.
## Implementation Notes
### Completed (2026-01-11)
1. **BullMQ Stall Configuration** - `src/config/workerOptions.ts`
- All workers use `defaultWorkerOptions` with:
- `stalledInterval: 30000` (30s)
- `maxStalledCount: 3`
- `lockDuration: 30000` (30s)
- Applied to all 9 workers: flyer, email, analytics, cleanup, weekly-analytics, token-cleanup, receipt, expiry-alert, barcode
2. **Basic Health Endpoints** - `src/routes/health.routes.ts`
- `/health/live` - Liveness probe
- `/health/ready` - Readiness probe (checks DB, Redis, storage)
- `/health/startup` - Startup probe
- `/health/redis` - Redis connectivity
- `/health/db-pool` - Database connection pool status
### Implementation Completed (2026-01-26)
1. **`/health/queues` Endpoint** ✅
- Added route to `src/routes/health.routes.ts:511-674`
- Iterates through all 9 queues from `src/services/queues.server.ts`
- Fetches job counts using BullMQ Queue API: `getJobCounts()`
- Returns structured response including both queue metrics and worker heartbeats:
```typescript
{
status: 'healthy' | 'unhealthy',
timestamp: string,
queues: {
[queueName]: {
waiting: number,
active: number,
failed: number,
delayed: number
}
},
workers: {
[workerName]: {
alive: boolean,
lastSeen?: string,
pid?: number,
host?: string
}
}
}
```
- Returns 200 OK if all healthy, 503 if any queue/worker unavailable
- Full OpenAPI documentation included
2. **Worker Heartbeat Mechanism** ✅
- Added `updateWorkerHeartbeat()` and `startWorkerHeartbeat()` in `src/services/workers.server.ts:100-149`
- Key pattern: `worker:heartbeat:<worker-name>`
- Stores: `{ timestamp: ISO8601, pid: number, host: string }`
- Updates every 30s with 90s TTL
- Integrated with `/health/queues` endpoint (checks if heartbeat < 60s old)
- Heartbeat intervals properly cleaned up in `closeWorkers()` and `gracefulShutdown()`
3. **Comprehensive Tests** ✅
- Added 5 test cases in `src/routes/health.routes.test.ts:623-858`
- Tests cover: healthy state, queue failures, stale heartbeats, missing heartbeats, Redis errors
- All tests follow existing patterns with proper mocking
### Future Enhancements (Not Implemented)
1. **Queue Depth Alerting** (Low Priority)
- Add configurable thresholds per queue type
- Return 500 if `waiting` count exceeds threshold for extended period
- Consider using Redis for storing threshold breach timestamps
- **Estimate**: 1-2 hours

View File

@@ -1,4 +1,4 @@
# ADR-023: Database Normalization and Referential Integrity
# ADR-055: Database Normalization and Referential Integrity
**Date:** 2026-01-19
**Status:** Accepted

View File

@@ -15,8 +15,8 @@ This document tracks the implementation status and estimated effort for all Arch
| Status | Count |
| ---------------------------- | ----- |
| Accepted (Fully Implemented) | 30 |
| Partially Implemented | 2 |
| Accepted (Fully Implemented) | 36 |
| Partially Implemented | 3 |
| Proposed (Not Started) | 16 |
---
@@ -62,25 +62,30 @@ This document tracks the implementation status and estimated effort for all Arch
| [ADR-029](./0029-secret-rotation-and-key-management.md) | Secret Rotation | Proposed | L | Infrastructure changes needed |
| [ADR-032](./0032-rate-limiting-strategy.md) | Rate Limiting | Accepted | - | Fully implemented |
| [ADR-033](./0033-file-upload-and-storage-strategy.md) | File Upload & Storage | Accepted | - | Fully implemented |
| [ADR-048](./0048-authentication-strategy.md) | Authentication | Partial | M | JWT done, OAuth pending |
### Category 5: Observability & Monitoring
| ADR | Title | Status | Effort | Notes |
| -------------------------------------------------------------------------- | --------------------------- | -------- | ------ | --------------------------------- |
| [ADR-004](./0004-standardized-application-wide-structured-logging.md) | Structured Logging | Accepted | - | Fully implemented |
| [ADR-015](./0015-application-performance-monitoring-and-error-tracking.md) | APM & Error Tracking | Proposed | M | Third-party integration |
| [ADR-050](./0050-postgresql-function-observability.md) | PostgreSQL Fn Observability | Proposed | M | Depends on ADR-015 implementation |
| ADR | Title | Status | Effort | Notes |
| -------------------------------------------------------------------------- | --------------------------- | -------- | ------ | ----------------------- |
| [ADR-004](./0004-standardized-application-wide-structured-logging.md) | Structured Logging | Accepted | - | Fully implemented |
| [ADR-015](./0015-application-performance-monitoring-and-error-tracking.md) | APM & Error Tracking | Proposed | M | Third-party integration |
| [ADR-050](./0050-postgresql-function-observability.md) | PostgreSQL Fn Observability | Accepted | - | Fully implemented |
| [ADR-051](./0051-asynchronous-context-propagation.md) | Context Propagation | Accepted | - | Fully implemented |
| [ADR-052](./0052-granular-debug-logging-strategy.md) | Granular Debug Logging | Accepted | - | Fully implemented |
### Category 6: Deployment & Operations
| ADR | Title | Status | Effort | Notes |
| -------------------------------------------------------------- | ----------------- | -------- | ------ | -------------------------- |
| [ADR-006](./0006-background-job-processing-and-task-queues.md) | Background Jobs | Accepted | - | Fully implemented |
| [ADR-014](./0014-containerization-and-deployment-strategy.md) | Containerization | Partial | M | Docker done, K8s pending |
| [ADR-017](./0017-ci-cd-and-branching-strategy.md) | CI/CD & Branching | Accepted | - | Fully implemented |
| [ADR-024](./0024-feature-flagging-strategy.md) | Feature Flags | Proposed | M | New service/library needed |
| [ADR-037](./0037-scheduled-jobs-and-cron-pattern.md) | Scheduled Jobs | Accepted | - | Fully implemented |
| [ADR-038](./0038-graceful-shutdown-pattern.md) | Graceful Shutdown | Accepted | - | Fully implemented |
| ADR | Title | Status | Effort | Notes |
| -------------------------------------------------------------- | ------------------ | -------- | ------ | -------------------------- |
| [ADR-006](./0006-background-job-processing-and-task-queues.md) | Background Jobs | Accepted | - | Fully implemented |
| [ADR-014](./0014-containerization-and-deployment-strategy.md) | Containerization | Partial | M | Docker done, K8s pending |
| [ADR-017](./0017-ci-cd-and-branching-strategy.md) | CI/CD & Branching | Accepted | - | Fully implemented |
| [ADR-024](./0024-feature-flagging-strategy.md) | Feature Flags | Proposed | M | New service/library needed |
| [ADR-037](./0037-scheduled-jobs-and-cron-pattern.md) | Scheduled Jobs | Accepted | - | Fully implemented |
| [ADR-038](./0038-graceful-shutdown-pattern.md) | Graceful Shutdown | Accepted | - | Fully implemented |
| [ADR-053](./0053-worker-health-checks.md) | Worker Health | Accepted | - | Fully implemented |
| [ADR-054](./0054-bugsink-gitea-issue-sync.md) | Bugsink-Gitea Sync | Proposed | L | Automated issue creation |
### Category 7: Frontend / User Interface
@@ -99,22 +104,24 @@ This document tracks the implementation status and estimated effort for all Arch
| [ADR-010](./0010-testing-strategy-and-standards.md) | Testing Strategy | Accepted | - | Fully implemented |
| [ADR-021](./0021-code-formatting-and-linting-unification.md) | Formatting & Linting | Accepted | - | Fully implemented |
| [ADR-027](./0027-standardized-naming-convention-for-ai-and-database-types.md) | Naming Conventions | Accepted | - | Fully implemented |
| [ADR-040](./0040-testing-economics-and-priorities.md) | Testing Economics | Accepted | - | Fully implemented |
| [ADR-045](./0045-test-data-factories-and-fixtures.md) | Test Data Factories | Accepted | - | Fully implemented |
| [ADR-047](./0047-project-file-and-folder-organization.md) | Project Organization | Proposed | XL | Major reorganization |
### Category 9: Architecture Patterns
| ADR | Title | Status | Effort | Notes |
| -------------------------------------------------------- | --------------------- | -------- | ------ | ----------------- |
| [ADR-034](./0034-repository-pattern-standards.md) | Repository Pattern | Accepted | - | Fully implemented |
| [ADR-035](./0035-service-layer-architecture.md) | Service Layer | Accepted | - | Fully implemented |
| [ADR-036](./0036-event-bus-and-pub-sub-pattern.md) | Event Bus | Accepted | - | Fully implemented |
| [ADR-039](./0039-dependency-injection-pattern.md) | Dependency Injection | Accepted | - | Fully implemented |
| [ADR-041](./0041-ai-gemini-integration-architecture.md) | AI/Gemini Integration | Accepted | - | Fully implemented |
| [ADR-042](./0042-email-and-notification-architecture.md) | Email & Notifications | Accepted | - | Fully implemented |
| [ADR-043](./0043-express-middleware-pipeline.md) | Middleware Pipeline | Accepted | - | Fully implemented |
| [ADR-046](./0046-image-processing-pipeline.md) | Image Processing | Accepted | - | Fully implemented |
| [ADR-049](./0049-gamification-and-achievement-system.md) | Gamification System | Accepted | - | Fully implemented |
| ADR | Title | Status | Effort | Notes |
| --------------------------------------------------------------------- | --------------------- | -------- | ------ | ------------------------- |
| [ADR-034](./0034-repository-pattern-standards.md) | Repository Pattern | Accepted | - | Fully implemented |
| [ADR-035](./0035-service-layer-architecture.md) | Service Layer | Accepted | - | Fully implemented |
| [ADR-036](./0036-event-bus-and-pub-sub-pattern.md) | Event Bus | Accepted | - | Fully implemented |
| [ADR-039](./0039-dependency-injection-pattern.md) | Dependency Injection | Accepted | - | Fully implemented |
| [ADR-041](./0041-ai-gemini-integration-architecture.md) | AI/Gemini Integration | Accepted | - | Fully implemented |
| [ADR-042](./0042-email-and-notification-architecture.md) | Email & Notifications | Accepted | - | Fully implemented |
| [ADR-043](./0043-express-middleware-pipeline.md) | Middleware Pipeline | Accepted | - | Fully implemented |
| [ADR-046](./0046-image-processing-pipeline.md) | Image Processing | Accepted | - | Fully implemented |
| [ADR-049](./0049-gamification-and-achievement-system.md) | Gamification System | Accepted | - | Fully implemented |
| [ADR-055](./0055-database-normalization-and-referential-integrity.md) | DB Normalization | Accepted | M | API uses IDs, not strings |
---
@@ -122,38 +129,49 @@ This document tracks the implementation status and estimated effort for all Arch
These ADRs are proposed but not yet implemented, ordered by suggested implementation priority:
| Priority | ADR | Title | Effort | Rationale |
| -------- | ------- | --------------------------- | ------ | ------------------------------------------------- |
| 1 | ADR-015 | APM & Error Tracking | M | Production visibility, debugging |
| 1b | ADR-050 | PostgreSQL Fn Observability | M | Database function visibility (depends on ADR-015) |
| 2 | ADR-024 | Feature Flags | M | Safer deployments, A/B testing |
| 3 | ADR-023 | Schema Migrations v2 | L | Database evolution support |
| 4 | ADR-029 | Secret Rotation | L | Security improvement |
| 5 | ADR-008 | API Versioning | L | Future API evolution |
| 6 | ADR-030 | Circuit Breaker | L | Resilience improvement |
| 7 | ADR-022 | Real-time Notifications | XL | Major feature enhancement |
| 8 | ADR-011 | Authorization & RBAC | XL | Advanced permission system |
| 9 | ADR-025 | i18n & l10n | XL | Multi-language support |
| 10 | ADR-031 | Data Retention & Privacy | XL | Compliance requirements |
| Priority | ADR | Title | Effort | Rationale |
| -------- | ------- | ------------------------ | ------ | ------------------------------------ |
| 1 | ADR-015 | APM & Error Tracking | M | Production visibility, debugging |
| 2 | ADR-024 | Feature Flags | M | Safer deployments, A/B testing |
| 3 | ADR-054 | Bugsink-Gitea Sync | L | Automated issue tracking from errors |
| 4 | ADR-023 | Schema Migrations v2 | L | Database evolution support |
| 5 | ADR-029 | Secret Rotation | L | Security improvement |
| 6 | ADR-008 | API Versioning | L | Future API evolution |
| 7 | ADR-030 | Circuit Breaker | L | Resilience improvement |
| 8 | ADR-022 | Real-time Notifications | XL | Major feature enhancement |
| 9 | ADR-011 | Authorization & RBAC | XL | Advanced permission system |
| 10 | ADR-025 | i18n & l10n | XL | Multi-language support |
| 11 | ADR-031 | Data Retention & Privacy | XL | Compliance requirements |
---
## Recent Implementation History
| Date | ADR | Change |
| ---------- | ------- | ---------------------------------------------------------------------- |
| 2026-01-11 | ADR-050 | Created - PostgreSQL function observability with fn_log() and Logstash |
| 2026-01-11 | ADR-018 | Implemented - OpenAPI/Swagger documentation at /docs/api-docs |
| 2026-01-11 | ADR-049 | Created - Gamification system, achievements, and testing requirements |
| 2026-01-09 | ADR-047 | Created - Project file/folder organization with migration plan |
| 2026-01-09 | ADR-041 | Created - AI/Gemini integration with model fallback and rate limiting |
| 2026-01-09 | ADR-042 | Created - Email and notification architecture with BullMQ queuing |
| 2026-01-09 | ADR-043 | Created - Express middleware pipeline ordering and patterns |
| 2026-01-09 | ADR-044 | Created - Frontend feature-based folder organization |
| 2026-01-09 | ADR-045 | Created - Test data factory pattern for mock generation |
| 2026-01-09 | ADR-046 | Created - Image processing pipeline with Sharp and EXIF stripping |
| 2026-01-09 | ADR-026 | Fully implemented - client-side structured logger |
| 2026-01-09 | ADR-028 | Fully implemented - all routes, middleware, and tests updated |
| Date | ADR | Change |
| ---------- | ------- | ------------------------------------------------------------------------------------ |
| 2026-01-26 | ADR-052 | Marked as fully implemented - createScopedLogger with DEBUG_MODULES support complete |
| 2026-01-26 | ADR-053 | Marked as fully implemented - /health/queues endpoint and worker heartbeats complete |
| 2026-01-26 | ADR-050 | Marked as fully implemented - PostgreSQL function observability complete |
| 2026-01-26 | ADR-055 | Created (renumbered from duplicate ADR-023) - Database normalization |
| 2026-01-26 | ADR-054 | Added to tracker - Bugsink to Gitea issue synchronization |
| 2026-01-26 | ADR-053 | Added to tracker - Worker health checks and monitoring |
| 2026-01-26 | ADR-052 | Added to tracker - Granular debug logging strategy |
| 2026-01-26 | ADR-051 | Added to tracker - Asynchronous context propagation |
| 2026-01-26 | ADR-048 | Added to tracker - Authentication strategy |
| 2026-01-26 | ADR-040 | Added to tracker - Testing economics and priorities |
| 2026-01-17 | ADR-054 | Created - Bugsink-Gitea sync worker proposal |
| 2026-01-11 | ADR-050 | Created - PostgreSQL function observability with fn_log() and Logstash |
| 2026-01-11 | ADR-018 | Implemented - OpenAPI/Swagger documentation at /docs/api-docs |
| 2026-01-11 | ADR-049 | Created - Gamification system, achievements, and testing requirements |
| 2026-01-09 | ADR-047 | Created - Project file/folder organization with migration plan |
| 2026-01-09 | ADR-041 | Created - AI/Gemini integration with model fallback and rate limiting |
| 2026-01-09 | ADR-042 | Created - Email and notification architecture with BullMQ queuing |
| 2026-01-09 | ADR-043 | Created - Express middleware pipeline ordering and patterns |
| 2026-01-09 | ADR-044 | Created - Frontend feature-based folder organization |
| 2026-01-09 | ADR-045 | Created - Test data factory pattern for mock generation |
| 2026-01-09 | ADR-046 | Created - Image processing pipeline with Sharp and EXIF stripping |
| 2026-01-09 | ADR-026 | Fully implemented - client-side structured logger |
| 2026-01-09 | ADR-028 | Fully implemented - all routes, middleware, and tests updated |
---

View File

@@ -21,7 +21,7 @@ This directory contains a log of the architectural decisions made for the Flyer
**[ADR-003](./0003-standardized-input-validation-using-middleware.md)**: Standardized Input Validation using Middleware (Accepted)
**[ADR-008](./0008-api-versioning-strategy.md)**: API Versioning Strategy (Proposed)
**[ADR-018](./0018-api-documentation-strategy.md)**: API Documentation Strategy (Proposed)
**[ADR-018](./0018-api-documentation-strategy.md)**: API Documentation Strategy (Accepted)
**[ADR-022](./0022-real-time-notification-system.md)**: Real-time Notification System (Proposed)
**[ADR-028](./0028-api-response-standardization.md)**: API Response Standardization and Envelope Pattern (Implemented)
@@ -39,6 +39,9 @@ This directory contains a log of the architectural decisions made for the Flyer
**[ADR-004](./0004-standardized-application-wide-structured-logging.md)**: Standardized Application-Wide Structured Logging (Accepted)
**[ADR-015](./0015-application-performance-monitoring-and-error-tracking.md)**: Application Performance Monitoring (APM) and Error Tracking (Proposed)
**[ADR-050](./0050-postgresql-function-observability.md)**: PostgreSQL Function Observability (Proposed)
**[ADR-051](./0051-asynchronous-context-propagation.md)**: Asynchronous Context Propagation (Accepted)
**[ADR-052](./0052-granular-debug-logging-strategy.md)**: Granular Debug Logging Strategy (Proposed)
## 6. Deployment & Operations
@@ -48,13 +51,15 @@ This directory contains a log of the architectural decisions made for the Flyer
**[ADR-024](./0024-feature-flagging-strategy.md)**: Feature Flagging Strategy (Proposed)
**[ADR-037](./0037-scheduled-jobs-and-cron-pattern.md)**: Scheduled Jobs and Cron Pattern (Accepted)
**[ADR-038](./0038-graceful-shutdown-pattern.md)**: Graceful Shutdown Pattern (Accepted)
**[ADR-053](./0053-worker-health-checks-and-monitoring.md)**: Worker Health Checks and Monitoring (Proposed)
**[ADR-054](./0054-bugsink-gitea-issue-sync.md)**: Bugsink to Gitea Issue Synchronization (Proposed)
## 7. Frontend / User Interface
**[ADR-005](./0005-frontend-state-management-and-server-cache-strategy.md)**: Frontend State Management and Server Cache Strategy (Accepted)
**[ADR-012](./0012-frontend-component-library-and-design-system.md)**: Frontend Component Library and Design System (Partially Implemented)
**[ADR-025](./0025-internationalization-and-localization-strategy.md)**: Internationalization (i18n) and Localization (l10n) Strategy (Proposed)
**[ADR-026](./0026-standardized-client-side-structured-logging.md)**: Standardized Client-Side Structured Logging (Proposed)
**[ADR-026](./0026-standardized-client-side-structured-logging.md)**: Standardized Client-Side Structured Logging (Accepted)
**[ADR-044](./0044-frontend-feature-organization.md)**: Frontend Feature Organization Pattern (Accepted)
## 8. Development Workflow & Quality
@@ -76,3 +81,5 @@ This directory contains a log of the architectural decisions made for the Flyer
**[ADR-042](./0042-email-and-notification-architecture.md)**: Email and Notification Architecture (Accepted)
**[ADR-043](./0043-express-middleware-pipeline.md)**: Express Middleware Pipeline Architecture (Accepted)
**[ADR-046](./0046-image-processing-pipeline.md)**: Image Processing Pipeline (Accepted)
**[ADR-049](./0049-gamification-and-achievement-system.md)**: Gamification and Achievement System (Accepted)
**[ADR-055](./0055-database-normalization-and-referential-integrity.md)**: Database Normalization and Referential Integrity (Accepted)

View File

@@ -175,29 +175,30 @@ npm run dev:pm2:logs
### Log Flow Architecture (ADR-050)
All application logs flow through Logstash to Bugsink:
All application logs flow through Logstash to Bugsink using a 3-project architecture:
```
```text
+------------------+ +------------------+ +------------------+
| PM2 Logs | | PostgreSQL | | Redis Logs |
| PM2 Logs | | PostgreSQL | | Redis/NGINX |
| /var/log/pm2/ | | /var/log/ | | /var/log/redis/ |
+--------+---------+ | postgresql/ | +--------+---------+
| +--------+---------+ |
| (API + Worker) | | postgresql/ | | /var/log/nginx/ |
+--------+---------+ +--------+---------+ +--------+---------+
| | |
v v v
+------------------------------------------------------------------------+
| LOGSTASH |
| /etc/logstash/conf.d/bugsink.conf |
| (Routes by log type) |
+------------------------------------------------------------------------+
| | |
| +---------+---------+ |
| | | |
v v v v
+------------------+ +------------------+ +------------------+
| Errors -> | | Operational -> | | NGINX Logs -> |
| Bugsink API | | /var/log/ | | /var/log/ |
| (Project 1) | | logstash/*.log | | logstash/*.log |
+------------------+ +------------------+ +------------------+
v v v
+------------------+ +------------------+ +------------------+
| Backend API | | Frontend (Dev) | | Infrastructure |
| (Project 1) | | (Project 2) | | (Project 4) |
| - Pino errors | | - Browser SDK | | - Redis warnings |
| - PostgreSQL | | (not Logstash) | | - NGINX errors |
+------------------+ +------------------+ | - Vite errors |
+------------------+
```
### Log Sources
@@ -231,8 +232,11 @@ podman exec flyer-crawler-dev curl -s localhost:9600/_node/stats/pipelines?prett
- **URL**: `https://localhost:8443`
- **Login**: `admin@localhost` / `admin`
- **Projects**:
- Project 1: Backend API (errors from Pino, PostgreSQL, Redis)
- Project 2: Frontend (errors from Sentry SDK in browser)
- Project 1: Backend API (Dev) - Pino app errors, PostgreSQL errors
- Project 2: Frontend (Dev) - Browser errors via Sentry SDK
- Project 4: Infrastructure (Dev) - Redis warnings, NGINX errors, Vite build errors
**Note**: Frontend DSN uses nginx proxy (`/bugsink-api/`) because browsers cannot reach `localhost:8000` directly. See [BUGSINK-SETUP.md](../tools/BUGSINK-SETUP.md#frontend-nginx-proxy) for details.
---
@@ -268,14 +272,41 @@ podman-compose -f compose.dev.yml down
Key environment variables are set in `compose.dev.yml`:
| Variable | Value | Purpose |
| ----------------- | ----------------------------- | -------------------- |
| `NODE_ENV` | `development` | Environment mode |
| `DB_HOST` | `postgres` | PostgreSQL hostname |
| `REDIS_URL` | `redis://redis:6379` | Redis connection URL |
| `FRONTEND_URL` | `https://localhost` | CORS origin |
| `SENTRY_DSN` | `http://...@127.0.0.1:8000/1` | Backend Bugsink DSN |
| `VITE_SENTRY_DSN` | `http://...@127.0.0.1:8000/2` | Frontend Bugsink DSN |
| Variable | Value | Purpose |
| ----------------- | ----------------------------- | --------------------------- |
| `TZ` | `America/Los_Angeles` | Timezone (PST) for all logs |
| `NODE_ENV` | `development` | Environment mode |
| `DB_HOST` | `postgres` | PostgreSQL hostname |
| `REDIS_URL` | `redis://redis:6379` | Redis connection URL |
| `FRONTEND_URL` | `https://localhost` | CORS origin |
| `SENTRY_DSN` | `http://...@127.0.0.1:8000/1` | Backend Bugsink DSN |
| `VITE_SENTRY_DSN` | `http://...@127.0.0.1:8000/2` | Frontend Bugsink DSN |
### Timezone Configuration
All dev container services are configured to use Pacific Time (America/Los_Angeles, PST/PDT) for consistent log timestamps:
| Service | Configuration | Notes |
| ---------- | ------------------------------------------------ | ------------------------------ |
| App | `TZ=America/Los_Angeles` in compose.dev.yml | Also set via dev-entrypoint.sh |
| PostgreSQL | `timezone` and `log_timezone` in postgres config | Logs timestamps in PST |
| Redis | `TZ=America/Los_Angeles` in compose.dev.yml | Alpine uses TZ env var |
| PM2 | `TZ` in ecosystem.dev.config.cjs | Pino timestamps use local time |
**Verifying Timezone**:
```bash
# Check container timezone
podman exec flyer-crawler-dev date
# Check PostgreSQL timezone
podman exec flyer-crawler-postgres psql -U postgres -c "SHOW timezone;"
# Check Redis log timestamps
MSYS_NO_PATHCONV=1 podman exec flyer-crawler-redis cat /var/log/redis/redis-server.log | head -5
```
**Note**: If you need UTC timestamps for production compatibility, set `TZ=UTC` in compose.dev.yml and restart containers.
---

View File

@@ -101,6 +101,7 @@ MSYS_NO_PATHCONV=1 podman exec flyer-crawler-dev ls -la /var/log/redis/
| NGINX logs missing | Output directory | `ls -lh /var/log/logstash/nginx-access-*.log` |
| Redis logs missing | Shared volume | Dev: Check `redis_logs` volume mounted; Prod: Check `/var/log/redis/redis-server.log` exists |
| High disk usage | Log rotation | Verify `/etc/logrotate.d/logstash` configured |
| varchar(7) error | Level validation | Add Ruby filter to validate/normalize `sentry_level` before output |
## Related Documentation

View File

@@ -11,6 +11,7 @@ This runbook provides step-by-step diagnostics and solutions for common Logstash
| Wrong Bugsink project | Environment detection failed | Verify `pg_database` field extraction |
| 403 authentication error | Missing/wrong DSN key | Check `X-Sentry-Auth` header |
| 500 error from Bugsink | Invalid event format | Verify `event_id` and required fields |
| varchar(7) constraint | Unresolved `%{sentry_level}` | Add Ruby filter for level validation |
---
@@ -385,7 +386,88 @@ systemctl status logstash
---
### Issue 7: Log File Rotation Issues
### Issue 7: Level Field Constraint Violation (varchar(7))
**Symptoms:**
- Bugsink returns HTTP 500 errors
- PostgreSQL errors: `value too long for type character varying(7)`
- Events fail to insert with literal `%{sentry_level}` string (15 characters)
**Root Cause:**
When Logstash cannot determine the log level (no error patterns matched), the `sentry_level` field remains as the unresolved placeholder `%{sentry_level}`. Bugsink's PostgreSQL schema has a `varchar(7)` constraint on the level field.
Valid Sentry levels (all <= 7 characters): `fatal`, `error`, `warning`, `info`, `debug`
**Diagnosis:**
```bash
# Check for HTTP 500 responses in Logstash logs
podman exec flyer-crawler-dev cat /var/log/logstash/logstash.log | grep "500"
# Check Bugsink for constraint violation errors
# Via MCP:
mcp__localerrors__list_issues({ project_id: 1, status: 'unresolved' })
```
**Solution:**
Add a Ruby filter block in `docker/logstash/bugsink.conf` to validate and normalize the `sentry_level` field before sending to Bugsink:
```ruby
# Add this AFTER all mutate filters that set sentry_level
# and BEFORE the output section
ruby {
code => '
level = event.get("sentry_level")
# Check if level is invalid (nil, empty, contains placeholder, or too long)
if level.nil? || level.to_s.empty? || level.to_s.include?("%{") || level.to_s.length > 7
# Default to "error" for error-tagged events, "info" otherwise
if event.get("tags")&.include?("error")
event.set("sentry_level", "error")
else
event.set("sentry_level", "info")
end
else
# Normalize to lowercase and validate
normalized = level.to_s.downcase
valid_levels = ["fatal", "error", "warning", "info", "debug"]
unless valid_levels.include?(normalized)
normalized = "error"
end
event.set("sentry_level", normalized)
end
'
}
```
**Key validations performed:**
1. Checks for nil or empty values
2. Detects unresolved placeholders (`%{...}`)
3. Enforces 7-character maximum length
4. Normalizes to lowercase
5. Validates against allowed Sentry levels
6. Defaults to "error" for error-tagged events, "info" otherwise
**Verification:**
```bash
# Restart Logstash
podman exec flyer-crawler-dev systemctl restart logstash
# Generate a test log that triggers the filter
podman exec flyer-crawler-dev pm2 restart flyer-crawler-api-dev
# Check no new HTTP 500 errors
podman exec flyer-crawler-dev cat /var/log/logstash/logstash.log | tail -50 | grep -E "(500|error)"
```
---
### Issue 8: Log File Rotation Issues
**Symptoms:**

View File

@@ -49,18 +49,24 @@ Bugsink is a lightweight, self-hosted error tracking platform that is fully comp
| Web UI | `https://localhost:8443` (nginx proxy) |
| Internal URL | `http://localhost:8000` (direct) |
| Credentials | `admin@localhost` / `admin` |
| Backend Project | Project ID 1 - `flyer-crawler-dev-backend` |
| Frontend Project | Project ID 2 - `flyer-crawler-dev-frontend` |
| Backend Project | Project ID 1 - `Backend API (Dev)` |
| Frontend Project | Project ID 2 - `Frontend (Dev)` |
| Infra Project | Project ID 4 - `Infrastructure (Dev)` |
| Backend DSN | `http://<key>@localhost:8000/1` |
| Frontend DSN | `http://<key>@localhost:8000/2` |
| Frontend DSN | `https://<key>@localhost/bugsink-api/2` (via nginx proxy) |
| Infra DSN | `http://<key>@localhost:8000/4` (Logstash only) |
| Database | `postgresql://bugsink:bugsink_dev_password@postgres:5432/bugsink` |
**Important:** The Frontend DSN uses an nginx proxy (`/bugsink-api/`) because the browser cannot reach `localhost:8000` directly (container-internal port). See [Frontend Nginx Proxy](#frontend-nginx-proxy) for details.
**Configuration Files:**
| File | Purpose |
| ----------------- | ----------------------------------------------------------------- |
| `compose.dev.yml` | Initial DSNs using `127.0.0.1:8000` (container startup) |
| `.env.local` | **OVERRIDES** compose.dev.yml with `localhost:8000` (app runtime) |
| File | Purpose |
| ------------------------------ | ------------------------------------------------------- |
| `compose.dev.yml` | Initial DSNs using `127.0.0.1:8000` (container startup) |
| `.env.local` | **OVERRIDES** compose.dev.yml (app runtime) |
| `docker/nginx/dev.conf` | Nginx proxy for Bugsink API (frontend error reporting) |
| `docker/logstash/bugsink.conf` | Log routing to Backend/Infrastructure projects |
**Note:** `.env.local` takes precedence over `compose.dev.yml` environment variables.
@@ -360,75 +366,127 @@ const config = {
---
## Frontend Nginx Proxy
The frontend Sentry SDK runs in the browser, which cannot directly reach `localhost:8000` (the Bugsink container-internal port). To solve this, we use an nginx proxy.
### How It Works
```text
Browser --HTTPS--> https://localhost/bugsink-api/2/store/
|
v (nginx proxy)
http://localhost:8000/api/2/store/
|
v
Bugsink (internal)
```
### Nginx Configuration
Location: `docker/nginx/dev.conf`
```nginx
# Proxy Bugsink Sentry API for frontend error reporting
location /bugsink-api/ {
proxy_pass http://localhost:8000/api/;
proxy_http_version 1.1;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
# Allow large error payloads with stack traces
client_max_body_size 10M;
# Timeouts for error reporting
proxy_connect_timeout 10s;
proxy_send_timeout 30s;
proxy_read_timeout 30s;
}
```
### Frontend DSN Format
```bash
# .env.local
# Uses nginx proxy path instead of direct port
VITE_SENTRY_DSN=https://<key>@localhost/bugsink-api/2
```
### Testing Frontend Error Reporting
1. Open browser console at `https://localhost`
2. Trigger a test error:
```javascript
throw new Error('Test frontend error from browser');
```
3. Check Bugsink Frontend (Dev) project for the error
4. Verify browser console shows Sentry SDK activity (if VITE_SENTRY_DEBUG=true)
---
## Logstash Integration
Logstash aggregates logs from multiple sources and forwards error patterns to Bugsink.
**Note:** See [ADR-015](../adr/0015-application-performance-monitoring-and-error-tracking.md) for the full architecture.
### 3-Project Architecture
Logstash routes errors to different Bugsink projects based on log source:
| Project | ID | Receives |
| -------------------- | --- | --------------------------------------------- |
| Backend API (Dev) | 1 | Pino app errors, PostgreSQL errors |
| Frontend (Dev) | 2 | Browser errors (via Sentry SDK, not Logstash) |
| Infrastructure (Dev) | 4 | Redis warnings, NGINX errors, Vite errors |
### Log Sources
| Source | Log Path | Error Detection |
| ---------- | ---------------------- | ------------------------- |
| Pino (app) | `/app/logs/*.log` | level >= 50 (error/fatal) |
| Redis | `/var/log/redis/*.log` | WARNING/ERROR log levels |
| PostgreSQL | (future) | ERROR/FATAL log levels |
| Source | Log Path | Project Destination | Error Detection |
| ---------- | --------------------------- | ------------------- | ------------------------- |
| PM2 API | `/var/log/pm2/api-*.log` | Backend (1) | level >= 50 (error/fatal) |
| PM2 Worker | `/var/log/pm2/worker-*.log` | Backend (1) | level >= 50 (error/fatal) |
| PM2 Vite | `/var/log/pm2/vite-*.log` | Infrastructure (4) | error keyword patterns |
| PostgreSQL | `/var/log/postgresql/*.log` | Backend (1) | ERROR/FATAL log levels |
| Redis | `/var/log/redis/*.log` | Infrastructure (4) | WARNING level (`#`) |
| NGINX | `/var/log/nginx/error.log` | Infrastructure (4) | error/crit/alert/emerg |
### Pipeline Configuration
**Location:** `/etc/logstash/conf.d/bugsink.conf`
**Location:** `/etc/logstash/conf.d/bugsink.conf` (or `docker/logstash/bugsink.conf` in project)
```conf
# === INPUTS ===
input {
file {
path => "/app/logs/*.log"
codec => json
type => "pino"
tags => ["app"]
}
The configuration:
file {
path => "/var/log/redis/*.log"
type => "redis"
tags => ["redis"]
}
1. **Inputs**: Reads from PM2 logs, PostgreSQL logs, Redis logs, NGINX logs
2. **Filters**: Detects errors and assigns tags based on log type
3. **Outputs**: Routes to appropriate Bugsink project based on log source
**Key Routing Logic:**
```ruby
# Infrastructure logs -> Project 4
if "error" in [tags] and ([type] == "redis" or [type] == "nginx_error" or [type] == "pm2_vite") {
http { url => "http://localhost:8000/api/4/store/" ... }
}
# === FILTERS ===
filter {
if [type] == "pino" and [level] >= 50 {
mutate { add_tag => ["error"] }
}
if [type] == "redis" {
grok {
match => { "message" => "%{POSINT:pid}:%{WORD:role} %{MONTHDAY} %{MONTH} %{TIME} %{WORD:loglevel} %{GREEDYDATA:redis_message}" }
}
if [loglevel] in ["WARNING", "ERROR"] {
mutate { add_tag => ["error"] }
}
}
}
# === OUTPUT ===
output {
if "error" in [tags] {
http {
url => "http://localhost:8000/api/store/"
http_method => "post"
format => "json"
}
}
# Backend logs -> Project 1
else if "error" in [tags] and ([type] in ["pm2_api", "pm2_worker", "pino", "postgres"]) {
http { url => "http://localhost:8000/api/1/store/" ... }
}
```
### Benefits
1. **Secondary Capture Path**: Catches errors before SDK initialization
2. **Log-Based Errors**: Captures errors that don't throw exceptions
3. **Infrastructure Monitoring**: Redis connection issues, slow commands
4. **Historical Analysis**: Process existing log files
1. **Separation of Concerns**: Application errors separate from infrastructure issues
2. **Secondary Capture Path**: Catches errors before SDK initialization
3. **Log-Based Errors**: Captures errors that don't throw exceptions
4. **Infrastructure Monitoring**: Redis, NGINX, build tooling issues
5. **Historical Analysis**: Process existing log files
---
@@ -743,6 +801,228 @@ podman exec flyer-crawler-dev psql -U postgres -h postgres -c "\l" | grep bugsin
ssh root@projectium.com "cd /opt/bugsink && bugsink-manage check"
```
### PostgreSQL Sequence Out of Sync (Duplicate Key Errors)
**Symptoms:**
- Bugsink throws `duplicate key value violates unique constraint "projects_project_pkey"`
- Error detail shows: `Key (id)=(1) already exists`
- New projects or other entities fail to create
**Root Cause:**
PostgreSQL sequences can become out of sync with actual data after:
- Manual data insertion or database seeding
- Restoring from backup
- Copying data between environments
The sequence generates IDs that already exist in the table.
**Diagnosis:**
```bash
# Dev Container - Check sequence vs max ID
podman exec flyer-crawler-dev psql -U bugsink -h postgres -d bugsink -c "
SELECT
(SELECT MAX(id) FROM projects_project) as max_id,
(SELECT last_value FROM projects_project_id_seq) as seq_last_value,
CASE
WHEN (SELECT MAX(id) FROM projects_project) <= (SELECT last_value FROM projects_project_id_seq)
THEN 'OK'
ELSE 'OUT OF SYNC - Needs reset'
END as status;
"
# Production
ssh root@projectium.com "cd /opt/bugsink && bugsink-manage dbshell" <<< "
SELECT MAX(id) as max_id, (SELECT last_value FROM projects_project_id_seq) as seq_value FROM projects_project;
"
```
**Solution:**
Reset the sequence to the maximum existing ID:
```bash
# Dev Container
podman exec flyer-crawler-dev psql -U bugsink -h postgres -d bugsink -c "
SELECT setval('projects_project_id_seq', COALESCE((SELECT MAX(id) FROM projects_project), 1), true);
"
# Production
ssh root@projectium.com "cd /opt/bugsink && bugsink-manage dbshell" <<< "
SELECT setval('projects_project_id_seq', COALESCE((SELECT MAX(id) FROM projects_project), 1), true);
"
```
**Verification:**
After running the fix, verify:
```bash
# Next ID should be max_id + 1
podman exec flyer-crawler-dev psql -U bugsink -h postgres -d bugsink -c "
SELECT nextval('projects_project_id_seq') - 1 as current_seq_value;
"
```
**Prevention:**
When manually inserting data or restoring backups, always reset sequences:
```sql
-- Generic pattern for any table/sequence
SELECT setval('SEQUENCE_NAME', COALESCE((SELECT MAX(id) FROM TABLE_NAME), 1), true);
-- Common Bugsink sequences that may need reset:
SELECT setval('projects_project_id_seq', COALESCE((SELECT MAX(id) FROM projects_project), 1), true);
SELECT setval('teams_team_id_seq', COALESCE((SELECT MAX(id) FROM teams_team), 1), true);
SELECT setval('releases_release_id_seq', COALESCE((SELECT MAX(id) FROM releases_release), 1), true);
```
### Logstash Level Field Constraint Violation
**Symptoms:**
- Bugsink errors: `value too long for type character varying(7)`
- Errors in Backend API project from Logstash
- Log shows `%{sentry_level}` literal string being sent
**Root Cause:**
Logstash sends the literal placeholder `%{sentry_level}` (15 characters) to Bugsink when:
- No error pattern is detected in the log message
- The `sentry_level` field is not properly initialized
- Bugsink's `level` column has a `varchar(7)` constraint
Valid Sentry levels are: `fatal`, `error`, `warning`, `info`, `debug` (all <= 7 characters).
**Diagnosis:**
```bash
# Check for recent level constraint errors in Bugsink
# Via MCP:
mcp__localerrors__list_issues({ project_id: 1, status: 'unresolved' })
# Or check Logstash logs for HTTP 500 responses
podman exec flyer-crawler-dev cat /var/log/logstash/logstash.log | grep "500"
```
**Solution:**
The fix requires updating the Logstash configuration (`docker/logstash/bugsink.conf`) to:
1. Validate `sentry_level` is not nil, empty, or contains placeholder text
2. Set a default value of "error" for any error-tagged event without a valid level
3. Normalize levels to lowercase
**Key filter block (Ruby):**
```ruby
ruby {
code => '
level = event.get("sentry_level")
# Check if level is invalid (nil, empty, contains placeholder, or invalid value)
if level.nil? || level.to_s.empty? || level.to_s.include?("%{") || level.to_s.length > 7
# Default to "error" for error-tagged events, "info" otherwise
if event.get("tags")&.include?("error")
event.set("sentry_level", "error")
else
event.set("sentry_level", "info")
end
else
# Normalize to lowercase and validate
normalized = level.to_s.downcase
valid_levels = ["fatal", "error", "warning", "info", "debug"]
unless valid_levels.include?(normalized)
normalized = "error"
end
event.set("sentry_level", normalized)
end
'
}
```
**Verification:**
After applying the fix:
1. Restart Logstash: `podman exec flyer-crawler-dev systemctl restart logstash`
2. Generate a test error and verify it appears in Bugsink without level errors
3. Check no new "value too long" errors appear in the project
### CSRF Verification Failed
**Symptoms:** "CSRF verification failed. Request aborted." error when performing actions in Bugsink UI (resolving issues, changing settings, etc.)
**Root Cause:**
Django 4.0+ requires `CSRF_TRUSTED_ORIGINS` to be explicitly configured for HTTPS POST requests. The error occurs because:
1. Bugsink is accessed via `https://localhost:8443` (nginx HTTPS proxy)
2. Django's CSRF protection validates the `Origin` header against `CSRF_TRUSTED_ORIGINS`
3. Without explicit configuration, Django rejects POST requests from HTTPS origins
**Why localhost vs 127.0.0.1 Matters:**
- `localhost` and `127.0.0.1` are treated as DIFFERENT origins by browsers
- If you access Bugsink via `https://localhost:8443`, Django must trust `https://localhost:8443`
- If you access via `https://127.0.0.1:8443`, Django must trust `https://127.0.0.1:8443`
- The fix includes BOTH to allow either access pattern
**Configuration (Already Applied):**
The Bugsink Django configuration in `Dockerfile.dev` includes:
```python
# CSRF Trusted Origins (Django 4.0+ requires full origin for HTTPS POST requests)
CSRF_TRUSTED_ORIGINS = [
"https://localhost:8443",
"https://127.0.0.1:8443",
"http://localhost:8000",
"http://127.0.0.1:8000",
]
# HTTPS proxy support (nginx reverse proxy on port 8443)
SECURE_PROXY_SSL_HEADER = ("HTTP_X_FORWARDED_PROTO", "https")
```
**Verification:**
```bash
# Verify CSRF_TRUSTED_ORIGINS is configured
podman exec flyer-crawler-dev sh -c 'cat /opt/bugsink/conf/bugsink_conf.py | grep -A 6 CSRF_TRUSTED'
# Expected output:
# CSRF_TRUSTED_ORIGINS = [
# "https://localhost:8443",
# "https://127.0.0.1:8443",
# "http://localhost:8000",
# "http://127.0.0.1:8000",
# ]
```
**If Issue Persists After Fix:**
1. **Rebuild the container image** (configuration is baked into the image):
```bash
podman-compose -f compose.dev.yml down
podman build -f Dockerfile.dev -t localhost/flyer-crawler-dev:latest .
podman-compose -f compose.dev.yml up -d
```
2. **Clear browser cookies** for localhost:8443
3. **Check nginx X-Forwarded-Proto header** - the nginx config must set this header for Django to recognize HTTPS:
```bash
podman exec flyer-crawler-dev cat /etc/nginx/sites-available/bugsink | grep X-Forwarded-Proto
# Should show: proxy_set_header X-Forwarded-Proto $scheme;
```
---
## Related Documentation

View File

@@ -44,6 +44,8 @@ if (missingVars.length > 0) {
// --- Shared Environment Variables ---
// These come from compose.dev.yml environment section
const sharedEnv = {
// Timezone: PST (America/Los_Angeles) for consistent log timestamps
TZ: process.env.TZ || 'America/Los_Angeles',
NODE_ENV: 'development',
DB_HOST: process.env.DB_HOST || 'postgres',
DB_PORT: process.env.DB_PORT || '5432',
@@ -160,6 +162,8 @@ module.exports = {
min_uptime: '5s',
// Environment
env: {
// Timezone: PST (America/Los_Angeles) for consistent log timestamps
TZ: process.env.TZ || 'America/Los_Angeles',
NODE_ENV: 'development',
// Vite-specific env vars (VITE_ prefix)
VITE_SENTRY_DSN: process.env.VITE_SENTRY_DSN,

4
package-lock.json generated
View File

@@ -1,12 +1,12 @@
{
"name": "flyer-crawler",
"version": "0.12.8",
"version": "0.12.14",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "flyer-crawler",
"version": "0.12.8",
"version": "0.12.14",
"dependencies": {
"@bull-board/api": "^6.14.2",
"@bull-board/express": "^6.14.2",

View File

@@ -1,7 +1,7 @@
{
"name": "flyer-crawler",
"private": true,
"version": "0.12.8",
"version": "0.12.14",
"type": "module",
"scripts": {
"dev": "concurrently \"npm:start:dev\" \"vite\"",
@@ -14,12 +14,12 @@
"start": "npm run start:prod",
"build": "vite build",
"preview": "vite preview",
"test": "node scripts/check-linux.js && cross-env NODE_ENV=test tsx ./node_modules/vitest/vitest.mjs run",
"test-wsl": "cross-env NODE_ENV=test vitest run",
"test": "node scripts/check-linux.js && cross-env NODE_ENV=test TZ= tsx ./node_modules/vitest/vitest.mjs run",
"test-wsl": "cross-env NODE_ENV=test TZ= vitest run",
"test:coverage": "npm run clean && npm run test:unit -- --coverage && npm run test:integration -- --coverage",
"test:unit": "node scripts/check-linux.js && cross-env NODE_ENV=test tsx --max-old-space-size=8192 ./node_modules/vitest/vitest.mjs run --project unit -c vite.config.ts",
"test:integration": "node scripts/check-linux.js && cross-env NODE_ENV=test tsx --max-old-space-size=8192 ./node_modules/vitest/vitest.mjs run --project integration -c vitest.config.integration.ts",
"test:e2e": "node scripts/check-linux.js && cross-env NODE_ENV=test tsx --max-old-space-size=8192 ./node_modules/vitest/vitest.mjs run --config vitest.config.e2e.ts",
"test:unit": "node scripts/check-linux.js && cross-env NODE_ENV=test TZ= tsx --max-old-space-size=8192 ./node_modules/vitest/vitest.mjs run --project unit -c vite.config.ts",
"test:integration": "node scripts/check-linux.js && cross-env NODE_ENV=test TZ= tsx --max-old-space-size=8192 ./node_modules/vitest/vitest.mjs run --project integration -c vitest.config.integration.ts",
"test:e2e": "node scripts/check-linux.js && cross-env NODE_ENV=test TZ= tsx --max-old-space-size=8192 ./node_modules/vitest/vitest.mjs run --config vitest.config.e2e.ts",
"format": "prettier --write .",
"lint": "eslint . --ext ts,tsx --report-unused-disable-directives --max-warnings 0",
"type-check": "tsc --noEmit",

View File

@@ -23,6 +23,26 @@ set -e
echo "Starting Flyer Crawler Dev Container..."
# ============================================================================
# Timezone Configuration
# ============================================================================
# Ensure TZ is set for consistent log timestamps across all services.
# TZ should be set via compose.dev.yml environment (default: America/Los_Angeles)
# ============================================================================
# Apply the configured timezone (TZ comes from compose.dev.yml, default
# America/Los_Angeles) so all services emit log timestamps consistently.
if [ -z "$TZ" ]; then
  echo "Warning: TZ environment variable not set, using container default timezone"
else
  echo "Timezone configured: $TZ"
  # Point /etc/localtime at the matching zoneinfo entry so tools that do not
  # read TZ (e.g. `date` in some configurations) agree with the env var.
  if [ -f "/usr/share/zoneinfo/$TZ" ]; then
    ln -sf "/usr/share/zoneinfo/$TZ" /etc/localtime
    echo "$TZ" > /etc/timezone
    echo "System timezone set to: $(date +%Z) ($(date))"
  else
    echo "Warning: Timezone data not found for $TZ, using TZ environment variable only"
  fi
fi
# Configure Bugsink HTTPS (ADR-015)
echo "Configuring Bugsink HTTPS..."
mkdir -p /etc/bugsink/ssl

View File

@@ -27,9 +27,13 @@ const defaultProps = {
};
const setupSuccessMocks = () => {
// The API returns {success, data: {userprofile, token}}, and the mutation extracts .data
const mockAuthResponse = {
userprofile: createMockUserProfile({ user: { user_id: '123', email: 'test@example.com' } }),
token: 'mock-token',
success: true,
data: {
userprofile: createMockUserProfile({ user: { user_id: '123', email: 'test@example.com' } }),
token: 'mock-token',
},
};
(mockedApiClient.loginUser as Mock).mockResolvedValue(
new Response(JSON.stringify(mockAuthResponse)),

View File

@@ -82,7 +82,11 @@ const defaultAuthenticatedProps = {
};
const setupSuccessMocks = () => {
const mockAuthResponse = { userprofile: authenticatedProfile, token: 'mock-token' };
// The API returns {success, data: {userprofile, token}}, and the mutation extracts .data
const mockAuthResponse = {
success: true,
data: { userprofile: authenticatedProfile, token: 'mock-token' },
};
(mockedApiClient.loginUser as Mock).mockResolvedValue(
new Response(JSON.stringify(mockAuthResponse)),
);

View File

@@ -619,4 +619,240 @@ describe('Health Routes (/api/health)', () => {
expect(response.body.error.details.database.message).toBe('Database connection failed');
});
});
// =============================================================================
// QUEUE HEALTH MONITORING (ADR-053)
// =============================================================================
describe('GET /queues', () => {
  beforeEach(async () => {
    vi.resetModules();
    // Re-import after mocks are set up
  });

  /** Job counts representing an idle, healthy queue. */
  const idleCounts = { waiting: 0, active: 0, failed: 0, delayed: 0 };

  /** Builds a queue stub whose getJobCounts() resolves with `counts`. */
  const createHealthyQueue = (counts = idleCounts) => ({
    getJobCounts: vi.fn().mockResolvedValue(counts),
  });

  /** Names of every queue export monitored by the /queues endpoint. */
  const queueExportNames = [
    'flyerQueue',
    'emailQueue',
    'analyticsQueue',
    'weeklyAnalyticsQueue',
    'cleanupQueue',
    'tokenCleanupQueue',
    'receiptQueue',
    'expiryAlertQueue',
    'barcodeQueue',
  ] as const;

  /**
   * Spies on every queue getter in queues.server, substituting `defaultQueue`
   * for each export unless a per-name override is supplied (used to simulate
   * a single failing queue). Replaces 9 duplicated vi.spyOn lines per test.
   */
  const mockAllQueues = async (
    defaultQueue: unknown,
    overrides: Partial<Record<(typeof queueExportNames)[number], unknown>> = {},
  ) => {
    const queuesModule = await import('../services/queues.server');
    for (const name of queueExportNames) {
      vi.spyOn(queuesModule, name, 'get').mockReturnValue(
        (overrides[name] ?? defaultQueue) as never,
      );
    }
  };

  it('should return 200 OK with queue metrics and worker heartbeats when all healthy', async () => {
    // Arrange: every queue has activity and every worker heartbeat is fresh.
    await mockAllQueues(createHealthyQueue({ waiting: 5, active: 2, failed: 1, delayed: 0 }));
    // Heartbeat last seen 10 seconds ago (< 60s threshold => alive).
    const recentTimestamp = new Date(Date.now() - 10000).toISOString();
    const heartbeatValue = JSON.stringify({
      timestamp: recentTimestamp,
      pid: 1234,
      host: 'test-host',
    });
    mockedRedisConnection.get = vi.fn().mockResolvedValue(heartbeatValue);

    // Act
    const response = await supertest(app).get('/api/health/queues');

    // Assert
    expect(response.status).toBe(200);
    expect(response.body.success).toBe(true);
    expect(response.body.data.status).toBe('healthy');
    expect(response.body.data.queues).toBeDefined();
    expect(response.body.data.workers).toBeDefined();
    // Verify queue metrics structure
    expect(response.body.data.queues['flyer-processing']).toEqual({
      waiting: 5,
      active: 2,
      failed: 1,
      delayed: 0,
    });
    // Verify worker heartbeat structure
    expect(response.body.data.workers['flyer-processing']).toEqual({
      alive: true,
      lastSeen: recentTimestamp,
      pid: 1234,
      host: 'test-host',
    });
  });

  it('should return 503 when a queue is unavailable', async () => {
    // Arrange: flyerQueue rejects while all other queues are healthy.
    const failingQueue = {
      getJobCounts: vi.fn().mockRejectedValue(new Error('Redis connection lost')),
    };
    await mockAllQueues(createHealthyQueue(), { flyerQueue: failingQueue });
    mockedRedisConnection.get = vi.fn().mockResolvedValue(null);

    // Act
    const response = await supertest(app).get('/api/health/queues');

    // Assert
    expect(response.status).toBe(503);
    expect(response.body.success).toBe(false);
    expect(response.body.error.message).toBe('One or more queues or workers unavailable');
    expect(response.body.error.details.status).toBe('unhealthy');
    expect(response.body.error.details.queues['flyer-processing']).toEqual({
      error: 'Redis connection lost',
    });
  });

  it('should return 503 when a worker heartbeat is stale', async () => {
    // Arrange: queues healthy, but the first heartbeat lookup returns a
    // stale entry (> 60s old) and the rest return null.
    await mockAllQueues(createHealthyQueue());
    const staleTimestamp = new Date(Date.now() - 120000).toISOString(); // 120 seconds ago
    const staleHeartbeat = JSON.stringify({
      timestamp: staleTimestamp,
      pid: 1234,
      host: 'test-host',
    });
    let callCount = 0;
    mockedRedisConnection.get = vi.fn().mockImplementation(() => {
      callCount++;
      return Promise.resolve(callCount === 1 ? staleHeartbeat : null);
    });

    // Act
    const response = await supertest(app).get('/api/health/queues');

    // Assert
    expect(response.status).toBe(503);
    expect(response.body.success).toBe(false);
    expect(response.body.error.details.status).toBe('unhealthy');
    expect(response.body.error.details.workers['flyer-processing']).toEqual({ alive: false });
  });

  it('should return 503 when worker heartbeat is missing', async () => {
    // Arrange: queues healthy but Redis has no heartbeat keys at all.
    await mockAllQueues(createHealthyQueue());
    mockedRedisConnection.get = vi.fn().mockResolvedValue(null);

    // Act
    const response = await supertest(app).get('/api/health/queues');

    // Assert
    expect(response.status).toBe(503);
    expect(response.body.success).toBe(false);
    expect(response.body.error.details.status).toBe('unhealthy');
    expect(response.body.error.details.workers['flyer-processing']).toEqual({ alive: false });
  });

  it('should handle Redis connection errors gracefully', async () => {
    // Arrange: queues succeed but every Redis heartbeat lookup throws.
    await mockAllQueues(createHealthyQueue());
    mockedRedisConnection.get = vi.fn().mockRejectedValue(new Error('Redis connection lost'));

    // Act
    const response = await supertest(app).get('/api/health/queues');

    // Assert: queue metrics still reported, workers marked unhealthy => 503.
    expect(response.status).toBe(503);
    expect(response.body.error.details.queues['flyer-processing']).toEqual({
      waiting: 0,
      active: 0,
      failed: 0,
      delayed: 0,
    });
    expect(response.body.error.details.workers['flyer-processing']).toEqual({
      alive: false,
      error: 'Redis connection lost',
    });
  });
});
});

View File

@@ -15,6 +15,17 @@ import fs from 'node:fs/promises';
import { getSimpleWeekAndYear } from '../utils/dateUtils';
import { validateRequest } from '../middleware/validation.middleware';
import { sendSuccess, sendError, ErrorCode } from '../utils/apiResponse';
import {
flyerQueue,
emailQueue,
analyticsQueue,
weeklyAnalyticsQueue,
cleanupQueue,
tokenCleanupQueue,
receiptQueue,
expiryAlertQueue,
barcodeQueue,
} from '../services/queues.server';
const router = Router();
@@ -442,4 +453,224 @@ router.get(
},
);
// =============================================================================
// QUEUE HEALTH MONITORING (ADR-053)
// =============================================================================
/**
* @openapi
* /health/queues:
* get:
* summary: Queue health and metrics with worker heartbeats
* description: |
* Returns job counts for all BullMQ queues and worker heartbeat status.
* Use this endpoint to monitor queue depths and detect stuck/frozen workers.
* Implements ADR-053: Worker Health Checks and Stalled Job Monitoring.
* tags:
* - Health
* responses:
* 200:
* description: Queue metrics and worker heartbeats retrieved successfully
* content:
* application/json:
* schema:
* type: object
* properties:
* success:
* type: boolean
* example: true
* data:
* type: object
* properties:
* status:
* type: string
* enum: [healthy, unhealthy]
* timestamp:
* type: string
* format: date-time
* queues:
* type: object
* additionalProperties:
* type: object
* properties:
* waiting:
* type: integer
* active:
* type: integer
* failed:
* type: integer
* delayed:
* type: integer
* workers:
* type: object
* additionalProperties:
* type: object
* properties:
* alive:
* type: boolean
* lastSeen:
* type: string
* format: date-time
* pid:
* type: integer
* host:
* type: string
* 503:
* description: Redis unavailable or workers not responding
* content:
* application/json:
* schema:
* $ref: '#/components/schemas/ErrorResponse'
*/
router.get(
  '/queues',
  validateRequest(emptySchema),
  async (req: Request, res: Response, next: NextFunction) => {
    try {
      // All BullMQ queues monitored by this endpoint (ADR-053).
      const queues = [
        { name: 'flyer-processing', queue: flyerQueue },
        { name: 'email-sending', queue: emailQueue },
        { name: 'analytics-reporting', queue: analyticsQueue },
        { name: 'weekly-analytics-reporting', queue: weeklyAnalyticsQueue },
        { name: 'file-cleanup', queue: cleanupQueue },
        { name: 'token-cleanup', queue: tokenCleanupQueue },
        { name: 'receipt-processing', queue: receiptQueue },
        { name: 'expiry-alerts', queue: expiryAlertQueue },
        { name: 'barcode-detection', queue: barcodeQueue },
      ];
      // Fetch job counts for all queues in parallel. A failure on one queue
      // is captured per-queue so the response still reports the others.
      const queueMetrics = await Promise.all(
        queues.map(async ({ name, queue }) => {
          try {
            const counts = await queue.getJobCounts();
            return {
              name,
              counts: {
                waiting: counts.waiting || 0,
                active: counts.active || 0,
                failed: counts.failed || 0,
                delayed: counts.delayed || 0,
              },
            };
          } catch (error) {
            // If individual queue fails, return error state
            return {
              name,
              error: error instanceof Error ? error.message : 'Unknown error',
            };
          }
        }),
      );
      // Fetch worker heartbeats in parallel. Each worker writes a
      // `worker:heartbeat:<name>` key to Redis containing timestamp/pid/host.
      const workerNames = queues.map((q) => q.name);
      const workerHeartbeats = await Promise.all(
        workerNames.map(async (name) => {
          try {
            const key = `worker:heartbeat:${name}`;
            const value = await redisConnection.get(key);
            if (!value) {
              // No heartbeat key => worker never started or key expired.
              return { name, alive: false };
            }
            const heartbeat = JSON.parse(value) as {
              timestamp: string;
              pid: number;
              host: string;
            };
            const lastSeenMs = new Date(heartbeat.timestamp).getTime();
            const nowMs = Date.now();
            const ageSeconds = (nowMs - lastSeenMs) / 1000;
            // Consider alive if last heartbeat < 60 seconds ago.
            const alive = ageSeconds < 60;
            return {
              name,
              alive,
              lastSeen: heartbeat.timestamp,
              pid: heartbeat.pid,
              host: heartbeat.host,
            };
          } catch (error) {
            // Heartbeat lookup failed (e.g. Redis error or bad JSON).
            return {
              name,
              alive: false,
              error: error instanceof Error ? error.message : 'Unknown error',
            };
          }
        }),
      );
      // Build response objects and track overall health.
      const queuesData: Record<
        string,
        { waiting: number; active: number; failed: number; delayed: number } | { error: string }
      > = {};
      const workersData: Record<
        string,
        | { alive: boolean; lastSeen?: string; pid?: number; host?: string }
        | { alive: boolean; error: string }
      > = {};
      let hasErrors = false;
      for (const metric of queueMetrics) {
        if ('error' in metric) {
          queuesData[metric.name] = { error: metric.error };
          hasErrors = true;
        } else {
          queuesData[metric.name] = metric.counts;
        }
      }
      for (const heartbeat of workerHeartbeats) {
        if ('error' in heartbeat) {
          workersData[heartbeat.name] = { alive: false, error: heartbeat.error };
          // Bug fix: a failed heartbeat lookup means the worker cannot be
          // confirmed alive, so the service must report unhealthy (503).
          // Previously this branch left hasErrors false, returning 200.
          hasErrors = true;
        } else if (!heartbeat.alive) {
          workersData[heartbeat.name] = { alive: false };
          hasErrors = true;
        } else {
          workersData[heartbeat.name] = {
            alive: heartbeat.alive,
            lastSeen: heartbeat.lastSeen,
            pid: heartbeat.pid,
            host: heartbeat.host,
          };
        }
      }
      const response = {
        status: hasErrors ? ('unhealthy' as const) : ('healthy' as const),
        timestamp: new Date().toISOString(),
        queues: queuesData,
        workers: workersData,
      };
      if (hasErrors) {
        return sendError(
          res,
          ErrorCode.SERVICE_UNAVAILABLE,
          'One or more queues or workers unavailable',
          503,
          response,
        );
      }
      return sendSuccess(res, response);
    } catch (error: unknown) {
      // Redis connection error or other unexpected failure.
      if (error instanceof Error) {
        return next(error);
      }
      const message =
        (error as { message?: string })?.message || 'Failed to retrieve queue metrics';
      return next(new Error(message));
    }
  },
);
export default router;

View File

@@ -132,7 +132,8 @@ describe('API Client', () => {
.mockResolvedValueOnce({
ok: true,
status: 200,
json: () => Promise.resolve({ token: 'new-refreshed-token' }),
// The API returns {success, data: {token}} wrapper format
json: () => Promise.resolve({ success: true, data: { token: 'new-refreshed-token' } }),
} as Response)
.mockResolvedValueOnce({
ok: true,
@@ -218,7 +219,7 @@ describe('API Client', () => {
localStorage.setItem('authToken', 'expired-token');
// Mock the global fetch to return a sequence of responses:
// 1. 401 Unauthorized (initial API call)
// 2. 200 OK (token refresh call)
// 2. 200 OK (token refresh call) - uses API wrapper format {success, data: {token}}
// 3. 200 OK (retry of the initial API call)
vi.mocked(global.fetch)
.mockResolvedValueOnce({
@@ -229,7 +230,8 @@ describe('API Client', () => {
.mockResolvedValueOnce({
ok: true,
status: 200,
json: () => Promise.resolve({ token: 'new-refreshed-token' }),
// The API returns {success, data: {token}} wrapper format
json: () => Promise.resolve({ success: true, data: { token: 'new-refreshed-token' } }),
} as Response)
.mockResolvedValueOnce({
ok: true,

View File

@@ -62,12 +62,33 @@ vi.mock('./logger.server', () => ({
vi.mock('bullmq', () => ({
Worker: mocks.MockWorker,
Queue: vi.fn(function () {
return { add: vi.fn() };
return { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) };
}),
// Add UnrecoverableError to the mock so it can be used in tests
UnrecoverableError: class UnrecoverableError extends Error {},
}));
// Mock redis.server to prevent real Redis connection attempts
vi.mock('./redis.server', () => ({
connection: {
on: vi.fn(),
quit: vi.fn().mockResolvedValue(undefined),
},
}));
// Mock queues.server to provide mock queue instances
vi.mock('./queues.server', () => ({
flyerQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
emailQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
analyticsQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
cleanupQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
weeklyAnalyticsQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
tokenCleanupQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
receiptQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
expiryAlertQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
barcodeQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
}));
// Mock flyerProcessingService.server as flyerWorker and cleanupWorker depend on it
vi.mock('./flyerProcessingService.server', () => {
// Mock the constructor to return an object with the mocked methods
@@ -88,6 +109,67 @@ vi.mock('./flyerDataTransformer', () => ({
},
}));
// Mock aiService.server to prevent initialization issues
vi.mock('./aiService.server', () => ({
aiService: {
extractAndValidateData: vi.fn(),
},
}));
// Mock db/index.db to prevent database connections
vi.mock('./db/index.db', () => ({
personalizationRepo: {},
}));
// Mock flyerAiProcessor.server
vi.mock('./flyerAiProcessor.server', () => ({
FlyerAiProcessor: vi.fn().mockImplementation(function () {
return { processFlyer: vi.fn() };
}),
}));
// Mock flyerPersistenceService.server
vi.mock('./flyerPersistenceService.server', () => ({
FlyerPersistenceService: vi.fn().mockImplementation(function () {
return { persistFlyerData: vi.fn() };
}),
}));
// Mock db/connection.db to prevent database connections
vi.mock('./db/connection.db', () => ({
withTransaction: vi.fn(),
}));
// Mock receiptService.server
vi.mock('./receiptService.server', () => ({
processReceiptJob: vi.fn().mockResolvedValue(undefined),
}));
// Mock expiryService.server
vi.mock('./expiryService.server', () => ({
processExpiryAlertJob: vi.fn().mockResolvedValue(undefined),
}));
// Mock barcodeService.server
vi.mock('./barcodeService.server', () => ({
processBarcodeDetectionJob: vi.fn().mockResolvedValue(undefined),
}));
// Mock flyerFileHandler.server
vi.mock('./flyerFileHandler.server', () => ({
FlyerFileHandler: vi.fn().mockImplementation(function () {
return { handleFile: vi.fn() };
}),
}));
// Mock workerOptions config
vi.mock('../config/workerOptions', () => ({
defaultWorkerOptions: {
lockDuration: 30000,
stalledInterval: 30000,
},
}));
// Helper to create a mock BullMQ Job object
const createMockJob = <T>(data: T): Job<T> => {
return {

View File

@@ -3,6 +3,7 @@ import { Worker, Job } from 'bullmq';
import fsPromises from 'node:fs/promises';
import { exec } from 'child_process';
import { promisify } from 'util';
import os from 'os';
import { logger } from './logger.server';
import { connection } from './redis.server';
@@ -91,6 +92,45 @@ const createWorkerProcessor = <T, R>(processor: (job: Job<T>) => Promise<R>) =>
};
};
/**
 * Publishes this worker's heartbeat to Redis under `worker:heartbeat:<name>`.
 * The payload carries the current timestamp, PID, and hostname so health
 * checks can distinguish a frozen worker from a dead one (ADR-053).
 * The key carries a 90s TTL, so a worker that stops refreshing it expires.
 */
const updateWorkerHeartbeat = async (workerName: string) => {
  const heartbeatKey = `worker:heartbeat:${workerName}`;
  const payload = {
    timestamp: new Date().toISOString(),
    pid: process.pid,
    host: os.hostname(),
  };
  try {
    // EX 90: expire the key after 90 seconds if not refreshed.
    await connection.set(heartbeatKey, JSON.stringify(payload), 'EX', 90);
  } catch (error) {
    // A failed heartbeat write is logged but must never crash the worker.
    logger.error({ err: error, workerName }, `Failed to update heartbeat for worker ${workerName}`);
  }
};
/**
 * Begins periodic heartbeat publishing for a worker.
 * Writes one heartbeat immediately, then refreshes every 30 seconds; each
 * write carries a 90s TTL, so two consecutive missed beats expire the key.
 */
const startWorkerHeartbeat = (worker: Worker) => {
  // Publish right away so the worker registers as alive without waiting
  // for the first interval tick.
  void updateWorkerHeartbeat(worker.name);

  // Refresh on a fixed 30-second cadence.
  const heartbeatTimer = setInterval(() => {
    void updateWorkerHeartbeat(worker.name);
  }, 30000);

  // Stash the timer on the worker instance so shutdown code can clear it.
  (worker as unknown as { heartbeatInterval?: NodeJS.Timeout }).heartbeatInterval =
    heartbeatTimer;
};
const attachWorkerEventListeners = (worker: Worker) => {
worker.on('completed', (job: Job, returnValue: unknown) => {
logger.info({ returnValue }, `[${worker.name}] Job ${job.id} completed successfully.`);
@@ -102,6 +142,9 @@ const attachWorkerEventListeners = (worker: Worker) => {
`[${worker.name}] Job ${job?.id} has ultimately failed after all attempts.`,
);
});
// Start heartbeat monitoring for this worker
startWorkerHeartbeat(worker);
};
export const flyerWorker = new Worker<FlyerJobData>(
@@ -219,17 +262,28 @@ const SHUTDOWN_TIMEOUT = 30000; // 30 seconds
* without exiting the process.
*/
export const closeWorkers = async () => {
  const allWorkers = [
    flyerWorker,
    emailWorker,
    analyticsWorker,
    cleanupWorker,
    weeklyAnalyticsWorker,
    tokenCleanupWorker,
    receiptWorker,
    expiryAlertWorker,
    barcodeWorker,
  ];

  // Stop the periodic heartbeat timers stashed on each worker instance
  // before closing, so no further Redis writes occur during shutdown.
  for (const worker of allWorkers) {
    const timer = (worker as unknown as { heartbeatInterval?: NodeJS.Timeout })
      .heartbeatInterval;
    if (timer) {
      clearInterval(timer);
    }
  }

  // Close every worker concurrently.
  await Promise.all(allWorkers.map((worker) => worker.close()));
};
export const gracefulShutdown = async (signal: string) => {