Compare commits

13 Commits

| SHA1       |
| ---------- |
| e86ce51b6c |
| 840a7a62d3 |
| 5720820d95 |
| e5cdb54308 |
| a3f212ff81 |
| de263f74b0 |
| a71e41302b |
| 3575803252 |
| d03900cefe |
| 6d49639845 |
| d4543cf4b9 |
| 4f08238698 |
| 38b35f87aa |
@@ -117,7 +117,11 @@
      "Bash(git -C \"C:\\\\Users\\\\games3\\\\.claude\\\\plugins\\\\marketplaces\\\\claude-plugins-official\" fetch --dry-run -v)",
      "mcp__localerrors__get_project",
      "mcp__localerrors__get_issue",
      "mcp__localerrors__get_event"
      "mcp__localerrors__get_event",
      "mcp__localerrors__list_teams",
      "WebSearch",
      "Bash(for trigger in update_price_history_on_flyer_item_insert update_recipe_rating_aggregates log_new_recipe log_new_flyer)",
      "Bash(do echo \"=== $trigger ===\")"
    ]
  },
  "enabledMcpjsonServers": [
17 .env.example

@@ -94,11 +94,18 @@ WORKER_LOCK_DURATION=120000

# Error Tracking (ADR-015)
# ===================
# Sentry-compatible error tracking via Bugsink (self-hosted)
# DSNs are created in Bugsink UI at http://localhost:8000 (dev) or your production URL
# Backend DSN - for Express/Node.js errors
SENTRY_DSN=
# Frontend DSN - for React/browser errors (uses VITE_ prefix)
VITE_SENTRY_DSN=
# DSNs are created in Bugsink UI at https://localhost:8443 (dev) or your production URL
#
# Dev container projects:
# - Project 1: Backend API (Dev) - receives Pino, PostgreSQL errors
# - Project 2: Frontend (Dev) - receives browser errors via Sentry SDK
# - Project 4: Infrastructure (Dev) - receives Redis, NGINX, Vite errors
#
# Backend DSN - for Express/Node.js errors (internal container URL)
SENTRY_DSN=http://<key>@localhost:8000/1
# Frontend DSN - for React/browser errors (uses nginx proxy for browser access)
# Note: Browsers cannot reach localhost:8000 directly, so we use nginx proxy at /bugsink-api/
VITE_SENTRY_DSN=https://<key>@localhost/bugsink-api/2
# Environment name for error grouping (defaults to NODE_ENV)
SENTRY_ENVIRONMENT=development
VITE_SENTRY_ENVIRONMENT=development
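For orientation, a minimal sketch of how the backend might consume these variables with a Sentry-compatible SDK. The `@sentry/node` package and the bootstrap file name are assumptions, not part of this diff; the variable names come from `.env.example` above.

```typescript
// instrument.ts (hypothetical bootstrap file) - a sketch, not the project's actual setup.
import * as Sentry from '@sentry/node';

Sentry.init({
  // An empty/undefined DSN simply disables reporting, so the blank defaults above are safe.
  dsn: process.env.SENTRY_DSN || undefined,
  environment: process.env.SENTRY_ENVIRONMENT || process.env.NODE_ENV || 'development',
});
```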
53 CLAUDE.md

@@ -123,23 +123,30 @@ The dev container now matches production by using PM2 for process management.

### Log Aggregation (ADR-050)

All logs flow to Bugsink via Logstash:
All logs flow to Bugsink via Logstash with 3-project routing:

| Source            | Log Location                      | Status |
| ----------------- | --------------------------------- | ------ |
| Backend (Pino)    | `/var/log/pm2/api-*.log`          | Active |
| Worker (Pino)     | `/var/log/pm2/worker-*.log`       | Active |
| Vite              | `/var/log/pm2/vite-*.log`         | Active |
| PostgreSQL        | `/var/log/postgresql/*.log`       | Active |
| Redis             | `/var/log/redis/redis-server.log` | Active |
| NGINX             | `/var/log/nginx/*.log`            | Active |
| Frontend (Sentry) | Browser -> Bugsink SDK            | Active |
| Source            | Log Location                      | Bugsink Project    |
| ----------------- | --------------------------------- | ------------------ |
| Backend (Pino)    | `/var/log/pm2/api-*.log`          | Backend API (1)    |
| Worker (Pino)     | `/var/log/pm2/worker-*.log`       | Backend API (1)    |
| PostgreSQL        | `/var/log/postgresql/*.log`       | Backend API (1)    |
| Vite              | `/var/log/pm2/vite-*.log`         | Infrastructure (4) |
| Redis             | `/var/log/redis/redis-server.log` | Infrastructure (4) |
| NGINX             | `/var/log/nginx/*.log`            | Infrastructure (4) |
| Frontend (Sentry) | Browser -> nginx proxy            | Frontend (2)       |

**Bugsink Projects (Dev Container)**:

- Project 1: Backend API (Dev) - Application errors
- Project 2: Frontend (Dev) - Browser errors via nginx proxy
- Project 4: Infrastructure (Dev) - Redis, NGINX, Vite errors

**Key Files**:

- `ecosystem.dev.config.cjs` - PM2 development configuration
- `scripts/dev-entrypoint.sh` - Container startup script
- `docker/logstash/bugsink.conf` - Logstash pipeline configuration
- `docker/nginx/dev.conf` - NGINX config with Bugsink API proxy

**Full Dev Container Guide**: See [docs/development/DEV-CONTAINER.md](docs/development/DEV-CONTAINER.md)
@@ -215,6 +222,7 @@ Common issues with solutions:

4. **Filename collisions** - Multer predictable names → Use `${Date.now()}-${Math.round(Math.random() * 1e9)}` (see the sketch below)
5. **Response format mismatches** - API format changes → Log response bodies, update assertions
6. **External service failures** - PM2/Redis unavailable → try/catch with graceful degradation
7. **TZ environment variable breaks async hooks** - `TZ=America/Los_Angeles` causes `RangeError: Invalid triggerAsyncId value: NaN` → Tests now explicitly set `TZ=` (empty) in package.json scripts

**Full Details**: See test issues section at end of this document or [docs/development/TESTING.md](docs/development/TESTING.md)
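To illustrate item 4, a sketch of the collision-resistant filename pattern inside a Multer disk-storage callback. Only the `${Date.now()}-${Math.round(Math.random() * 1e9)}` pattern comes from the list above; the destination path and export name are placeholders.

```typescript
import multer from 'multer';
import path from 'node:path';

// Sketch: unique, non-predictable filenames so parallel test uploads cannot collide.
const storage = multer.diskStorage({
  destination: 'uploads/', // placeholder path
  filename: (_req, file, cb) => {
    const unique = `${Date.now()}-${Math.round(Math.random() * 1e9)}`;
    cb(null, `${unique}${path.extname(file.originalname)}`);
  },
});

export const upload = multer({ storage });
```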
@@ -370,3 +378,28 @@ API formats change: `data.jobId` vs `data.job.id`, nested vs flat, string vs number

PM2/Redis health checks fail when unavailable.

**Solution**: try/catch with graceful degradation or mock

### 7. TZ Environment Variable Breaking Async Hooks

**Problem**: When `TZ=America/Los_Angeles` (or another timezone value) is set in the environment, the Node.js async_hooks module can produce `RangeError: Invalid triggerAsyncId value: NaN`. This breaks React Testing Library's `render()` function, which uses async hooks internally.

**Root Cause**: Setting `TZ` to certain timezone values interferes with Node.js's internal async tracking mechanism, causing invalid async IDs to be generated.

**Symptoms**:

```text
RangeError: Invalid triggerAsyncId value: NaN
 ❯ process.env.NODE_ENV.queueSeveralMicrotasks node_modules/react/cjs/react.development.js:751:15
 ❯ process.env.NODE_ENV.exports.act node_modules/react/cjs/react.development.js:886:11
 ❯ node_modules/@testing-library/react/dist/act-compat.js:46:25
 ❯ renderRoot node_modules/@testing-library/react/dist/pure.js:189:26
```

**Solution**: Explicitly unset `TZ` in all test scripts by adding `TZ=` (empty value) to cross-env:

```json
"test:unit": "cross-env NODE_ENV=test TZ= tsx ..."
"test:integration": "cross-env NODE_ENV=test TZ= tsx ..."
```

**Context**: This issue was introduced in commit `d03900c`, which added `TZ: 'America/Los_Angeles'` to the PM2 ecosystem configs for consistent log timestamps in production/dev environments. Tests must explicitly override this to prevent the async hooks error.
@@ -174,6 +174,21 @@ BUGSINK = {\n\
}\n\
\n\
ALLOWED_HOSTS = deduce_allowed_hosts(BUGSINK["BASE_URL"])\n\
# Also allow 127.0.0.1 access (both localhost and 127.0.0.1 should work)\n\
if "127.0.0.1" not in ALLOWED_HOSTS:\n\
    ALLOWED_HOSTS.append("127.0.0.1")\n\
if "localhost" not in ALLOWED_HOSTS:\n\
    ALLOWED_HOSTS.append("localhost")\n\
\n\
# CSRF Trusted Origins (Django 4.0+ requires full origin for HTTPS POST requests)\n\
# This fixes "CSRF verification failed" errors when accessing Bugsink via HTTPS\n\
# Both localhost and 127.0.0.1 must be trusted to support different access patterns\n\
CSRF_TRUSTED_ORIGINS = [\n\
    "https://localhost:8443",\n\
    "https://127.0.0.1:8443",\n\
    "http://localhost:8000",\n\
    "http://127.0.0.1:8000",\n\
]\n\
\n\
# Console email backend for dev\n\
EMAIL_BACKEND = "bugsink.email_backends.QuietConsoleEmailBackend"\n\
@@ -57,6 +57,8 @@ services:
      - '8000:8000' # Bugsink error tracking HTTP (ADR-015)
      - '8443:8443' # Bugsink error tracking HTTPS (ADR-015)
    environment:
      # Timezone: PST (America/Los_Angeles) for consistent log timestamps
      - TZ=America/Los_Angeles
      # Core settings
      - NODE_ENV=development
      # Database - use service name for Docker networking
@@ -122,6 +124,10 @@ services:
    ports:
      - '5432:5432'
    environment:
      # Timezone: PST (America/Los_Angeles) for consistent log timestamps
      TZ: America/Los_Angeles
      # PostgreSQL timezone setting (used by log_timezone and timezone parameters)
      PGTZ: America/Los_Angeles
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: postgres
      POSTGRES_DB: flyer_crawler_dev
@@ -142,6 +148,8 @@ services:
      postgres
      -c config_file=/var/lib/postgresql/data/postgresql.conf
      -c hba_file=/var/lib/postgresql/data/pg_hba.conf
      -c timezone=America/Los_Angeles
      -c log_timezone=America/Los_Angeles
      -c log_min_messages=notice
      -c client_min_messages=notice
      -c logging_collector=on
@@ -175,6 +183,9 @@ services:
    user: root
    ports:
      - '6379:6379'
    environment:
      # Timezone: PST (America/Los_Angeles) for consistent log timestamps
      TZ: America/Los_Angeles
    volumes:
      - redis_data:/data
      # Create log volume for Logstash access (ADR-050)
@@ -12,9 +12,18 @@
# - NGINX logs (/var/log/nginx/*.log) - Access and error logs
# - Redis logs (/var/log/redis/*.log) - Via shared volume (ADR-050)
#
# Bugsink Projects:
# - Project 1: Backend API (Dev) - Pino errors, PostgreSQL errors
# Bugsink Projects (3-project architecture):
# - Project 1: Backend API (Dev) - Pino/PM2 app errors, PostgreSQL errors
#   DSN Key: cea01396c56246adb5878fa5ee6b1d22
# - Project 2: Frontend (Dev) - Configured via Sentry SDK in browser
#   DSN Key: d92663cb73cf4145b677b84029e4b762
# - Project 4: Infrastructure (Dev) - Redis, NGINX, PM2 operational logs
#   DSN Key: 14e8791da3d347fa98073261b596cab9
#
# Routing Logic:
# - Backend logs (type: pm2_api, pm2_worker, pino, postgres) -> Project 1
# - Infrastructure logs (type: redis, nginx_error, nginx_5xx) -> Project 4
# - Vite errors (type: pm2_vite with errors) -> Project 4 (build tooling)
#
# Related Documentation:
# - docs/adr/0050-postgresql-function-observability.md
@@ -112,7 +121,8 @@ input {
  # ============================================================================
  # Captures PostgreSQL log output including fn_log() structured JSON messages.
  # PostgreSQL is configured to write logs to /var/log/postgresql/ (shared volume).
  # Log format: "2026-01-22 00:00:00 UTC [5724] postgres@flyer_crawler_dev LOG: message"
  # Log format: "2026-01-22 14:30:00 PST [5724] postgres@flyer_crawler_dev LOG: message"
  # Note: Timestamps are in PST (America/Los_Angeles) timezone as configured in compose.dev.yml
  file {
    path => "/var/log/postgresql/*.log"
    type => "postgres"
@@ -217,10 +227,11 @@ filter {
  # PostgreSQL Log Processing (ADR-050)
  # ============================================================================
  # PostgreSQL log format in dev container:
  # "2026-01-22 00:00:00 UTC [5724] postgres@flyer_crawler_dev LOG: message"
  # "2026-01-22 07:06:03 UTC [19851] postgres@flyer_crawler_dev ERROR: column "id" does not exist"
  # "2026-01-22 14:30:00 PST [5724] postgres@flyer_crawler_dev LOG: message"
  # "2026-01-22 15:06:03 PST [19851] postgres@flyer_crawler_dev ERROR: column "id" does not exist"
  # Note: Timestamps are in PST (America/Los_Angeles) timezone
  if [type] == "postgres" {
    # Parse PostgreSQL log prefix with UTC timezone
    # Parse PostgreSQL log prefix with timezone (PST in dev, may vary in prod)
    grok {
      match => { "message" => "%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{TIME} %{WORD:pg_timezone} \[%{POSINT:pg_pid}\] %{DATA:pg_user}@%{DATA:pg_database} %{WORD:pg_level}: ?%{GREEDYDATA:pg_message}" }
      tag_on_failure => ["_postgres_grok_failure"]
@@ -344,26 +355,56 @@ filter {
  }

  # ============================================================================
  # Generate Sentry Event ID for all errors
  # Generate Sentry Event ID and Ensure Required Fields for all errors
  # ============================================================================
  # CRITICAL: sentry_level MUST be set for all errors before output.
  # Bugsink's PostgreSQL schema limits level to varchar(7), so valid values are:
  # fatal, error, warning, info, debug (all <= 7 chars)
  # If sentry_level is not set, the literal "%{sentry_level}" (16 chars) is sent,
  # causing PostgreSQL insertion failures.
  # ============================================================================
  if "error" in [tags] {
    # Use Ruby for robust field handling - handles all edge cases
    ruby {
      code => '
        require "securerandom"
        event.set("sentry_event_id", SecureRandom.hex(16))
      '
    }

    # Ensure error_message has a fallback value
    if ![error_message] {
      mutate { add_field => { "error_message" => "%{message}" } }
        # Generate unique event ID for Sentry
        event.set("sentry_event_id", SecureRandom.hex(16))

        # =====================================================================
        # CRITICAL: Validate and set sentry_level
        # =====================================================================
        # Valid Sentry levels (max 7 chars for Bugsink PostgreSQL schema):
        # fatal, error, warning, info, debug
        # Default to "error" if missing, empty, or invalid.
        # =====================================================================
        valid_levels = ["fatal", "error", "warning", "info", "debug"]
        current_level = event.get("sentry_level")

        if current_level.nil? || current_level.to_s.strip.empty? || !valid_levels.include?(current_level.to_s.downcase)
          event.set("sentry_level", "error")
        else
          # Normalize to lowercase
          event.set("sentry_level", current_level.to_s.downcase)
        end

        # =====================================================================
        # Ensure error_message has a fallback value
        # =====================================================================
        error_msg = event.get("error_message")
        if error_msg.nil? || error_msg.to_s.strip.empty?
          fallback_msg = event.get("message") || event.get("msg") || "Unknown error"
          event.set("error_message", fallback_msg.to_s)
        end
      '
    }
  }
}

output {
  # ============================================================================
  # Forward Errors to Bugsink (Backend API Project)
  # Forward Errors to Bugsink (Project Routing)
  # ============================================================================
  # Bugsink uses Sentry-compatible API. Events must include:
  # - event_id: 32 hex characters (UUID without dashes)
@@ -373,9 +414,50 @@ output {
  # - platform: "node" for backend, "javascript" for frontend
  #
  # Authentication via X-Sentry-Auth header with project's public key.
  # Dev container DSN: http://cea01396c56246adb5878fa5ee6b1d22@localhost:8000/1
  #
  # Project Routing:
  # - Project 1 (Backend): Pino app logs, PostgreSQL errors
  # - Project 4 (Infrastructure): Redis, NGINX, Vite build errors
  # ============================================================================
  if "error" in [tags] {

  # ============================================================================
  # Infrastructure Errors -> Project 4
  # ============================================================================
  # Redis warnings/errors, NGINX errors, and Vite build errors go to
  # the Infrastructure project for separation from application code errors.
  if "error" in [tags] and ([type] == "redis" or [type] == "nginx_error" or [type] == "nginx_access" or [type] == "pm2_vite") {
    http {
      url => "http://localhost:8000/api/4/store/"
      http_method => "post"
      format => "json"
      headers => {
        "X-Sentry-Auth" => "Sentry sentry_key=14e8791da3d347fa98073261b596cab9, sentry_version=7"
        "Content-Type" => "application/json"
      }
      mapping => {
        "event_id" => "%{sentry_event_id}"
        "timestamp" => "%{@timestamp}"
        "level" => "%{sentry_level}"
        "platform" => "other"
        "logger" => "%{type}"
        "message" => "%{error_message}"
        "extra" => {
          "hostname" => "%{[host][name]}"
          "source_type" => "%{type}"
          "tags" => "%{tags}"
          "original_message" => "%{message}"
          "project" => "infrastructure"
        }
      }
    }
  }

  # ============================================================================
  # Backend Application Errors -> Project 1
  # ============================================================================
  # Pino application logs (API, Worker), PostgreSQL function errors, and
  # native PostgreSQL errors go to the Backend API project.
  else if "error" in [tags] and ([type] in ["pm2_api", "pm2_worker", "pino", "postgres"]) {
    http {
      url => "http://localhost:8000/api/1/store/"
      http_method => "post"
@@ -384,7 +466,6 @@ output {
        "X-Sentry-Auth" => "Sentry sentry_key=cea01396c56246adb5878fa5ee6b1d22, sentry_version=7"
        "Content-Type" => "application/json"
      }
      # Transform event to Sentry format using regular fields (not @metadata)
      mapping => {
        "event_id" => "%{sentry_event_id}"
        "timestamp" => "%{@timestamp}"
@@ -397,6 +478,38 @@ output {
          "source_type" => "%{type}"
          "tags" => "%{tags}"
          "original_message" => "%{message}"
          "project" => "backend"
        }
      }
    }
  }

  # ============================================================================
  # Fallback: Any other errors -> Project 1
  # ============================================================================
  # Catch-all for any errors that don't match specific routing rules.
  else if "error" in [tags] {
    http {
      url => "http://localhost:8000/api/1/store/"
      http_method => "post"
      format => "json"
      headers => {
        "X-Sentry-Auth" => "Sentry sentry_key=cea01396c56246adb5878fa5ee6b1d22, sentry_version=7"
        "Content-Type" => "application/json"
      }
      mapping => {
        "event_id" => "%{sentry_event_id}"
        "timestamp" => "%{@timestamp}"
        "level" => "%{sentry_level}"
        "platform" => "node"
        "logger" => "%{type}"
        "message" => "%{error_message}"
        "extra" => {
          "hostname" => "%{[host][name]}"
          "source_type" => "%{type}"
          "tags" => "%{tags}"
          "original_message" => "%{message}"
          "project" => "backend-fallback"
        }
      }
    }
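As a sanity check of the event shape described in the comments above, a sketch that posts one minimal hand-built event to the same store endpoint and auth header used by the Infrastructure output. It assumes Node 18+ `fetch` from inside the dev container; the message text is arbitrary.

```typescript
// Sketch: send a minimal Sentry-compatible event to Bugsink Project 4, mirroring the Logstash mapping above.
import { randomBytes } from 'node:crypto';

const response = await fetch('http://localhost:8000/api/4/store/', {
  method: 'POST',
  headers: {
    'X-Sentry-Auth': 'Sentry sentry_key=14e8791da3d347fa98073261b596cab9, sentry_version=7',
    'Content-Type': 'application/json',
  },
  body: JSON.stringify({
    event_id: randomBytes(16).toString('hex'), // 32 hex characters (UUID without dashes)
    timestamp: new Date().toISOString(),
    level: 'error',
    platform: 'other',
    logger: 'manual-test',
    message: 'Logstash routing smoke test',
  }),
});
console.log('Bugsink responded with', response.status);
```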
@@ -60,6 +60,37 @@ server {
        proxy_set_header X-Forwarded-Proto $scheme;
    }

    # ============================================================================
    # Bugsink Sentry API Proxy (for frontend error reporting)
    # ============================================================================
    # The frontend Sentry SDK cannot reach localhost:8000 directly from the browser
    # because port 8000 is only accessible within the container network.
    # This proxy allows the browser to send errors to https://localhost/bugsink-api/
    # which NGINX forwards to the Bugsink container on port 8000.
    #
    # Frontend DSN format: https://localhost/bugsink-api/<project_id>
    # Example: https://localhost/bugsink-api/2 for Frontend (Dev) project
    #
    # The Sentry SDK sends POST requests to /bugsink-api/<project>/store/
    # This proxy strips /bugsink-api and forwards to http://localhost:8000/api/
    # ============================================================================
    location /bugsink-api/ {
        proxy_pass http://localhost:8000/api/;
        proxy_http_version 1.1;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        # Allow large error payloads with stack traces
        client_max_body_size 10M;

        # Timeouts for error reporting (should be fast)
        proxy_connect_timeout 10s;
        proxy_send_timeout 30s;
        proxy_read_timeout 30s;
    }

    # Proxy WebSocket connections for real-time notifications
    location /ws {
        proxy_pass http://localhost:3001;
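On the browser side, a sketch of how the frontend SDK might point at this proxy. The `@sentry/react` package is an assumption; `import.meta.env.VITE_SENTRY_DSN` is the Vite-prefixed variable from `.env.example`, which resolves to `https://localhost/bugsink-api/2` so every SDK request goes through the NGINX location block above.

```typescript
// Sketch: frontend Sentry init via the /bugsink-api/ proxy (illustrative only).
import * as Sentry from '@sentry/react';

Sentry.init({
  dsn: import.meta.env.VITE_SENTRY_DSN,
  environment: import.meta.env.VITE_SENTRY_ENVIRONMENT ?? 'development',
});
```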
@@ -2,6 +2,10 @@
# This file is mounted into the PostgreSQL container to enable structured logging
# from database functions via fn_log()

# Timezone: PST (America/Los_Angeles) for consistent log timestamps
timezone = 'America/Los_Angeles'
log_timezone = 'America/Los_Angeles'

# Enable logging to files for Logstash pickup
logging_collector = on
log_destination = 'stderr'
@@ -28,9 +28,28 @@ The `.env.local` file uses `localhost` while `compose.dev.yml` uses `127.0.0.1`.

## HTTPS Setup

- Self-signed certificates auto-generated with mkcert on container startup
- CSRF Protection: Django configured with `SECURE_PROXY_SSL_HEADER` to trust `X-Forwarded-Proto` from nginx
- CSRF Protection: Django configured with `CSRF_TRUSTED_ORIGINS` for both `localhost` and `127.0.0.1` (see below)
- HTTPS proxy: nginx on port 8443 proxies to Bugsink on port 8000
- HTTPS is for UI access only - Sentry SDK uses HTTP directly

### CSRF Configuration

Django 4.0+ requires `CSRF_TRUSTED_ORIGINS` for HTTPS POST requests. The Bugsink configuration (`Dockerfile.dev`) includes:

```python
CSRF_TRUSTED_ORIGINS = [
    "https://localhost:8443",
    "https://127.0.0.1:8443",
    "http://localhost:8000",
    "http://127.0.0.1:8000",
]
SECURE_PROXY_SSL_HEADER = ("HTTP_X_FORWARDED_PROTO", "https")
```

**Both hostnames are required** because browsers treat `localhost` and `127.0.0.1` as different origins.

If you get "CSRF verification failed" errors, see [BUGSINK-SETUP.md](tools/BUGSINK-SETUP.md#csrf-verification-failed) for troubleshooting.

## Isolation Benefits

- Dev errors stay local, don't pollute production/test dashboards
@@ -2,7 +2,7 @@

**Date**: 2026-01-11

**Status**: Proposed
**Status**: Accepted (Fully Implemented)

**Related**: [ADR-015](0015-application-performance-monitoring-and-error-tracking.md), [ADR-004](0004-standardized-application-wide-structured-logging.md)
@@ -2,7 +2,9 @@

**Date**: 2026-01-11

**Status**: Proposed
**Status**: Accepted (Fully Implemented)

**Related**: [ADR-004](0004-standardized-application-wide-structured-logging.md)

## Context
@@ -17,7 +19,9 @@ We will adopt a namespace-based debug filter pattern, similar to the `debug` npm

## Implementation

In `src/services/logger.server.ts`:
### Core Implementation (Completed 2026-01-11)

Implemented in [src/services/logger.server.ts:140-150](src/services/logger.server.ts#L140-L150):

```typescript
const debugModules = (process.env.DEBUG_MODULES || '').split(',').map((s) => s.trim());
@@ -33,10 +37,100 @@ export const createScopedLogger = (moduleName: string) => {
};
```

### Adopted Services (Completed 2026-01-26)

Services currently using `createScopedLogger`:

- `ai-service` - AI/Gemini integration ([src/services/aiService.server.ts:1020](src/services/aiService.server.ts#L1020))
- `flyer-processing-service` - Flyer upload and processing ([src/services/flyerProcessingService.server.ts:20](src/services/flyerProcessingService.server.ts#L20))

## Usage

To debug only AI and Database interactions:
### Enable Debug Logging for Specific Modules

To debug only AI and flyer processing:

```bash
DEBUG_MODULES=ai-service,db-repo npm run dev
DEBUG_MODULES=ai-service,flyer-processing-service npm run dev
```

### Enable All Debug Logging

Use wildcard to enable debug logging for all modules:

```bash
DEBUG_MODULES=* npm run dev
```

### Common Module Names

| Module Name                | Purpose                                  | File                                            |
| -------------------------- | ---------------------------------------- | ----------------------------------------------- |
| `ai-service`               | AI/Gemini API interactions               | `src/services/aiService.server.ts`              |
| `flyer-processing-service` | Flyer upload, validation, and processing | `src/services/flyerProcessingService.server.ts` |
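A minimal sketch of the pattern, assuming Pino as the base logger and the `DEBUG_MODULES` parsing shown in the Core Implementation above; the real code lives in `src/services/logger.server.ts:140-150`, and the child-logger options here are illustrative.

```typescript
import pino from 'pino';

// Sketch only - mirrors the DEBUG_MODULES parsing from the Core Implementation section.
const baseLogger = pino();
const debugModules = (process.env.DEBUG_MODULES || '').split(',').map((s) => s.trim());

export const createScopedLogger = (moduleName: string) => {
  // A module logs at debug level when listed explicitly or when the "*" wildcard is set.
  const debugEnabled = debugModules.includes('*') || debugModules.includes(moduleName);
  return baseLogger.child({ module: moduleName }, { level: debugEnabled ? 'debug' : 'info' });
};
```

Callers can then create per-job child loggers on top of the scoped logger, as the Best Practices section below shows.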
## Best Practices

1. **Use Scoped Loggers for Long-Running Services**: Services with complex workflows or external API calls should use `createScopedLogger` to allow targeted debugging.

2. **Use Child Loggers for Contextual Data**: Even within scoped loggers, create child loggers with job/request-specific context:

   ```typescript
   const logger = createScopedLogger('my-service');

   async function processJob(job: Job) {
     const jobLogger = logger.child({ jobId: job.id, jobName: job.name });
     jobLogger.debug('Starting job processing');
   }
   ```

3. **Module Naming Convention**: Use kebab-case suffixed with `-service` or `-worker` (e.g., `ai-service`, `email-worker`).

4. **Production Usage**: `DEBUG_MODULES` can be set in production for temporary debugging, but should not be used continuously due to increased log volume.

## Examples

### Development Debugging

Debug AI service issues during development:

```bash
# Dev container
DEBUG_MODULES=ai-service npm run dev

# Or via PM2
DEBUG_MODULES=ai-service pm2 restart flyer-crawler-api-dev
```

### Production Troubleshooting

Temporarily enable debug logging for a specific subsystem:

```bash
# SSH into production server
ssh root@projectium.com

# Set environment variable and restart
DEBUG_MODULES=ai-service pm2 restart flyer-crawler-api

# View logs
pm2 logs flyer-crawler-api --lines 100

# Disable debug logging
pm2 unset DEBUG_MODULES flyer-crawler-api
pm2 restart flyer-crawler-api
```

## Consequences

**Positive**:

- Developers can inspect detailed logs for specific subsystems without log flooding
- Production debugging becomes more targeted and efficient
- No performance impact when debug logging is disabled
- Compatible with existing Pino logging infrastructure

**Negative**:

- Requires developers to know module names (mitigated by documentation above)
- Not all services have adopted scoped loggers yet (gradual migration)
@@ -2,7 +2,14 @@

**Date**: 2026-01-11

**Status**: Proposed
**Status**: Accepted (Fully Implemented)

**Implementation Status**:

- ✅ BullMQ worker stall configuration (complete)
- ✅ Basic health endpoints (/live, /ready, /redis, etc.)
- ✅ /health/queues endpoint (complete)
- ✅ Worker heartbeat mechanism (complete)

## Context
@@ -60,3 +67,76 @@ The `/health/queues` endpoint will:

**Negative**:

- Requires configuring external monitoring to poll the new endpoint.

## Implementation Notes

### Completed (2026-01-11)

1. **BullMQ Stall Configuration** - `src/config/workerOptions.ts`
   - All workers use `defaultWorkerOptions` with:
     - `stalledInterval: 30000` (30s)
     - `maxStalledCount: 3`
     - `lockDuration: 30000` (30s)
   - Applied to all 9 workers: flyer, email, analytics, cleanup, weekly-analytics, token-cleanup, receipt, expiry-alert, barcode

2. **Basic Health Endpoints** - `src/routes/health.routes.ts`
   - `/health/live` - Liveness probe
   - `/health/ready` - Readiness probe (checks DB, Redis, storage)
   - `/health/startup` - Startup probe
   - `/health/redis` - Redis connectivity
   - `/health/db-pool` - Database connection pool status

### Implementation Completed (2026-01-26)

1. **`/health/queues` Endpoint** ✅ (a combined sketch of this endpoint and the worker heartbeat follows this list)
   - Added route to `src/routes/health.routes.ts:511-674`
   - Iterates through all 9 queues from `src/services/queues.server.ts`
   - Fetches job counts using BullMQ Queue API: `getJobCounts()`
   - Returns structured response including both queue metrics and worker heartbeats:

     ```typescript
     {
       status: 'healthy' | 'unhealthy',
       timestamp: string,
       queues: {
         [queueName]: {
           waiting: number,
           active: number,
           failed: number,
           delayed: number
         }
       },
       workers: {
         [workerName]: {
           alive: boolean,
           lastSeen?: string,
           pid?: number,
           host?: string
         }
       }
     }
     ```

   - Returns 200 OK if all healthy, 503 if any queue/worker unavailable
   - Full OpenAPI documentation included
2. **Worker Heartbeat Mechanism** ✅
   - Added `updateWorkerHeartbeat()` and `startWorkerHeartbeat()` in `src/services/workers.server.ts:100-149`
   - Key pattern: `worker:heartbeat:<worker-name>`
   - Stores: `{ timestamp: ISO8601, pid: number, host: string }`
   - Updates every 30s with 90s TTL
   - Integrated with `/health/queues` endpoint (checks if heartbeat < 60s old)
   - Heartbeat intervals properly cleaned up in `closeWorkers()` and `gracefulShutdown()`
3. **Comprehensive Tests** ✅
   - Added 5 test cases in `src/routes/health.routes.test.ts:623-858`
   - Tests cover: healthy state, queue failures, stale heartbeats, missing heartbeats, Redis errors
   - All tests follow existing patterns with proper mocking
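A condensed sketch of the two mechanisms above (heartbeat writer plus `/health/queues` reader). The key pattern, 30s/90s intervals, and 60-second staleness check come from the notes above; the Redis client, queue map, and Express wiring are illustrative assumptions - the real code lives in `src/services/workers.server.ts:100-149` and `src/routes/health.routes.ts:511-674`.

```typescript
import os from 'node:os';
import { Queue } from 'bullmq';
import Redis from 'ioredis';
import { Router } from 'express';

const redis = new Redis(process.env.REDIS_URL ?? 'redis://localhost:6379');
const queues: Record<string, Queue> = {}; // the 9 queues from queues.server.ts would be registered here

// Heartbeat writer: refresh worker:heartbeat:<name> every 30s with a 90s TTL.
export function startWorkerHeartbeat(workerName: string) {
  const beat = () =>
    redis.set(
      `worker:heartbeat:${workerName}`,
      JSON.stringify({ timestamp: new Date().toISOString(), pid: process.pid, host: os.hostname() }),
      'EX',
      90,
    );
  void beat();
  return setInterval(beat, 30_000); // cleared in closeWorkers() / gracefulShutdown()
}

// /health/queues reader: BullMQ job counts per queue plus heartbeat freshness per worker.
export const healthQueuesRouter = Router().get('/health/queues', async (_req, res) => {
  const queueCounts: Record<string, unknown> = {};
  const workers: Record<string, { alive: boolean; lastSeen?: string }> = {};

  for (const [name, queue] of Object.entries(queues)) {
    queueCounts[name] = await queue.getJobCounts('waiting', 'active', 'failed', 'delayed');

    const raw = await redis.get(`worker:heartbeat:${name}`);
    const heartbeat = raw ? (JSON.parse(raw) as { timestamp: string }) : null;
    const fresh = heartbeat !== null && Date.now() - Date.parse(heartbeat.timestamp) < 60_000;
    workers[name] = { alive: fresh, lastSeen: heartbeat?.timestamp };
  }

  const healthy = Object.values(workers).every((w) => w.alive);
  res
    .status(healthy ? 200 : 503)
    .json({ status: healthy ? 'healthy' : 'unhealthy', timestamp: new Date().toISOString(), queues: queueCounts, workers });
});
```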
### Future Enhancements (Not Implemented)

1. **Queue Depth Alerting** (Low Priority)
   - Add configurable thresholds per queue type
   - Return 500 if `waiting` count exceeds threshold for extended period
   - Consider using Redis for storing threshold breach timestamps
   - **Estimate**: 1-2 hours
@@ -1,4 +1,4 @@
# ADR-023: Database Normalization and Referential Integrity
# ADR-055: Database Normalization and Referential Integrity

**Date:** 2026-01-19
**Status:** Accepted
@@ -15,8 +15,8 @@ This document tracks the implementation status and estimated effort for all Arch

| Status                       | Count |
| ---------------------------- | ----- |
| Accepted (Fully Implemented) | 30    |
| Partially Implemented        | 2     |
| Accepted (Fully Implemented) | 36    |
| Partially Implemented        | 3     |
| Proposed (Not Started)       | 16    |

---
@@ -62,25 +62,30 @@
| [ADR-029](./0029-secret-rotation-and-key-management.md) | Secret Rotation | Proposed | L | Infrastructure changes needed |
| [ADR-032](./0032-rate-limiting-strategy.md) | Rate Limiting | Accepted | - | Fully implemented |
| [ADR-033](./0033-file-upload-and-storage-strategy.md) | File Upload & Storage | Accepted | - | Fully implemented |
| [ADR-048](./0048-authentication-strategy.md) | Authentication | Partial | M | JWT done, OAuth pending |

### Category 5: Observability & Monitoring

| ADR | Title | Status | Effort | Notes |
| --- | --- | --- | --- | --- |
| [ADR-004](./0004-standardized-application-wide-structured-logging.md) | Structured Logging | Accepted | - | Fully implemented |
| [ADR-015](./0015-application-performance-monitoring-and-error-tracking.md) | APM & Error Tracking | Proposed | M | Third-party integration |
| [ADR-050](./0050-postgresql-function-observability.md) | PostgreSQL Fn Observability | Proposed | M | Depends on ADR-015 implementation |
| ADR | Title | Status | Effort | Notes |
| --- | --- | --- | --- | --- |
| [ADR-004](./0004-standardized-application-wide-structured-logging.md) | Structured Logging | Accepted | - | Fully implemented |
| [ADR-015](./0015-application-performance-monitoring-and-error-tracking.md) | APM & Error Tracking | Proposed | M | Third-party integration |
| [ADR-050](./0050-postgresql-function-observability.md) | PostgreSQL Fn Observability | Accepted | - | Fully implemented |
| [ADR-051](./0051-asynchronous-context-propagation.md) | Context Propagation | Accepted | - | Fully implemented |
| [ADR-052](./0052-granular-debug-logging-strategy.md) | Granular Debug Logging | Accepted | - | Fully implemented |

### Category 6: Deployment & Operations

| ADR | Title | Status | Effort | Notes |
| --- | --- | --- | --- | --- |
| [ADR-006](./0006-background-job-processing-and-task-queues.md) | Background Jobs | Accepted | - | Fully implemented |
| [ADR-014](./0014-containerization-and-deployment-strategy.md) | Containerization | Partial | M | Docker done, K8s pending |
| [ADR-017](./0017-ci-cd-and-branching-strategy.md) | CI/CD & Branching | Accepted | - | Fully implemented |
| [ADR-024](./0024-feature-flagging-strategy.md) | Feature Flags | Proposed | M | New service/library needed |
| [ADR-037](./0037-scheduled-jobs-and-cron-pattern.md) | Scheduled Jobs | Accepted | - | Fully implemented |
| [ADR-038](./0038-graceful-shutdown-pattern.md) | Graceful Shutdown | Accepted | - | Fully implemented |
| ADR | Title | Status | Effort | Notes |
| --- | --- | --- | --- | --- |
| [ADR-006](./0006-background-job-processing-and-task-queues.md) | Background Jobs | Accepted | - | Fully implemented |
| [ADR-014](./0014-containerization-and-deployment-strategy.md) | Containerization | Partial | M | Docker done, K8s pending |
| [ADR-017](./0017-ci-cd-and-branching-strategy.md) | CI/CD & Branching | Accepted | - | Fully implemented |
| [ADR-024](./0024-feature-flagging-strategy.md) | Feature Flags | Proposed | M | New service/library needed |
| [ADR-037](./0037-scheduled-jobs-and-cron-pattern.md) | Scheduled Jobs | Accepted | - | Fully implemented |
| [ADR-038](./0038-graceful-shutdown-pattern.md) | Graceful Shutdown | Accepted | - | Fully implemented |
| [ADR-053](./0053-worker-health-checks.md) | Worker Health | Accepted | - | Fully implemented |
| [ADR-054](./0054-bugsink-gitea-issue-sync.md) | Bugsink-Gitea Sync | Proposed | L | Automated issue creation |

### Category 7: Frontend / User Interface

@@ -99,22 +104,24 @@
| [ADR-010](./0010-testing-strategy-and-standards.md) | Testing Strategy | Accepted | - | Fully implemented |
| [ADR-021](./0021-code-formatting-and-linting-unification.md) | Formatting & Linting | Accepted | - | Fully implemented |
| [ADR-027](./0027-standardized-naming-convention-for-ai-and-database-types.md) | Naming Conventions | Accepted | - | Fully implemented |
| [ADR-040](./0040-testing-economics-and-priorities.md) | Testing Economics | Accepted | - | Fully implemented |
| [ADR-045](./0045-test-data-factories-and-fixtures.md) | Test Data Factories | Accepted | - | Fully implemented |
| [ADR-047](./0047-project-file-and-folder-organization.md) | Project Organization | Proposed | XL | Major reorganization |

### Category 9: Architecture Patterns

| ADR | Title | Status | Effort | Notes |
| --- | --- | --- | --- | --- |
| [ADR-034](./0034-repository-pattern-standards.md) | Repository Pattern | Accepted | - | Fully implemented |
| [ADR-035](./0035-service-layer-architecture.md) | Service Layer | Accepted | - | Fully implemented |
| [ADR-036](./0036-event-bus-and-pub-sub-pattern.md) | Event Bus | Accepted | - | Fully implemented |
| [ADR-039](./0039-dependency-injection-pattern.md) | Dependency Injection | Accepted | - | Fully implemented |
| [ADR-041](./0041-ai-gemini-integration-architecture.md) | AI/Gemini Integration | Accepted | - | Fully implemented |
| [ADR-042](./0042-email-and-notification-architecture.md) | Email & Notifications | Accepted | - | Fully implemented |
| [ADR-043](./0043-express-middleware-pipeline.md) | Middleware Pipeline | Accepted | - | Fully implemented |
| [ADR-046](./0046-image-processing-pipeline.md) | Image Processing | Accepted | - | Fully implemented |
| [ADR-049](./0049-gamification-and-achievement-system.md) | Gamification System | Accepted | - | Fully implemented |
| ADR | Title | Status | Effort | Notes |
| --- | --- | --- | --- | --- |
| [ADR-034](./0034-repository-pattern-standards.md) | Repository Pattern | Accepted | - | Fully implemented |
| [ADR-035](./0035-service-layer-architecture.md) | Service Layer | Accepted | - | Fully implemented |
| [ADR-036](./0036-event-bus-and-pub-sub-pattern.md) | Event Bus | Accepted | - | Fully implemented |
| [ADR-039](./0039-dependency-injection-pattern.md) | Dependency Injection | Accepted | - | Fully implemented |
| [ADR-041](./0041-ai-gemini-integration-architecture.md) | AI/Gemini Integration | Accepted | - | Fully implemented |
| [ADR-042](./0042-email-and-notification-architecture.md) | Email & Notifications | Accepted | - | Fully implemented |
| [ADR-043](./0043-express-middleware-pipeline.md) | Middleware Pipeline | Accepted | - | Fully implemented |
| [ADR-046](./0046-image-processing-pipeline.md) | Image Processing | Accepted | - | Fully implemented |
| [ADR-049](./0049-gamification-and-achievement-system.md) | Gamification System | Accepted | - | Fully implemented |
| [ADR-055](./0055-database-normalization-and-referential-integrity.md) | DB Normalization | Accepted | M | API uses IDs, not strings |

---
@@ -122,38 +129,49 @@

These ADRs are proposed but not yet implemented, ordered by suggested implementation priority:

| Priority | ADR | Title | Effort | Rationale |
| --- | --- | --- | --- | --- |
| 1 | ADR-015 | APM & Error Tracking | M | Production visibility, debugging |
| 1b | ADR-050 | PostgreSQL Fn Observability | M | Database function visibility (depends on ADR-015) |
| 2 | ADR-024 | Feature Flags | M | Safer deployments, A/B testing |
| 3 | ADR-023 | Schema Migrations v2 | L | Database evolution support |
| 4 | ADR-029 | Secret Rotation | L | Security improvement |
| 5 | ADR-008 | API Versioning | L | Future API evolution |
| 6 | ADR-030 | Circuit Breaker | L | Resilience improvement |
| 7 | ADR-022 | Real-time Notifications | XL | Major feature enhancement |
| 8 | ADR-011 | Authorization & RBAC | XL | Advanced permission system |
| 9 | ADR-025 | i18n & l10n | XL | Multi-language support |
| 10 | ADR-031 | Data Retention & Privacy | XL | Compliance requirements |
| Priority | ADR | Title | Effort | Rationale |
| --- | --- | --- | --- | --- |
| 1 | ADR-015 | APM & Error Tracking | M | Production visibility, debugging |
| 2 | ADR-024 | Feature Flags | M | Safer deployments, A/B testing |
| 3 | ADR-054 | Bugsink-Gitea Sync | L | Automated issue tracking from errors |
| 4 | ADR-023 | Schema Migrations v2 | L | Database evolution support |
| 5 | ADR-029 | Secret Rotation | L | Security improvement |
| 6 | ADR-008 | API Versioning | L | Future API evolution |
| 7 | ADR-030 | Circuit Breaker | L | Resilience improvement |
| 8 | ADR-022 | Real-time Notifications | XL | Major feature enhancement |
| 9 | ADR-011 | Authorization & RBAC | XL | Advanced permission system |
| 10 | ADR-025 | i18n & l10n | XL | Multi-language support |
| 11 | ADR-031 | Data Retention & Privacy | XL | Compliance requirements |

---

## Recent Implementation History

| Date | ADR | Change |
| --- | --- | --- |
| 2026-01-11 | ADR-050 | Created - PostgreSQL function observability with fn_log() and Logstash |
| 2026-01-11 | ADR-018 | Implemented - OpenAPI/Swagger documentation at /docs/api-docs |
| 2026-01-11 | ADR-049 | Created - Gamification system, achievements, and testing requirements |
| 2026-01-09 | ADR-047 | Created - Project file/folder organization with migration plan |
| 2026-01-09 | ADR-041 | Created - AI/Gemini integration with model fallback and rate limiting |
| 2026-01-09 | ADR-042 | Created - Email and notification architecture with BullMQ queuing |
| 2026-01-09 | ADR-043 | Created - Express middleware pipeline ordering and patterns |
| 2026-01-09 | ADR-044 | Created - Frontend feature-based folder organization |
| 2026-01-09 | ADR-045 | Created - Test data factory pattern for mock generation |
| 2026-01-09 | ADR-046 | Created - Image processing pipeline with Sharp and EXIF stripping |
| 2026-01-09 | ADR-026 | Fully implemented - client-side structured logger |
| 2026-01-09 | ADR-028 | Fully implemented - all routes, middleware, and tests updated |
| Date | ADR | Change |
| --- | --- | --- |
| 2026-01-26 | ADR-052 | Marked as fully implemented - createScopedLogger with DEBUG_MODULES support complete |
| 2026-01-26 | ADR-053 | Marked as fully implemented - /health/queues endpoint and worker heartbeats complete |
| 2026-01-26 | ADR-050 | Marked as fully implemented - PostgreSQL function observability complete |
| 2026-01-26 | ADR-055 | Created (renumbered from duplicate ADR-023) - Database normalization |
| 2026-01-26 | ADR-054 | Added to tracker - Bugsink to Gitea issue synchronization |
| 2026-01-26 | ADR-053 | Added to tracker - Worker health checks and monitoring |
| 2026-01-26 | ADR-052 | Added to tracker - Granular debug logging strategy |
| 2026-01-26 | ADR-051 | Added to tracker - Asynchronous context propagation |
| 2026-01-26 | ADR-048 | Added to tracker - Authentication strategy |
| 2026-01-26 | ADR-040 | Added to tracker - Testing economics and priorities |
| 2026-01-17 | ADR-054 | Created - Bugsink-Gitea sync worker proposal |
| 2026-01-11 | ADR-050 | Created - PostgreSQL function observability with fn_log() and Logstash |
| 2026-01-11 | ADR-018 | Implemented - OpenAPI/Swagger documentation at /docs/api-docs |
| 2026-01-11 | ADR-049 | Created - Gamification system, achievements, and testing requirements |
| 2026-01-09 | ADR-047 | Created - Project file/folder organization with migration plan |
| 2026-01-09 | ADR-041 | Created - AI/Gemini integration with model fallback and rate limiting |
| 2026-01-09 | ADR-042 | Created - Email and notification architecture with BullMQ queuing |
| 2026-01-09 | ADR-043 | Created - Express middleware pipeline ordering and patterns |
| 2026-01-09 | ADR-044 | Created - Frontend feature-based folder organization |
| 2026-01-09 | ADR-045 | Created - Test data factory pattern for mock generation |
| 2026-01-09 | ADR-046 | Created - Image processing pipeline with Sharp and EXIF stripping |
| 2026-01-09 | ADR-026 | Fully implemented - client-side structured logger |
| 2026-01-09 | ADR-028 | Fully implemented - all routes, middleware, and tests updated |

---
@@ -21,7 +21,7 @@ This directory contains a log of the architectural decisions made for the Flyer

**[ADR-003](./0003-standardized-input-validation-using-middleware.md)**: Standardized Input Validation using Middleware (Accepted)
**[ADR-008](./0008-api-versioning-strategy.md)**: API Versioning Strategy (Proposed)
**[ADR-018](./0018-api-documentation-strategy.md)**: API Documentation Strategy (Proposed)
**[ADR-018](./0018-api-documentation-strategy.md)**: API Documentation Strategy (Accepted)
**[ADR-022](./0022-real-time-notification-system.md)**: Real-time Notification System (Proposed)
**[ADR-028](./0028-api-response-standardization.md)**: API Response Standardization and Envelope Pattern (Implemented)
@@ -39,6 +39,9 @@ This directory contains a log of the architectural decisions made for the Flyer

**[ADR-004](./0004-standardized-application-wide-structured-logging.md)**: Standardized Application-Wide Structured Logging (Accepted)
**[ADR-015](./0015-application-performance-monitoring-and-error-tracking.md)**: Application Performance Monitoring (APM) and Error Tracking (Proposed)
**[ADR-050](./0050-postgresql-function-observability.md)**: PostgreSQL Function Observability (Proposed)
**[ADR-051](./0051-asynchronous-context-propagation.md)**: Asynchronous Context Propagation (Accepted)
**[ADR-052](./0052-granular-debug-logging-strategy.md)**: Granular Debug Logging Strategy (Proposed)

## 6. Deployment & Operations
@@ -48,13 +51,15 @@ This directory contains a log of the architectural decisions made for the Flyer
**[ADR-024](./0024-feature-flagging-strategy.md)**: Feature Flagging Strategy (Proposed)
**[ADR-037](./0037-scheduled-jobs-and-cron-pattern.md)**: Scheduled Jobs and Cron Pattern (Accepted)
**[ADR-038](./0038-graceful-shutdown-pattern.md)**: Graceful Shutdown Pattern (Accepted)
**[ADR-053](./0053-worker-health-checks-and-monitoring.md)**: Worker Health Checks and Monitoring (Proposed)
**[ADR-054](./0054-bugsink-gitea-issue-sync.md)**: Bugsink to Gitea Issue Synchronization (Proposed)

## 7. Frontend / User Interface

**[ADR-005](./0005-frontend-state-management-and-server-cache-strategy.md)**: Frontend State Management and Server Cache Strategy (Accepted)
**[ADR-012](./0012-frontend-component-library-and-design-system.md)**: Frontend Component Library and Design System (Partially Implemented)
**[ADR-025](./0025-internationalization-and-localization-strategy.md)**: Internationalization (i18n) and Localization (l10n) Strategy (Proposed)
**[ADR-026](./0026-standardized-client-side-structured-logging.md)**: Standardized Client-Side Structured Logging (Proposed)
**[ADR-026](./0026-standardized-client-side-structured-logging.md)**: Standardized Client-Side Structured Logging (Accepted)
**[ADR-044](./0044-frontend-feature-organization.md)**: Frontend Feature Organization Pattern (Accepted)

## 8. Development Workflow & Quality
@@ -76,3 +81,5 @@ This directory contains a log of the architectural decisions made for the Flyer
**[ADR-042](./0042-email-and-notification-architecture.md)**: Email and Notification Architecture (Accepted)
**[ADR-043](./0043-express-middleware-pipeline.md)**: Express Middleware Pipeline Architecture (Accepted)
**[ADR-046](./0046-image-processing-pipeline.md)**: Image Processing Pipeline (Accepted)
**[ADR-049](./0049-gamification-and-achievement-system.md)**: Gamification and Achievement System (Accepted)
**[ADR-055](./0055-database-normalization-and-referential-integrity.md)**: Database Normalization and Referential Integrity (Accepted)
@@ -175,29 +175,30 @@ npm run dev:pm2:logs

### Log Flow Architecture (ADR-050)

All application logs flow through Logstash to Bugsink:
All application logs flow through Logstash to Bugsink using a 3-project architecture:

```text
+------------------+      +------------------+      +------------------+
|   PM2 Logs       |      |  PostgreSQL      |      |   Redis Logs     |
|   PM2 Logs       |      |  PostgreSQL      |      |  Redis/NGINX     |
| /var/log/pm2/    |      |  /var/log/       |      | /var/log/redis/  |
+--------+---------+      |  postgresql/     |      +--------+---------+
         |                +--------+---------+               |
|  (API + Worker)  |      |  postgresql/     |      | /var/log/nginx/  |
+--------+---------+      +--------+---------+      +--------+---------+
         |                          |                        |
         v                          v                        v
+------------------------------------------------------------------------+
|                              LOGSTASH                                   |
|                  /etc/logstash/conf.d/bugsink.conf                      |
|                       (Routes by log type)                              |
+------------------------------------------------------------------------+
         |                  |                    |
         |        +---------+---------+          |
         |        |                   |          |
         v        v                   v          v
+------------------+      +------------------+      +------------------+
|   Errors ->      |      |  Operational ->  |      |   NGINX Logs ->  |
|   Bugsink API    |      |  /var/log/       |      |   /var/log/      |
|   (Project 1)    |      |  logstash/*.log  |      |   logstash/*.log |
+------------------+      +------------------+      +------------------+
         v                     v                      v
+------------------+      +------------------+      +------------------+
|  Backend API     |      |  Frontend (Dev)  |      | Infrastructure   |
|  (Project 1)     |      |  (Project 2)     |      |  (Project 4)     |
| - Pino errors    |      | - Browser SDK    |      | - Redis warnings |
| - PostgreSQL     |      |   (not Logstash) |      | - NGINX errors   |
+------------------+      +------------------+      | - Vite errors    |
                                                     +------------------+
```

### Log Sources
@@ -231,8 +232,11 @@ podman exec flyer-crawler-dev curl -s localhost:9600/_node/stats/pipelines?prett

- **URL**: `https://localhost:8443`
- **Login**: `admin@localhost` / `admin`
- **Projects**:
  - Project 1: Backend API (errors from Pino, PostgreSQL, Redis)
  - Project 2: Frontend (errors from Sentry SDK in browser)
  - Project 1: Backend API (Dev) - Pino app errors, PostgreSQL errors
  - Project 2: Frontend (Dev) - Browser errors via Sentry SDK
  - Project 4: Infrastructure (Dev) - Redis warnings, NGINX errors, Vite build errors

**Note**: Frontend DSN uses nginx proxy (`/bugsink-api/`) because browsers cannot reach `localhost:8000` directly. See [BUGSINK-SETUP.md](../tools/BUGSINK-SETUP.md#frontend-nginx-proxy) for details.

---
@@ -268,14 +272,41 @@ podman-compose -f compose.dev.yml down

Key environment variables are set in `compose.dev.yml`:

| Variable          | Value                         | Purpose              |
| ----------------- | ----------------------------- | -------------------- |
| `NODE_ENV`        | `development`                 | Environment mode     |
| `DB_HOST`         | `postgres`                    | PostgreSQL hostname  |
| `REDIS_URL`       | `redis://redis:6379`          | Redis connection URL |
| `FRONTEND_URL`    | `https://localhost`           | CORS origin          |
| `SENTRY_DSN`      | `http://...@127.0.0.1:8000/1` | Backend Bugsink DSN  |
| `VITE_SENTRY_DSN` | `http://...@127.0.0.1:8000/2` | Frontend Bugsink DSN |
| Variable          | Value                         | Purpose                     |
| ----------------- | ----------------------------- | --------------------------- |
| `TZ`              | `America/Los_Angeles`         | Timezone (PST) for all logs |
| `NODE_ENV`        | `development`                 | Environment mode            |
| `DB_HOST`         | `postgres`                    | PostgreSQL hostname         |
| `REDIS_URL`       | `redis://redis:6379`          | Redis connection URL        |
| `FRONTEND_URL`    | `https://localhost`           | CORS origin                 |
| `SENTRY_DSN`      | `http://...@127.0.0.1:8000/1` | Backend Bugsink DSN         |
| `VITE_SENTRY_DSN` | `http://...@127.0.0.1:8000/2` | Frontend Bugsink DSN        |

### Timezone Configuration

All dev container services are configured to use PST (America/Los_Angeles) timezone for consistent log timestamps:

| Service    | Configuration                                    | Notes                          |
| ---------- | ------------------------------------------------ | ------------------------------ |
| App        | `TZ=America/Los_Angeles` in compose.dev.yml      | Also set via dev-entrypoint.sh |
| PostgreSQL | `timezone` and `log_timezone` in postgres config | Logs timestamps in PST         |
| Redis      | `TZ=America/Los_Angeles` in compose.dev.yml      | Alpine uses TZ env var         |
| PM2        | `TZ` in ecosystem.dev.config.cjs                 | Pino timestamps use local time |

**Verifying Timezone**:

```bash
# Check container timezone
podman exec flyer-crawler-dev date

# Check PostgreSQL timezone
podman exec flyer-crawler-postgres psql -U postgres -c "SHOW timezone;"

# Check Redis log timestamps
MSYS_NO_PATHCONV=1 podman exec flyer-crawler-redis cat /var/log/redis/redis-server.log | head -5
```

**Note**: If you need UTC timestamps for production compatibility, set `TZ=UTC` in compose.dev.yml and restart containers.

---
@@ -101,6 +101,7 @@ MSYS_NO_PATHCONV=1 podman exec flyer-crawler-dev ls -la /var/log/redis/

| NGINX logs missing | Output directory | `ls -lh /var/log/logstash/nginx-access-*.log` |
| Redis logs missing | Shared volume | Dev: Check `redis_logs` volume mounted; Prod: Check `/var/log/redis/redis-server.log` exists |
| High disk usage | Log rotation | Verify `/etc/logrotate.d/logstash` configured |
| varchar(7) error | Level validation | Add Ruby filter to validate/normalize `sentry_level` before output |

## Related Documentation
@@ -11,6 +11,7 @@ This runbook provides step-by-step diagnostics and solutions for common Logstash

| Wrong Bugsink project | Environment detection failed | Verify `pg_database` field extraction |
| 403 authentication error | Missing/wrong DSN key | Check `X-Sentry-Auth` header |
| 500 error from Bugsink | Invalid event format | Verify `event_id` and required fields |
| varchar(7) constraint | Unresolved `%{sentry_level}` | Add Ruby filter for level validation |

---
@@ -385,7 +386,88 @@ systemctl status logstash

---

### Issue 7: Log File Rotation Issues
### Issue 7: Level Field Constraint Violation (varchar(7))

**Symptoms:**

- Bugsink returns HTTP 500 errors
- PostgreSQL errors: `value too long for type character varying(7)`
- Events fail to insert with literal `%{sentry_level}` string (16 characters)

**Root Cause:**

When Logstash cannot determine the log level (no error patterns matched), the `sentry_level` field remains as the unresolved placeholder `%{sentry_level}`. Bugsink's PostgreSQL schema has a `varchar(7)` constraint on the level field.

Valid Sentry levels (all <= 7 characters): `fatal`, `error`, `warning`, `info`, `debug`

**Diagnosis:**

```bash
# Check for HTTP 500 responses in Logstash logs
podman exec flyer-crawler-dev cat /var/log/logstash/logstash.log | grep "500"

# Check Bugsink for constraint violation errors
# Via MCP:
mcp__localerrors__list_issues({ project_id: 1, status: 'unresolved' })
```

**Solution:**

Add a Ruby filter block in `docker/logstash/bugsink.conf` to validate and normalize the `sentry_level` field before sending to Bugsink:

```ruby
# Add this AFTER all mutate filters that set sentry_level
# and BEFORE the output section

ruby {
  code => '
    level = event.get("sentry_level")
    # Check if level is invalid (nil, empty, contains placeholder, or too long)
    if level.nil? || level.to_s.empty? || level.to_s.include?("%{") || level.to_s.length > 7
      # Default to "error" for error-tagged events, "info" otherwise
      if event.get("tags")&.include?("error")
        event.set("sentry_level", "error")
      else
        event.set("sentry_level", "info")
      end
    else
      # Normalize to lowercase and validate
      normalized = level.to_s.downcase
      valid_levels = ["fatal", "error", "warning", "info", "debug"]
      unless valid_levels.include?(normalized)
        normalized = "error"
      end
      event.set("sentry_level", normalized)
    end
  '
}
```

**Key validations performed:**

1. Checks for nil or empty values
2. Detects unresolved placeholders (`%{...}`)
3. Enforces 7-character maximum length
4. Normalizes to lowercase
5. Validates against allowed Sentry levels
6. Defaults to "error" for error-tagged events, "info" otherwise

**Verification:**

```bash
# Restart Logstash
podman exec flyer-crawler-dev systemctl restart logstash

# Generate a test log that triggers the filter
podman exec flyer-crawler-dev pm2 restart flyer-crawler-api-dev

# Check no new HTTP 500 errors
podman exec flyer-crawler-dev cat /var/log/logstash/logstash.log | tail -50 | grep -E "(500|error)"
```

---

### Issue 8: Log File Rotation Issues

**Symptoms:**
@@ -49,18 +49,24 @@ Bugsink is a lightweight, self-hosted error tracking platform that is fully comp
| Web UI | `https://localhost:8443` (nginx proxy) |
| Internal URL | `http://localhost:8000` (direct) |
| Credentials | `admin@localhost` / `admin` |
| Backend Project | Project ID 1 - `flyer-crawler-dev-backend` |
| Frontend Project | Project ID 2 - `flyer-crawler-dev-frontend` |
| Backend Project | Project ID 1 - `Backend API (Dev)` |
| Frontend Project | Project ID 2 - `Frontend (Dev)` |
| Infra Project | Project ID 4 - `Infrastructure (Dev)` |
| Backend DSN | `http://<key>@localhost:8000/1` |
| Frontend DSN | `http://<key>@localhost:8000/2` |
| Frontend DSN | `https://<key>@localhost/bugsink-api/2` (via nginx proxy) |
| Infra DSN | `http://<key>@localhost:8000/4` (Logstash only) |
| Database | `postgresql://bugsink:bugsink_dev_password@postgres:5432/bugsink` |

**Important:** The Frontend DSN uses an nginx proxy (`/bugsink-api/`) because the browser cannot reach `localhost:8000` directly (container-internal port). See [Frontend Nginx Proxy](#frontend-nginx-proxy) for details.

**Configuration Files:**

| File | Purpose |
| ----------------- | ----------------------------------------------------------------- |
| `compose.dev.yml` | Initial DSNs using `127.0.0.1:8000` (container startup) |
| `.env.local` | **OVERRIDES** compose.dev.yml with `localhost:8000` (app runtime) |
| File | Purpose |
| ------------------------------ | ------------------------------------------------------- |
| `compose.dev.yml` | Initial DSNs using `127.0.0.1:8000` (container startup) |
| `.env.local` | **OVERRIDES** compose.dev.yml (app runtime) |
| `docker/nginx/dev.conf` | Nginx proxy for Bugsink API (frontend error reporting) |
| `docker/logstash/bugsink.conf` | Log routing to Backend/Infrastructure projects |

**Note:** `.env.local` takes precedence over `compose.dev.yml` environment variables.
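For reference, a minimal sketch of the `.env.local` entries involved in that override (the `<key>` values are placeholders copied from each project's settings page in the Bugsink UI):

```bash
# .env.local (sketch - substitute the real DSN keys from the Bugsink UI)
SENTRY_DSN=http://<key>@localhost:8000/1
VITE_SENTRY_DSN=https://<key>@localhost/bugsink-api/2
```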
@@ -360,75 +366,127 @@ const config = {

---

## Frontend Nginx Proxy

The frontend Sentry SDK runs in the browser, which cannot directly reach `localhost:8000` (the Bugsink container-internal port). To solve this, we use an nginx proxy.

### How It Works

```text
Browser --HTTPS--> https://localhost/bugsink-api/2/store/
                        |
                        v (nginx proxy)
              http://localhost:8000/api/2/store/
                        |
                        v
                  Bugsink (internal)
```

### Nginx Configuration

Location: `docker/nginx/dev.conf`

```nginx
# Proxy Bugsink Sentry API for frontend error reporting
location /bugsink-api/ {
    proxy_pass http://localhost:8000/api/;
    proxy_http_version 1.1;
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;

    # Allow large error payloads with stack traces
    client_max_body_size 10M;

    # Timeouts for error reporting
    proxy_connect_timeout 10s;
    proxy_send_timeout 30s;
    proxy_read_timeout 30s;
}
```
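If you adjust this block, you can validate and reload nginx inside the dev container without a full restart (a sketch; it assumes nginx runs inside `flyer-crawler-dev`, as the other nginx checks in this guide do):

```bash
# Validate the edited config, then reload without dropping connections
podman exec flyer-crawler-dev nginx -t
podman exec flyer-crawler-dev nginx -s reload
```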
### Frontend DSN Format

```bash
# .env.local
# Uses nginx proxy path instead of direct port
VITE_SENTRY_DSN=https://<key>@localhost/bugsink-api/2
```

### Testing Frontend Error Reporting

1. Open the browser console at `https://localhost`

2. Trigger a test error:

   ```javascript
   throw new Error('Test frontend error from browser');
   ```

3. Check the Bugsink Frontend (Dev) project for the error

4. Verify the browser console shows Sentry SDK activity (if `VITE_SENTRY_DEBUG=true`); a proxy-level check that works independently of the SDK is sketched below
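This curl sketch exercises the proxy path directly; `<key>` is a placeholder, and the exact status code depends on Bugsink's payload validation - the point is that any Bugsink response (e.g. 400/401) rather than an nginx 404 means the `/bugsink-api/` route is wired correctly:

```bash
# -k accepts the dev self-signed certificate
curl -sk -o /dev/null -w "HTTP %{http_code}\n" \
  -X POST "https://localhost/bugsink-api/2/store/" \
  -H "Content-Type: application/json" \
  -H "X-Sentry-Auth: Sentry sentry_version=7, sentry_key=<key>" \
  -d '{}'
```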
---

## Logstash Integration

Logstash aggregates logs from multiple sources and forwards error patterns to Bugsink.

**Note:** See [ADR-015](../adr/0015-application-performance-monitoring-and-error-tracking.md) for the full architecture.

### 3-Project Architecture

Logstash routes errors to different Bugsink projects based on log source:

| Project              | ID  | Receives                                      |
| -------------------- | --- | --------------------------------------------- |
| Backend API (Dev)    | 1   | Pino app errors, PostgreSQL errors            |
| Frontend (Dev)       | 2   | Browser errors (via Sentry SDK, not Logstash) |
| Infrastructure (Dev) | 4   | Redis warnings, NGINX errors, Vite errors     |

### Log Sources

| Source | Log Path | Error Detection |
| ---------- | ---------------------- | ------------------------- |
| Pino (app) | `/app/logs/*.log` | level >= 50 (error/fatal) |
| Redis | `/var/log/redis/*.log` | WARNING/ERROR log levels |
| PostgreSQL | (future) | ERROR/FATAL log levels |
| Source | Log Path | Project Destination | Error Detection |
| ---------- | --------------------------- | ------------------- | ------------------------- |
| PM2 API | `/var/log/pm2/api-*.log` | Backend (1) | level >= 50 (error/fatal) |
| PM2 Worker | `/var/log/pm2/worker-*.log` | Backend (1) | level >= 50 (error/fatal) |
| PM2 Vite | `/var/log/pm2/vite-*.log` | Infrastructure (4) | error keyword patterns |
| PostgreSQL | `/var/log/postgresql/*.log` | Backend (1) | ERROR/FATAL log levels |
| Redis | `/var/log/redis/*.log` | Infrastructure (4) | WARNING level (`#`) |
| NGINX | `/var/log/nginx/error.log` | Infrastructure (4) | error/crit/alert/emerg |
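A quick way to confirm each of these paths actually exists inside the dev container before suspecting the pipeline itself (a sketch; the globs are quoted so they expand inside the container rather than on the host):

```bash
for p in '/var/log/pm2/api-*.log' '/var/log/pm2/worker-*.log' '/var/log/pm2/vite-*.log' \
         '/var/log/postgresql/*.log' '/var/log/redis/*.log' '/var/log/nginx/error.log'; do
  podman exec flyer-crawler-dev sh -c "ls -lh $p" 2>/dev/null || echo "missing: $p"
done
```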
### Pipeline Configuration
|
||||
|
||||
**Location:** `/etc/logstash/conf.d/bugsink.conf`
|
||||
**Location:** `/etc/logstash/conf.d/bugsink.conf` (or `docker/logstash/bugsink.conf` in project)
|
||||
|
||||
```conf
|
||||
# === INPUTS ===
|
||||
input {
|
||||
file {
|
||||
path => "/app/logs/*.log"
|
||||
codec => json
|
||||
type => "pino"
|
||||
tags => ["app"]
|
||||
}
|
||||
The configuration:
|
||||
|
||||
file {
|
||||
path => "/var/log/redis/*.log"
|
||||
type => "redis"
|
||||
tags => ["redis"]
|
||||
}
|
||||
1. **Inputs**: Reads from PM2 logs, PostgreSQL logs, Redis logs, NGINX logs
|
||||
2. **Filters**: Detects errors and assigns tags based on log type
|
||||
3. **Outputs**: Routes to appropriate Bugsink project based on log source
|
||||
|
||||
**Key Routing Logic:**
|
||||
|
||||
```ruby
|
||||
# Infrastructure logs -> Project 4
|
||||
if "error" in [tags] and ([type] == "redis" or [type] == "nginx_error" or [type] == "pm2_vite") {
|
||||
http { url => "http://localhost:8000/api/4/store/" ... }
|
||||
}
|
||||
|
||||
# === FILTERS ===
|
||||
filter {
|
||||
if [type] == "pino" and [level] >= 50 {
|
||||
mutate { add_tag => ["error"] }
|
||||
}
|
||||
|
||||
if [type] == "redis" {
|
||||
grok {
|
||||
match => { "message" => "%{POSINT:pid}:%{WORD:role} %{MONTHDAY} %{MONTH} %{TIME} %{WORD:loglevel} %{GREEDYDATA:redis_message}" }
|
||||
}
|
||||
if [loglevel] in ["WARNING", "ERROR"] {
|
||||
mutate { add_tag => ["error"] }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# === OUTPUT ===
|
||||
output {
|
||||
if "error" in [tags] {
|
||||
http {
|
||||
url => "http://localhost:8000/api/store/"
|
||||
http_method => "post"
|
||||
format => "json"
|
||||
}
|
||||
}
|
||||
# Backend logs -> Project 1
|
||||
else if "error" in [tags] and ([type] in ["pm2_api", "pm2_worker", "pino", "postgres"]) {
|
||||
http { url => "http://localhost:8000/api/1/store/" ... }
|
||||
}
|
||||
```
|
||||
|
||||
### Benefits

1. **Secondary Capture Path**: Catches errors before SDK initialization
2. **Log-Based Errors**: Captures errors that don't throw exceptions
3. **Infrastructure Monitoring**: Redis connection issues, slow commands
4. **Historical Analysis**: Process existing log files
1. **Separation of Concerns**: Application errors separate from infrastructure issues
2. **Secondary Capture Path**: Catches errors before SDK initialization
3. **Log-Based Errors**: Captures errors that don't throw exceptions
4. **Infrastructure Monitoring**: Redis, NGINX, build tooling issues
5. **Historical Analysis**: Process existing log files

---
|
||||
|
||||
@@ -743,6 +801,228 @@ podman exec flyer-crawler-dev psql -U postgres -h postgres -c "\l" | grep bugsin
|
||||
ssh root@projectium.com "cd /opt/bugsink && bugsink-manage check"
|
||||
```
|
||||
|
||||
### PostgreSQL Sequence Out of Sync (Duplicate Key Errors)
|
||||
|
||||
**Symptoms:**
|
||||
|
||||
- Bugsink throws `duplicate key value violates unique constraint "projects_project_pkey"`
|
||||
- Error detail shows: `Key (id)=(1) already exists`
|
||||
- New projects or other entities fail to create
|
||||
|
||||
**Root Cause:**
|
||||
|
||||
PostgreSQL sequences can become out of sync with actual data after:
|
||||
|
||||
- Manual data insertion or database seeding
|
||||
- Restoring from backup
|
||||
- Copying data between environments
|
||||
|
||||
The sequence generates IDs that already exist in the table.
|
||||
|
||||
**Diagnosis:**
|
||||
|
||||
```bash
|
||||
# Dev Container - Check sequence vs max ID
|
||||
podman exec flyer-crawler-dev psql -U bugsink -h postgres -d bugsink -c "
|
||||
SELECT
|
||||
(SELECT MAX(id) FROM projects_project) as max_id,
|
||||
(SELECT last_value FROM projects_project_id_seq) as seq_last_value,
|
||||
CASE
|
||||
WHEN (SELECT MAX(id) FROM projects_project) <= (SELECT last_value FROM projects_project_id_seq)
|
||||
THEN 'OK'
|
||||
ELSE 'OUT OF SYNC - Needs reset'
|
||||
END as status;
|
||||
"
|
||||
|
||||
# Production
|
||||
ssh root@projectium.com "cd /opt/bugsink && bugsink-manage dbshell" <<< "
|
||||
SELECT MAX(id) as max_id, (SELECT last_value FROM projects_project_id_seq) as seq_value FROM projects_project;
|
||||
"
|
||||
```
|
||||
|
||||
**Solution:**
|
||||
|
||||
Reset the sequence to the maximum existing ID:
|
||||
|
||||
```bash
|
||||
# Dev Container
|
||||
podman exec flyer-crawler-dev psql -U bugsink -h postgres -d bugsink -c "
|
||||
SELECT setval('projects_project_id_seq', COALESCE((SELECT MAX(id) FROM projects_project), 1), true);
|
||||
"
|
||||
|
||||
# Production
|
||||
ssh root@projectium.com "cd /opt/bugsink && bugsink-manage dbshell" <<< "
|
||||
SELECT setval('projects_project_id_seq', COALESCE((SELECT MAX(id) FROM projects_project), 1), true);
|
||||
"
|
||||
```
|
||||
|
||||
**Verification:**
|
||||
|
||||
After running the fix, verify:
|
||||
|
||||
```bash
|
||||
# Next ID should be max_id + 1
|
||||
podman exec flyer-crawler-dev psql -U bugsink -h postgres -d bugsink -c "
|
||||
SELECT nextval('projects_project_id_seq') - 1 as current_seq_value;
|
||||
"
|
||||
```
|
||||
|
||||
**Prevention:**
|
||||
|
||||
When manually inserting data or restoring backups, always reset sequences:
|
||||
|
||||
```sql
|
||||
-- Generic pattern for any table/sequence
|
||||
SELECT setval('SEQUENCE_NAME', COALESCE((SELECT MAX(id) FROM TABLE_NAME), 1), true);
|
||||
|
||||
-- Common Bugsink sequences that may need reset:
|
||||
SELECT setval('projects_project_id_seq', COALESCE((SELECT MAX(id) FROM projects_project), 1), true);
|
||||
SELECT setval('teams_team_id_seq', COALESCE((SELECT MAX(id) FROM teams_team), 1), true);
|
||||
SELECT setval('releases_release_id_seq', COALESCE((SELECT MAX(id) FROM releases_release), 1), true);
|
||||
```
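In the dev container, those three resets can be applied in one shot (a sketch; it assumes the same `bugsink` credentials used elsewhere in this guide):

```bash
podman exec flyer-crawler-dev psql -U bugsink -h postgres -d bugsink -c "
SELECT setval('projects_project_id_seq', COALESCE((SELECT MAX(id) FROM projects_project), 1), true);
SELECT setval('teams_team_id_seq', COALESCE((SELECT MAX(id) FROM teams_team), 1), true);
SELECT setval('releases_release_id_seq', COALESCE((SELECT MAX(id) FROM releases_release), 1), true);
"
```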
|
||||
|
||||
### Logstash Level Field Constraint Violation
|
||||
|
||||
**Symptoms:**
|
||||
|
||||
- Bugsink errors: `value too long for type character varying(7)`
|
||||
- Errors in Backend API project from Logstash
|
||||
- Log shows `%{sentry_level}` literal string being sent
|
||||
|
||||
**Root Cause:**
|
||||
|
||||
Logstash sends the literal placeholder `%{sentry_level}` (16 characters) to Bugsink when:
|
||||
|
||||
- No error pattern is detected in the log message
|
||||
- The `sentry_level` field is not properly initialized
|
||||
- Bugsink's `level` column has a `varchar(7)` constraint
|
||||
|
||||
Valid Sentry levels are: `fatal`, `error`, `warning`, `info`, `debug` (all <= 7 characters).
|
||||
|
||||
**Diagnosis:**
|
||||
|
||||
```bash
|
||||
# Check for recent level constraint errors in Bugsink
|
||||
# Via MCP:
|
||||
mcp__localerrors__list_issues({ project_id: 1, status: 'unresolved' })
|
||||
|
||||
# Or check Logstash logs for HTTP 500 responses
|
||||
podman exec flyer-crawler-dev cat /var/log/logstash/logstash.log | grep "500"
|
||||
```
|
||||
|
||||
**Solution:**
|
||||
|
||||
The fix requires updating the Logstash configuration (`docker/logstash/bugsink.conf`) to:
|
||||
|
||||
1. Validate `sentry_level` is not nil, empty, or contains placeholder text
|
||||
2. Set a default value of "error" for any error-tagged event without a valid level
|
||||
3. Normalize levels to lowercase
|
||||
|
||||
**Key filter block (Ruby):**
|
||||
|
||||
```ruby
|
||||
ruby {
|
||||
code => '
|
||||
level = event.get("sentry_level")
|
||||
# Check if level is invalid (nil, empty, contains placeholder, or invalid value)
|
||||
if level.nil? || level.to_s.empty? || level.to_s.include?("%{") || level.to_s.length > 7
|
||||
# Default to "error" for error-tagged events, "info" otherwise
|
||||
if event.get("tags")&.include?("error")
|
||||
event.set("sentry_level", "error")
|
||||
else
|
||||
event.set("sentry_level", "info")
|
||||
end
|
||||
else
|
||||
# Normalize to lowercase and validate
|
||||
normalized = level.to_s.downcase
|
||||
valid_levels = ["fatal", "error", "warning", "info", "debug"]
|
||||
unless valid_levels.include?(normalized)
|
||||
normalized = "error"
|
||||
end
|
||||
event.set("sentry_level", normalized)
|
||||
end
|
||||
'
|
||||
}
|
||||
```
|
||||
|
||||
**Verification:**
|
||||
|
||||
After applying the fix:
|
||||
|
||||
1. Restart Logstash: `podman exec flyer-crawler-dev systemctl restart logstash`
|
||||
2. Generate a test error and verify it appears in Bugsink without level errors (one way to generate one is sketched after this list)
|
||||
3. Check no new "value too long" errors appear in the project
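One way to generate that test error without waiting for a real failure is to append a Pino-style error line to one of the watched PM2 logs (a sketch - the exact filename under `/var/log/pm2/` is an assumption; adjust it to whatever `ls /var/log/pm2/` shows):

```bash
# level 50 = "error" in Pino, so Logstash should tag and forward this line
podman exec flyer-crawler-dev sh -c \
  "echo '{\"level\":50,\"time\":$(date +%s)000,\"msg\":\"level normalization smoke test\"}' >> /var/log/pm2/api-out.log"
```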
|
||||
|
||||
### CSRF Verification Failed
|
||||
|
||||
**Symptoms:** "CSRF verification failed. Request aborted." error when performing actions in Bugsink UI (resolving issues, changing settings, etc.)
|
||||
|
||||
**Root Cause:**
|
||||
|
||||
Django 4.0+ requires `CSRF_TRUSTED_ORIGINS` to be explicitly configured for HTTPS POST requests. The error occurs because:
|
||||
|
||||
1. Bugsink is accessed via `https://localhost:8443` (nginx HTTPS proxy)
|
||||
2. Django's CSRF protection validates the `Origin` header against `CSRF_TRUSTED_ORIGINS`
|
||||
3. Without explicit configuration, Django rejects POST requests from HTTPS origins
|
||||
|
||||
**Why localhost vs 127.0.0.1 Matters:**
|
||||
|
||||
- `localhost` and `127.0.0.1` are treated as DIFFERENT origins by browsers
|
||||
- If you access Bugsink via `https://localhost:8443`, Django must trust `https://localhost:8443`
|
||||
- If you access via `https://127.0.0.1:8443`, Django must trust `https://127.0.0.1:8443`
|
||||
- The fix includes BOTH to allow either access pattern
|
||||
|
||||
**Configuration (Already Applied):**
|
||||
|
||||
The Bugsink Django configuration in `Dockerfile.dev` includes:
|
||||
|
||||
```python
|
||||
# CSRF Trusted Origins (Django 4.0+ requires full origin for HTTPS POST requests)
|
||||
CSRF_TRUSTED_ORIGINS = [
|
||||
"https://localhost:8443",
|
||||
"https://127.0.0.1:8443",
|
||||
"http://localhost:8000",
|
||||
"http://127.0.0.1:8000",
|
||||
]
|
||||
|
||||
# HTTPS proxy support (nginx reverse proxy on port 8443)
|
||||
SECURE_PROXY_SSL_HEADER = ("HTTP_X_FORWARDED_PROTO", "https")
|
||||
```
|
||||
|
||||
**Verification:**
|
||||
|
||||
```bash
|
||||
# Verify CSRF_TRUSTED_ORIGINS is configured
|
||||
podman exec flyer-crawler-dev sh -c 'cat /opt/bugsink/conf/bugsink_conf.py | grep -A 6 CSRF_TRUSTED'
|
||||
|
||||
# Expected output:
|
||||
# CSRF_TRUSTED_ORIGINS = [
|
||||
# "https://localhost:8443",
|
||||
# "https://127.0.0.1:8443",
|
||||
# "http://localhost:8000",
|
||||
# "http://127.0.0.1:8000",
|
||||
# ]
|
||||
```
|
||||
|
||||
**If Issue Persists After Fix:**
|
||||
|
||||
1. **Rebuild the container image** (configuration is baked into the image):
|
||||
|
||||
```bash
|
||||
podman-compose -f compose.dev.yml down
|
||||
podman build -f Dockerfile.dev -t localhost/flyer-crawler-dev:latest .
|
||||
podman-compose -f compose.dev.yml up -d
|
||||
```
|
||||
|
||||
2. **Clear browser cookies** for localhost:8443
|
||||
|
||||
3. **Check nginx X-Forwarded-Proto header** - the nginx config must set this header for Django to recognize HTTPS:
|
||||
|
||||
```bash
|
||||
podman exec flyer-crawler-dev cat /etc/nginx/sites-available/bugsink | grep X-Forwarded-Proto
|
||||
# Should show: proxy_set_header X-Forwarded-Proto $scheme;
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Related Documentation
|
||||
|
||||
@@ -44,6 +44,8 @@ if (missingVars.length > 0) {
|
||||
// --- Shared Environment Variables ---
|
||||
// These come from compose.dev.yml environment section
|
||||
const sharedEnv = {
|
||||
// Timezone: PST (America/Los_Angeles) for consistent log timestamps
|
||||
TZ: process.env.TZ || 'America/Los_Angeles',
|
||||
NODE_ENV: 'development',
|
||||
DB_HOST: process.env.DB_HOST || 'postgres',
|
||||
DB_PORT: process.env.DB_PORT || '5432',
|
||||
@@ -160,6 +162,8 @@ module.exports = {
|
||||
min_uptime: '5s',
|
||||
// Environment
|
||||
env: {
|
||||
// Timezone: PST (America/Los_Angeles) for consistent log timestamps
|
||||
TZ: process.env.TZ || 'America/Los_Angeles',
|
||||
NODE_ENV: 'development',
|
||||
// Vite-specific env vars (VITE_ prefix)
|
||||
VITE_SENTRY_DSN: process.env.VITE_SENTRY_DSN,
|
||||
|
||||
4
package-lock.json
generated
4
package-lock.json
generated
@@ -1,12 +1,12 @@
|
||||
{
|
||||
"name": "flyer-crawler",
|
||||
"version": "0.12.8",
|
||||
"version": "0.12.14",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {
|
||||
"": {
|
||||
"name": "flyer-crawler",
|
||||
"version": "0.12.8",
|
||||
"version": "0.12.14",
|
||||
"dependencies": {
|
||||
"@bull-board/api": "^6.14.2",
|
||||
"@bull-board/express": "^6.14.2",
|
||||
|
||||
12
package.json
12
package.json
@@ -1,7 +1,7 @@
|
||||
{
|
||||
"name": "flyer-crawler",
|
||||
"private": true,
|
||||
"version": "0.12.8",
|
||||
"version": "0.12.14",
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"dev": "concurrently \"npm:start:dev\" \"vite\"",
|
||||
@@ -14,12 +14,12 @@
|
||||
"start": "npm run start:prod",
|
||||
"build": "vite build",
|
||||
"preview": "vite preview",
|
||||
"test": "node scripts/check-linux.js && cross-env NODE_ENV=test tsx ./node_modules/vitest/vitest.mjs run",
|
||||
"test-wsl": "cross-env NODE_ENV=test vitest run",
|
||||
"test": "node scripts/check-linux.js && cross-env NODE_ENV=test TZ= tsx ./node_modules/vitest/vitest.mjs run",
|
||||
"test-wsl": "cross-env NODE_ENV=test TZ= vitest run",
|
||||
"test:coverage": "npm run clean && npm run test:unit -- --coverage && npm run test:integration -- --coverage",
|
||||
"test:unit": "node scripts/check-linux.js && cross-env NODE_ENV=test tsx --max-old-space-size=8192 ./node_modules/vitest/vitest.mjs run --project unit -c vite.config.ts",
|
||||
"test:integration": "node scripts/check-linux.js && cross-env NODE_ENV=test tsx --max-old-space-size=8192 ./node_modules/vitest/vitest.mjs run --project integration -c vitest.config.integration.ts",
|
||||
"test:e2e": "node scripts/check-linux.js && cross-env NODE_ENV=test tsx --max-old-space-size=8192 ./node_modules/vitest/vitest.mjs run --config vitest.config.e2e.ts",
|
||||
"test:unit": "node scripts/check-linux.js && cross-env NODE_ENV=test TZ= tsx --max-old-space-size=8192 ./node_modules/vitest/vitest.mjs run --project unit -c vite.config.ts",
|
||||
"test:integration": "node scripts/check-linux.js && cross-env NODE_ENV=test TZ= tsx --max-old-space-size=8192 ./node_modules/vitest/vitest.mjs run --project integration -c vitest.config.integration.ts",
|
||||
"test:e2e": "node scripts/check-linux.js && cross-env NODE_ENV=test TZ= tsx --max-old-space-size=8192 ./node_modules/vitest/vitest.mjs run --config vitest.config.e2e.ts",
|
||||
"format": "prettier --write .",
|
||||
"lint": "eslint . --ext ts,tsx --report-unused-disable-directives --max-warnings 0",
|
||||
"type-check": "tsc --noEmit",
|
||||
|
||||
@@ -23,6 +23,26 @@ set -e
|
||||
|
||||
echo "Starting Flyer Crawler Dev Container..."
|
||||
|
||||
# ============================================================================
|
||||
# Timezone Configuration
|
||||
# ============================================================================
|
||||
# Ensure TZ is set for consistent log timestamps across all services.
|
||||
# TZ should be set via compose.dev.yml environment (default: America/Los_Angeles)
|
||||
# ============================================================================
|
||||
if [ -n "$TZ" ]; then
|
||||
echo "Timezone configured: $TZ"
|
||||
# Link timezone data if available (for date command and other tools)
|
||||
if [ -f "/usr/share/zoneinfo/$TZ" ]; then
|
||||
ln -sf "/usr/share/zoneinfo/$TZ" /etc/localtime
|
||||
echo "$TZ" > /etc/timezone
|
||||
echo "System timezone set to: $(date +%Z) ($(date))"
|
||||
else
|
||||
echo "Warning: Timezone data not found for $TZ, using TZ environment variable only"
|
||||
fi
|
||||
else
|
||||
echo "Warning: TZ environment variable not set, using container default timezone"
|
||||
fi
|
||||
|
||||
# Configure Bugsink HTTPS (ADR-015)
|
||||
echo "Configuring Bugsink HTTPS..."
|
||||
mkdir -p /etc/bugsink/ssl
|
||||
|
||||
@@ -27,9 +27,13 @@ const defaultProps = {
|
||||
};
|
||||
|
||||
const setupSuccessMocks = () => {
|
||||
// The API returns {success, data: {userprofile, token}}, and the mutation extracts .data
|
||||
const mockAuthResponse = {
|
||||
userprofile: createMockUserProfile({ user: { user_id: '123', email: 'test@example.com' } }),
|
||||
token: 'mock-token',
|
||||
success: true,
|
||||
data: {
|
||||
userprofile: createMockUserProfile({ user: { user_id: '123', email: 'test@example.com' } }),
|
||||
token: 'mock-token',
|
||||
},
|
||||
};
|
||||
(mockedApiClient.loginUser as Mock).mockResolvedValue(
|
||||
new Response(JSON.stringify(mockAuthResponse)),
|
||||
|
||||
@@ -82,7 +82,11 @@ const defaultAuthenticatedProps = {
|
||||
};
|
||||
|
||||
const setupSuccessMocks = () => {
|
||||
const mockAuthResponse = { userprofile: authenticatedProfile, token: 'mock-token' };
|
||||
// The API returns {success, data: {userprofile, token}}, and the mutation extracts .data
|
||||
const mockAuthResponse = {
|
||||
success: true,
|
||||
data: { userprofile: authenticatedProfile, token: 'mock-token' },
|
||||
};
|
||||
(mockedApiClient.loginUser as Mock).mockResolvedValue(
|
||||
new Response(JSON.stringify(mockAuthResponse)),
|
||||
);
|
||||
|
||||
@@ -619,4 +619,240 @@ describe('Health Routes (/api/health)', () => {
|
||||
expect(response.body.error.details.database.message).toBe('Database connection failed');
|
||||
});
|
||||
});
|
||||
|
||||
// =============================================================================
|
||||
// QUEUE HEALTH MONITORING (ADR-053)
|
||||
// =============================================================================
|
||||
|
||||
describe('GET /queues', () => {
|
||||
// Mock the queues module
|
||||
beforeEach(async () => {
|
||||
vi.resetModules();
|
||||
// Re-import after mocks are set up
|
||||
});
|
||||
|
||||
it('should return 200 OK with queue metrics and worker heartbeats when all healthy', async () => {
|
||||
// Arrange: Mock queue getJobCounts() and Redis heartbeats
|
||||
const mockQueues = await import('../services/queues.server');
|
||||
const mockQueue = {
|
||||
getJobCounts: vi.fn().mockResolvedValue({
|
||||
waiting: 5,
|
||||
active: 2,
|
||||
failed: 1,
|
||||
delayed: 0,
|
||||
}),
|
||||
};
|
||||
|
||||
// Mock all queues
|
||||
vi.spyOn(mockQueues, 'flyerQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'emailQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'analyticsQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'weeklyAnalyticsQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'cleanupQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'tokenCleanupQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'receiptQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'expiryAlertQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'barcodeQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
|
||||
// Mock Redis heartbeat responses (all healthy, last seen < 60s ago)
|
||||
const recentTimestamp = new Date(Date.now() - 10000).toISOString(); // 10 seconds ago
|
||||
const heartbeatValue = JSON.stringify({
|
||||
timestamp: recentTimestamp,
|
||||
pid: 1234,
|
||||
host: 'test-host',
|
||||
});
|
||||
|
||||
mockedRedisConnection.get = vi.fn().mockResolvedValue(heartbeatValue);
|
||||
|
||||
// Act
|
||||
const response = await supertest(app).get('/api/health/queues');
|
||||
|
||||
// Assert
|
||||
expect(response.status).toBe(200);
|
||||
expect(response.body.success).toBe(true);
|
||||
expect(response.body.data.status).toBe('healthy');
|
||||
expect(response.body.data.queues).toBeDefined();
|
||||
expect(response.body.data.workers).toBeDefined();
|
||||
|
||||
// Verify queue metrics structure
|
||||
expect(response.body.data.queues['flyer-processing']).toEqual({
|
||||
waiting: 5,
|
||||
active: 2,
|
||||
failed: 1,
|
||||
delayed: 0,
|
||||
});
|
||||
|
||||
// Verify worker heartbeat structure
|
||||
expect(response.body.data.workers['flyer-processing']).toEqual({
|
||||
alive: true,
|
||||
lastSeen: recentTimestamp,
|
||||
pid: 1234,
|
||||
host: 'test-host',
|
||||
});
|
||||
});
|
||||
|
||||
it('should return 503 when a queue is unavailable', async () => {
|
||||
// Arrange: Mock one queue to fail
|
||||
const mockQueues = await import('../services/queues.server');
|
||||
const healthyQueue = {
|
||||
getJobCounts: vi.fn().mockResolvedValue({
|
||||
waiting: 0,
|
||||
active: 0,
|
||||
failed: 0,
|
||||
delayed: 0,
|
||||
}),
|
||||
};
|
||||
const failingQueue = {
|
||||
getJobCounts: vi.fn().mockRejectedValue(new Error('Redis connection lost')),
|
||||
};
|
||||
|
||||
vi.spyOn(mockQueues, 'flyerQueue', 'get').mockReturnValue(failingQueue as never);
|
||||
vi.spyOn(mockQueues, 'emailQueue', 'get').mockReturnValue(healthyQueue as never);
|
||||
vi.spyOn(mockQueues, 'analyticsQueue', 'get').mockReturnValue(healthyQueue as never);
|
||||
vi.spyOn(mockQueues, 'weeklyAnalyticsQueue', 'get').mockReturnValue(healthyQueue as never);
|
||||
vi.spyOn(mockQueues, 'cleanupQueue', 'get').mockReturnValue(healthyQueue as never);
|
||||
vi.spyOn(mockQueues, 'tokenCleanupQueue', 'get').mockReturnValue(healthyQueue as never);
|
||||
vi.spyOn(mockQueues, 'receiptQueue', 'get').mockReturnValue(healthyQueue as never);
|
||||
vi.spyOn(mockQueues, 'expiryAlertQueue', 'get').mockReturnValue(healthyQueue as never);
|
||||
vi.spyOn(mockQueues, 'barcodeQueue', 'get').mockReturnValue(healthyQueue as never);
|
||||
|
||||
mockedRedisConnection.get = vi.fn().mockResolvedValue(null);
|
||||
|
||||
// Act
|
||||
const response = await supertest(app).get('/api/health/queues');
|
||||
|
||||
// Assert
|
||||
expect(response.status).toBe(503);
|
||||
expect(response.body.success).toBe(false);
|
||||
expect(response.body.error.message).toBe('One or more queues or workers unavailable');
|
||||
expect(response.body.error.details.status).toBe('unhealthy');
|
||||
expect(response.body.error.details.queues['flyer-processing']).toEqual({
|
||||
error: 'Redis connection lost',
|
||||
});
|
||||
});
|
||||
|
||||
it('should return 503 when a worker heartbeat is stale', async () => {
|
||||
// Arrange: Mock queues as healthy but one worker heartbeat as stale
|
||||
const mockQueues = await import('../services/queues.server');
|
||||
const mockQueue = {
|
||||
getJobCounts: vi.fn().mockResolvedValue({
|
||||
waiting: 0,
|
||||
active: 0,
|
||||
failed: 0,
|
||||
delayed: 0,
|
||||
}),
|
||||
};
|
||||
|
||||
vi.spyOn(mockQueues, 'flyerQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'emailQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'analyticsQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'weeklyAnalyticsQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'cleanupQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'tokenCleanupQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'receiptQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'expiryAlertQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'barcodeQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
|
||||
// Mock heartbeat - one worker is stale (> 60s ago)
|
||||
const staleTimestamp = new Date(Date.now() - 120000).toISOString(); // 120 seconds ago
|
||||
const staleHeartbeat = JSON.stringify({
|
||||
timestamp: staleTimestamp,
|
||||
pid: 1234,
|
||||
host: 'test-host',
|
||||
});
|
||||
|
||||
// First call returns stale heartbeat for flyer-processing, rest return null (no heartbeat)
|
||||
let callCount = 0;
|
||||
mockedRedisConnection.get = vi.fn().mockImplementation(() => {
|
||||
callCount++;
|
||||
return Promise.resolve(callCount === 1 ? staleHeartbeat : null);
|
||||
});
|
||||
|
||||
// Act
|
||||
const response = await supertest(app).get('/api/health/queues');
|
||||
|
||||
// Assert
|
||||
expect(response.status).toBe(503);
|
||||
expect(response.body.success).toBe(false);
|
||||
expect(response.body.error.details.status).toBe('unhealthy');
|
||||
expect(response.body.error.details.workers['flyer-processing']).toEqual({ alive: false });
|
||||
});
|
||||
|
||||
it('should return 503 when worker heartbeat is missing', async () => {
|
||||
// Arrange: Mock queues as healthy but no worker heartbeats in Redis
|
||||
const mockQueues = await import('../services/queues.server');
|
||||
const mockQueue = {
|
||||
getJobCounts: vi.fn().mockResolvedValue({
|
||||
waiting: 0,
|
||||
active: 0,
|
||||
failed: 0,
|
||||
delayed: 0,
|
||||
}),
|
||||
};
|
||||
|
||||
vi.spyOn(mockQueues, 'flyerQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'emailQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'analyticsQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'weeklyAnalyticsQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'cleanupQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'tokenCleanupQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'receiptQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'expiryAlertQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'barcodeQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
|
||||
// Mock Redis to return null (no heartbeat found)
|
||||
mockedRedisConnection.get = vi.fn().mockResolvedValue(null);
|
||||
|
||||
// Act
|
||||
const response = await supertest(app).get('/api/health/queues');
|
||||
|
||||
// Assert
|
||||
expect(response.status).toBe(503);
|
||||
expect(response.body.success).toBe(false);
|
||||
expect(response.body.error.details.status).toBe('unhealthy');
|
||||
expect(response.body.error.details.workers['flyer-processing']).toEqual({ alive: false });
|
||||
});
|
||||
|
||||
it('should handle Redis connection errors gracefully', async () => {
|
||||
// Arrange: Mock queues to succeed but Redis get() to fail
|
||||
const mockQueues = await import('../services/queues.server');
|
||||
const mockQueue = {
|
||||
getJobCounts: vi.fn().mockResolvedValue({
|
||||
waiting: 0,
|
||||
active: 0,
|
||||
failed: 0,
|
||||
delayed: 0,
|
||||
}),
|
||||
};
|
||||
|
||||
vi.spyOn(mockQueues, 'flyerQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'emailQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'analyticsQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'weeklyAnalyticsQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'cleanupQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'tokenCleanupQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'receiptQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'expiryAlertQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
vi.spyOn(mockQueues, 'barcodeQueue', 'get').mockReturnValue(mockQueue as never);
|
||||
|
||||
// Mock Redis get() to throw error
|
||||
mockedRedisConnection.get = vi.fn().mockRejectedValue(new Error('Redis connection lost'));
|
||||
|
||||
// Act
|
||||
const response = await supertest(app).get('/api/health/queues');
|
||||
|
||||
// Assert: Should still return queue metrics but mark workers as unhealthy
|
||||
expect(response.status).toBe(503);
|
||||
expect(response.body.error.details.queues['flyer-processing']).toEqual({
|
||||
waiting: 0,
|
||||
active: 0,
|
||||
failed: 0,
|
||||
delayed: 0,
|
||||
});
|
||||
expect(response.body.error.details.workers['flyer-processing']).toEqual({
|
||||
alive: false,
|
||||
error: 'Redis connection lost',
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
@@ -15,6 +15,17 @@ import fs from 'node:fs/promises';
|
||||
import { getSimpleWeekAndYear } from '../utils/dateUtils';
|
||||
import { validateRequest } from '../middleware/validation.middleware';
|
||||
import { sendSuccess, sendError, ErrorCode } from '../utils/apiResponse';
|
||||
import {
|
||||
flyerQueue,
|
||||
emailQueue,
|
||||
analyticsQueue,
|
||||
weeklyAnalyticsQueue,
|
||||
cleanupQueue,
|
||||
tokenCleanupQueue,
|
||||
receiptQueue,
|
||||
expiryAlertQueue,
|
||||
barcodeQueue,
|
||||
} from '../services/queues.server';
|
||||
|
||||
const router = Router();
|
||||
|
||||
@@ -442,4 +453,224 @@ router.get(
|
||||
},
|
||||
);
|
||||
|
||||
// =============================================================================
|
||||
// QUEUE HEALTH MONITORING (ADR-053)
|
||||
// =============================================================================
|
||||
|
||||
/**
|
||||
* @openapi
|
||||
* /health/queues:
|
||||
* get:
|
||||
* summary: Queue health and metrics with worker heartbeats
|
||||
* description: |
|
||||
* Returns job counts for all BullMQ queues and worker heartbeat status.
|
||||
* Use this endpoint to monitor queue depths and detect stuck/frozen workers.
|
||||
* Implements ADR-053: Worker Health Checks and Stalled Job Monitoring.
|
||||
* tags:
|
||||
* - Health
|
||||
* responses:
|
||||
* 200:
|
||||
* description: Queue metrics and worker heartbeats retrieved successfully
|
||||
* content:
|
||||
* application/json:
|
||||
* schema:
|
||||
* type: object
|
||||
* properties:
|
||||
* success:
|
||||
* type: boolean
|
||||
* example: true
|
||||
* data:
|
||||
* type: object
|
||||
* properties:
|
||||
* status:
|
||||
* type: string
|
||||
* enum: [healthy, unhealthy]
|
||||
* timestamp:
|
||||
* type: string
|
||||
* format: date-time
|
||||
* queues:
|
||||
* type: object
|
||||
* additionalProperties:
|
||||
* type: object
|
||||
* properties:
|
||||
* waiting:
|
||||
* type: integer
|
||||
* active:
|
||||
* type: integer
|
||||
* failed:
|
||||
* type: integer
|
||||
* delayed:
|
||||
* type: integer
|
||||
* workers:
|
||||
* type: object
|
||||
* additionalProperties:
|
||||
* type: object
|
||||
* properties:
|
||||
* alive:
|
||||
* type: boolean
|
||||
* lastSeen:
|
||||
* type: string
|
||||
* format: date-time
|
||||
* pid:
|
||||
* type: integer
|
||||
* host:
|
||||
* type: string
|
||||
* 503:
|
||||
* description: Redis unavailable or workers not responding
|
||||
* content:
|
||||
* application/json:
|
||||
* schema:
|
||||
* $ref: '#/components/schemas/ErrorResponse'
|
||||
*/
|
||||
router.get(
|
||||
'/queues',
|
||||
validateRequest(emptySchema),
|
||||
async (req: Request, res: Response, next: NextFunction) => {
|
||||
try {
|
||||
// Define all queues to monitor
|
||||
const queues = [
|
||||
{ name: 'flyer-processing', queue: flyerQueue },
|
||||
{ name: 'email-sending', queue: emailQueue },
|
||||
{ name: 'analytics-reporting', queue: analyticsQueue },
|
||||
{ name: 'weekly-analytics-reporting', queue: weeklyAnalyticsQueue },
|
||||
{ name: 'file-cleanup', queue: cleanupQueue },
|
||||
{ name: 'token-cleanup', queue: tokenCleanupQueue },
|
||||
{ name: 'receipt-processing', queue: receiptQueue },
|
||||
{ name: 'expiry-alerts', queue: expiryAlertQueue },
|
||||
{ name: 'barcode-detection', queue: barcodeQueue },
|
||||
];
|
||||
|
||||
// Fetch job counts for all queues in parallel
|
||||
const queueMetrics = await Promise.all(
|
||||
queues.map(async ({ name, queue }) => {
|
||||
try {
|
||||
const counts = await queue.getJobCounts();
|
||||
return {
|
||||
name,
|
||||
counts: {
|
||||
waiting: counts.waiting || 0,
|
||||
active: counts.active || 0,
|
||||
failed: counts.failed || 0,
|
||||
delayed: counts.delayed || 0,
|
||||
},
|
||||
};
|
||||
} catch (error) {
|
||||
// If individual queue fails, return error state
|
||||
return {
|
||||
name,
|
||||
error: error instanceof Error ? error.message : 'Unknown error',
|
||||
};
|
||||
}
|
||||
}),
|
||||
);
|
||||
|
||||
// Fetch worker heartbeats in parallel
|
||||
const workerNames = queues.map((q) => q.name);
|
||||
const workerHeartbeats = await Promise.all(
|
||||
workerNames.map(async (name) => {
|
||||
try {
|
||||
const key = `worker:heartbeat:${name}`;
|
||||
const value = await redisConnection.get(key);
|
||||
|
||||
if (!value) {
|
||||
return { name, alive: false };
|
||||
}
|
||||
|
||||
const heartbeat = JSON.parse(value) as {
|
||||
timestamp: string;
|
||||
pid: number;
|
||||
host: string;
|
||||
};
|
||||
const lastSeenMs = new Date(heartbeat.timestamp).getTime();
|
||||
const nowMs = Date.now();
|
||||
const ageSeconds = (nowMs - lastSeenMs) / 1000;
|
||||
|
||||
// Consider alive if last heartbeat < 60 seconds ago
|
||||
const alive = ageSeconds < 60;
|
||||
|
||||
return {
|
||||
name,
|
||||
alive,
|
||||
lastSeen: heartbeat.timestamp,
|
||||
pid: heartbeat.pid,
|
||||
host: heartbeat.host,
|
||||
};
|
||||
} catch (error) {
|
||||
// If heartbeat check fails, mark as unknown
|
||||
return {
|
||||
name,
|
||||
alive: false,
|
||||
error: error instanceof Error ? error.message : 'Unknown error',
|
||||
};
|
||||
}
|
||||
}),
|
||||
);
|
||||
|
||||
// Build response objects
|
||||
const queuesData: Record<
|
||||
string,
|
||||
{ waiting: number; active: number; failed: number; delayed: number } | { error: string }
|
||||
> = {};
|
||||
const workersData: Record<
|
||||
string,
|
||||
| { alive: boolean; lastSeen?: string; pid?: number; host?: string }
|
||||
| { alive: boolean; error: string }
|
||||
> = {};
|
||||
let hasErrors = false;
|
||||
|
||||
for (const metric of queueMetrics) {
|
||||
if ('error' in metric) {
|
||||
queuesData[metric.name] = { error: metric.error };
|
||||
hasErrors = true;
|
||||
} else {
|
||||
queuesData[metric.name] = metric.counts;
|
||||
}
|
||||
}
|
||||
|
||||
for (const heartbeat of workerHeartbeats) {
|
||||
if ('error' in heartbeat) {
|
||||
workersData[heartbeat.name] = { alive: false, error: heartbeat.error };
|
||||
} else if (!heartbeat.alive) {
|
||||
workersData[heartbeat.name] = { alive: false };
|
||||
hasErrors = true;
|
||||
} else {
|
||||
workersData[heartbeat.name] = {
|
||||
alive: heartbeat.alive,
|
||||
lastSeen: heartbeat.lastSeen,
|
||||
pid: heartbeat.pid,
|
||||
host: heartbeat.host,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
const response = {
|
||||
status: hasErrors ? ('unhealthy' as const) : ('healthy' as const),
|
||||
timestamp: new Date().toISOString(),
|
||||
queues: queuesData,
|
||||
workers: workersData,
|
||||
};
|
||||
|
||||
if (hasErrors) {
|
||||
return sendError(
|
||||
res,
|
||||
ErrorCode.SERVICE_UNAVAILABLE,
|
||||
'One or more queues or workers unavailable',
|
||||
503,
|
||||
response,
|
||||
);
|
||||
}
|
||||
|
||||
return sendSuccess(res, response);
|
||||
} catch (error: unknown) {
|
||||
// Redis connection error or other unexpected failure
|
||||
if (error instanceof Error) {
|
||||
return next(error);
|
||||
}
|
||||
const message =
|
||||
(error as { message?: string })?.message || 'Failed to retrieve queue metrics';
|
||||
return next(new Error(message));
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
export default router;
|
||||
|
||||
@@ -132,7 +132,8 @@ describe('API Client', () => {
|
||||
.mockResolvedValueOnce({
|
||||
ok: true,
|
||||
status: 200,
|
||||
json: () => Promise.resolve({ token: 'new-refreshed-token' }),
|
||||
// The API returns {success, data: {token}} wrapper format
|
||||
json: () => Promise.resolve({ success: true, data: { token: 'new-refreshed-token' } }),
|
||||
} as Response)
|
||||
.mockResolvedValueOnce({
|
||||
ok: true,
|
||||
@@ -218,7 +219,7 @@ describe('API Client', () => {
|
||||
localStorage.setItem('authToken', 'expired-token');
|
||||
// Mock the global fetch to return a sequence of responses:
|
||||
// 1. 401 Unauthorized (initial API call)
|
||||
// 2. 200 OK (token refresh call)
|
||||
// 2. 200 OK (token refresh call) - uses API wrapper format {success, data: {token}}
|
||||
// 3. 200 OK (retry of the initial API call)
|
||||
vi.mocked(global.fetch)
|
||||
.mockResolvedValueOnce({
|
||||
@@ -229,7 +230,8 @@ describe('API Client', () => {
|
||||
.mockResolvedValueOnce({
|
||||
ok: true,
|
||||
status: 200,
|
||||
json: () => Promise.resolve({ token: 'new-refreshed-token' }),
|
||||
// The API returns {success, data: {token}} wrapper format
|
||||
json: () => Promise.resolve({ success: true, data: { token: 'new-refreshed-token' } }),
|
||||
} as Response)
|
||||
.mockResolvedValueOnce({
|
||||
ok: true,
|
||||
|
||||
@@ -62,12 +62,33 @@ vi.mock('./logger.server', () => ({
|
||||
vi.mock('bullmq', () => ({
|
||||
Worker: mocks.MockWorker,
|
||||
Queue: vi.fn(function () {
|
||||
return { add: vi.fn() };
|
||||
return { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) };
|
||||
}),
|
||||
// Add UnrecoverableError to the mock so it can be used in tests
|
||||
UnrecoverableError: class UnrecoverableError extends Error {},
|
||||
}));
|
||||
|
||||
// Mock redis.server to prevent real Redis connection attempts
|
||||
vi.mock('./redis.server', () => ({
|
||||
connection: {
|
||||
on: vi.fn(),
|
||||
quit: vi.fn().mockResolvedValue(undefined),
|
||||
},
|
||||
}));
|
||||
|
||||
// Mock queues.server to provide mock queue instances
|
||||
vi.mock('./queues.server', () => ({
|
||||
flyerQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
|
||||
emailQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
|
||||
analyticsQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
|
||||
cleanupQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
|
||||
weeklyAnalyticsQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
|
||||
tokenCleanupQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
|
||||
receiptQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
|
||||
expiryAlertQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
|
||||
barcodeQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
|
||||
}));
|
||||
|
||||
// Mock flyerProcessingService.server as flyerWorker and cleanupWorker depend on it
|
||||
vi.mock('./flyerProcessingService.server', () => {
|
||||
// Mock the constructor to return an object with the mocked methods
|
||||
@@ -88,6 +109,67 @@ vi.mock('./flyerDataTransformer', () => ({
|
||||
},
|
||||
}));
|
||||
|
||||
// Mock aiService.server to prevent initialization issues
|
||||
vi.mock('./aiService.server', () => ({
|
||||
aiService: {
|
||||
extractAndValidateData: vi.fn(),
|
||||
},
|
||||
}));
|
||||
|
||||
// Mock db/index.db to prevent database connections
|
||||
vi.mock('./db/index.db', () => ({
|
||||
personalizationRepo: {},
|
||||
}));
|
||||
|
||||
// Mock flyerAiProcessor.server
|
||||
vi.mock('./flyerAiProcessor.server', () => ({
|
||||
FlyerAiProcessor: vi.fn().mockImplementation(function () {
|
||||
return { processFlyer: vi.fn() };
|
||||
}),
|
||||
}));
|
||||
|
||||
// Mock flyerPersistenceService.server
|
||||
vi.mock('./flyerPersistenceService.server', () => ({
|
||||
FlyerPersistenceService: vi.fn().mockImplementation(function () {
|
||||
return { persistFlyerData: vi.fn() };
|
||||
}),
|
||||
}));
|
||||
|
||||
// Mock db/connection.db to prevent database connections
|
||||
vi.mock('./db/connection.db', () => ({
|
||||
withTransaction: vi.fn(),
|
||||
}));
|
||||
|
||||
// Mock receiptService.server
|
||||
vi.mock('./receiptService.server', () => ({
|
||||
processReceiptJob: vi.fn().mockResolvedValue(undefined),
|
||||
}));
|
||||
|
||||
// Mock expiryService.server
|
||||
vi.mock('./expiryService.server', () => ({
|
||||
processExpiryAlertJob: vi.fn().mockResolvedValue(undefined),
|
||||
}));
|
||||
|
||||
// Mock barcodeService.server
|
||||
vi.mock('./barcodeService.server', () => ({
|
||||
processBarcodeDetectionJob: vi.fn().mockResolvedValue(undefined),
|
||||
}));
|
||||
|
||||
// Mock flyerFileHandler.server
|
||||
vi.mock('./flyerFileHandler.server', () => ({
|
||||
FlyerFileHandler: vi.fn().mockImplementation(function () {
|
||||
return { handleFile: vi.fn() };
|
||||
}),
|
||||
}));
|
||||
|
||||
// Mock workerOptions config
|
||||
vi.mock('../config/workerOptions', () => ({
|
||||
defaultWorkerOptions: {
|
||||
lockDuration: 30000,
|
||||
stalledInterval: 30000,
|
||||
},
|
||||
}));
|
||||
|
||||
// Helper to create a mock BullMQ Job object
|
||||
const createMockJob = <T>(data: T): Job<T> => {
|
||||
return {
|
||||
|
||||
@@ -3,6 +3,7 @@ import { Worker, Job } from 'bullmq';
|
||||
import fsPromises from 'node:fs/promises';
|
||||
import { exec } from 'child_process';
|
||||
import { promisify } from 'util';
|
||||
import os from 'os';
|
||||
|
||||
import { logger } from './logger.server';
|
||||
import { connection } from './redis.server';
|
||||
@@ -91,6 +92,45 @@ const createWorkerProcessor = <T, R>(processor: (job: Job<T>) => Promise<R>) =>
|
||||
};
|
||||
};
|
||||
|
||||
/**
|
||||
* Updates the worker heartbeat in Redis.
|
||||
* Stores timestamp, PID, and hostname to detect frozen/hung workers.
|
||||
* TTL is 90s, so if heartbeat isn't updated for 90s, the key expires.
|
||||
* Implements ADR-053: Worker Health Checks.
|
||||
*/
|
||||
const updateWorkerHeartbeat = async (workerName: string) => {
|
||||
const key = `worker:heartbeat:${workerName}`;
|
||||
const value = JSON.stringify({
|
||||
timestamp: new Date().toISOString(),
|
||||
pid: process.pid,
|
||||
host: os.hostname(),
|
||||
});
|
||||
|
||||
try {
|
||||
await connection.set(key, value, 'EX', 90);
|
||||
} catch (error) {
|
||||
logger.error({ err: error, workerName }, `Failed to update heartbeat for worker ${workerName}`);
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Starts periodic heartbeat updates for a worker.
|
||||
* Updates every 30 seconds with 90s TTL.
|
||||
*/
|
||||
const startWorkerHeartbeat = (worker: Worker) => {
|
||||
// Initial heartbeat
|
||||
updateWorkerHeartbeat(worker.name);
|
||||
|
||||
// Periodic heartbeat updates
|
||||
const heartbeatInterval = setInterval(() => {
|
||||
updateWorkerHeartbeat(worker.name);
|
||||
}, 30000); // 30 seconds
|
||||
|
||||
// Store interval on worker for cleanup
|
||||
(worker as unknown as { heartbeatInterval?: NodeJS.Timeout }).heartbeatInterval =
|
||||
heartbeatInterval;
|
||||
};
|
||||
|
||||
const attachWorkerEventListeners = (worker: Worker) => {
|
||||
worker.on('completed', (job: Job, returnValue: unknown) => {
|
||||
logger.info({ returnValue }, `[${worker.name}] Job ${job.id} completed successfully.`);
|
||||
@@ -102,6 +142,9 @@ const attachWorkerEventListeners = (worker: Worker) => {
|
||||
`[${worker.name}] Job ${job?.id} has ultimately failed after all attempts.`,
|
||||
);
|
||||
});
|
||||
|
||||
// Start heartbeat monitoring for this worker
|
||||
startWorkerHeartbeat(worker);
|
||||
};
|
||||
|
||||
export const flyerWorker = new Worker<FlyerJobData>(
|
||||
@@ -219,17 +262,28 @@ const SHUTDOWN_TIMEOUT = 30000; // 30 seconds
|
||||
* without exiting the process.
|
||||
*/
|
||||
export const closeWorkers = async () => {
|
||||
await Promise.all([
|
||||
flyerWorker.close(),
|
||||
emailWorker.close(),
|
||||
analyticsWorker.close(),
|
||||
cleanupWorker.close(),
|
||||
weeklyAnalyticsWorker.close(),
|
||||
tokenCleanupWorker.close(),
|
||||
receiptWorker.close(),
|
||||
expiryAlertWorker.close(),
|
||||
barcodeWorker.close(),
|
||||
]);
|
||||
// Clear heartbeat intervals
|
||||
const workers = [
|
||||
flyerWorker,
|
||||
emailWorker,
|
||||
analyticsWorker,
|
||||
cleanupWorker,
|
||||
weeklyAnalyticsWorker,
|
||||
tokenCleanupWorker,
|
||||
receiptWorker,
|
||||
expiryAlertWorker,
|
||||
barcodeWorker,
|
||||
];
|
||||
|
||||
workers.forEach((worker) => {
|
||||
const interval = (worker as unknown as { heartbeatInterval?: NodeJS.Timeout })
|
||||
.heartbeatInterval;
|
||||
if (interval) {
|
||||
clearInterval(interval);
|
||||
}
|
||||
});
|
||||
|
||||
await Promise.all(workers.map((w) => w.close()));
|
||||
};
|
||||
|
||||
export const gracefulShutdown = async (signal: string) => {
|
||||
|
||||