Compare commits

..

11 Commits

| Author | SHA1 | Message | CI Status | Date |
| ------------- | ---------- | ----------------------------------------------------- | ------------------------------------------------------------------------ | -------------------------- |
| Gitea Actions | 4346332bbf | ci: Bump version to 0.12.15 [skip ci] | | 2026-01-27 00:54:43 +05:00 |
| | 61cfb518e6 | ADR-015 done | Deploy to Test Environment / deploy-to-test (push): Failing after 1m13s | 2026-01-26 11:48:42 -08:00 |
| Gitea Actions | e86ce51b6c | ci: Bump version to 0.12.14 [skip ci] | | 2026-01-26 17:52:02 +05:00 |
| | 840a7a62d3 | adr work | Deploy to Test Environment / deploy-to-test (push): Failing after 1m15s | 2026-01-26 04:51:10 -08:00 |
| | 5720820d95 | adr-053 done | | 2026-01-26 04:51:09 -08:00 |
| Gitea Actions | e5cdb54308 | ci: Bump version to 0.12.13 [skip ci] | | 2026-01-24 02:48:50 +05:00 |
| | a3f212ff81 | Primary Issue: TZ Environment Variable Breaking Tests | Deploy to Test Environment / deploy-to-test (push): Successful in 18m47s | 2026-01-23 13:40:48 -08:00 |
| Gitea Actions | de263f74b0 | ci: Bump version to 0.12.12 [skip ci] | | 2026-01-24 00:30:16 +05:00 |
| | a71e41302b | no TZ in tests - who knew? | Deploy to Test Environment / deploy-to-test (push): Successful in 18m35s | 2026-01-23 11:28:45 -08:00 |
| Gitea Actions | 3575803252 | ci: Bump version to 0.12.11 [skip ci] | | 2026-01-23 12:40:09 +05:00 |
| | d03900cefe | set PST as common time zone for log matching ease | Deploy to Test Environment / deploy-to-test (push): Successful in 19m4s | 2026-01-22 23:38:45 -08:00 |
29 changed files with 1760 additions and 545 deletions

View File

@@ -118,7 +118,10 @@
"mcp__localerrors__get_project",
"mcp__localerrors__get_issue",
"mcp__localerrors__get_event",
"mcp__localerrors__list_teams"
"mcp__localerrors__list_teams",
"WebSearch",
"Bash(for trigger in update_price_history_on_flyer_item_insert update_recipe_rating_aggregates log_new_recipe log_new_flyer)",
"Bash(do echo \"=== $trigger ===\")"
]
},
"enabledMcpjsonServers": [

View File

@@ -222,6 +222,7 @@ Common issues with solutions:
4. **Filename collisions** - Multer predictable names → Use `${Date.now()}-${Math.round(Math.random() * 1e9)}`
5. **Response format mismatches** - API format changes → Log response bodies, update assertions
6. **External service failures** - PM2/Redis unavailable → try/catch with graceful degradation
7. **TZ environment variable breaks async hooks** - `TZ=America/Los_Angeles` causes `RangeError: Invalid triggerAsyncId value: NaN` → Tests now explicitly set `TZ=` (empty) in package.json scripts
**Full Details**: See test issues section at end of this document or [docs/development/TESTING.md](docs/development/TESTING.md)
@@ -377,3 +378,28 @@ API formats change: `data.jobId` vs `data.job.id`, nested vs flat, string vs num
PM2/Redis health checks fail when unavailable.
**Solution**: try/catch with graceful degradation or mock
### 7. TZ Environment Variable Breaking Async Hooks
**Problem**: When `TZ=America/Los_Angeles` (or another timezone value) is set in the environment, the Node.js `async_hooks` module can produce `RangeError: Invalid triggerAsyncId value: NaN`. This breaks React Testing Library's `render()` function, which uses async hooks internally.
**Root Cause**: Setting `TZ` to certain timezone values interferes with Node.js's internal async tracking mechanism, causing invalid async IDs to be generated.
**Symptoms**:
```text
RangeError: Invalid triggerAsyncId value: NaN
process.env.NODE_ENV.queueSeveralMicrotasks node_modules/react/cjs/react.development.js:751:15
process.env.NODE_ENV.exports.act node_modules/react/cjs/react.development.js:886:11
node_modules/@testing-library/react/dist/act-compat.js:46:25
renderRoot node_modules/@testing-library/react/dist/pure.js:189:26
```
**Solution**: Explicitly unset `TZ` in all test scripts by adding `TZ=` (empty value) to cross-env:
```json
"test:unit": "cross-env NODE_ENV=test TZ= tsx ..."
"test:integration": "cross-env NODE_ENV=test TZ= tsx ..."
```
**Context**: This issue was introduced in commit `d03900c`, which added `TZ: 'America/Los_Angeles'` to the PM2 ecosystem configs for consistent log timestamps in the dev and production environments. Tests must explicitly override this to prevent the async hooks error.
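As an additional safeguard (not part of this change set), a test setup file can fail fast if `TZ` is still set when the suite starts. This is a minimal sketch assuming a Vitest/Jest-style setup file; the `TZ=` override in the package.json scripts remains the primary fix, since Node.js reads `TZ` at startup.

```typescript
// Hypothetical guard in a test setup file (e.g. vitest.setup.ts - assumed name).
// It cannot repair async_hooks if Node already started with TZ set, so it
// fails fast with a clear message instead of a cryptic RangeError later.
if (process.env.TZ && process.env.TZ !== '') {
  throw new Error(
    `TZ is set to "${process.env.TZ}" - run tests via the package.json scripts, ` +
      'which force TZ= (empty), to avoid "Invalid triggerAsyncId value: NaN".',
  );
}
```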

View File

@@ -57,6 +57,8 @@ services:
- '8000:8000' # Bugsink error tracking HTTP (ADR-015)
- '8443:8443' # Bugsink error tracking HTTPS (ADR-015)
environment:
# Timezone: PST (America/Los_Angeles) for consistent log timestamps
- TZ=America/Los_Angeles
# Core settings
- NODE_ENV=development
# Database - use service name for Docker networking
@@ -122,6 +124,10 @@ services:
ports:
- '5432:5432'
environment:
# Timezone: PST (America/Los_Angeles) for consistent log timestamps
TZ: America/Los_Angeles
# PostgreSQL timezone setting (used by log_timezone and timezone parameters)
PGTZ: America/Los_Angeles
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres
POSTGRES_DB: flyer_crawler_dev
@@ -142,6 +148,8 @@ services:
postgres
-c config_file=/var/lib/postgresql/data/postgresql.conf
-c hba_file=/var/lib/postgresql/data/pg_hba.conf
-c timezone=America/Los_Angeles
-c log_timezone=America/Los_Angeles
-c log_min_messages=notice
-c client_min_messages=notice
-c logging_collector=on
@@ -175,6 +183,9 @@ services:
user: root
ports:
- '6379:6379'
environment:
# Timezone: PST (America/Los_Angeles) for consistent log timestamps
TZ: America/Los_Angeles
volumes:
- redis_data:/data
# Create log volume for Logstash access (ADR-050)
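For reference, one quick way to confirm these timezone settings took effect is to compare what Node and PostgreSQL report. This is a hedged sketch (not part of compose.dev.yml or the repo), assuming the `pg` package and the dev credentials shown in the compose file above.

```typescript
// Hypothetical timezone sanity check for the dev stack.
import { Client } from 'pg';

async function checkTimezones(): Promise<void> {
  // Inside the app container TZ should be America/Los_Angeles; from the host it may differ.
  console.log('Node TZ:', process.env.TZ, '->', new Date().toString());

  const client = new Client({
    host: 'localhost',
    port: 5432,
    user: 'postgres',
    password: 'postgres',
    database: 'flyer_crawler_dev',
  });
  await client.connect();
  const { rows } = await client.query('SHOW timezone');
  console.log('PostgreSQL timezone:', rows[0].TimeZone); // expected: America/Los_Angeles
  await client.end();
}

checkTimezones().catch(console.error);
```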

View File

@@ -121,7 +121,8 @@ input {
# ============================================================================
# Captures PostgreSQL log output including fn_log() structured JSON messages.
# PostgreSQL is configured to write logs to /var/log/postgresql/ (shared volume).
# Log format: "2026-01-22 00:00:00 UTC [5724] postgres@flyer_crawler_dev LOG: message"
# Log format: "2026-01-22 14:30:00 PST [5724] postgres@flyer_crawler_dev LOG: message"
# Note: Timestamps are in PST (America/Los_Angeles) timezone as configured in compose.dev.yml
file {
path => "/var/log/postgresql/*.log"
type => "postgres"
@@ -226,10 +227,11 @@ filter {
# PostgreSQL Log Processing (ADR-050)
# ============================================================================
# PostgreSQL log format in dev container:
# "2026-01-22 00:00:00 UTC [5724] postgres@flyer_crawler_dev LOG: message"
# "2026-01-22 07:06:03 UTC [19851] postgres@flyer_crawler_dev ERROR: column "id" does not exist"
# "2026-01-22 14:30:00 PST [5724] postgres@flyer_crawler_dev LOG: message"
# "2026-01-22 15:06:03 PST [19851] postgres@flyer_crawler_dev ERROR: column "id" does not exist"
# Note: Timestamps are in PST (America/Los_Angeles) timezone
if [type] == "postgres" {
# Parse PostgreSQL log prefix with UTC timezone
# Parse PostgreSQL log prefix with timezone (PST in dev, may vary in prod)
grok {
match => { "message" => "%{YEAR}-%{MONTHNUM}-%{MONTHDAY} %{TIME} %{WORD:pg_timezone} \[%{POSINT:pg_pid}\] %{DATA:pg_user}@%{DATA:pg_database} %{WORD:pg_level}: ?%{GREEDYDATA:pg_message}" }
tag_on_failure => ["_postgres_grok_failure"]
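For illustration only (not part of the Logstash pipeline), the grok pattern above decomposes a PST-stamped line roughly like the following JavaScript approximation; the real grok semantics for `%{TIME}` and `%{DATA}` are looser than this regex.

```typescript
// Rough regex equivalent of the grok pattern, applied to the example line
// from the comments above.
const pgLogLine =
  '2026-01-22 14:30:00 PST [5724] postgres@flyer_crawler_dev LOG: message';

const pgLogPattern =
  /^(\d{4})-(\d{2})-(\d{2}) (\d{2}:\d{2}:\d{2}(?:\.\d+)?) (\w+) \[(\d+)\] (.+?)@(.+?) (\w+): ?(.*)$/;

const match = pgLogLine.match(pgLogPattern);
if (match) {
  const [, , , , , pgTimezone, pgPid, pgUser, pgDatabase, pgLevel, pgMessage] = match;
  // pgTimezone is 'PST' in dev (was 'UTC' before this change), pgLevel is 'LOG' or 'ERROR'.
  console.log({ pgTimezone, pgPid, pgUser, pgDatabase, pgLevel, pgMessage });
}
```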

View File

@@ -2,6 +2,10 @@
# This file is mounted into the PostgreSQL container to enable structured logging
# from database functions via fn_log()
# Timezone: PST (America/Los_Angeles) for consistent log timestamps
timezone = 'America/Los_Angeles'
log_timezone = 'America/Los_Angeles'
# Enable logging to files for Logstash pickup
logging_collector = on
log_destination = 'stderr'

View File

@@ -1,324 +0,0 @@
# ADR-015: Application Performance Monitoring (APM) and Error Tracking
**Date**: 2025-12-12
**Status**: Accepted
**Updated**: 2026-01-11
## Context
While `ADR-004` established structured logging with Pino, the application lacks a high-level, aggregated view of its health, performance, and errors. It's difficult to spot trends, identify slow API endpoints, or be proactively notified of new types of errors.
Key requirements:
1. **Self-hosted**: No external SaaS dependencies for error tracking
2. **Sentry SDK compatible**: Leverage mature, well-documented SDKs
3. **Lightweight**: Minimal resource overhead in the dev container
4. **Production-ready**: Same architecture works on bare-metal production servers
5. **AI-accessible**: MCP server integration for Claude Code and other AI tools
## Decision
We will implement a self-hosted error tracking stack using **Bugsink** as the Sentry-compatible backend, with the following components:
### 1. Error Tracking Backend: Bugsink
**Bugsink** is a lightweight, self-hosted Sentry alternative that:
- Runs as a single process (no Kafka, Redis, ClickHouse required)
- Is fully compatible with Sentry SDKs
- Supports ARM64 and AMD64 architectures
- Can use SQLite (dev) or PostgreSQL (production)
**Deployment**:
- **Dev container**: Installed as a systemd service inside the container
- **Production**: Runs as a systemd service on bare-metal, listening on localhost only
- **Database**: Uses PostgreSQL with a dedicated `bugsink` user and `bugsink` database (same PostgreSQL instance as the main application)
### 2. Backend Integration: @sentry/node
The Express backend will integrate `@sentry/node` SDK to:
- Capture unhandled exceptions before PM2/process manager restarts
- Report errors with full stack traces and context
- Integrate with Pino logger for breadcrumbs
- Track transaction performance (optional)
### 3. Frontend Integration: @sentry/react
The React frontend will integrate `@sentry/react` SDK to:
- Wrap the app in a Sentry Error Boundary
- Capture unhandled JavaScript errors
- Report errors with component stack traces
- Track user session context
- **Frontend Error Correlation**: The global API client (Axios/Fetch wrapper) MUST intercept 4xx/5xx responses. It MUST extract the `x-request-id` header (if present) and attach it to the Sentry scope as a tag `api_request_id` before re-throwing the error. This allows developers to copy the ID from Sentry and search for it in backend logs.
### 4. Log Aggregation: Logstash
**Logstash** parses application and infrastructure logs, forwarding error patterns to Bugsink:
- **Installation**: Installed inside the dev container (and on bare-metal prod servers)
- **Inputs**:
- Pino JSON logs from the Node.js application
- Redis logs (connection errors, memory warnings, slow commands)
- PostgreSQL function logs (future - see Implementation Steps)
- **Filter**: Identifies error-level logs (5xx responses, unhandled exceptions, Redis errors)
- **Output**: Sends to Bugsink via Sentry-compatible HTTP API
This provides a secondary error capture path for:
- Errors that occur before Sentry SDK initialization
- Log-based errors that don't throw exceptions
- Redis connection/performance issues
- Database function errors and slow queries
- Historical error analysis from log files
### 5. MCP Server Integration: bugsink-mcp
For AI tool integration (Claude Code, Cursor, etc.), we use the open-source [bugsink-mcp](https://github.com/j-shelfwood/bugsink-mcp) server:
- **No code changes required**: Configurable via environment variables
- **Capabilities**: List projects, get issues, view events, get stacktraces, manage releases
- **Configuration**:
- `BUGSINK_URL`: Points to Bugsink instance (`http://localhost:8000` for dev, `https://bugsink.projectium.com` for prod)
- `BUGSINK_API_TOKEN`: API token from Bugsink (created via Django management command)
- `BUGSINK_ORG_SLUG`: Organization identifier (usually "sentry")
**Note:** Despite the name `sentry-selfhosted-mcp` mentioned in earlier drafts of this ADR, the actual MCP server used is `bugsink-mcp` which is specifically designed for Bugsink's API structure.
## Architecture
```text
┌─────────────────────────────────────────────────────────────────────────┐
│ Dev Container / Production Server │
├─────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌──────────────────┐ ┌──────────────────┐ │
│ │ Frontend │ │ Backend │ │
│ │ (React) │ │ (Express) │ │
│ │ @sentry/react │ │ @sentry/node │ │
│ └────────┬─────────┘ └────────┬─────────┘ │
│ │ │ │
│ │ Sentry SDK Protocol │ │
│ └───────────┬───────────────┘ │
│ │ │
│ ▼ │
│ ┌──────────────────────┐ │
│ │ Bugsink │ │
│ │ (localhost:8000) │◄──────────────────┐ │
│ │ │ │ │
│ │ PostgreSQL backend │ │ │
│ └──────────────────────┘ │ │
│ │ │
│ ┌──────────────────────┐ │ │
│ │ Logstash │───────────────────┘ │
│ │ (Log Aggregator) │ Sentry Output │
│ │ │ │
│ │ Inputs: │ │
│ │ - Pino app logs │ │
│ │ - Redis logs │ │
│ │ - PostgreSQL (future) │
│ └──────────────────────┘ │
│ ▲ ▲ ▲ │
│ │ │ │ │
│ ┌───────────┘ │ └───────────┐ │
│ │ │ │ │
│ ┌────┴─────┐ ┌─────┴────┐ ┌──────┴─────┐ │
│ │ Pino │ │ Redis │ │ PostgreSQL │ │
│ │ Logs │ │ Logs │ │ Logs (TBD) │ │
│ └──────────┘ └──────────┘ └────────────┘ │
│ │
│ ┌──────────────────────┐ │
│ │ PostgreSQL │ │
│ │ ┌────────────────┐ │ │
│ │ │ flyer_crawler │ │ (main app database) │
│ │ ├────────────────┤ │ │
│ │ │ bugsink │ │ (error tracking database) │
│ │ └────────────────┘ │ │
│ └──────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────────┘
External (Developer Machine):
┌──────────────────────────────────────┐
│ Claude Code / Cursor / VS Code │
│ ┌────────────────────────────────┐ │
│ │ bugsink-mcp │ │
│ │ (MCP Server) │ │
│ │ │ │
│ │ BUGSINK_URL=http://localhost:8000
│ │ BUGSINK_API_TOKEN=... │ │
│ │ BUGSINK_ORG_SLUG=... │ │
│ └────────────────────────────────┘ │
└──────────────────────────────────────┘
```
## Configuration
### Environment Variables
| Variable | Description | Default (Dev) |
| ------------------ | ------------------------------ | -------------------------- |
| `BUGSINK_DSN` | Sentry-compatible DSN for SDKs | Set after project creation |
| `BUGSINK_ENABLED` | Enable/disable error reporting | `true` |
| `BUGSINK_BASE_URL` | Bugsink web UI URL (internal) | `http://localhost:8000` |
### PostgreSQL Setup
```sql
-- Create dedicated Bugsink database and user
CREATE USER bugsink WITH PASSWORD 'bugsink_dev_password';
CREATE DATABASE bugsink OWNER bugsink;
GRANT ALL PRIVILEGES ON DATABASE bugsink TO bugsink;
```
### Bugsink Configuration
```bash
# Environment variables for Bugsink service
SECRET_KEY=<random-50-char-string>
DATABASE_URL=postgresql://bugsink:bugsink_dev_password@localhost:5432/bugsink
BASE_URL=http://localhost:8000
PORT=8000
```
### Logstash Pipeline
```conf
# /etc/logstash/conf.d/bugsink.conf
# === INPUTS ===
input {
# Pino application logs
file {
path => "/app/logs/*.log"
codec => json
type => "pino"
tags => ["app"]
}
# Redis logs
file {
path => "/var/log/redis/*.log"
type => "redis"
tags => ["redis"]
}
# PostgreSQL logs (for function logging - future)
# file {
# path => "/var/log/postgresql/*.log"
# type => "postgres"
# tags => ["postgres"]
# }
}
# === FILTERS ===
filter {
# Pino error detection (level 50 = error, 60 = fatal)
if [type] == "pino" and [level] >= 50 {
mutate { add_tag => ["error"] }
}
# Redis error detection
if [type] == "redis" {
grok {
match => { "message" => "%{POSINT:pid}:%{WORD:role} %{MONTHDAY} %{MONTH} %{TIME} %{WORD:loglevel} %{GREEDYDATA:redis_message}" }
}
if [loglevel] in ["WARNING", "ERROR"] {
mutate { add_tag => ["error"] }
}
}
# PostgreSQL function error detection (future)
# if [type] == "postgres" {
# # Parse PostgreSQL log format and detect ERROR/FATAL levels
# }
}
# === OUTPUT ===
output {
if "error" in [tags] {
http {
url => "http://localhost:8000/api/store/"
http_method => "post"
format => "json"
# Sentry envelope format
}
}
}
```
## Implementation Steps
1. **Update Dockerfile.dev**:
- Install Bugsink (pip package or binary)
- Install Logstash (Elastic APT repository)
- Add systemd service files for both
2. **PostgreSQL initialization**:
- Add Bugsink user/database creation to `sql/00-init-extensions.sql`
3. **Backend SDK integration**:
- Install `@sentry/node`
- Initialize in `server.ts` before Express app
- Configure error handler middleware integration
4. **Frontend SDK integration**:
- Install `@sentry/react`
- Wrap `App` component with `Sentry.ErrorBoundary`
- Configure in `src/index.tsx`
5. **Environment configuration**:
- Add Bugsink variables to `src/config/env.ts`
- Update `.env.example` and `compose.dev.yml`
6. **Logstash configuration**:
- Create pipeline config for Pino → Bugsink
- Configure Pino to write to log file in addition to stdout
- Configure Redis log monitoring (connection errors, slow commands)
7. **MCP server documentation**:
- Document `bugsink-mcp` setup in CLAUDE.md
8. **PostgreSQL function logging** (future):
- Configure PostgreSQL to log function execution errors
- Add Logstash input for PostgreSQL logs
- Define filter rules for function-level error detection
- _Note: Ask for implementation details when this step is reached_
## Consequences
### Positive
- **Full observability**: Aggregated view of errors, trends, and performance
- **Self-hosted**: No external SaaS dependencies or subscription costs
- **SDK compatibility**: Leverages mature Sentry SDKs with excellent documentation
- **AI integration**: MCP server enables Claude Code to query and analyze errors
- **Unified architecture**: Same setup works in dev container and production
- **Lightweight**: Bugsink runs in a single process, unlike full Sentry (16GB+ RAM)
### Negative
- **Additional services**: Bugsink and Logstash add complexity to the container
- **PostgreSQL overhead**: Additional database for error tracking
- **Initial setup**: Requires configuration of multiple components
- **Logstash learning curve**: Pipeline configuration requires Logstash knowledge
## Alternatives Considered
1. **Full Sentry self-hosted**: Rejected due to complexity (Kafka, Redis, ClickHouse, 16GB+ RAM minimum)
2. **GlitchTip**: Considered, but Bugsink is lighter weight and easier to deploy
3. **Sentry SaaS**: Rejected due to self-hosted requirement
4. **Custom error aggregation**: Rejected in favor of proven Sentry SDK ecosystem
## References
- [Bugsink Documentation](https://www.bugsink.com/docs/)
- [Bugsink Docker Install](https://www.bugsink.com/docs/docker-install/)
- [@sentry/node Documentation](https://docs.sentry.io/platforms/javascript/guides/node/)
- [@sentry/react Documentation](https://docs.sentry.io/platforms/javascript/guides/react/)
- [bugsink-mcp](https://github.com/j-shelfwood/bugsink-mcp)
- [Logstash Reference](https://www.elastic.co/guide/en/logstash/current/index.html)

View File

@@ -0,0 +1,272 @@
# ADR-015: Error Tracking and Observability
**Date**: 2025-12-12
**Status**: Accepted (Fully Implemented)
**Updated**: 2026-01-26 (user context integration completed)
**Related**: [ADR-056](./0056-application-performance-monitoring.md) (Application Performance Monitoring)
## Context
While ADR-004 established structured logging with Pino, the application lacks a high-level, aggregated view of its health and errors. It's difficult to spot trends, identify recurring issues, or be proactively notified of new types of errors.
Key requirements:
1. **Self-hosted**: No external SaaS dependencies for error tracking
2. **Sentry SDK compatible**: Leverage mature, well-documented SDKs
3. **Lightweight**: Minimal resource overhead in the dev container
4. **Production-ready**: Same architecture works on bare-metal production servers
5. **AI-accessible**: MCP server integration for Claude Code and other AI tools
**Note**: Application Performance Monitoring (APM) and distributed tracing are covered separately in [ADR-056](./0056-application-performance-monitoring.md).
## Decision
We implement a self-hosted error tracking stack using **Bugsink** as the Sentry-compatible backend, with the following components:
### 1. Error Tracking Backend: Bugsink
**Bugsink** is a lightweight, self-hosted Sentry alternative that:
- Runs as a single process (no Kafka, Redis, ClickHouse required)
- Is fully compatible with Sentry SDKs
- Supports ARM64 and AMD64 architectures
- Can use SQLite (dev) or PostgreSQL (production)
**Deployment**:
- **Dev container**: Installed as a systemd service inside the container
- **Production**: Runs as a systemd service on bare-metal, listening on localhost only
- **Database**: Uses PostgreSQL with a dedicated `bugsink` user and `bugsink` database (same PostgreSQL instance as the main application)
### 2. Backend Integration: @sentry/node
The Express backend integrates `@sentry/node` SDK to:
- Capture unhandled exceptions before PM2/process manager restarts
- Report errors with full stack traces and context
- Integrate with Pino logger for breadcrumbs
- Filter errors by severity (only 5xx errors sent by default)
### 3. Frontend Integration: @sentry/react
The React frontend integrates `@sentry/react` SDK to:
- Wrap the app in an Error Boundary for graceful error handling
- Capture unhandled JavaScript errors
- Report errors with component stack traces
- Filter out browser extension errors
- **Frontend Error Correlation**: The global API client intercepts 4xx/5xx responses and can attach the `x-request-id` header to Sentry scope for correlation with backend logs
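A minimal sketch of this correlation, assuming an Axios-based client (the actual API client wrapper in the repo may be structured differently):

```typescript
import axios, { AxiosError } from 'axios';
import * as Sentry from '@sentry/react';

export const apiClient = axios.create({ baseURL: '/api' });

apiClient.interceptors.response.use(
  (response) => response,
  (error: AxiosError) => {
    // Tag the Sentry scope with the backend request ID so the issue can be
    // searched for in backend logs.
    const requestId = error.response?.headers?.['x-request-id'];
    if (requestId) {
      Sentry.setTag('api_request_id', String(requestId));
    }
    return Promise.reject(error);
  },
);
```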
### 4. Log Aggregation: Logstash
**Logstash** parses application and infrastructure logs, forwarding error patterns to Bugsink:
- **Installation**: Installed inside the dev container (and on bare-metal prod servers)
- **Inputs**:
- Pino JSON logs from the Node.js application (PM2 managed)
- Redis logs (connection errors, memory warnings, slow commands)
- PostgreSQL function logs (via `fn_log()` - see ADR-050)
- NGINX access/error logs
- **Filter**: Identifies error-level logs (5xx responses, unhandled exceptions, Redis errors)
- **Output**: Sends to Bugsink via Sentry-compatible HTTP API
This provides a secondary error capture path for:
- Errors that occur before Sentry SDK initialization
- Log-based errors that don't throw exceptions
- Redis connection/performance issues
- Database function errors and slow queries
- Historical error analysis from log files
### 5. MCP Server Integration: bugsink-mcp
For AI tool integration (Claude Code, Cursor, etc.), we use the open-source [bugsink-mcp](https://github.com/j-shelfwood/bugsink-mcp) server:
- **No code changes required**: Configurable via environment variables
- **Capabilities**: List projects, get issues, view events, get stacktraces, manage releases
- **Configuration**:
- `BUGSINK_URL`: Points to Bugsink instance (`http://localhost:8000` for dev, `https://bugsink.projectium.com` for prod)
- `BUGSINK_API_TOKEN`: API token from Bugsink (created via Django management command)
- `BUGSINK_ORG_SLUG`: Organization identifier (usually "sentry")
## Architecture
```text
+---------------------------------------------------------------------------+
| Dev Container / Production Server |
+---------------------------------------------------------------------------+
| |
| +------------------+ +------------------+ |
| | Frontend | | Backend | |
| | (React) | | (Express) | |
| | @sentry/react | | @sentry/node | |
| +--------+---------+ +--------+---------+ |
| | | |
| | Sentry SDK Protocol | |
| +-----------+---------------+ |
| | |
| v |
| +----------------------+ |
| | Bugsink | |
| | (localhost:8000) |<------------------+ |
| | | | |
| | PostgreSQL backend | | |
| +----------------------+ | |
| | |
| +----------------------+ | |
| | Logstash |-------------------+ |
| | (Log Aggregator) | Sentry Output |
| | | |
| | Inputs: | |
| | - PM2/Pino logs | |
| | - Redis logs | |
| | - PostgreSQL logs | |
| | - NGINX logs | |
| +----------------------+ |
| ^ ^ ^ ^ |
| | | | | |
| +-----------+ | | +-----------+ |
| | | | | |
| +----+-----+ +-----+----+ +-----+----+ +-----+----+ |
| | PM2 | | Redis | | PostgreSQL| | NGINX | |
| | Logs | | Logs | | Logs | | Logs | |
| +----------+ +----------+ +-----------+ +---------+ |
| |
| +----------------------+ |
| | PostgreSQL | |
| | +----------------+ | |
| | | flyer_crawler | | (main app database) |
| | +----------------+ | |
| | | bugsink | | (error tracking database) |
| | +----------------+ | |
| +----------------------+ |
| |
+---------------------------------------------------------------------------+
External (Developer Machine):
+--------------------------------------+
| Claude Code / Cursor / VS Code |
| +--------------------------------+ |
| | bugsink-mcp | |
| | (MCP Server) | |
| | | |
| | BUGSINK_URL=http://localhost:8000
| | BUGSINK_API_TOKEN=... | |
| | BUGSINK_ORG_SLUG=... | |
| +--------------------------------+ |
+--------------------------------------+
```
## Implementation Status
### Completed
- [x] Bugsink installed and configured in dev container
- [x] PostgreSQL `bugsink` database and user created
- [x] `@sentry/node` SDK integrated in backend (`src/services/sentry.server.ts`)
- [x] `@sentry/react` SDK integrated in frontend (`src/services/sentry.client.ts`)
- [x] ErrorBoundary component created (`src/components/ErrorBoundary.tsx`)
- [x] ErrorBoundary wrapped around app (`src/providers/AppProviders.tsx`)
- [x] Logstash pipeline configured for PM2/Pino, Redis, PostgreSQL, NGINX logs
- [x] MCP server (`bugsink-mcp`) documented and configured
- [x] Environment variables added to `src/config/env.ts` and frontend `src/config.ts`
- [x] Browser extension errors filtered in `beforeSend`
- [x] 5xx error filtering in backend error handler
### Recently Completed (2026-01-26)
- [x] **User context after authentication**: Integrated `setUser()` calls in `AuthProvider.tsx` to associate errors with authenticated users
- Called on profile fetch from query (lines 44-49)
- Called on direct login with profile (lines 94-99)
- Called on login with profile fetch (lines 124-129)
- Cleared on logout (lines 76-77)
- Maps `user_id` → `id`, `email` → `email`, `full_name` → `username`
This completes the error tracking implementation - all errors are now associated with the authenticated user who encountered them, enabling user-specific error analysis and debugging.
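A minimal sketch of that mapping (the profile field names are taken from this ADR; the helper name and placement are illustrative, not the actual `AuthProvider.tsx` code):

```typescript
import * as Sentry from '@sentry/react';

interface UserProfile {
  user_id: number;
  email: string;
  full_name: string | null;
}

export function syncSentryUser(profile: UserProfile | null): void {
  if (profile) {
    // user_id -> id, email -> email, full_name -> username
    Sentry.setUser({
      id: String(profile.user_id),
      email: profile.email,
      username: profile.full_name ?? undefined,
    });
  } else {
    // Called on logout so later errors are no longer attributed to the user.
    Sentry.setUser(null);
  }
}
```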
## Configuration
### Environment Variables
| Variable | Description | Default (Dev) |
| -------------------- | -------------------------------- | -------------------------- |
| `SENTRY_DSN` | Sentry-compatible DSN (backend) | Set after project creation |
| `VITE_SENTRY_DSN` | Sentry-compatible DSN (frontend) | Set after project creation |
| `SENTRY_ENVIRONMENT` | Environment name | `development` |
| `SENTRY_DEBUG` | Enable debug logging | `false` |
| `SENTRY_ENABLED` | Enable/disable error reporting | `true` |
### PostgreSQL Setup
```sql
-- Create dedicated Bugsink database and user
CREATE USER bugsink WITH PASSWORD 'bugsink_dev_password';
CREATE DATABASE bugsink OWNER bugsink;
GRANT ALL PRIVILEGES ON DATABASE bugsink TO bugsink;
```
### Bugsink Configuration
```bash
# Environment variables for Bugsink service
SECRET_KEY=<random-50-char-string>
DATABASE_URL=postgresql://bugsink:bugsink_dev_password@localhost:5432/bugsink
BASE_URL=http://localhost:8000
PORT=8000
```
### Logstash Pipeline
See `docker/logstash/bugsink.conf` for the full pipeline configuration.
Key routing:
| Source | Bugsink Project |
| --------------- | --------------- |
| Backend (Pino) | Backend API |
| Worker (Pino) | Backend API |
| PostgreSQL logs | Backend API |
| Vite logs | Infrastructure |
| Redis logs | Infrastructure |
| NGINX logs | Infrastructure |
| Frontend errors | Frontend |
## Consequences
### Positive
- **Full observability**: Aggregated view of errors and trends
- **Self-hosted**: No external SaaS dependencies or subscription costs
- **SDK compatibility**: Leverages mature Sentry SDKs with excellent documentation
- **AI integration**: MCP server enables Claude Code to query and analyze errors
- **Unified architecture**: Same setup works in dev container and production
- **Lightweight**: Bugsink runs in a single process, unlike full Sentry (16GB+ RAM)
- **Error correlation**: Request IDs allow correlation between frontend errors and backend logs
### Negative
- **Additional services**: Bugsink and Logstash add complexity to the container
- **PostgreSQL overhead**: Additional database for error tracking
- **Initial setup**: Requires configuration of multiple components
- **Logstash learning curve**: Pipeline configuration requires Logstash knowledge
## Alternatives Considered
1. **Full Sentry self-hosted**: Rejected due to complexity (Kafka, Redis, ClickHouse, 16GB+ RAM minimum)
2. **GlitchTip**: Considered, but Bugsink is lighter weight and easier to deploy
3. **Sentry SaaS**: Rejected due to self-hosted requirement
4. **Custom error aggregation**: Rejected in favor of proven Sentry SDK ecosystem
## References
- [Bugsink Documentation](https://www.bugsink.com/docs/)
- [Bugsink Docker Install](https://www.bugsink.com/docs/docker-install/)
- [@sentry/node Documentation](https://docs.sentry.io/platforms/javascript/guides/node/)
- [@sentry/react Documentation](https://docs.sentry.io/platforms/javascript/guides/react/)
- [bugsink-mcp](https://github.com/j-shelfwood/bugsink-mcp)
- [Logstash Reference](https://www.elastic.co/guide/en/logstash/current/index.html)
- [ADR-050: PostgreSQL Function Observability](./0050-postgresql-function-observability.md)
- [ADR-056: Application Performance Monitoring](./0056-application-performance-monitoring.md)

View File

@@ -2,22 +2,22 @@
**Date**: 2026-01-09
**Status**: Partially Implemented
**Status**: Accepted (Fully Implemented)
**Implemented**: 2026-01-09 (Local auth only)
**Implemented**: 2026-01-09 (Local auth + JWT), 2026-01-26 (OAuth enabled)
## Context
The application requires a secure authentication system that supports both traditional email/password login and social OAuth providers (Google, GitHub). The system must handle user sessions, token refresh, account security (lockout after failed attempts), and integrate seamlessly with the existing Express middleware pipeline.
Currently, **only local authentication is enabled**. OAuth strategies are fully implemented but commented out, pending configuration of OAuth provider credentials.
**All authentication methods are now fully implemented**: Local authentication (email/password), JWT tokens, and OAuth (Google + GitHub). OAuth strategies use conditional registration - they activate automatically when the corresponding environment variables are configured.
## Decision
We will implement a stateless JWT-based authentication system with the following components:
1. **Local Authentication**: Email/password login with bcrypt hashing.
2. **OAuth Authentication**: Google and GitHub OAuth 2.0 (currently disabled).
2. **OAuth Authentication**: Google and GitHub OAuth 2.0 (conditionally enabled via environment variables).
3. **JWT Access Tokens**: Short-lived tokens (15 minutes) for API authentication.
4. **Refresh Tokens**: Long-lived tokens (7 days) stored in HTTP-only cookies.
5. **Account Security**: Lockout after 5 failed login attempts for 15 minutes.
@@ -59,7 +59,7 @@ We will implement a stateless JWT-based authentication system with the following
│ │ │ │ │
│ │ ┌──────────┐ │ │ │
│ └────────>│ OAuth │─────────────┘ │ │
(disabled) │ Provider │ │ │
│ Provider │ │ │
│ └──────────┘ │ │
│ │ │
│ ┌──────────┐ ┌──────────┐ │ │
@@ -130,72 +130,139 @@ passport.use(
- Refresh token: 7 days expiry, 64-byte random hex
- Refresh token stored in HTTP-only cookie with `secure` flag in production
### OAuth Strategies (Disabled)
### OAuth Strategies (Conditionally Enabled)
OAuth strategies are **fully implemented** and activate automatically when the corresponding environment variables are set. The strategies use conditional registration to gracefully handle missing credentials.
#### Google OAuth
Located in `src/routes/passport.routes.ts` (lines 167-217, commented):
Located in `src/config/passport.ts` (lines 167-235):
```typescript
// passport.use(new GoogleStrategy({
// clientID: process.env.GOOGLE_CLIENT_ID!,
// clientSecret: process.env.GOOGLE_CLIENT_SECRET!,
// callbackURL: '/api/auth/google/callback',
// scope: ['profile', 'email']
// },
// async (accessToken, refreshToken, profile, done) => {
// const email = profile.emails?.[0]?.value;
// const user = await db.findUserByEmail(email);
// if (user) {
// return done(null, user);
// }
// // Create new user with null password_hash
// const newUser = await db.createUser(email, null, {
// full_name: profile.displayName,
// avatar_url: profile.photos?.[0]?.value
// });
// return done(null, newUser);
// }
// ));
// Only register the strategy if the required environment variables are set.
if (process.env.GOOGLE_CLIENT_ID && process.env.GOOGLE_CLIENT_SECRET) {
passport.use(
new GoogleStrategy(
{
clientID: process.env.GOOGLE_CLIENT_ID,
clientSecret: process.env.GOOGLE_CLIENT_SECRET,
callbackURL: '/api/auth/google/callback',
scope: ['profile', 'email'],
},
async (_accessToken, _refreshToken, profile, done) => {
const email = profile.emails?.[0]?.value;
if (!email) {
return done(new Error('No email found in Google profile.'), false);
}
const existingUserProfile = await db.userRepo.findUserWithProfileByEmail(email, logger);
if (existingUserProfile) {
// User exists, log them in (strip sensitive fields)
return done(null, cleanUserProfile);
} else {
// Create new user with null password_hash for OAuth users
const newUserProfile = await db.userRepo.createUser(
email,
null,
{
full_name: profile.displayName,
avatar_url: profile.photos?.[0]?.value,
},
logger,
);
return done(null, newUserProfile);
}
},
),
);
logger.info('[Passport] Google OAuth strategy registered.');
} else {
logger.warn('[Passport] Google OAuth strategy NOT registered: credentials not set.');
}
```
#### GitHub OAuth
Located in `src/routes/passport.routes.ts` (lines 219-269, commented):
Located in `src/config/passport.ts` (lines 237-310):
```typescript
// passport.use(new GitHubStrategy({
// clientID: process.env.GITHUB_CLIENT_ID!,
// clientSecret: process.env.GITHUB_CLIENT_SECRET!,
// callbackURL: '/api/auth/github/callback',
// scope: ['user:email']
// },
// async (accessToken, refreshToken, profile, done) => {
// const email = profile.emails?.[0]?.value;
// // Similar flow to Google OAuth
// }
// ));
// Only register the strategy if the required environment variables are set.
if (process.env.GITHUB_CLIENT_ID && process.env.GITHUB_CLIENT_SECRET) {
passport.use(
new GitHubStrategy(
{
clientID: process.env.GITHUB_CLIENT_ID,
clientSecret: process.env.GITHUB_CLIENT_SECRET,
callbackURL: '/api/auth/github/callback',
scope: ['user:email'],
},
async (_accessToken, _refreshToken, profile, done) => {
const email = profile.emails?.[0]?.value;
if (!email) {
return done(new Error('No public email found in GitHub profile.'), false);
}
// Same flow as Google OAuth - find or create user
},
),
);
logger.info('[Passport] GitHub OAuth strategy registered.');
} else {
logger.warn('[Passport] GitHub OAuth strategy NOT registered: credentials not set.');
}
```
#### OAuth Routes (Disabled)
#### OAuth Routes (Active)
Located in `src/routes/auth.routes.ts` (lines 289-315, commented):
Located in `src/routes/auth.routes.ts` (lines 587-609):
```typescript
// const handleOAuthCallback = (req, res) => {
// const user = req.user;
// const accessToken = jwt.sign(payload, JWT_SECRET, { expiresIn: '15m' });
// const refreshToken = crypto.randomBytes(64).toString('hex');
//
// await db.saveRefreshToken(user.user_id, refreshToken);
// res.cookie('refreshToken', refreshToken, { httpOnly: true, secure: true });
// res.redirect(`${FRONTEND_URL}/auth/callback?token=${accessToken}`);
// };
// Google OAuth routes
router.get('/google', passport.authenticate('google', { session: false }));
router.get(
'/google/callback',
passport.authenticate('google', {
session: false,
failureRedirect: '/?error=google_auth_failed',
}),
createOAuthCallbackHandler('google'),
);
// router.get('/google', passport.authenticate('google', { session: false }));
// router.get('/google/callback', passport.authenticate('google', { ... }), handleOAuthCallback);
// router.get('/github', passport.authenticate('github', { session: false }));
// router.get('/github/callback', passport.authenticate('github', { ... }), handleOAuthCallback);
// GitHub OAuth routes
router.get('/github', passport.authenticate('github', { session: false }));
router.get(
'/github/callback',
passport.authenticate('github', {
session: false,
failureRedirect: '/?error=github_auth_failed',
}),
createOAuthCallbackHandler('github'),
);
```
#### OAuth Callback Handler
The callback handler generates tokens and redirects to the frontend:
```typescript
const createOAuthCallbackHandler = (provider: 'google' | 'github') => {
return async (req: Request, res: Response) => {
const userProfile = req.user as UserProfile;
const { accessToken, refreshToken } = await authService.handleSuccessfulLogin(
userProfile,
req.log,
);
res.cookie('refreshToken', refreshToken, {
httpOnly: true,
secure: process.env.NODE_ENV === 'production',
maxAge: 30 * 24 * 60 * 60 * 1000, // 30 days
});
// Redirect to frontend with provider-specific token param
const tokenParam = provider === 'google' ? 'googleAuthToken' : 'githubAuthToken';
res.redirect(`${process.env.FRONTEND_URL}/?${tokenParam}=${accessToken}`);
};
};
```
### Database Schema
@@ -248,11 +315,13 @@ export const mockAuth = (req, res, next) => {
};
```
## Enabling OAuth
## Configuring OAuth Providers
OAuth is fully implemented and activates automatically when credentials are provided. No code changes are required.
### Step 1: Set Environment Variables
Add to `.env`:
Add to your environment (`.env.local` for development, Gitea secrets for production):
```bash
# Google OAuth
@@ -283,54 +352,29 @@ GITHUB_CLIENT_SECRET=your-github-client-secret
- Development: `http://localhost:3001/api/auth/github/callback`
- Production: `https://your-domain.com/api/auth/github/callback`
### Step 3: Uncomment Backend Code
### Step 3: Restart the Application
**In `src/routes/passport.routes.ts`**:
After setting the environment variables, restart PM2:
1. Uncomment import statements (lines 5-6):
```typescript
import { Strategy as GoogleStrategy } from 'passport-google-oauth20';
import { Strategy as GitHubStrategy } from 'passport-github2';
```
2. Uncomment Google strategy (lines 167-217)
3. Uncomment GitHub strategy (lines 219-269)
**In `src/routes/auth.routes.ts`**:
1. Uncomment `handleOAuthCallback` function (lines 291-309)
2. Uncomment OAuth routes (lines 311-315)
### Step 4: Add Frontend OAuth Buttons
Create login buttons that redirect to:
- Google: `GET /api/auth/google`
- GitHub: `GET /api/auth/github`
Handle callback at `/auth/callback?token=<accessToken>`:
1. Extract token from URL
2. Store in client-side token storage
3. Redirect to dashboard
### Step 5: Handle OAuth Callback Page
Create `src/pages/AuthCallback.tsx`:
```typescript
const AuthCallback = () => {
const token = new URLSearchParams(location.search).get('token');
if (token) {
setToken(token);
navigate('/dashboard');
} else {
navigate('/login?error=auth_failed');
}
};
```bash
podman exec -it flyer-crawler-dev pm2 restart all
```
The Passport configuration will automatically register the OAuth strategies when it detects the credentials. Check the logs for confirmation:
```text
[Passport] Google OAuth strategy registered.
[Passport] GitHub OAuth strategy registered.
```
### Frontend Integration
OAuth login buttons are implemented in `src/client/pages/AuthView.tsx`. The frontend:
1. Redirects users to `/api/auth/google` or `/api/auth/github`
2. Handles the callback via the `useAppInitialization` hook which looks for `googleAuthToken` or `githubAuthToken` query parameters
3. Stores the token and redirects to the dashboard
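A minimal sketch of steps 2-3, assuming the token is read from the query string during app initialization (the real `useAppInitialization` hook may differ in naming and in how the token is persisted):

```typescript
// Hypothetical helper: consume the OAuth token param set by the backend redirect.
export function consumeOAuthToken(storeToken: (token: string) => void): void {
  const params = new URLSearchParams(window.location.search);
  const token = params.get('googleAuthToken') ?? params.get('githubAuthToken');
  if (token) {
    storeToken(token);
    // Strip the token from the URL so it does not linger in browser history
    // (see the "Token in URL" consequence noted later in this ADR).
    window.history.replaceState({}, document.title, window.location.pathname);
  }
}
```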
## Known Limitations
1. **No OAuth Provider ID Mapping**: Users are identified by email only. If a user has accounts with different emails on Google and GitHub, they create separate accounts.
@@ -372,31 +416,32 @@ const AuthCallback = () => {
- **Stateless Architecture**: No session storage required; scales horizontally.
- **Secure by Default**: HTTP-only cookies, short token expiry, bcrypt hashing.
- **Account Protection**: Lockout prevents brute-force attacks.
- **Flexible OAuth**: Can enable/disable OAuth without code changes (just env vars + uncommenting).
- **Graceful Degradation**: System works with local auth only.
- **Flexible OAuth**: OAuth activates automatically when credentials are set - no code changes needed.
- **Graceful Degradation**: System works with local auth only when OAuth credentials are not configured.
- **Full Feature Set**: Both local and OAuth authentication are production-ready.
### Negative
- **OAuth Disabled by Default**: Requires manual uncommenting to enable.
- **No Account Linking**: Multiple OAuth providers create separate accounts.
- **Frontend Work Required**: OAuth login buttons don't exist yet.
- **Token in URL**: OAuth callback passes token in URL (visible in browser history).
- **No Account Linking**: Multiple OAuth providers create separate accounts if emails differ.
- **Token in URL**: OAuth callback passes token in URL query parameter (visible in browser history).
- **Email-Based Identity**: OAuth users are identified by email only, not provider-specific IDs.
### Mitigation
- Document OAuth enablement steps clearly (see [../architecture/AUTHENTICATION.md](../architecture/AUTHENTICATION.md)).
- Document OAuth configuration steps clearly (see [../architecture/AUTHENTICATION.md](../architecture/AUTHENTICATION.md)).
- Consider adding OAuth provider ID columns for future account linking.
- Use URL fragment (`#token=`) instead of query parameter for callback.
- Consider using URL fragment (`#token=`) instead of query parameter for callback in future enhancement.
## Key Files
| File | Purpose |
| ------------------------------------------------------ | ------------------------------------------------ |
| `src/routes/passport.routes.ts` | Passport strategies (local, JWT, OAuth) |
| `src/config/passport.ts` | Passport strategies (local, JWT, OAuth) |
| `src/routes/auth.routes.ts` | Auth endpoints (login, register, refresh, OAuth) |
| `src/services/authService.ts` | Auth business logic |
| `src/services/db/user.db.ts` | User database operations |
| `src/config/env.ts` | Environment variable validation |
| `src/client/pages/AuthView.tsx` | Frontend login/register UI with OAuth buttons |
| [AUTHENTICATION.md](../architecture/AUTHENTICATION.md) | OAuth setup guide |
| `.env.example` | Environment variable template |
@@ -409,11 +454,11 @@ const AuthCallback = () => {
## Future Enhancements
1. **Enable OAuth**: Uncomment strategies and configure providers.
2. **Add OAuth Provider Mapping Table**: Store `googleId`, `githubId` for account linking.
3. **Implement Account Linking**: Allow users to connect multiple OAuth providers.
4. **Add Password to OAuth Users**: Allow OAuth users to set a password.
5. **Implement PKCE**: Add PKCE flow for enhanced OAuth security.
6. **Token in Fragment**: Use URL fragment for OAuth callback token.
7. **OAuth Token Storage**: Store OAuth refresh tokens for provider API access.
8. **Magic Link Login**: Add passwordless email login option.
1. **Add OAuth Provider Mapping Table**: Store `googleId`, `githubId` for account linking.
2. **Implement Account Linking**: Allow users to connect multiple OAuth providers.
3. **Add Password to OAuth Users**: Allow OAuth users to set a password for local login.
4. **Implement PKCE**: Add PKCE flow for enhanced OAuth security.
5. **Token in Fragment**: Use URL fragment for OAuth callback token instead of query parameter.
6. **OAuth Token Storage**: Store OAuth refresh tokens for provider API access.
7. **Magic Link Login**: Add passwordless email login option.
8. **Additional OAuth Providers**: Support for Apple, Microsoft, or other providers.

View File

@@ -2,7 +2,7 @@
**Date**: 2026-01-11
**Status**: Proposed
**Status**: Accepted (Fully Implemented)
**Related**: [ADR-015](0015-application-performance-monitoring-and-error-tracking.md), [ADR-004](0004-standardized-application-wide-structured-logging.md)

View File

@@ -2,7 +2,9 @@
**Date**: 2026-01-11
**Status**: Proposed
**Status**: Accepted (Fully Implemented)
**Related**: [ADR-004](0004-standardized-application-wide-structured-logging.md)
## Context
@@ -17,7 +19,9 @@ We will adopt a namespace-based debug filter pattern, similar to the `debug` npm
## Implementation
In `src/services/logger.server.ts`:
### Core Implementation (Completed 2026-01-11)
Implemented in [src/services/logger.server.ts:140-150](src/services/logger.server.ts#L140-L150):
```typescript
const debugModules = (process.env.DEBUG_MODULES || '').split(',').map((s) => s.trim());
@@ -33,10 +37,100 @@ export const createScopedLogger = (moduleName: string) => {
};
```
### Adopted Services (Completed 2026-01-26)
Services currently using `createScopedLogger`:
- `ai-service` - AI/Gemini integration ([src/services/aiService.server.ts:1020](src/services/aiService.server.ts#L1020))
- `flyer-processing-service` - Flyer upload and processing ([src/services/flyerProcessingService.server.ts:20](src/services/flyerProcessingService.server.ts#L20))
## Usage
To debug only AI and Database interactions:
### Enable Debug Logging for Specific Modules
To debug only AI and flyer processing:
```bash
DEBUG_MODULES=ai-service,db-repo npm run dev
DEBUG_MODULES=ai-service,flyer-processing-service npm run dev
```
### Enable All Debug Logging
Use wildcard to enable debug logging for all modules:
```bash
DEBUG_MODULES=* npm run dev
```
### Common Module Names
| Module Name | Purpose | File |
| -------------------------- | ---------------------------------------- | ----------------------------------------------- |
| `ai-service` | AI/Gemini API interactions | `src/services/aiService.server.ts` |
| `flyer-processing-service` | Flyer upload, validation, and processing | `src/services/flyerProcessingService.server.ts` |
## Best Practices
1. **Use Scoped Loggers for Long-Running Services**: Services with complex workflows or external API calls should use `createScopedLogger` to allow targeted debugging.
2. **Use Child Loggers for Contextual Data**: Even within scoped loggers, create child loggers with job/request-specific context:
```typescript
const logger = createScopedLogger('my-service');
async function processJob(job: Job) {
const jobLogger = logger.child({ jobId: job.id, jobName: job.name });
jobLogger.debug('Starting job processing');
}
```
3. **Module Naming Convention**: Use kebab-case suffixed with `-service` or `-worker` (e.g., `ai-service`, `email-worker`).
4. **Production Usage**: `DEBUG_MODULES` can be set in production for temporary debugging, but should not be used continuously due to increased log volume.
## Examples
### Development Debugging
Debug AI service issues during development:
```bash
# Dev container
DEBUG_MODULES=ai-service npm run dev
# Or via PM2
DEBUG_MODULES=ai-service pm2 restart flyer-crawler-api-dev
```
### Production Troubleshooting
Temporarily enable debug logging for a specific subsystem:
```bash
# SSH into production server
ssh root@projectium.com
# Set environment variable and restart
DEBUG_MODULES=ai-service pm2 restart flyer-crawler-api
# View logs
pm2 logs flyer-crawler-api --lines 100
# Disable debug logging
pm2 unset DEBUG_MODULES flyer-crawler-api
pm2 restart flyer-crawler-api
```
## Consequences
**Positive**:
- Developers can inspect detailed logs for specific subsystems without log flooding
- Production debugging becomes more targeted and efficient
- No performance impact when debug logging is disabled
- Compatible with existing Pino logging infrastructure
**Negative**:
- Requires developers to know module names (mitigated by documentation above)
- Not all services have adopted scoped loggers yet (gradual migration)

View File

@@ -2,7 +2,14 @@
**Date**: 2026-01-11
**Status**: Proposed
**Status**: Accepted (Fully Implemented)
**Implementation Status**:
- ✅ BullMQ worker stall configuration (complete)
- ✅ Basic health endpoints (/live, /ready, /redis, etc.)
- ✅ /health/queues endpoint (complete)
- ✅ Worker heartbeat mechanism (complete)
## Context
@@ -60,3 +67,76 @@ The `/health/queues` endpoint will:
**Negative**:
- Requires configuring external monitoring to poll the new endpoint.
## Implementation Notes
### Completed (2026-01-11)
1. **BullMQ Stall Configuration** - `src/config/workerOptions.ts` (see the sketch after this list)
- All workers use `defaultWorkerOptions` with:
- `stalledInterval: 30000` (30s)
- `maxStalledCount: 3`
- `lockDuration: 30000` (30s)
- Applied to all 9 workers: flyer, email, analytics, cleanup, weekly-analytics, token-cleanup, receipt, expiry-alert, barcode
2. **Basic Health Endpoints** - `src/routes/health.routes.ts`
- `/health/live` - Liveness probe
- `/health/ready` - Readiness probe (checks DB, Redis, storage)
- `/health/startup` - Startup probe
- `/health/redis` - Redis connectivity
- `/health/db-pool` - Database connection pool status
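A minimal sketch of the shared options from item 1 above; the numeric values come from this ADR, while the queue name and Redis connection shown are illustrative:

```typescript
import { Worker, type WorkerOptions } from 'bullmq';

// Shared stall/lock settings (values per item 1 above).
export const defaultWorkerOptions: Partial<WorkerOptions> = {
  stalledInterval: 30_000, // scan for stalled jobs every 30s
  maxStalledCount: 3, // a job stalled 3 times is marked failed
  lockDuration: 30_000, // lock must be renewed within 30s or the job stalls
};

// Applying the shared options to one of the 9 workers (connection assumed).
export const flyerWorker = new Worker(
  'flyer',
  async () => {
    /* job processing elided */
  },
  { connection: { host: 'localhost', port: 6379 }, ...defaultWorkerOptions },
);
```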
### Implementation Completed (2026-01-26)
1. **`/health/queues` Endpoint** ✅
- Added route to `src/routes/health.routes.ts:511-674`
- Iterates through all 9 queues from `src/services/queues.server.ts`
- Fetches job counts using BullMQ Queue API: `getJobCounts()`
- Returns structured response including both queue metrics and worker heartbeats:
```typescript
{
status: 'healthy' | 'unhealthy',
timestamp: string,
queues: {
[queueName]: {
waiting: number,
active: number,
failed: number,
delayed: number
}
},
workers: {
[workerName]: {
alive: boolean,
lastSeen?: string,
pid?: number,
host?: string
}
}
}
```
- Returns 200 OK if all healthy, 503 if any queue/worker unavailable
- Full OpenAPI documentation included
2. **Worker Heartbeat Mechanism** ✅ (see the sketch after this list)
- Added `updateWorkerHeartbeat()` and `startWorkerHeartbeat()` in `src/services/workers.server.ts:100-149`
- Key pattern: `worker:heartbeat:<worker-name>`
- Stores: `{ timestamp: ISO8601, pid: number, host: string }`
- Updates every 30s with 90s TTL
- Integrated with `/health/queues` endpoint (checks if heartbeat < 60s old)
- Heartbeat intervals properly cleaned up in `closeWorkers()` and `gracefulShutdown()`
3. **Comprehensive Tests** ✅
- Added 5 test cases in `src/routes/health.routes.test.ts:623-858`
- Tests cover: healthy state, queue failures, stale heartbeats, missing heartbeats, Redis errors
- All tests follow existing patterns with proper mocking
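A minimal sketch of the heartbeat described in item 2 above; the key pattern, payload shape, and 30s/90s timings come from this ADR, while the Redis client setup is assumed (ioredis with default settings):

```typescript
import os from 'node:os';
import Redis from 'ioredis';

const redis = new Redis(); // assumes localhost:6379

export async function updateWorkerHeartbeat(workerName: string): Promise<void> {
  const payload = JSON.stringify({
    timestamp: new Date().toISOString(),
    pid: process.pid,
    host: os.hostname(),
  });
  // 90s TTL: a missing or stale key means the worker is considered down.
  await redis.set(`worker:heartbeat:${workerName}`, payload, 'EX', 90);
}

export function startWorkerHeartbeat(workerName: string): NodeJS.Timeout {
  void updateWorkerHeartbeat(workerName);
  // Refresh every 30s; the caller clears this interval on shutdown.
  return setInterval(() => void updateWorkerHeartbeat(workerName), 30_000);
}
```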
### Future Enhancements (Not Implemented)
1. **Queue Depth Alerting** (Low Priority)
- Add configurable thresholds per queue type
- Return 500 if `waiting` count exceeds threshold for extended period
- Consider using Redis for storing threshold breach timestamps
- **Estimate**: 1-2 hours

View File

@@ -1,4 +1,4 @@
# ADR-023: Database Normalization and Referential Integrity
# ADR-055: Database Normalization and Referential Integrity
**Date:** 2026-01-19
**Status:** Accepted

View File

@@ -0,0 +1,262 @@
# ADR-056: Application Performance Monitoring (APM)
**Date**: 2026-01-26
**Status**: Proposed
**Related**: [ADR-015](./0015-error-tracking-and-observability.md) (Error Tracking and Observability)
## Context
Application Performance Monitoring (APM) provides visibility into application behavior through:
- **Distributed Tracing**: Track requests across services, queues, and database calls
- **Performance Metrics**: Response times, throughput, error rates
- **Resource Monitoring**: Memory usage, CPU, database connections
- **Transaction Analysis**: Identify slow endpoints and bottlenecks
While ADR-015 covers error tracking and observability, APM is a distinct concern focused on performance rather than errors. The Sentry SDK supports APM through its tracing features, but this capability is currently **intentionally disabled** in our application.
### Current State
The Sentry SDK is installed and configured for error tracking (see ADR-015), but APM features are disabled:
```typescript
// src/services/sentry.client.ts
Sentry.init({
dsn: config.sentry.dsn,
environment: config.sentry.environment,
// Performance monitoring - disabled for now to keep it simple
tracesSampleRate: 0,
// ...
});
```
```typescript
// src/services/sentry.server.ts
Sentry.init({
dsn: config.sentry.dsn,
environment: config.sentry.environment || config.server.nodeEnv,
// Performance monitoring - disabled for now to keep it simple
tracesSampleRate: 0,
// ...
});
```
### Why APM is Currently Disabled
1. **Complexity**: APM adds overhead and complexity to debugging
2. **Bugsink Limitations**: Bugsink's APM support is less mature than its error tracking
3. **Resource Overhead**: Tracing adds memory and CPU overhead
4. **Focus**: Error tracking provides more immediate value for our current scale
5. **Cost**: High sample rates can significantly increase storage requirements
## Decision
We propose a **staged approach** to APM implementation:
### Phase 1: Selective Backend Tracing (Low Priority)
Enable tracing for specific high-value operations:
```typescript
// Enable tracing for specific transactions only
Sentry.init({
dsn: config.sentry.dsn,
tracesSampleRate: 0, // Keep default at 0
// Trace only specific high-value transactions
tracesSampler: (samplingContext) => {
const transactionName = samplingContext.transactionContext?.name;
// Always trace flyer processing jobs
if (transactionName?.includes('flyer-processing')) {
return 0.1; // 10% sample rate
}
// Always trace AI/Gemini calls
if (transactionName?.includes('gemini')) {
return 0.5; // 50% sample rate
}
// Trace slow endpoints (determined by custom logic)
if (samplingContext.parentSampled) {
return 0.1; // 10% for child transactions
}
return 0; // Don't trace other transactions
},
});
```
### Phase 2: Custom Performance Metrics
Add custom metrics without full tracing overhead:
```typescript
// Custom metric for slow database queries
import { metrics } from '@sentry/node';
// In repository methods
const startTime = performance.now();
const result = await pool.query(sql, params);
const duration = performance.now() - startTime;
metrics.distribution('db.query.duration', duration, {
tags: { query_type: 'select', table: 'flyers' },
});
if (duration > 1000) {
logger.warn({ duration, sql }, 'Slow query detected');
}
```
### Phase 3: Full APM Integration (Future)
When/if full APM is needed:
```typescript
Sentry.init({
dsn: config.sentry.dsn,
tracesSampleRate: 0.1, // 10% of transactions
profilesSampleRate: 0.1, // 10% of traced transactions get profiled
integrations: [
// Database tracing
Sentry.postgresIntegration(),
// Redis tracing
Sentry.redisIntegration(),
// BullMQ job tracing
Sentry.prismaIntegration(), // or custom BullMQ integration
],
});
```
## Implementation Steps
### To Enable Basic APM
1. **Update Sentry Configuration**:
- Set `tracesSampleRate` > 0 in `src/services/sentry.server.ts`
- Set `tracesSampleRate` > 0 in `src/services/sentry.client.ts`
- Add environment variable `SENTRY_TRACES_SAMPLE_RATE` (default: 0)
2. **Add Instrumentation**:
- Enable automatic Express instrumentation
- Add manual spans for BullMQ job processing
- Add database query instrumentation
3. **Frontend Tracing**:
- Add Browser Tracing integration
- Configure page load and navigation tracing
4. **Environment Variables**:
```bash
SENTRY_TRACES_SAMPLE_RATE=0.1 # 10% sampling
SENTRY_PROFILES_SAMPLE_RATE=0 # Profiling disabled
```
5. **Bugsink Configuration**:
- Verify Bugsink supports performance data ingestion
- Configure retention policies for performance data
### Configuration Changes Required
```typescript
// src/config/env.ts - Add new config
sentry: {
dsn: env.SENTRY_DSN,
environment: env.SENTRY_ENVIRONMENT,
debug: env.SENTRY_DEBUG === 'true',
tracesSampleRate: parseFloat(env.SENTRY_TRACES_SAMPLE_RATE || '0'),
profilesSampleRate: parseFloat(env.SENTRY_PROFILES_SAMPLE_RATE || '0'),
},
```
```typescript
// src/services/sentry.server.ts - Updated init
Sentry.init({
dsn: config.sentry.dsn,
environment: config.sentry.environment,
tracesSampleRate: config.sentry.tracesSampleRate,
profilesSampleRate: config.sentry.profilesSampleRate,
// ... rest of config
});
```
## Trade-offs
### Enabling APM
**Benefits**:
- Identify performance bottlenecks
- Track distributed transactions across services
- Profile slow endpoints
- Monitor resource utilization trends
**Costs**:
- Increased memory usage (~5-15% overhead)
- Additional CPU for trace processing
- Increased storage in Bugsink/Sentry
- More complex debugging (noise in traces)
- Potential latency from tracing overhead
### Keeping APM Disabled
**Benefits**:
- Simpler operation and debugging
- Lower resource overhead
- Focused on error tracking (higher priority)
- No additional storage costs
**Costs**:
- No automated performance insights
- Manual profiling required for bottleneck detection
- Limited visibility into slow transactions
## Alternatives Considered
1. **OpenTelemetry**: More vendor-neutral, but adds another dependency and complexity
2. **Prometheus + Grafana**: Good for metrics, but doesn't provide distributed tracing
3. **Jaeger/Zipkin**: Purpose-built for tracing, but requires additional infrastructure
4. **New Relic/Datadog SaaS**: Full-featured but conflicts with self-hosted requirement
## Current Recommendation
**Keep APM disabled** (`tracesSampleRate: 0`) until:
1. Specific performance issues are identified that require tracing
2. Bugsink's APM support is verified and tested
3. Infrastructure can support the additional overhead
4. There is a clear business need for performance visibility
When enabling APM becomes necessary, start with Phase 1 (selective tracing) to minimize overhead while gaining targeted insights.
## Consequences
### Positive (When Implemented)
- Automated identification of slow endpoints
- Distributed trace visualization across async operations
- Correlation between errors and performance issues
- Proactive alerting on performance degradation
### Negative
- Additional infrastructure complexity
- Storage overhead for trace data
- Potential performance impact from tracing itself
- Learning curve for trace analysis
## References
- [Sentry Performance Monitoring](https://docs.sentry.io/product/performance/)
- [@sentry/node Performance](https://docs.sentry.io/platforms/javascript/guides/node/performance/)
- [@sentry/react Performance](https://docs.sentry.io/platforms/javascript/guides/react/performance/)
- [OpenTelemetry](https://opentelemetry.io/) (alternative approach)
- [ADR-015: Error Tracking and Observability](./0015-error-tracking-and-observability.md)

View File

@@ -15,9 +15,9 @@ This document tracks the implementation status and estimated effort for all Arch
| Status | Count |
| ---------------------------- | ----- |
| Accepted (Fully Implemented) | 30 |
| Accepted (Fully Implemented) | 39 |
| Partially Implemented | 2 |
| Proposed (Not Started) | 16 |
| Proposed (Not Started) | 15 |
---
@@ -49,7 +49,7 @@ This document tracks the implementation status and estimated effort for all Arch
| [ADR-003](./0003-standardized-input-validation-using-middleware.md) | Input Validation | Accepted | - | Fully implemented |
| [ADR-008](./0008-api-versioning-strategy.md) | API Versioning | Proposed | L | Major URL/routing changes |
| [ADR-018](./0018-api-documentation-strategy.md) | API Documentation | Accepted | - | OpenAPI/Swagger implemented |
| [ADR-022](./0022-real-time-notification-system.md) | Real-time Notifications | Proposed | XL | WebSocket infrastructure |
| [ADR-022](./0022-real-time-notification-system.md) | Real-time Notifications | Accepted | - | Fully implemented |
| [ADR-028](./0028-api-response-standardization.md) | Response Standardization | Implemented | L | Completed (routes, middleware, tests) |
### Category 4: Security & Compliance
@@ -62,25 +62,31 @@ This document tracks the implementation status and estimated effort for all Arch
| [ADR-029](./0029-secret-rotation-and-key-management.md) | Secret Rotation | Proposed | L | Infrastructure changes needed |
| [ADR-032](./0032-rate-limiting-strategy.md) | Rate Limiting | Accepted | - | Fully implemented |
| [ADR-033](./0033-file-upload-and-storage-strategy.md) | File Upload & Storage | Accepted | - | Fully implemented |
| [ADR-048](./0048-authentication-strategy.md) | Authentication | Accepted | - | Fully implemented |
### Category 5: Observability & Monitoring
| ADR | Title | Status | Effort | Notes |
| -------------------------------------------------------------------------- | --------------------------- | -------- | ------ | --------------------------------- |
| [ADR-004](./0004-standardized-application-wide-structured-logging.md) | Structured Logging | Accepted | - | Fully implemented |
| [ADR-015](./0015-application-performance-monitoring-and-error-tracking.md) | APM & Error Tracking | Proposed | M | Third-party integration |
| [ADR-050](./0050-postgresql-function-observability.md) | PostgreSQL Fn Observability | Proposed | M | Depends on ADR-015 implementation |
| ADR | Title | Status | Effort | Notes |
| --------------------------------------------------------------------- | --------------------------- | -------- | ------ | ------------------------------------------ |
| [ADR-004](./0004-standardized-application-wide-structured-logging.md) | Structured Logging | Accepted | - | Fully implemented |
| [ADR-015](./0015-error-tracking-and-observability.md) | Error Tracking | Accepted | - | Fully implemented |
| [ADR-050](./0050-postgresql-function-observability.md) | PostgreSQL Fn Observability | Accepted | - | Fully implemented |
| [ADR-051](./0051-asynchronous-context-propagation.md) | Context Propagation | Accepted | - | Fully implemented |
| [ADR-052](./0052-granular-debug-logging-strategy.md) | Granular Debug Logging | Accepted | - | Fully implemented |
| [ADR-056](./0056-application-performance-monitoring.md) | APM (Performance) | Proposed | M | tracesSampleRate=0, intentionally disabled |
### Category 6: Deployment & Operations
| ADR | Title | Status | Effort | Notes |
| -------------------------------------------------------------- | ----------------- | -------- | ------ | -------------------------- |
| [ADR-006](./0006-background-job-processing-and-task-queues.md) | Background Jobs | Accepted | - | Fully implemented |
| [ADR-014](./0014-containerization-and-deployment-strategy.md) | Containerization | Partial | M | Docker done, K8s pending |
| [ADR-017](./0017-ci-cd-and-branching-strategy.md) | CI/CD & Branching | Accepted | - | Fully implemented |
| [ADR-024](./0024-feature-flagging-strategy.md) | Feature Flags | Proposed | M | New service/library needed |
| [ADR-037](./0037-scheduled-jobs-and-cron-pattern.md) | Scheduled Jobs | Accepted | - | Fully implemented |
| [ADR-038](./0038-graceful-shutdown-pattern.md) | Graceful Shutdown | Accepted | - | Fully implemented |
| ADR | Title | Status | Effort | Notes |
| -------------------------------------------------------------- | ------------------ | -------- | ------ | -------------------------- |
| [ADR-006](./0006-background-job-processing-and-task-queues.md) | Background Jobs | Accepted | - | Fully implemented |
| [ADR-014](./0014-containerization-and-deployment-strategy.md) | Containerization | Partial | M | Docker done, K8s pending |
| [ADR-017](./0017-ci-cd-and-branching-strategy.md) | CI/CD & Branching | Accepted | - | Fully implemented |
| [ADR-024](./0024-feature-flagging-strategy.md) | Feature Flags | Proposed | M | New service/library needed |
| [ADR-037](./0037-scheduled-jobs-and-cron-pattern.md) | Scheduled Jobs | Accepted | - | Fully implemented |
| [ADR-038](./0038-graceful-shutdown-pattern.md) | Graceful Shutdown | Accepted | - | Fully implemented |
| [ADR-053](./0053-worker-health-checks.md) | Worker Health | Accepted | - | Fully implemented |
| [ADR-054](./0054-bugsink-gitea-issue-sync.md) | Bugsink-Gitea Sync | Proposed | L | Automated issue creation |
### Category 7: Frontend / User Interface
@@ -99,61 +105,78 @@ This document tracks the implementation status and estimated effort for all Arch
| [ADR-010](./0010-testing-strategy-and-standards.md) | Testing Strategy | Accepted | - | Fully implemented |
| [ADR-021](./0021-code-formatting-and-linting-unification.md) | Formatting & Linting | Accepted | - | Fully implemented |
| [ADR-027](./0027-standardized-naming-convention-for-ai-and-database-types.md) | Naming Conventions | Accepted | - | Fully implemented |
| [ADR-040](./0040-testing-economics-and-priorities.md) | Testing Economics | Accepted | - | Fully implemented |
| [ADR-045](./0045-test-data-factories-and-fixtures.md) | Test Data Factories | Accepted | - | Fully implemented |
| [ADR-047](./0047-project-file-and-folder-organization.md) | Project Organization | Proposed | XL | Major reorganization |
### Category 9: Architecture Patterns
| ADR | Title | Status | Effort | Notes |
| -------------------------------------------------------- | --------------------- | -------- | ------ | ----------------- |
| [ADR-034](./0034-repository-pattern-standards.md) | Repository Pattern | Accepted | - | Fully implemented |
| [ADR-035](./0035-service-layer-architecture.md) | Service Layer | Accepted | - | Fully implemented |
| [ADR-036](./0036-event-bus-and-pub-sub-pattern.md) | Event Bus | Accepted | - | Fully implemented |
| [ADR-039](./0039-dependency-injection-pattern.md) | Dependency Injection | Accepted | - | Fully implemented |
| [ADR-041](./0041-ai-gemini-integration-architecture.md) | AI/Gemini Integration | Accepted | - | Fully implemented |
| [ADR-042](./0042-email-and-notification-architecture.md) | Email & Notifications | Accepted | - | Fully implemented |
| [ADR-043](./0043-express-middleware-pipeline.md) | Middleware Pipeline | Accepted | - | Fully implemented |
| [ADR-046](./0046-image-processing-pipeline.md) | Image Processing | Accepted | - | Fully implemented |
| [ADR-049](./0049-gamification-and-achievement-system.md) | Gamification System | Accepted | - | Fully implemented |
| ADR | Title | Status | Effort | Notes |
| --------------------------------------------------------------------- | --------------------- | -------- | ------ | ------------------------- |
| [ADR-034](./0034-repository-pattern-standards.md) | Repository Pattern | Accepted | - | Fully implemented |
| [ADR-035](./0035-service-layer-architecture.md) | Service Layer | Accepted | - | Fully implemented |
| [ADR-036](./0036-event-bus-and-pub-sub-pattern.md) | Event Bus | Accepted | - | Fully implemented |
| [ADR-039](./0039-dependency-injection-pattern.md) | Dependency Injection | Accepted | - | Fully implemented |
| [ADR-041](./0041-ai-gemini-integration-architecture.md) | AI/Gemini Integration | Accepted | - | Fully implemented |
| [ADR-042](./0042-email-and-notification-architecture.md) | Email & Notifications | Accepted | - | Fully implemented |
| [ADR-043](./0043-express-middleware-pipeline.md) | Middleware Pipeline | Accepted | - | Fully implemented |
| [ADR-046](./0046-image-processing-pipeline.md) | Image Processing | Accepted | - | Fully implemented |
| [ADR-049](./0049-gamification-and-achievement-system.md) | Gamification System | Accepted | - | Fully implemented |
| [ADR-055](./0055-database-normalization-and-referential-integrity.md) | DB Normalization | Accepted | M | API uses IDs, not strings |
---
## Work Still To Be Completed (Priority Order)
These ADRs are proposed but not yet implemented, ordered by suggested implementation priority:
These ADRs are proposed or partially implemented, ordered by suggested implementation priority:
| Priority | ADR | Title | Effort | Rationale |
| -------- | ------- | --------------------------- | ------ | ------------------------------------------------- |
| 1 | ADR-015 | APM & Error Tracking | M | Production visibility, debugging |
| 1b | ADR-050 | PostgreSQL Fn Observability | M | Database function visibility (depends on ADR-015) |
| 2 | ADR-024 | Feature Flags | M | Safer deployments, A/B testing |
| 3 | ADR-023 | Schema Migrations v2 | L | Database evolution support |
| 4 | ADR-029 | Secret Rotation | L | Security improvement |
| 5 | ADR-008 | API Versioning | L | Future API evolution |
| 6 | ADR-030 | Circuit Breaker | L | Resilience improvement |
| 7 | ADR-022 | Real-time Notifications | XL | Major feature enhancement |
| 8 | ADR-011 | Authorization & RBAC | XL | Advanced permission system |
| 9 | ADR-025 | i18n & l10n | XL | Multi-language support |
| 10 | ADR-031 | Data Retention & Privacy | XL | Compliance requirements |
| Priority | ADR | Title | Status | Effort | Rationale |
| -------- | ------- | ------------------------ | -------- | ------ | ------------------------------------ |
| 1 | ADR-024 | Feature Flags | Proposed | M | Safer deployments, A/B testing |
| 2 | ADR-054 | Bugsink-Gitea Sync | Proposed | L | Automated issue tracking from errors |
| 3 | ADR-023 | Schema Migrations v2 | Proposed | L | Database evolution support |
| 4 | ADR-029 | Secret Rotation | Proposed | L | Security improvement |
| 5 | ADR-008 | API Versioning | Proposed | L | Future API evolution |
| 6 | ADR-030 | Circuit Breaker | Proposed | L | Resilience improvement |
| 7 | ADR-056 | APM (Performance) | Proposed | M | Enable when performance issues arise |
| 8 | ADR-011 | Authorization & RBAC | Proposed | XL | Advanced permission system |
| 9 | ADR-025 | i18n & l10n | Proposed | XL | Multi-language support |
| 10 | ADR-031 | Data Retention & Privacy | Proposed | XL | Compliance requirements |
---
## Recent Implementation History
| Date | ADR | Change |
| ---------- | ------- | ---------------------------------------------------------------------- |
| 2026-01-11 | ADR-050 | Created - PostgreSQL function observability with fn_log() and Logstash |
| 2026-01-11 | ADR-018 | Implemented - OpenAPI/Swagger documentation at /docs/api-docs |
| 2026-01-11 | ADR-049 | Created - Gamification system, achievements, and testing requirements |
| 2026-01-09 | ADR-047 | Created - Project file/folder organization with migration plan |
| 2026-01-09 | ADR-041 | Created - AI/Gemini integration with model fallback and rate limiting |
| 2026-01-09 | ADR-042 | Created - Email and notification architecture with BullMQ queuing |
| 2026-01-09 | ADR-043 | Created - Express middleware pipeline ordering and patterns |
| 2026-01-09 | ADR-044 | Created - Frontend feature-based folder organization |
| 2026-01-09 | ADR-045 | Created - Test data factory pattern for mock generation |
| 2026-01-09 | ADR-046 | Created - Image processing pipeline with Sharp and EXIF stripping |
| 2026-01-09 | ADR-026 | Fully implemented - client-side structured logger |
| 2026-01-09 | ADR-028 | Fully implemented - all routes, middleware, and tests updated |
| Date | ADR | Change |
| ---------- | ------- | ---------------------------------------------------------------------------- |
| 2026-01-26 | ADR-015 | Completed - Added Sentry user context in AuthProvider, now fully implemented |
| 2026-01-26 | ADR-056 | Created - APM split from ADR-015, status Proposed (tracesSampleRate=0) |
| 2026-01-26 | ADR-015 | Refactored to focus on error tracking only, temporarily status Partial |
| 2026-01-26 | ADR-048 | Verified as fully implemented - JWT + OAuth authentication complete |
| 2026-01-26 | ADR-022 | Verified as fully implemented - WebSocket notifications complete |
| 2026-01-26 | ADR-052 | Marked as fully implemented - createScopedLogger complete |
| 2026-01-26 | ADR-053 | Marked as fully implemented - /health/queues endpoint complete |
| 2026-01-26 | ADR-050 | Marked as fully implemented - PostgreSQL function observability |
| 2026-01-26 | ADR-055 | Created (renumbered from duplicate ADR-023) - DB normalization |
| 2026-01-26 | ADR-054 | Added to tracker - Bugsink to Gitea issue synchronization |
| 2026-01-26 | ADR-053 | Added to tracker - Worker health checks and monitoring |
| 2026-01-26 | ADR-052 | Added to tracker - Granular debug logging strategy |
| 2026-01-26 | ADR-051 | Added to tracker - Asynchronous context propagation |
| 2026-01-26 | ADR-048 | Added to tracker - Authentication strategy |
| 2026-01-26 | ADR-040 | Added to tracker - Testing economics and priorities |
| 2026-01-17 | ADR-054 | Created - Bugsink-Gitea sync worker proposal |
| 2026-01-11 | ADR-050 | Created - PostgreSQL function observability with fn_log() |
| 2026-01-11 | ADR-018 | Implemented - OpenAPI/Swagger documentation at /docs/api-docs |
| 2026-01-11 | ADR-049 | Created - Gamification system, achievements, and testing |
| 2026-01-09 | ADR-047 | Created - Project file/folder organization with migration plan |
| 2026-01-09 | ADR-041 | Created - AI/Gemini integration with model fallback |
| 2026-01-09 | ADR-042 | Created - Email and notification architecture with BullMQ |
| 2026-01-09 | ADR-043 | Created - Express middleware pipeline ordering and patterns |
| 2026-01-09 | ADR-044 | Created - Frontend feature-based folder organization |
| 2026-01-09 | ADR-045 | Created - Test data factory pattern for mock generation |
| 2026-01-09 | ADR-046 | Created - Image processing pipeline with Sharp and EXIF stripping |
| 2026-01-09 | ADR-026 | Fully implemented - client-side structured logger |
| 2026-01-09 | ADR-028 | Fully implemented - all routes, middleware, and tests updated |
---

View File

@@ -21,7 +21,7 @@ This directory contains a log of the architectural decisions made for the Flyer
**[ADR-003](./0003-standardized-input-validation-using-middleware.md)**: Standardized Input Validation using Middleware (Accepted)
**[ADR-008](./0008-api-versioning-strategy.md)**: API Versioning Strategy (Proposed)
**[ADR-018](./0018-api-documentation-strategy.md)**: API Documentation Strategy (Proposed)
**[ADR-018](./0018-api-documentation-strategy.md)**: API Documentation Strategy (Accepted)
**[ADR-022](./0022-real-time-notification-system.md)**: Real-time Notification System (Proposed)
**[ADR-028](./0028-api-response-standardization.md)**: API Response Standardization and Envelope Pattern (Implemented)
@@ -38,7 +38,11 @@ This directory contains a log of the architectural decisions made for the Flyer
## 5. Observability & Monitoring
**[ADR-004](./0004-standardized-application-wide-structured-logging.md)**: Standardized Application-Wide Structured Logging (Accepted)
**[ADR-015](./0015-application-performance-monitoring-and-error-tracking.md)**: Application Performance Monitoring (APM) and Error Tracking (Proposed)
**[ADR-015](./0015-error-tracking-and-observability.md)**: Error Tracking and Observability (Partial)
**[ADR-050](./0050-postgresql-function-observability.md)**: PostgreSQL Function Observability (Accepted)
**[ADR-051](./0051-asynchronous-context-propagation.md)**: Asynchronous Context Propagation (Accepted)
**[ADR-052](./0052-granular-debug-logging-strategy.md)**: Granular Debug Logging Strategy (Accepted)
**[ADR-056](./0056-application-performance-monitoring.md)**: Application Performance Monitoring (Proposed)
## 6. Deployment & Operations
@@ -48,13 +52,15 @@ This directory contains a log of the architectural decisions made for the Flyer
**[ADR-024](./0024-feature-flagging-strategy.md)**: Feature Flagging Strategy (Proposed)
**[ADR-037](./0037-scheduled-jobs-and-cron-pattern.md)**: Scheduled Jobs and Cron Pattern (Accepted)
**[ADR-038](./0038-graceful-shutdown-pattern.md)**: Graceful Shutdown Pattern (Accepted)
**[ADR-053](./0053-worker-health-checks-and-monitoring.md)**: Worker Health Checks and Monitoring (Proposed)
**[ADR-054](./0054-bugsink-gitea-issue-sync.md)**: Bugsink to Gitea Issue Synchronization (Proposed)
## 7. Frontend / User Interface
**[ADR-005](./0005-frontend-state-management-and-server-cache-strategy.md)**: Frontend State Management and Server Cache Strategy (Accepted)
**[ADR-012](./0012-frontend-component-library-and-design-system.md)**: Frontend Component Library and Design System (Partially Implemented)
**[ADR-025](./0025-internationalization-and-localization-strategy.md)**: Internationalization (i18n) and Localization (l10n) Strategy (Proposed)
**[ADR-026](./0026-standardized-client-side-structured-logging.md)**: Standardized Client-Side Structured Logging (Proposed)
**[ADR-026](./0026-standardized-client-side-structured-logging.md)**: Standardized Client-Side Structured Logging (Accepted)
**[ADR-044](./0044-frontend-feature-organization.md)**: Frontend Feature Organization Pattern (Accepted)
## 8. Development Workflow & Quality
@@ -76,3 +82,5 @@ This directory contains a log of the architectural decisions made for the Flyer
**[ADR-042](./0042-email-and-notification-architecture.md)**: Email and Notification Architecture (Accepted)
**[ADR-043](./0043-express-middleware-pipeline.md)**: Express Middleware Pipeline Architecture (Accepted)
**[ADR-046](./0046-image-processing-pipeline.md)**: Image Processing Pipeline (Accepted)
**[ADR-049](./0049-gamification-and-achievement-system.md)**: Gamification and Achievement System (Accepted)
**[ADR-055](./0055-database-normalization-and-referential-integrity.md)**: Database Normalization and Referential Integrity (Accepted)

View File

@@ -272,14 +272,41 @@ podman-compose -f compose.dev.yml down
Key environment variables are set in `compose.dev.yml`:
| Variable | Value | Purpose |
| ----------------- | ----------------------------- | -------------------- |
| `NODE_ENV` | `development` | Environment mode |
| `DB_HOST` | `postgres` | PostgreSQL hostname |
| `REDIS_URL` | `redis://redis:6379` | Redis connection URL |
| `FRONTEND_URL` | `https://localhost` | CORS origin |
| `SENTRY_DSN` | `http://...@127.0.0.1:8000/1` | Backend Bugsink DSN |
| `VITE_SENTRY_DSN` | `http://...@127.0.0.1:8000/2` | Frontend Bugsink DSN |
| Variable | Value | Purpose |
| ----------------- | ----------------------------- | --------------------------- |
| `TZ` | `America/Los_Angeles` | Timezone (PST) for all logs |
| `NODE_ENV` | `development` | Environment mode |
| `DB_HOST` | `postgres` | PostgreSQL hostname |
| `REDIS_URL` | `redis://redis:6379` | Redis connection URL |
| `FRONTEND_URL` | `https://localhost` | CORS origin |
| `SENTRY_DSN` | `http://...@127.0.0.1:8000/1` | Backend Bugsink DSN |
| `VITE_SENTRY_DSN` | `http://...@127.0.0.1:8000/2` | Frontend Bugsink DSN |
### Timezone Configuration
All dev container services are configured to use the PST (America/Los_Angeles) timezone for consistent log timestamps:
| Service | Configuration | Notes |
| ---------- | ------------------------------------------------ | ------------------------------ |
| App | `TZ=America/Los_Angeles` in compose.dev.yml | Also set via dev-entrypoint.sh |
| PostgreSQL | `timezone` and `log_timezone` in postgres config | Logs timestamps in PST |
| Redis | `TZ=America/Los_Angeles` in compose.dev.yml | Alpine uses TZ env var |
| PM2 | `TZ` in ecosystem.dev.config.cjs | Pino timestamps use local time |
**Verifying Timezone**:
```bash
# Check container timezone
podman exec flyer-crawler-dev date
# Check PostgreSQL timezone
podman exec flyer-crawler-postgres psql -U postgres -c "SHOW timezone;"
# Check Redis log timestamps
MSYS_NO_PATHCONV=1 podman exec flyer-crawler-redis cat /var/log/redis/redis-server.log | head -5
```
**Note**: If you need UTC timestamps for production compatibility, set `TZ=UTC` in compose.dev.yml and restart the containers.
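For example, a possible UTC switch looks like this (the compose command and container name match the ones above; the edit to compose.dev.yml itself is assumed):

```bash
# After setting TZ=UTC for each service in compose.dev.yml:
podman-compose -f compose.dev.yml down
podman-compose -f compose.dev.yml up -d

# The app container should now report UTC
podman exec flyer-crawler-dev date
```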
---

View File

@@ -44,6 +44,8 @@ if (missingVars.length > 0) {
// --- Shared Environment Variables ---
// These come from compose.dev.yml environment section
const sharedEnv = {
// Timezone: PST (America/Los_Angeles) for consistent log timestamps
TZ: process.env.TZ || 'America/Los_Angeles',
NODE_ENV: 'development',
DB_HOST: process.env.DB_HOST || 'postgres',
DB_PORT: process.env.DB_PORT || '5432',
@@ -160,6 +162,8 @@ module.exports = {
min_uptime: '5s',
// Environment
env: {
// Timezone: PST (America/Los_Angeles) for consistent log timestamps
TZ: process.env.TZ || 'America/Los_Angeles',
NODE_ENV: 'development',
// Vite-specific env vars (VITE_ prefix)
VITE_SENTRY_DSN: process.env.VITE_SENTRY_DSN,

package-lock.json generated
View File

@@ -1,12 +1,12 @@
{
"name": "flyer-crawler",
"version": "0.12.10",
"version": "0.12.15",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "flyer-crawler",
"version": "0.12.10",
"version": "0.12.15",
"dependencies": {
"@bull-board/api": "^6.14.2",
"@bull-board/express": "^6.14.2",

View File

@@ -1,7 +1,7 @@
{
"name": "flyer-crawler",
"private": true,
"version": "0.12.10",
"version": "0.12.15",
"type": "module",
"scripts": {
"dev": "concurrently \"npm:start:dev\" \"vite\"",
@@ -14,12 +14,12 @@
"start": "npm run start:prod",
"build": "vite build",
"preview": "vite preview",
"test": "node scripts/check-linux.js && cross-env NODE_ENV=test tsx ./node_modules/vitest/vitest.mjs run",
"test-wsl": "cross-env NODE_ENV=test vitest run",
"test": "node scripts/check-linux.js && cross-env NODE_ENV=test TZ= tsx ./node_modules/vitest/vitest.mjs run",
"test-wsl": "cross-env NODE_ENV=test TZ= vitest run",
"test:coverage": "npm run clean && npm run test:unit -- --coverage && npm run test:integration -- --coverage",
"test:unit": "node scripts/check-linux.js && cross-env NODE_ENV=test tsx --max-old-space-size=8192 ./node_modules/vitest/vitest.mjs run --project unit -c vite.config.ts",
"test:integration": "node scripts/check-linux.js && cross-env NODE_ENV=test tsx --max-old-space-size=8192 ./node_modules/vitest/vitest.mjs run --project integration -c vitest.config.integration.ts",
"test:e2e": "node scripts/check-linux.js && cross-env NODE_ENV=test tsx --max-old-space-size=8192 ./node_modules/vitest/vitest.mjs run --config vitest.config.e2e.ts",
"test:unit": "node scripts/check-linux.js && cross-env NODE_ENV=test TZ= tsx --max-old-space-size=8192 ./node_modules/vitest/vitest.mjs run --project unit -c vite.config.ts",
"test:integration": "node scripts/check-linux.js && cross-env NODE_ENV=test TZ= tsx --max-old-space-size=8192 ./node_modules/vitest/vitest.mjs run --project integration -c vitest.config.integration.ts",
"test:e2e": "node scripts/check-linux.js && cross-env NODE_ENV=test TZ= tsx --max-old-space-size=8192 ./node_modules/vitest/vitest.mjs run --config vitest.config.e2e.ts",
"format": "prettier --write .",
"lint": "eslint . --ext ts,tsx --report-unused-disable-directives --max-warnings 0",
"type-check": "tsc --noEmit",

View File

@@ -23,6 +23,26 @@ set -e
echo "Starting Flyer Crawler Dev Container..."
# ============================================================================
# Timezone Configuration
# ============================================================================
# Ensure TZ is set for consistent log timestamps across all services.
# TZ should be set via compose.dev.yml environment (default: America/Los_Angeles)
# ============================================================================
if [ -n "$TZ" ]; then
echo "Timezone configured: $TZ"
# Link timezone data if available (for date command and other tools)
if [ -f "/usr/share/zoneinfo/$TZ" ]; then
ln -sf "/usr/share/zoneinfo/$TZ" /etc/localtime
echo "$TZ" > /etc/timezone
echo "System timezone set to: $(date +%Z) ($(date))"
else
echo "Warning: Timezone data not found for $TZ, using TZ environment variable only"
fi
else
echo "Warning: TZ environment variable not set, using container default timezone"
fi
# Configure Bugsink HTTPS (ADR-015)
echo "Configuring Bugsink HTTPS..."
mkdir -p /etc/bugsink/ssl

View File

@@ -112,6 +112,15 @@ const googleSchema = z.object({
clientSecret: z.string().optional(),
});
/**
* GitHub OAuth configuration schema.
* Used for GitHub social login functionality.
*/
const githubSchema = z.object({
clientId: z.string().optional(),
clientSecret: z.string().optional(),
});
/**
* Worker concurrency configuration schema.
*/
@@ -157,6 +166,7 @@ const envSchema = z.object({
ai: aiSchema,
upc: upcSchema,
google: googleSchema,
github: githubSchema,
worker: workerSchema,
server: serverSchema,
sentry: sentrySchema,
@@ -209,6 +219,10 @@ function loadEnvVars(): unknown {
clientId: process.env.GOOGLE_CLIENT_ID,
clientSecret: process.env.GOOGLE_CLIENT_SECRET,
},
github: {
clientId: process.env.GITHUB_CLIENT_ID,
clientSecret: process.env.GITHUB_CLIENT_SECRET,
},
worker: {
concurrency: process.env.WORKER_CONCURRENCY,
lockDuration: process.env.WORKER_LOCK_DURATION,
@@ -367,3 +381,13 @@ export const isUpcItemDbConfigured = !!config.upc.upcItemDbApiKey;
* Returns true if Barcode Lookup API is configured.
*/
export const isBarcodeLookupConfigured = !!config.upc.barcodeLookupApiKey;
/**
* Returns true if Google OAuth is configured (both client ID and secret present).
*/
export const isGoogleOAuthConfigured = !!config.google.clientId && !!config.google.clientSecret;
/**
* Returns true if GitHub OAuth is configured (both client ID and secret present).
*/
export const isGithubOAuthConfigured = !!config.github.clientId && !!config.github.clientSecret;

View File

@@ -27,9 +27,13 @@ const defaultProps = {
};
const setupSuccessMocks = () => {
// The API returns {success, data: {userprofile, token}}, and the mutation extracts .data
const mockAuthResponse = {
userprofile: createMockUserProfile({ user: { user_id: '123', email: 'test@example.com' } }),
token: 'mock-token',
success: true,
data: {
userprofile: createMockUserProfile({ user: { user_id: '123', email: 'test@example.com' } }),
token: 'mock-token',
},
};
(mockedApiClient.loginUser as Mock).mockResolvedValue(
new Response(JSON.stringify(mockAuthResponse)),

View File

@@ -82,7 +82,11 @@ const defaultAuthenticatedProps = {
};
const setupSuccessMocks = () => {
const mockAuthResponse = { userprofile: authenticatedProfile, token: 'mock-token' };
// The API returns {success, data: {userprofile, token}}, and the mutation extracts .data
const mockAuthResponse = {
success: true,
data: { userprofile: authenticatedProfile, token: 'mock-token' },
};
(mockedApiClient.loginUser as Mock).mockResolvedValue(
new Response(JSON.stringify(mockAuthResponse)),
);

View File

@@ -7,6 +7,7 @@ import * as apiClient from '../services/apiClient';
import { useAuthProfileQuery, AUTH_PROFILE_QUERY_KEY } from '../hooks/queries/useAuthProfileQuery';
import { getToken, setToken, removeToken } from '../services/tokenStorage';
import { logger } from '../services/logger.client';
import { setUser as setSentryUser } from '../services/sentry.client';
/**
* AuthProvider component that manages authentication state.
@@ -40,6 +41,12 @@ export const AuthProvider: React.FC<{ children: ReactNode }> = ({ children }) =>
logger.info('[AuthProvider] Profile received from query, setting state to AUTHENTICATED.');
setUserProfile(fetchedProfile);
setAuthStatus('AUTHENTICATED');
// Set Sentry user context for error tracking (ADR-015)
setSentryUser({
id: fetchedProfile.user.user_id,
email: fetchedProfile.user.email,
username: fetchedProfile.full_name || fetchedProfile.user.email,
});
} else if (token && isError) {
logger.warn('[AuthProvider] Token was present but validation failed. Signing out.');
removeToken();
@@ -66,6 +73,8 @@ export const AuthProvider: React.FC<{ children: ReactNode }> = ({ children }) =>
setAuthStatus('SIGNED_OUT');
// Clear the auth profile cache on logout
queryClient.removeQueries({ queryKey: AUTH_PROFILE_QUERY_KEY });
// Clear Sentry user context (ADR-015)
setSentryUser(null);
}, [queryClient]);
const login = useCallback(
@@ -82,6 +91,12 @@ export const AuthProvider: React.FC<{ children: ReactNode }> = ({ children }) =>
setAuthStatus('AUTHENTICATED');
// Update the query cache with the provided profile
queryClient.setQueryData(AUTH_PROFILE_QUERY_KEY, profileData);
// Set Sentry user context for error tracking (ADR-015)
setSentryUser({
id: profileData.user.user_id,
email: profileData.user.email,
username: profileData.full_name || profileData.user.email,
});
logger.info('[AuthProvider-Login] Login successful. State set to AUTHENTICATED.', {
user: profileData.user,
});
@@ -106,6 +121,12 @@ export const AuthProvider: React.FC<{ children: ReactNode }> = ({ children }) =>
setAuthStatus('AUTHENTICATED');
// Update the query cache with the fetched profile
queryClient.setQueryData(AUTH_PROFILE_QUERY_KEY, fetchedProfileData);
// Set Sentry user context for error tracking (ADR-015)
setSentryUser({
id: fetchedProfileData.user.user_id,
email: fetchedProfileData.user.email,
username: fetchedProfileData.full_name || fetchedProfileData.user.email,
});
logger.info('[AuthProvider-Login] Profile fetch successful. State set to AUTHENTICATED.');
} catch (e) {
const errorMessage = e instanceof Error ? e.message : String(e);

View File

@@ -619,4 +619,240 @@ describe('Health Routes (/api/health)', () => {
expect(response.body.error.details.database.message).toBe('Database connection failed');
});
});
// =============================================================================
// QUEUE HEALTH MONITORING (ADR-053)
// =============================================================================
describe('GET /queues', () => {
// Mock the queues module
beforeEach(async () => {
vi.resetModules();
// Re-import after mocks are set up
});
it('should return 200 OK with queue metrics and worker heartbeats when all healthy', async () => {
// Arrange: Mock queue getJobCounts() and Redis heartbeats
const mockQueues = await import('../services/queues.server');
const mockQueue = {
getJobCounts: vi.fn().mockResolvedValue({
waiting: 5,
active: 2,
failed: 1,
delayed: 0,
}),
};
// Mock all queues
vi.spyOn(mockQueues, 'flyerQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'emailQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'analyticsQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'weeklyAnalyticsQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'cleanupQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'tokenCleanupQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'receiptQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'expiryAlertQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'barcodeQueue', 'get').mockReturnValue(mockQueue as never);
// Mock Redis heartbeat responses (all healthy, last seen < 60s ago)
const recentTimestamp = new Date(Date.now() - 10000).toISOString(); // 10 seconds ago
const heartbeatValue = JSON.stringify({
timestamp: recentTimestamp,
pid: 1234,
host: 'test-host',
});
mockedRedisConnection.get = vi.fn().mockResolvedValue(heartbeatValue);
// Act
const response = await supertest(app).get('/api/health/queues');
// Assert
expect(response.status).toBe(200);
expect(response.body.success).toBe(true);
expect(response.body.data.status).toBe('healthy');
expect(response.body.data.queues).toBeDefined();
expect(response.body.data.workers).toBeDefined();
// Verify queue metrics structure
expect(response.body.data.queues['flyer-processing']).toEqual({
waiting: 5,
active: 2,
failed: 1,
delayed: 0,
});
// Verify worker heartbeat structure
expect(response.body.data.workers['flyer-processing']).toEqual({
alive: true,
lastSeen: recentTimestamp,
pid: 1234,
host: 'test-host',
});
});
it('should return 503 when a queue is unavailable', async () => {
// Arrange: Mock one queue to fail
const mockQueues = await import('../services/queues.server');
const healthyQueue = {
getJobCounts: vi.fn().mockResolvedValue({
waiting: 0,
active: 0,
failed: 0,
delayed: 0,
}),
};
const failingQueue = {
getJobCounts: vi.fn().mockRejectedValue(new Error('Redis connection lost')),
};
vi.spyOn(mockQueues, 'flyerQueue', 'get').mockReturnValue(failingQueue as never);
vi.spyOn(mockQueues, 'emailQueue', 'get').mockReturnValue(healthyQueue as never);
vi.spyOn(mockQueues, 'analyticsQueue', 'get').mockReturnValue(healthyQueue as never);
vi.spyOn(mockQueues, 'weeklyAnalyticsQueue', 'get').mockReturnValue(healthyQueue as never);
vi.spyOn(mockQueues, 'cleanupQueue', 'get').mockReturnValue(healthyQueue as never);
vi.spyOn(mockQueues, 'tokenCleanupQueue', 'get').mockReturnValue(healthyQueue as never);
vi.spyOn(mockQueues, 'receiptQueue', 'get').mockReturnValue(healthyQueue as never);
vi.spyOn(mockQueues, 'expiryAlertQueue', 'get').mockReturnValue(healthyQueue as never);
vi.spyOn(mockQueues, 'barcodeQueue', 'get').mockReturnValue(healthyQueue as never);
mockedRedisConnection.get = vi.fn().mockResolvedValue(null);
// Act
const response = await supertest(app).get('/api/health/queues');
// Assert
expect(response.status).toBe(503);
expect(response.body.success).toBe(false);
expect(response.body.error.message).toBe('One or more queues or workers unavailable');
expect(response.body.error.details.status).toBe('unhealthy');
expect(response.body.error.details.queues['flyer-processing']).toEqual({
error: 'Redis connection lost',
});
});
it('should return 503 when a worker heartbeat is stale', async () => {
// Arrange: Mock queues as healthy but one worker heartbeat as stale
const mockQueues = await import('../services/queues.server');
const mockQueue = {
getJobCounts: vi.fn().mockResolvedValue({
waiting: 0,
active: 0,
failed: 0,
delayed: 0,
}),
};
vi.spyOn(mockQueues, 'flyerQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'emailQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'analyticsQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'weeklyAnalyticsQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'cleanupQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'tokenCleanupQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'receiptQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'expiryAlertQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'barcodeQueue', 'get').mockReturnValue(mockQueue as never);
// Mock heartbeat - one worker is stale (> 60s ago)
const staleTimestamp = new Date(Date.now() - 120000).toISOString(); // 120 seconds ago
const staleHeartbeat = JSON.stringify({
timestamp: staleTimestamp,
pid: 1234,
host: 'test-host',
});
// First call returns stale heartbeat for flyer-processing, rest return null (no heartbeat)
let callCount = 0;
mockedRedisConnection.get = vi.fn().mockImplementation(() => {
callCount++;
return Promise.resolve(callCount === 1 ? staleHeartbeat : null);
});
// Act
const response = await supertest(app).get('/api/health/queues');
// Assert
expect(response.status).toBe(503);
expect(response.body.success).toBe(false);
expect(response.body.error.details.status).toBe('unhealthy');
expect(response.body.error.details.workers['flyer-processing']).toEqual({ alive: false });
});
it('should return 503 when worker heartbeat is missing', async () => {
// Arrange: Mock queues as healthy but no worker heartbeats in Redis
const mockQueues = await import('../services/queues.server');
const mockQueue = {
getJobCounts: vi.fn().mockResolvedValue({
waiting: 0,
active: 0,
failed: 0,
delayed: 0,
}),
};
vi.spyOn(mockQueues, 'flyerQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'emailQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'analyticsQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'weeklyAnalyticsQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'cleanupQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'tokenCleanupQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'receiptQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'expiryAlertQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'barcodeQueue', 'get').mockReturnValue(mockQueue as never);
// Mock Redis to return null (no heartbeat found)
mockedRedisConnection.get = vi.fn().mockResolvedValue(null);
// Act
const response = await supertest(app).get('/api/health/queues');
// Assert
expect(response.status).toBe(503);
expect(response.body.success).toBe(false);
expect(response.body.error.details.status).toBe('unhealthy');
expect(response.body.error.details.workers['flyer-processing']).toEqual({ alive: false });
});
it('should handle Redis connection errors gracefully', async () => {
// Arrange: Mock queues to succeed but Redis get() to fail
const mockQueues = await import('../services/queues.server');
const mockQueue = {
getJobCounts: vi.fn().mockResolvedValue({
waiting: 0,
active: 0,
failed: 0,
delayed: 0,
}),
};
vi.spyOn(mockQueues, 'flyerQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'emailQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'analyticsQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'weeklyAnalyticsQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'cleanupQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'tokenCleanupQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'receiptQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'expiryAlertQueue', 'get').mockReturnValue(mockQueue as never);
vi.spyOn(mockQueues, 'barcodeQueue', 'get').mockReturnValue(mockQueue as never);
// Mock Redis get() to throw error
mockedRedisConnection.get = vi.fn().mockRejectedValue(new Error('Redis connection lost'));
// Act
const response = await supertest(app).get('/api/health/queues');
// Assert: Should still return queue metrics but mark workers as unhealthy
expect(response.status).toBe(503);
expect(response.body.error.details.queues['flyer-processing']).toEqual({
waiting: 0,
active: 0,
failed: 0,
delayed: 0,
});
expect(response.body.error.details.workers['flyer-processing']).toEqual({
alive: false,
error: 'Redis connection lost',
});
});
});
});

View File

@@ -15,6 +15,17 @@ import fs from 'node:fs/promises';
import { getSimpleWeekAndYear } from '../utils/dateUtils';
import { validateRequest } from '../middleware/validation.middleware';
import { sendSuccess, sendError, ErrorCode } from '../utils/apiResponse';
import {
flyerQueue,
emailQueue,
analyticsQueue,
weeklyAnalyticsQueue,
cleanupQueue,
tokenCleanupQueue,
receiptQueue,
expiryAlertQueue,
barcodeQueue,
} from '../services/queues.server';
const router = Router();
@@ -442,4 +453,224 @@ router.get(
},
);
// =============================================================================
// QUEUE HEALTH MONITORING (ADR-053)
// =============================================================================
/**
* @openapi
* /health/queues:
* get:
* summary: Queue health and metrics with worker heartbeats
* description: |
* Returns job counts for all BullMQ queues and worker heartbeat status.
* Use this endpoint to monitor queue depths and detect stuck/frozen workers.
* Implements ADR-053: Worker Health Checks and Stalled Job Monitoring.
* tags:
* - Health
* responses:
* 200:
* description: Queue metrics and worker heartbeats retrieved successfully
* content:
* application/json:
* schema:
* type: object
* properties:
* success:
* type: boolean
* example: true
* data:
* type: object
* properties:
* status:
* type: string
* enum: [healthy, unhealthy]
* timestamp:
* type: string
* format: date-time
* queues:
* type: object
* additionalProperties:
* type: object
* properties:
* waiting:
* type: integer
* active:
* type: integer
* failed:
* type: integer
* delayed:
* type: integer
* workers:
* type: object
* additionalProperties:
* type: object
* properties:
* alive:
* type: boolean
* lastSeen:
* type: string
* format: date-time
* pid:
* type: integer
* host:
* type: string
* 503:
* description: Redis unavailable or workers not responding
* content:
* application/json:
* schema:
* $ref: '#/components/schemas/ErrorResponse'
*/
router.get(
'/queues',
validateRequest(emptySchema),
async (req: Request, res: Response, next: NextFunction) => {
try {
// Define all queues to monitor
const queues = [
{ name: 'flyer-processing', queue: flyerQueue },
{ name: 'email-sending', queue: emailQueue },
{ name: 'analytics-reporting', queue: analyticsQueue },
{ name: 'weekly-analytics-reporting', queue: weeklyAnalyticsQueue },
{ name: 'file-cleanup', queue: cleanupQueue },
{ name: 'token-cleanup', queue: tokenCleanupQueue },
{ name: 'receipt-processing', queue: receiptQueue },
{ name: 'expiry-alerts', queue: expiryAlertQueue },
{ name: 'barcode-detection', queue: barcodeQueue },
];
// Fetch job counts for all queues in parallel
const queueMetrics = await Promise.all(
queues.map(async ({ name, queue }) => {
try {
const counts = await queue.getJobCounts();
return {
name,
counts: {
waiting: counts.waiting || 0,
active: counts.active || 0,
failed: counts.failed || 0,
delayed: counts.delayed || 0,
},
};
} catch (error) {
// If individual queue fails, return error state
return {
name,
error: error instanceof Error ? error.message : 'Unknown error',
};
}
}),
);
// Fetch worker heartbeats in parallel
const workerNames = queues.map((q) => q.name);
const workerHeartbeats = await Promise.all(
workerNames.map(async (name) => {
try {
const key = `worker:heartbeat:${name}`;
const value = await redisConnection.get(key);
if (!value) {
return { name, alive: false };
}
const heartbeat = JSON.parse(value) as {
timestamp: string;
pid: number;
host: string;
};
const lastSeenMs = new Date(heartbeat.timestamp).getTime();
const nowMs = Date.now();
const ageSeconds = (nowMs - lastSeenMs) / 1000;
// Consider alive if last heartbeat < 60 seconds ago
const alive = ageSeconds < 60;
return {
name,
alive,
lastSeen: heartbeat.timestamp,
pid: heartbeat.pid,
host: heartbeat.host,
};
} catch (error) {
// If heartbeat check fails, mark as unknown
return {
name,
alive: false,
error: error instanceof Error ? error.message : 'Unknown error',
};
}
}),
);
// Build response objects
const queuesData: Record<
string,
{ waiting: number; active: number; failed: number; delayed: number } | { error: string }
> = {};
const workersData: Record<
string,
| { alive: boolean; lastSeen?: string; pid?: number; host?: string }
| { alive: boolean; error: string }
> = {};
let hasErrors = false;
for (const metric of queueMetrics) {
if ('error' in metric) {
queuesData[metric.name] = { error: metric.error };
hasErrors = true;
} else {
queuesData[metric.name] = metric.counts;
}
}
for (const heartbeat of workerHeartbeats) {
if ('error' in heartbeat) {
workersData[heartbeat.name] = { alive: false, error: heartbeat.error };
// A failed heartbeat lookup also counts as unhealthy
hasErrors = true;
} else if (!heartbeat.alive) {
workersData[heartbeat.name] = { alive: false };
hasErrors = true;
} else {
workersData[heartbeat.name] = {
alive: heartbeat.alive,
lastSeen: heartbeat.lastSeen,
pid: heartbeat.pid,
host: heartbeat.host,
};
}
}
const response = {
status: hasErrors ? ('unhealthy' as const) : ('healthy' as const),
timestamp: new Date().toISOString(),
queues: queuesData,
workers: workersData,
};
if (hasErrors) {
return sendError(
res,
ErrorCode.SERVICE_UNAVAILABLE,
'One or more queues or workers unavailable',
503,
response,
);
}
return sendSuccess(res, response);
} catch (error: unknown) {
// Redis connection error or other unexpected failure
if (error instanceof Error) {
return next(error);
}
const message =
(error as { message?: string })?.message || 'Failed to retrieve queue metrics';
return next(new Error(message));
}
},
);
export default router;

View File

@@ -132,7 +132,8 @@ describe('API Client', () => {
.mockResolvedValueOnce({
ok: true,
status: 200,
json: () => Promise.resolve({ token: 'new-refreshed-token' }),
// The API returns {success, data: {token}} wrapper format
json: () => Promise.resolve({ success: true, data: { token: 'new-refreshed-token' } }),
} as Response)
.mockResolvedValueOnce({
ok: true,
@@ -218,7 +219,7 @@ describe('API Client', () => {
localStorage.setItem('authToken', 'expired-token');
// Mock the global fetch to return a sequence of responses:
// 1. 401 Unauthorized (initial API call)
// 2. 200 OK (token refresh call)
// 2. 200 OK (token refresh call) - uses API wrapper format {success, data: {token}}
// 3. 200 OK (retry of the initial API call)
vi.mocked(global.fetch)
.mockResolvedValueOnce({
@@ -229,7 +230,8 @@ describe('API Client', () => {
.mockResolvedValueOnce({
ok: true,
status: 200,
json: () => Promise.resolve({ token: 'new-refreshed-token' }),
// The API returns {success, data: {token}} wrapper format
json: () => Promise.resolve({ success: true, data: { token: 'new-refreshed-token' } }),
} as Response)
.mockResolvedValueOnce({
ok: true,

View File

@@ -62,12 +62,33 @@ vi.mock('./logger.server', () => ({
vi.mock('bullmq', () => ({
Worker: mocks.MockWorker,
Queue: vi.fn(function () {
return { add: vi.fn() };
return { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) };
}),
// Add UnrecoverableError to the mock so it can be used in tests
UnrecoverableError: class UnrecoverableError extends Error {},
}));
// Mock redis.server to prevent real Redis connection attempts
vi.mock('./redis.server', () => ({
connection: {
on: vi.fn(),
quit: vi.fn().mockResolvedValue(undefined),
},
}));
// Mock queues.server to provide mock queue instances
vi.mock('./queues.server', () => ({
flyerQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
emailQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
analyticsQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
cleanupQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
weeklyAnalyticsQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
tokenCleanupQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
receiptQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
expiryAlertQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
barcodeQueue: { add: vi.fn(), close: vi.fn().mockResolvedValue(undefined) },
}));
// Mock flyerProcessingService.server as flyerWorker and cleanupWorker depend on it
vi.mock('./flyerProcessingService.server', () => {
// Mock the constructor to return an object with the mocked methods
@@ -88,6 +109,67 @@ vi.mock('./flyerDataTransformer', () => ({
},
}));
// Mock aiService.server to prevent initialization issues
vi.mock('./aiService.server', () => ({
aiService: {
extractAndValidateData: vi.fn(),
},
}));
// Mock db/index.db to prevent database connections
vi.mock('./db/index.db', () => ({
personalizationRepo: {},
}));
// Mock flyerAiProcessor.server
vi.mock('./flyerAiProcessor.server', () => ({
FlyerAiProcessor: vi.fn().mockImplementation(function () {
return { processFlyer: vi.fn() };
}),
}));
// Mock flyerPersistenceService.server
vi.mock('./flyerPersistenceService.server', () => ({
FlyerPersistenceService: vi.fn().mockImplementation(function () {
return { persistFlyerData: vi.fn() };
}),
}));
// Mock db/connection.db to prevent database connections
vi.mock('./db/connection.db', () => ({
withTransaction: vi.fn(),
}));
// Mock receiptService.server
vi.mock('./receiptService.server', () => ({
processReceiptJob: vi.fn().mockResolvedValue(undefined),
}));
// Mock expiryService.server
vi.mock('./expiryService.server', () => ({
processExpiryAlertJob: vi.fn().mockResolvedValue(undefined),
}));
// Mock barcodeService.server
vi.mock('./barcodeService.server', () => ({
processBarcodeDetectionJob: vi.fn().mockResolvedValue(undefined),
}));
// Mock flyerFileHandler.server
vi.mock('./flyerFileHandler.server', () => ({
FlyerFileHandler: vi.fn().mockImplementation(function () {
return { handleFile: vi.fn() };
}),
}));
// Mock workerOptions config
vi.mock('../config/workerOptions', () => ({
defaultWorkerOptions: {
lockDuration: 30000,
stalledInterval: 30000,
},
}));
// Helper to create a mock BullMQ Job object
const createMockJob = <T>(data: T): Job<T> => {
return {

View File

@@ -3,6 +3,7 @@ import { Worker, Job } from 'bullmq';
import fsPromises from 'node:fs/promises';
import { exec } from 'child_process';
import { promisify } from 'util';
import os from 'os';
import { logger } from './logger.server';
import { connection } from './redis.server';
@@ -91,6 +92,45 @@ const createWorkerProcessor = <T, R>(processor: (job: Job<T>) => Promise<R>) =>
};
};
/**
* Updates the worker heartbeat in Redis.
* Stores timestamp, PID, and hostname to detect frozen/hung workers.
* TTL is 90s, so if heartbeat isn't updated for 90s, the key expires.
* Implements ADR-053: Worker Health Checks.
*/
const updateWorkerHeartbeat = async (workerName: string) => {
const key = `worker:heartbeat:${workerName}`;
const value = JSON.stringify({
timestamp: new Date().toISOString(),
pid: process.pid,
host: os.hostname(),
});
try {
await connection.set(key, value, 'EX', 90);
} catch (error) {
logger.error({ err: error, workerName }, `Failed to update heartbeat for worker ${workerName}`);
}
};
/**
* Starts periodic heartbeat updates for a worker.
* Updates every 30 seconds with 90s TTL.
*/
const startWorkerHeartbeat = (worker: Worker) => {
// Initial heartbeat
updateWorkerHeartbeat(worker.name);
// Periodic heartbeat updates
const heartbeatInterval = setInterval(() => {
updateWorkerHeartbeat(worker.name);
}, 30000); // 30 seconds
// Store interval on worker for cleanup
(worker as unknown as { heartbeatInterval?: NodeJS.Timeout }).heartbeatInterval =
heartbeatInterval;
};
const attachWorkerEventListeners = (worker: Worker) => {
worker.on('completed', (job: Job, returnValue: unknown) => {
logger.info({ returnValue }, `[${worker.name}] Job ${job.id} completed successfully.`);
@@ -102,6 +142,9 @@ const attachWorkerEventListeners = (worker: Worker) => {
`[${worker.name}] Job ${job?.id} has ultimately failed after all attempts.`,
);
});
// Start heartbeat monitoring for this worker
startWorkerHeartbeat(worker);
};
export const flyerWorker = new Worker<FlyerJobData>(
@@ -219,17 +262,28 @@ const SHUTDOWN_TIMEOUT = 30000; // 30 seconds
* without exiting the process.
*/
export const closeWorkers = async () => {
await Promise.all([
flyerWorker.close(),
emailWorker.close(),
analyticsWorker.close(),
cleanupWorker.close(),
weeklyAnalyticsWorker.close(),
tokenCleanupWorker.close(),
receiptWorker.close(),
expiryAlertWorker.close(),
barcodeWorker.close(),
]);
// Clear heartbeat intervals
const workers = [
flyerWorker,
emailWorker,
analyticsWorker,
cleanupWorker,
weeklyAnalyticsWorker,
tokenCleanupWorker,
receiptWorker,
expiryAlertWorker,
barcodeWorker,
];
workers.forEach((worker) => {
const interval = (worker as unknown as { heartbeatInterval?: NodeJS.Timeout })
.heartbeatInterval;
if (interval) {
clearInterval(interval);
}
});
await Promise.all(workers.map((w) => w.close()));
};
export const gracefulShutdown = async (signal: string) => {