/*
 * Repository viewer header (captured together with the source; not part of the program):
 *
 * Files
 * flyer-crawler.projectium.com/src/controllers/health.controller.ts
 * Torben Sorensen 2d2cd52011
 * Some checks failed
 * Deploy to Test Environment / deploy-to-test (push) Failing after 3m58s
 * Massive Dependency Modernization Project
 * 2026-02-13 00:34:22 -08:00
 *
 * 674 lines
 * 20 KiB
 * TypeScript
 */

// src/controllers/health.controller.ts
// ============================================================================
// HEALTH CONTROLLER
// ============================================================================
// Provides health check endpoints for monitoring the application state,
// implementing ADR-020: Health Checks and Liveness/Readiness Probes.
//
// This controller exposes endpoints for:
// - Liveness probe (/live) - Is the server process running?
// - Readiness probe (/ready) - Is the server ready to accept traffic?
// - Startup probe (/startup) - Has the server completed initialization?
// - Individual service health checks (db, redis, storage, queues)
// ============================================================================
import { Get, Route, Tags, SuccessResponse, Response } from 'tsoa';
import { BaseController } from './base.controller';
import type { SuccessResponse as SuccessResponseType, ErrorResponse, ServiceHealth } from './types';
import { getPoolStatus, getPool, checkTablesExist } from '../services/db/connection.db';
import { connection as redisConnection } from '../services/queueService.server';
import {
flyerQueue,
emailQueue,
analyticsQueue,
weeklyAnalyticsQueue,
cleanupQueue,
tokenCleanupQueue,
receiptQueue,
expiryAlertQueue,
barcodeQueue,
} from '../services/queues.server';
import { getSimpleWeekAndYear } from '../utils/dateUtils';
import fs from 'node:fs/promises';
// ============================================================================
// RESPONSE TYPES
// ============================================================================
// Types for health check responses that will appear in the OpenAPI spec.
// ============================================================================
/**
* Simple ping response.
*
* Body returned by the `ping` endpoint.
*/
interface PingResponse {
/** Fixed reply text; the `ping` endpoint sends `'pong'`. */
message: string;
}
/**
* Liveness probe response.
*
* Body returned by the `live` endpoint, which performs no dependency checks.
*/
interface LivenessResponse {
/** Always `'ok'`; the `live` endpoint has no failure branch. */
status: 'ok';
/** Moment the response was built, formatted with `Date#toISOString()`. */
timestamp: string;
}
/**
* Readiness probe response with service status.
*
* Built by the `ready` endpoint from the database, Redis and storage checks.
*/
interface ReadinessResponse {
/**
* Overall verdict: `'unhealthy'` when the database or Redis check is unhealthy,
* `'degraded'` when any check reports degraded, otherwise `'healthy'`.
*/
status: 'healthy' | 'degraded' | 'unhealthy';
/** Moment the response was built, formatted with `Date#toISOString()`. */
timestamp: string;
/** Value of `process.uptime()` at the time the response was built. */
uptime: number;
/** Result of each individual dependency check. */
services: {
database: ServiceHealth;
redis: ServiceHealth;
storage: ServiceHealth;
};
}
/**
* Startup probe response.
*
* Built by the `startup` endpoint, which checks the database only.
*/
interface StartupResponse {
/**
* `'started'` when the database check is not unhealthy. `'starting'` is the value
* the endpoint places in the details of its 503 error response.
*/
status: 'started' | 'starting';
/** Moment the response was built, formatted with `Date#toISOString()`. */
timestamp: string;
/** Result of the database connectivity check. */
database: ServiceHealth;
}
/**
* Database schema check response.
*
* Success body of the `db-schema` endpoint.
*/
interface DbSchemaResponse {
/** Confirmation that every required table was found. */
message: string;
}
/**
* Storage check response.
*
* Success body of the `storage` endpoint.
*/
interface StorageResponse {
/** Confirmation text that names the storage directory that was checked. */
message: string;
}
/**
* Database pool status response.
*
* Success body of the `db-pool` endpoint; the counters are copied from `getPoolStatus()`.
*/
interface DbPoolResponse {
/** Human-readable summary of the three counters below. */
message: string;
/** `totalCount` as reported by `getPoolStatus()`. */
totalCount: number;
/** `idleCount` as reported by `getPoolStatus()`. */
idleCount: number;
/** `waitingCount` as reported by `getPoolStatus()`; the endpoint succeeds only while this is below 5. */
waitingCount: number;
}
/**
* Server time response.
*
* Body returned by the `time` endpoint.
*/
interface TimeResponse {
/** Current server time, formatted with `Date#toISOString()`. */
currentTime: string;
/** Year computed by `getSimpleWeekAndYear` for the current time. */
year: number;
/** Week number computed by `getSimpleWeekAndYear` for the current time. */
week: number;
}
/**
* Redis health check response.
*
* Success body of the `redis` endpoint.
*/
interface RedisHealthResponse {
/** Confirmation that Redis answered the ping with `PONG`. */
message: string;
}
/**
* Queue job counts.
*
* Per-queue numbers taken from the queue's `getJobCounts()` result; a count that
* is absent from that result is reported as 0.
*/
interface QueueJobCounts {
/** Count reported under `waiting`. */
waiting: number;
/** Count reported under `active`. */
active: number;
/** Count reported under `failed`. */
failed: number;
/** Count reported under `delayed`. */
delayed: number;
}
/**
* Worker heartbeat status.
*
* Derived from the JSON value stored in Redis under `worker:heartbeat:<queue name>`.
*/
interface WorkerHeartbeat {
/** True only when a heartbeat record exists and its timestamp is less than 60 seconds old. */
alive: boolean;
/** `timestamp` field taken from the worker's heartbeat record. */
lastSeen?: string;
/** `pid` field taken from the worker's heartbeat record. */
pid?: number;
/** `host` field taken from the worker's heartbeat record. */
host?: string;
/** Error message captured when reading or parsing the heartbeat record threw. */
error?: string;
}
/**
* Queue health response with metrics and worker heartbeats.
*
* Built by the `queues` endpoint.
*/
interface QueuesHealthResponse {
/** `'unhealthy'` when the endpoint detected a queue or worker problem, otherwise `'healthy'`. */
status: 'healthy' | 'unhealthy';
/** Moment the response was built, formatted with `Date#toISOString()`. */
timestamp: string;
/** Keyed by queue name: the job counts, or `{ error }` when fetching the counts failed. */
queues: Record<string, QueueJobCounts | { error: string }>;
/** Keyed by queue name: heartbeat status of the worker associated with that queue. */
workers: Record<string, WorkerHeartbeat>;
}
// ============================================================================
// HELPER FUNCTIONS
// ============================================================================
// Reusable functions for checking service health.
// ============================================================================
/**
 * Probes the database by running `SELECT 1` on the shared pool and timing it.
 *
 * A successful probe is reported as `degraded` when more than three clients are
 * waiting on the pool, otherwise as `healthy`. Any error raised along the way is
 * converted into an `unhealthy` result instead of being thrown.
 *
 * @returns ServiceHealth with status, elapsed milliseconds and, on success, the pool counters
 */
async function checkDatabase(): Promise<ServiceHealth> {
  const startedAt = Date.now();
  try {
    await getPool().query('SELECT 1');
    // Latency covers only the probe query, not the pool bookkeeping below.
    const latency = Date.now() - startedAt;
    const { totalCount, idleCount, waitingCount } = getPoolStatus();
    const details: Record<string, unknown> = {
      totalConnections: totalCount,
      idleConnections: idleCount,
      waitingConnections: waitingCount,
    };
    // More than three waiting clients means the pool is under pressure.
    if (waitingCount > 3) {
      return { status: 'degraded', latency, details };
    }
    return { status: 'healthy', latency, details };
  } catch (error) {
    const latency = Date.now() - startedAt;
    const message = error instanceof Error ? error.message : 'Database connection failed';
    return { status: 'unhealthy', latency, message };
  }
}
/**
 * Probes Redis with a `PING` and times the round trip.
 *
 * Only the exact reply `PONG` counts as healthy. Any other reply, or any thrown
 * error, is reported as `unhealthy` together with an explanatory message.
 *
 * @returns ServiceHealth with status, elapsed milliseconds and, on failure, a message
 */
async function checkRedis(): Promise<ServiceHealth> {
  const startedAt = Date.now();
  const elapsed = (): number => Date.now() - startedAt;
  try {
    const reply = await redisConnection.ping();
    const latency = elapsed();
    if (reply !== 'PONG') {
      return { status: 'unhealthy', latency, message: `Unexpected ping response: ${reply}` };
    }
    return { status: 'healthy', latency };
  } catch (error) {
    const latency = elapsed();
    const message = error instanceof Error ? error.message : 'Redis connection failed';
    return { status: 'unhealthy', latency, message };
  }
}
/**
 * Probes the flyer image storage directory for write access and times the check.
 *
 * The directory comes from the `STORAGE_PATH` environment variable; when that is
 * unset or empty the production default path is used. The reason for a failed
 * access check is not reported, only the path that was tried.
 *
 * @returns ServiceHealth with status, elapsed milliseconds and either the checked path or a message
 */
async function checkStorage(): Promise<ServiceHealth> {
  const configuredPath = process.env.STORAGE_PATH;
  const storagePath = configuredPath
    ? configuredPath
    : '/var/www/flyer-crawler.projectium.com/flyer-images';
  const startedAt = Date.now();

  let writable = true;
  try {
    await fs.access(storagePath, fs.constants.W_OK);
  } catch {
    writable = false;
  }
  const latency = Date.now() - startedAt;

  if (!writable) {
    return {
      status: 'unhealthy',
      latency,
      message: `Storage not accessible: ${storagePath}`,
    };
  }
  return {
    status: 'healthy',
    latency,
    details: { path: storagePath },
  };
}
// ============================================================================
// HEALTH CONTROLLER
// ============================================================================
/**
 * Health check controller for monitoring application state.
 *
 * Provides endpoints for Kubernetes liveness/readiness/startup probes
 * and individual service health checks per ADR-020.
 */
@Route('health')
@Tags('Health')
export class HealthController extends BaseController {
  // ==========================================================================
  // BASIC HEALTH CHECKS
  // ==========================================================================

  /**
   * Simple ping endpoint.
   *
   * Returns a pong response to verify server is responsive.
   * Use this for basic connectivity checks.
   *
   * @summary Simple ping endpoint
   * @returns A pong response confirming the server is alive
   */
  @Get('ping')
  @SuccessResponse(200, 'Server is responsive')
  public async ping(): Promise<SuccessResponseType<PingResponse>> {
    return this.success({ message: 'pong' });
  }

  // ==========================================================================
  // KUBERNETES PROBES (ADR-020)
  // ==========================================================================

  /**
   * Liveness probe.
   *
   * Returns 200 OK if the server process is running.
   * If this fails, the orchestrator should restart the container.
   * This endpoint is intentionally simple and has no external dependencies.
   *
   * @summary Liveness probe
   * @returns Status indicating the server process is alive
   */
  @Get('live')
  @SuccessResponse(200, 'Server process is alive')
  public async live(): Promise<SuccessResponseType<LivenessResponse>> {
    return this.success({
      status: 'ok',
      timestamp: new Date().toISOString(),
    });
  }

  /**
   * Readiness probe.
   *
   * Returns 200 OK if the server is ready to accept traffic.
   * Checks all critical dependencies (database, Redis, storage).
   * If this fails, the orchestrator should remove the container from the load balancer.
   *
   * @summary Readiness probe
   * @returns Service health status for all critical dependencies
   */
  @Get('ready')
  @SuccessResponse(200, 'Server is ready to accept traffic')
  @Response<ErrorResponse>(503, 'Service is unhealthy and should not receive traffic')
  public async ready(): Promise<SuccessResponseType<ReadinessResponse> | ErrorResponse> {
    // The three checks are independent, so run them concurrently.
    const [database, redis, storage] = await Promise.all([
      checkDatabase(),
      checkRedis(),
      checkStorage(),
    ]);

    // Overall status:
    // - 'unhealthy' if a critical service (database, Redis) is unhealthy
    // - 'degraded' if any service reports degraded and no critical one is unhealthy
    // - 'healthy' otherwise
    // NOTE(review): an unhealthy storage check does not change the overall status;
    // it is only visible under `services.storage`. Confirm that this is intended.
    const criticalServices = [database, redis];
    const allServices = [database, redis, storage];
    let overallStatus: 'healthy' | 'degraded' | 'unhealthy' = 'healthy';
    if (criticalServices.some((s) => s.status === 'unhealthy')) {
      overallStatus = 'unhealthy';
    } else if (allServices.some((s) => s.status === 'degraded')) {
      overallStatus = 'degraded';
    }

    const response: ReadinessResponse = {
      status: overallStatus,
      timestamp: new Date().toISOString(),
      uptime: process.uptime(),
      services: {
        database,
        redis,
        storage,
      },
    };

    // 200 = healthy or degraded (can still handle traffic)
    // 503 = unhealthy (should not receive traffic)
    if (overallStatus === 'unhealthy') {
      this.setStatus(503);
      return this.error(this.ErrorCode.SERVICE_UNAVAILABLE, 'Service unhealthy', response);
    }
    return this.success(response);
  }

  /**
   * Startup probe.
   *
   * Similar to readiness but used during container startup.
   * The orchestrator will not send liveness/readiness probes until this succeeds.
   * This allows for longer initialization times without triggering restarts.
   *
   * @summary Startup probe for container orchestration
   * @returns Startup status with database health
   */
  @Get('startup')
  @SuccessResponse(200, 'Server has started successfully')
  @Response<ErrorResponse>(503, 'Server is still starting')
  public async startup(): Promise<SuccessResponseType<StartupResponse> | ErrorResponse> {
    // For startup, we only check database connectivity.
    // Redis and storage can be checked later in readiness.
    const database = await checkDatabase();
    if (database.status === 'unhealthy') {
      this.setStatus(503);
      return this.error(this.ErrorCode.SERVICE_UNAVAILABLE, 'Waiting for database connection', {
        status: 'starting',
        database,
      });
    }
    return this.success({
      status: 'started',
      timestamp: new Date().toISOString(),
      database,
    });
  }

  // ==========================================================================
  // INDIVIDUAL SERVICE HEALTH CHECKS
  // ==========================================================================

  /**
   * Database schema check.
   *
   * Checks if all essential database tables exist.
   * This is a critical check to ensure the database schema is correctly set up.
   *
   * @summary Check database schema
   * @returns Message confirming all required tables exist
   */
  @Get('db-schema')
  @SuccessResponse(200, 'All required database tables exist')
  @Response<ErrorResponse>(500, 'Database schema check failed')
  public async dbSchema(): Promise<SuccessResponseType<DbSchemaResponse> | ErrorResponse> {
    const requiredTables = ['users', 'profiles', 'flyers', 'flyer_items', 'stores'];
    // checkTablesExist resolves to the names that are missing, so empty means OK.
    const missingTables = await checkTablesExist(requiredTables);
    if (missingTables.length > 0) {
      this.setStatus(500);
      return this.error(
        this.ErrorCode.INTERNAL_ERROR,
        `Database schema check failed. Missing tables: ${missingTables.join(', ')}.`,
      );
    }
    return this.success({ message: 'All required database tables exist.' });
  }

  /**
   * Storage health check.
   *
   * Verifies that the application's file storage path is accessible and writable.
   * This is important for features like file uploads.
   *
   * @summary Check storage accessibility
   * @returns Message confirming storage is accessible
   */
  @Get('storage')
  @SuccessResponse(200, 'Storage is accessible and writable')
  @Response<ErrorResponse>(500, 'Storage check failed')
  public async storage(): Promise<SuccessResponseType<StorageResponse> | ErrorResponse> {
    // An unset or empty STORAGE_PATH falls back to the production default.
    const storagePath =
      process.env.STORAGE_PATH || '/var/www/flyer-crawler.projectium.com/flyer-images';
    try {
      await fs.access(storagePath, fs.constants.W_OK);
      return this.success({
        message: `Storage directory '${storagePath}' is accessible and writable.`,
      });
    } catch {
      this.setStatus(500);
      return this.error(
        this.ErrorCode.INTERNAL_ERROR,
        `Storage check failed. Ensure the directory '${storagePath}' exists and is writable by the application.`,
      );
    }
  }

  /**
   * Database pool status check.
   *
   * Checks the status of the database connection pool.
   * This helps diagnose issues related to database connection saturation.
   *
   * @summary Check database connection pool status
   * @returns Pool status with connection counts
   */
  @Get('db-pool')
  @SuccessResponse(200, 'Database pool is healthy')
  @Response<ErrorResponse>(500, 'Database pool may be under stress')
  public async dbPool(): Promise<SuccessResponseType<DbPoolResponse> | ErrorResponse> {
    const status = getPoolStatus();
    // Five or more waiting clients is treated as a pool under stress.
    const isHealthy = status.waitingCount < 5;
    const message = `Pool Status: ${status.totalCount} total, ${status.idleCount} idle, ${status.waitingCount} waiting.`;
    if (isHealthy) {
      return this.success({
        message,
        totalCount: status.totalCount,
        idleCount: status.idleCount,
        waitingCount: status.waitingCount,
      });
    }
    this.setStatus(500);
    return this.error(
      this.ErrorCode.INTERNAL_ERROR,
      `Pool may be under stress. ${message}`,
      status,
    );
  }

  /**
   * Server time check.
   *
   * Returns the server's current time, year, and week number.
   * Useful for verifying time synchronization and for features dependent on week numbers.
   *
   * @summary Get server time and week number
   * @returns Current server time with year and week number
   */
  @Get('time')
  @SuccessResponse(200, 'Server time retrieved')
  public async time(): Promise<SuccessResponseType<TimeResponse>> {
    const now = new Date();
    const { year, week } = getSimpleWeekAndYear(now);
    return this.success({
      currentTime: now.toISOString(),
      year,
      week,
    });
  }

  /**
   * Redis health check.
   *
   * Checks the health of the Redis connection.
   *
   * @summary Check Redis connectivity
   * @returns Message confirming Redis is healthy
   */
  @Get('redis')
  @SuccessResponse(200, 'Redis connection is healthy')
  @Response<ErrorResponse>(500, 'Redis health check failed')
  public async redis(): Promise<SuccessResponseType<RedisHealthResponse> | ErrorResponse> {
    try {
      const reply = await redisConnection.ping();
      if (reply === 'PONG') {
        return this.success({ message: 'Redis connection is healthy.' });
      }
      // Thrown so that an unexpected reply shares the error path below.
      throw new Error(`Unexpected Redis ping response: ${reply}`);
    } catch (error) {
      this.setStatus(500);
      const message = error instanceof Error ? error.message : 'Redis health check failed';
      return this.error(this.ErrorCode.INTERNAL_ERROR, message);
    }
  }

  // ==========================================================================
  // QUEUE HEALTH MONITORING (ADR-053)
  // ==========================================================================

  /**
   * Queue health and metrics with worker heartbeats.
   *
   * Returns job counts for all BullMQ queues and worker heartbeat status.
   * Use this endpoint to monitor queue depths and detect stuck/frozen workers.
   * Implements ADR-053: Worker Health Checks and Stalled Job Monitoring.
   *
   * The result is unhealthy (503) when job counts cannot be fetched for any queue,
   * or when any worker is not alive: its heartbeat is missing, is 60 seconds old
   * or older, or could not be read.
   *
   * @summary Queue health and metrics
   * @returns Queue metrics and worker heartbeat status
   */
  @Get('queues')
  @SuccessResponse(200, 'Queue metrics retrieved successfully')
  @Response<ErrorResponse>(503, 'One or more queues or workers unavailable')
  public async queues(): Promise<SuccessResponseType<QueuesHealthResponse> | ErrorResponse> {
    // Outcome of probing one queue: its job counts, or the failure message.
    type QueueProbe = { name: string; counts: QueueJobCounts } | { name: string; error: string };
    // Outcome of reading one worker heartbeat, tagged with the worker name.
    type HeartbeatProbe = WorkerHeartbeat & { name: string };

    // A worker counts as alive only while its last heartbeat is younger than this.
    const heartbeatMaxAgeSeconds = 60;

    // Define all queues to monitor. Built per request, not at module load.
    const queues = [
      { name: 'flyer-processing', queue: flyerQueue },
      { name: 'email-sending', queue: emailQueue },
      { name: 'analytics-reporting', queue: analyticsQueue },
      { name: 'weekly-analytics-reporting', queue: weeklyAnalyticsQueue },
      { name: 'file-cleanup', queue: cleanupQueue },
      { name: 'token-cleanup', queue: tokenCleanupQueue },
      { name: 'receipt-processing', queue: receiptQueue },
      { name: 'expiry-alerts', queue: expiryAlertQueue },
      { name: 'barcode-detection', queue: barcodeQueue },
    ];

    // Fetch job counts for all queues in parallel. A failing queue is captured
    // as an error entry so that it cannot reject the whole batch.
    const queueMetrics = await Promise.all(
      queues.map(async ({ name, queue }): Promise<QueueProbe> => {
        try {
          const counts = await queue.getJobCounts();
          return {
            name,
            counts: {
              waiting: counts.waiting ?? 0,
              active: counts.active ?? 0,
              failed: counts.failed ?? 0,
              delayed: counts.delayed ?? 0,
            },
          };
        } catch (error) {
          return {
            name,
            error: error instanceof Error ? error.message : 'Unknown error',
          };
        }
      }),
    );

    // Fetch worker heartbeats in parallel; heartbeat keys are derived from queue names.
    const workerHeartbeats = await Promise.all(
      queues.map(async ({ name }): Promise<HeartbeatProbe> => {
        try {
          const value = await redisConnection.get(`worker:heartbeat:${name}`);
          if (!value) {
            return { name, alive: false };
          }
          // The payload shape is assumed, not validated. A value that cannot be
          // parsed or dereferenced throws and is reported through the catch below.
          const heartbeat = JSON.parse(value) as {
            timestamp: string;
            pid: number;
            host: string;
          };
          // An unparseable timestamp yields NaN, which fails the comparison and
          // therefore reports the worker as not alive.
          const ageSeconds = (Date.now() - new Date(heartbeat.timestamp).getTime()) / 1000;
          return {
            name,
            alive: ageSeconds < heartbeatMaxAgeSeconds,
            lastSeen: heartbeat.timestamp,
            pid: heartbeat.pid,
            host: heartbeat.host,
          };
        } catch (error) {
          // The worker's state is unknown; report it as not alive with the reason.
          return {
            name,
            alive: false,
            error: error instanceof Error ? error.message : 'Unknown error',
          };
        }
      }),
    );

    // Build response objects
    const queuesData: Record<string, QueueJobCounts | { error: string }> = {};
    const workersData: Record<string, WorkerHeartbeat> = {};
    let hasErrors = false;

    for (const metric of queueMetrics) {
      // Test for the key, not the message: an error with an empty message must
      // still be reported and must still mark the result unhealthy.
      if ('error' in metric) {
        queuesData[metric.name] = { error: metric.error };
        hasErrors = true;
      } else {
        queuesData[metric.name] = metric.counts;
      }
    }

    for (const { name, ...heartbeat } of workerHeartbeats) {
      // Keep lastSeen/pid/host for stale workers too, so the stuck process can be found.
      workersData[name] = heartbeat;
      // Missing, stale and unreadable heartbeats all make the result unhealthy.
      if (!heartbeat.alive) {
        hasErrors = true;
      }
    }

    const response: QueuesHealthResponse = {
      status: hasErrors ? 'unhealthy' : 'healthy',
      timestamp: new Date().toISOString(),
      queues: queuesData,
      workers: workersData,
    };

    if (hasErrors) {
      this.setStatus(503);
      return this.error(
        this.ErrorCode.SERVICE_UNAVAILABLE,
        'One or more queues or workers unavailable',
        response,
      );
    }
    return this.success(response);
  }
}