Some checks failed
Deploy to Test Environment / deploy-to-test (push) Failing after 3m58s
674 lines
20 KiB
TypeScript
674 lines
20 KiB
TypeScript
// src/controllers/health.controller.ts
|
|
// ============================================================================
|
|
// HEALTH CONTROLLER
|
|
// ============================================================================
|
|
// Provides health check endpoints for monitoring the application state,
|
|
// implementing ADR-020: Health Checks and Liveness/Readiness Probes.
|
|
//
|
|
// This controller exposes endpoints for:
|
|
// - Liveness probe (/live) - Is the server process running?
|
|
// - Readiness probe (/ready) - Is the server ready to accept traffic?
|
|
// - Startup probe (/startup) - Has the server completed initialization?
|
|
// - Individual service health checks (db, redis, storage, queues)
|
|
// ============================================================================
|
|
|
|
import { Get, Route, Tags, SuccessResponse, Response } from 'tsoa';
|
|
import { BaseController } from './base.controller';
|
|
import type { SuccessResponse as SuccessResponseType, ErrorResponse, ServiceHealth } from './types';
|
|
import { getPoolStatus, getPool, checkTablesExist } from '../services/db/connection.db';
|
|
import { connection as redisConnection } from '../services/queueService.server';
|
|
import {
|
|
flyerQueue,
|
|
emailQueue,
|
|
analyticsQueue,
|
|
weeklyAnalyticsQueue,
|
|
cleanupQueue,
|
|
tokenCleanupQueue,
|
|
receiptQueue,
|
|
expiryAlertQueue,
|
|
barcodeQueue,
|
|
} from '../services/queues.server';
|
|
import { getSimpleWeekAndYear } from '../utils/dateUtils';
|
|
import fs from 'node:fs/promises';
|
|
|
|
// ============================================================================
|
|
// RESPONSE TYPES
|
|
// ============================================================================
|
|
// Types for health check responses that will appear in the OpenAPI spec.
|
|
// ============================================================================
|
|
|
|
/**
 * Simple ping response.
 */
interface PingResponse {
  /** Static reply text; the endpoint always answers 'pong'. */
  message: string;
}
|
|
|
|
/**
 * Liveness probe response.
 */
interface LivenessResponse {
  /** Always 'ok' — the process is alive if this endpoint responds at all. */
  status: 'ok';
  /** ISO-8601 timestamp of when the probe was answered. */
  timestamp: string;
}
|
|
|
|
/**
 * Readiness probe response with service status.
 */
interface ReadinessResponse {
  /** Aggregate health derived from the individual service checks below. */
  status: 'healthy' | 'degraded' | 'unhealthy';
  /** ISO-8601 timestamp of when the probe was answered. */
  timestamp: string;
  /** Process uptime in seconds (from process.uptime()). */
  uptime: number;
  /** Per-dependency health details. */
  services: {
    database: ServiceHealth;
    redis: ServiceHealth;
    storage: ServiceHealth;
  };
}
|
|
|
|
/**
 * Startup probe response.
 */
interface StartupResponse {
  /** 'started' once the database is reachable; 'starting' while waiting for it. */
  status: 'started' | 'starting';
  /** ISO-8601 timestamp of when the probe was answered. */
  timestamp: string;
  /** Database health result used to decide the startup status. */
  database: ServiceHealth;
}
|
|
|
|
/**
 * Database schema check response.
 */
interface DbSchemaResponse {
  /** Human-readable confirmation that all required tables exist. */
  message: string;
}
|
|
|
|
/**
 * Storage check response.
 */
interface StorageResponse {
  /** Human-readable confirmation that the storage path is accessible and writable. */
  message: string;
}
|
|
|
|
/**
 * Database pool status response.
 */
interface DbPoolResponse {
  /** Human-readable summary of the pool counters below. */
  message: string;
  /** Total number of clients in the pool. */
  totalCount: number;
  /** Clients currently idle and available for checkout. */
  idleCount: number;
  /** Requests currently waiting for a free client. */
  waitingCount: number;
}
|
|
|
|
/**
 * Server time response.
 */
interface TimeResponse {
  /** Current server time as an ISO-8601 string. */
  currentTime: string;
  /** Calendar year paired with the week number below. */
  year: number;
  /** Week number as computed by getSimpleWeekAndYear. */
  week: number;
}
|
|
|
|
/**
 * Redis health check response.
 */
interface RedisHealthResponse {
  /** Human-readable confirmation that the Redis connection is healthy. */
  message: string;
}
|
|
|
|
/**
 * Queue job counts.
 */
interface QueueJobCounts {
  /** Jobs queued and waiting to be picked up. */
  waiting: number;
  /** Jobs currently being processed by a worker. */
  active: number;
  /** Jobs that have failed. */
  failed: number;
  /** Jobs scheduled for later execution. */
  delayed: number;
}
|
|
|
|
/**
 * Worker heartbeat status.
 */
interface WorkerHeartbeat {
  /** True when the worker's last heartbeat is recent enough to be trusted. */
  alive: boolean;
  /** Timestamp of the last heartbeat, when one was found in Redis. */
  lastSeen?: string;
  /** Process id reported by the worker's heartbeat payload. */
  pid?: number;
  /** Hostname reported by the worker's heartbeat payload. */
  host?: string;
  /** Populated when the heartbeat lookup itself failed. */
  error?: string;
}
|
|
|
|
/**
 * Queue health response with metrics and worker heartbeats.
 */
interface QueuesHealthResponse {
  /** 'unhealthy' when any queue errored or any worker is not alive. */
  status: 'healthy' | 'unhealthy';
  /** ISO-8601 timestamp of when the check ran. */
  timestamp: string;
  /** Job counts per queue name, or an error when that queue was unreachable. */
  queues: Record<string, QueueJobCounts | { error: string }>;
  /** Heartbeat status keyed by worker/queue name. */
  workers: Record<string, WorkerHeartbeat>;
}
|
|
|
|
// ============================================================================
|
|
// HELPER FUNCTIONS
|
|
// ============================================================================
|
|
// Reusable functions for checking service health.
|
|
// ============================================================================
|
|
|
|
/**
|
|
* Checks database connectivity with timing.
|
|
*
|
|
* @returns ServiceHealth object with database status and latency
|
|
*/
|
|
async function checkDatabase(): Promise<ServiceHealth> {
|
|
const start = Date.now();
|
|
try {
|
|
const pool = getPool();
|
|
await pool.query('SELECT 1');
|
|
const latency = Date.now() - start;
|
|
const poolStatus = getPoolStatus();
|
|
|
|
// Consider degraded if waiting connections > 3
|
|
const status = poolStatus.waitingCount > 3 ? 'degraded' : 'healthy';
|
|
|
|
return {
|
|
status,
|
|
latency,
|
|
details: {
|
|
totalConnections: poolStatus.totalCount,
|
|
idleConnections: poolStatus.idleCount,
|
|
waitingConnections: poolStatus.waitingCount,
|
|
} as Record<string, unknown>,
|
|
};
|
|
} catch (error) {
|
|
return {
|
|
status: 'unhealthy',
|
|
latency: Date.now() - start,
|
|
message: error instanceof Error ? error.message : 'Database connection failed',
|
|
};
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Checks Redis connectivity with timing.
|
|
*
|
|
* @returns ServiceHealth object with Redis status and latency
|
|
*/
|
|
async function checkRedis(): Promise<ServiceHealth> {
|
|
const start = Date.now();
|
|
try {
|
|
const reply = await redisConnection.ping();
|
|
const latency = Date.now() - start;
|
|
|
|
if (reply === 'PONG') {
|
|
return { status: 'healthy', latency };
|
|
}
|
|
return {
|
|
status: 'unhealthy',
|
|
latency,
|
|
message: `Unexpected ping response: ${reply}`,
|
|
};
|
|
} catch (error) {
|
|
return {
|
|
status: 'unhealthy',
|
|
latency: Date.now() - start,
|
|
message: error instanceof Error ? error.message : 'Redis connection failed',
|
|
};
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Checks storage accessibility with timing.
|
|
*
|
|
* @returns ServiceHealth object with storage status and latency
|
|
*/
|
|
async function checkStorage(): Promise<ServiceHealth> {
|
|
const storagePath =
|
|
process.env.STORAGE_PATH || '/var/www/flyer-crawler.projectium.com/flyer-images';
|
|
const start = Date.now();
|
|
try {
|
|
await fs.access(storagePath, fs.constants.W_OK);
|
|
return {
|
|
status: 'healthy',
|
|
latency: Date.now() - start,
|
|
details: { path: storagePath },
|
|
};
|
|
} catch {
|
|
return {
|
|
status: 'unhealthy',
|
|
latency: Date.now() - start,
|
|
message: `Storage not accessible: ${storagePath}`,
|
|
};
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// HEALTH CONTROLLER
|
|
// ============================================================================
|
|
|
|
/**
|
|
* Health check controller for monitoring application state.
|
|
*
|
|
* Provides endpoints for Kubernetes liveness/readiness/startup probes
|
|
* and individual service health checks per ADR-020.
|
|
*/
|
|
@Route('health')
|
|
@Tags('Health')
|
|
export class HealthController extends BaseController {
|
|
// ==========================================================================
|
|
// BASIC HEALTH CHECKS
|
|
// ==========================================================================
|
|
|
|
/**
|
|
* Simple ping endpoint.
|
|
*
|
|
* Returns a pong response to verify server is responsive.
|
|
* Use this for basic connectivity checks.
|
|
*
|
|
* @summary Simple ping endpoint
|
|
* @returns A pong response confirming the server is alive
|
|
*/
|
|
@Get('ping')
|
|
@SuccessResponse(200, 'Server is responsive')
|
|
public async ping(): Promise<SuccessResponseType<PingResponse>> {
|
|
return this.success({ message: 'pong' });
|
|
}
|
|
|
|
// ==========================================================================
|
|
// KUBERNETES PROBES (ADR-020)
|
|
// ==========================================================================
|
|
|
|
/**
|
|
* Liveness probe.
|
|
*
|
|
* Returns 200 OK if the server process is running.
|
|
* If this fails, the orchestrator should restart the container.
|
|
* This endpoint is intentionally simple and has no external dependencies.
|
|
*
|
|
* @summary Liveness probe
|
|
* @returns Status indicating the server process is alive
|
|
*/
|
|
@Get('live')
|
|
@SuccessResponse(200, 'Server process is alive')
|
|
public async live(): Promise<SuccessResponseType<LivenessResponse>> {
|
|
return this.success({
|
|
status: 'ok',
|
|
timestamp: new Date().toISOString(),
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Readiness probe.
|
|
*
|
|
* Returns 200 OK if the server is ready to accept traffic.
|
|
* Checks all critical dependencies (database, Redis, storage).
|
|
* If this fails, the orchestrator should remove the container from the load balancer.
|
|
*
|
|
* @summary Readiness probe
|
|
* @returns Service health status for all critical dependencies
|
|
*/
|
|
@Get('ready')
|
|
@SuccessResponse(200, 'Server is ready to accept traffic')
|
|
@Response<ErrorResponse>(503, 'Service is unhealthy and should not receive traffic')
|
|
public async ready(): Promise<SuccessResponseType<ReadinessResponse> | ErrorResponse> {
|
|
// Check all services in parallel for speed
|
|
const [database, redis, storage] = await Promise.all([
|
|
checkDatabase(),
|
|
checkRedis(),
|
|
checkStorage(),
|
|
]);
|
|
|
|
// Determine overall status
|
|
// - 'healthy' if all critical services (db, redis) are healthy
|
|
// - 'degraded' if any service is degraded but none unhealthy
|
|
// - 'unhealthy' if any critical service is unhealthy
|
|
const criticalServices = [database, redis];
|
|
const allServices = [database, redis, storage];
|
|
|
|
let overallStatus: 'healthy' | 'degraded' | 'unhealthy' = 'healthy';
|
|
|
|
if (criticalServices.some((s) => s.status === 'unhealthy')) {
|
|
overallStatus = 'unhealthy';
|
|
} else if (allServices.some((s) => s.status === 'degraded')) {
|
|
overallStatus = 'degraded';
|
|
}
|
|
|
|
const response: ReadinessResponse = {
|
|
status: overallStatus,
|
|
timestamp: new Date().toISOString(),
|
|
uptime: process.uptime(),
|
|
services: {
|
|
database,
|
|
redis,
|
|
storage,
|
|
},
|
|
};
|
|
|
|
// Return appropriate HTTP status code
|
|
// 200 = healthy or degraded (can still handle traffic)
|
|
// 503 = unhealthy (should not receive traffic)
|
|
if (overallStatus === 'unhealthy') {
|
|
this.setStatus(503);
|
|
return this.error(this.ErrorCode.SERVICE_UNAVAILABLE, 'Service unhealthy', response);
|
|
}
|
|
return this.success(response);
|
|
}
|
|
|
|
/**
|
|
* Startup probe.
|
|
*
|
|
* Similar to readiness but used during container startup.
|
|
* The orchestrator will not send liveness/readiness probes until this succeeds.
|
|
* This allows for longer initialization times without triggering restarts.
|
|
*
|
|
* @summary Startup probe for container orchestration
|
|
* @returns Startup status with database health
|
|
*/
|
|
@Get('startup')
|
|
@SuccessResponse(200, 'Server has started successfully')
|
|
@Response<ErrorResponse>(503, 'Server is still starting')
|
|
public async startup(): Promise<SuccessResponseType<StartupResponse> | ErrorResponse> {
|
|
// For startup, we only check database connectivity
|
|
// Redis and storage can be checked later in readiness
|
|
const database = await checkDatabase();
|
|
|
|
if (database.status === 'unhealthy') {
|
|
this.setStatus(503);
|
|
return this.error(this.ErrorCode.SERVICE_UNAVAILABLE, 'Waiting for database connection', {
|
|
status: 'starting',
|
|
database,
|
|
});
|
|
}
|
|
|
|
return this.success({
|
|
status: 'started',
|
|
timestamp: new Date().toISOString(),
|
|
database,
|
|
});
|
|
}
|
|
|
|
// ==========================================================================
|
|
// INDIVIDUAL SERVICE HEALTH CHECKS
|
|
// ==========================================================================
|
|
|
|
/**
|
|
* Database schema check.
|
|
*
|
|
* Checks if all essential database tables exist.
|
|
* This is a critical check to ensure the database schema is correctly set up.
|
|
*
|
|
* @summary Check database schema
|
|
* @returns Message confirming all required tables exist
|
|
*/
|
|
@Get('db-schema')
|
|
@SuccessResponse(200, 'All required database tables exist')
|
|
@Response<ErrorResponse>(500, 'Database schema check failed')
|
|
public async dbSchema(): Promise<SuccessResponseType<DbSchemaResponse> | ErrorResponse> {
|
|
const requiredTables = ['users', 'profiles', 'flyers', 'flyer_items', 'stores'];
|
|
const missingTables = await checkTablesExist(requiredTables);
|
|
|
|
if (missingTables.length > 0) {
|
|
this.setStatus(500);
|
|
return this.error(
|
|
this.ErrorCode.INTERNAL_ERROR,
|
|
`Database schema check failed. Missing tables: ${missingTables.join(', ')}.`,
|
|
);
|
|
}
|
|
|
|
return this.success({ message: 'All required database tables exist.' });
|
|
}
|
|
|
|
/**
|
|
* Storage health check.
|
|
*
|
|
* Verifies that the application's file storage path is accessible and writable.
|
|
* This is important for features like file uploads.
|
|
*
|
|
* @summary Check storage accessibility
|
|
* @returns Message confirming storage is accessible
|
|
*/
|
|
@Get('storage')
|
|
@SuccessResponse(200, 'Storage is accessible and writable')
|
|
@Response<ErrorResponse>(500, 'Storage check failed')
|
|
public async storage(): Promise<SuccessResponseType<StorageResponse> | ErrorResponse> {
|
|
const storagePath =
|
|
process.env.STORAGE_PATH || '/var/www/flyer-crawler.projectium.com/flyer-images';
|
|
|
|
try {
|
|
await fs.access(storagePath, fs.constants.W_OK);
|
|
return this.success({
|
|
message: `Storage directory '${storagePath}' is accessible and writable.`,
|
|
});
|
|
} catch {
|
|
this.setStatus(500);
|
|
return this.error(
|
|
this.ErrorCode.INTERNAL_ERROR,
|
|
`Storage check failed. Ensure the directory '${storagePath}' exists and is writable by the application.`,
|
|
);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Database pool status check.
|
|
*
|
|
* Checks the status of the database connection pool.
|
|
* This helps diagnose issues related to database connection saturation.
|
|
*
|
|
* @summary Check database connection pool status
|
|
* @returns Pool status with connection counts
|
|
*/
|
|
@Get('db-pool')
|
|
@SuccessResponse(200, 'Database pool is healthy')
|
|
@Response<ErrorResponse>(500, 'Database pool may be under stress')
|
|
public async dbPool(): Promise<SuccessResponseType<DbPoolResponse> | ErrorResponse> {
|
|
const status = getPoolStatus();
|
|
const isHealthy = status.waitingCount < 5;
|
|
const message = `Pool Status: ${status.totalCount} total, ${status.idleCount} idle, ${status.waitingCount} waiting.`;
|
|
|
|
if (isHealthy) {
|
|
return this.success({
|
|
message,
|
|
totalCount: status.totalCount,
|
|
idleCount: status.idleCount,
|
|
waitingCount: status.waitingCount,
|
|
});
|
|
}
|
|
|
|
this.setStatus(500);
|
|
return this.error(
|
|
this.ErrorCode.INTERNAL_ERROR,
|
|
`Pool may be under stress. ${message}`,
|
|
status,
|
|
);
|
|
}
|
|
|
|
/**
|
|
* Server time check.
|
|
*
|
|
* Returns the server's current time, year, and week number.
|
|
* Useful for verifying time synchronization and for features dependent on week numbers.
|
|
*
|
|
* @summary Get server time and week number
|
|
* @returns Current server time with year and week number
|
|
*/
|
|
@Get('time')
|
|
@SuccessResponse(200, 'Server time retrieved')
|
|
public async time(): Promise<SuccessResponseType<TimeResponse>> {
|
|
const now = new Date();
|
|
const { year, week } = getSimpleWeekAndYear(now);
|
|
return this.success({
|
|
currentTime: now.toISOString(),
|
|
year,
|
|
week,
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Redis health check.
|
|
*
|
|
* Checks the health of the Redis connection.
|
|
*
|
|
* @summary Check Redis connectivity
|
|
* @returns Message confirming Redis is healthy
|
|
*/
|
|
@Get('redis')
|
|
@SuccessResponse(200, 'Redis connection is healthy')
|
|
@Response<ErrorResponse>(500, 'Redis health check failed')
|
|
public async redis(): Promise<SuccessResponseType<RedisHealthResponse> | ErrorResponse> {
|
|
try {
|
|
const reply = await redisConnection.ping();
|
|
if (reply === 'PONG') {
|
|
return this.success({ message: 'Redis connection is healthy.' });
|
|
}
|
|
throw new Error(`Unexpected Redis ping response: ${reply}`);
|
|
} catch (error) {
|
|
this.setStatus(500);
|
|
const message = error instanceof Error ? error.message : 'Redis health check failed';
|
|
return this.error(this.ErrorCode.INTERNAL_ERROR, message);
|
|
}
|
|
}
|
|
|
|
// ==========================================================================
|
|
// QUEUE HEALTH MONITORING (ADR-053)
|
|
// ==========================================================================
|
|
|
|
/**
|
|
* Queue health and metrics with worker heartbeats.
|
|
*
|
|
* Returns job counts for all BullMQ queues and worker heartbeat status.
|
|
* Use this endpoint to monitor queue depths and detect stuck/frozen workers.
|
|
* Implements ADR-053: Worker Health Checks and Stalled Job Monitoring.
|
|
*
|
|
* @summary Queue health and metrics
|
|
* @returns Queue metrics and worker heartbeat status
|
|
*/
|
|
@Get('queues')
|
|
@SuccessResponse(200, 'Queue metrics retrieved successfully')
|
|
@Response<ErrorResponse>(503, 'One or more queues or workers unavailable')
|
|
public async queues(): Promise<SuccessResponseType<QueuesHealthResponse> | ErrorResponse> {
|
|
// Define all queues to monitor
|
|
const queues = [
|
|
{ name: 'flyer-processing', queue: flyerQueue },
|
|
{ name: 'email-sending', queue: emailQueue },
|
|
{ name: 'analytics-reporting', queue: analyticsQueue },
|
|
{ name: 'weekly-analytics-reporting', queue: weeklyAnalyticsQueue },
|
|
{ name: 'file-cleanup', queue: cleanupQueue },
|
|
{ name: 'token-cleanup', queue: tokenCleanupQueue },
|
|
{ name: 'receipt-processing', queue: receiptQueue },
|
|
{ name: 'expiry-alerts', queue: expiryAlertQueue },
|
|
{ name: 'barcode-detection', queue: barcodeQueue },
|
|
];
|
|
|
|
// Fetch job counts for all queues in parallel
|
|
const queueMetrics = await Promise.all(
|
|
queues.map(async ({ name, queue }) => {
|
|
try {
|
|
const counts = await queue.getJobCounts();
|
|
return {
|
|
name,
|
|
counts: {
|
|
waiting: counts.waiting || 0,
|
|
active: counts.active || 0,
|
|
failed: counts.failed || 0,
|
|
delayed: counts.delayed || 0,
|
|
},
|
|
};
|
|
} catch (error) {
|
|
// If individual queue fails, return error state
|
|
return {
|
|
name,
|
|
error: error instanceof Error ? error.message : 'Unknown error',
|
|
};
|
|
}
|
|
}),
|
|
);
|
|
|
|
// Fetch worker heartbeats in parallel
|
|
const workerNames = queues.map((q) => q.name);
|
|
const workerHeartbeats = await Promise.all(
|
|
workerNames.map(async (name) => {
|
|
try {
|
|
const key = `worker:heartbeat:${name}`;
|
|
const value = await redisConnection.get(key);
|
|
|
|
if (!value) {
|
|
return { name, alive: false };
|
|
}
|
|
|
|
const heartbeat = JSON.parse(value) as {
|
|
timestamp: string;
|
|
pid: number;
|
|
host: string;
|
|
};
|
|
const lastSeenMs = new Date(heartbeat.timestamp).getTime();
|
|
const nowMs = Date.now();
|
|
const ageSeconds = (nowMs - lastSeenMs) / 1000;
|
|
|
|
// Consider alive if last heartbeat < 60 seconds ago
|
|
const alive = ageSeconds < 60;
|
|
|
|
return {
|
|
name,
|
|
alive,
|
|
lastSeen: heartbeat.timestamp,
|
|
pid: heartbeat.pid,
|
|
host: heartbeat.host,
|
|
};
|
|
} catch (error) {
|
|
// If heartbeat check fails, mark as unknown
|
|
return {
|
|
name,
|
|
alive: false,
|
|
error: error instanceof Error ? error.message : 'Unknown error',
|
|
};
|
|
}
|
|
}),
|
|
);
|
|
|
|
// Build response objects
|
|
const queuesData: Record<string, QueueJobCounts | { error: string }> = {};
|
|
const workersData: Record<string, WorkerHeartbeat> = {};
|
|
let hasErrors = false;
|
|
|
|
for (const metric of queueMetrics) {
|
|
if ('error' in metric && metric.error) {
|
|
queuesData[metric.name] = { error: metric.error };
|
|
hasErrors = true;
|
|
} else if ('counts' in metric && metric.counts) {
|
|
queuesData[metric.name] = metric.counts;
|
|
}
|
|
}
|
|
|
|
for (const heartbeat of workerHeartbeats) {
|
|
if ('error' in heartbeat && heartbeat.error) {
|
|
workersData[heartbeat.name] = { alive: false, error: heartbeat.error };
|
|
} else if (!heartbeat.alive) {
|
|
workersData[heartbeat.name] = { alive: false };
|
|
hasErrors = true;
|
|
} else {
|
|
workersData[heartbeat.name] = {
|
|
alive: heartbeat.alive,
|
|
lastSeen: heartbeat.lastSeen,
|
|
pid: heartbeat.pid,
|
|
host: heartbeat.host,
|
|
};
|
|
}
|
|
}
|
|
|
|
const response: QueuesHealthResponse = {
|
|
status: hasErrors ? 'unhealthy' : 'healthy',
|
|
timestamp: new Date().toISOString(),
|
|
queues: queuesData,
|
|
workers: workersData,
|
|
};
|
|
|
|
if (hasErrors) {
|
|
this.setStatus(503);
|
|
return this.error(
|
|
this.ErrorCode.SERVICE_UNAVAILABLE,
|
|
'One or more queues or workers unavailable',
|
|
response,
|
|
);
|
|
}
|
|
|
|
return this.success(response);
|
|
}
|
|
}
|