Some checks failed
Deploy to Test Environment / deploy-to-test (push) Failing after 3m58s
674 lines
20 KiB
TypeScript
674 lines
20 KiB
TypeScript
// src/controllers/health.controller.ts
|
|
// ============================================================================
|
|
// HEALTH CONTROLLER
|
|
// ============================================================================
|
|
// Provides health check endpoints for monitoring the application state,
|
|
// implementing ADR-020: Health Checks and Liveness/Readiness Probes.
|
|
//
|
|
// This controller exposes endpoints for:
|
|
// - Liveness probe (/live) - Is the server process running?
|
|
// - Readiness probe (/ready) - Is the server ready to accept traffic?
|
|
// - Startup probe (/startup) - Has the server completed initialization?
|
|
// - Individual service health checks (db, redis, storage, queues)
|
|
// ============================================================================
|
|
|
|
import { Get, Route, Tags, SuccessResponse, Response } from 'tsoa';
|
|
import { BaseController } from './base.controller';
|
|
import type { SuccessResponse as SuccessResponseType, ErrorResponse, ServiceHealth } from './types';
|
|
import { getPoolStatus, getPool, checkTablesExist } from '../services/db/connection.db';
|
|
import { connection as redisConnection } from '../services/queueService.server';
|
|
import {
|
|
flyerQueue,
|
|
emailQueue,
|
|
analyticsQueue,
|
|
weeklyAnalyticsQueue,
|
|
cleanupQueue,
|
|
tokenCleanupQueue,
|
|
receiptQueue,
|
|
expiryAlertQueue,
|
|
barcodeQueue,
|
|
} from '../services/queues.server';
|
|
import { getSimpleWeekAndYear } from '../utils/dateUtils';
|
|
import fs from 'node:fs/promises';
|
|
|
|
// ============================================================================
|
|
// RESPONSE TYPES
|
|
// ============================================================================
|
|
// Types for health check responses that will appear in the OpenAPI spec.
|
|
// ============================================================================
|
|
|
|
/**
 * Simple ping response.
 */
interface PingResponse {
  /** Static reply text; the endpoint always answers 'pong'. */
  message: string;
}
|
|
|
|
/**
 * Liveness probe response.
 */
interface LivenessResponse {
  /** Always 'ok' — the process is alive if this endpoint responds at all. */
  status: 'ok';
  /** ISO-8601 timestamp of when the probe was answered. */
  timestamp: string;
}
|
|
|
|
/**
 * Readiness probe response with service status.
 */
interface ReadinessResponse {
  /** Aggregate health derived from the individual service checks below. */
  status: 'healthy' | 'degraded' | 'unhealthy';
  /** ISO-8601 timestamp of when the probe was answered. */
  timestamp: string;
  /** Process uptime in seconds (from process.uptime()). */
  uptime: number;
  /** Per-dependency health details. */
  services: {
    database: ServiceHealth;
    redis: ServiceHealth;
    storage: ServiceHealth;
  };
}
|
|
|
|
/**
 * Startup probe response.
 */
interface StartupResponse {
  /** 'started' once the database is reachable; 'starting' while waiting for it. */
  status: 'started' | 'starting';
  /** ISO-8601 timestamp of when the probe was answered. */
  timestamp: string;
  /** Database health result used to decide the startup status. */
  database: ServiceHealth;
}
|
|
|
|
/**
 * Database schema check response.
 */
interface DbSchemaResponse {
  /** Human-readable confirmation that all required tables exist. */
  message: string;
}
|
|
|
|
/**
 * Storage check response.
 */
interface StorageResponse {
  /** Human-readable confirmation that the storage path is accessible and writable. */
  message: string;
}
|
|
|
|
/**
 * Database pool status response.
 */
interface DbPoolResponse {
  /** Human-readable summary of the pool counters below. */
  message: string;
  /** Total number of clients in the pool. */
  totalCount: number;
  /** Clients currently idle and available for checkout. */
  idleCount: number;
  /** Requests currently waiting for a free client. */
  waitingCount: number;
}
|
|
|
|
/**
 * Server time response.
 */
interface TimeResponse {
  /** Current server time as an ISO-8601 string. */
  currentTime: string;
  /** Calendar year paired with the week number below. */
  year: number;
  /** Week number as computed by getSimpleWeekAndYear. */
  week: number;
}
|
|
|
|
/**
 * Redis health check response.
 */
interface RedisHealthResponse {
  /** Human-readable confirmation that the Redis connection is healthy. */
  message: string;
}
|
|
|
|
/**
 * Queue job counts.
 */
interface QueueJobCounts {
  /** Jobs queued and waiting to be picked up. */
  waiting: number;
  /** Jobs currently being processed by a worker. */
  active: number;
  /** Jobs that have failed. */
  failed: number;
  /** Jobs scheduled for later execution. */
  delayed: number;
}
|
|
|
|
/**
 * Worker heartbeat status.
 */
interface WorkerHeartbeat {
  /** True when the worker's last heartbeat is recent enough to be trusted. */
  alive: boolean;
  /** Timestamp of the last heartbeat, when one was found in Redis. */
  lastSeen?: string;
  /** Process id reported by the worker's heartbeat payload. */
  pid?: number;
  /** Hostname reported by the worker's heartbeat payload. */
  host?: string;
  /** Populated when the heartbeat lookup itself failed. */
  error?: string;
}
|
|
|
|
/**
 * Queue health response with metrics and worker heartbeats.
 */
interface QueuesHealthResponse {
  /** 'unhealthy' when any queue errored or any worker is not alive. */
  status: 'healthy' | 'unhealthy';
  /** ISO-8601 timestamp of when the check ran. */
  timestamp: string;
  /** Job counts per queue name, or an error when that queue was unreachable. */
  queues: Record<string, QueueJobCounts | { error: string }>;
  /** Heartbeat status keyed by worker/queue name. */
  workers: Record<string, WorkerHeartbeat>;
}
|
|
|
|
// ============================================================================
|
|
// HELPER FUNCTIONS
|
|
// ============================================================================
|
|
// Reusable functions for checking service health.
|
|
// ============================================================================
|
|
|
|
/**
|
|
* Checks database connectivity with timing.
|
|
*
|
|
* @returns ServiceHealth object with database status and latency
|
|
*/
|
|
async function checkDatabase(): Promise<ServiceHealth> {
|
|
const start = Date.now();
|
|
try {
|
|
const pool = getPool();
|
|
await pool.query('SELECT 1');
|
|
const latency = Date.now() - start;
|
|
const poolStatus = getPoolStatus();
|
|
|
|
// Consider degraded if waiting connections > 3
|
|
const status = poolStatus.waitingCount > 3 ? 'degraded' : 'healthy';
|
|
|
|
return {
|
|
status,
|
|
latency,
|
|
details: {
|
|
totalConnections: poolStatus.totalCount,
|
|
idleConnections: poolStatus.idleCount,
|
|
waitingConnections: poolStatus.waitingCount,
|
|
} as Record<string, unknown>,
|
|
};
|
|
} catch (error) {
|
|
return {
|
|
status: 'unhealthy',
|
|
latency: Date.now() - start,
|
|
message: error instanceof Error ? error.message : 'Database connection failed',
|
|
};
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Checks Redis connectivity with timing.
|
|
*
|
|
* @returns ServiceHealth object with Redis status and latency
|
|
*/
|
|
async function checkRedis(): Promise<ServiceHealth> {
|
|
const start = Date.now();
|
|
try {
|
|
const reply = await redisConnection.ping();
|
|
const latency = Date.now() - start;
|
|
|
|
if (reply === 'PONG') {
|
|
return { status: 'healthy', latency };
|
|
}
|
|
return {
|
|
status: 'unhealthy',
|
|
latency,
|
|
message: `Unexpected ping response: ${reply}`,
|
|
};
|
|
} catch (error) {
|
|
return {
|
|
status: 'unhealthy',
|
|
latency: Date.now() - start,
|
|
message: error instanceof Error ? error.message : 'Redis connection failed',
|
|
};
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Checks storage accessibility with timing.
|
|
*
|
|
* @returns ServiceHealth object with storage status and latency
|
|
*/
|
|
async function checkStorage(): Promise<ServiceHealth> {
|
|
const storagePath =
|
|
process.env.STORAGE_PATH || '/var/www/flyer-crawler.projectium.com/flyer-images';
|
|
const start = Date.now();
|
|
try {
|
|
await fs.access(storagePath, fs.constants.W_OK);
|
|
return {
|
|
status: 'healthy',
|
|
latency: Date.now() - start,
|
|
details: { path: storagePath },
|
|
};
|
|
} catch {
|
|
return {
|
|
status: 'unhealthy',
|
|
latency: Date.now() - start,
|
|
message: `Storage not accessible: ${storagePath}`,
|
|
};
|
|
}
|
|
}
|
|
|
|
// ============================================================================
|
|
// HEALTH CONTROLLER
|
|
// ============================================================================
|
|
|
|
/**
|
|
* Health check controller for monitoring application state.
|
|
*
|
|
* Provides endpoints for Kubernetes liveness/readiness/startup probes
|
|
* and individual service health checks per ADR-020.
|
|
*/
|
|
@Route('health')
|
|
@Tags('Health')
|
|
export class HealthController extends BaseController {
|
|
// ==========================================================================
|
|
// BASIC HEALTH CHECKS
|
|
// ==========================================================================
|
|
|
|
/**
|
|
* Simple ping endpoint.
|
|
*
|
|
* Returns a pong response to verify server is responsive.
|
|
* Use this for basic connectivity checks.
|
|
*
|
|
* @summary Simple ping endpoint
|
|
* @returns A pong response confirming the server is alive
|
|
*/
|
|
@Get('ping')
|
|
@SuccessResponse(200, 'Server is responsive')
|
|
public async ping(): Promise<SuccessResponseType<PingResponse>> {
|
|
return this.success({ message: 'pong' });
|
|
}
|
|
|
|
// ==========================================================================
|
|
// KUBERNETES PROBES (ADR-020)
|
|
// ==========================================================================
|
|
|
|
/**
|
|
* Liveness probe.
|
|
*
|
|
* Returns 200 OK if the server process is running.
|
|
* If this fails, the orchestrator should restart the container.
|
|
* This endpoint is intentionally simple and has no external dependencies.
|
|
*
|
|
* @summary Liveness probe
|
|
* @returns Status indicating the server process is alive
|
|
*/
|
|
@Get('live')
|
|
@SuccessResponse(200, 'Server process is alive')
|
|
public async live(): Promise<SuccessResponseType<LivenessResponse>> {
|
|
return this.success({
|
|
status: 'ok',
|
|
timestamp: new Date().toISOString(),
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Readiness probe.
|
|
*
|
|
* Returns 200 OK if the server is ready to accept traffic.
|
|
* Checks all critical dependencies (database, Redis, storage).
|
|
* If this fails, the orchestrator should remove the container from the load balancer.
|
|
*
|
|
* @summary Readiness probe
|
|
* @returns Service health status for all critical dependencies
|
|
*/
|
|
@Get('ready')
|
|
@SuccessResponse(200, 'Server is ready to accept traffic')
|
|
@Response<ErrorResponse>(503, 'Service is unhealthy and should not receive traffic')
|
|
public async ready(): Promise<SuccessResponseType<ReadinessResponse> | ErrorResponse> {
|
|
// Check all services in parallel for speed
|
|
const [database, redis, storage] = await Promise.all([
|
|
checkDatabase(),
|
|
checkRedis(),
|
|
checkStorage(),
|
|
]);
|
|
|
|
// Determine overall status
|
|
// - 'healthy' if all critical services (db, redis) are healthy
|
|
// - 'degraded' if any service is degraded but none unhealthy
|
|
// - 'unhealthy' if any critical service is unhealthy
|
|
const criticalServices = [database, redis];
|
|
const allServices = [database, redis, storage];
|
|
|
|
let overallStatus: 'healthy' | 'degraded' | 'unhealthy' = 'healthy';
|
|
|
|
if (criticalServices.some((s) => s.status === 'unhealthy')) {
|
|
overallStatus = 'unhealthy';
|
|
} else if (allServices.some((s) => s.status === 'degraded')) {
|
|
overallStatus = 'degraded';
|
|
}
|
|
|
|
const response: ReadinessResponse = {
|
|
status: overallStatus,
|
|
timestamp: new Date().toISOString(),
|
|
uptime: process.uptime(),
|
|
services: {
|
|
database,
|
|
redis,
|
|
storage,
|
|
},
|
|
};
|
|
|
|
// Return appropriate HTTP status code
|
|
// 200 = healthy or degraded (can still handle traffic)
|
|
// 503 = unhealthy (should not receive traffic)
|
|
if (overallStatus === 'unhealthy') {
|
|
this.setStatus(503);
|
|
return this.error(this.ErrorCode.SERVICE_UNAVAILABLE, 'Service unhealthy', response);
|
|
}
|
|
return this.success(response);
|
|
}
|
|
|
|
/**
|
|
* Startup probe.
|
|
*
|
|
* Similar to readiness but used during container startup.
|
|
* The orchestrator will not send liveness/readiness probes until this succeeds.
|
|
* This allows for longer initialization times without triggering restarts.
|
|
*
|
|
* @summary Startup probe for container orchestration
|
|
* @returns Startup status with database health
|
|
*/
|
|
@Get('startup')
|
|
@SuccessResponse(200, 'Server has started successfully')
|
|
@Response<ErrorResponse>(503, 'Server is still starting')
|
|
public async startup(): Promise<SuccessResponseType<StartupResponse> | ErrorResponse> {
|
|
// For startup, we only check database connectivity
|
|
// Redis and storage can be checked later in readiness
|
|
const database = await checkDatabase();
|
|
|
|
if (database.status === 'unhealthy') {
|
|
this.setStatus(503);
|
|
return this.error(this.ErrorCode.SERVICE_UNAVAILABLE, 'Waiting for database connection', {
|
|
status: 'starting',
|
|
database,
|
|
});
|
|
}
|
|
|
|
return this.success({
|
|
status: 'started',
|
|
timestamp: new Date().toISOString(),
|
|
database,
|
|
});
|
|
}
|
|
|
|
// ==========================================================================
|
|
// INDIVIDUAL SERVICE HEALTH CHECKS
|
|
// ==========================================================================
|
|
|
|
/**
|
|
* Database schema check.
|
|
*
|
|
* Checks if all essential database tables exist.
|
|
* This is a critical check to ensure the database schema is correctly set up.
|
|
*
|
|
* @summary Check database schema
|
|
* @returns Message confirming all required tables exist
|
|
*/
|
|
@Get('db-schema')
|
|
@SuccessResponse(200, 'All required database tables exist')
|
|
@Response<ErrorResponse>(500, 'Database schema check failed')
|
|
public async dbSchema(): Promise<SuccessResponseType<DbSchemaResponse> | ErrorResponse> {
|
|
const requiredTables = ['users', 'profiles', 'flyers', 'flyer_items', 'stores'];
|
|
const missingTables = await checkTablesExist(requiredTables);
|
|
|
|
if (missingTables.length > 0) {
|
|
this.setStatus(500);
|
|
return this.error(
|
|
this.ErrorCode.INTERNAL_ERROR,
|
|
`Database schema check failed. Missing tables: ${missingTables.join(', ')}.`,
|
|
);
|
|
}
|
|
|
|
return this.success({ message: 'All required database tables exist.' });
|
|
}
|
|
|
|
/**
|
|
* Storage health check.
|
|
*
|
|
* Verifies that the application's file storage path is accessible and writable.
|
|
* This is important for features like file uploads.
|
|
*
|
|
* @summary Check storage accessibility
|
|
* @returns Message confirming storage is accessible
|
|
*/
|
|
@Get('storage')
|
|
@SuccessResponse(200, 'Storage is accessible and writable')
|
|
@Response<ErrorResponse>(500, 'Storage check failed')
|
|
public async storage(): Promise<SuccessResponseType<StorageResponse> | ErrorResponse> {
|
|
const storagePath =
|
|
process.env.STORAGE_PATH || '/var/www/flyer-crawler.projectium.com/flyer-images';
|
|
|
|
try {
|
|
await fs.access(storagePath, fs.constants.W_OK);
|
|
return this.success({
|
|
message: `Storage directory '${storagePath}' is accessible and writable.`,
|
|
});
|
|
} catch {
|
|
this.setStatus(500);
|
|
return this.error(
|
|
this.ErrorCode.INTERNAL_ERROR,
|
|
`Storage check failed. Ensure the directory '${storagePath}' exists and is writable by the application.`,
|
|
);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Database pool status check.
|
|
*
|
|
* Checks the status of the database connection pool.
|
|
* This helps diagnose issues related to database connection saturation.
|
|
*
|
|
* @summary Check database connection pool status
|
|
* @returns Pool status with connection counts
|
|
*/
|
|
@Get('db-pool')
|
|
@SuccessResponse(200, 'Database pool is healthy')
|
|
@Response<ErrorResponse>(500, 'Database pool may be under stress')
|
|
public async dbPool(): Promise<SuccessResponseType<DbPoolResponse> | ErrorResponse> {
|
|
const status = getPoolStatus();
|
|
const isHealthy = status.waitingCount < 5;
|
|
const message = `Pool Status: ${status.totalCount} total, ${status.idleCount} idle, ${status.waitingCount} waiting.`;
|
|
|
|
if (isHealthy) {
|
|
return this.success({
|
|
message,
|
|
totalCount: status.totalCount,
|
|
idleCount: status.idleCount,
|
|
waitingCount: status.waitingCount,
|
|
});
|
|
}
|
|
|
|
this.setStatus(500);
|
|
return this.error(
|
|
this.ErrorCode.INTERNAL_ERROR,
|
|
`Pool may be under stress. ${message}`,
|
|
status,
|
|
);
|
|
}
|
|
|
|
/**
|
|
* Server time check.
|
|
*
|
|
* Returns the server's current time, year, and week number.
|
|
* Useful for verifying time synchronization and for features dependent on week numbers.
|
|
*
|
|
* @summary Get server time and week number
|
|
* @returns Current server time with year and week number
|
|
*/
|
|
@Get('time')
|
|
@SuccessResponse(200, 'Server time retrieved')
|
|
public async time(): Promise<SuccessResponseType<TimeResponse>> {
|
|
const now = new Date();
|
|
const { year, week } = getSimpleWeekAndYear(now);
|
|
return this.success({
|
|
currentTime: now.toISOString(),
|
|
year,
|
|
week,
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Redis health check.
|
|
*
|
|
* Checks the health of the Redis connection.
|
|
*
|
|
* @summary Check Redis connectivity
|
|
* @returns Message confirming Redis is healthy
|
|
*/
|
|
@Get('redis')
|
|
@SuccessResponse(200, 'Redis connection is healthy')
|
|
@Response<ErrorResponse>(500, 'Redis health check failed')
|
|
public async redis(): Promise<SuccessResponseType<RedisHealthResponse> | ErrorResponse> {
|
|
try {
|
|
const reply = await redisConnection.ping();
|
|
if (reply === 'PONG') {
|
|
return this.success({ message: 'Redis connection is healthy.' });
|
|
}
|
|
throw new Error(`Unexpected Redis ping response: ${reply}`);
|
|
} catch (error) {
|
|
this.setStatus(500);
|
|
const message = error instanceof Error ? error.message : 'Redis health check failed';
|
|
return this.error(this.ErrorCode.INTERNAL_ERROR, message);
|
|
}
|
|
}
|
|
|
|
// ==========================================================================
|
|
// QUEUE HEALTH MONITORING (ADR-053)
|
|
// ==========================================================================
|
|
|
|
/**
|
|
* Queue health and metrics with worker heartbeats.
|
|
*
|
|
* Returns job counts for all BullMQ queues and worker heartbeat status.
|
|
* Use this endpoint to monitor queue depths and detect stuck/frozen workers.
|
|
* Implements ADR-053: Worker Health Checks and Stalled Job Monitoring.
|
|
*
|
|
* @summary Queue health and metrics
|
|
* @returns Queue metrics and worker heartbeat status
|
|
*/
|
|
@Get('queues')
|
|
@SuccessResponse(200, 'Queue metrics retrieved successfully')
|
|
@Response<ErrorResponse>(503, 'One or more queues or workers unavailable')
|
|
public async queues(): Promise<SuccessResponseType<QueuesHealthResponse> | ErrorResponse> {
|
|
// Define all queues to monitor
|
|
const queues = [
|
|
{ name: 'flyer-processing', queue: flyerQueue },
|
|
{ name: 'email-sending', queue: emailQueue },
|
|
{ name: 'analytics-reporting', queue: analyticsQueue },
|
|
{ name: 'weekly-analytics-reporting', queue: weeklyAnalyticsQueue },
|
|
{ name: 'file-cleanup', queue: cleanupQueue },
|
|
{ name: 'token-cleanup', queue: tokenCleanupQueue },
|
|
{ name: 'receipt-processing', queue: receiptQueue },
|
|
{ name: 'expiry-alerts', queue: expiryAlertQueue },
|
|
{ name: 'barcode-detection', queue: barcodeQueue },
|
|
];
|
|
|
|
// Fetch job counts for all queues in parallel
|
|
const queueMetrics = await Promise.all(
|
|
queues.map(async ({ name, queue }) => {
|
|
try {
|
|
const counts = await queue.getJobCounts();
|
|
return {
|
|
name,
|
|
counts: {
|
|
waiting: counts.waiting || 0,
|
|
active: counts.active || 0,
|
|
failed: counts.failed || 0,
|
|
delayed: counts.delayed || 0,
|
|
},
|
|
};
|
|
} catch (error) {
|
|
// If individual queue fails, return error state
|
|
return {
|
|
name,
|
|
error: error instanceof Error ? error.message : 'Unknown error',
|
|
};
|
|
}
|
|
}),
|
|
);
|
|
|
|
// Fetch worker heartbeats in parallel
|
|
const workerNames = queues.map((q) => q.name);
|
|
const workerHeartbeats = await Promise.all(
|
|
workerNames.map(async (name) => {
|
|
try {
|
|
const key = `worker:heartbeat:${name}`;
|
|
const value = await redisConnection.get(key);
|
|
|
|
if (!value) {
|
|
return { name, alive: false };
|
|
}
|
|
|
|
const heartbeat = JSON.parse(value) as {
|
|
timestamp: string;
|
|
pid: number;
|
|
host: string;
|
|
};
|
|
const lastSeenMs = new Date(heartbeat.timestamp).getTime();
|
|
const nowMs = Date.now();
|
|
const ageSeconds = (nowMs - lastSeenMs) / 1000;
|
|
|
|
// Consider alive if last heartbeat < 60 seconds ago
|
|
const alive = ageSeconds < 60;
|
|
|
|
return {
|
|
name,
|
|
alive,
|
|
lastSeen: heartbeat.timestamp,
|
|
pid: heartbeat.pid,
|
|
host: heartbeat.host,
|
|
};
|
|
} catch (error) {
|
|
// If heartbeat check fails, mark as unknown
|
|
return {
|
|
name,
|
|
alive: false,
|
|
error: error instanceof Error ? error.message : 'Unknown error',
|
|
};
|
|
}
|
|
}),
|
|
);
|
|
|
|
// Build response objects
|
|
const queuesData: Record<string, QueueJobCounts | { error: string }> = {};
|
|
const workersData: Record<string, WorkerHeartbeat> = {};
|
|
let hasErrors = false;
|
|
|
|
for (const metric of queueMetrics) {
|
|
if ('error' in metric && metric.error) {
|
|
queuesData[metric.name] = { error: metric.error };
|
|
hasErrors = true;
|
|
} else if ('counts' in metric && metric.counts) {
|
|
queuesData[metric.name] = metric.counts;
|
|
}
|
|
}
|
|
|
|
for (const heartbeat of workerHeartbeats) {
|
|
if ('error' in heartbeat && heartbeat.error) {
|
|
workersData[heartbeat.name] = { alive: false, error: heartbeat.error };
|
|
} else if (!heartbeat.alive) {
|
|
workersData[heartbeat.name] = { alive: false };
|
|
hasErrors = true;
|
|
} else {
|
|
workersData[heartbeat.name] = {
|
|
alive: heartbeat.alive,
|
|
lastSeen: heartbeat.lastSeen,
|
|
pid: heartbeat.pid,
|
|
host: heartbeat.host,
|
|
};
|
|
}
|
|
}
|
|
|
|
const response: QueuesHealthResponse = {
|
|
status: hasErrors ? 'unhealthy' : 'healthy',
|
|
timestamp: new Date().toISOString(),
|
|
queues: queuesData,
|
|
workers: workersData,
|
|
};
|
|
|
|
if (hasErrors) {
|
|
this.setStatus(503);
|
|
return this.error(
|
|
this.ErrorCode.SERVICE_UNAVAILABLE,
|
|
'One or more queues or workers unavailable',
|
|
response,
|
|
);
|
|
}
|
|
|
|
return this.success(response);
|
|
}
|
|
}
|