PM2 Process Isolation
All checks were successful
Deploy to Test Environment / deploy-to-test (push) Successful in 30m15s
All checks were successful
Deploy to Test Environment / deploy-to-test (push) Successful in 30m15s
This commit is contained in:
@@ -127,6 +127,17 @@ jobs:
|
|||||||
rsync -avz dist/ "$APP_PATH"
|
rsync -avz dist/ "$APP_PATH"
|
||||||
echo "Application deployment complete."
|
echo "Application deployment complete."
|
||||||
|
|
||||||
|
- name: Log Workflow Metadata
|
||||||
|
run: |
|
||||||
|
echo "=== WORKFLOW METADATA ==="
|
||||||
|
echo "Workflow file: deploy-to-prod.yml"
|
||||||
|
echo "Workflow file hash: $(sha256sum .gitea/workflows/deploy-to-prod.yml | cut -d' ' -f1)"
|
||||||
|
echo "Git commit: $(git rev-parse HEAD)"
|
||||||
|
echo "Git branch: $(git rev-parse --abbrev-ref HEAD)"
|
||||||
|
echo "Timestamp: $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
|
||||||
|
echo "Actor: ${{ gitea.actor }}"
|
||||||
|
echo "=== END METADATA ==="
|
||||||
|
|
||||||
- name: Install Backend Dependencies and Restart Production Server
|
- name: Install Backend Dependencies and Restart Production Server
|
||||||
env:
|
env:
|
||||||
# --- Production Secrets Injection ---
|
# --- Production Secrets Injection ---
|
||||||
@@ -165,9 +176,74 @@ jobs:
|
|||||||
cd /var/www/flyer-crawler.projectium.com
|
cd /var/www/flyer-crawler.projectium.com
|
||||||
npm install --omit=dev
|
npm install --omit=dev
|
||||||
|
|
||||||
# --- Cleanup Errored Processes ---
|
# === PRE-CLEANUP PM2 STATE LOGGING ===
|
||||||
|
echo "=== PRE-CLEANUP PM2 STATE ==="
|
||||||
|
pm2 jlist
|
||||||
|
echo "=== END PRE-CLEANUP STATE ==="
|
||||||
|
|
||||||
|
# --- Cleanup Errored Processes with Defense-in-Depth Safeguards ---
|
||||||
echo "Cleaning up errored or stopped PRODUCTION PM2 processes..."
|
echo "Cleaning up errored or stopped PRODUCTION PM2 processes..."
|
||||||
node -e "const exec = require('child_process').execSync; try { const list = JSON.parse(exec('pm2 jlist').toString()); const prodProcesses = ['flyer-crawler-api', 'flyer-crawler-worker', 'flyer-crawler-analytics-worker']; list.forEach(p => { if ((p.pm2_env.status === 'errored' || p.pm2_env.status === 'stopped') && prodProcesses.includes(p.name)) { console.log('Deleting ' + p.pm2_env.status + ' production process: ' + p.name + ' (' + p.pm2_env.pm_id + ')'); try { exec('pm2 delete ' + p.pm2_env.pm_id); } catch(e) { console.error('Failed to delete ' + p.pm2_env.pm_id); } } }); console.log('✅ Production process cleanup complete.'); } catch (e) { console.error('Error cleaning up processes:', e); }"
|
node -e "
|
||||||
|
const exec = require('child_process').execSync;
|
||||||
|
try {
|
||||||
|
const list = JSON.parse(exec('pm2 jlist').toString());
|
||||||
|
const prodProcesses = ['flyer-crawler-api', 'flyer-crawler-worker', 'flyer-crawler-analytics-worker'];
|
||||||
|
|
||||||
|
// Filter for processes that match our criteria
|
||||||
|
const targetProcesses = list.filter(p =>
|
||||||
|
(p.pm2_env.status === 'errored' || p.pm2_env.status === 'stopped') &&
|
||||||
|
prodProcesses.includes(p.name)
|
||||||
|
);
|
||||||
|
|
||||||
|
// SAFEGUARD 1: Process count validation
|
||||||
|
const totalProcesses = list.length;
|
||||||
|
if (targetProcesses.length === totalProcesses && totalProcesses > 3) {
|
||||||
|
console.error('SAFETY ABORT: Filter would delete ALL processes!');
|
||||||
|
console.error('Total processes: ' + totalProcesses + ', Target processes: ' + targetProcesses.length);
|
||||||
|
console.error('This indicates a potential filter bug. Aborting cleanup.');
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// SAFEGUARD 2: Explicit name verification
|
||||||
|
console.log('Found ' + targetProcesses.length + ' PRODUCTION processes to clean:');
|
||||||
|
targetProcesses.forEach(p => {
|
||||||
|
console.log(' - ' + p.name + ' (status: ' + p.pm2_env.status + ', pm_id: ' + p.pm2_env.pm_id + ')');
|
||||||
|
});
|
||||||
|
|
||||||
|
// Perform the cleanup
|
||||||
|
targetProcesses.forEach(p => {
|
||||||
|
console.log('Deleting ' + p.pm2_env.status + ' production process: ' + p.name + ' (' + p.pm2_env.pm_id + ')');
|
||||||
|
try {
|
||||||
|
exec('pm2 delete ' + p.pm2_env.pm_id);
|
||||||
|
} catch(e) {
|
||||||
|
console.error('Failed to delete ' + p.pm2_env.pm_id);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log('Production process cleanup complete.');
|
||||||
|
} catch (e) {
|
||||||
|
console.error('Error cleaning up processes:', e);
|
||||||
|
}
|
||||||
|
"
|
||||||
|
|
||||||
|
# === POST-CLEANUP VERIFICATION ===
|
||||||
|
echo "=== POST-CLEANUP VERIFICATION ==="
|
||||||
|
pm2 jlist | node -e "
|
||||||
|
try {
|
||||||
|
const list = JSON.parse(require('fs').readFileSync(0, 'utf-8'));
|
||||||
|
const prodProcesses = list.filter(p => p.name && p.name.startsWith('flyer-crawler-') && !p.name.endsWith('-test') && !p.name.endsWith('-dev'));
|
||||||
|
console.log('Production processes after cleanup:');
|
||||||
|
prodProcesses.forEach(p => {
|
||||||
|
console.log(' ' + p.name + ': ' + p.pm2_env.status);
|
||||||
|
});
|
||||||
|
if (prodProcesses.length === 0) {
|
||||||
|
console.log(' (no production processes currently running)');
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.error('Failed to parse PM2 output:', e.message);
|
||||||
|
}
|
||||||
|
"
|
||||||
|
echo "=== END POST-CLEANUP VERIFICATION ==="
|
||||||
|
|
||||||
# --- Version Check Logic ---
|
# --- Version Check Logic ---
|
||||||
# Get the version from the newly deployed package.json
|
# Get the version from the newly deployed package.json
|
||||||
|
|||||||
@@ -87,6 +87,17 @@ jobs:
|
|||||||
- name: Lint Check
|
- name: Lint Check
|
||||||
run: npm run lint || true
|
run: npm run lint || true
|
||||||
|
|
||||||
|
- name: Log Workflow Metadata
|
||||||
|
run: |
|
||||||
|
echo "=== WORKFLOW METADATA ==="
|
||||||
|
echo "Workflow file: deploy-to-test.yml"
|
||||||
|
echo "Workflow file hash: $(sha256sum .gitea/workflows/deploy-to-test.yml | cut -d' ' -f1)"
|
||||||
|
echo "Git commit: $(git rev-parse HEAD)"
|
||||||
|
echo "Git branch: $(git rev-parse --abbrev-ref HEAD)"
|
||||||
|
echo "Timestamp: $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
|
||||||
|
echo "Actor: ${{ gitea.actor }}"
|
||||||
|
echo "=== END METADATA ==="
|
||||||
|
|
||||||
- name: Stop Test Server Before Tests
|
- name: Stop Test Server Before Tests
|
||||||
# This is a critical step to ensure a clean test environment.
|
# This is a critical step to ensure a clean test environment.
|
||||||
# It stops the currently running pm2 process, freeing up port 3001 so that the
|
# It stops the currently running pm2 process, freeing up port 3001 so that the
|
||||||
@@ -94,10 +105,74 @@ jobs:
|
|||||||
# '|| true' ensures the workflow doesn't fail if the process isn't running.
|
# '|| true' ensures the workflow doesn't fail if the process isn't running.
|
||||||
run: |
|
run: |
|
||||||
echo "--- Stopping and deleting all test processes ---"
|
echo "--- Stopping and deleting all test processes ---"
|
||||||
|
|
||||||
|
# === PRE-CLEANUP PM2 STATE LOGGING ===
|
||||||
|
echo "=== PRE-CLEANUP PM2 STATE ==="
|
||||||
|
pm2 jlist || echo "No PM2 processes running"
|
||||||
|
echo "=== END PRE-CLEANUP STATE ==="
|
||||||
|
|
||||||
# Use a script to parse pm2's JSON output and delete any process whose name ends with '-test'.
|
# Use a script to parse pm2's JSON output and delete any process whose name ends with '-test'.
|
||||||
# This is safer than 'pm2 delete all' and more robust than naming each process individually.
|
# This is safer than 'pm2 delete all' and more robust than naming each process individually.
|
||||||
# It prevents the accumulation of duplicate processes from previous test runs.
|
# It prevents the accumulation of duplicate processes from previous test runs.
|
||||||
node -e "const exec = require('child_process').execSync; try { const list = JSON.parse(exec('pm2 jlist').toString()); list.forEach(p => { if (p.name && p.name.endsWith('-test')) { console.log('Deleting test process: ' + p.name + ' (' + p.pm2_env.pm_id + ')'); try { exec('pm2 delete ' + p.pm2_env.pm_id); } catch(e) { console.error('Failed to delete ' + p.pm2_env.pm_id, e.message); } } }); console.log('✅ Test process cleanup complete.'); } catch (e) { if (e.stdout.toString().includes('No process found')) { console.log('No PM2 processes running, cleanup not needed.'); } else { console.error('Error cleaning up test processes:', e.message); } }" || true
|
node -e "
|
||||||
|
const exec = require('child_process').execSync;
|
||||||
|
try {
|
||||||
|
const list = JSON.parse(exec('pm2 jlist').toString());
|
||||||
|
|
||||||
|
// Filter for test processes only
|
||||||
|
const targetProcesses = list.filter(p => p.name && p.name.endsWith('-test'));
|
||||||
|
|
||||||
|
// SAFEGUARD 1: Process count validation
|
||||||
|
const totalProcesses = list.length;
|
||||||
|
if (targetProcesses.length === totalProcesses && totalProcesses > 3) {
|
||||||
|
console.error('SAFETY ABORT: Filter would delete ALL processes!');
|
||||||
|
console.error('Total processes: ' + totalProcesses + ', Target processes: ' + targetProcesses.length);
|
||||||
|
console.error('This indicates a potential filter bug. Aborting cleanup.');
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// SAFEGUARD 2: Explicit name verification
|
||||||
|
console.log('Found ' + targetProcesses.length + ' TEST processes to clean:');
|
||||||
|
targetProcesses.forEach(p => {
|
||||||
|
console.log(' - ' + p.name + ' (status: ' + p.pm2_env.status + ', pm_id: ' + p.pm2_env.pm_id + ')');
|
||||||
|
});
|
||||||
|
|
||||||
|
// Perform the cleanup
|
||||||
|
targetProcesses.forEach(p => {
|
||||||
|
console.log('Deleting test process: ' + p.name + ' (' + p.pm2_env.pm_id + ')');
|
||||||
|
try {
|
||||||
|
exec('pm2 delete ' + p.pm2_env.pm_id);
|
||||||
|
} catch(e) {
|
||||||
|
console.error('Failed to delete ' + p.pm2_env.pm_id, e.message);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log('Test process cleanup complete.');
|
||||||
|
} catch (e) {
|
||||||
|
if (e.stdout && e.stdout.toString().includes('No process found')) {
|
||||||
|
console.log('No PM2 processes running, cleanup not needed.');
|
||||||
|
} else {
|
||||||
|
console.error('Error cleaning up test processes:', e.message);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
" || true
|
||||||
|
|
||||||
|
# === POST-CLEANUP VERIFICATION ===
|
||||||
|
echo "=== POST-CLEANUP VERIFICATION ==="
|
||||||
|
pm2 jlist 2>/dev/null | node -e "
|
||||||
|
try {
|
||||||
|
const list = JSON.parse(require('fs').readFileSync(0, 'utf-8'));
|
||||||
|
const testProcesses = list.filter(p => p.name && p.name.endsWith('-test'));
|
||||||
|
const prodProcesses = list.filter(p => p.name && p.name.startsWith('flyer-crawler-') && !p.name.endsWith('-test') && !p.name.endsWith('-dev'));
|
||||||
|
console.log('Test processes after cleanup: ' + testProcesses.length);
|
||||||
|
testProcesses.forEach(p => console.log(' ' + p.name + ': ' + p.pm2_env.status));
|
||||||
|
console.log('Production processes (should be untouched): ' + prodProcesses.length);
|
||||||
|
prodProcesses.forEach(p => console.log(' ' + p.name + ': ' + p.pm2_env.status));
|
||||||
|
} catch (e) {
|
||||||
|
console.log('No PM2 processes or failed to parse output');
|
||||||
|
}
|
||||||
|
" || true
|
||||||
|
echo "=== END POST-CLEANUP VERIFICATION ==="
|
||||||
|
|
||||||
- name: Flush Redis Test Database Before Tests
|
- name: Flush Redis Test Database Before Tests
|
||||||
# CRITICAL: Clear Redis database 1 (test database) to remove stale BullMQ jobs.
|
# CRITICAL: Clear Redis database 1 (test database) to remove stale BullMQ jobs.
|
||||||
@@ -492,9 +567,74 @@ jobs:
|
|||||||
cd /var/www/flyer-crawler-test.projectium.com
|
cd /var/www/flyer-crawler-test.projectium.com
|
||||||
npm install --omit=dev
|
npm install --omit=dev
|
||||||
|
|
||||||
# --- Cleanup Errored Processes ---
|
# === PRE-CLEANUP PM2 STATE LOGGING ===
|
||||||
|
echo "=== PRE-CLEANUP PM2 STATE ==="
|
||||||
|
pm2 jlist
|
||||||
|
echo "=== END PRE-CLEANUP STATE ==="
|
||||||
|
|
||||||
|
# --- Cleanup Errored Processes with Defense-in-Depth Safeguards ---
|
||||||
echo "Cleaning up errored or stopped TEST PM2 processes..."
|
echo "Cleaning up errored or stopped TEST PM2 processes..."
|
||||||
node -e "const exec = require('child_process').execSync; try { const list = JSON.parse(exec('pm2 jlist').toString()); list.forEach(p => { if ((p.pm2_env.status === 'errored' || p.pm2_env.status === 'stopped') && p.name && p.name.endsWith('-test')) { console.log('Deleting ' + p.pm2_env.status + ' test process: ' + p.name + ' (' + p.pm2_env.pm_id + ')'); try { exec('pm2 delete ' + p.pm2_env.pm_id); } catch(e) { console.error('Failed to delete ' + p.pm2_env.pm_id); } } }); console.log('✅ Test process cleanup complete.'); } catch (e) { console.error('Error cleaning up processes:', e); }"
|
node -e "
|
||||||
|
const exec = require('child_process').execSync;
|
||||||
|
try {
|
||||||
|
const list = JSON.parse(exec('pm2 jlist').toString());
|
||||||
|
|
||||||
|
// Filter for errored/stopped test processes only
|
||||||
|
const targetProcesses = list.filter(p =>
|
||||||
|
(p.pm2_env.status === 'errored' || p.pm2_env.status === 'stopped') &&
|
||||||
|
p.name && p.name.endsWith('-test')
|
||||||
|
);
|
||||||
|
|
||||||
|
// SAFEGUARD 1: Process count validation
|
||||||
|
const totalProcesses = list.length;
|
||||||
|
if (targetProcesses.length === totalProcesses && totalProcesses > 3) {
|
||||||
|
console.error('SAFETY ABORT: Filter would delete ALL processes!');
|
||||||
|
console.error('Total processes: ' + totalProcesses + ', Target processes: ' + targetProcesses.length);
|
||||||
|
console.error('This indicates a potential filter bug. Aborting cleanup.');
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// SAFEGUARD 2: Explicit name verification
|
||||||
|
console.log('Found ' + targetProcesses.length + ' errored/stopped TEST processes to clean:');
|
||||||
|
targetProcesses.forEach(p => {
|
||||||
|
console.log(' - ' + p.name + ' (status: ' + p.pm2_env.status + ', pm_id: ' + p.pm2_env.pm_id + ')');
|
||||||
|
});
|
||||||
|
|
||||||
|
// Perform the cleanup
|
||||||
|
targetProcesses.forEach(p => {
|
||||||
|
console.log('Deleting ' + p.pm2_env.status + ' test process: ' + p.name + ' (' + p.pm2_env.pm_id + ')');
|
||||||
|
try {
|
||||||
|
exec('pm2 delete ' + p.pm2_env.pm_id);
|
||||||
|
} catch(e) {
|
||||||
|
console.error('Failed to delete ' + p.pm2_env.pm_id);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log('Test process cleanup complete.');
|
||||||
|
} catch (e) {
|
||||||
|
console.error('Error cleaning up processes:', e);
|
||||||
|
}
|
||||||
|
"
|
||||||
|
|
||||||
|
# === POST-CLEANUP VERIFICATION ===
|
||||||
|
echo "=== POST-CLEANUP VERIFICATION ==="
|
||||||
|
pm2 jlist | node -e "
|
||||||
|
try {
|
||||||
|
const list = JSON.parse(require('fs').readFileSync(0, 'utf-8'));
|
||||||
|
const testProcesses = list.filter(p => p.name && p.name.endsWith('-test'));
|
||||||
|
const prodProcesses = list.filter(p => p.name && p.name.startsWith('flyer-crawler-') && !p.name.endsWith('-test') && !p.name.endsWith('-dev'));
|
||||||
|
console.log('Test processes after cleanup:');
|
||||||
|
testProcesses.forEach(p => console.log(' ' + p.name + ': ' + p.pm2_env.status));
|
||||||
|
if (testProcesses.length === 0) {
|
||||||
|
console.log(' (no test processes currently running)');
|
||||||
|
}
|
||||||
|
console.log('Production processes (should be untouched): ' + prodProcesses.length);
|
||||||
|
prodProcesses.forEach(p => console.log(' ' + p.name + ': ' + p.pm2_env.status));
|
||||||
|
} catch (e) {
|
||||||
|
console.error('Failed to parse PM2 output:', e.message);
|
||||||
|
}
|
||||||
|
"
|
||||||
|
echo "=== END POST-CLEANUP VERIFICATION ==="
|
||||||
|
|
||||||
# Use `startOrReload` with the TEST ecosystem file. This starts test-specific processes
|
# Use `startOrReload` with the TEST ecosystem file. This starts test-specific processes
|
||||||
# (flyer-crawler-api-test, flyer-crawler-worker-test, flyer-crawler-analytics-worker-test)
|
# (flyer-crawler-api-test, flyer-crawler-worker-test, flyer-crawler-analytics-worker-test)
|
||||||
|
|||||||
@@ -109,6 +109,17 @@ jobs:
|
|||||||
rsync -avz dist/ "$APP_PATH"
|
rsync -avz dist/ "$APP_PATH"
|
||||||
echo "Application deployment complete."
|
echo "Application deployment complete."
|
||||||
|
|
||||||
|
- name: Log Workflow Metadata
|
||||||
|
run: |
|
||||||
|
echo "=== WORKFLOW METADATA ==="
|
||||||
|
echo "Workflow file: manual-deploy-major.yml"
|
||||||
|
echo "Workflow file hash: $(sha256sum .gitea/workflows/manual-deploy-major.yml | cut -d' ' -f1)"
|
||||||
|
echo "Git commit: $(git rev-parse HEAD)"
|
||||||
|
echo "Git branch: $(git rev-parse --abbrev-ref HEAD)"
|
||||||
|
echo "Timestamp: $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
|
||||||
|
echo "Actor: ${{ gitea.actor }}"
|
||||||
|
echo "=== END METADATA ==="
|
||||||
|
|
||||||
- name: Install Backend Dependencies and Restart Production Server
|
- name: Install Backend Dependencies and Restart Production Server
|
||||||
env:
|
env:
|
||||||
# --- Production Secrets Injection ---
|
# --- Production Secrets Injection ---
|
||||||
@@ -138,9 +149,74 @@ jobs:
|
|||||||
cd /var/www/flyer-crawler.projectium.com
|
cd /var/www/flyer-crawler.projectium.com
|
||||||
npm install --omit=dev
|
npm install --omit=dev
|
||||||
|
|
||||||
# --- Cleanup Errored Processes ---
|
# === PRE-CLEANUP PM2 STATE LOGGING ===
|
||||||
|
echo "=== PRE-CLEANUP PM2 STATE ==="
|
||||||
|
pm2 jlist
|
||||||
|
echo "=== END PRE-CLEANUP STATE ==="
|
||||||
|
|
||||||
|
# --- Cleanup Errored Processes with Defense-in-Depth Safeguards ---
|
||||||
echo "Cleaning up errored or stopped PRODUCTION PM2 processes..."
|
echo "Cleaning up errored or stopped PRODUCTION PM2 processes..."
|
||||||
node -e "const exec = require('child_process').execSync; try { const list = JSON.parse(exec('pm2 jlist').toString()); const prodProcesses = ['flyer-crawler-api', 'flyer-crawler-worker', 'flyer-crawler-analytics-worker']; list.forEach(p => { if ((p.pm2_env.status === 'errored' || p.pm2_env.status === 'stopped') && prodProcesses.includes(p.name)) { console.log('Deleting ' + p.pm2_env.status + ' production process: ' + p.name + ' (' + p.pm2_env.pm_id + ')'); try { exec('pm2 delete ' + p.pm2_env.pm_id); } catch(e) { console.error('Failed to delete ' + p.pm2_env.pm_id); } } }); console.log('✅ Production process cleanup complete.'); } catch (e) { console.error('Error cleaning up processes:', e); }"
|
node -e "
|
||||||
|
const exec = require('child_process').execSync;
|
||||||
|
try {
|
||||||
|
const list = JSON.parse(exec('pm2 jlist').toString());
|
||||||
|
const prodProcesses = ['flyer-crawler-api', 'flyer-crawler-worker', 'flyer-crawler-analytics-worker'];
|
||||||
|
|
||||||
|
// Filter for processes that match our criteria
|
||||||
|
const targetProcesses = list.filter(p =>
|
||||||
|
(p.pm2_env.status === 'errored' || p.pm2_env.status === 'stopped') &&
|
||||||
|
prodProcesses.includes(p.name)
|
||||||
|
);
|
||||||
|
|
||||||
|
// SAFEGUARD 1: Process count validation
|
||||||
|
const totalProcesses = list.length;
|
||||||
|
if (targetProcesses.length === totalProcesses && totalProcesses > 3) {
|
||||||
|
console.error('SAFETY ABORT: Filter would delete ALL processes!');
|
||||||
|
console.error('Total processes: ' + totalProcesses + ', Target processes: ' + targetProcesses.length);
|
||||||
|
console.error('This indicates a potential filter bug. Aborting cleanup.');
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// SAFEGUARD 2: Explicit name verification
|
||||||
|
console.log('Found ' + targetProcesses.length + ' PRODUCTION processes to clean:');
|
||||||
|
targetProcesses.forEach(p => {
|
||||||
|
console.log(' - ' + p.name + ' (status: ' + p.pm2_env.status + ', pm_id: ' + p.pm2_env.pm_id + ')');
|
||||||
|
});
|
||||||
|
|
||||||
|
// Perform the cleanup
|
||||||
|
targetProcesses.forEach(p => {
|
||||||
|
console.log('Deleting ' + p.pm2_env.status + ' production process: ' + p.name + ' (' + p.pm2_env.pm_id + ')');
|
||||||
|
try {
|
||||||
|
exec('pm2 delete ' + p.pm2_env.pm_id);
|
||||||
|
} catch(e) {
|
||||||
|
console.error('Failed to delete ' + p.pm2_env.pm_id);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log('Production process cleanup complete.');
|
||||||
|
} catch (e) {
|
||||||
|
console.error('Error cleaning up processes:', e);
|
||||||
|
}
|
||||||
|
"
|
||||||
|
|
||||||
|
# === POST-CLEANUP VERIFICATION ===
|
||||||
|
echo "=== POST-CLEANUP VERIFICATION ==="
|
||||||
|
pm2 jlist | node -e "
|
||||||
|
try {
|
||||||
|
const list = JSON.parse(require('fs').readFileSync(0, 'utf-8'));
|
||||||
|
const prodProcesses = list.filter(p => p.name && p.name.startsWith('flyer-crawler-') && !p.name.endsWith('-test') && !p.name.endsWith('-dev'));
|
||||||
|
console.log('Production processes after cleanup:');
|
||||||
|
prodProcesses.forEach(p => {
|
||||||
|
console.log(' ' + p.name + ': ' + p.pm2_env.status);
|
||||||
|
});
|
||||||
|
if (prodProcesses.length === 0) {
|
||||||
|
console.log(' (no production processes currently running)');
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.error('Failed to parse PM2 output:', e.message);
|
||||||
|
}
|
||||||
|
"
|
||||||
|
echo "=== END POST-CLEANUP VERIFICATION ==="
|
||||||
|
|
||||||
# --- Version Check Logic ---
|
# --- Version Check Logic ---
|
||||||
# Get the version from the newly deployed package.json
|
# Get the version from the newly deployed package.json
|
||||||
|
|||||||
35
CLAUDE.md
35
CLAUDE.md
@@ -49,6 +49,8 @@ Out-of-sync = test failures.
|
|||||||
|
|
||||||
**CRITICAL**: Production and test environments share the same PM2 daemon on the server.
|
**CRITICAL**: Production and test environments share the same PM2 daemon on the server.
|
||||||
|
|
||||||
|
**See also**: [PM2 Process Isolation Incidents](#pm2-process-isolation-incidents) for past incidents and response procedures.
|
||||||
|
|
||||||
| Environment | Processes | Config File |
|
| Environment | Processes | Config File |
|
||||||
| ----------- | -------------------------------------------------------------------------------------------- | --------------------------- |
|
| ----------- | -------------------------------------------------------------------------------------------- | --------------------------- |
|
||||||
| Production | `flyer-crawler-api`, `flyer-crawler-worker`, `flyer-crawler-analytics-worker` | `ecosystem.config.cjs` |
|
| Production | `flyer-crawler-api`, `flyer-crawler-worker`, `flyer-crawler-analytics-worker` | `ecosystem.config.cjs` |
|
||||||
@@ -288,6 +290,39 @@ Common issues with solutions:
|
|||||||
|
|
||||||
**Full Details**: See test issues section at end of this document or [docs/development/TESTING.md](docs/development/TESTING.md)
|
**Full Details**: See test issues section at end of this document or [docs/development/TESTING.md](docs/development/TESTING.md)
|
||||||
|
|
||||||
|
### PM2 Process Isolation Incidents
|
||||||
|
|
||||||
|
**CRITICAL**: PM2 process cleanup scripts can affect all PM2 processes if not properly filtered.
|
||||||
|
|
||||||
|
**Incident**: 2026-02-17 Production Deployment (v0.15.0)
|
||||||
|
|
||||||
|
- **Impact**: ALL PM2 processes on production server were killed
|
||||||
|
- **Affected**: stock-alert.projectium.com and all other PM2-managed applications
|
||||||
|
- **Root Cause**: Under investigation (see [incident report](docs/operations/INCIDENT-2026-02-17-PM2-PROCESS-KILL.md))
|
||||||
|
- **Status**: Mitigated — safeguards added to reduce the risk of recurrence
|
||||||
|
|
||||||
|
**Prevention Measures** (implemented):
|
||||||
|
|
||||||
|
1. Name-based filtering (exact match or pattern-based)
|
||||||
|
2. Pre-cleanup process list logging
|
||||||
|
3. Process count validation (abort if the filter matches all processes and more than 3 processes exist)
|
||||||
|
4. Explicit name verification in logs
|
||||||
|
5. Post-cleanup verification
|
||||||
|
6. Workflow version hash logging
|
||||||
|
|
||||||
|
**If PM2 Incident Occurs**:
|
||||||
|
|
||||||
|
- **DO NOT** attempt another deployment immediately
|
||||||
|
- Follow the [PM2 Incident Response Runbook](docs/operations/PM2-INCIDENT-RESPONSE.md)
|
||||||
|
- Manually restore affected processes
|
||||||
|
- Investigate workflow execution logs before next deployment
|
||||||
|
|
||||||
|
**Related Documentation**:
|
||||||
|
|
||||||
|
- [PM2 Process Isolation Requirements](#pm2-process-isolation-productiontest-servers) (existing section)
|
||||||
|
- [Incident Report 2026-02-17](docs/operations/INCIDENT-2026-02-17-PM2-PROCESS-KILL.md)
|
||||||
|
- [PM2 Incident Response Runbook](docs/operations/PM2-INCIDENT-RESPONSE.md)
|
||||||
|
|
||||||
### Git Bash Path Conversion (Windows)
|
### Git Bash Path Conversion (Windows)
|
||||||
|
|
||||||
Git Bash auto-converts Unix paths, breaking container commands.
|
Git Bash auto-converts Unix paths, breaking container commands.
|
||||||
|
|||||||
@@ -47,6 +47,14 @@ Production operations and deployment:
|
|||||||
- [Logstash Troubleshooting](operations/LOGSTASH-TROUBLESHOOTING.md) - Debugging logs
|
- [Logstash Troubleshooting](operations/LOGSTASH-TROUBLESHOOTING.md) - Debugging logs
|
||||||
- [Monitoring](operations/MONITORING.md) - Bugsink, health checks, observability
|
- [Monitoring](operations/MONITORING.md) - Bugsink, health checks, observability
|
||||||
|
|
||||||
|
**Incident Response**:
|
||||||
|
|
||||||
|
- [PM2 Incident Response Runbook](operations/PM2-INCIDENT-RESPONSE.md) - Step-by-step procedures for PM2 incidents
|
||||||
|
|
||||||
|
**Incident Reports**:
|
||||||
|
|
||||||
|
- [2026-02-17 PM2 Process Kill](operations/INCIDENT-2026-02-17-PM2-PROCESS-KILL.md) - ALL PM2 processes killed during v0.15.0 deployment (Mitigated)
|
||||||
|
|
||||||
**NGINX Reference Configs** (in repository root):
|
**NGINX Reference Configs** (in repository root):
|
||||||
|
|
||||||
- `etc-nginx-sites-available-flyer-crawler.projectium.com` - Production server config
|
- `etc-nginx-sites-available-flyer-crawler.projectium.com` - Production server config
|
||||||
|
|||||||
199
docs/adr/0061-pm2-process-isolation-safeguards.md
Normal file
199
docs/adr/0061-pm2-process-isolation-safeguards.md
Normal file
@@ -0,0 +1,199 @@
|
|||||||
|
# ADR-061: PM2 Process Isolation Safeguards
|
||||||
|
|
||||||
|
## Status
|
||||||
|
|
||||||
|
Accepted
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
On 2026-02-17, a critical incident occurred during v0.15.0 production deployment where ALL PM2 processes on the production server were terminated, not just flyer-crawler processes. This caused unplanned downtime for multiple applications including `stock-alert.projectium.com`.
|
||||||
|
|
||||||
|
### Problem Statement
|
||||||
|
|
||||||
|
Production and test environments share the same PM2 daemon on the server. This creates a risk where deployment scripts that operate on PM2 processes can accidentally affect processes belonging to other applications or environments.
|
||||||
|
|
||||||
|
### Pre-existing Controls
|
||||||
|
|
||||||
|
Prior to the incident, PM2 process isolation controls were already in place (commit `b6a62a0`):
|
||||||
|
|
||||||
|
- Production workflows used whitelist-based filtering with explicit process names
|
||||||
|
- Test workflows filtered by `-test` suffix pattern
|
||||||
|
- CLAUDE.md documented the prohibition of `pm2 stop all`, `pm2 delete all`, and `pm2 restart all`
|
||||||
|
|
||||||
|
Despite these controls being present in the codebase and included in v0.15.0, the incident still occurred. The leading hypothesis is that the Gitea runner executed a cached/older version of the workflow file.
|
||||||
|
|
||||||
|
### Requirements
|
||||||
|
|
||||||
|
1. Prevent accidental deletion of processes from other applications or environments
|
||||||
|
2. Provide audit trail for forensic analysis when incidents occur
|
||||||
|
3. Enable automatic abort when dangerous conditions are detected
|
||||||
|
4. Maintain visibility into PM2 operations during deployment
|
||||||
|
5. Work correctly even if the filtering logic itself is bypassed
|
||||||
|
|
||||||
|
## Decision
|
||||||
|
|
||||||
|
Implement a defense-in-depth strategy with 5 layers of safeguards in all deployment workflows that interact with PM2 processes.
|
||||||
|
|
||||||
|
### Safeguard Layers
|
||||||
|
|
||||||
|
#### Layer 1: Workflow Metadata Logging
|
||||||
|
|
||||||
|
Log workflow execution metadata at the start of each deployment:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
echo "=== WORKFLOW METADATA ==="
|
||||||
|
echo "Workflow file: deploy-to-prod.yml"
|
||||||
|
echo "Workflow file hash: $(sha256sum .gitea/workflows/deploy-to-prod.yml | cut -d' ' -f1)"
|
||||||
|
echo "Git commit: $(git rev-parse HEAD)"
|
||||||
|
echo "Git branch: $(git rev-parse --abbrev-ref HEAD)"
|
||||||
|
echo "Timestamp: $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
|
||||||
|
echo "Actor: ${{ gitea.actor }}"
|
||||||
|
echo "=== END METADATA ==="
|
||||||
|
```
|
||||||
|
|
||||||
|
**Purpose**: Enables verification of which workflow version was actually executed.
|
||||||
|
|
||||||
|
#### Layer 2: Pre-Cleanup PM2 State Logging
|
||||||
|
|
||||||
|
Capture full PM2 process list before any modifications:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
echo "=== PRE-CLEANUP PM2 STATE ==="
|
||||||
|
pm2 jlist
|
||||||
|
echo "=== END PRE-CLEANUP STATE ==="
|
||||||
|
```
|
||||||
|
|
||||||
|
**Purpose**: Provides forensic evidence of system state before cleanup.
|
||||||
|
|
||||||
|
#### Layer 3: Process Count Validation (SAFETY ABORT)
|
||||||
|
|
||||||
|
Abort deployment if the filter would delete ALL processes and there are more than 3 processes total:
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
const totalProcesses = list.length;
|
||||||
|
if (targetProcesses.length === totalProcesses && totalProcesses > 3) {
|
||||||
|
console.error('SAFETY ABORT: Filter would delete ALL processes!');
|
||||||
|
console.error(
|
||||||
|
'Total processes: ' + totalProcesses + ', Target processes: ' + targetProcesses.length,
|
||||||
|
);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Purpose**: Catches filter bugs or unexpected conditions automatically.
|
||||||
|
|
||||||
|
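**Threshold Rationale**: The abort fires only when the filter matches every process in the PM2 list and that list contains more than 3 processes. A threshold of 3 allows normal operation when only the three expected processes exist (API, Worker, Analytics Worker), even if all of them need cleanup, while catching anomalies when the server hosts additional applications.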
|
||||||
|
|
||||||
|
#### Layer 4: Explicit Name Verification
|
||||||
|
|
||||||
|
Log the exact name, status, and PM2 ID of each process that will be affected:
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
console.log('Found ' + targetProcesses.length + ' PRODUCTION processes to clean:');
|
||||||
|
targetProcesses.forEach((p) => {
|
||||||
|
console.log(
|
||||||
|
' - ' + p.name + ' (status: ' + p.pm2_env.status + ', pm_id: ' + p.pm2_env.pm_id + ')',
|
||||||
|
);
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
**Purpose**: Provides clear visibility into cleanup operations.
|
||||||
|
|
||||||
|
#### Layer 5: Post-Cleanup Verification
|
||||||
|
|
||||||
|
After cleanup, verify environment isolation was maintained:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
echo "=== POST-CLEANUP VERIFICATION ==="
|
||||||
|
pm2 jlist | node -e "
|
||||||
|
const list = JSON.parse(require('fs').readFileSync(0, 'utf-8'));
|
||||||
|
const prodProcesses = list.filter(p => p.name && p.name.startsWith('flyer-crawler-') && !p.name.endsWith('-test'));
|
||||||
|
console.log('Production processes after cleanup: ' + prodProcesses.length);
|
||||||
|
"
|
||||||
|
echo "=== END POST-CLEANUP VERIFICATION ==="
|
||||||
|
```
|
||||||
|
|
||||||
|
**Purpose**: Immediately identifies cross-environment contamination.
|
||||||
|
|
||||||
|
## Consequences
|
||||||
|
|
||||||
|
### Positive
|
||||||
|
|
||||||
|
1. **Automatic Prevention**: Layer 3 (process count validation) can prevent catastrophic process deletion automatically, without human intervention.
|
||||||
|
|
||||||
|
2. **Forensic Capability**: Layers 1 and 2 provide the data needed to determine root cause after an incident.
|
||||||
|
|
||||||
|
3. **Visibility**: Layers 4 and 5 make PM2 operations transparent in workflow logs.
|
||||||
|
|
||||||
|
4. **Fail-Safe Design**: Even if individual layers fail, other layers provide backup protection.
|
||||||
|
|
||||||
|
5. **Non-Breaking**: Safeguards are additive and do not change the existing filtering logic.
|
||||||
|
|
||||||
|
### Negative
|
||||||
|
|
||||||
|
1. **Increased Log Volume**: Additional logging increases workflow output size.
|
||||||
|
|
||||||
|
2. **Minor Performance Impact**: Extra PM2 commands add a few seconds to deployment time.
|
||||||
|
|
||||||
|
3. **Threshold Tuning**: The threshold of 3 may need adjustment if the expected process count changes.
|
||||||
|
|
||||||
|
### Neutral
|
||||||
|
|
||||||
|
1. **Root Cause Still Unknown**: These safeguards mitigate the risk but do not definitively explain why the original incident occurred.
|
||||||
|
|
||||||
|
2. **No Structural Changes**: The underlying architecture (shared PM2 daemon) remains unchanged.
|
||||||
|
|
||||||
|
## Alternatives Considered
|
||||||
|
|
||||||
|
### PM2 Namespaces
|
||||||
|
|
||||||
|
PM2 supports namespaces to isolate groups of processes. This would provide complete isolation but requires:
|
||||||
|
|
||||||
|
- Changes to ecosystem config files
|
||||||
|
- Changes to all PM2 commands in workflows
|
||||||
|
- Potential breaking changes to monitoring and log aggregation
|
||||||
|
|
||||||
|
**Decision**: Deferred for future consideration. Current safeguards provide adequate protection.
|
||||||
|
|
||||||
|
### Separate PM2 Daemons
|
||||||
|
|
||||||
|
Running a separate PM2 daemon per application would eliminate cross-application risk entirely.
|
||||||
|
|
||||||
|
**Decision**: Not implemented due to increased operational complexity and the current safeguards being sufficient.
|
||||||
|
|
||||||
|
### Deployment Locks
|
||||||
|
|
||||||
|
Implementing mutex-style locks to prevent concurrent deployments could prevent race conditions.
|
||||||
|
|
||||||
|
**Decision**: Not implemented as the current safeguards address the identified risk. May be reconsidered if concurrent deployment issues are observed.
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
### Files Modified
|
||||||
|
|
||||||
|
| File | Changes |
|
||||||
|
| ------------------------------------------ | ---------------------- |
|
||||||
|
| `.gitea/workflows/deploy-to-prod.yml` | All 5 safeguard layers |
|
||||||
|
| `.gitea/workflows/deploy-to-test.yml` | All 5 safeguard layers |
|
||||||
|
| `.gitea/workflows/manual-deploy-major.yml` | All 5 safeguard layers |
|
||||||
|
|
||||||
|
### Validation
|
||||||
|
|
||||||
|
A standalone test file validates the safeguard logic:
|
||||||
|
|
||||||
|
- **File**: `tests/qa/test-pm2-safeguard-logic.js`
|
||||||
|
- **Coverage**: 11 scenarios covering normal operations and dangerous edge cases
|
||||||
|
- **Result**: All tests pass
|
||||||
|
|
||||||
|
## Related Documentation
|
||||||
|
|
||||||
|
- [Incident Report: 2026-02-17](../operations/INCIDENT-2026-02-17-PM2-PROCESS-KILL.md)
|
||||||
|
- [PM2 Incident Response Runbook](../operations/PM2-INCIDENT-RESPONSE.md)
|
||||||
|
- [Session Summary](../archive/sessions/PM2_SAFEGUARDS_SESSION_2026-02-17.md)
|
||||||
|
- [CLAUDE.md - PM2 Process Isolation](../../CLAUDE.md#pm2-process-isolation-productiontest-servers)
|
||||||
|
- [ADR-014: Containerization and Deployment Strategy](0014-containerization-and-deployment-strategy.md)
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- PM2 Documentation: https://pm2.keymetrics.io/docs/usage/application-declaration/
|
||||||
|
- Defense in Depth: https://en.wikipedia.org/wiki/Defense_in_depth_(computing)
|
||||||
@@ -56,6 +56,7 @@ This directory contains a log of the architectural decisions made for the Flyer
|
|||||||
**[ADR-038](./0038-graceful-shutdown-pattern.md)**: Graceful Shutdown Pattern (Accepted)
|
**[ADR-038](./0038-graceful-shutdown-pattern.md)**: Graceful Shutdown Pattern (Accepted)
|
||||||
**[ADR-053](./0053-worker-health-checks.md)**: Worker Health Checks and Stalled Job Monitoring (Accepted)
|
**[ADR-053](./0053-worker-health-checks.md)**: Worker Health Checks and Stalled Job Monitoring (Accepted)
|
||||||
**[ADR-054](./0054-bugsink-gitea-issue-sync.md)**: Bugsink to Gitea Issue Synchronization (Proposed)
|
**[ADR-054](./0054-bugsink-gitea-issue-sync.md)**: Bugsink to Gitea Issue Synchronization (Proposed)
|
||||||
|
**[ADR-061](./0061-pm2-process-isolation-safeguards.md)**: PM2 Process Isolation Safeguards (Accepted)
|
||||||
|
|
||||||
## 7. Frontend / User Interface
|
## 7. Frontend / User Interface
|
||||||
|
|
||||||
|
|||||||
377
docs/archive/sessions/PM2_SAFEGUARDS_SESSION_2026-02-17.md
Normal file
377
docs/archive/sessions/PM2_SAFEGUARDS_SESSION_2026-02-17.md
Normal file
@@ -0,0 +1,377 @@
|
|||||||
|
# PM2 Process Isolation Safeguards Project
|
||||||
|
|
||||||
|
**Session Date**: 2026-02-17
|
||||||
|
**Status**: Completed
|
||||||
|
**Triggered By**: Critical production incident during v0.15.0 deployment
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Executive Summary
|
||||||
|
|
||||||
|
On 2026-02-17, a critical incident occurred during v0.15.0 production deployment where ALL PM2 processes on the production server were killed, not just the flyer-crawler processes. This caused unplanned downtime for multiple applications including `stock-alert.projectium.com`.
|
||||||
|
|
||||||
|
Despite PM2 process isolation fixes already being in place (commit `b6a62a0`), the incident still occurred. Investigation suggests the Gitea runner may have executed a cached/older version of the workflow files. In response, we implemented a comprehensive defense-in-depth strategy with 5 layers of safeguards across all deployment workflows.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Incident Background
|
||||||
|
|
||||||
|
### What Happened
|
||||||
|
|
||||||
|
| Aspect | Detail |
|
||||||
|
| --------------------- | ------------------------------------------------------- |
|
||||||
|
| **Date/Time** | 2026-02-17 ~07:40 UTC |
|
||||||
|
| **Trigger** | v0.15.0 production deployment via `deploy-to-prod.yml` |
|
||||||
|
| **Impact** | ALL PM2 processes killed (all environments) |
|
||||||
|
| **Collateral Damage** | `stock-alert.projectium.com` and other PM2-managed apps |
|
||||||
|
| **Severity** | P1 - Critical |
|
||||||
|
|
||||||
|
### Key Mystery
|
||||||
|
|
||||||
|
The PM2 process isolation fix was already implemented in commit `b6a62a0` (2026-02-13) and was included in v0.15.0. The fix correctly used whitelist-based filtering:
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
const prodProcesses = [
|
||||||
|
'flyer-crawler-api',
|
||||||
|
'flyer-crawler-worker',
|
||||||
|
'flyer-crawler-analytics-worker',
|
||||||
|
];
|
||||||
|
list.forEach((p) => {
|
||||||
|
if (
|
||||||
|
(p.pm2_env.status === 'errored' || p.pm2_env.status === 'stopped') &&
|
||||||
|
prodProcesses.includes(p.name)
|
||||||
|
) {
|
||||||
|
exec('pm2 delete ' + p.pm2_env.pm_id);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
**Hypothesis**: Gitea runner executed a cached older version of the workflow file that did not contain the fix.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Solution: Defense-in-Depth Safeguards
|
||||||
|
|
||||||
|
Rather than relying solely on the filter logic (which may be correct but not executed), we implemented 5 layers of safeguards that provide visibility, validation, and automatic abort capabilities.
|
||||||
|
|
||||||
|
### Safeguard Layers
|
||||||
|
|
||||||
|
| Layer | Name | Purpose |
|
||||||
|
| ----- | --------------------------------- | ------------------------------------------------------- |
|
||||||
|
| 1 | **Workflow Metadata Logging** | Audit trail of which workflow version actually executed |
|
||||||
|
| 2 | **Pre-Cleanup PM2 State Logging** | Capture full process list before any modifications |
|
||||||
|
| 3 | **Process Count Validation** | SAFETY ABORT if filter would delete ALL processes |
|
||||||
|
| 4 | **Explicit Name Verification** | Log exactly which processes will be affected |
|
||||||
|
| 5 | **Post-Cleanup Verification** | Verify environment isolation after cleanup |
|
||||||
|
|
||||||
|
### Layer Details
|
||||||
|
|
||||||
|
#### Layer 1: Workflow Metadata Logging
|
||||||
|
|
||||||
|
Logs at the start of deployment:
|
||||||
|
|
||||||
|
- Workflow file name
|
||||||
|
- SHA-256 hash of the workflow file
|
||||||
|
- Git commit being deployed
|
||||||
|
- Git branch
|
||||||
|
- Timestamp (UTC)
|
||||||
|
- Actor (who triggered the deployment)
|
||||||
|
|
||||||
|
**Purpose**: If an incident occurs, we can verify whether the executed workflow matches the repository version.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
echo "=== WORKFLOW METADATA ==="
|
||||||
|
echo "Workflow file: deploy-to-prod.yml"
|
||||||
|
echo "Workflow file hash: $(sha256sum .gitea/workflows/deploy-to-prod.yml | cut -d' ' -f1)"
|
||||||
|
echo "Git commit: $(git rev-parse HEAD)"
echo "Git branch: $(git rev-parse --abbrev-ref HEAD)"
|
||||||
|
echo "Timestamp: $(date -u '+%Y-%m-%d %H:%M:%S UTC')"
|
||||||
|
echo "Actor: ${{ gitea.actor }}"
|
||||||
|
echo "=== END METADATA ==="
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Layer 2: Pre-Cleanup PM2 State Logging
|
||||||
|
|
||||||
|
Captures full PM2 process list in JSON format before any modifications.
|
||||||
|
|
||||||
|
**Purpose**: Provides forensic evidence of what processes existed before cleanup began.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
echo "=== PRE-CLEANUP PM2 STATE ==="
|
||||||
|
pm2 jlist
|
||||||
|
echo "=== END PRE-CLEANUP STATE ==="
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Layer 3: Process Count Validation (SAFETY ABORT)
|
||||||
|
|
||||||
|
The most critical safeguard. Aborts the entire deployment if the filter would delete ALL processes and there are more than 3 processes total.
|
||||||
|
|
||||||
|
**Purpose**: Catches filter bugs or unexpected conditions that would result in catastrophic process deletion.
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// SAFEGUARD 1: Process count validation
|
||||||
|
const totalProcesses = list.length;
|
||||||
|
if (targetProcesses.length === totalProcesses && totalProcesses > 3) {
|
||||||
|
console.error('SAFETY ABORT: Filter would delete ALL processes!');
|
||||||
|
console.error(
|
||||||
|
'Total processes: ' + totalProcesses + ', Target processes: ' + targetProcesses.length,
|
||||||
|
);
|
||||||
|
console.error('This indicates a potential filter bug. Aborting cleanup.');
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Threshold Rationale**: The threshold of 3 allows normal operation when only the 3 expected processes exist (API, Worker, Analytics Worker) while catching anomalies when the server hosts more applications.
|
||||||
|
|
||||||
|
#### Layer 4: Explicit Name Verification
|
||||||
|
|
||||||
|
Logs the exact name, status, and PM2 ID of each process that will be deleted.
|
||||||
|
|
||||||
|
**Purpose**: Provides clear visibility into what the cleanup operation will actually do.
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
console.log('Found ' + targetProcesses.length + ' PRODUCTION processes to clean:');
|
||||||
|
targetProcesses.forEach((p) => {
|
||||||
|
console.log(
|
||||||
|
' - ' + p.name + ' (status: ' + p.pm2_env.status + ', pm_id: ' + p.pm2_env.pm_id + ')',
|
||||||
|
);
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Layer 5: Post-Cleanup Verification
|
||||||
|
|
||||||
|
After cleanup, logs the state of processes by environment to verify isolation was maintained.
|
||||||
|
|
||||||
|
**Purpose**: Immediately identifies if the cleanup affected the wrong environment.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
echo "=== POST-CLEANUP VERIFICATION ==="
|
||||||
|
pm2 jlist | node -e "
|
||||||
|
const list = JSON.parse(require('fs').readFileSync(0, 'utf-8'));
|
||||||
|
const prodProcesses = list.filter(p => p.name && p.name.startsWith('flyer-crawler-') && !p.name.endsWith('-test'));
|
||||||
|
const testProcesses = list.filter(p => p.name && p.name.endsWith('-test'));
|
||||||
|
console.log('Production processes after cleanup: ' + prodProcesses.length);
|
||||||
|
console.log('Test processes (should be untouched): ' + testProcesses.length);
|
||||||
|
"
|
||||||
|
echo "=== END POST-CLEANUP VERIFICATION ==="
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Implementation Details
|
||||||
|
|
||||||
|
### Files Modified
|
||||||
|
|
||||||
|
| File | Changes |
|
||||||
|
| ------------------------------------------ | --------------------------------------------- |
|
||||||
|
| `.gitea/workflows/deploy-to-prod.yml` | Added all 5 safeguard layers |
|
||||||
|
| `.gitea/workflows/deploy-to-test.yml` | Added all 5 safeguard layers |
|
||||||
|
| `.gitea/workflows/manual-deploy-major.yml` | Added all 5 safeguard layers |
|
||||||
|
| `CLAUDE.md` | Added PM2 Process Isolation Incidents section |
|
||||||
|
|
||||||
|
### Files Created
|
||||||
|
|
||||||
|
| File | Purpose |
|
||||||
|
| --------------------------------------------------------- | --------------------------------------- |
|
||||||
|
| `docs/operations/INCIDENT-2026-02-17-PM2-PROCESS-KILL.md` | Detailed incident report |
|
||||||
|
| `docs/operations/PM2-INCIDENT-RESPONSE.md` | Comprehensive incident response runbook |
|
||||||
|
| `tests/qa/test-pm2-safeguard-logic.js` | Validation tests for safeguard logic |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Testing and Validation
|
||||||
|
|
||||||
|
### Test Artifact
|
||||||
|
|
||||||
|
A standalone JavaScript test file was created to validate the safeguard logic:
|
||||||
|
|
||||||
|
**File**: `tests/qa/test-pm2-safeguard-logic.js`
|
||||||
|
|
||||||
|
**Test Categories**:
|
||||||
|
|
||||||
|
1. **Normal Operations (should NOT abort)**
|
||||||
|
- 3 errored out of 15 processes
|
||||||
|
- 1 errored out of 10 processes
|
||||||
|
- 0 processes to clean
|
||||||
|
- Fresh server with 3 processes (threshold boundary)
|
||||||
|
|
||||||
|
2. **Dangerous Operations (SHOULD abort)**
|
||||||
|
- All 10 processes targeted
|
||||||
|
- All 15 processes targeted
|
||||||
|
- All 4 processes targeted (just above threshold)
|
||||||
|
|
||||||
|
3. **Workflow-Specific Filter Tests**
|
||||||
|
- Production filter only matches production processes
|
||||||
|
- Test filter only matches `-test` suffix processes
|
||||||
|
- Filters don't cross-contaminate environments
|
||||||
|
|
||||||
|
### Test Results
|
||||||
|
|
||||||
|
All 11 scenarios passed:
|
||||||
|
|
||||||
|
| Scenario | Total | Target | Expected | Result |
|
||||||
|
| -------------------------- | ----- | ------ | -------- | ------ |
|
||||||
|
| Normal prod cleanup | 15 | 3 | No abort | PASS |
|
||||||
|
| Normal test cleanup | 15 | 3 | No abort | PASS |
|
||||||
|
| Single process | 10 | 1 | No abort | PASS |
|
||||||
|
| No cleanup needed | 10 | 0 | No abort | PASS |
|
||||||
|
| Fresh server (threshold) | 3 | 3 | No abort | PASS |
|
||||||
|
| Minimal server | 2 | 2 | No abort | PASS |
|
||||||
|
| Empty PM2 | 0 | 0 | No abort | PASS |
|
||||||
|
| Filter bug - 10 processes | 10 | 10 | ABORT | PASS |
|
||||||
|
| Filter bug - 15 processes | 15 | 15 | ABORT | PASS |
|
||||||
|
| Filter bug - 4 processes | 4 | 4 | ABORT | PASS |
|
||||||
|
| Filter bug - 100 processes | 100 | 100 | ABORT | PASS |
|
||||||
|
|
||||||
|
### YAML Validation
|
||||||
|
|
||||||
|
All workflow files passed YAML syntax validation using `python -c "import yaml; yaml.safe_load(open(...))"`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Documentation Updates
|
||||||
|
|
||||||
|
### CLAUDE.md Updates
|
||||||
|
|
||||||
|
Added new section at line 293: **PM2 Process Isolation Incidents**
|
||||||
|
|
||||||
|
Contains:
|
||||||
|
|
||||||
|
- Reference to the 2026-02-17 incident
|
||||||
|
- Impact summary
|
||||||
|
- Prevention measures list
|
||||||
|
- Response instructions
|
||||||
|
- Links to related documentation
|
||||||
|
|
||||||
|
### docs/README.md
|
||||||
|
|
||||||
|
Added incident report reference under **Operations > Incident Reports**.
|
||||||
|
|
||||||
|
### Cross-References Verified
|
||||||
|
|
||||||
|
| Document | Reference | Status |
|
||||||
|
| --------------- | --------------------------------------- | ------ |
|
||||||
|
| CLAUDE.md | PM2-INCIDENT-RESPONSE.md | Valid |
|
||||||
|
| CLAUDE.md | INCIDENT-2026-02-17-PM2-PROCESS-KILL.md | Valid |
|
||||||
|
| Incident Report | CLAUDE.md PM2 section | Valid |
|
||||||
|
| Incident Report | PM2-INCIDENT-RESPONSE.md | Valid |
|
||||||
|
| docs/README.md | INCIDENT-2026-02-17-PM2-PROCESS-KILL.md | Valid |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Lessons Learned
|
||||||
|
|
||||||
|
### Technical Lessons
|
||||||
|
|
||||||
|
1. **Filter logic alone is not sufficient** - Even correct filters can be bypassed if an older version of the script is executed.
|
||||||
|
|
||||||
|
2. **Workflow caching is a real risk** - CI/CD runners may cache workflow files, leading to stale versions being executed.
|
||||||
|
|
||||||
|
3. **Defense-in-depth is essential for destructive operations** - Multiple layers of validation catch failures that single-point checks miss.
|
||||||
|
|
||||||
|
4. **Visibility enables diagnosis** - Pre/post state logging makes root cause analysis possible.
|
||||||
|
|
||||||
|
5. **Automatic abort prevents cascading failures** - The process count validation could have prevented the incident entirely.
|
||||||
|
|
||||||
|
### Process Lessons
|
||||||
|
|
||||||
|
1. **Shared PM2 daemons are risky** - Multiple applications sharing a PM2 daemon create cross-application dependencies.
|
||||||
|
|
||||||
|
2. **Documentation should include failure modes** - CLAUDE.md now explicitly documents what can go wrong and how to respond.
|
||||||
|
|
||||||
|
3. **Runbooks save time during incidents** - The incident response runbook provides step-by-step guidance when time is critical.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Future Considerations
|
||||||
|
|
||||||
|
### Not Implemented (Potential Future Work)
|
||||||
|
|
||||||
|
1. **PM2 Namespacing** - Use PM2's native namespace feature to completely isolate environments.
|
||||||
|
|
||||||
|
2. **Separate PM2 Daemons** - Run one PM2 daemon per application to eliminate cross-application risk.
|
||||||
|
|
||||||
|
3. **Deployment Locks** - Implement mutex-style locks to prevent concurrent deployments.
|
||||||
|
|
||||||
|
4. **Workflow Version Verification** - Add a pre-flight check that compares workflow hash against expected value.
|
||||||
|
|
||||||
|
5. **Automated Rollback** - Implement automatic process restoration if safeguards detect a problem.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Related Documentation
|
||||||
|
|
||||||
|
- **ADR-061**: [PM2 Process Isolation Safeguards](../../adr/0061-pm2-process-isolation-safeguards.md)
|
||||||
|
- **Incident Report**: [INCIDENT-2026-02-17-PM2-PROCESS-KILL.md](../../operations/INCIDENT-2026-02-17-PM2-PROCESS-KILL.md)
|
||||||
|
- **Response Runbook**: [PM2-INCIDENT-RESPONSE.md](../../operations/PM2-INCIDENT-RESPONSE.md)
|
||||||
|
- **CLAUDE.md Section**: [PM2 Process Isolation Incidents](../../../CLAUDE.md#pm2-process-isolation-incidents)
|
||||||
|
- **Test Artifact**: [test-pm2-safeguard-logic.js](../../../tests/qa/test-pm2-safeguard-logic.js)
|
||||||
|
- **ADR-014**: [Containerization and Deployment Strategy](../../adr/0014-containerization-and-deployment-strategy.md)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Appendix: Workflow Changes Summary
|
||||||
|
|
||||||
|
### deploy-to-prod.yml
|
||||||
|
|
||||||
|
```diff
|
||||||
|
+ - name: Log Workflow Metadata
|
||||||
|
+ run: |
|
||||||
|
+ echo "=== WORKFLOW METADATA ==="
|
||||||
|
+ echo "Workflow file: deploy-to-prod.yml"
|
||||||
|
+ echo "Workflow file hash: $(sha256sum .gitea/workflows/deploy-to-prod.yml | cut -d' ' -f1)"
|
||||||
|
+ ...
|
||||||
|
|
||||||
|
- name: Install Backend Dependencies and Restart Production Server
|
||||||
|
run: |
|
||||||
|
+ # === PRE-CLEANUP PM2 STATE LOGGING ===
|
||||||
|
+ echo "=== PRE-CLEANUP PM2 STATE ==="
|
||||||
|
+ pm2 jlist
|
||||||
|
+ echo "=== END PRE-CLEANUP STATE ==="
|
||||||
|
+
|
||||||
|
# --- Cleanup Errored Processes with Defense-in-Depth Safeguards ---
|
||||||
|
node -e "
|
||||||
|
...
|
||||||
|
+ // SAFEGUARD 1: Process count validation
|
||||||
|
+ if (targetProcesses.length === totalProcesses && totalProcesses > 3) {
|
||||||
|
+ console.error('SAFETY ABORT: Filter would delete ALL processes!');
|
||||||
|
+ process.exit(1);
|
||||||
|
+ }
|
||||||
|
+
|
||||||
|
+ // SAFEGUARD 2: Explicit name verification
|
||||||
|
+ console.log('Found ' + targetProcesses.length + ' PRODUCTION processes to clean:');
|
||||||
|
+ targetProcesses.forEach(p => {
|
||||||
|
+ console.log(' - ' + p.name + ' (status: ' + p.pm2_env.status + ')');
|
||||||
|
+ });
|
||||||
|
...
|
||||||
|
"
|
||||||
|
+
|
||||||
|
+ # === POST-CLEANUP VERIFICATION ===
|
||||||
|
+ echo "=== POST-CLEANUP VERIFICATION ==="
|
||||||
|
+ pm2 jlist | node -e "..."
|
||||||
|
+ echo "=== END POST-CLEANUP VERIFICATION ==="
|
||||||
|
```
|
||||||
|
|
||||||
|
Similar changes were applied to `deploy-to-test.yml` and `manual-deploy-major.yml`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Session Participants
|
||||||
|
|
||||||
|
| Role | Agent Type | Responsibility |
|
||||||
|
| ------------ | ------------------------- | ------------------------------------- |
|
||||||
|
| Orchestrator | Main Claude | Session coordination and delegation |
|
||||||
|
| Planner | planner subagent | Incident analysis and solution design |
|
||||||
|
| Documenter | describer-for-ai subagent | Incident report creation |
|
||||||
|
| Coder #1 | coder subagent | Workflow safeguard implementation |
|
||||||
|
| Coder #2 | coder subagent | Incident response runbook creation |
|
||||||
|
| Coder #3 | coder subagent | CLAUDE.md updates |
|
||||||
|
| Tester | tester subagent | Comprehensive validation |
|
||||||
|
| Archivist | Lead Technical Archivist | Final documentation |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Revision History
|
||||||
|
|
||||||
|
| Date | Author | Change |
|
||||||
|
| ---------- | ------------------------ | ----------------------- |
|
||||||
|
| 2026-02-17 | Lead Technical Archivist | Initial session summary |
|
||||||
269
docs/operations/INCIDENT-2026-02-17-PM2-PROCESS-KILL.md
Normal file
269
docs/operations/INCIDENT-2026-02-17-PM2-PROCESS-KILL.md
Normal file
@@ -0,0 +1,269 @@
|
|||||||
|
# Incident Report: PM2 Process Kill During v0.15.0 Deployment
|
||||||
|
|
||||||
|
**Date**: 2026-02-17
|
||||||
|
**Severity**: Critical
|
||||||
|
**Status**: Mitigated - Safeguards Implemented
|
||||||
|
**Affected Systems**: All PM2-managed applications on projectium.com server
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Resolution Summary
|
||||||
|
|
||||||
|
**Safeguards implemented on 2026-02-17** to prevent recurrence:
|
||||||
|
|
||||||
|
1. Workflow metadata logging (audit trail)
|
||||||
|
2. Pre-cleanup PM2 state logging (forensics)
|
||||||
|
3. Process count validation with SAFETY ABORT (automatic prevention)
|
||||||
|
4. Explicit name verification (visibility)
|
||||||
|
5. Post-cleanup verification (environment isolation check)
|
||||||
|
|
||||||
|
**Documentation created**:
|
||||||
|
|
||||||
|
- [PM2 Incident Response Runbook](PM2-INCIDENT-RESPONSE.md)
|
||||||
|
- [PM2 Safeguards Session Summary](../archive/sessions/PM2_SAFEGUARDS_SESSION_2026-02-17.md)
|
||||||
|
- CLAUDE.md updated with [PM2 Process Isolation Incidents section](../../CLAUDE.md#pm2-process-isolation-incidents)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
During v0.15.0 production deployment, ALL PM2 processes on the server were terminated, not just flyer-crawler processes. This caused unplanned downtime for other applications including stock-alert.
|
||||||
|
|
||||||
|
## Timeline
|
||||||
|
|
||||||
|
| Time (Approx) | Event |
|
||||||
|
| --------------------- | ---------------------------------------------------------------- |
|
||||||
|
| 2026-02-17 ~07:40 UTC | v0.15.0 production deployment triggered via `deploy-to-prod.yml` |
|
||||||
|
| Unknown | All PM2 processes killed (flyer-crawler AND other apps) |
|
||||||
|
| Unknown | Incident discovered - stock-alert down |
|
||||||
|
| 2026-02-17 | Investigation initiated |
|
||||||
|
| 2026-02-17 | Defense-in-depth safeguards implemented in all workflows |
|
||||||
|
| 2026-02-17 | Incident response runbook created |
|
||||||
|
| 2026-02-17 | Status changed to Mitigated |
|
||||||
|
|
||||||
|
## Impact
|
||||||
|
|
||||||
|
- **Affected Applications**: All PM2-managed processes on projectium.com
|
||||||
|
- flyer-crawler-api, flyer-crawler-worker, flyer-crawler-analytics-worker (expected)
|
||||||
|
- stock-alert (NOT expected - collateral damage)
|
||||||
|
- Potentially other unidentified applications
|
||||||
|
- **Downtime Duration**: TBD
|
||||||
|
- **User Impact**: Service unavailability for all affected applications
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Investigation Findings
|
||||||
|
|
||||||
|
### Deployment Workflow Analysis
|
||||||
|
|
||||||
|
All deployment workflows were reviewed for PM2 process isolation:
|
||||||
|
|
||||||
|
| Workflow | PM2 Isolation | Implementation |
|
||||||
|
| ------------------------- | -------------- | ------------------------------------------------------------------------------------------------- |
|
||||||
|
| `deploy-to-prod.yml` | Whitelist | `prodProcesses = ['flyer-crawler-api', 'flyer-crawler-worker', 'flyer-crawler-analytics-worker']` |
|
||||||
|
| `deploy-to-test.yml` | Pattern | `p.name.endsWith('-test')` |
|
||||||
|
| `manual-deploy-major.yml` | Whitelist | Same as deploy-to-prod |
|
||||||
|
| `manual-db-restore.yml` | Explicit names | `pm2 stop flyer-crawler-api flyer-crawler-worker flyer-crawler-analytics-worker` |
|
||||||
|
|
||||||
|
### Fix Commit Already In Place
|
||||||
|
|
||||||
|
The PM2 process isolation fix was implemented in commit `b6a62a0` (2026-02-13):
|
||||||
|
|
||||||
|
```
|
||||||
|
commit b6a62a036f39ac895271402a61e5cc4227369de7
|
||||||
|
Author: Torben Sorensen <torben.sorensen@gmail.com>
|
||||||
|
Date: Fri Feb 13 10:19:28 2026 -0800
|
||||||
|
|
||||||
|
be specific about pm2 processes
|
||||||
|
|
||||||
|
Files modified:
|
||||||
|
.gitea/workflows/deploy-to-prod.yml
|
||||||
|
.gitea/workflows/deploy-to-test.yml
|
||||||
|
.gitea/workflows/manual-db-restore.yml
|
||||||
|
.gitea/workflows/manual-deploy-major.yml
|
||||||
|
CLAUDE.md
|
||||||
|
```
|
||||||
|
|
||||||
|
### v0.15.0 Release Contains Fix
|
||||||
|
|
||||||
|
Confirmed: v0.15.0 (commit `93ad624`, 2026-02-18) includes the fix commit:
|
||||||
|
|
||||||
|
```
|
||||||
|
93ad624 ci: Bump version to 0.15.0 for production release [skip ci]
|
||||||
|
...
|
||||||
|
b6a62a0 be specific about pm2 processes <-- Fix commit included
|
||||||
|
```
|
||||||
|
|
||||||
|
### Current Workflow PM2 Commands
|
||||||
|
|
||||||
|
**Production Deploy (`deploy-to-prod.yml` line 170)**:
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
const prodProcesses = [
|
||||||
|
'flyer-crawler-api',
|
||||||
|
'flyer-crawler-worker',
|
||||||
|
'flyer-crawler-analytics-worker',
|
||||||
|
];
|
||||||
|
list.forEach((p) => {
|
||||||
|
if (
|
||||||
|
(p.pm2_env.status === 'errored' || p.pm2_env.status === 'stopped') &&
|
||||||
|
prodProcesses.includes(p.name)
|
||||||
|
) {
|
||||||
|
exec('pm2 delete ' + p.pm2_env.pm_id);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
**Test Deploy (`deploy-to-test.yml` line 100)**:
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
list.forEach((p) => {
|
||||||
|
if (p.name && p.name.endsWith('-test')) {
|
||||||
|
exec('pm2 delete ' + p.pm2_env.pm_id);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
Both implementations have proper name filtering and should NOT affect non-flyer-crawler processes.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Discrepancy Analysis
|
||||||
|
|
||||||
|
### Key Mystery
|
||||||
|
|
||||||
|
**If the fixes are in place, why did ALL processes get killed?**
|
||||||
|
|
||||||
|
### Possible Explanations
|
||||||
|
|
||||||
|
#### 1. Workflow Version Mismatch (HIGH PROBABILITY)
|
||||||
|
|
||||||
|
**Hypothesis**: Gitea runner cached an older version of the workflow file.
|
||||||
|
|
||||||
|
- Gitea Actions may cache workflow definitions
|
||||||
|
- The runner might have executed an older version without the fix
|
||||||
|
- Need to verify: What version of `deploy-to-prod.yml` actually executed?
|
||||||
|
|
||||||
|
**Investigation Required**:
|
||||||
|
|
||||||
|
- Check Gitea workflow execution logs for actual script content
|
||||||
|
- Verify runner workflow caching behavior
|
||||||
|
- Compare executed workflow vs repository version
|
||||||
|
|
||||||
|
#### 2. Concurrent Workflow Execution (MEDIUM PROBABILITY)
|
||||||
|
|
||||||
|
**Hypothesis**: Another workflow ran simultaneously with destructive PM2 commands.
|
||||||
|
|
||||||
|
Workflows with potential issues:
|
||||||
|
|
||||||
|
- `manual-db-reset-prod.yml` - Does NOT restart PM2 (schema reset only)
|
||||||
|
- `manual-redis-flush-prod.yml` - Does NOT touch PM2
|
||||||
|
- Test deployment concurrent with prod deployment
|
||||||
|
|
||||||
|
**Investigation Required**:
|
||||||
|
|
||||||
|
- Check Gitea Actions history for concurrent workflow runs
|
||||||
|
- Review timestamps of all workflow executions on 2026-02-17
|
||||||
|
|
||||||
|
#### 3. Manual SSH Command (MEDIUM PROBABILITY)
|
||||||
|
|
||||||
|
**Hypothesis**: Someone SSH'd to the server and ran `pm2 stop all` or `pm2 delete all` manually.
|
||||||
|
|
||||||
|
**Investigation Required**:
|
||||||
|
|
||||||
|
- Check server shell history (if available)
|
||||||
|
- Review any maintenance windows or manual interventions
|
||||||
|
- Ask team members about manual actions
|
||||||
|
|
||||||
|
#### 4. PM2 Internal Issue (LOW PROBABILITY)
|
||||||
|
|
||||||
|
**Hypothesis**: PM2 daemon crash or corruption caused all processes to stop.
|
||||||
|
|
||||||
|
**Investigation Required**:
|
||||||
|
|
||||||
|
- Check PM2 daemon logs on server
|
||||||
|
- Look for OOM killer events in system logs
|
||||||
|
- Check disk space issues during deployment
|
||||||
|
|
||||||
|
#### 5. Script Execution Error (LOW PROBABILITY)
|
||||||
|
|
||||||
|
**Hypothesis**: JavaScript parsing error caused the filtering logic to be bypassed.
|
||||||
|
|
||||||
|
**Investigation Required**:
|
||||||
|
|
||||||
|
- Review workflow execution logs for JavaScript errors
|
||||||
|
- Test the inline Node.js scripts locally
|
||||||
|
- Check for shell escaping issues
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Documentation/Code Gaps Identified
|
||||||
|
|
||||||
|
### CLAUDE.md Documentation
|
||||||
|
|
||||||
|
The PM2 isolation rules are documented in `CLAUDE.md`, but:
|
||||||
|
|
||||||
|
- Documentation uses `pm2 restart all` in the Quick Reference table (for dev container - acceptable)
|
||||||
|
- Multiple docs still reference `pm2 restart all` without environment context
|
||||||
|
- No incident response runbook for PM2 issues
|
||||||
|
|
||||||
|
### Workflow Gaps
|
||||||
|
|
||||||
|
1. **No Workflow Audit Trail**: No logging of which exact workflow version executed
|
||||||
|
2. **No Pre-deployment Verification**: Workflows don't log PM2 state before modifications
|
||||||
|
3. **No Cross-Application Impact Assessment**: No mechanism to detect/warn about other apps
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Next Steps for Root Cause Analysis
|
||||||
|
|
||||||
|
### Immediate (Priority 1)
|
||||||
|
|
||||||
|
1. [ ] Retrieve Gitea Actions execution logs for v0.15.0 deployment
|
||||||
|
2. [ ] Extract actual executed workflow content from logs
|
||||||
|
3. [ ] Check for concurrent workflow executions on 2026-02-17
|
||||||
|
4. [ ] Review server PM2 daemon logs around incident time
|
||||||
|
|
||||||
|
### Short-term (Priority 2)
|
||||||
|
|
||||||
|
5. [x] Implement pre-deployment PM2 state logging in workflows
|
||||||
|
6. [x] Add workflow version hash logging for audit trail
|
||||||
|
7. [x] Create incident response runbook for PM2/deployment issues
|
||||||
|
|
||||||
|
### Long-term (Priority 3)
|
||||||
|
|
||||||
|
8. [ ] Evaluate PM2 namespacing for complete process isolation
|
||||||
|
9. [ ] Consider separate PM2 daemon per application
|
||||||
|
10. [ ] Implement deployment monitoring/alerting
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Related Documentation
|
||||||
|
|
||||||
|
- [CLAUDE.md - PM2 Process Isolation](../../CLAUDE.md) (Critical Rules section)
|
||||||
|
- [ADR-014: Containerization and Deployment Strategy](../adr/0014-containerization-and-deployment-strategy.md)
|
||||||
|
- [Deployment Guide](./DEPLOYMENT.md)
|
||||||
|
- Workflow files in `.gitea/workflows/`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Appendix: Commit Timeline
|
||||||
|
|
||||||
|
```
|
||||||
|
93ad624 ci: Bump version to 0.15.0 for production release [skip ci] <-- v0.15.0 release
|
||||||
|
7dd4f21 ci: Bump version to 0.14.4 [skip ci]
|
||||||
|
174b637 even more typescript fixes
|
||||||
|
4f80baf ci: Bump version to 0.14.3 [skip ci]
|
||||||
|
8450b5e Generate TSOA Spec and Routes
|
||||||
|
e4d830a ci: Bump version to 0.14.2 [skip ci]
|
||||||
|
b6a62a0 be specific about pm2 processes <-- PM2 fix commit
|
||||||
|
2d2cd52 Massive Dependency Modernization Project
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Revision History
|
||||||
|
|
||||||
|
| Date | Author | Change |
|
||||||
|
| ---------- | ------------------ | ----------------------- |
|
||||||
|
| 2026-02-17 | Investigation Team | Initial incident report |
|
||||||
818
docs/operations/PM2-INCIDENT-RESPONSE.md
Normal file
818
docs/operations/PM2-INCIDENT-RESPONSE.md
Normal file
@@ -0,0 +1,818 @@
|
|||||||
|
# PM2 Incident Response Runbook
|
||||||
|
|
||||||
|
**Purpose**: Step-by-step procedures for responding to PM2 process isolation incidents on the projectium.com server.
|
||||||
|
|
||||||
|
**Audience**: On-call responders, system administrators, developers with server access.
|
||||||
|
|
||||||
|
**Last updated**: 2026-02-17
|
||||||
|
|
||||||
|
**Related documentation**:
|
||||||
|
|
||||||
|
- [CLAUDE.md - PM2 Process Isolation Rules](../../CLAUDE.md)
|
||||||
|
- [Incident Report: 2026-02-17](INCIDENT-2026-02-17-PM2-PROCESS-KILL.md)
|
||||||
|
- [Monitoring Guide](MONITORING.md)
|
||||||
|
- [Deployment Guide](DEPLOYMENT.md)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Table of Contents
|
||||||
|
|
||||||
|
1. [Quick Reference](#quick-reference)
|
||||||
|
2. [Detection](#detection)
|
||||||
|
3. [Initial Assessment](#initial-assessment)
|
||||||
|
4. [Immediate Response](#immediate-response)
|
||||||
|
5. [Process Restoration](#process-restoration)
|
||||||
|
6. [Root Cause Investigation](#root-cause-investigation)
|
||||||
|
7. [Communication Templates](#communication-templates)
|
||||||
|
8. [Prevention Measures](#prevention-measures)
|
||||||
|
9. [Contact Information](#contact-information)
|
||||||
|
10. [Post-Incident Review](#post-incident-review)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Quick Reference
|
||||||
|
|
||||||
|
### PM2 Process Inventory
|
||||||
|
|
||||||
|
| Application | Environment | Process Names | Config File | Directory |
|
||||||
|
| ------------- | ----------- | -------------------------------------------------------------------------------------------- | --------------------------- | -------------------------------------------- |
|
||||||
|
| Flyer Crawler | Production | `flyer-crawler-api`, `flyer-crawler-worker`, `flyer-crawler-analytics-worker` | `ecosystem.config.cjs` | `/var/www/flyer-crawler.projectium.com` |
|
||||||
|
| Flyer Crawler | Test | `flyer-crawler-api-test`, `flyer-crawler-worker-test`, `flyer-crawler-analytics-worker-test` | `ecosystem-test.config.cjs` | `/var/www/flyer-crawler-test.projectium.com` |
|
||||||
|
| Stock Alert | Production | `stock-alert-*` | (varies) | `/var/www/stock-alert.projectium.com` |
|
||||||
|
|
||||||
|
### Critical Commands
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Check PM2 status
|
||||||
|
pm2 list
|
||||||
|
|
||||||
|
# Check specific process
|
||||||
|
pm2 show flyer-crawler-api
|
||||||
|
|
||||||
|
# View recent logs
|
||||||
|
pm2 logs --lines 50
|
||||||
|
|
||||||
|
# Restart specific processes (SAFE)
|
||||||
|
pm2 restart flyer-crawler-api flyer-crawler-worker flyer-crawler-analytics-worker
|
||||||
|
|
||||||
|
# DO NOT USE (affects ALL apps)
|
||||||
|
# pm2 restart all <-- DANGEROUS
|
||||||
|
# pm2 stop all <-- DANGEROUS
|
||||||
|
# pm2 delete all <-- DANGEROUS
|
||||||
|
```
|
||||||
|
|
||||||
|
### Severity Classification
|
||||||
|
|
||||||
|
| Severity | Criteria | Response Time | Example |
|
||||||
|
| ----------------- | --------------------------------------------- | ------------------- | ----------------------------------------------- |
|
||||||
|
| **P1 - Critical** | Multiple applications down, production impact | Immediate (< 5 min) | All PM2 processes killed |
|
||||||
|
| **P2 - High** | Single application down, production impact | < 15 min | Flyer Crawler prod down, Stock Alert unaffected |
|
||||||
|
| **P3 - Medium** | Test environment only, no production impact | < 1 hour | Test processes killed, production unaffected |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Detection
|
||||||
|
|
||||||
|
### How to Identify a PM2 Incident
|
||||||
|
|
||||||
|
**Automated Indicators**:
|
||||||
|
|
||||||
|
- Health check failures on `/api/health/ready`
|
||||||
|
- Monitoring alerts (UptimeRobot, etc.)
|
||||||
|
- Bugsink showing connection errors
|
||||||
|
- NGINX returning 502 Bad Gateway
|
||||||
|
|
||||||
|
**User-Reported Symptoms**:
|
||||||
|
|
||||||
|
- "The site is down"
|
||||||
|
- "I can't log in"
|
||||||
|
- "Pages are loading slowly then timing out"
|
||||||
|
- "I see a 502 error"
|
||||||
|
|
||||||
|
**Manual Discovery**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# SSH to server
|
||||||
|
ssh gitea-runner@projectium.com
|
||||||
|
|
||||||
|
# Check if PM2 is running
|
||||||
|
pm2 list
|
||||||
|
|
||||||
|
# Expected output shows processes
|
||||||
|
# If empty or all errored = incident
|
||||||
|
```
|
||||||
|
|
||||||
|
### Incident Signature: Process Isolation Violation
|
||||||
|
|
||||||
|
When a PM2 incident is caused by process isolation failure, you will see:
|
||||||
|
|
||||||
|
```text
|
||||||
|
# Expected state (normal):
|
||||||
|
+-----------------------------------+----+-----+---------+-------+
|
||||||
|
| App name | id |mode | status | cpu |
|
||||||
|
+-----------------------------------+----+-----+---------+-------+
|
||||||
|
| flyer-crawler-api | 0 |clust| online | 0% |
|
||||||
|
| flyer-crawler-worker | 1 |fork | online | 0% |
|
||||||
|
| flyer-crawler-analytics-worker | 2 |fork | online | 0% |
|
||||||
|
| flyer-crawler-api-test | 3 |fork | online | 0% |
|
||||||
|
| flyer-crawler-worker-test | 4 |fork | online | 0% |
|
||||||
|
| flyer-crawler-analytics-worker-test| 5 |fork | online | 0% |
|
||||||
|
| stock-alert-api | 6 |fork | online | 0% |
|
||||||
|
+-----------------------------------+----+-----+---------+-------+
|
||||||
|
|
||||||
|
# Incident state (isolation violation):
|
||||||
|
# All processes missing or errored - not just one app
|
||||||
|
+-----------------------------------+----+-----+---------+-------+
|
||||||
|
| App name | id |mode | status | cpu |
|
||||||
|
+-----------------------------------+----+-----+---------+-------+
|
||||||
|
# (empty or all processes errored/stopped)
|
||||||
|
+-----------------------------------+----+-----+---------+-------+
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Initial Assessment
|
||||||
|
|
||||||
|
### Step 1: Gather Information (2 minutes)
|
||||||
|
|
||||||
|
Run these commands and capture output:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Check PM2 status
|
||||||
|
pm2 list
|
||||||
|
|
||||||
|
# 2. Check PM2 daemon status
|
||||||
|
pm2 ping
|
||||||
|
|
||||||
|
# 3. Check recent PM2 logs
|
||||||
|
pm2 logs --lines 20 --nostream
|
||||||
|
|
||||||
|
# 4. Check system status
|
||||||
|
systemctl status pm2-gitea-runner --no-pager
|
||||||
|
|
||||||
|
# 5. Check disk space
|
||||||
|
df -h /
|
||||||
|
|
||||||
|
# 6. Check memory
|
||||||
|
free -h
|
||||||
|
|
||||||
|
# 7. Check recent deployments (in app directory)
|
||||||
|
cd /var/www/flyer-crawler.projectium.com
|
||||||
|
git log --oneline -5
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 2: Determine Scope
|
||||||
|
|
||||||
|
| Question | Command | Impact Level |
|
||||||
|
| ------------------------ | ---------------------------------------------------------------- | ------------------------------- |
|
||||||
|
| How many apps affected? | `pm2 list` | Count missing/errored processes |
|
||||||
|
| Is production down? | `curl https://flyer-crawler.projectium.com/api/health/ping` | Yes/No |
|
||||||
|
| Is test down? | `curl https://flyer-crawler-test.projectium.com/api/health/ping` | Yes/No |
|
||||||
|
| Are other apps affected? | `pm2 list \| grep stock-alert` | Yes/No |
|
||||||
|
|
||||||
|
### Step 3: Classify Severity
|
||||||
|
|
||||||
|
```text
|
||||||
|
Decision Tree:
|
||||||
|
|
||||||
|
Production app(s) down?
|
||||||
|
|
|
||||||
|
+-- YES: Multiple apps affected?
|
||||||
|
| |
|
||||||
|
| +-- YES --> P1 CRITICAL (multiple apps down)
|
||||||
|
| |
|
||||||
|
| +-- NO --> P2 HIGH (single app down)
|
||||||
|
|
|
||||||
|
+-- NO: Test environment only?
|
||||||
|
|
|
||||||
|
+-- YES --> P3 MEDIUM
|
||||||
|
|
|
||||||
|
+-- NO --> Investigate further
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 4: Document Initial State
|
||||||
|
|
||||||
|
Capture this information before making any changes:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Save PM2 state to file
|
||||||
|
pm2 jlist > /tmp/pm2-incident-$(date +%Y%m%d-%H%M%S).json
|
||||||
|
|
||||||
|
# Save system state
|
||||||
|
{
|
||||||
|
echo "=== PM2 List ==="
|
||||||
|
pm2 list
|
||||||
|
echo ""
|
||||||
|
echo "=== Disk Space ==="
|
||||||
|
df -h
|
||||||
|
echo ""
|
||||||
|
echo "=== Memory ==="
|
||||||
|
free -h
|
||||||
|
echo ""
|
||||||
|
echo "=== Recent Git Commits ==="
|
||||||
|
cd /var/www/flyer-crawler.projectium.com && git log --oneline -5
|
||||||
|
} > /tmp/incident-state-$(date +%Y%m%d-%H%M%S).txt
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Immediate Response
|
||||||
|
|
||||||
|
### Priority 1: Stop Ongoing Deployments
|
||||||
|
|
||||||
|
If a deployment is currently running:
|
||||||
|
|
||||||
|
1. Check Gitea Actions for running workflows
|
||||||
|
2. Cancel any in-progress deployment workflows
|
||||||
|
3. Do NOT start new deployments until incident resolved
|
||||||
|
|
||||||
|
### Priority 2: Assess Which Processes Are Down
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Get list of processes and their status
|
||||||
|
pm2 list
|
||||||
|
|
||||||
|
# Check which processes exist but are errored/stopped
|
||||||
|
pm2 jlist | jq '.[] | {name, status: .pm2_env.status}'
|
||||||
|
```
|
||||||
|
|
||||||
|
### Priority 3: Establish Order of Restoration
|
||||||
|
|
||||||
|
Restore in this order (production first, critical path first):
|
||||||
|
|
||||||
|
| Priority | Process | Rationale |
|
||||||
|
| -------- | ------------------------------------- | ------------------------------------ |
|
||||||
|
| 1 | `flyer-crawler-api` | Production API - highest user impact |
|
||||||
|
| 2 | `flyer-crawler-worker` | Production background jobs |
|
||||||
|
| 3 | `flyer-crawler-analytics-worker` | Production analytics |
|
||||||
|
| 4 | `stock-alert-*` | Other production apps |
|
||||||
|
| 5 | `flyer-crawler-api-test` | Test environment |
|
||||||
|
| 6 | `flyer-crawler-worker-test` | Test background jobs |
|
||||||
|
| 7 | `flyer-crawler-analytics-worker-test` | Test analytics |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Process Restoration
|
||||||
|
|
||||||
|
### Scenario A: Flyer Crawler Production Processes Missing
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Navigate to production directory
|
||||||
|
cd /var/www/flyer-crawler.projectium.com
|
||||||
|
|
||||||
|
# Start production processes
|
||||||
|
pm2 start ecosystem.config.cjs
|
||||||
|
|
||||||
|
# Verify processes started
|
||||||
|
pm2 list
|
||||||
|
|
||||||
|
# Check health endpoint
|
||||||
|
curl -s http://localhost:3001/api/health/ready | jq .
|
||||||
|
```
|
||||||
|
|
||||||
|
### Scenario B: Flyer Crawler Test Processes Missing
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Navigate to test directory
|
||||||
|
cd /var/www/flyer-crawler-test.projectium.com
|
||||||
|
|
||||||
|
# Start test processes
|
||||||
|
pm2 start ecosystem-test.config.cjs
|
||||||
|
|
||||||
|
# Verify processes started
|
||||||
|
pm2 list
|
||||||
|
|
||||||
|
# Check health endpoint
|
||||||
|
curl -s http://localhost:3002/api/health/ready | jq .
|
||||||
|
```
|
||||||
|
|
||||||
|
### Scenario C: Stock Alert Processes Missing
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Navigate to stock-alert directory
|
||||||
|
cd /var/www/stock-alert.projectium.com
|
||||||
|
|
||||||
|
# Start processes (adjust config file name as needed)
|
||||||
|
pm2 start ecosystem.config.cjs
|
||||||
|
|
||||||
|
# Verify processes started
|
||||||
|
pm2 list
|
||||||
|
```
|
||||||
|
|
||||||
|
### Scenario D: All Processes Missing
|
||||||
|
|
||||||
|
Execute restoration in priority order:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Flyer Crawler Production (highest priority)
|
||||||
|
cd /var/www/flyer-crawler.projectium.com
|
||||||
|
pm2 start ecosystem.config.cjs
|
||||||
|
|
||||||
|
# Verify production is healthy before continuing
|
||||||
|
curl -s http://localhost:3001/api/health/ready | jq '.data.status'
|
||||||
|
# Should return "healthy"
|
||||||
|
|
||||||
|
# 2. Stock Alert Production
|
||||||
|
cd /var/www/stock-alert.projectium.com
|
||||||
|
pm2 start ecosystem.config.cjs
|
||||||
|
|
||||||
|
# 3. Flyer Crawler Test (lower priority)
|
||||||
|
cd /var/www/flyer-crawler-test.projectium.com
|
||||||
|
pm2 start ecosystem-test.config.cjs
|
||||||
|
|
||||||
|
# 4. Save PM2 process list
|
||||||
|
pm2 save
|
||||||
|
|
||||||
|
# 5. Final verification
|
||||||
|
pm2 list
|
||||||
|
```
|
||||||
|
|
||||||
|
### Health Check Verification
|
||||||
|
|
||||||
|
After restoration, verify each application:
|
||||||
|
|
||||||
|
**Flyer Crawler Production**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# API health
|
||||||
|
curl -s https://flyer-crawler.projectium.com/api/health/ready | jq '.data.status'
|
||||||
|
# Expected: "healthy"
|
||||||
|
|
||||||
|
# Check all services
|
||||||
|
curl -s https://flyer-crawler.projectium.com/api/health/ready | jq '.data.services'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Flyer Crawler Test**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -s https://flyer-crawler-test.projectium.com/api/health/ready | jq '.data.status'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Stock Alert**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Adjust URL as appropriate for stock-alert
|
||||||
|
curl -s https://stock-alert.projectium.com/api/health/ready | jq '.data.status'
|
||||||
|
```
|
||||||
|
|
||||||
|
### Verification Checklist
|
||||||
|
|
||||||
|
After restoration, confirm:
|
||||||
|
|
||||||
|
- [ ] `pm2 list` shows all expected processes as `online`
|
||||||
|
- [ ] Production health check returns `healthy`
|
||||||
|
- [ ] Test health check returns `healthy` (if applicable)
|
||||||
|
- [ ] No processes showing high restart count
|
||||||
|
- [ ] No processes showing `errored` or `stopped` status
|
||||||
|
- [ ] PM2 process list saved: `pm2 save`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Root Cause Investigation
|
||||||
|
|
||||||
|
### Step 1: Check Workflow Execution Logs
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Find recent Gitea Actions runs
|
||||||
|
# (Access via Gitea web UI: Repository > Actions > Recent Runs)
|
||||||
|
|
||||||
|
# Look for these workflows:
|
||||||
|
# - deploy-to-prod.yml
|
||||||
|
# - deploy-to-test.yml
|
||||||
|
# - manual-deploy-major.yml
|
||||||
|
# - manual-db-restore.yml
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 2: Check PM2 Daemon Logs
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# PM2 daemon logs
|
||||||
|
tail -100 ~/.pm2/pm2.log
|
||||||
|
|
||||||
|
# PM2 process-specific logs
|
||||||
|
ls -la ~/.pm2/logs/
|
||||||
|
|
||||||
|
# Recent API logs
|
||||||
|
tail -100 ~/.pm2/logs/flyer-crawler-api-out.log
|
||||||
|
tail -100 ~/.pm2/logs/flyer-crawler-api-error.log
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 3: Check System Logs
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# System journal for PM2 service
|
||||||
|
journalctl -u pm2-gitea-runner -n 100 --no-pager
|
||||||
|
|
||||||
|
# Kernel messages (OOM killer, etc.)
|
||||||
|
journalctl -k -n 50 --no-pager | grep -i "killed\|oom\|memory"
|
||||||
|
|
||||||
|
# Authentication logs (unauthorized access)
|
||||||
|
tail -50 /var/log/auth.log
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 4: Git History Analysis
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Recent commits to deployment workflows
|
||||||
|
cd /var/www/flyer-crawler.projectium.com
|
||||||
|
git log --oneline -20 -- .gitea/workflows/
|
||||||
|
|
||||||
|
# Check what changed in PM2 configs
|
||||||
|
git log --oneline -10 -- ecosystem.config.cjs ecosystem-test.config.cjs
|
||||||
|
|
||||||
|
# Diff against last known good state
|
||||||
|
git diff <last-good-commit> -- .gitea/workflows/ ecosystem*.cjs
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 5: Timing Correlation
|
||||||
|
|
||||||
|
Create a timeline:
|
||||||
|
|
||||||
|
```text
|
||||||
|
| Time (UTC) | Event | Source |
|
||||||
|
|------------|-------|--------|
|
||||||
|
| XX:XX | Last successful health check | Monitoring |
|
||||||
|
| XX:XX | Deployment workflow started | Gitea Actions |
|
||||||
|
| XX:XX | First failed health check | Monitoring |
|
||||||
|
| XX:XX | Incident detected | User report / Alert |
|
||||||
|
| XX:XX | Investigation started | On-call |
|
||||||
|
```
|
||||||
|
|
||||||
|
### Common Root Causes
|
||||||
|
|
||||||
|
| Root Cause | Evidence | Prevention |
|
||||||
|
| ---------------------------- | -------------------------------------- | ---------------------------- |
|
||||||
|
| `pm2 stop all` in workflow | Workflow logs show "all" command | Use explicit process names |
|
||||||
|
| `pm2 delete all` in workflow | Empty PM2 list after deploy | Use whitelist-based deletion |
|
||||||
|
| OOM killer | `journalctl -k` shows "Killed process" | Increase memory limits |
|
||||||
|
| Disk space exhaustion | `df -h` shows 100% | Log rotation, cleanup |
|
||||||
|
| Manual intervention | Shell history shows pm2 commands | Document all manual actions |
|
||||||
|
| Concurrent deployments | Multiple workflows at same time | Implement deployment locks |
|
||||||
|
| Workflow caching issue | Old workflow version executed | Force workflow refresh |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Communication Templates
|
||||||
|
|
||||||
|
### Incident Notification (Internal)
|
||||||
|
|
||||||
|
```text
|
||||||
|
Subject: [P1 INCIDENT] PM2 Process Isolation Failure - Multiple Apps Down
|
||||||
|
|
||||||
|
Status: INVESTIGATING
|
||||||
|
Time Detected: YYYY-MM-DD HH:MM UTC
|
||||||
|
Affected Systems: [flyer-crawler-prod, stock-alert-prod, ...]
|
||||||
|
|
||||||
|
Summary:
|
||||||
|
All PM2 processes on projectium.com server were terminated unexpectedly.
|
||||||
|
Multiple production applications are currently down.
|
||||||
|
|
||||||
|
Impact:
|
||||||
|
- flyer-crawler.projectium.com: DOWN
|
||||||
|
- stock-alert.projectium.com: DOWN
|
||||||
|
- [other affected apps]
|
||||||
|
|
||||||
|
Current Actions:
|
||||||
|
- Restoring critical production processes
|
||||||
|
- Investigating root cause
|
||||||
|
|
||||||
|
Next Update: In 15 minutes or upon status change
|
||||||
|
|
||||||
|
Incident Commander: [Name]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Status Update Template
|
||||||
|
|
||||||
|
```text
|
||||||
|
Subject: [P1 INCIDENT] PM2 Process Isolation Failure - UPDATE #N
|
||||||
|
|
||||||
|
Status: [INVESTIGATING | IDENTIFIED | RESTORING | RESOLVED]
|
||||||
|
Time: YYYY-MM-DD HH:MM UTC
|
||||||
|
|
||||||
|
Progress Since Last Update:
|
||||||
|
- [Action taken]
|
||||||
|
- [Discovery made]
|
||||||
|
- [Process restored]
|
||||||
|
|
||||||
|
Current State:
|
||||||
|
- flyer-crawler.projectium.com: [UP|DOWN]
|
||||||
|
- stock-alert.projectium.com: [UP|DOWN]
|
||||||
|
|
||||||
|
Root Cause: [If identified]
|
||||||
|
|
||||||
|
Next Steps:
|
||||||
|
- [Planned action]
|
||||||
|
|
||||||
|
ETA to Resolution: [If known]
|
||||||
|
|
||||||
|
Next Update: In [X] minutes
|
||||||
|
```
|
||||||
|
|
||||||
|
### Resolution Notification
|
||||||
|
|
||||||
|
```text
|
||||||
|
Subject: [RESOLVED] PM2 Process Isolation Failure
|
||||||
|
|
||||||
|
Status: RESOLVED
|
||||||
|
Time Resolved: YYYY-MM-DD HH:MM UTC
|
||||||
|
Total Downtime: X minutes
|
||||||
|
|
||||||
|
Summary:
|
||||||
|
All PM2 processes have been restored. Services are operating normally.
|
||||||
|
|
||||||
|
Root Cause:
|
||||||
|
[Brief description of what caused the incident]
|
||||||
|
|
||||||
|
Impact Summary:
|
||||||
|
- flyer-crawler.projectium.com: Down for X minutes
|
||||||
|
- stock-alert.projectium.com: Down for X minutes
|
||||||
|
- Estimated user impact: [description]
|
||||||
|
|
||||||
|
Immediate Actions Taken:
|
||||||
|
1. [Action]
|
||||||
|
2. [Action]
|
||||||
|
|
||||||
|
Follow-up Actions:
|
||||||
|
1. [ ] [Preventive measure] - Owner: [Name] - Due: [Date]
|
||||||
|
2. [ ] Post-incident review scheduled for [Date]
|
||||||
|
|
||||||
|
Post-Incident Review: [Link or scheduled time]
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prevention Measures
|
||||||
|
|
||||||
|
### Pre-Deployment Checklist
|
||||||
|
|
||||||
|
Before triggering any deployment:
|
||||||
|
|
||||||
|
- [ ] Review workflow file for PM2 commands
|
||||||
|
- [ ] Confirm no `pm2 stop all`, `pm2 delete all`, or `pm2 restart all`
|
||||||
|
- [ ] Verify process names are explicitly listed
|
||||||
|
- [ ] Check for concurrent deployment risks
|
||||||
|
- [ ] Confirm recent workflow changes were reviewed
|
||||||
|
|
||||||
|
### Workflow Review Checklist
|
||||||
|
|
||||||
|
When reviewing deployment workflow changes:
|
||||||
|
|
||||||
|
- [ ] All PM2 `stop` commands use explicit process names
|
||||||
|
- [ ] All PM2 `delete` commands filter by process name pattern
|
||||||
|
- [ ] All PM2 `restart` commands use explicit process names
|
||||||
|
- [ ] Test deployments filter by `-test` suffix
|
||||||
|
- [ ] Production deployments use whitelist array
|
||||||
|
|
||||||
|
**Safe Patterns**:
|
||||||
|
|
||||||
|
```javascript
|
||||||
|
// SAFE: Explicit process names (production)
|
||||||
|
const prodProcesses = [
|
||||||
|
'flyer-crawler-api',
|
||||||
|
'flyer-crawler-worker',
|
||||||
|
'flyer-crawler-analytics-worker',
|
||||||
|
];
|
||||||
|
list.forEach((p) => {
|
||||||
|
if (
|
||||||
|
(p.pm2_env.status === 'errored' || p.pm2_env.status === 'stopped') &&
|
||||||
|
prodProcesses.includes(p.name)
|
||||||
|
) {
|
||||||
|
exec('pm2 delete ' + p.pm2_env.pm_id);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// SAFE: Pattern-based filtering (test)
|
||||||
|
list.forEach((p) => {
|
||||||
|
if (p.name && p.name.endsWith('-test')) {
|
||||||
|
exec('pm2 delete ' + p.pm2_env.pm_id);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
```
|
||||||
|
|
||||||
|
**Dangerous Patterns** (NEVER USE):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# DANGEROUS - affects ALL applications
|
||||||
|
pm2 stop all
|
||||||
|
pm2 delete all
|
||||||
|
pm2 restart all
|
||||||
|
|
||||||
|
# DANGEROUS - no name filtering
|
||||||
|
pm2 delete $(pm2 jlist | jq -r '.[] | select(.pm2_env.status == "errored") | .pm_id')
|
||||||
|
```
|
||||||
|
|
||||||
|
### PM2 Configuration Validation
|
||||||
|
|
||||||
|
Before deploying PM2 config changes:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Test configuration locally
|
||||||
|
cd /var/www/flyer-crawler.projectium.com
|
||||||
|
node -e "console.log(JSON.stringify(require('./ecosystem.config.cjs'), null, 2))"
|
||||||
|
|
||||||
|
# Verify process names
|
||||||
|
node -e "require('./ecosystem.config.cjs').apps.forEach(a => console.log(a.name))"
|
||||||
|
|
||||||
|
# Expected output should match documented process names
|
||||||
|
```
|
||||||
|
|
||||||
|
### Deployment Monitoring
|
||||||
|
|
||||||
|
After every deployment:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Immediate verification
|
||||||
|
pm2 list
|
||||||
|
|
||||||
|
# Check no unexpected processes were affected
|
||||||
|
pm2 list | grep -v flyer-crawler
|
||||||
|
# Should still show other apps (e.g., stock-alert)
|
||||||
|
|
||||||
|
# Health check
|
||||||
|
curl -s https://flyer-crawler.projectium.com/api/health/ready | jq '.data.status'
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Contact Information
|
||||||
|
|
||||||
|
### On-Call Escalation
|
||||||
|
|
||||||
|
| Role | Contact | When to Escalate |
|
||||||
|
| ----------------- | -------------- | ----------------------------------- |
|
||||||
|
| Primary On-Call | [Name/Channel] | First responder |
|
||||||
|
| Secondary On-Call | [Name/Channel] | If primary unavailable after 10 min |
|
||||||
|
| Engineering Lead | [Name/Channel] | P1 incidents > 30 min |
|
||||||
|
| Product Owner | [Name/Channel] | User communication needed |
|
||||||
|
|
||||||
|
### External Dependencies
|
||||||
|
|
||||||
|
| Service | Support Channel | When to Contact |
|
||||||
|
| --------------- | --------------- | ----------------------- |
|
||||||
|
| Server Provider | [Contact info] | Hardware/network issues |
|
||||||
|
| DNS Provider | [Contact info] | DNS resolution failures |
|
||||||
|
| SSL Certificate | [Contact info] | Certificate issues |
|
||||||
|
|
||||||
|
### Communication Channels
|
||||||
|
|
||||||
|
| Channel | Purpose |
|
||||||
|
| -------------- | -------------------------- |
|
||||||
|
| `#incidents` | Real-time incident updates |
|
||||||
|
| `#deployments` | Deployment announcements |
|
||||||
|
| `#engineering` | Technical discussion |
|
||||||
|
| Email list | Formal notifications |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Post-Incident Review
|
||||||
|
|
||||||
|
### Incident Report Template
|
||||||
|
|
||||||
|
```markdown
|
||||||
|
# Incident Report: [Title]
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
| ------------------ | ----------------- |
|
||||||
|
| Date | YYYY-MM-DD |
|
||||||
|
| Duration | X hours Y minutes |
|
||||||
|
| Severity | P1/P2/P3 |
|
||||||
|
| Incident Commander | [Name] |
|
||||||
|
| Status | Resolved |
|
||||||
|
|
||||||
|
## Timeline
|
||||||
|
|
||||||
|
| Time (UTC) | Event |
|
||||||
|
| ---------- | ------------------- |
|
||||||
|
| HH:MM | [Event description] |
|
||||||
|
| HH:MM | [Event description] |
|
||||||
|
|
||||||
|
## Impact
|
||||||
|
|
||||||
|
- **Users affected**: [Number/description]
|
||||||
|
- **Revenue impact**: [If applicable]
|
||||||
|
- **SLA impact**: [If applicable]
|
||||||
|
|
||||||
|
## Root Cause
|
||||||
|
|
||||||
|
[Detailed technical explanation]
|
||||||
|
|
||||||
|
## Resolution
|
||||||
|
|
||||||
|
[What was done to resolve the incident]
|
||||||
|
|
||||||
|
## Contributing Factors
|
||||||
|
|
||||||
|
1. [Factor]
|
||||||
|
2. [Factor]
|
||||||
|
|
||||||
|
## Action Items
|
||||||
|
|
||||||
|
| Action | Owner | Due Date | Status |
|
||||||
|
| -------- | ------ | -------- | ------ |
|
||||||
|
| [Action] | [Name] | [Date] | [ ] |
|
||||||
|
|
||||||
|
## Lessons Learned
|
||||||
|
|
||||||
|
### What Went Well
|
||||||
|
|
||||||
|
- [Item]
|
||||||
|
|
||||||
|
### What Could Be Improved
|
||||||
|
|
||||||
|
- [Item]
|
||||||
|
|
||||||
|
## Appendix
|
||||||
|
|
||||||
|
- Link to monitoring data
|
||||||
|
- Link to relevant logs
|
||||||
|
- Link to workflow runs
|
||||||
|
```
|
||||||
|
|
||||||
|
### Lessons Learned Format
|
||||||
|
|
||||||
|
Use "5 Whys" technique:
|
||||||
|
|
||||||
|
```text
|
||||||
|
Problem: All PM2 processes were killed during deployment
|
||||||
|
|
||||||
|
Why 1: The deployment workflow ran `pm2 delete all`
|
||||||
|
Why 2: The workflow used an outdated version of the script
|
||||||
|
Why 3: Gitea runner cached the old workflow file
|
||||||
|
Why 4: No mechanism to verify workflow version before execution
|
||||||
|
Why 5: Workflow versioning and audit trail not implemented
|
||||||
|
|
||||||
|
Root Cause: Lack of workflow versioning and execution verification
|
||||||
|
|
||||||
|
Preventive Measure: Implement workflow hash logging and pre-execution verification
|
||||||
|
```
|
||||||
|
|
||||||
|
### Action Items Tracking
|
||||||
|
|
||||||
|
Create Gitea issues for each action item:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Example (GitHub CLI syntax shown; on Gitea, use the `tea` CLI or the Gitea REST API instead)
|
||||||
|
gh issue create --title "Implement PM2 state logging in deployment workflows" \
|
||||||
|
--body "Related to incident YYYY-MM-DD. Add pre-deployment PM2 state capture." \
|
||||||
|
--label "incident-follow-up,priority:high"
|
||||||
|
```
|
||||||
|
|
||||||
|
Track action items in a central location:
|
||||||
|
|
||||||
|
| Issue # | Action | Owner | Due | Status |
|
||||||
|
| ------- | -------------------------------- | ------ | ------ | ------ |
|
||||||
|
| #123 | Add PM2 state logging | [Name] | [Date] | Open |
|
||||||
|
| #124 | Implement workflow version hash | [Name] | [Date] | Open |
|
||||||
|
| #125 | Create deployment lock mechanism | [Name] | [Date] | Open |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Appendix: PM2 Command Reference
|
||||||
|
|
||||||
|
### Safe Commands
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Status and monitoring
|
||||||
|
pm2 list
|
||||||
|
pm2 show <process-name>
|
||||||
|
pm2 monit
|
||||||
|
pm2 logs <process-name>
|
||||||
|
|
||||||
|
# Restart specific processes
|
||||||
|
pm2 restart flyer-crawler-api
|
||||||
|
pm2 restart flyer-crawler-api flyer-crawler-worker flyer-crawler-analytics-worker
|
||||||
|
|
||||||
|
# Reload (zero-downtime, cluster mode only)
|
||||||
|
pm2 reload flyer-crawler-api
|
||||||
|
|
||||||
|
# Start from config
|
||||||
|
pm2 start ecosystem.config.cjs
|
||||||
|
pm2 start ecosystem.config.cjs --only flyer-crawler-api
|
||||||
|
```
|
||||||
|
|
||||||
|
### Dangerous Commands (Use With Caution)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# CAUTION: These affect ALL processes
|
||||||
|
pm2 stop all # Stops every PM2 process
|
||||||
|
pm2 restart all # Restarts every PM2 process
|
||||||
|
pm2 delete all # Removes every PM2 process
|
||||||
|
|
||||||
|
# CAUTION: Modifies saved process list
|
||||||
|
pm2 save # Overwrites saved process list
|
||||||
|
pm2 resurrect # Restores from saved list
|
||||||
|
|
||||||
|
# CAUTION: Affects PM2 daemon
|
||||||
|
pm2 kill # Kills PM2 daemon and all processes
|
||||||
|
pm2 update # Updates PM2 in place (may cause brief outage)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Revision History
|
||||||
|
|
||||||
|
| Date | Author | Change |
|
||||||
|
| ---------- | ---------------------- | ------------------------ |
|
||||||
|
| 2026-02-17 | Incident Response Team | Initial runbook creation |
|
||||||
222
tests/qa/test-pm2-safeguard-logic.js
Normal file
222
tests/qa/test-pm2-safeguard-logic.js
Normal file
@@ -0,0 +1,222 @@
|
|||||||
|
/**
|
||||||
|
* PM2 Safeguard Logic Validation Tests
|
||||||
|
*
|
||||||
|
* This script tests the safeguard logic implemented in deployment workflows
|
||||||
|
* to prevent accidental deletion of all PM2 processes.
|
||||||
|
*
|
||||||
|
* Run with: node tests/qa/test-pm2-safeguard-logic.js
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Simulate the safeguard logic from workflows
|
||||||
|
/**
 * Evaluate the process-count safeguard that the deployment workflows apply
 * before deleting PM2 processes.
 *
 * The cleanup is flagged for abort only when BOTH conditions hold:
 *   1. the number of targeted processes equals the total number of processes, and
 *   2. the total is strictly greater than `threshold`.
 * Hosts at or below the threshold are allowed to clean up everything, so a
 * small or freshly provisioned server is never blocked.
 *
 * @param {number} totalProcesses - Count of all processes in the PM2 list.
 * @param {number} targetProcesses - Count of processes selected for deletion.
 * @param {number} [threshold=3] - Total process count above which "delete
 *   everything" is treated as a likely filter bug.
 * @returns {{shouldAbort: boolean, totalProcesses: number, targetProcesses: number}}
 *   The abort decision together with the two counts it was derived from.
 */
function evaluateSafeguard(totalProcesses, targetProcesses, threshold = 3) {
  // SAFEGUARD 1: Process count validation
  // If we're about to delete ALL processes AND there are more than threshold processes,
  // this indicates a potential filter bug
  const shouldAbort = targetProcesses === totalProcesses && totalProcesses > threshold;
  return { shouldAbort, totalProcesses, targetProcesses };
}
|
||||||
|
|
||||||
|
// Test scenarios
|
||||||
|
// Each scenario describes one call to the safeguard:
//   totalProcs    - total number of PM2 processes on the host
//   targetProcs   - number of processes the cleanup filter selected
//   expectedAbort - whether the safeguard is expected to block the cleanup
const scenarios = [
  // Normal operations - should NOT abort
  {
    name: 'Normal production cleanup - 3 errored out of 15',
    totalProcs: 15,
    targetProcs: 3,
    expectedAbort: false,
    description: 'Production deployment cleans up only the 3 errored production processes',
  },
  {
    name: 'Normal test cleanup - 3 test processes out of 15',
    totalProcs: 15,
    targetProcs: 3,
    expectedAbort: false,
    description: 'Test deployment cleans up only the 3 test processes',
  },
  {
    name: 'Single process cleanup - 1 errored out of 10',
    totalProcs: 10,
    targetProcs: 1,
    expectedAbort: false,
    description: 'Only one process is errored and targeted for cleanup',
  },
  {
    name: 'No processes to clean - 0 out of 10',
    totalProcs: 10,
    targetProcs: 0,
    expectedAbort: false,
    description: 'No processes match the cleanup criteria',
  },
  {
    name: 'Fresh server - 3 out of 3 (at threshold)',
    totalProcs: 3,
    targetProcs: 3,
    expectedAbort: false,
    description: 'Server with only 3 processes (threshold boundary - should proceed)',
  },
  {
    name: 'Minimal server - 2 out of 2',
    totalProcs: 2,
    targetProcs: 2,
    expectedAbort: false,
    description: 'Server with only 2 processes (below threshold)',
  },
  {
    name: 'Empty PM2 state - 0 out of 0',
    totalProcs: 0,
    targetProcs: 0,
    expectedAbort: false,
    description: 'No PM2 processes at all (fresh install)',
  },

  // Dangerous operations - SHOULD abort
  {
    name: 'Filter bug - all 10 processes targeted',
    totalProcs: 10,
    targetProcs: 10,
    expectedAbort: true,
    description: 'DANGEROUS: Filter would delete ALL 10 processes - indicates bug',
  },
  {
    name: 'Filter bug - all 15 processes targeted',
    totalProcs: 15,
    targetProcs: 15,
    expectedAbort: true,
    description: 'DANGEROUS: Filter would delete ALL 15 processes - indicates bug',
  },
  {
    name: 'Filter bug - all 4 processes targeted',
    totalProcs: 4,
    targetProcs: 4,
    expectedAbort: true,
    description: 'DANGEROUS: Filter would delete ALL 4 processes (just above threshold)',
  },
  {
    name: 'Filter bug - all 100 processes targeted',
    totalProcs: 100,
    targetProcs: 100,
    expectedAbort: true,
    description: 'DANGEROUS: Filter would delete ALL 100 processes - extreme case',
  },
];
|
||||||
|
|
||||||
|
// Run tests
//
// Feeds every entry of `scenarios` through evaluateSafeguard(), compares the
// returned `shouldAbort` flag with the scenario's `expectedAbort`, and keeps a
// running tally in `passed` / `failed` (read again at the end of the script to
// choose the exit code).
[
  '========================================',
  'PM2 SAFEGUARD LOGIC VALIDATION',
  '========================================\n',
].forEach((bannerLine) => console.log(bannerLine));

let passed = 0;
let failed = 0;

for (const [position, testCase] of scenarios.entries()) {
  const { totalProcs, targetProcs, expectedAbort } = testCase;
  const { shouldAbort } = evaluateSafeguard(totalProcs, targetProcs);
  const matched = shouldAbort === expectedAbort;

  if (matched) {
    passed += 1;
  } else {
    failed += 1;
  }

  console.log(`[${matched ? 'PASS' : 'FAIL'}] Test ${position + 1}: ${testCase.name}`);
  console.log(` Total: ${totalProcs}, Target: ${targetProcs}`);
  console.log(` Expected abort: ${expectedAbort}, Got: ${shouldAbort}`);
  // The longer description is only printed for failures, to help diagnosis.
  if (!matched) {
    console.log(` Description: ${testCase.description}`);
  }
  console.log('');
}

[
  '========================================',
  `RESULTS: ${passed} passed, ${failed} failed`,
  '========================================',
].forEach((summaryLine) => console.log(summaryLine));

// Edge case tests for specific workflow patterns
[
  '\n========================================',
  'WORKFLOW-SPECIFIC FILTER TESTS',
  '========================================\n',
].forEach((bannerLine) => console.log(bannerLine));
|
||||||
|
|
||||||
|
/**
 * Simulate production workflow filter.
 *
 * Keeps only the entries whose `status` is 'errored' or 'stopped' AND whose
 * `name` is one of the three production process names listed below.
 *
 * @param {Array<{name?: string, status?: string}>} processList - Simplified
 *   process records (flat `name` / `status` fields).
 * @returns {Array<{name?: string, status?: string}>} New array holding the
 *   matching entries, in their original order.
 */
function simulateProdFilter(processList) {
  const productionNames = new Set([
    'flyer-crawler-api',
    'flyer-crawler-worker',
    'flyer-crawler-analytics-worker',
  ]);
  const removableStatuses = new Set(['errored', 'stopped']);

  // Status is checked first, then the name, mirroring the original condition order.
  const isCleanupCandidate = (proc) =>
    removableStatuses.has(proc.status) && productionNames.has(proc.name);

  return processList.filter(isCleanupCandidate);
}
|
||||||
|
|
||||||
|
/**
 * Simulate test workflow filter.
 *
 * Keeps only the entries that have a truthy `name` ending in '-test'.
 * Unlike the production filter, `status` is not consulted.
 *
 * @param {Array<{name?: string}>} processList - Simplified process records.
 * @returns {Array<{name?: string}>} New array holding the matching entries,
 *   in their original order.
 */
function simulateTestFilter(processList) {
  const TEST_SUFFIX = '-test';

  // A missing or empty name short-circuits before endsWith() is attempted.
  const hasTestSuffix = ({ name }) => name && name.endsWith(TEST_SUFFIX);

  return processList.filter(hasTestSuffix);
}
|
||||||
|
|
||||||
|
// Workflow-specific checks.
//
// These checks exercise simulateProdFilter / simulateTestFilter together with
// evaluateSafeguard on hand-written process lists. Their outcomes are tallied
// in `workflowChecksFailed` so that a failing check here also produces a
// non-zero exit code (previously only the scenario-table failures counted).
let workflowChecksFailed = 0;

/**
 * Records the outcome of one workflow-specific check.
 *
 * @param {boolean} ok - Whether the check passed.
 * @returns {string} 'PASS' or 'FAIL', ready to embed in a log line.
 */
function recordWorkflowCheck(ok) {
  if (!ok) {
    workflowChecksFailed += 1;
  }
  return ok ? 'PASS' : 'FAIL';
}

// Test case: Normal mixed environment
const mixedEnvProcesses = [
  { name: 'flyer-crawler-api', status: 'online' },
  { name: 'flyer-crawler-worker', status: 'errored' },
  { name: 'flyer-crawler-analytics-worker', status: 'online' },
  { name: 'flyer-crawler-api-test', status: 'online' },
  { name: 'flyer-crawler-worker-test', status: 'online' },
  { name: 'flyer-crawler-analytics-worker-test', status: 'online' },
  { name: 'stock-alert-api', status: 'online' },
  { name: 'stock-alert-worker', status: 'online' },
];

const prodFiltered = simulateProdFilter(mixedEnvProcesses);
const testFiltered = simulateTestFilter(mixedEnvProcesses);

console.log('Test: Mixed environment with production processes');
console.log(`Total processes: ${mixedEnvProcesses.length}`);
console.log(`Production filter matches: ${prodFiltered.length}`);
console.log(` Names: ${prodFiltered.map((p) => p.name).join(', ') || '(none)'}`);
console.log(`Test filter matches: ${testFiltered.length}`);
// Same '(none)' fallback as the production line, so an empty match is visible.
console.log(` Names: ${testFiltered.map((p) => p.name).join(', ') || '(none)'}`);

// Verify production filter does NOT match test or other apps
const prodFilterSafe = prodFiltered.every(
  (p) => !p.name.endsWith('-test') && p.name.startsWith('flyer-crawler-'),
);
console.log(
  `Production filter safe (no test/other apps): ${recordWorkflowCheck(prodFilterSafe)}`,
);

// Verify test filter does NOT match production or other apps
const testFilterSafe = testFiltered.every((p) => p.name.endsWith('-test'));
console.log(`Test filter safe (only -test suffix): ${recordWorkflowCheck(testFilterSafe)}`);

// Test case: All processes errored (dangerous scenario)
console.log('\nTest: All production processes errored (edge case)');
const allErroredProd = [
  { name: 'flyer-crawler-api', status: 'errored' },
  { name: 'flyer-crawler-worker', status: 'errored' },
  { name: 'flyer-crawler-analytics-worker', status: 'errored' },
  { name: 'flyer-crawler-api-test', status: 'online' },
  { name: 'flyer-crawler-worker-test', status: 'online' },
  { name: 'stock-alert-api', status: 'online' },
];

const allErroredFiltered = simulateProdFilter(allErroredProd);
const safeguardCheck = evaluateSafeguard(allErroredProd.length, allErroredFiltered.length);
console.log(`Total processes: ${allErroredProd.length}`);
console.log(`Production errored processes: ${allErroredFiltered.length}`);
console.log(`Safeguard would abort: ${safeguardCheck.shouldAbort}`);
console.log(`Expected: false (3 out of 6 is not ALL processes)`);
console.log(`Result: ${recordWorkflowCheck(safeguardCheck.shouldAbort === false)}`);

// Test case: Bug simulation - filter returns everything
console.log('\nTest: Bug simulation - filter returns all processes');
const buggyFilterResult = mixedEnvProcesses; // Simulating a bug where filter returns everything
const buggySafeguardCheck = evaluateSafeguard(mixedEnvProcesses.length, buggyFilterResult.length);
console.log(`Total processes: ${mixedEnvProcesses.length}`);
console.log(`Buggy filter matches: ${buggyFilterResult.length}`);
console.log(`Safeguard would abort: ${buggySafeguardCheck.shouldAbort}`);
console.log(`Expected: true (prevents all-process deletion)`);
console.log(`Result: ${recordWorkflowCheck(buggySafeguardCheck.shouldAbort === true)}`);

console.log('\n========================================');
console.log(`WORKFLOW CHECKS FAILED: ${workflowChecksFailed}`);
console.log('ALL TESTS COMPLETE');
console.log('========================================');

// Exit with appropriate code: non-zero if either the scenario table or any
// workflow-specific check reported a failure.
process.exit(failed > 0 || workflowChecksFailed > 0 ? 1 : 0);
|
||||||
Reference in New Issue
Block a user