Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8140cf6b4a | ||
|
|
1af5d8ff5a | ||
|
|
d21496e8b2 | ||
| dfeb2f1c3d | |||
|
|
209d7ceba1 | ||
|
|
645c1784b7 | ||
|
|
e02716c092 | ||
| 66e6d2fdbc | |||
| 82a38b4e2a | |||
|
|
f6f4415aeb | ||
|
|
0c23aa4c5e | ||
| 07125fc99d | |||
| 626aa80799 |
@@ -163,8 +163,8 @@ jobs:
|
||||
# ========================================
|
||||
echo ""
|
||||
echo "--- Stopping PM2 Processes ---"
|
||||
pm2 stop flyer-crawler-api flyer-crawler-worker flyer-crawler-analytics-worker || echo "No production processes to stop"
|
||||
pm2 list
|
||||
pm2 stop flyer-crawler-api flyer-crawler-worker flyer-crawler-analytics-worker --namespace flyer-crawler-prod || echo "No production processes to stop"
|
||||
pm2 list --namespace flyer-crawler-prod
|
||||
|
||||
# ========================================
|
||||
# LAYER 3: SAFE RSYNC WITH COMPREHENSIVE EXCLUDES
|
||||
@@ -253,7 +253,7 @@ jobs:
|
||||
|
||||
# === PRE-CLEANUP PM2 STATE LOGGING ===
|
||||
echo "=== PRE-CLEANUP PM2 STATE ==="
|
||||
pm2 jlist
|
||||
pm2 jlist --namespace flyer-crawler-prod
|
||||
echo "=== END PRE-CLEANUP STATE ==="
|
||||
|
||||
# --- Cleanup Errored Processes with Defense-in-Depth Safeguards ---
|
||||
@@ -261,7 +261,7 @@ jobs:
|
||||
node -e "
|
||||
const exec = require('child_process').execSync;
|
||||
try {
|
||||
const list = JSON.parse(exec('pm2 jlist').toString());
|
||||
const list = JSON.parse(exec('pm2 jlist --namespace flyer-crawler-prod').toString());
|
||||
const prodProcesses = ['flyer-crawler-api', 'flyer-crawler-worker', 'flyer-crawler-analytics-worker'];
|
||||
|
||||
// Filter for processes that match our criteria
|
||||
@@ -289,7 +289,7 @@ jobs:
|
||||
targetProcesses.forEach(p => {
|
||||
console.log('Deleting ' + p.pm2_env.status + ' production process: ' + p.name + ' (' + p.pm2_env.pm_id + ')');
|
||||
try {
|
||||
exec('pm2 delete ' + p.pm2_env.pm_id);
|
||||
exec('pm2 delete ' + p.pm2_env.pm_id + ' --namespace flyer-crawler-prod');
|
||||
} catch(e) {
|
||||
console.error('Failed to delete ' + p.pm2_env.pm_id);
|
||||
}
|
||||
@@ -303,11 +303,11 @@ jobs:
|
||||
|
||||
# Save PM2 process list after cleanup to persist deletions
|
||||
echo "Saving PM2 process list after cleanup..."
|
||||
pm2 save
|
||||
pm2 save --namespace flyer-crawler-prod
|
||||
|
||||
# === POST-CLEANUP VERIFICATION ===
|
||||
echo "=== POST-CLEANUP VERIFICATION ==="
|
||||
pm2 jlist | node -e "
|
||||
pm2 jlist --namespace flyer-crawler-prod | node -e "
|
||||
try {
|
||||
const list = JSON.parse(require('fs').readFileSync(0, 'utf-8'));
|
||||
const prodProcesses = list.filter(p => p.name && p.name.startsWith('flyer-crawler-') && !p.name.endsWith('-test') && !p.name.endsWith('-dev'));
|
||||
@@ -331,7 +331,7 @@ jobs:
|
||||
|
||||
# Get the running version from PM2 for the main API process
|
||||
# We use a small node script to parse the JSON output from pm2 jlist
|
||||
RUNNING_VERSION=$(pm2 jlist | node -e "try { const list = JSON.parse(require('fs').readFileSync(0, 'utf-8')); const app = list.find(p => p.name === 'flyer-crawler-api'); console.log(app ? app.pm2_env.version : ''); } catch(e) { console.log(''); }")
|
||||
RUNNING_VERSION=$(pm2 jlist --namespace flyer-crawler-prod | node -e "try { const list = JSON.parse(require('fs').readFileSync(0, 'utf-8')); const app = list.find(p => p.name === 'flyer-crawler-api'); console.log(app ? app.pm2_env.version : ''); } catch(e) { console.log(''); }")
|
||||
echo "Running PM2 Version: $RUNNING_VERSION"
|
||||
|
||||
if [ "${{ gitea.event.inputs.force_reload }}" == "true" ] || [ "$NEW_VERSION" != "$RUNNING_VERSION" ] || [ -z "$RUNNING_VERSION" ]; then
|
||||
@@ -340,7 +340,7 @@ jobs:
|
||||
else
|
||||
echo "Version mismatch (Running: $RUNNING_VERSION -> Deployed: $NEW_VERSION) or app not running. Reloading PM2..."
|
||||
fi
|
||||
pm2 startOrReload ecosystem.config.cjs --update-env && pm2 save
|
||||
pm2 startOrReload ecosystem.config.cjs --update-env --namespace flyer-crawler-prod && pm2 save --namespace flyer-crawler-prod
|
||||
echo "Production backend server reloaded successfully."
|
||||
else
|
||||
echo "Version $NEW_VERSION is already running. Skipping PM2 reload."
|
||||
@@ -370,14 +370,14 @@ jobs:
|
||||
sleep 5 # Wait a few seconds for the app to start and log its output.
|
||||
|
||||
# Resolve the PM2 ID dynamically to ensure we target the correct process
|
||||
PM2_ID=$(pm2 jlist | node -e "try { const list = JSON.parse(require('fs').readFileSync(0, 'utf-8')); const app = list.find(p => p.name === 'flyer-crawler-api'); console.log(app ? app.pm2_env.pm_id : ''); } catch(e) { console.log(''); }")
|
||||
PM2_ID=$(pm2 jlist --namespace flyer-crawler-prod | node -e "try { const list = JSON.parse(require('fs').readFileSync(0, 'utf-8')); const app = list.find(p => p.name === 'flyer-crawler-api'); console.log(app ? app.pm2_env.pm_id : ''); } catch(e) { console.log(''); }")
|
||||
|
||||
if [ -n "$PM2_ID" ]; then
|
||||
echo "Found process ID: $PM2_ID"
|
||||
pm2 describe "$PM2_ID" || echo "Failed to describe process $PM2_ID"
|
||||
pm2 logs "$PM2_ID" --lines 20 --nostream || echo "Failed to get logs for $PM2_ID"
|
||||
pm2 env "$PM2_ID" || echo "Failed to get env for $PM2_ID"
|
||||
pm2 describe "$PM2_ID" --namespace flyer-crawler-prod || echo "Failed to describe process $PM2_ID"
|
||||
pm2 logs "$PM2_ID" --lines 20 --nostream --namespace flyer-crawler-prod || echo "Failed to get logs for $PM2_ID"
|
||||
pm2 env "$PM2_ID" --namespace flyer-crawler-prod || echo "Failed to get env for $PM2_ID"
|
||||
else
|
||||
echo "Could not find process 'flyer-crawler-api' in pm2 list."
|
||||
pm2 list # Fallback to listing everything to help debug
|
||||
pm2 list --namespace flyer-crawler-prod # Fallback to listing everything to help debug
|
||||
fi
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -57,8 +57,8 @@ jobs:
|
||||
- name: Step 1 - Stop Application Server
|
||||
run: |
|
||||
echo "Stopping PRODUCTION PM2 processes to release database connections..."
|
||||
pm2 stop flyer-crawler-api flyer-crawler-worker flyer-crawler-analytics-worker || echo "Production PM2 processes were not running."
|
||||
pm2 save
|
||||
pm2 stop flyer-crawler-api flyer-crawler-worker flyer-crawler-analytics-worker --namespace flyer-crawler-prod || echo "Production PM2 processes were not running."
|
||||
pm2 save --namespace flyer-crawler-prod
|
||||
echo "✅ Production application server stopped and saved."
|
||||
|
||||
- name: Step 2 - Drop and Recreate Database
|
||||
@@ -92,5 +92,5 @@ jobs:
|
||||
run: |
|
||||
echo "Restarting application server..."
|
||||
cd /var/www/flyer-crawler.projectium.com
|
||||
pm2 startOrReload ecosystem.config.cjs --env production && pm2 save
|
||||
pm2 startOrReload ecosystem.config.cjs --env production --namespace flyer-crawler-prod && pm2 save --namespace flyer-crawler-prod
|
||||
echo "✅ Application server restarted."
|
||||
|
||||
@@ -157,7 +157,7 @@ jobs:
|
||||
|
||||
# === PRE-CLEANUP PM2 STATE LOGGING ===
|
||||
echo "=== PRE-CLEANUP PM2 STATE ==="
|
||||
pm2 jlist
|
||||
pm2 jlist --namespace flyer-crawler-prod
|
||||
echo "=== END PRE-CLEANUP STATE ==="
|
||||
|
||||
# --- Cleanup Errored Processes with Defense-in-Depth Safeguards ---
|
||||
@@ -165,7 +165,7 @@ jobs:
|
||||
node -e "
|
||||
const exec = require('child_process').execSync;
|
||||
try {
|
||||
const list = JSON.parse(exec('pm2 jlist').toString());
|
||||
const list = JSON.parse(exec('pm2 jlist --namespace flyer-crawler-prod').toString());
|
||||
const prodProcesses = ['flyer-crawler-api', 'flyer-crawler-worker', 'flyer-crawler-analytics-worker'];
|
||||
|
||||
// Filter for processes that match our criteria
|
||||
@@ -193,7 +193,7 @@ jobs:
|
||||
targetProcesses.forEach(p => {
|
||||
console.log('Deleting ' + p.pm2_env.status + ' production process: ' + p.name + ' (' + p.pm2_env.pm_id + ')');
|
||||
try {
|
||||
exec('pm2 delete ' + p.pm2_env.pm_id);
|
||||
exec('pm2 delete ' + p.pm2_env.pm_id + ' --namespace flyer-crawler-prod');
|
||||
} catch(e) {
|
||||
console.error('Failed to delete ' + p.pm2_env.pm_id);
|
||||
}
|
||||
@@ -207,11 +207,11 @@ jobs:
|
||||
|
||||
# Save PM2 process list after cleanup to persist deletions
|
||||
echo "Saving PM2 process list after cleanup..."
|
||||
pm2 save
|
||||
pm2 save --namespace flyer-crawler-prod
|
||||
|
||||
# === POST-CLEANUP VERIFICATION ===
|
||||
echo "=== POST-CLEANUP VERIFICATION ==="
|
||||
pm2 jlist | node -e "
|
||||
pm2 jlist --namespace flyer-crawler-prod | node -e "
|
||||
try {
|
||||
const list = JSON.parse(require('fs').readFileSync(0, 'utf-8'));
|
||||
const prodProcesses = list.filter(p => p.name && p.name.startsWith('flyer-crawler-') && !p.name.endsWith('-test') && !p.name.endsWith('-dev'));
|
||||
@@ -235,7 +235,7 @@ jobs:
|
||||
|
||||
# Get the running version from PM2 for the main API process
|
||||
# We use a small node script to parse the JSON output from pm2 jlist
|
||||
RUNNING_VERSION=$(pm2 jlist | node -e "try { const list = JSON.parse(require('fs').readFileSync(0, 'utf-8')); const app = list.find(p => p.name === 'flyer-crawler-api'); console.log(app ? app.pm2_env.version : ''); } catch(e) { console.log(''); }")
|
||||
RUNNING_VERSION=$(pm2 jlist --namespace flyer-crawler-prod | node -e "try { const list = JSON.parse(require('fs').readFileSync(0, 'utf-8')); const app = list.find(p => p.name === 'flyer-crawler-api'); console.log(app ? app.pm2_env.version : ''); } catch(e) { console.log(''); }")
|
||||
echo "Running PM2 Version: $RUNNING_VERSION"
|
||||
|
||||
if [ "${{ gitea.event.inputs.force_reload }}" == "true" ] || [ "$NEW_VERSION" != "$RUNNING_VERSION" ] || [ -z "$RUNNING_VERSION" ]; then
|
||||
@@ -244,7 +244,7 @@ jobs:
|
||||
else
|
||||
echo "Version mismatch (Running: $RUNNING_VERSION -> Deployed: $NEW_VERSION) or app not running. Reloading PM2..."
|
||||
fi
|
||||
pm2 startOrReload ecosystem.config.cjs --env production --update-env && pm2 save
|
||||
pm2 startOrReload ecosystem.config.cjs --env production --update-env --namespace flyer-crawler-prod && pm2 save --namespace flyer-crawler-prod
|
||||
echo "Production backend server reloaded successfully."
|
||||
else
|
||||
echo "Version $NEW_VERSION is already running. Skipping PM2 reload."
|
||||
@@ -267,6 +267,6 @@ jobs:
|
||||
run: |
|
||||
echo "--- Displaying recent PM2 logs for flyer-crawler-api ---"
|
||||
sleep 5
|
||||
pm2 describe flyer-crawler-api || echo "Could not find production pm2 process."
|
||||
pm2 logs flyer-crawler-api --lines 20 --nostream || echo "Could not find production pm2 process."
|
||||
pm2 env flyer-crawler-api || echo "Could not find production pm2 process."
|
||||
pm2 describe flyer-crawler-api --namespace flyer-crawler-prod || echo "Could not find production pm2 process."
|
||||
pm2 logs flyer-crawler-api --lines 20 --nostream --namespace flyer-crawler-prod || echo "Could not find production pm2 process."
|
||||
pm2 env flyer-crawler-api --namespace flyer-crawler-prod || echo "Could not find production pm2 process."
|
||||
@@ -26,12 +26,25 @@ jobs:
|
||||
echo "PM2 CURRENT STATE SNAPSHOT"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
echo "=== Production Namespace (flyer-crawler-prod) ==="
|
||||
echo "--- PM2 List (Human Readable) ---"
|
||||
pm2 list
|
||||
pm2 list --namespace flyer-crawler-prod
|
||||
echo ""
|
||||
echo "--- PM2 List (JSON) ---"
|
||||
pm2 jlist > /tmp/pm2-state-initial.json
|
||||
cat /tmp/pm2-state-initial.json | jq '.'
|
||||
pm2 jlist --namespace flyer-crawler-prod > /tmp/pm2-state-initial-prod.json
|
||||
cat /tmp/pm2-state-initial-prod.json | jq '.'
|
||||
echo ""
|
||||
echo "=== Test Namespace (flyer-crawler-test) ==="
|
||||
echo "--- PM2 List (Human Readable) ---"
|
||||
pm2 list --namespace flyer-crawler-test
|
||||
echo ""
|
||||
echo "--- PM2 List (JSON) ---"
|
||||
pm2 jlist --namespace flyer-crawler-test > /tmp/pm2-state-initial-test.json
|
||||
cat /tmp/pm2-state-initial-test.json | jq '.'
|
||||
echo ""
|
||||
echo "=== All Namespaces Combined ==="
|
||||
echo "--- PM2 List (All) ---"
|
||||
pm2 list
|
||||
echo ""
|
||||
echo "--- PM2 Daemon Info ---"
|
||||
pm2 info pm2-logrotate || echo "pm2-logrotate not found"
|
||||
@@ -47,14 +60,32 @@ jobs:
|
||||
echo "========================================="
|
||||
echo "PROCESS WORKING DIRECTORIES"
|
||||
echo "========================================="
|
||||
pm2 jlist | jq -r '.[] | "Process: \(.name) | CWD: \(.pm2_env.pm_cwd) | Exists: \(if .pm2_env.pm_cwd then "checking..." else "N/A" end)"'
|
||||
echo ""
|
||||
echo "=== Production Namespace (flyer-crawler-prod) ==="
|
||||
pm2 jlist --namespace flyer-crawler-prod | jq -r '.[] | "Process: \(.name) | CWD: \(.pm2_env.pm_cwd) | Exists: \(if .pm2_env.pm_cwd then "checking..." else "N/A" end)"'
|
||||
echo ""
|
||||
echo "--- Checking if CWDs still exist ---"
|
||||
pm2 jlist | jq -r '.[].pm2_env.pm_cwd' | while read cwd; do
|
||||
if [ -d "$cwd" ]; then
|
||||
echo "✅ EXISTS: $cwd"
|
||||
else
|
||||
echo "❌ MISSING: $cwd (THIS WILL CAUSE CRASHES!)"
|
||||
pm2 jlist --namespace flyer-crawler-prod | jq -r '.[].pm2_env.pm_cwd' | while read cwd; do
|
||||
if [ -n "$cwd" ] && [ "$cwd" != "null" ]; then
|
||||
if [ -d "$cwd" ]; then
|
||||
echo "✅ EXISTS: $cwd"
|
||||
else
|
||||
echo "❌ MISSING: $cwd (THIS WILL CAUSE CRASHES!)"
|
||||
fi
|
||||
fi
|
||||
done
|
||||
echo ""
|
||||
echo "=== Test Namespace (flyer-crawler-test) ==="
|
||||
pm2 jlist --namespace flyer-crawler-test | jq -r '.[] | "Process: \(.name) | CWD: \(.pm2_env.pm_cwd) | Exists: \(if .pm2_env.pm_cwd then "checking..." else "N/A" end)"'
|
||||
echo ""
|
||||
echo "--- Checking if CWDs still exist ---"
|
||||
pm2 jlist --namespace flyer-crawler-test | jq -r '.[].pm2_env.pm_cwd' | while read cwd; do
|
||||
if [ -n "$cwd" ] && [ "$cwd" != "null" ]; then
|
||||
if [ -d "$cwd" ]; then
|
||||
echo "✅ EXISTS: $cwd"
|
||||
else
|
||||
echo "❌ MISSING: $cwd (THIS WILL CAUSE CRASHES!)"
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
@@ -78,10 +109,21 @@ jobs:
|
||||
echo "========================================="
|
||||
echo "ALL PM2-MANAGED PROJECTS"
|
||||
echo "========================================="
|
||||
pm2 jlist | jq -r '.[] | "[\(.pm_id)] \(.name) - v\(.pm2_env.version // "N/A") - \(.pm2_env.status) - CWD: \(.pm2_env.pm_cwd)"'
|
||||
echo ""
|
||||
echo "=== Production Namespace (flyer-crawler-prod) ==="
|
||||
pm2 jlist --namespace flyer-crawler-prod | jq -r '.[] | "[\(.pm_id)] \(.name) - v\(.pm2_env.version // "N/A") - \(.pm2_env.status) - CWD: \(.pm2_env.pm_cwd)"'
|
||||
echo ""
|
||||
echo "--- Projects by CWD ---"
|
||||
pm2 jlist | jq -r '.[].pm2_env.pm_cwd' | sort -u
|
||||
pm2 jlist --namespace flyer-crawler-prod | jq -r '.[].pm2_env.pm_cwd' | sort -u
|
||||
echo ""
|
||||
echo "=== Test Namespace (flyer-crawler-test) ==="
|
||||
pm2 jlist --namespace flyer-crawler-test | jq -r '.[] | "[\(.pm_id)] \(.name) - v\(.pm2_env.version // "N/A") - \(.pm2_env.status) - CWD: \(.pm2_env.pm_cwd)"'
|
||||
echo ""
|
||||
echo "--- Projects by CWD ---"
|
||||
pm2 jlist --namespace flyer-crawler-test | jq -r '.[].pm2_env.pm_cwd' | sort -u
|
||||
echo ""
|
||||
echo "=== All Namespaces (for reference) ==="
|
||||
pm2 jlist | jq -r '.[] | "[\(.pm_id)] \(.name) [ns: \(.pm2_env.namespace // "default")] - \(.pm2_env.status)"'
|
||||
echo ""
|
||||
echo "--- Checking which projects might interfere ---"
|
||||
for dir in /var/www/*; do
|
||||
@@ -107,15 +149,29 @@ jobs:
|
||||
|
||||
for i in $(seq 1 $COUNT); do
|
||||
echo "--- Capture $i at $(date) ---"
|
||||
pm2 jlist | jq -r '.[] | "\(.name): \(.pm2_env.status) (restarts: \(.pm2_env.restart_time))"'
|
||||
echo ""
|
||||
echo "=== Production Namespace (flyer-crawler-prod) ==="
|
||||
pm2 jlist --namespace flyer-crawler-prod | jq -r '.[] | "\(.name): \(.pm2_env.status) (restarts: \(.pm2_env.restart_time))"'
|
||||
|
||||
# Check for new crashes
|
||||
CRASHED=$(pm2 jlist | jq '[.[] | select(.pm2_env.status == "errored" or .pm2_env.status == "stopped")] | length')
|
||||
if [ "$CRASHED" -gt 0 ]; then
|
||||
echo "⚠️ WARNING: $CRASHED process(es) in crashed state!"
|
||||
pm2 jlist | jq -r '.[] | select(.pm2_env.status == "errored" or .pm2_env.status == "stopped") | " - \(.name): \(.pm2_env.status)"'
|
||||
# Check for crashes in production
|
||||
CRASHED_PROD=$(pm2 jlist --namespace flyer-crawler-prod | jq '[.[] | select(.pm2_env.status == "errored" or .pm2_env.status == "stopped")] | length')
|
||||
if [ "$CRASHED_PROD" -gt 0 ]; then
|
||||
echo "⚠️ WARNING: $CRASHED_PROD PRODUCTION process(es) in crashed state!"
|
||||
pm2 jlist --namespace flyer-crawler-prod | jq -r '.[] | select(.pm2_env.status == "errored" or .pm2_env.status == "stopped") | " - \(.name): \(.pm2_env.status)"'
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "=== Test Namespace (flyer-crawler-test) ==="
|
||||
pm2 jlist --namespace flyer-crawler-test | jq -r '.[] | "\(.name): \(.pm2_env.status) (restarts: \(.pm2_env.restart_time))"'
|
||||
|
||||
# Check for crashes in test
|
||||
CRASHED_TEST=$(pm2 jlist --namespace flyer-crawler-test | jq '[.[] | select(.pm2_env.status == "errored" or .pm2_env.status == "stopped")] | length')
|
||||
if [ "$CRASHED_TEST" -gt 0 ]; then
|
||||
echo "⚠️ WARNING: $CRASHED_TEST TEST process(es) in crashed state!"
|
||||
pm2 jlist --namespace flyer-crawler-test | jq -r '.[] | select(.pm2_env.status == "errored" or .pm2_env.status == "stopped") | " - \(.name): \(.pm2_env.status)"'
|
||||
fi
|
||||
|
||||
echo ""
|
||||
sleep $INTERVAL
|
||||
done
|
||||
|
||||
@@ -165,19 +221,32 @@ jobs:
|
||||
echo "DIAGNOSTIC SUMMARY"
|
||||
echo "========================================="
|
||||
echo ""
|
||||
echo "Total PM2 processes: $(pm2 jlist | jq 'length')"
|
||||
echo "Online: $(pm2 jlist | jq '[.[] | select(.pm2_env.status == "online")] | length')"
|
||||
echo "Stopped: $(pm2 jlist | jq '[.[] | select(.pm2_env.status == "stopped")] | length')"
|
||||
echo "Errored: $(pm2 jlist | jq '[.[] | select(.pm2_env.status == "errored")] | length')"
|
||||
echo "=== Production Namespace (flyer-crawler-prod) ==="
|
||||
echo "Total processes: $(pm2 jlist --namespace flyer-crawler-prod | jq 'length')"
|
||||
echo "Online: $(pm2 jlist --namespace flyer-crawler-prod | jq '[.[] | select(.pm2_env.status == "online")] | length')"
|
||||
echo "Stopped: $(pm2 jlist --namespace flyer-crawler-prod | jq '[.[] | select(.pm2_env.status == "stopped")] | length')"
|
||||
echo "Errored: $(pm2 jlist --namespace flyer-crawler-prod | jq '[.[] | select(.pm2_env.status == "errored")] | length')"
|
||||
echo ""
|
||||
echo "Flyer-crawler processes:"
|
||||
pm2 jlist | jq -r '.[] | select(.name | contains("flyer-crawler")) | " \(.name): \(.pm2_env.status)"'
|
||||
echo "Flyer-crawler PROD processes:"
|
||||
pm2 jlist --namespace flyer-crawler-prod | jq -r '.[] | select(.name | contains("flyer-crawler")) | " \(.name): \(.pm2_env.status)"'
|
||||
echo ""
|
||||
echo "Stock-alert processes:"
|
||||
pm2 jlist | jq -r '.[] | select(.name | contains("stock-alert")) | " \(.name): \(.pm2_env.status)"'
|
||||
echo "=== Test Namespace (flyer-crawler-test) ==="
|
||||
echo "Total processes: $(pm2 jlist --namespace flyer-crawler-test | jq 'length')"
|
||||
echo "Online: $(pm2 jlist --namespace flyer-crawler-test | jq '[.[] | select(.pm2_env.status == "online")] | length')"
|
||||
echo "Stopped: $(pm2 jlist --namespace flyer-crawler-test | jq '[.[] | select(.pm2_env.status == "stopped")] | length')"
|
||||
echo "Errored: $(pm2 jlist --namespace flyer-crawler-test | jq '[.[] | select(.pm2_env.status == "errored")] | length')"
|
||||
echo ""
|
||||
echo "Flyer-crawler TEST processes:"
|
||||
pm2 jlist --namespace flyer-crawler-test | jq -r '.[] | select(.name | contains("flyer-crawler")) | " \(.name): \(.pm2_env.status)"'
|
||||
echo ""
|
||||
echo "=== All Namespaces Summary ==="
|
||||
echo "Total PM2 processes (all): $(pm2 jlist | jq 'length')"
|
||||
echo ""
|
||||
echo "Stock-alert processes (separate project):"
|
||||
pm2 jlist | jq -r '.[] | select(.name | contains("stock-alert")) | " \(.name): \(.pm2_env.status) [ns: \(.pm2_env.namespace // "default")]"'
|
||||
echo ""
|
||||
echo "Other processes:"
|
||||
pm2 jlist | jq -r '.[] | select(.name | contains("flyer-crawler") | not) | select(.name | contains("stock-alert") | not) | " \(.name): \(.pm2_env.status)"'
|
||||
pm2 jlist | jq -r '.[] | select(.name | contains("flyer-crawler") | not) | select(.name | contains("stock-alert") | not) | " \(.name): \(.pm2_env.status) [ns: \(.pm2_env.namespace // "default")]"'
|
||||
echo ""
|
||||
echo "========================================="
|
||||
echo "RECOMMENDATIONS"
|
||||
@@ -185,4 +254,5 @@ jobs:
|
||||
echo "1. Check for missing CWDs (marked with ❌ above)"
|
||||
echo "2. Review PM2 daemon log for ENOENT errors"
|
||||
echo "3. Verify no deployments are running rsync --delete while processes are online"
|
||||
echo "4. Consider separating PM2 daemons by user or using PM2 namespaces"
|
||||
echo "4. Use namespace-specific commands: pm2 list --namespace flyer-crawler-prod"
|
||||
echo "5. Avoid pm2 restart all - use namespace targeting instead"
|
||||
|
||||
@@ -33,22 +33,22 @@ jobs:
|
||||
cd /var/www/flyer-crawler-test.projectium.com
|
||||
|
||||
echo "--- Current PM2 State (Before Restart) ---"
|
||||
pm2 list
|
||||
pm2 list --namespace flyer-crawler-test
|
||||
|
||||
echo "--- Restarting Test Processes ---"
|
||||
pm2 restart flyer-crawler-api-test flyer-crawler-worker-test flyer-crawler-analytics-worker-test || {
|
||||
pm2 restart flyer-crawler-api-test flyer-crawler-worker-test flyer-crawler-analytics-worker-test --namespace flyer-crawler-test || {
|
||||
echo "Restart failed, attempting to start processes..."
|
||||
pm2 start ecosystem-test.config.cjs
|
||||
pm2 start ecosystem-test.config.cjs --namespace flyer-crawler-test
|
||||
}
|
||||
|
||||
echo "--- Saving PM2 Process List ---"
|
||||
pm2 save
|
||||
pm2 save --namespace flyer-crawler-test
|
||||
|
||||
echo "--- Waiting 3 seconds for processes to stabilize ---"
|
||||
sleep 3
|
||||
|
||||
echo "=== TEST ENVIRONMENT STATUS ==="
|
||||
pm2 ps
|
||||
pm2 ps --namespace flyer-crawler-test
|
||||
|
||||
- name: Restart Production Environment
|
||||
if: gitea.event.inputs.environment == 'production' || gitea.event.inputs.environment == 'both'
|
||||
@@ -57,30 +57,51 @@ jobs:
|
||||
cd /var/www/flyer-crawler.projectium.com
|
||||
|
||||
echo "--- Current PM2 State (Before Restart) ---"
|
||||
pm2 list
|
||||
pm2 list --namespace flyer-crawler-prod
|
||||
|
||||
echo "--- Restarting Production Processes ---"
|
||||
pm2 restart flyer-crawler-api flyer-crawler-worker flyer-crawler-analytics-worker || {
|
||||
pm2 restart flyer-crawler-api flyer-crawler-worker flyer-crawler-analytics-worker --namespace flyer-crawler-prod || {
|
||||
echo "Restart failed, attempting to start processes..."
|
||||
pm2 start ecosystem.config.cjs
|
||||
pm2 start ecosystem.config.cjs --namespace flyer-crawler-prod
|
||||
}
|
||||
|
||||
echo "--- Saving PM2 Process List ---"
|
||||
pm2 save
|
||||
pm2 save --namespace flyer-crawler-prod
|
||||
|
||||
echo "--- Waiting 3 seconds for processes to stabilize ---"
|
||||
sleep 3
|
||||
|
||||
echo "=== PRODUCTION ENVIRONMENT STATUS ==="
|
||||
pm2 ps
|
||||
pm2 ps --namespace flyer-crawler-prod
|
||||
|
||||
- name: Final PM2 Status (All Processes)
|
||||
run: |
|
||||
echo "========================================="
|
||||
echo "FINAL PM2 STATUS - ALL PROCESSES"
|
||||
echo "========================================="
|
||||
pm2 ps
|
||||
|
||||
echo ""
|
||||
echo "--- PM2 Logs (Last 20 Lines) ---"
|
||||
pm2 logs --lines 20 --nostream || echo "No logs available"
|
||||
if [ "${{ gitea.event.inputs.environment }}" = "test" ]; then
|
||||
echo "--- Test Namespace ---"
|
||||
pm2 ps --namespace flyer-crawler-test
|
||||
echo ""
|
||||
echo "--- PM2 Logs (Last 20 Lines) ---"
|
||||
pm2 logs --namespace flyer-crawler-test --lines 20 --nostream || echo "No logs available"
|
||||
elif [ "${{ gitea.event.inputs.environment }}" = "production" ]; then
|
||||
echo "--- Production Namespace ---"
|
||||
pm2 ps --namespace flyer-crawler-prod
|
||||
echo ""
|
||||
echo "--- PM2 Logs (Last 20 Lines) ---"
|
||||
pm2 logs --namespace flyer-crawler-prod --lines 20 --nostream || echo "No logs available"
|
||||
else
|
||||
echo "--- Test Namespace ---"
|
||||
pm2 ps --namespace flyer-crawler-test
|
||||
echo ""
|
||||
echo "--- Production Namespace ---"
|
||||
pm2 ps --namespace flyer-crawler-prod
|
||||
echo ""
|
||||
echo "--- PM2 Logs - Test (Last 10 Lines) ---"
|
||||
pm2 logs --namespace flyer-crawler-test --lines 10 --nostream || echo "No logs available"
|
||||
echo ""
|
||||
echo "--- PM2 Logs - Production (Last 10 Lines) ---"
|
||||
pm2 logs --namespace flyer-crawler-prod --lines 10 --nostream || echo "No logs available"
|
||||
fi
|
||||
|
||||
@@ -66,19 +66,19 @@ jobs:
|
||||
echo "Restarting test PM2 processes to refresh version metadata..."
|
||||
|
||||
# Restart with --update-env to pick up new package.json version
|
||||
pm2 restart flyer-crawler-api-test flyer-crawler-worker-test flyer-crawler-analytics-worker-test --update-env && pm2 save
|
||||
pm2 --namespace flyer-crawler-test restart flyer-crawler-api-test flyer-crawler-worker-test flyer-crawler-analytics-worker-test --update-env && pm2 --namespace flyer-crawler-test save
|
||||
|
||||
echo "✅ Test PM2 processes restarted and saved"
|
||||
|
||||
# Show current state
|
||||
echo ""
|
||||
echo "--- Current PM2 State ---"
|
||||
pm2 list
|
||||
pm2 --namespace flyer-crawler-test list
|
||||
|
||||
# Verify version in PM2 metadata
|
||||
echo ""
|
||||
echo "--- Verifying Version in PM2 ---"
|
||||
pm2 jlist | node -e "
|
||||
pm2 --namespace flyer-crawler-test jlist | node -e "
|
||||
try {
|
||||
const list = JSON.parse(require('fs').readFileSync(0, 'utf-8'));
|
||||
const testProcesses = list.filter(p => p.name && p.name.endsWith('-test'));
|
||||
|
||||
113
CLAUDE.md
113
CLAUDE.md
@@ -45,45 +45,71 @@ Out-of-sync = test failures.
|
||||
- Maximum 3 fix commands at a time (errors may cascade)
|
||||
- Always verify after fixes complete
|
||||
|
||||
### PM2 Process Isolation (Production/Test Servers)
|
||||
### PM2 Namespace Isolation (Production/Test Servers)
|
||||
|
||||
**CRITICAL**: Production and test environments share the same PM2 daemon on the server.
|
||||
|
||||
Flyer-crawler uses PM2 namespaces to isolate test and production processes:
|
||||
|
||||
| Namespace | Purpose | Config File |
|
||||
| -------------------- | ---------------------- | --------------------------- |
|
||||
| `flyer-crawler-prod` | Production environment | `ecosystem.config.cjs` |
|
||||
| `flyer-crawler-test` | Test environment | `ecosystem-test.config.cjs` |
|
||||
| `flyer-crawler-dev` | Development container | `ecosystem.dev.config.cjs` |
|
||||
|
||||
This prevents `pm2 save` race conditions during simultaneous deployments. See [ADR-063](docs/adr/0063-pm2-namespace-implementation.md) for details.
|
||||
|
||||
**See also**: [PM2 Process Isolation Incidents](#pm2-process-isolation-incidents) for past incidents and response procedures.
|
||||
|
||||
| Environment | Processes | Config File |
|
||||
| ----------- | -------------------------------------------------------------------------------------------- | --------------------------- |
|
||||
| Production | `flyer-crawler-api`, `flyer-crawler-worker`, `flyer-crawler-analytics-worker` | `ecosystem.config.cjs` |
|
||||
| Test | `flyer-crawler-api-test`, `flyer-crawler-worker-test`, `flyer-crawler-analytics-worker-test` | `ecosystem-test.config.cjs` |
|
||||
| Development | `flyer-crawler-api-dev`, `flyer-crawler-worker-dev`, `flyer-crawler-vite-dev` | `ecosystem.dev.config.cjs` |
|
||||
| Environment | Processes | Namespace |
|
||||
| ----------- | -------------------------------------------------------------------------------------------- | -------------------- |
|
||||
| Production | `flyer-crawler-api`, `flyer-crawler-worker`, `flyer-crawler-analytics-worker` | `flyer-crawler-prod` |
|
||||
| Test | `flyer-crawler-api-test`, `flyer-crawler-worker-test`, `flyer-crawler-analytics-worker-test` | `flyer-crawler-test` |
|
||||
| Development | `flyer-crawler-api-dev`, `flyer-crawler-worker-dev`, `flyer-crawler-vite-dev` | `flyer-crawler-dev` |
|
||||
|
||||
**Deployment Scripts MUST:**
|
||||
|
||||
- ✅ Use `--namespace` flag for all PM2 commands to scope to correct environment
|
||||
- ✅ Filter PM2 commands by exact process names or name patterns (e.g., `endsWith('-test')`)
|
||||
- ❌ NEVER use `pm2 stop all`, `pm2 delete all`, or `pm2 restart all`
|
||||
- ❌ NEVER use `pm2 stop all`, `pm2 delete all`, or `pm2 restart all` without namespace
|
||||
- ❌ NEVER delete/stop processes based solely on status without name filtering
|
||||
- ✅ Always verify process names match the target environment before any operation
|
||||
|
||||
**Examples:**
|
||||
|
||||
```bash
|
||||
# ✅ CORRECT - Production cleanup (filter by name)
|
||||
pm2 stop flyer-crawler-api flyer-crawler-worker flyer-crawler-analytics-worker
|
||||
# ✅ CORRECT - Production commands with namespace
|
||||
pm2 start ecosystem.config.cjs --namespace flyer-crawler-prod
|
||||
pm2 stop flyer-crawler-api flyer-crawler-worker --namespace flyer-crawler-prod
|
||||
pm2 restart all --namespace flyer-crawler-prod && pm2 save --namespace flyer-crawler-prod
|
||||
pm2 logs --namespace flyer-crawler-prod
|
||||
|
||||
# ✅ CORRECT - Test cleanup (filter by name pattern)
|
||||
# ✅ CORRECT - Test commands with namespace
|
||||
pm2 start ecosystem-test.config.cjs --namespace flyer-crawler-test
|
||||
pm2 status --namespace flyer-crawler-test
|
||||
pm2 delete all --namespace flyer-crawler-test && pm2 save --namespace flyer-crawler-test
|
||||
|
||||
# ✅ CORRECT - Dev container commands with namespace
|
||||
pm2 start ecosystem.dev.config.cjs --namespace flyer-crawler-dev
|
||||
pm2 logs --namespace flyer-crawler-dev
|
||||
|
||||
# ✅ CORRECT - Test cleanup (filter by namespace + name pattern)
|
||||
# Only delete test processes that are errored/stopped
|
||||
list.forEach(p => {
|
||||
if ((p.pm2_env.status === 'errored' || p.pm2_env.status === 'stopped') &&
|
||||
p.name && p.name.endsWith('-test')) {
|
||||
exec('pm2 delete ' + p.pm2_env.pm_id);
|
||||
p.name && p.name.endsWith('-test') &&
|
||||
p.pm2_env.namespace === 'flyer-crawler-test') {
|
||||
exec('pm2 delete ' + p.pm2_env.pm_id + ' --namespace flyer-crawler-test');
|
||||
}
|
||||
});
|
||||
exec('pm2 save --namespace flyer-crawler-test');
|
||||
|
||||
# ❌ WRONG - Affects all environments
|
||||
# ❌ WRONG - Missing namespace (affects all environments)
|
||||
pm2 stop all
|
||||
pm2 delete all
|
||||
pm2 restart all
|
||||
|
||||
# ❌ WRONG - No name filtering (could delete test processes during prod deploy)
|
||||
# ❌ WRONG - No name/namespace filtering (could delete test processes during prod deploy)
|
||||
if (p.pm2_env.status === 'errored') {
|
||||
exec('pm2 delete ' + p.pm2_env.pm_id);
|
||||
}
|
||||
@@ -91,7 +117,7 @@ if (p.pm2_env.status === 'errored') {
|
||||
|
||||
### PM2 Save Requirement (CRITICAL)
|
||||
|
||||
**CRITICAL**: Every `pm2 start`, `pm2 restart`, `pm2 stop`, or `pm2 delete` command MUST be immediately followed by `pm2 save`.
|
||||
**CRITICAL**: Every `pm2 start`, `pm2 restart`, `pm2 stop`, or `pm2 delete` command MUST be immediately followed by `pm2 save` with the same namespace.
|
||||
|
||||
Without `pm2 save`, processes become ephemeral and will disappear on:
|
||||
|
||||
@@ -102,27 +128,31 @@ Without `pm2 save`, processes become ephemeral and will disappear on:
|
||||
**Pattern:**
|
||||
|
||||
```bash
|
||||
# ✅ CORRECT - Save after every state change
|
||||
pm2 start ecosystem.config.cjs && pm2 save
|
||||
pm2 restart my-app && pm2 save
|
||||
pm2 stop my-app && pm2 save
|
||||
pm2 delete my-app && pm2 save
|
||||
# ✅ CORRECT - Save after every state change (with namespace)
|
||||
pm2 start ecosystem.config.cjs --namespace flyer-crawler-prod && pm2 save --namespace flyer-crawler-prod
|
||||
pm2 restart my-app --namespace flyer-crawler-prod && pm2 save --namespace flyer-crawler-prod
|
||||
pm2 stop my-app --namespace flyer-crawler-test && pm2 save --namespace flyer-crawler-test
|
||||
pm2 delete my-app --namespace flyer-crawler-test && pm2 save --namespace flyer-crawler-test
|
||||
|
||||
# ❌ WRONG - Missing save (processes become ephemeral)
|
||||
pm2 start ecosystem.config.cjs
|
||||
pm2 restart my-app
|
||||
pm2 start ecosystem.config.cjs --namespace flyer-crawler-prod
|
||||
pm2 restart my-app --namespace flyer-crawler-prod
|
||||
|
||||
# ❌ WRONG - Missing namespace (affects wrong environment)
|
||||
pm2 start ecosystem.config.cjs && pm2 save
|
||||
```
|
||||
|
||||
**In Cleanup Scripts:**
|
||||
|
||||
```javascript
|
||||
// ✅ CORRECT - Save after cleanup loop completes
|
||||
// ✅ CORRECT - Save after cleanup loop completes (with namespace)
|
||||
const namespace = 'flyer-crawler-test';
|
||||
targetProcesses.forEach((p) => {
|
||||
exec('pm2 delete ' + p.pm2_env.pm_id);
|
||||
exec(`pm2 delete ${p.pm2_env.pm_id} --namespace ${namespace}`);
|
||||
});
|
||||
exec('pm2 save'); // Persist all deletions
|
||||
exec(`pm2 save --namespace ${namespace}`); // Persist all deletions
|
||||
|
||||
// ❌ WRONG - Missing save
|
||||
// ❌ WRONG - Missing save and namespace
|
||||
targetProcesses.forEach((p) => {
|
||||
exec('pm2 delete ' + p.pm2_env.pm_id);
|
||||
});
|
||||
@@ -130,12 +160,13 @@ targetProcesses.forEach((p) => {
|
||||
|
||||
**Why This Matters:**
|
||||
|
||||
PM2 maintains an in-memory process list. The `pm2 save` command writes this list to `~/.pm2/dump.pm2`, which PM2 uses to resurrect processes after daemon restarts. Without it, your carefully managed process state is lost.
|
||||
PM2 maintains an in-memory process list. The `pm2 save` command writes this list to `~/.pm2/dump.pm2`, which PM2 uses to resurrect processes after daemon restarts. Without it, your carefully managed process state is lost. Using namespaces ensures that `pm2 save` in one environment does not affect another.
|
||||
|
||||
**See Also:**
|
||||
|
||||
- [ADR-014: Containerization and Deployment Strategy](docs/adr/0014-containerization-and-deployment-strategy.md)
|
||||
- [ADR-061: PM2 Process Isolation Safeguards](docs/adr/0061-pm2-process-isolation-safeguards.md)
|
||||
- [ADR-063: PM2 Namespace Implementation](docs/adr/0063-pm2-namespace-implementation.md)
|
||||
|
||||
### Communication Style
|
||||
|
||||
@@ -150,7 +181,7 @@ Ask before assuming. Never assume:
|
||||
1. **Memory**: `mcp__memory__read_graph` - Recall project context, credentials, known issues
|
||||
2. **Git**: `git log --oneline -10` - Recent changes
|
||||
3. **Containers**: `mcp__podman__container_list` - Running state
|
||||
4. **PM2 Status**: `podman exec flyer-crawler-dev pm2 status` - Process health (API, Worker, Vite)
|
||||
4. **PM2 Status**: `podman exec flyer-crawler-dev pm2 status --namespace flyer-crawler-dev` - Process health (API, Worker, Vite)
|
||||
|
||||
---
|
||||
|
||||
@@ -158,15 +189,17 @@ Ask before assuming. Never assume:
|
||||
|
||||
### Essential Commands
|
||||
|
||||
| Command | Description |
|
||||
| ------------------------------------------------------------ | --------------------- |
|
||||
| `podman exec -it flyer-crawler-dev npm test` | Run all tests |
|
||||
| `podman exec -it flyer-crawler-dev npm run test:unit` | Unit tests only |
|
||||
| `podman exec -it flyer-crawler-dev npm run type-check` | TypeScript check |
|
||||
| `podman exec -it flyer-crawler-dev npm run test:integration` | Integration tests |
|
||||
| `podman exec -it flyer-crawler-dev pm2 status` | PM2 process status |
|
||||
| `podman exec -it flyer-crawler-dev pm2 logs` | View all PM2 logs |
|
||||
| `podman exec -it flyer-crawler-dev pm2 restart all` | Restart all processes |
|
||||
| Command | Description |
|
||||
| --------------------------------------------------------------------------------- | ------------------------ |
|
||||
| `podman exec -it flyer-crawler-dev npm test` | Run all tests |
|
||||
| `podman exec -it flyer-crawler-dev npm run test:unit` | Unit tests only |
|
||||
| `podman exec -it flyer-crawler-dev npm run type-check` | TypeScript check |
|
||||
| `podman exec -it flyer-crawler-dev npm run test:integration` | Integration tests |
|
||||
| `podman exec -it flyer-crawler-dev pm2 status --namespace flyer-crawler-dev` | PM2 process status (dev) |
|
||||
| `podman exec -it flyer-crawler-dev pm2 logs --namespace flyer-crawler-dev` | View PM2 logs (dev) |
|
||||
| `podman exec -it flyer-crawler-dev pm2 restart all --namespace flyer-crawler-dev` | Restart all (dev) |
|
||||
| `pm2 status --namespace flyer-crawler-prod` | PM2 status (production) |
|
||||
| `pm2 status --namespace flyer-crawler-test` | PM2 status (test) |
|
||||
|
||||
### Key Patterns (with file locations)
|
||||
|
||||
@@ -222,11 +255,13 @@ The dev container now matches production by using PM2 for process management.
|
||||
|
||||
| Component | Production | Dev Container |
|
||||
| ---------- | ---------------------- | ------------------------- |
|
||||
| API Server | PM2 cluster mode | PM2 fork mode + tsx watch |
|
||||
| API Server | PM2 fork mode | PM2 fork mode + tsx watch |
|
||||
| Worker | PM2 process | PM2 process + tsx watch |
|
||||
| Frontend | Static files via NGINX | PM2 + Vite dev server |
|
||||
| Logs | PM2 logs -> Logstash | PM2 logs -> Logstash |
|
||||
|
||||
**Note:** PM2 cluster mode is incompatible with tsx as script path. See [PM2-CLUSTER-MODE-INCOMPATIBILITY.md](docs/operations/PM2-CLUSTER-MODE-INCOMPATIBILITY.md).
|
||||
|
||||
**PM2 Processes in Dev Container**:
|
||||
|
||||
- `flyer-crawler-api-dev` - API server (port 3001)
|
||||
@@ -367,7 +402,7 @@ Common issues with solutions:
|
||||
|
||||
**Related Documentation**:
|
||||
|
||||
- [PM2 Process Isolation Requirements](#pm2-process-isolation-productiontest-servers) (existing section)
|
||||
- [PM2 Namespace Isolation](#pm2-namespace-isolation-productiontest-servers) (existing section)
|
||||
- [Incident Report 2026-02-17](docs/operations/INCIDENT-2026-02-17-PM2-PROCESS-KILL.md)
|
||||
- [PM2 Incident Response Runbook](docs/operations/PM2-INCIDENT-RESPONSE.md)
|
||||
|
||||
|
||||
16
README.md
16
README.md
@@ -88,7 +88,7 @@ See [docs/development/TESTING.md](docs/development/TESTING.md) for testing guide
|
||||
| [⚙️ Installation Guide](docs/getting-started/INSTALL.md) | Local development setup with Podman |
|
||||
| [🏗️ Architecture Overview](docs/architecture/DATABASE.md) | System design, database, authentication |
|
||||
| [💻 Development Guide](docs/development/TESTING.md) | Testing, debugging, code patterns |
|
||||
| [🚀 Deployment Guide](docs/operations/DEPLOYMENT.md) | Production setup, NGINX, PM2 |
|
||||
| [🚀 Deployment Guide](docs/operations/DEPLOYMENT.md) | Production setup, NGINX, PM2 namespaces |
|
||||
| [🤖 AI Agent Guides](docs/subagents/OVERVIEW.md) | Working with Claude Code subagents |
|
||||
|
||||
### Quick References
|
||||
@@ -126,13 +126,13 @@ See [INSTALL.md](INSTALL.md) for the complete list.
|
||||
|
||||
## Scripts
|
||||
|
||||
| Command | Description |
|
||||
| -------------------- | -------------------------------- |
|
||||
| `npm run dev` | Start development server |
|
||||
| `npm run build` | Build for production |
|
||||
| `npm run start:prod` | Start production server with PM2 |
|
||||
| `npm run test` | Run test suite |
|
||||
| `npm run seed` | Seed development user accounts |
|
||||
| Command | Description |
|
||||
| -------------------- | ----------------------------------------------------------- |
|
||||
| `npm run dev` | Start development server |
|
||||
| `npm run build` | Build for production |
|
||||
| `npm run start:prod` | Start production server with PM2 (uses namespace isolation) |
|
||||
| `npm run test` | Run test suite |
|
||||
| `npm run seed` | Seed development user accounts |
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -47,9 +47,11 @@ Production operations and deployment:
|
||||
- [Logstash Troubleshooting](operations/LOGSTASH-TROUBLESHOOTING.md) - Debugging logs
|
||||
- [Monitoring](operations/MONITORING.md) - Bugsink, health checks, observability
|
||||
|
||||
**Incident Response**:
|
||||
**PM2 Management**:
|
||||
|
||||
- [PM2 Namespace Completion Report](operations/PM2-NAMESPACE-COMPLETION-REPORT.md) - PM2 namespace implementation project summary
|
||||
- [PM2 Incident Response Runbook](operations/PM2-INCIDENT-RESPONSE.md) - Step-by-step procedures for PM2 incidents
|
||||
- [PM2 Crash Debugging](operations/PM2-CRASH-DEBUGGING.md) - Troubleshooting PM2 crashes
|
||||
|
||||
**Incident Reports**:
|
||||
|
||||
|
||||
@@ -49,7 +49,7 @@ Tests that pass on Windows but fail on Linux are considered **broken tests**. Te
|
||||
|
||||
We will standardize the deployment process using a hybrid approach:
|
||||
|
||||
1. **PM2 for Production**: Use PM2 cluster mode for process management, load balancing, and zero-downtime reloads.
|
||||
1. **PM2 for Production**: Use PM2 for process management. ~~Cluster mode~~ Fork mode used due to tsx incompatibility (see [PM2-CLUSTER-MODE-INCOMPATIBILITY.md](../operations/PM2-CLUSTER-MODE-INCOMPATIBILITY.md)).
|
||||
2. **Docker/Podman for Development**: Provide a complete containerized development environment with automatic initialization.
|
||||
3. **VS Code Dev Containers**: Enable one-click development environment setup.
|
||||
4. **Gitea Actions for CI/CD**: Automated deployment pipelines handle builds and deployments.
|
||||
@@ -187,13 +187,13 @@ Located in `ecosystem.config.cjs`:
|
||||
module.exports = {
|
||||
apps: [
|
||||
{
|
||||
// API Server - Cluster mode for load balancing
|
||||
// API Server - Fork mode (tsx incompatible with cluster)
|
||||
name: 'flyer-crawler-api',
|
||||
script: './node_modules/.bin/tsx',
|
||||
args: 'server.ts',
|
||||
max_memory_restart: '500M',
|
||||
instances: 'max', // Use all CPU cores
|
||||
exec_mode: 'cluster', // Enable cluster mode
|
||||
instances: 1, // Fork mode - single instance
|
||||
exec_mode: 'fork', // tsx requires fork mode
|
||||
kill_timeout: 5000, // Graceful shutdown timeout
|
||||
|
||||
// Restart configuration
|
||||
@@ -358,6 +358,42 @@ podman-compose -f compose.dev.yml build app
|
||||
|
||||
**Rationale**: Developers and CI systems should never need to run manual setup commands to execute tests. If the container is running, tests should work. Any deviation from this principle indicates an incomplete container setup.
|
||||
|
||||
## Updates
|
||||
|
||||
### 2026-02-19: Cluster Mode Disabled
|
||||
|
||||
**Decision:** Disabled PM2 cluster mode in favor of fork mode (single instance).
|
||||
|
||||
**Reason:** PM2 cluster mode is fundamentally incompatible with using `tsx` as the script path. The configuration pattern:
|
||||
|
||||
```javascript
|
||||
{
|
||||
script: './node_modules/.bin/tsx',
|
||||
args: 'server.ts',
|
||||
exec_mode: 'cluster',
|
||||
}
|
||||
```
|
||||
|
||||
causes 75-87% of cluster instances to fail on startup with no clear error messages. Only 1-2 out of 8 instances successfully start, with the rest showing constant restart attempts.
|
||||
|
||||
**Technical Root Cause:** PM2 requires the `node` binary as the interpreter to properly fork cluster workers using Node.js's native cluster module. When `tsx` is the script path, PM2 cannot create cluster workers correctly.
|
||||
|
||||
**Alternative:** To use cluster mode with TypeScript in the future, the correct configuration is:
|
||||
|
||||
```javascript
|
||||
{
|
||||
script: 'server.ts',
|
||||
interpreter: 'node',
|
||||
interpreter_args: '--import tsx', // Node 18.19+
|
||||
exec_mode: 'cluster',
|
||||
instances: 'max',
|
||||
}
|
||||
```
|
||||
|
||||
**Impact:** Current traffic does not require cluster mode load balancing. Fork mode with a single instance provides reliable process management without the cluster overhead.
|
||||
|
||||
**Documentation:** See [PM2-CLUSTER-MODE-INCOMPATIBILITY.md](../operations/PM2-CLUSTER-MODE-INCOMPATIBILITY.md) for full details.
|
||||
|
||||
## Related ADRs
|
||||
|
||||
- [ADR-017](./0017-ci-cd-and-branching-strategy.md) - CI/CD Strategy
|
||||
|
||||
191
docs/adr/0063-pm2-namespace-implementation.md
Normal file
191
docs/adr/0063-pm2-namespace-implementation.md
Normal file
@@ -0,0 +1,191 @@
|
||||
# ADR-063: PM2 Namespace Implementation
|
||||
|
||||
## Status
|
||||
|
||||
Accepted
|
||||
|
||||
## Context
|
||||
|
||||
### Problem
|
||||
|
||||
The PM2 process isolation safeguards implemented in [ADR-061](./0061-pm2-process-isolation-safeguards.md) successfully prevented cross-application process deletion but introduced operational complexity. Every PM2 command in deployment workflows required:
|
||||
|
||||
1. Process name filtering logic (JavaScript inline scripts)
|
||||
2. Safety abort checks (process count validation)
|
||||
3. Pre/post verification logging
|
||||
|
||||
Additionally, simultaneous test and production deployments created a race condition with `pm2 save`:
|
||||
|
||||
- Test deployment: `pm2 save` writes test processes to dump file
|
||||
- Prod deployment: `pm2 save` writes prod processes to dump file (overwrites test state)
|
||||
- PM2 daemon restart: Restores incomplete process list
|
||||
|
||||
This race condition could cause process loss on PM2 daemon restart.
|
||||
|
||||
### Requirements
|
||||
|
||||
1. Complete isolation between test/prod/dev PM2 processes
|
||||
2. Eliminate `pm2 save` race condition
|
||||
3. Simplify workflow commands
|
||||
4. Maintain backward compatibility during migration
|
||||
|
||||
## Decision
|
||||
|
||||
Implement PM2 namespaces with separate dump files per environment:
|
||||
|
||||
| Namespace | Config File | Use Case |
|
||||
| -------------------- | --------------------------- | ----------------------- |
|
||||
| `flyer-crawler-prod` | `ecosystem.config.cjs` | Production deployment |
|
||||
| `flyer-crawler-test` | `ecosystem-test.config.cjs` | Test/staging deployment |
|
||||
| `flyer-crawler-dev` | `ecosystem.dev.config.cjs` | Local development |
|
||||
|
||||
### Implementation
|
||||
|
||||
#### Ecosystem Config Changes
|
||||
|
||||
Each config file declares its namespace at the module level:
|
||||
|
||||
```javascript
|
||||
// ecosystem.config.cjs (production)
|
||||
module.exports = {
|
||||
namespace: 'flyer-crawler-prod',
|
||||
apps: [
|
||||
/* ... */
|
||||
],
|
||||
};
|
||||
|
||||
// ecosystem-test.config.cjs (test)
|
||||
module.exports = {
|
||||
namespace: 'flyer-crawler-test',
|
||||
apps: [
|
||||
/* ... */
|
||||
],
|
||||
};
|
||||
|
||||
// ecosystem.dev.config.cjs (development)
|
||||
module.exports = {
|
||||
namespace: 'flyer-crawler-dev',
|
||||
apps: [
|
||||
/* ... */
|
||||
],
|
||||
};
|
||||
```
|
||||
|
||||
#### Workflow Command Pattern
|
||||
|
||||
All PM2 commands require `--namespace` flag:
|
||||
|
||||
```bash
|
||||
# Start/reload
|
||||
pm2 startOrReload ecosystem.config.cjs --update-env --namespace flyer-crawler-prod
|
||||
|
||||
# Process management
|
||||
pm2 stop flyer-crawler-api --namespace flyer-crawler-prod
|
||||
pm2 restart flyer-crawler-api flyer-crawler-worker --namespace flyer-crawler-prod
|
||||
pm2 delete flyer-crawler-api --namespace flyer-crawler-prod
|
||||
|
||||
# Status
|
||||
pm2 list --namespace flyer-crawler-prod
|
||||
pm2 jlist --namespace flyer-crawler-prod
|
||||
pm2 logs flyer-crawler-api --namespace flyer-crawler-prod
|
||||
pm2 describe flyer-crawler-api --namespace flyer-crawler-prod
|
||||
|
||||
# Save (namespace-isolated dump file)
|
||||
pm2 save --namespace flyer-crawler-prod
|
||||
```
|
||||
|
||||
#### Migration Script
|
||||
|
||||
Zero-downtime migration from unnamed processes to namespaced processes:
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# migrate-to-pm2-namespaces.sh
|
||||
|
||||
# 1. Stop old processes (by name)
|
||||
pm2 stop flyer-crawler-api flyer-crawler-worker flyer-crawler-analytics-worker || true
|
||||
pm2 stop flyer-crawler-api-test flyer-crawler-worker-test flyer-crawler-analytics-worker-test || true
|
||||
|
||||
# 2. Delete old processes
|
||||
pm2 delete flyer-crawler-api flyer-crawler-worker flyer-crawler-analytics-worker || true
|
||||
pm2 delete flyer-crawler-api-test flyer-crawler-worker-test flyer-crawler-analytics-worker-test || true
|
||||
|
||||
# 3. Save to clear old dump file
|
||||
pm2 save --force
|
||||
|
||||
# 4. Start with namespaces
|
||||
cd /var/www/flyer-crawler.projectium.com
|
||||
pm2 start ecosystem.config.cjs --namespace flyer-crawler-prod
|
||||
pm2 save --namespace flyer-crawler-prod
|
||||
|
||||
cd /var/www/flyer-crawler-test.projectium.com
|
||||
pm2 start ecosystem-test.config.cjs --namespace flyer-crawler-test
|
||||
pm2 save --namespace flyer-crawler-test
|
||||
```
|
||||
|
||||
## Consequences
|
||||
|
||||
### Positive
|
||||
|
||||
1. **Complete Process Isolation**: Namespaces create logical boundaries preventing cross-environment process operations
|
||||
2. **No Save Race Condition**: Each namespace maintains separate dump file at `~/.pm2/dump-<namespace>.pm2`
|
||||
3. **Simplified Commands**: No inline JavaScript filtering; use explicit namespace flag
|
||||
4. **Clear Organization**: `pm2 list --namespace <name>` shows only relevant processes
|
||||
5. **Retained Safeguards**: Defense-in-depth from ADR-061 remains as additional protection layer
|
||||
|
||||
### Negative
|
||||
|
||||
1. **Command Verbosity**: All PM2 commands require `--namespace` flag
|
||||
2. **Migration Required**: One-time migration to move existing processes into namespaces
|
||||
3. **Learning Curve**: Team must remember to include namespace flag
|
||||
|
||||
### Trade-offs
|
||||
|
||||
| Without Namespace | With Namespace |
|
||||
| --------------------------------- | ------------------------------------------------ |
|
||||
| `pm2 list` | `pm2 list --namespace flyer-crawler-prod` |
|
||||
| `pm2 logs app` | `pm2 logs app --namespace flyer-crawler-prod` |
|
||||
| `pm2 restart app` | `pm2 restart app --namespace flyer-crawler-prod` |
|
||||
| Filter logic in workflows | Explicit namespace declaration |
|
||||
| Single dump file (race condition) | Per-namespace dump files |
|
||||
|
||||
## Files Modified
|
||||
|
||||
| File | Changes |
|
||||
| ------------------------------------------ | ---------------------------------------------------------- |
|
||||
| `ecosystem.config.cjs` | Added `namespace: 'flyer-crawler-prod'` |
|
||||
| `ecosystem-test.config.cjs` | Added `namespace: 'flyer-crawler-test'` |
|
||||
| `ecosystem.dev.config.cjs` | Added `namespace: 'flyer-crawler-dev'` |
|
||||
| `.gitea/workflows/deploy-to-prod.yml` | Added `--namespace flyer-crawler-prod` to all PM2 commands |
|
||||
| `.gitea/workflows/deploy-to-test.yml` | Added `--namespace flyer-crawler-test` to all PM2 commands |
|
||||
| `.gitea/workflows/restart-pm2.yml` | Added `--namespace` flag for both environments |
|
||||
| `.gitea/workflows/manual-deploy-major.yml` | Added `--namespace flyer-crawler-prod` to PM2 commands |
|
||||
|
||||
## Verification
|
||||
|
||||
After migration, verify namespace isolation:
|
||||
|
||||
```bash
|
||||
# Should show only production processes
|
||||
pm2 list --namespace flyer-crawler-prod
|
||||
|
||||
# Should show only test processes
|
||||
pm2 list --namespace flyer-crawler-test
|
||||
|
||||
# Should show only dev processes (if running)
|
||||
pm2 list --namespace flyer-crawler-dev
|
||||
|
||||
# Verify separate dump files exist
|
||||
ls -la ~/.pm2/dump-flyer-crawler-*.pm2
|
||||
```
|
||||
|
||||
## Related Documentation
|
||||
|
||||
- [ADR-061: PM2 Process Isolation Safeguards](./0061-pm2-process-isolation-safeguards.md) - Prior safeguards (still active)
|
||||
- [ADR-014: Containerization and Deployment Strategy](./0014-containerization-and-deployment-strategy.md) - Overall deployment architecture
|
||||
- [PM2 Namespace Documentation](https://pm2.keymetrics.io/docs/usage/application-declaration/#namespace)
|
||||
|
||||
## References
|
||||
|
||||
- PM2 Ecosystem File: https://pm2.keymetrics.io/docs/usage/application-declaration/
|
||||
- PM2 Namespaces: https://pm2.keymetrics.io/docs/usage/process-management/#namespaces
|
||||
@@ -58,6 +58,7 @@ This directory contains a log of the architectural decisions made for the Flyer
|
||||
**[ADR-054](./0054-bugsink-gitea-issue-sync.md)**: Bugsink to Gitea Issue Synchronization (Proposed)
|
||||
**[ADR-061](./0061-pm2-process-isolation-safeguards.md)**: PM2 Process Isolation Safeguards (Accepted)
|
||||
**[ADR-062](./0062-lightweight-version-sync-workflow.md)**: Lightweight Version Sync Workflow (Accepted)
|
||||
**[ADR-063](./0063-pm2-namespace-implementation.md)**: PM2 Namespace Implementation (Accepted)
|
||||
|
||||
## 7. Frontend / User Interface
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
131
docs/operations/PM2-CLUSTER-MODE-INCOMPATIBILITY.md
Normal file
131
docs/operations/PM2-CLUSTER-MODE-INCOMPATIBILITY.md
Normal file
@@ -0,0 +1,131 @@
|
||||
# PM2 Cluster Mode Incompatibility with tsx
|
||||
|
||||
**Date Documented:** 2026-02-19
|
||||
**Affected Version:** v0.21.0
|
||||
**Status:** Resolved by switching to fork mode
|
||||
|
||||
## Issue Summary
|
||||
|
||||
PM2 cluster mode is fundamentally incompatible with using `tsx` as the script path in the ecosystem configuration. This manifests as 6-7 out of 8 cluster instances remaining in `errored` or `stopped` state with constant restart attempts (16-17 restarts observed), while only 1-2 instances successfully start.
|
||||
|
||||
## Root Cause
|
||||
|
||||
The configuration pattern used in [ecosystem.config.cjs:68-73](../../ecosystem.config.cjs#L68-L73):
|
||||
|
||||
```javascript
|
||||
{
|
||||
script: './node_modules/.bin/tsx',
|
||||
args: 'server.ts',
|
||||
exec_mode: 'cluster',
|
||||
instances: 'max',
|
||||
}
|
||||
```
|
||||
|
||||
**This is incompatible with PM2's cluster mode** because PM2 requires the `node` binary as the interpreter to properly fork worker processes using Node.js's native cluster module. When `tsx` is used as the script, PM2 cannot create cluster workers correctly.
|
||||
|
||||
## Technical Explanation
|
||||
|
||||
PM2's cluster mode uses Node.js's built-in `cluster` module to spawn multiple worker processes that share the same server port. This architecture requires:
|
||||
|
||||
1. **The interpreter must be `node`** - PM2 forks the main process using `node` as the binary
|
||||
2. **TypeScript transpilation must happen via Node.js loaders** - tsx or ts-node must be loaded as a Node.js module loader, not as the executable
|
||||
|
||||
When `tsx` is the script path (not the interpreter), PM2 attempts to treat it as the application entry point, which breaks the cluster fork mechanism. This causes worker processes to fail on startup with no clear error messages in logs (as observed: empty log files despite 16-17 restart attempts).
|
||||
|
||||
## The Correct Way to Use tsx with Cluster Mode
|
||||
|
||||
If cluster mode is required in the future, the proper configuration is:
|
||||
|
||||
```javascript
|
||||
{
|
||||
name: 'flyer-crawler-api',
|
||||
script: 'server.ts', // Direct TypeScript file
|
||||
interpreter: 'node', // Use node as interpreter
|
||||
interpreter_args: '--import tsx', // tsx as Node.js loader (Node 18.19+)
|
||||
exec_mode: 'cluster',
|
||||
instances: 'max',
|
||||
}
|
||||
```
|
||||
|
||||
**Important:** For Node.js 18.18 and below, use `--loader tsx` instead of `--import tsx`.
|
||||
|
||||
This configuration:
|
||||
|
||||
- Uses `node` as the interpreter (required for cluster mode)
|
||||
- Loads `tsx` as a Node.js module loader via `--import`
|
||||
- Allows PM2 to properly fork cluster workers
|
||||
|
||||
## Current Resolution
|
||||
|
||||
Since the application does not have high enough traffic to require cluster mode load balancing, we switched to **fork mode** with a single instance:
|
||||
|
||||
```javascript
|
||||
{
|
||||
instances: 1,
|
||||
exec_mode: 'fork',
|
||||
}
|
||||
```
|
||||
|
||||
This provides:
|
||||
|
||||
- ✅ Reliable process startup
|
||||
- ✅ Proper TypeScript execution via tsx
|
||||
- ✅ PM2 process management (restarts, logs, monitoring)
|
||||
- ❌ No load balancing across CPU cores (acceptable for current traffic)
|
||||
|
||||
## When to Reconsider Cluster Mode
|
||||
|
||||
Cluster mode should be reconsidered when:
|
||||
|
||||
1. **Traffic increases significantly** - Multiple CPU cores needed for request handling
|
||||
2. **Zero-downtime deploys are required** - PM2 reload works only in cluster mode
|
||||
3. **Configuration is updated** - Use the correct `interpreter` + `interpreter_args` pattern above
|
||||
|
||||
## Diagnostic Commands Used
|
||||
|
||||
```bash
|
||||
# Check PM2 process status
|
||||
pm2 ps
|
||||
|
||||
# View logs for failing instances
|
||||
pm2 logs flyer-crawler-api --namespace flyer-crawler-prod --lines 100 --nostream
|
||||
|
||||
# Get detailed process information
|
||||
pm2 describe 10 --namespace flyer-crawler-prod
|
||||
|
||||
# Check Node.js and PM2 versions
|
||||
node --version # v22.22.0
|
||||
pm2 --version # v6.0.13
|
||||
```
|
||||
|
||||
## Evidence from Production
|
||||
|
||||
```
|
||||
┌────┬────────────────────────────────────────┬─────────────┬─────────┬─────────┬──────────┬────────┬──────┬───────────┐
|
||||
│ id │ name │ namespace │ version │ mode │ pid │ uptime │ ↺ │ status │
|
||||
├────┼────────────────────────────────────────┼─────────────┼─────────┼─────────┼──────────┼────────┼──────┼───────────┤
|
||||
│ 10 │ flyer-crawler-api │ flyer-craw… │ 0.21.0 │ cluster │ 2032773 │ 6m │ 0 │ online │
|
||||
│ 13 │ flyer-crawler-api │ flyer-craw… │ 0.21.0 │ cluster │ 2044125 │ 0 │ 16 │ errored │
|
||||
│ 14 │ flyer-crawler-api │ flyer-craw… │ 0.21.0 │ cluster │ 2044198 │ 0 │ 16 │ errored │
|
||||
│ 15 │ flyer-crawler-api │ flyer-craw… │ 0.21.0 │ cluster │ 2044221 │ 0 │ 16 │ errored │
|
||||
│ 16 │ flyer-crawler-api │ flyer-craw… │ 0.21.0 │ cluster │ 2044179 │ 0 │ 16 │ errored │
|
||||
│ 17 │ flyer-crawler-api │ flyer-craw… │ 0.21.0 │ cluster │ 2044358 │ 12s │ 17 │ online │
|
||||
│ 18 │ flyer-crawler-api │ flyer-craw… │ 0.21.0 │ cluster │ 2044100 │ 0 │ 16 │ errored │
|
||||
│ 19 │ flyer-crawler-api │ flyer-craw… │ 0.21.0 │ cluster │ 2044243 │ 0 │ 17 │ errored │
|
||||
└────┴────────────────────────────────────────┴─────────────┴─────────┴─────────┴──────────┴────────┴──────┴───────────┘
|
||||
```
|
||||
|
||||
Only 2 out of 8 cluster instances were online; the rest showed 16-17 restart attempts with empty log files.
|
||||
|
||||
## References
|
||||
|
||||
- [PM2 — Use TSX to Start Your App](https://futurestud.io/tutorials/pm2-use-tsx-to-start-your-app)
|
||||
- [Running typescript app with pm2 and tsx](https://blog.vramana.com/posts/2023-02-05-pm2-tsx/)
|
||||
- [PM2 and cluster mode in Node.js/TypeScript · Issue #5790](https://github.com/Unitech/pm2/issues/5790)
|
||||
- [PM2 - Cluster Mode Documentation](https://pm2.keymetrics.io/docs/usage/cluster-mode/)
|
||||
- [PM2 - Transpilers Integration](https://pm2.io/docs/runtime/integration/transpilers/)
|
||||
|
||||
## Related ADRs
|
||||
|
||||
- [ADR-014: Containerization and Deployment Strategy](../adr/0014-containerization-and-deployment-strategy.md) - Original cluster mode decision
|
||||
- [ADR-063: PM2 Namespace Implementation](../adr/0063-pm2-namespace-implementation.md) - PM2 namespace isolation
|
||||
395
docs/operations/PM2-NAMESPACE-COMPLETION-REPORT.md
Normal file
395
docs/operations/PM2-NAMESPACE-COMPLETION-REPORT.md
Normal file
@@ -0,0 +1,395 @@
|
||||
# PM2 Namespace Implementation - Project Completion Report
|
||||
|
||||
**Date:** 2026-02-18
|
||||
**Status:** Complete
|
||||
**ADR Reference:** [ADR-063: PM2 Namespace Implementation](../adr/0063-pm2-namespace-implementation.md)
|
||||
|
||||
---
|
||||
|
||||
## Executive Summary
|
||||
|
||||
The PM2 namespace implementation for the flyer-crawler project is now 100% complete. This implementation provides complete process isolation between production, test, and development environments, eliminating race conditions during parallel deployments and simplifying PM2 management commands.
|
||||
|
||||
### Key Achievements
|
||||
|
||||
| Metric | Value |
|
||||
| ------------------------------ | --------------------------------- |
|
||||
| **Namespaces Implemented** | 3 (production, test, development) |
|
||||
| **Workflow Files Updated** | 6 |
|
||||
| **Config Files Modified** | 3 |
|
||||
| **Test Coverage** | 89 tests (all passing) |
|
||||
| **Race Conditions Eliminated** | `pm2 save` isolation complete |
|
||||
|
||||
---
|
||||
|
||||
## Problem Statement
|
||||
|
||||
Prior to this implementation, the project experienced critical issues with PM2 process management:
|
||||
|
||||
1. **Race Condition with `pm2 save`**: Simultaneous test and production deployments could overwrite each other's PM2 dump files, causing process loss on PM2 daemon restart.
|
||||
|
||||
2. **Cross-Environment Process Interference**: PM2 commands without proper filtering could affect processes across environments (test/production).
|
||||
|
||||
3. **Operational Complexity**: Every PM2 command required JavaScript inline filtering logic for safety.
|
||||
|
||||
4. **2026-02-17 Incident**: A production deployment accidentally killed ALL PM2 processes on the server, affecting both flyer-crawler and other PM2-managed applications.
|
||||
|
||||
---
|
||||
|
||||
## Solution Implemented
|
||||
|
||||
### Namespace Architecture
|
||||
|
||||
| Environment | Namespace | Config File | Use Case |
|
||||
| ----------- | -------------------- | --------------------------- | ---------------------------------- |
|
||||
| Production | `flyer-crawler-prod` | `ecosystem.config.cjs` | Live production deployment |
|
||||
| Test | `flyer-crawler-test` | `ecosystem-test.config.cjs` | Staging/test deployment |
|
||||
| Development | `flyer-crawler-dev` | `ecosystem.dev.config.cjs` | Local development in dev container |
|
||||
|
||||
### Namespace Definition Pattern
|
||||
|
||||
Each ecosystem config defines its namespace at the module.exports level (not inside apps):
|
||||
|
||||
```javascript
|
||||
// ecosystem.config.cjs (production)
|
||||
module.exports = {
|
||||
namespace: 'flyer-crawler-prod',
|
||||
apps: [
|
||||
{ name: 'flyer-crawler-api' /* ... */ },
|
||||
{ name: 'flyer-crawler-worker' /* ... */ },
|
||||
{ name: 'flyer-crawler-analytics-worker' /* ... */ },
|
||||
],
|
||||
};
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Files Modified
|
||||
|
||||
### Ecosystem Configuration Files
|
||||
|
||||
| File | Change |
|
||||
| --------------------------- | --------------------------------------------------------------- |
|
||||
| `ecosystem.config.cjs` | Added `namespace: 'flyer-crawler-prod'` at module.exports level |
|
||||
| `ecosystem-test.config.cjs` | Added `namespace: 'flyer-crawler-test'` at module.exports level |
|
||||
| `ecosystem.dev.config.cjs` | Added `namespace: 'flyer-crawler-dev'` at module.exports level |
|
||||
|
||||
### Workflow Files
|
||||
|
||||
| File | Changes |
|
||||
| ------------------------------------------ | ------------------------------------------------------------------------------------ |
|
||||
| `.gitea/workflows/deploy-to-prod.yml` | Added `--namespace flyer-crawler-prod` to all PM2 commands |
|
||||
| `.gitea/workflows/deploy-to-test.yml` | Added `--namespace flyer-crawler-test` to all PM2 commands |
|
||||
| `.gitea/workflows/restart-pm2.yml` | Added `--namespace` flags for both test and production environments |
|
||||
| `.gitea/workflows/manual-db-restore.yml` | Added `--namespace flyer-crawler-prod` to PM2 stop, save, and startOrReload commands |
|
||||
| `.gitea/workflows/manual-deploy-major.yml` | Added `--namespace flyer-crawler-prod` to PM2 commands |
|
||||
| `.gitea/workflows/pm2-diagnostics.yml` | Added namespace-specific sections for both production and test |
|
||||
|
||||
### Session-Specific Modifications (2026-02-18)
|
||||
|
||||
The following files were modified in the final session to ensure complete namespace coverage:
|
||||
|
||||
1. **`.gitea/workflows/restart-pm2.yml`**
|
||||
- Line 45: Added `--namespace flyer-crawler-test` to `pm2 save`
|
||||
- Line 69: Added `--namespace flyer-crawler-prod` to `pm2 save`
|
||||
|
||||
2. **`.gitea/workflows/manual-db-restore.yml`**
|
||||
- Line 61: Added `--namespace flyer-crawler-prod` to `pm2 save` (after stopping processes)
|
||||
- Line 95: Added `--namespace flyer-crawler-prod` to `pm2 save` (after restart)
|
||||
|
||||
3. **`tests/pm2-namespace.test.ts`**
|
||||
- Added 6 new tests in the "PM2 Save Namespace Validation" describe block
|
||||
- Validates ALL `pm2 save` commands across all workflow files have namespace flags
|
||||
|
||||
### Migration Script
|
||||
|
||||
| File | Purpose |
|
||||
| ----------------------------------- | ------------------------------------------------------------------------------- |
|
||||
| `scripts/migrate-pm2-namespaces.sh` | Zero-downtime migration script for transitioning servers to namespace-based PM2 |
|
||||
|
||||
### Documentation
|
||||
|
||||
| File | Purpose |
|
||||
| ----------------------------------------------- | ----------------------------------------------------------- |
|
||||
| `docs/adr/0063-pm2-namespace-implementation.md` | Architecture Decision Record documenting the design |
|
||||
| `CLAUDE.md` | Updated PM2 Namespace Isolation section with usage examples |
|
||||
|
||||
---
|
||||
|
||||
## Test Coverage
|
||||
|
||||
### Test File: `tests/pm2-namespace.test.ts`
|
||||
|
||||
Total: **89 tests** (all passing)
|
||||
|
||||
#### Test Categories
|
||||
|
||||
1. **Ecosystem Configurations** (21 tests)
|
||||
- Validates namespace property in each config file
|
||||
- Verifies namespace is at module.exports level (not inside apps)
|
||||
- Confirms correct app definitions per environment
|
||||
- Ensures namespace uniqueness across environments
|
||||
|
||||
2. **Workflow Files** (38 tests)
|
||||
- Validates `--namespace` flag on all PM2 commands:
|
||||
- `pm2 list`
|
||||
- `pm2 jlist`
|
||||
- `pm2 save`
|
||||
- `pm2 stop`
|
||||
- `pm2 startOrReload`
|
||||
- `pm2 delete`
|
||||
- `pm2 logs`
|
||||
- `pm2 describe`
|
||||
- `pm2 env`
|
||||
- Verifies environment selection logic
|
||||
- Checks diagnostic workflows show both namespaces
|
||||
|
||||
3. **PM2 Save Namespace Validation** (6 tests)
|
||||
- Validates ALL `pm2 save` commands have `--namespace` flag
|
||||
- Individual file checks for clarity in test output
|
||||
- Covers: deploy-to-prod.yml, deploy-to-test.yml, restart-pm2.yml, manual-db-restore.yml, manual-deploy-major.yml
|
||||
|
||||
4. **Migration Script** (15 tests)
|
||||
- Validates script options (--dry-run, --test-only, --prod-only)
|
||||
- Verifies namespace constants
|
||||
- Checks rollback instructions
|
||||
- Confirms health check functionality
|
||||
- Validates idempotency logic
|
||||
|
||||
5. **Documentation** (15 tests)
|
||||
- ADR-063 structure validation
|
||||
- CLAUDE.md namespace section
|
||||
- Cross-reference consistency
|
||||
|
||||
6. **End-to-End Consistency** (3 tests)
|
||||
- Matching namespaces between configs and workflows
|
||||
- Namespace flag coverage ratio validation
|
||||
- Dump file isolation documentation
|
||||
|
||||
---
|
||||
|
||||
## Benefits Achieved
|
||||
|
||||
### 1. Race Condition Elimination
|
||||
|
||||
Before:
|
||||
|
||||
```
|
||||
Test deploy: pm2 save -> writes to ~/.pm2/dump.pm2
|
||||
Prod deploy: pm2 save -> overwrites ~/.pm2/dump.pm2
|
||||
PM2 daemon restart -> incomplete process list
|
||||
```
|
||||
|
||||
After:
|
||||
|
||||
```
|
||||
Test deploy: pm2 save --namespace flyer-crawler-test -> writes to ~/.pm2/dump-flyer-crawler-test.pm2
|
||||
Prod deploy: pm2 save --namespace flyer-crawler-prod -> writes to ~/.pm2/dump-flyer-crawler-prod.pm2
|
||||
PM2 daemon restart -> both environments fully restored
|
||||
```
|
||||
|
||||
### 2. Safe Parallel Deployments
|
||||
|
||||
Test and production deployments can now run simultaneously without interference. Each namespace operates independently with its own:
|
||||
|
||||
- Process list
|
||||
- Dump file
|
||||
- Logs (when using namespace filter)
|
||||
|
||||
### 3. Simplified Commands
|
||||
|
||||
Before (with filtering logic):
|
||||
|
||||
```javascript
|
||||
// Complex inline JavaScript filtering
|
||||
const list = JSON.parse(execSync('pm2 jlist').toString());
|
||||
const prodProcesses = list.filter((p) =>
|
||||
['flyer-crawler-api', 'flyer-crawler-worker', 'flyer-crawler-analytics-worker'].includes(p.name),
|
||||
);
|
||||
prodProcesses.forEach((p) => execSync(`pm2 delete ${p.pm_id}`));
|
||||
```
|
||||
|
||||
After (simple namespace flag):
|
||||
|
||||
```bash
|
||||
pm2 delete all --namespace flyer-crawler-prod
|
||||
```
|
||||
|
||||
### 4. Clear Organization
|
||||
|
||||
```bash
|
||||
# View only production processes
|
||||
pm2 list --namespace flyer-crawler-prod
|
||||
|
||||
# View only test processes
|
||||
pm2 list --namespace flyer-crawler-test
|
||||
|
||||
# No more confusion about which process belongs to which environment
|
||||
```
|
||||
|
||||
### 5. Defense in Depth
|
||||
|
||||
The ADR-061 safeguards (name-based filtering, process count validation, logging) remain active as an additional protection layer, providing defense in depth.
|
||||
|
||||
---
|
||||
|
||||
## Usage Examples
|
||||
|
||||
### Starting Processes
|
||||
|
||||
```bash
|
||||
# Production
|
||||
cd /var/www/flyer-crawler.projectium.com
|
||||
pm2 start ecosystem.config.cjs --namespace flyer-crawler-prod
|
||||
pm2 save --namespace flyer-crawler-prod
|
||||
|
||||
# Test
|
||||
cd /var/www/flyer-crawler-test.projectium.com
|
||||
pm2 start ecosystem-test.config.cjs --namespace flyer-crawler-test
|
||||
pm2 save --namespace flyer-crawler-test
|
||||
```
|
||||
|
||||
### Restarting Processes
|
||||
|
||||
```bash
|
||||
# Production
|
||||
pm2 restart all --namespace flyer-crawler-prod
|
||||
|
||||
# Test
|
||||
pm2 restart all --namespace flyer-crawler-test
|
||||
|
||||
# Specific process
|
||||
pm2 restart flyer-crawler-api --namespace flyer-crawler-prod
|
||||
```
|
||||
|
||||
### Viewing Status
|
||||
|
||||
```bash
|
||||
# Production only
|
||||
pm2 list --namespace flyer-crawler-prod
|
||||
|
||||
# Test only
|
||||
pm2 list --namespace flyer-crawler-test
|
||||
|
||||
# JSON output for scripting
|
||||
pm2 jlist --namespace flyer-crawler-prod
|
||||
```
|
||||
|
||||
### Viewing Logs
|
||||
|
||||
```bash
|
||||
# All production logs
|
||||
pm2 logs --namespace flyer-crawler-prod
|
||||
|
||||
# Specific process logs
|
||||
pm2 logs flyer-crawler-api --namespace flyer-crawler-prod --lines 100
|
||||
```
|
||||
|
||||
### Stopping and Deleting
|
||||
|
||||
```bash
|
||||
# Stop all production (safe - only affects production namespace)
|
||||
pm2 stop all --namespace flyer-crawler-prod
|
||||
|
||||
# Delete all test (safe - only affects test namespace)
|
||||
pm2 delete all --namespace flyer-crawler-test
|
||||
```
|
||||
|
||||
### Saving State
|
||||
|
||||
```bash
|
||||
# IMPORTANT: Always use namespace when saving
|
||||
pm2 save --namespace flyer-crawler-prod
|
||||
pm2 save --namespace flyer-crawler-test
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Migration Instructions
|
||||
|
||||
For servers not yet using namespaces, run the migration script:
|
||||
|
||||
### Dry Run (Preview Changes)
|
||||
|
||||
```bash
|
||||
cd /var/www/flyer-crawler.projectium.com
|
||||
./scripts/migrate-pm2-namespaces.sh --dry-run
|
||||
```
|
||||
|
||||
### Test Environment Only
|
||||
|
||||
```bash
|
||||
./scripts/migrate-pm2-namespaces.sh --test-only
|
||||
```
|
||||
|
||||
### Production Environment Only
|
||||
|
||||
```bash
|
||||
./scripts/migrate-pm2-namespaces.sh --prod-only
|
||||
```
|
||||
|
||||
### Both Environments
|
||||
|
||||
```bash
|
||||
./scripts/migrate-pm2-namespaces.sh
|
||||
```
|
||||
|
||||
### Post-Migration Verification
|
||||
|
||||
```bash
|
||||
# Verify namespace isolation
|
||||
pm2 list --namespace flyer-crawler-prod
|
||||
pm2 list --namespace flyer-crawler-test
|
||||
|
||||
# Verify dump files exist
|
||||
ls -la ~/.pm2/dump-flyer-crawler-*.pm2
|
||||
|
||||
# Verify no orphaned processes
|
||||
pm2 list # Should show processes organized by namespace
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Related Documentation
|
||||
|
||||
| Document | Purpose |
|
||||
| -------------------------------------------------------------------------------------------- | ---------------------------------------------- |
|
||||
| [ADR-063: PM2 Namespace Implementation](../adr/0063-pm2-namespace-implementation.md) | Architecture decision record |
|
||||
| [ADR-061: PM2 Process Isolation Safeguards](../adr/0061-pm2-process-isolation-safeguards.md) | Prior safeguards (still active) |
|
||||
| [CLAUDE.md](../../CLAUDE.md) | PM2 Namespace Isolation section (lines 52-169) |
|
||||
| [PM2 Incident Response Runbook](./PM2-INCIDENT-RESPONSE.md) | Emergency procedures |
|
||||
| [Incident Report 2026-02-17](./INCIDENT-2026-02-17-PM2-PROCESS-KILL.md) | Root cause analysis |
|
||||
|
||||
---
|
||||
|
||||
## Recommendations for Team
|
||||
|
||||
1. **Always Include Namespace**: Every PM2 command should include `--namespace <namespace>`. Without it, the command may affect unintended processes or use the wrong dump file.
|
||||
|
||||
2. **Use CI/CD Workflows**: Prefer using the Gitea workflows (`restart-pm2.yml`, `deploy-to-*.yml`) over manual SSH commands when possible. The workflows have been validated to use correct namespaces.
|
||||
|
||||
3. **Run Tests Before Deployment**: The test suite validates all PM2 commands have proper namespace flags. Run `npm test` to catch any regressions.
|
||||
|
||||
4. **Monitor After Migration**: After running the migration script, monitor PM2 status and application health for 15-30 minutes to ensure stability.
|
||||
|
||||
5. **Review Logs by Namespace**: When debugging, always filter logs by namespace to avoid confusion between environments.
|
||||
|
||||
---
|
||||
|
||||
## Appendix: Command Quick Reference
|
||||
|
||||
| Action | Production | Test |
|
||||
| ----------- | --------------------------------------------------------------- | -------------------------------------------------------------------- |
|
||||
| Start | `pm2 start ecosystem.config.cjs --namespace flyer-crawler-prod` | `pm2 start ecosystem-test.config.cjs --namespace flyer-crawler-test` |
|
||||
| Stop all | `pm2 stop all --namespace flyer-crawler-prod` | `pm2 stop all --namespace flyer-crawler-test` |
|
||||
| Restart all | `pm2 restart all --namespace flyer-crawler-prod` | `pm2 restart all --namespace flyer-crawler-test` |
|
||||
| Delete all | `pm2 delete all --namespace flyer-crawler-prod` | `pm2 delete all --namespace flyer-crawler-test` |
|
||||
| List | `pm2 list --namespace flyer-crawler-prod` | `pm2 list --namespace flyer-crawler-test` |
|
||||
| Logs | `pm2 logs --namespace flyer-crawler-prod` | `pm2 logs --namespace flyer-crawler-test` |
|
||||
| Save | `pm2 save --namespace flyer-crawler-prod` | `pm2 save --namespace flyer-crawler-test` |
|
||||
| Describe | `pm2 describe flyer-crawler-api --namespace flyer-crawler-prod` | `pm2 describe flyer-crawler-api-test --namespace flyer-crawler-test` |
|
||||
|
||||
---
|
||||
|
||||
**Report Generated:** 2026-02-18
|
||||
**Author:** Lead Technical Archivist (Claude Code)
|
||||
@@ -102,6 +102,7 @@ const sharedEnv = {
|
||||
};
|
||||
|
||||
module.exports = {
|
||||
namespace: 'flyer-crawler-test',
|
||||
apps: [
|
||||
// =========================================================================
|
||||
// TEST APPS
|
||||
|
||||
@@ -57,6 +57,7 @@ const sharedEnv = {
|
||||
};
|
||||
|
||||
module.exports = {
|
||||
namespace: 'flyer-crawler-prod',
|
||||
apps: [
|
||||
// =========================================================================
|
||||
// PRODUCTION APPS
|
||||
@@ -68,8 +69,8 @@ module.exports = {
|
||||
args: 'server.ts',
|
||||
cwd: '/var/www/flyer-crawler.projectium.com',
|
||||
max_memory_restart: '500M',
|
||||
instances: 'max',
|
||||
exec_mode: 'cluster',
|
||||
instances: 1,
|
||||
exec_mode: 'fork',
|
||||
kill_timeout: 5000,
|
||||
log_date_format: 'YYYY-MM-DD HH:mm:ss Z',
|
||||
max_restarts: 40,
|
||||
|
||||
@@ -69,6 +69,7 @@ const sharedEnv = {
|
||||
};
|
||||
|
||||
module.exports = {
|
||||
namespace: 'flyer-crawler-dev',
|
||||
apps: [
|
||||
// =========================================================================
|
||||
// API SERVER (Development)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user