debug: add PM2 crash debugging tools
Some checks failed
Deploy to Test Environment / deploy-to-test (push) Has been cancelled

This commit is contained in:
2026-02-18 09:43:11 -08:00
parent cf2cc5b832
commit cd8ee92813
4 changed files with 605 additions and 6 deletions

View File

@@ -503,25 +503,52 @@ jobs:
- name: Deploy Application to Test Server
  run: |
    # Abort on the first failing command, and let a failing rsync fail the
    # step even though its output is piped through `tail`: without pipefail a
    # pipeline reports only the exit status of its last command, so a broken
    # sync would still be followed by "... deployed".
    set -eo pipefail
    set -x # Enable command tracing for debugging
    echo "========================================="
    echo "DEPLOYING TO TEST SERVER"
    echo "========================================="
    echo "Timestamp: $(date)"
    echo "Deploying application files to /var/www/flyer-crawler-test.projectium.com..."
    APP_PATH="/var/www/flyer-crawler-test.projectium.com"

    # CRITICAL: Stop PM2 processes BEFORE deploying files to prevent CWD errors
    echo ""
    echo "--- Step 1: Stopping test PM2 processes ---"
    echo "Current PM2 state:"
    pm2 list || echo "PM2 list failed"
    echo "Stopping flyer-crawler test processes..."
    # A failing stop is tolerated (the fallback message is printed) so that a
    # first deployment with no registered processes still proceeds.
    pm2 stop flyer-crawler-api-test flyer-crawler-worker-test flyer-crawler-analytics-worker-test 2>&1 || echo "No test processes to stop (exit code: $?)"
    echo "PM2 state after stop:"
    pm2 list || echo "PM2 list failed"

    echo ""
    echo "--- Step 2: Creating directories ---"
    # Ensure the destination directory exists
    mkdir -p "$APP_PATH"
    # Ensure all required subdirectories exist
    mkdir -p "$APP_PATH/flyer-images/icons" "$APP_PATH/flyer-images/archive"
    echo "Directories created/verified"

    echo ""
    echo "--- Step 3: Deploying backend files ---"
    # 1. Copy the backend source code and project files first.
    # CRITICAL: We exclude 'node_modules', '.git', and 'dist'.
    # 'flyer-images' is excluded as well, so --delete never touches it.
    # Only the last 20 lines of rsync output are kept in the job log.
    rsync -avz --delete --exclude 'node_modules' --exclude '.git' --exclude 'dist' --exclude 'flyer-images' ./ "$APP_PATH/" 2>&1 | tail -20
    echo "Backend files deployed"

    echo ""
    echo "--- Step 4: Deploying frontend assets ---"
    # 2. Copy the built frontend assets into the same directory.
    # This will correctly place index.html and the assets/ folder in the webroot.
    rsync -avz dist/ "$APP_PATH" 2>&1 | tail -10
    echo "Frontend assets deployed"

    echo ""
    echo "========================================="
    echo "APPLICATION DEPLOYMENT COMPLETE"
    echo "========================================="
    set +x # Disable command tracing
- name: Deploy Coverage Report to Public URL - name: Deploy Coverage Report to Public URL
if: always() if: always()

View File

@@ -0,0 +1,188 @@
# .gitea/workflows/pm2-diagnostics.yml
#
# Comprehensive PM2 diagnostics to identify crash causes and problematic projects
#
# Runs only when started manually (workflow_dispatch). The steps below inspect
# PM2 through `pm2 list` / `pm2 jlist` / `pm2 info` and read files under
# /home/gitea-runner/.pm2 and /var/www; none of them starts, stops or deletes
# a PM2 process.
name: PM2 Diagnostics
on:
workflow_dispatch:
inputs:
# Both inputs arrive as strings and are read by the
# "Monitor PM2 State Over Time" step via gitea.event.inputs.*.
capture_interval:
description: 'Seconds between PM2 state captures (default: 5)'
required: false
default: '5'
duration:
description: 'Total monitoring duration in seconds (default: 60)'
required: false
default: '60'
jobs:
pm2-diagnostics:
# NOTE(review): presumably the label of the self-hosted runner on the
# application server, since the steps read local paths — confirm.
runs-on: projectium.com
steps:
- name: PM2 Current State Snapshot
  run: |
    # Prints the process table, the full JSON state, pm2-logrotate details and
    # the PM2 / Node versions. The JSON state is also left on disk at
    # /tmp/pm2-state-initial.json.
    # NOTE(review): the JSON state presumably includes each process's
    # environment variables — confirm it is acceptable to print it in the log.
    RULE="========================================="
    SNAPSHOT_FILE=/tmp/pm2-state-initial.json

    echo "$RULE"
    echo "PM2 CURRENT STATE SNAPSHOT"
    echo "$RULE"

    printf '\n%s\n' "--- PM2 List (Human Readable) ---"
    pm2 list

    printf '\n%s\n' "--- PM2 List (JSON) ---"
    pm2 jlist > "$SNAPSHOT_FILE"
    jq '.' "$SNAPSHOT_FILE"

    printf '\n%s\n' "--- PM2 Daemon Info ---"
    pm2 info pm2-logrotate || echo "pm2-logrotate not found"

    printf '\n%s\n' "--- PM2 Version ---"
    pm2 --version

    printf '\n%s\n' "--- Node Version ---"
    node --version
- name: PM2 Process Working Directories
  run: |
    # Lists every PM2 process with its recorded working directory, then
    # checks on disk whether each of those directories still exists.
    echo "========================================="
    echo "PROCESS WORKING DIRECTORIES"
    echo "========================================="
    # One snapshot feeds both reports so they describe the same processes.
    PM2_JSON=$(pm2 jlist)
    # The listing used to end with a constant "Exists: checking..." field that
    # never reflected the real state; existence is reported by the loop below.
    printf '%s\n' "$PM2_JSON" | jq -r '.[] | "Process: \(.name) | CWD: \(.pm2_env.pm_cwd // "N/A")"'
    echo ""
    echo "--- Checking if CWDs still exist ---"
    TAB=$(printf '\t')
    # Processes without a recorded CWD are skipped; otherwise jq would print
    # the literal "null" and it would be reported as a missing directory.
    # The name comes first so a CWD containing unusual characters stays intact
    # in the last field; `read -r` keeps backslashes literal.
    printf '%s\n' "$PM2_JSON" \
      | jq -r '.[] | select(.pm2_env.pm_cwd != null) | "\(.name)\t\(.pm2_env.pm_cwd)"' \
      | while IFS="$TAB" read -r name cwd; do
          if [ -d "$cwd" ]; then
            echo "✅ EXISTS: $cwd ($name)"
          else
            echo "❌ MISSING: $cwd ($name) (THIS WILL CAUSE CRASHES!)"
          fi
        done
- name: PM2 Log Analysis
  run: |
    # Shows the tail of the PM2 daemon log and searches it for missing
    # working-directory errors and generic crash keywords.
    PM2_LOG="/home/gitea-runner/.pm2/pm2.log"
    echo "========================================="
    echo "PM2 LOG ANALYSIS"
    echo "========================================="
    echo ""
    # Fail with an explicit message instead of a bare `tail` error, matching
    # the check in scripts/analyze-pm2-crashes.sh.
    if [ ! -f "$PM2_LOG" ]; then
      echo "❌ PM2 log file not found at: $PM2_LOG"
      exit 1
    fi
    echo "--- PM2 Daemon Log (Last 100 Lines) ---"
    tail -n 100 "$PM2_LOG"
    echo ""
    echo "--- Searching for ENOENT errors ---"
    grep -i "ENOENT\|no such file or directory\|uv_cwd" "$PM2_LOG" || echo "No ENOENT errors found"
    echo ""
    echo "--- Searching for crash patterns ---"
    # `grep ... | tail || echo` reaches the fallback only when the shell runs
    # with pipefail. Capture the result first so the message is printed
    # regardless of shell options; `|| true` keeps a no-match grep (exit 1)
    # from aborting the step under errexit + pipefail.
    CRASH_LINES=$(grep -i "crash\|error\|exception" "$PM2_LOG" | tail -n 50) || true
    if [ -n "$CRASH_LINES" ]; then
      printf '%s\n' "$CRASH_LINES"
    else
      echo "No crashes found"
    fi
- name: Identify All PM2-Managed Projects
  run: |
    # Summarises every PM2 process (id, name, version, status, CWD), lists the
    # distinct working directories, then looks in each directory under
    # /var/www for ecosystem files, package.json or node_modules.
    RULE="========================================="
    echo "$RULE"
    echo "ALL PM2-MANAGED PROJECTS"
    echo "$RULE"
    pm2 jlist | jq -r '.[] | "[\(.pm_id)] \(.name) - v\(.pm2_env.version // "N/A") - \(.pm2_env.status) - CWD: \(.pm2_env.pm_cwd)"'

    printf '\n%s\n' "--- Projects by CWD ---"
    pm2 jlist | jq -r '.[].pm2_env.pm_cwd' | sort -u

    printf '\n%s\n' "--- Checking which projects might interfere ---"
    for dir in /var/www/*; do
      # Skip plain files and an unmatched glob.
      [ -d "$dir" ] || continue
      printf '\nDirectory: %s\n' "$dir"
      ls -la "$dir" | grep -E "ecosystem|package.json|node_modules" || echo " No PM2/Node files"
    done
- name: Monitor PM2 State Over Time
  env:
    # The dispatch inputs are handed over through the environment instead of
    # being pasted into the script text, so their contents are never parsed
    # as shell code.
    CAPTURE_INTERVAL: ${{ gitea.event.inputs.capture_interval }}
    MONITOR_DURATION: ${{ gitea.event.inputs.duration }}
  run: |
    # Polls PM2 every INTERVAL seconds for about DURATION seconds and prints
    # each process's status and restart count, flagging processes that are
    # "errored" or "stopped".

    # Empty inputs fall back to the defaults documented on the inputs.
    INTERVAL="${CAPTURE_INTERVAL:-5}"
    DURATION="${MONITOR_DURATION:-60}"
    # Accept only positive decimal integers without leading zeros. This rules
    # out a division by zero and values that $(( )) would read as octal.
    for value in "$INTERVAL" "$DURATION"; do
      case "$value" in
        '' | 0* | *[!0-9]*)
          echo "Invalid input '$value': capture_interval and duration must be positive integers"
          exit 1
          ;;
      esac
    done

    echo "========================================="
    echo "PM2 STATE MONITORING"
    echo "========================================="
    echo "Monitoring PM2 for $DURATION seconds..."
    echo "Capturing state every $INTERVAL seconds"
    echo ""

    COUNT=$((DURATION / INTERVAL))
    # Capture at least once even when the interval exceeds the duration.
    if [ "$COUNT" -lt 1 ]; then
      COUNT=1
    fi

    i=1
    while [ "$i" -le "$COUNT" ]; do
      echo "--- Capture $i at $(date) ---"
      # One snapshot per capture: the listing, the count and the details all
      # describe the same instant.
      SNAPSHOT=$(pm2 jlist)
      printf '%s\n' "$SNAPSHOT" | jq -r '.[] | "\(.name): \(.pm2_env.status) (restarts: \(.pm2_env.restart_time))"'
      # Check for new crashes
      CRASHED=$(printf '%s\n' "$SNAPSHOT" | jq '[.[] | select(.pm2_env.status == "errored" or .pm2_env.status == "stopped")] | length')
      if [ "$CRASHED" -gt 0 ]; then
        echo "⚠️ WARNING: $CRASHED process(es) in crashed state!"
        printf '%s\n' "$SNAPSHOT" | jq -r '.[] | select(.pm2_env.status == "errored" or .pm2_env.status == "stopped") | " - \(.name): \(.pm2_env.status)"'
      fi
      sleep "$INTERVAL"
      i=$((i + 1))
    done
- name: PM2 Dump File Analysis
  run: |
    # Shows the saved PM2 process list (the file `pm2 resurrect` restores
    # from) and the name and working directory of each saved process.
    DUMP_FILE="/home/gitea-runner/.pm2/dump.pm2"
    echo "========================================="
    echo "PM2 DUMP FILE ANALYSIS"
    echo "========================================="
    echo "--- Dump file location ---"
    ls -lh "$DUMP_FILE"
    echo ""
    echo "--- Dump file contents ---"
    # NOTE(review): the dump presumably contains each process's environment
    # variables — confirm it is acceptable to print it in the job log.
    jq '.' "$DUMP_FILE"
    echo ""
    echo "--- Processes in dump ---"
    # NOTE(review): `pm2 save` appears to write a top-level JSON array, on
    # which the previous `.apps[]` filter fails. Accept both layouts: a bare
    # array of processes, or an object holding them under "apps".
    jq -r '(if type == "array" then . else (.apps // []) end) | .[] | "\(.name) at \(.pm_cwd)"' "$DUMP_FILE"
- name: Check for Rogue Deployment Scripts
  run: |
    # For each known project/environment directory under /var/www, prints up
    # to five lines of its deploy workflows that contain `rsync --delete` or
    # `rm -rf`.
    echo "========================================="
    echo "DEPLOYMENT SCRIPT ANALYSIS"
    echo "========================================="
    echo "Checking for scripts that might delete directories..."
    echo ""
    for project in flyer-crawler stock-alert; do
      for env in "" "-test"; do
        DIR="/var/www/$project$env.projectium.com"
        [ -d "$DIR" ] || continue
        echo "--- Project: $project$env ---"
        echo "Location: $DIR"
        for workflow in deploy-to-test deploy-to-prod; do
          FILE="$DIR/.gitea/workflows/$workflow.yml"
          [ -f "$FILE" ] || continue
          echo "Has $workflow workflow"
          # -m 5 limits the matches inside grep itself, so the fallback
          # depends on grep's own exit status rather than on `head`'s (which
          # is what `grep | head -5 || echo` tests when pipefail is off).
          grep -n -m 5 "rsync.*--delete\|rm -rf" "$FILE" || echo "No dangerous commands found"
        done
        echo ""
      done
    done
- name: Generate Diagnostic Report
  run: |
    # Prints process counts per status, the processes grouped by project
    # name, and a fixed list of follow-up recommendations.

    # Query PM2 once: all counts and listings below come from the same
    # snapshot, so they cannot disagree if a process changes state while the
    # report is being printed.
    PM2_JSON=$(pm2 jlist)
    # Runs jq with the given arguments against the snapshot.
    pm2_query() {
      printf '%s\n' "$PM2_JSON" | jq "$@"
    }

    echo "========================================="
    echo "DIAGNOSTIC SUMMARY"
    echo "========================================="
    echo ""
    echo "Total PM2 processes: $(pm2_query 'length')"
    echo "Online: $(pm2_query '[.[] | select(.pm2_env.status == "online")] | length')"
    echo "Stopped: $(pm2_query '[.[] | select(.pm2_env.status == "stopped")] | length')"
    echo "Errored: $(pm2_query '[.[] | select(.pm2_env.status == "errored")] | length')"
    echo ""
    echo "Flyer-crawler processes:"
    pm2_query -r '.[] | select(.name | contains("flyer-crawler")) | " \(.name): \(.pm2_env.status)"'
    echo ""
    echo "Stock-alert processes:"
    pm2_query -r '.[] | select(.name | contains("stock-alert")) | " \(.name): \(.pm2_env.status)"'
    echo ""
    echo "Other processes:"
    pm2_query -r '.[] | select(.name | contains("flyer-crawler") | not) | select(.name | contains("stock-alert") | not) | " \(.name): \(.pm2_env.status)"'
    echo ""
    echo "========================================="
    echo "RECOMMENDATIONS"
    echo "========================================="
    echo "1. Check for missing CWDs (marked with ❌ above)"
    echo "2. Review PM2 daemon log for ENOENT errors"
    echo "3. Verify no deployments are running rsync --delete while processes are online"
    echo "4. Consider separating PM2 daemons by user or using PM2 namespaces"

View File

@@ -0,0 +1,278 @@
# PM2 Crash Debugging Guide
## Overview
This guide helps diagnose PM2 daemon crashes and identify which project is causing the issue.
## Common Symptoms
1. **PM2 processes disappear** between deployments
2. **`ENOENT: no such file or directory, uv_cwd`** errors in PM2 logs
3. **Processes require `pm2 resurrect`** after deployments
4. **PM2 daemon restarts** unexpectedly
## Root Cause
PM2 processes crash when their working directory (CWD) is deleted or modified while they're running. This typically happens when:
1. **rsync --delete** removes/recreates directories while processes are active
2. **npm install** modifies node_modules while processes are using them
3. **Deployments** don't stop processes before file operations
## Debugging Tools
### 1. PM2 Diagnostics Workflow
Run the comprehensive diagnostics workflow:
```bash
# In Gitea Actions UI:
# 1. Go to Actions → "PM2 Diagnostics"
# 2. Click "Run workflow"
# 3. Choose the monitoring duration (default: 60s) and the capture interval (default: 5s)
```
This workflow captures:
- Current PM2 state
- Working directory validation
- PM2 daemon logs
- All PM2-managed projects
- Crash patterns
- Deployment script analysis
### 2. PM2 Crash Analysis Script
Run the crash analysis script on the server:
```bash
# SSH to server
ssh gitea-runner@projectium.com
# Run analysis
cd /var/www/flyer-crawler.projectium.com
bash scripts/analyze-pm2-crashes.sh
# Or save to file
bash scripts/analyze-pm2-crashes.sh > pm2-crash-report.txt
```
### 3. Manual PM2 Inspection
Quick manual checks:
```bash
# Current PM2 state
pm2 list
# Detailed JSON state
pm2 jlist | jq '.'
# Check for missing CWDs
pm2 jlist | jq -r '.[] | "\(.name): \(.pm2_env.pm_cwd)"' | while read line; do
PROC=$(echo "$line" | cut -d: -f1)
CWD=$(echo "$line" | cut -d: -f2- | xargs)
[ -d "$CWD" ] && echo "$PROC" || echo "$PROC (CWD missing: $CWD)"
done
# View PM2 daemon log
tail -100 ~/.pm2/pm2.log
# Search for ENOENT errors
grep -i "ENOENT\|uv_cwd" ~/.pm2/pm2.log
```
## Identifying the Problematic Project
### Check Which Projects Share PM2 Daemon
```bash
pm2 list
# Group by project
pm2 jlist | jq -r '.[] | .name' | grep -oE "^[a-z-]+" | sort -u
```
**Projects on projectium.com:**
- `flyer-crawler` (production, test)
- `stock-alert` (production, test)
- Others? Run `pm2 list` to confirm whether any other project shares this PM2 daemon.
### Check Deployment Timing
1. Review PM2 daemon restart times:
```bash
grep "New PM2 Daemon started" ~/.pm2/pm2.log
```
2. Compare with deployment times in Gitea Actions
3. Identify which deployment triggered the crash
### Check Deployment Scripts
For each project, check if deployment stops PM2 before rsync:
```bash
# Flyer-crawler
cat /var/www/flyer-crawler.projectium.com/.gitea/workflows/deploy-to-prod.yml | grep -B5 -A5 "rsync.*--delete"
# Stock-alert
cat /var/www/stock-alert.projectium.com/.gitea/workflows/deploy-to-prod.yml | grep -B5 -A5 "rsync.*--delete"
```
**Look for:**
- ❌ `rsync --delete` **before** `pm2 stop`
- ✅ `pm2 stop` **before** `rsync --delete`
## Common Culprits
### 1. Flyer-Crawler Deployments
**Before Fix:**
```yaml
# ❌ BAD - Deploys files while processes running
- name: Deploy Application
run: |
rsync --delete ./ /var/www/...
pm2 restart ...
```
**After Fix:**
```yaml
# ✅ GOOD - Stops processes first
- name: Deploy Application
run: |
pm2 stop flyer-crawler-api flyer-crawler-worker
rsync --delete ./ /var/www/...
pm2 startOrReload ...
```
### 2. Stock-Alert Deployments
Check if stock-alert follows the same pattern. If it deploys without stopping PM2, it could crash the shared PM2 daemon.
### 3. Cross-Project Interference
If multiple projects share PM2:
- One project's deployment can crash another project's processes
- The crashed project's processes lose their CWD
- PM2 daemon may restart, clearing all processes
## Solutions
### Immediate Fix (Manual)
```bash
# Restore processes from dump file
pm2 resurrect
# Verify all processes are running
pm2 list
```
### Permanent Fix
1. **Update deployment workflows** to stop PM2 before file operations
2. **Isolate PM2 daemons** by user or namespace
3. **Monitor deployments** to ensure proper sequencing
## Deployment Workflow Template
**Correct sequence:**
```yaml
- name: Deploy Application
run: |
# 1. STOP PROCESSES FIRST
pm2 stop my-api my-worker
# 2. THEN deploy files
rsync -avz --delete ./ /var/www/my-app/
# 3. Install dependencies (safe, no processes running)
cd /var/www/my-app
npm install --omit=dev
# 4. Clean up errored processes
pm2 delete my-api my-worker || true
# 5. START processes
pm2 startOrReload ecosystem.config.cjs
pm2 save
```
## Monitoring & Prevention
### Enable Verbose Logging
Enhanced deployment logging (already implemented in flyer-crawler):
```yaml
- name: Deploy Application
run: |
set -x # Command tracing
echo "Step 1: Stopping PM2..."
pm2 stop ...
pm2 list # Verify stopped
echo "Step 2: Deploying files..."
rsync --delete ...
echo "Step 3: Starting PM2..."
pm2 start ...
pm2 list # Verify started
```
### Regular Health Checks
```bash
# Add to cron or monitoring system
*/5 * * * * pm2 jlist | jq -r '.[] | select(.pm2_env.status != "online") | "ALERT: \(.name) is \(.pm2_env.status)"'
```
## Troubleshooting Decision Tree
```
PM2 processes missing?
├─ YES → Run `pm2 resurrect`
│ └─ Check PM2 daemon log for ENOENT errors
│ ├─ ENOENT found → Working directory deleted during deployment
│ │ └─ Fix: Add `pm2 stop` before rsync
│ └─ No ENOENT → Check other error patterns
└─ NO → Processes running but unstable?
└─ Check restart counts: `pm2 jlist | jq '.[].pm2_env.restart_time'`
└─ High restarts → Application-level issue (not PM2 crash)
```
## Related Documentation
- [PM2 Process Isolation Requirements](../../CLAUDE.md#pm2-process-isolation-productiontest-servers)
- [PM2 Incident Response Runbook](./PM2-INCIDENT-RESPONSE.md)
- [Incident Report 2026-02-17](./INCIDENT-2026-02-17-PM2-PROCESS-KILL.md)
## Quick Reference Commands
```bash
# Diagnose
pm2 list # Current state
pm2 jlist | jq '.' # Detailed JSON
tail -100 ~/.pm2/pm2.log # Recent logs
grep ENOENT ~/.pm2/pm2.log # Find crashes
# Fix
pm2 resurrect # Restore from dump
pm2 restart all # Restart everything (affects every project on the shared PM2 daemon)
pm2 save # Save current state
# Analyze
bash scripts/analyze-pm2-crashes.sh # Run analysis script
pm2 jlist | jq -r '.[].pm2_env.pm_cwd' # Check working dirs
```

View File

@@ -0,0 +1,106 @@
#!/bin/bash
# scripts/analyze-pm2-crashes.sh
#
# Analyzes PM2 logs to identify crash patterns and problematic projects.
#
# Usage:
#   bash scripts/analyze-pm2-crashes.sh
#   bash scripts/analyze-pm2-crashes.sh > pm2-crash-report.txt
#
# Environment:
#   PM2_LOG  Log file to analyze (default: /home/gitea-runner/.pm2/pm2.log).
#
# Requires pm2 and jq on PATH. Exits 1 when the log file does not exist.

# Only -e is enabled. pipefail is deliberately left off: grep exits 1 when it
# finds nothing, which is a normal outcome for every search below.
set -e

PM2_LOG="${PM2_LOG:-/home/gitea-runner/.pm2/pm2.log}"

# Prints a three-line section banner with title $1.
section() {
  echo "========================================="
  echo "$1"
  echo "========================================="
}

# Copies stdin to stdout, or prints the fallback message $1 when stdin is
# empty. Replaces the `grep ... | tail -N || echo "..."` idiom, whose fallback
# never ran: without pipefail the pipeline's status is tail's, which is 0 even
# when grep matched nothing.
or_fallback() {
  local fallback=$1
  local output
  output=$(cat)
  if [ -n "$output" ]; then
    printf '%s\n' "$output"
  else
    echo "$fallback"
  fi
}

# Prints how many processes in $pm2_json have a name containing $1.
# Prints 0 when the snapshot cannot be parsed.
count_processes() {
  local count
  count=$(jq --arg project "$1" \
    '[.[] | select(.name | contains($project))] | length' \
    <<<"$pm2_json" 2>/dev/null) || count=0
  echo "${count:-0}"
}

section "PM2 CRASH ANALYSIS TOOL"
echo ""

if [ ! -f "$PM2_LOG" ]; then
  echo "❌ PM2 log file not found at: $PM2_LOG"
  exit 1
fi

echo "Analyzing PM2 log file: $PM2_LOG"
echo "Log file size: $(du -h "$PM2_LOG" | cut -f1)"
echo "Last modified: $(stat -c %y "$PM2_LOG")"
echo ""

section "1. RECENT PM2 DAEMON RESTARTS"
grep -i "New PM2 Daemon started" "$PM2_LOG" \
  | tail -n 5 \
  | or_fallback "No daemon restarts found"
echo ""

section "2. ENOENT / CWD ERRORS"
grep -i "ENOENT\|uv_cwd\|no such file or directory" "$PM2_LOG" \
  | tail -n 20 \
  | or_fallback "No ENOENT errors found"
echo ""

section "3. PROCESS CRASH PATTERNS"
echo "Searching for app crash events..."
grep -i "App \[.*\] exited\|App \[.*\] errored\|App \[.*\] crashed" "$PM2_LOG" \
  | tail -n 20 \
  | or_fallback "No app crashes found"
echo ""

section "4. PROJECTS INVOLVED IN CRASHES"
echo "Extracting project names from crash logs..."
grep -i "ENOENT\|crash\|error" "$PM2_LOG" \
  | grep -oE "flyer-crawler[a-z-]*|stock-alert[a-z-]*" \
  | sort \
  | uniq -c \
  | sort -rn \
  | or_fallback "No project names found in crashes"
echo ""

section "5. TIMELINE OF RECENT ERRORS (Last 50)"
grep -E "^[0-9]{4}-[0-9]{2}-[0-9]{2}" "$PM2_LOG" \
  | grep -i "error\|crash\|ENOENT" \
  | tail -n 50 \
  | or_fallback "No timestamped errors found"
echo ""

section "6. CURRENT PM2 STATE"
pm2 list
echo ""

# One JSON snapshot serves sections 7 and 8. A failing `pm2 jlist` degrades to
# an empty process list instead of aborting the report.
pm2_json=$(pm2 jlist) || pm2_json='[]'

section "7. PROCESSES WITH MISSING CWD"
# Name and CWD are separated by a tab and read with `read -r`, so colons,
# quotes and backslashes in either field are kept as-is (the earlier
# `cut -d:` / `xargs` parsing mangled them).
jq -r '.[] | select(.pm2_env.pm_cwd) | "\(.name)\t\(.pm2_env.pm_cwd)"' <<<"$pm2_json" \
  | while IFS=$'\t' read -r proc_name cwd; do
      if [ ! -d "$cwd" ]; then
        echo "$proc_name - CWD missing: $cwd"
      else
        echo "$proc_name - CWD exists: $cwd"
      fi
    done
echo ""

section "8. RECOMMENDATIONS"
echo ""

# Count ENOENT errors.
# `grep -c` prints "0" AND exits 1 when nothing matches, so the former
# `|| echo "0"` produced "0<newline>0" and broke the numeric test below.
ENOENT_COUNT=$(grep -c "ENOENT\|uv_cwd" "$PM2_LOG" 2>/dev/null) || true
ENOENT_COUNT=${ENOENT_COUNT:-0}
if [ "$ENOENT_COUNT" -gt 0 ]; then
  echo "⚠️ Found $ENOENT_COUNT ENOENT/CWD errors in logs"
  echo " This indicates processes losing their working directory during deployment"
  echo " Solution: Ensure PM2 processes are stopped BEFORE rsync --delete operations"
  echo ""
fi

# Check for multiple projects
FLYER_PROCESSES=$(count_processes "flyer-crawler")
STOCK_PROCESSES=$(count_processes "stock-alert")
if [ "$FLYER_PROCESSES" -gt 0 ] && [ "$STOCK_PROCESSES" -gt 0 ]; then
  echo " Multiple projects detected:"
  echo " - Flyer-crawler: $FLYER_PROCESSES processes"
  echo " - Stock-alert: $STOCK_PROCESSES processes"
  echo " Recommendation: Ensure deployments don't interfere with each other"
  echo ""
fi

echo "✅ Analysis complete"
echo ""
echo "To save this report:"
echo " bash scripts/analyze-pm2-crashes.sh > pm2-crash-report.txt"