debug: add PM2 crash debugging tools
Some checks failed
Deploy to Test Environment / deploy-to-test (push) Has been cancelled
Some checks failed
Deploy to Test Environment / deploy-to-test (push) Has been cancelled
This commit is contained in:
@@ -503,25 +503,52 @@ jobs:
|
|||||||
|
|
||||||
- name: Deploy Application to Test Server
|
- name: Deploy Application to Test Server
|
||||||
run: |
|
run: |
|
||||||
|
# Fail the step when any stage of a pipeline fails. The rsync commands in this
# step pipe their output through `tail`; without pipefail the pipeline status
# is tail's, so a failed rsync would be hidden and the deployment would carry
# on as if the files had been copied. Harmless if the runner already sets it.
set -o pipefail
set -x # Enable command tracing for debugging
|
||||||
|
echo "========================================="
|
||||||
|
echo "DEPLOYING TO TEST SERVER"
|
||||||
|
echo "========================================="
|
||||||
|
echo "Timestamp: $(date)"
|
||||||
echo "Deploying application files to /var/www/flyer-crawler-test.projectium.com..."
|
echo "Deploying application files to /var/www/flyer-crawler-test.projectium.com..."
|
||||||
APP_PATH="/var/www/flyer-crawler-test.projectium.com"
|
APP_PATH="/var/www/flyer-crawler-test.projectium.com"
|
||||||
|
|
||||||
# CRITICAL: Stop PM2 processes BEFORE deploying files to prevent CWD errors
|
# CRITICAL: Stop PM2 processes BEFORE deploying files to prevent CWD errors
|
||||||
echo "--- Stopping test PM2 processes before file deployment ---"
|
echo ""
|
||||||
pm2 stop flyer-crawler-api-test flyer-crawler-worker-test flyer-crawler-analytics-worker-test || echo "No test processes to stop"
|
echo "--- Step 1: Stopping test PM2 processes ---"
|
||||||
|
echo "Current PM2 state:"
|
||||||
|
pm2 list || echo "PM2 list failed"
|
||||||
|
|
||||||
|
echo "Stopping flyer-crawler test processes..."
|
||||||
|
pm2 stop flyer-crawler-api-test flyer-crawler-worker-test flyer-crawler-analytics-worker-test 2>&1 || echo "No test processes to stop (exit code: $?)"
|
||||||
|
|
||||||
|
echo "PM2 state after stop:"
|
||||||
|
pm2 list || echo "PM2 list failed"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "--- Step 2: Creating directories ---"
|
||||||
# Ensure the destination directory exists
|
# Ensure the destination directory exists
|
||||||
mkdir -p "$APP_PATH"
|
mkdir -p "$APP_PATH"
|
||||||
mkdir -p "$APP_PATH/flyer-images/icons" "$APP_PATH/flyer-images/archive" # Ensure all required subdirectories exist
|
mkdir -p "$APP_PATH/flyer-images/icons" "$APP_PATH/flyer-images/archive"
|
||||||
|
echo "Directories created/verified"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "--- Step 3: Deploying backend files ---"
|
||||||
# 1. Copy the backend source code and project files first.
|
# 1. Copy the backend source code and project files first.
|
||||||
# CRITICAL: We exclude 'node_modules', '.git', and 'dist'.
|
# CRITICAL: We exclude 'node_modules', '.git', and 'dist'.
|
||||||
rsync -avz --delete --exclude 'node_modules' --exclude '.git' --exclude 'dist' --exclude 'flyer-images' ./ "$APP_PATH/"
|
rsync -avz --delete --exclude 'node_modules' --exclude '.git' --exclude 'dist' --exclude 'flyer-images' ./ "$APP_PATH/" 2>&1 | tail -20
|
||||||
|
echo "Backend files deployed"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "--- Step 4: Deploying frontend assets ---"
|
||||||
# 2. Copy the built frontend assets into the same directory.
|
# 2. Copy the built frontend assets into the same directory.
|
||||||
# This will correctly place index.html and the assets/ folder in the webroot.
|
# This will correctly place index.html and the assets/ folder in the webroot.
|
||||||
rsync -avz dist/ "$APP_PATH"
|
rsync -avz dist/ "$APP_PATH" 2>&1 | tail -10
|
||||||
echo "Application deployment complete."
|
echo "Frontend assets deployed"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "========================================="
|
||||||
|
echo "APPLICATION DEPLOYMENT COMPLETE"
|
||||||
|
echo "========================================="
|
||||||
|
set +x # Disable command tracing
|
||||||
|
|
||||||
- name: Deploy Coverage Report to Public URL
|
- name: Deploy Coverage Report to Public URL
|
||||||
if: always()
|
if: always()
|
||||||
|
|||||||
188
.gitea/workflows/pm2-diagnostics.yml
Normal file
188
.gitea/workflows/pm2-diagnostics.yml
Normal file
@@ -0,0 +1,188 @@
|
|||||||
|
# .gitea/workflows/pm2-diagnostics.yml
#
# Comprehensive PM2 diagnostics to identify crash causes and problematic projects.
#
# Manually triggered. Every step is read-only with respect to PM2: it lists
# processes, reads the daemon log and dump file, and inspects /var/www.
# Steps are written to be best-effort so that one missing file does not
# prevent the later diagnostics from running.
name: PM2 Diagnostics

on:
  workflow_dispatch:
    inputs:
      capture_interval:
        description: 'Seconds between PM2 state captures (default: 5)'
        required: false
        default: '5'
      duration:
        description: 'Total monitoring duration in seconds (default: 60)'
        required: false
        default: '60'

jobs:
  pm2-diagnostics:
    runs-on: projectium.com

    steps:
      - name: PM2 Current State Snapshot
        run: |
          echo "========================================="
          echo "PM2 CURRENT STATE SNAPSHOT"
          echo "========================================="
          echo ""
          echo "--- PM2 List (Human Readable) ---"
          pm2 list
          echo ""
          echo "--- PM2 List (JSON) ---"
          # The snapshot is kept on disk so it can be inspected after the run.
          pm2 jlist > /tmp/pm2-state-initial.json
          jq '.' /tmp/pm2-state-initial.json
          echo ""
          echo "--- PM2 Daemon Info ---"
          pm2 info pm2-logrotate || echo "pm2-logrotate not found"
          echo ""
          echo "--- PM2 Version ---"
          pm2 --version
          echo ""
          echo "--- Node Version ---"
          node --version

      - name: PM2 Process Working Directories
        run: |
          echo "========================================="
          echo "PROCESS WORKING DIRECTORIES"
          echo "========================================="
          pm2 jlist | jq -r '.[] | "Process: \(.name) | CWD: \(.pm2_env.pm_cwd)"'
          echo ""
          echo "--- Checking if CWDs still exist ---"
          # "// empty" skips processes without a recorded cwd instead of
          # reporting the literal string "null" as a missing directory.
          # read -r keeps backslashes in paths intact.
          pm2 jlist | jq -r '.[].pm2_env.pm_cwd // empty' | while IFS= read -r cwd; do
            if [ -d "$cwd" ]; then
              echo "✅ EXISTS: $cwd"
            else
              echo "❌ MISSING: $cwd (THIS WILL CAUSE CRASHES!)"
            fi
          done

      - name: PM2 Log Analysis
        run: |
          # Needed so "grep ... | tail ... || echo <fallback>" prints the
          # fallback when grep matches nothing (the pipeline status would
          # otherwise always be tail's).
          set -o pipefail
          PM2_LOG="/home/gitea-runner/.pm2/pm2.log"

          echo "========================================="
          echo "PM2 LOG ANALYSIS"
          echo "========================================="
          echo ""
          if [ ! -f "$PM2_LOG" ]; then
            # Do not fail the job: the remaining steps are still useful.
            echo "PM2 daemon log not found at: $PM2_LOG"
            exit 0
          fi
          echo "--- PM2 Daemon Log (Last 100 Lines) ---"
          tail -100 "$PM2_LOG"
          echo ""
          echo "--- Searching for ENOENT errors ---"
          grep -i "ENOENT\|no such file or directory\|uv_cwd" "$PM2_LOG" || echo "No ENOENT errors found"
          echo ""
          echo "--- Searching for crash patterns ---"
          grep -i "crash\|error\|exception" "$PM2_LOG" | tail -50 || echo "No crashes found"

      - name: Identify All PM2-Managed Projects
        run: |
          echo "========================================="
          echo "ALL PM2-MANAGED PROJECTS"
          echo "========================================="
          pm2 jlist | jq -r '.[] | "[\(.pm_id)] \(.name) - v\(.pm2_env.version // "N/A") - \(.pm2_env.status) - CWD: \(.pm2_env.pm_cwd)"'
          echo ""
          echo "--- Projects by CWD ---"
          pm2 jlist | jq -r '.[].pm2_env.pm_cwd' | sort -u
          echo ""
          echo "--- Checking which projects might interfere ---"
          for dir in /var/www/*; do
            if [ -d "$dir" ]; then
              echo ""
              echo "Directory: $dir"
              ls -la "$dir" | grep -E "ecosystem|package.json|node_modules" || echo "  No PM2/Node files"
            fi
          done

      - name: Monitor PM2 State Over Time
        env:
          # Workflow inputs are passed through the environment rather than
          # interpolated into the script text, so a crafted input value is
          # treated as data and cannot inject shell commands.
          CAPTURE_INTERVAL: ${{ gitea.event.inputs.capture_interval }}
          MONITOR_DURATION: ${{ gitea.event.inputs.duration }}
        run: |
          INTERVAL="${CAPTURE_INTERVAL:-5}"
          DURATION="${MONITOR_DURATION:-60}"

          # Accept only plain decimal digits.
          for value in "$INTERVAL" "$DURATION"; do
            case "$value" in
              ''|*[!0-9]*)
                echo "capture_interval and duration must be non-negative integers (got: '$value')"
                exit 1
                ;;
            esac
          done
          # Force base 10 so inputs such as "08" are not parsed as octal.
          INTERVAL=$((10#$INTERVAL))
          DURATION=$((10#$DURATION))
          if [ "$INTERVAL" -lt 1 ]; then
            # Also prevents a division by zero below.
            echo "capture_interval must be at least 1 second"
            exit 1
          fi

          echo "========================================="
          echo "PM2 STATE MONITORING"
          echo "========================================="
          echo "Monitoring PM2 for $DURATION seconds..."
          echo "Capturing state every $INTERVAL seconds"
          echo ""

          COUNT=$((DURATION / INTERVAL))

          for ((i = 1; i <= COUNT; i++)); do
            echo "--- Capture $i at $(date) ---"
            # Take one snapshot per capture so the listing and the crash
            # count below describe the same moment.
            STATE=$(pm2 jlist)
            printf '%s\n' "$STATE" | jq -r '.[] | "\(.name): \(.pm2_env.status) (restarts: \(.pm2_env.restart_time))"'

            # Check for new crashes. "stopped" is included on purpose: a
            # process that should be online but is stopped is also suspicious.
            CRASHED=$(printf '%s\n' "$STATE" | jq '[.[] | select(.pm2_env.status == "errored" or .pm2_env.status == "stopped")] | length')
            if [ "$CRASHED" -gt 0 ]; then
              echo "⚠️ WARNING: $CRASHED process(es) in crashed state!"
              printf '%s\n' "$STATE" | jq -r '.[] | select(.pm2_env.status == "errored" or .pm2_env.status == "stopped") | " - \(.name): \(.pm2_env.status)"'
            fi

            sleep "$INTERVAL"
          done

      - name: PM2 Dump File Analysis
        run: |
          PM2_DUMP="/home/gitea-runner/.pm2/dump.pm2"

          echo "========================================="
          echo "PM2 DUMP FILE ANALYSIS"
          echo "========================================="
          if [ ! -f "$PM2_DUMP" ]; then
            # No dump simply means "pm2 save" has not been run; keep going.
            echo "PM2 dump file not found at: $PM2_DUMP"
            exit 0
          fi
          echo "--- Dump file location ---"
          ls -lh "$PM2_DUMP"
          echo ""
          echo "--- Dump file contents ---"
          jq '.' "$PM2_DUMP"
          echo ""
          echo "--- Processes in dump ---"
          # NOTE(review): dump.pm2 appears to be a top-level JSON array of
          # process definitions; ".apps[]" fails on an array. Both shapes are
          # accepted here. Confirm against the PM2 version in use.
          jq -r '(if type == "array" then . else (.apps // []) end)[] | "\(.name) at \(.pm_cwd)"' "$PM2_DUMP"

      - name: Check for Rogue Deployment Scripts
        run: |
          echo "========================================="
          echo "DEPLOYMENT SCRIPT ANALYSIS"
          echo "========================================="
          echo "Checking for scripts that might delete directories..."
          echo ""
          for project in flyer-crawler stock-alert; do
            for env in "" "-test"; do
              DIR="/var/www/$project$env.projectium.com"
              if [ -d "$DIR" ]; then
                echo "--- Project: $project$env ---"
                echo "Location: $DIR"
                # grep -m 5 limits the matches itself, so grep's own exit
                # status decides whether the fallback message is printed
                # (a trailing "| head -5" would hide that status).
                if [ -f "$DIR/.gitea/workflows/deploy-to-test.yml" ]; then
                  echo "Has deploy-to-test workflow"
                  grep -n -m 5 "rsync.*--delete\|rm -rf" "$DIR/.gitea/workflows/deploy-to-test.yml" || echo "No dangerous commands found"
                fi
                if [ -f "$DIR/.gitea/workflows/deploy-to-prod.yml" ]; then
                  echo "Has deploy-to-prod workflow"
                  grep -n -m 5 "rsync.*--delete\|rm -rf" "$DIR/.gitea/workflows/deploy-to-prod.yml" || echo "No dangerous commands found"
                fi
                echo ""
              fi
            done
          done

      - name: Generate Diagnostic Report
        run: |
          # One snapshot for the whole report so the counts add up.
          STATE=$(pm2 jlist)
          count_status() {
            printf '%s\n' "$STATE" | jq --arg status "$1" '[.[] | select(.pm2_env.status == $status)] | length'
          }

          echo "========================================="
          echo "DIAGNOSTIC SUMMARY"
          echo "========================================="
          echo ""
          echo "Total PM2 processes: $(printf '%s\n' "$STATE" | jq 'length')"
          echo "Online: $(count_status online)"
          echo "Stopped: $(count_status stopped)"
          echo "Errored: $(count_status errored)"
          echo ""
          echo "Flyer-crawler processes:"
          printf '%s\n' "$STATE" | jq -r '.[] | select(.name | contains("flyer-crawler")) | " \(.name): \(.pm2_env.status)"'
          echo ""
          echo "Stock-alert processes:"
          printf '%s\n' "$STATE" | jq -r '.[] | select(.name | contains("stock-alert")) | " \(.name): \(.pm2_env.status)"'
          echo ""
          echo "Other processes:"
          printf '%s\n' "$STATE" | jq -r '.[] | select(.name | contains("flyer-crawler") | not) | select(.name | contains("stock-alert") | not) | " \(.name): \(.pm2_env.status)"'
          echo ""
          echo "========================================="
          echo "RECOMMENDATIONS"
          echo "========================================="
          echo "1. Check for missing CWDs (marked with ❌ above)"
          echo "2. Review PM2 daemon log for ENOENT errors"
          echo "3. Verify no deployments are running rsync --delete while processes are online"
          echo "4. Consider separating PM2 daemons by user or using PM2 namespaces"
|
||||||
278
docs/operations/PM2-CRASH-DEBUGGING.md
Normal file
278
docs/operations/PM2-CRASH-DEBUGGING.md
Normal file
@@ -0,0 +1,278 @@
|
|||||||
|
# PM2 Crash Debugging Guide
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
This guide helps diagnose PM2 daemon crashes and identify which project is causing the issue.
|
||||||
|
|
||||||
|
## Common Symptoms
|
||||||
|
|
||||||
|
1. **PM2 processes disappear** between deployments
|
||||||
|
2. **`ENOENT: no such file or directory, uv_cwd`** errors in PM2 logs
|
||||||
|
3. **Processes require `pm2 resurrect`** after deployments
|
||||||
|
4. **PM2 daemon restarts** unexpectedly
|
||||||
|
|
||||||
|
## Root Cause
|
||||||
|
|
||||||
|
PM2 processes crash when their working directory (CWD) is deleted or modified while they're running. This typically happens when:
|
||||||
|
|
||||||
|
1. **rsync --delete** removes/recreates directories while processes are active
|
||||||
|
2. **npm install** modifies node_modules while processes are using them
|
||||||
|
3. **Deployments** don't stop processes before file operations
|
||||||
|
|
||||||
|
## Debugging Tools
|
||||||
|
|
||||||
|
### 1. PM2 Diagnostics Workflow
|
||||||
|
|
||||||
|
Run the comprehensive diagnostics workflow:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# In Gitea Actions UI:
|
||||||
|
# 1. Go to Actions → "PM2 Diagnostics"
|
||||||
|
# 2. Click "Run workflow"
|
||||||
|
# 3. Choose monitoring duration (default: 60s) and capture interval (default: 5s)
|
||||||
|
```
|
||||||
|
|
||||||
|
This workflow captures:
|
||||||
|
|
||||||
|
- Current PM2 state
|
||||||
|
- Working directory validation
|
||||||
|
- PM2 daemon logs
|
||||||
|
- All PM2-managed projects
|
||||||
|
- Crash patterns
|
||||||
|
- Deployment script analysis
|
||||||
|
|
||||||
|
### 2. PM2 Crash Analysis Script
|
||||||
|
|
||||||
|
Run the crash analysis script on the server:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# SSH to server
|
||||||
|
ssh gitea-runner@projectium.com
|
||||||
|
|
||||||
|
# Run analysis
|
||||||
|
cd /var/www/flyer-crawler.projectium.com
|
||||||
|
bash scripts/analyze-pm2-crashes.sh
|
||||||
|
|
||||||
|
# Or save to file
|
||||||
|
bash scripts/analyze-pm2-crashes.sh > pm2-crash-report.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Manual PM2 Inspection
|
||||||
|
|
||||||
|
Quick manual checks:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Current PM2 state
|
||||||
|
pm2 list
|
||||||
|
|
||||||
|
# Detailed JSON state
|
||||||
|
pm2 jlist | jq '.'
|
||||||
|
|
||||||
|
# Check for missing CWDs
|
||||||
|
pm2 jlist | jq -r '.[] | "\(.name): \(.pm2_env.pm_cwd)"' | while read line; do
|
||||||
|
PROC=$(echo "$line" | cut -d: -f1)
|
||||||
|
CWD=$(echo "$line" | cut -d: -f2- | xargs)
|
||||||
|
[ -d "$CWD" ] && echo "✅ $PROC" || echo "❌ $PROC (CWD missing: $CWD)"
|
||||||
|
done
|
||||||
|
|
||||||
|
# View PM2 daemon log
|
||||||
|
tail -100 ~/.pm2/pm2.log
|
||||||
|
|
||||||
|
# Search for ENOENT errors
|
||||||
|
grep -i "ENOENT\|uv_cwd" ~/.pm2/pm2.log
|
||||||
|
```
|
||||||
|
|
||||||
|
## Identifying the Problematic Project
|
||||||
|
|
||||||
|
### Check Which Projects Share PM2 Daemon
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pm2 list
|
||||||
|
|
||||||
|
# Group by project
|
||||||
|
pm2 jlist | jq -r '.[] | .name' | grep -oE "^[a-z-]+" | sort -u
|
||||||
|
```
|
||||||
|
|
||||||
|
**Projects on projectium.com:**
|
||||||
|
|
||||||
|
- `flyer-crawler` (production, test)
|
||||||
|
- `stock-alert` (production, test)
|
||||||
|
- Others? Run `pm2 list` to confirm every process that shares this PM2 daemon.
|
||||||
|
|
||||||
|
### Check Deployment Timing
|
||||||
|
|
||||||
|
1. Review PM2 daemon restart times:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
grep "New PM2 Daemon started" ~/.pm2/pm2.log
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Compare with deployment times in Gitea Actions
|
||||||
|
|
||||||
|
3. Identify which deployment triggered the crash
|
||||||
|
|
||||||
|
### Check Deployment Scripts
|
||||||
|
|
||||||
|
For each project, check if deployment stops PM2 before rsync:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Flyer-crawler
|
||||||
|
cat /var/www/flyer-crawler.projectium.com/.gitea/workflows/deploy-to-prod.yml | grep -B5 -A5 "rsync.*--delete"
|
||||||
|
|
||||||
|
# Stock-alert
|
||||||
|
cat /var/www/stock-alert.projectium.com/.gitea/workflows/deploy-to-prod.yml | grep -B5 -A5 "rsync.*--delete"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Look for:**
|
||||||
|
|
||||||
|
- ❌ `rsync --delete` **before** `pm2 stop`
|
||||||
|
- ✅ `pm2 stop` **before** `rsync --delete`
|
||||||
|
|
||||||
|
## Common Culprits
|
||||||
|
|
||||||
|
### 1. Flyer-Crawler Deployments
|
||||||
|
|
||||||
|
**Before Fix:**
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# ❌ BAD - Deploys files while processes running
|
||||||
|
- name: Deploy Application
|
||||||
|
run: |
|
||||||
|
rsync --delete ./ /var/www/...
|
||||||
|
pm2 restart ...
|
||||||
|
```
|
||||||
|
|
||||||
|
**After Fix:**
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# ✅ GOOD - Stops processes first
|
||||||
|
- name: Deploy Application
|
||||||
|
run: |
|
||||||
|
pm2 stop flyer-crawler-api flyer-crawler-worker
|
||||||
|
rsync --delete ./ /var/www/...
|
||||||
|
pm2 startOrReload ...
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2. Stock-Alert Deployments
|
||||||
|
|
||||||
|
Check if stock-alert follows the same pattern. If it deploys without stopping PM2, it could crash the shared PM2 daemon.
|
||||||
|
|
||||||
|
### 3. Cross-Project Interference
|
||||||
|
|
||||||
|
If multiple projects share PM2:
|
||||||
|
|
||||||
|
- One project's deployment can crash another project's processes
|
||||||
|
- The crashed project's processes lose their CWD
|
||||||
|
- PM2 daemon may restart, clearing all processes
|
||||||
|
|
||||||
|
## Solutions
|
||||||
|
|
||||||
|
### Immediate Fix (Manual)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Restore processes from dump file
|
||||||
|
pm2 resurrect
|
||||||
|
|
||||||
|
# Verify all processes are running
|
||||||
|
pm2 list
|
||||||
|
```
|
||||||
|
|
||||||
|
### Permanent Fix
|
||||||
|
|
||||||
|
1. **Update deployment workflows** to stop PM2 before file operations
|
||||||
|
2. **Isolate PM2 daemons** by user or namespace
|
||||||
|
3. **Monitor deployments** to ensure proper sequencing
|
||||||
|
|
||||||
|
## Deployment Workflow Template
|
||||||
|
|
||||||
|
**Correct sequence:**
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
- name: Deploy Application
|
||||||
|
run: |
|
||||||
|
# 1. STOP PROCESSES FIRST
|
||||||
|
pm2 stop my-api my-worker
|
||||||
|
|
||||||
|
# 2. THEN deploy files
|
||||||
|
rsync -avz --delete ./ /var/www/my-app/
|
||||||
|
|
||||||
|
# 3. Install dependencies (safe, no processes running)
|
||||||
|
cd /var/www/my-app
|
||||||
|
npm install --omit=dev
|
||||||
|
|
||||||
|
# 4. Clean up errored processes
|
||||||
|
pm2 delete my-api my-worker || true
|
||||||
|
|
||||||
|
# 5. START processes
|
||||||
|
pm2 startOrReload ecosystem.config.cjs
|
||||||
|
pm2 save
|
||||||
|
```
|
||||||
|
|
||||||
|
## Monitoring & Prevention
|
||||||
|
|
||||||
|
### Enable Verbose Logging
|
||||||
|
|
||||||
|
Enhanced deployment logging (already implemented in flyer-crawler):
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
- name: Deploy Application
|
||||||
|
run: |
|
||||||
|
set -x # Command tracing
|
||||||
|
echo "Step 1: Stopping PM2..."
|
||||||
|
pm2 stop ...
|
||||||
|
pm2 list # Verify stopped
|
||||||
|
|
||||||
|
echo "Step 2: Deploying files..."
|
||||||
|
rsync --delete ...
|
||||||
|
|
||||||
|
echo "Step 3: Starting PM2..."
|
||||||
|
pm2 start ...
|
||||||
|
pm2 list # Verify started
|
||||||
|
```
|
||||||
|
|
||||||
|
### Regular Health Checks
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Add to cron or monitoring system
|
||||||
|
*/5 * * * * pm2 jlist | jq -r '.[] | select(.pm2_env.status != "online") | "ALERT: \(.name) is \(.pm2_env.status)"'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting Decision Tree
|
||||||
|
|
||||||
|
```
|
||||||
|
PM2 processes missing?
|
||||||
|
├─ YES → Run `pm2 resurrect`
|
||||||
|
│ └─ Check PM2 daemon log for ENOENT errors
|
||||||
|
│ ├─ ENOENT found → Working directory deleted during deployment
|
||||||
|
│ │ └─ Fix: Add `pm2 stop` before rsync
|
||||||
|
│ └─ No ENOENT → Check other error patterns
|
||||||
|
│
|
||||||
|
└─ NO → Processes running but unstable?
|
||||||
|
└─ Check restart counts: `pm2 jlist | jq '.[].pm2_env.restart_time'`
|
||||||
|
└─ High restarts → Application-level issue (not PM2 crash)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Related Documentation
|
||||||
|
|
||||||
|
- [PM2 Process Isolation Requirements](../../CLAUDE.md#pm2-process-isolation-productiontest-servers)
|
||||||
|
- [PM2 Incident Response Runbook](./PM2-INCIDENT-RESPONSE.md)
|
||||||
|
- [Incident Report 2026-02-17](./INCIDENT-2026-02-17-PM2-PROCESS-KILL.md)
|
||||||
|
|
||||||
|
## Quick Reference Commands
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Diagnose
|
||||||
|
pm2 list # Current state
|
||||||
|
pm2 jlist | jq '.' # Detailed JSON
|
||||||
|
tail -100 ~/.pm2/pm2.log # Recent logs
|
||||||
|
grep ENOENT ~/.pm2/pm2.log # Find crashes
|
||||||
|
|
||||||
|
# Fix
|
||||||
|
pm2 resurrect # Restore from dump
|
||||||
|
pm2 restart all # Restart everything
|
||||||
|
pm2 save # Save current state
|
||||||
|
|
||||||
|
# Analyze
|
||||||
|
bash scripts/analyze-pm2-crashes.sh # Run analysis script
|
||||||
|
pm2 jlist | jq -r '.[].pm2_env.pm_cwd' # Check working dirs
|
||||||
|
```
|
||||||
106
scripts/analyze-pm2-crashes.sh
Normal file
106
scripts/analyze-pm2-crashes.sh
Normal file
@@ -0,0 +1,106 @@
|
|||||||
|
#!/bin/bash
# scripts/analyze-pm2-crashes.sh
#
# Analyzes PM2 logs to identify crash patterns and problematic projects.
#
# Usage:
#   bash scripts/analyze-pm2-crashes.sh
#   bash scripts/analyze-pm2-crashes.sh > pm2-crash-report.txt
#
# Environment:
#   PM2_LOG  Path of the PM2 daemon log to analyze
#            (default: /home/gitea-runner/.pm2/pm2.log).
#
# Requires pm2 and jq on PATH. Exits 1 when the log file does not exist.

# -e:        abort on unhandled command failures.
# pipefail:  make "grep ... | tail -N || echo <fallback>" print the fallback
#            when grep matches nothing; without it the pipeline status is
#            always tail's (0) and empty sections would print no message.
set -eo pipefail

PM2_LOG="${PM2_LOG:-/home/gitea-runner/.pm2/pm2.log}"

echo "========================================="
echo "PM2 CRASH ANALYSIS TOOL"
echo "========================================="
echo ""

if [ ! -f "$PM2_LOG" ]; then
  echo "❌ PM2 log file not found at: $PM2_LOG"
  exit 1
fi

echo "Analyzing PM2 log file: $PM2_LOG"
echo "Log file size: $(du -h "$PM2_LOG" | cut -f1)"
echo "Last modified: $(stat -c %y "$PM2_LOG")"
echo ""

echo "========================================="
echo "1. RECENT PM2 DAEMON RESTARTS"
echo "========================================="
grep -i "New PM2 Daemon started" "$PM2_LOG" | tail -5 || echo "No daemon restarts found"
echo ""

echo "========================================="
echo "2. ENOENT / CWD ERRORS"
echo "========================================="
grep -i "ENOENT\|uv_cwd\|no such file or directory" "$PM2_LOG" | tail -20 || echo "No ENOENT errors found"
echo ""

echo "========================================="
echo "3. PROCESS CRASH PATTERNS"
echo "========================================="
echo "Searching for app crash events..."
grep -i "App \[.*\] exited\|App \[.*\] errored\|App \[.*\] crashed" "$PM2_LOG" | tail -20 || echo "No app crashes found"
echo ""

echo "========================================="
echo "4. PROJECTS INVOLVED IN CRASHES"
echo "========================================="
echo "Extracting project names from crash logs..."
grep -i "ENOENT\|crash\|error" "$PM2_LOG" \
  | grep -oE "flyer-crawler[a-z-]*|stock-alert[a-z-]*" \
  | sort | uniq -c | sort -rn \
  || echo "No project names found in crashes"
echo ""

echo "========================================="
echo "5. TIMELINE OF RECENT ERRORS (Last 50)"
echo "========================================="
grep -E "^[0-9]{4}-[0-9]{2}-[0-9]{2}" "$PM2_LOG" | grep -i "error\|crash\|ENOENT" | tail -50 || echo "No timestamped errors found"
echo ""

echo "========================================="
echo "6. CURRENT PM2 STATE"
echo "========================================="
pm2 list
echo ""

echo "========================================="
echo "7. PROCESSES WITH MISSING CWD"
echo "========================================="
# jq emits one "name: cwd" line per process that has a recorded cwd.
# read -r keeps backslashes intact; the split uses parameter expansion so
# paths containing quotes or repeated spaces are not altered.
pm2 jlist | jq -r '.[] | select(.pm2_env.pm_cwd) | "\(.name): \(.pm2_env.pm_cwd)"' | while IFS= read -r line; do
  PROC_NAME=${line%%:*}
  CWD=${line#*:}
  CWD=${CWD# }
  if [ ! -d "$CWD" ]; then
    echo "❌ $PROC_NAME - CWD missing: $CWD"
  else
    echo "✅ $PROC_NAME - CWD exists: $CWD"
  fi
done
echo ""

echo "========================================="
echo "8. RECOMMENDATIONS"
echo "========================================="
echo ""

# Count ENOENT errors.
# grep -c already prints "0" (and exits 1) when nothing matches, so appending
# another "0" on failure would produce a two-line value that breaks the
# numeric test below. Only default to 0 when grep printed nothing at all.
ENOENT_COUNT=$(grep -c "ENOENT\|uv_cwd" "$PM2_LOG" 2>/dev/null || true)
ENOENT_COUNT=${ENOENT_COUNT:-0}
if [ "$ENOENT_COUNT" -gt 0 ]; then
  echo "⚠️ Found $ENOENT_COUNT ENOENT/CWD errors in logs"
  echo " This indicates processes losing their working directory during deployment"
  echo " Solution: Ensure PM2 processes are stopped BEFORE rsync --delete operations"
  echo ""
fi

# Check for multiple projects.
# Default to 0 when pm2/jq produce no output so the comparisons stay numeric.
FLYER_PROCESSES=$(pm2 jlist | jq '[.[] | select(.name | contains("flyer-crawler"))] | length' 2>/dev/null || true)
FLYER_PROCESSES=${FLYER_PROCESSES:-0}
STOCK_PROCESSES=$(pm2 jlist | jq '[.[] | select(.name | contains("stock-alert"))] | length' 2>/dev/null || true)
STOCK_PROCESSES=${STOCK_PROCESSES:-0}

if [ "$FLYER_PROCESSES" -gt 0 ] && [ "$STOCK_PROCESSES" -gt 0 ]; then
  echo "ℹ️ Multiple projects detected:"
  echo " - Flyer-crawler: $FLYER_PROCESSES processes"
  echo " - Stock-alert: $STOCK_PROCESSES processes"
  echo " Recommendation: Ensure deployments don't interfere with each other"
  echo ""
fi

echo "✅ Analysis complete"
echo ""
echo "To save this report:"
echo " bash scripts/analyze-pm2-crashes.sh > pm2-crash-report.txt"
|
||||||
Reference in New Issue
Block a user