Filesystem Size Used Avail Use% Mounted on
/dev/sda1 80G 80G 0 100% /
Filesystem Size Used Avail Use% Mounted on
/dev/sda1 80G 80G 0 100% /
Filesystem Size Used Avail Use% Mounted on
/dev/sda1 80G 80G 0 100% /
# Step 1: Confirm the problem
df -h

# Step 2: Find the culprit
du -sh /var/log/*
du -sh /var/log/nginx/*

# Step 3: Immediate relief - clear old compressed logs
find /var/log -name "*.gz" -mtime +7 -delete

# Step 4: Truncate (don't delete) the active log file
# (truncate keeps the inode, so the writing process doesn't hold a deleted file open)
truncate -s 0 /var/log/myapp/app.log

# Step 5: Check logrotate config
cat /etc/logrotate.d/myapp
# Step 1: Confirm the problem
df -h

# Step 2: Find the culprit
du -sh /var/log/*
du -sh /var/log/nginx/*

# Step 3: Immediate relief - clear old compressed logs
find /var/log -name "*.gz" -mtime +7 -delete

# Step 4: Truncate (don't delete) the active log file
# (truncate keeps the inode, so the writing process doesn't hold a deleted file open)
truncate -s 0 /var/log/myapp/app.log

# Step 5: Check logrotate config
cat /etc/logrotate.d/myapp
# Step 1: Confirm the problem
df -h

# Step 2: Find the culprit
du -sh /var/log/*
du -sh /var/log/nginx/*

# Step 3: Immediate relief - clear old compressed logs
find /var/log -name "*.gz" -mtime +7 -delete

# Step 4: Truncate (don't delete) the active log file
# (truncate keeps the inode, so the writing process doesn't hold a deleted file open)
truncate -s 0 /var/log/myapp/app.log

# Step 5: Check logrotate config
cat /etc/logrotate.d/myapp
Error: Cannot acquire connection from pool
TimeoutError: timeout of 5000ms exceeded
Error: Cannot acquire connection from pool
TimeoutError: timeout of 5000ms exceeded
Error: Cannot acquire connection from pool
TimeoutError: timeout of 5000ms exceeded
// This is a leak
async function getUser(id) { const conn = await pool.acquire(); const result = await db.query('SELECT * FROM users WHERE id = $1', [id]); // If the query throws, conn is never released conn.release(); return result;
} // This is correct
async function getUser(id) { const conn = await pool.acquire(); try { return await db.query('SELECT * FROM users WHERE id = $1', [id]); } finally { conn.release(); // Always runs, even on error }
}
// This is a leak
async function getUser(id) { const conn = await pool.acquire(); const result = await db.query('SELECT * FROM users WHERE id = $1', [id]); // If the query throws, conn is never released conn.release(); return result;
} // This is correct
async function getUser(id) { const conn = await pool.acquire(); try { return await db.query('SELECT * FROM users WHERE id = $1', [id]); } finally { conn.release(); // Always runs, even on error }
}
// This is a leak
async function getUser(id) { const conn = await pool.acquire(); const result = await db.query('SELECT * FROM users WHERE id = $1', [id]); // If the query throws, conn is never released conn.release(); return result;
} // This is correct
async function getUser(id) { const conn = await pool.acquire(); try { return await db.query('SELECT * FROM users WHERE id = $1', [id]); } finally { conn.release(); // Always runs, even on error }
}
-- On PostgreSQL: see all active connections
-- (one row per backend; states include 'active', 'idle', 'idle in transaction')
SELECT state, count(*)
FROM pg_stat_activity
GROUP BY state;

-- See who is holding connections longest
SELECT pid, now() - pg_stat_activity.query_start AS duration, query, state
FROM pg_stat_activity
WHERE state != 'idle'
ORDER BY duration DESC;
-- On PostgreSQL: see all active connections
-- (one row per backend; states include 'active', 'idle', 'idle in transaction')
SELECT state, count(*)
FROM pg_stat_activity
GROUP BY state;

-- See who is holding connections longest
SELECT pid, now() - pg_stat_activity.query_start AS duration, query, state
FROM pg_stat_activity
WHERE state != 'idle'
ORDER BY duration DESC;
-- On PostgreSQL: see all active connections
-- (one row per backend; states include 'active', 'idle', 'idle in transaction')
SELECT state, count(*)
FROM pg_stat_activity
GROUP BY state;

-- See who is holding connections longest
SELECT pid, now() - pg_stat_activity.query_start AS duration, query, state
FROM pg_stat_activity
WHERE state != 'idle'
ORDER BY duration DESC;
NAME READY STATUS RESTARTS AGE
myapp-7d9f8b-xkj2p 0/1 CrashLoopBackOff 4 3m
NAME READY STATUS RESTARTS AGE
myapp-7d9f8b-xkj2p 0/1 CrashLoopBackOff 4 3m
NAME READY STATUS RESTARTS AGE
myapp-7d9f8b-xkj2p 0/1 CrashLoopBackOff 4 3m
kubectl logs myapp-7d9f8b-xkj2p
kubectl logs myapp-7d9f8b-xkj2p
kubectl logs myapp-7d9f8b-xkj2p
Error: Required environment variable DATABASE_PASSWORD is not set
Process exited with code 1
Error: Required environment variable DATABASE_PASSWORD is not set
Process exited with code 1
Error: Required environment variable DATABASE_PASSWORD is not set
Process exited with code 1
kubectl describe pod myapp-7d9f8b-xkj2p
kubectl describe pod myapp-7d9f8b-xkj2p
kubectl describe pod myapp-7d9f8b-xkj2p
Events: Warning Failed 2m kubelet Error: secret "myapp-credentials" not found
Events: Warning Failed 2m kubelet Error: secret "myapp-credentials" not found
Events: Warning Failed 2m kubelet Error: secret "myapp-credentials" not found
# Step 1: Get the actual error
kubectl logs <pod-name>
kubectl logs <pod-name> --previous  # Logs from the crashed instance

# Step 2: Describe the pod for Kubernetes-level events
kubectl describe pod <pod-name>

# Step 3: Check if the secret exists
kubectl get secrets -n <namespace>

# Step 4: Verify the secret has the expected keys
kubectl describe secret myapp-credentials
# Step 1: Get the actual error
kubectl logs <pod-name>
kubectl logs <pod-name> --previous  # Logs from the crashed instance

# Step 2: Describe the pod for Kubernetes-level events
kubectl describe pod <pod-name>

# Step 3: Check if the secret exists
kubectl get secrets -n <namespace>

# Step 4: Verify the secret has the expected keys
kubectl describe secret myapp-credentials
# Step 1: Get the actual error
kubectl logs <pod-name>
kubectl logs <pod-name> --previous  # Logs from the crashed instance

# Step 2: Describe the pod for Kubernetes-level events
kubectl describe pod <pod-name>

# Step 3: Check if the secret exists
kubectl get secrets -n <namespace>

# Step 4: Verify the secret has the expected keys
kubectl describe secret myapp-credentials
// This leaks memory on every new WebSocket connection
function setupWebSocket(socket) {
  // This listener is added fresh on every call.
  // The closure passed to process.on keeps the socket alive
  // even after the connection closes.
  process.on('SIGTERM', () => {
    socket.close();
  });
  socket.on('message', handleMessage);
}
// This leaks memory on every new WebSocket connection
function setupWebSocket(socket) {
  // This listener is added fresh on every call.
  // The closure passed to process.on keeps the socket alive
  // even after the connection closes.
  process.on('SIGTERM', () => {
    socket.close();
  });
  socket.on('message', handleMessage);
}
// This leaks memory on every new WebSocket connection
function setupWebSocket(socket) {
  // This listener is added fresh on every call.
  // The closure passed to process.on keeps the socket alive
  // even after the connection closes.
  process.on('SIGTERM', () => {
    socket.close();
  });
  socket.on('message', handleMessage);
}
MaxListenersExceededWarning: Possible EventEmitter memory leak detected.
11 SIGTERM listeners added to [process]. Use emitter.setMaxListeners() to increase limit
MaxListenersExceededWarning: Possible EventEmitter memory leak detected.
11 SIGTERM listeners added to [process]. Use emitter.setMaxListeners() to increase limit
MaxListenersExceededWarning: Possible EventEmitter memory leak detected.
11 SIGTERM listeners added to [process]. Use emitter.setMaxListeners() to increase limit
# Get a heap snapshot from a running Node.js process
# NOTE(review): this only works if the process was started with
# --heapsnapshot-signal=SIGUSR2 — plain kill -USR2 has no effect otherwise; confirm
kill -USR2 <pid>

# Or via the Node.js inspector
node --inspect app.js
# Then open chrome://inspect and take a heap snapshot
# Get a heap snapshot from a running Node.js process
# NOTE(review): this only works if the process was started with
# --heapsnapshot-signal=SIGUSR2 — plain kill -USR2 has no effect otherwise; confirm
kill -USR2 <pid>

# Or via the Node.js inspector
node --inspect app.js
# Then open chrome://inspect and take a heap snapshot
# Get a heap snapshot from a running Node.js process
# NOTE(review): this only works if the process was started with
# --heapsnapshot-signal=SIGUSR2 — plain kill -USR2 has no effect otherwise; confirm
kill -USR2 <pid>

# Or via the Node.js inspector
node --inspect app.js
# Then open chrome://inspect and take a heap snapshot
npx clinic heapprofiler -- node app.js
npx clinic heapprofiler -- node app.js
npx clinic heapprofiler -- node app.js
// Fixed version: pair every process-level listener with its removal
function setupWebSocket(socket) {
  const cleanup = () => socket.close();
  process.on('SIGTERM', cleanup);
  socket.on('close', () => {
    // Remove the listener when the connection closes
    process.removeListener('SIGTERM', cleanup);
  });
}
// Fixed version: pair every process-level listener with its removal
function setupWebSocket(socket) {
  const cleanup = () => socket.close();
  process.on('SIGTERM', cleanup);
  socket.on('close', () => {
    // Remove the listener when the connection closes
    process.removeListener('SIGTERM', cleanup);
  });
}
// Fixed version: pair every process-level listener with its removal
function setupWebSocket(socket) {
  const cleanup = () => socket.close();
  process.on('SIGTERM', cleanup);
  socket.on('close', () => {
    // Remove the listener when the connection closes
    process.removeListener('SIGTERM', cleanup);
  });
}
Cache hit rate: dropped from 95% to 0%
Database connections: spiked from 50 to 800 in 30 seconds
Database CPU: 100%
API P99 latency: 50ms -> 12,000ms
Cache hit rate: dropped from 95% to 0%
Database connections: spiked from 50 to 800 in 30 seconds
Database CPU: 100%
API P99 latency: 50ms -> 12,000ms
Cache hit rate: dropped from 95% to 0%
Database connections: spiked from 50 to 800 in 30 seconds
Database CPU: 100%
API P99 latency: 50ms -> 12,000ms
import redis
import time


def get_with_lock(key, fetch_fn, ttl=300):
    """Read-through cache with a per-key lock to prevent a stampede.

    On a cache miss, only the caller that wins the lock calls fetch_fn
    and repopulates the cache; other callers wait briefly and re-read.

    key: cache key (bytes/str).
    fetch_fn: zero-argument callable producing the value to cache.
    ttl: cache expiry in seconds.
    """
    r = redis.Redis()
    value = r.get(key)
    if value:
        return value
    # Try to acquire a lock (nx=True: only set if absent; ex=10: auto-expire
    # so a crashed holder cannot wedge the key forever)
    lock_key = f"lock:{key}"
    if r.set(lock_key, "1", nx=True, ex=10):
        # We got the lock - populate the cache
        try:
            value = fetch_fn()
            r.setex(key, ttl, value)
            return value
        finally:
            r.delete(lock_key)
    else:
        # Someone else has the lock - wait briefly and retry
        time.sleep(0.1)
        # NOTE(review): may still return None if the lock holder has not
        # finished populating yet - callers must tolerate a miss here; confirm
        return r.get(key)
import redis
import time


def get_with_lock(key, fetch_fn, ttl=300):
    """Read-through cache with a per-key lock to prevent a stampede.

    On a cache miss, only the caller that wins the lock calls fetch_fn
    and repopulates the cache; other callers wait briefly and re-read.

    key: cache key (bytes/str).
    fetch_fn: zero-argument callable producing the value to cache.
    ttl: cache expiry in seconds.
    """
    r = redis.Redis()
    value = r.get(key)
    if value:
        return value
    # Try to acquire a lock (nx=True: only set if absent; ex=10: auto-expire
    # so a crashed holder cannot wedge the key forever)
    lock_key = f"lock:{key}"
    if r.set(lock_key, "1", nx=True, ex=10):
        # We got the lock - populate the cache
        try:
            value = fetch_fn()
            r.setex(key, ttl, value)
            return value
        finally:
            r.delete(lock_key)
    else:
        # Someone else has the lock - wait briefly and retry
        time.sleep(0.1)
        # NOTE(review): may still return None if the lock holder has not
        # finished populating yet - callers must tolerate a miss here; confirm
        return r.get(key)
import redis
import time


def get_with_lock(key, fetch_fn, ttl=300):
    """Read-through cache with a per-key lock to prevent a stampede.

    On a cache miss, only the caller that wins the lock calls fetch_fn
    and repopulates the cache; other callers wait briefly and re-read.

    key: cache key (bytes/str).
    fetch_fn: zero-argument callable producing the value to cache.
    ttl: cache expiry in seconds.
    """
    r = redis.Redis()
    value = r.get(key)
    if value:
        return value
    # Try to acquire a lock (nx=True: only set if absent; ex=10: auto-expire
    # so a crashed holder cannot wedge the key forever)
    lock_key = f"lock:{key}"
    if r.set(lock_key, "1", nx=True, ex=10):
        # We got the lock - populate the cache
        try:
            value = fetch_fn()
            r.setex(key, ttl, value)
            return value
        finally:
            r.delete(lock_key)
    else:
        # Someone else has the lock - wait briefly and retry
        time.sleep(0.1)
        # NOTE(review): may still return None if the lock holder has not
        # finished populating yet - callers must tolerate a miss here; confirm
        return r.get(key)
import random

# Add jitter so a fleet of keys written together doesn't expire in the
# same instant (which would recreate the stampede)
base_ttl = 300
jitter = random.randint(-30, 30)
r.setex(key, base_ttl + jitter, value)
import random

# Add jitter so a fleet of keys written together doesn't expire in the
# same instant (which would recreate the stampede)
base_ttl = 300
jitter = random.randint(-30, 30)
r.setex(key, base_ttl + jitter, value)
import random

# Add jitter so a fleet of keys written together doesn't expire in the
# same instant (which would recreate the stampede)
base_ttl = 300
jitter = random.randint(-30, 30)
r.setex(key, base_ttl + jitter, value)

- Request comes in for /api/products
- All 50 servers check the cache - cache miss
- All 50 servers query the database for product data
- All 50 servers write the result back to cache
- 49 of those database queries were wasted
- Under high traffic, "50 servers" becomes "50,000 requests per second"

- The symptoms lied. A full disk causing database errors. Connection pool exhaustion causing "database" problems. A cache issue causing what looks like a database overload.
- The actual cause was one layer removed from where the pain was visible.
- All five are preventable with the right monitoring thresholds, code patterns, and configuration choices.
- All five are faster to debug if you have seen them before.