--- a/incident-review.canvas +++ b/incident-review.canvas @@ -29,6 +29,52 @@ "text": "## Initial Response\n\nOn-call engineer Alex paged at 14:35\nWar room opened in Slack #incident-042" }, { + "id": "1a2b3c4d5e6f7890", + "type": "text", + "x": 0, + "y": 560, + "width": 350, + "height": 120, + "text": "## Investigation (14:41)\n\nNarrowed to connection pool exhaustion on payment-db-primary" + }, + { + "id": "2b3c4d5e6f789012", + "type": "text", + "x": 0, + "y": 760, + "width": 350, + "height": 120, + "text": "## Root Cause Identified (14:48)\n\nMigration job `backfill_invoice_idx` running unthrottled since 14:15, starving the connection pool" + }, + { + "id": "3c4d5e6f78901234", + "type": "text", + "x": 0, + "y": 960, + "width": 350, + "height": 120, + "text": "## Mitigation (14:52)\n\nKilled the migration job\nConnections began recovering" + }, + { + "id": "4d5e6f7890123456", + "type": "text", + "x": 0, + "y": 1160, + "width": 350, + "height": 100, + "text": "## Recovery (14:58)\n\nAPI latency back to normal\nError rate below 0.1%" + }, + { + "id": "5e6f789012345678", + "type": "text", + "x": 0, + "y": 1340, + "width": 350, + "height": 100, + "text": "## All-Clear (15:05)\n\nAll-clear given in Slack\nPagerDuty incident resolved", + "color": "4" + }, + { "id": "d4e5f6a789012345", "type": "link", "x": 500, @@ -36,6 +82,36 @@ "width": 300, "height": 120, "url": "https://app.datadoghq.com/dashboard/abc-123/api-health" + }, + { + "id": "6f78901234567890", + "type": "text", + "x": 500, + "y": 560, + "width": 420, + "height": 220, + "text": "## Root Cause Analysis\n\nMigration `backfill_invoice_idx` deployed with `batch_size=50000` (intended: 500)\n\nOversized batches held row-level locks for 30+ seconds each, exhausting the 50-connection pool on payment-db-primary\n\n**Underlying issue:** Migration configs aren't validated by CI — batch_size was set via env var and never reviewed", + "color": "1" + }, + { + "id": "7890123456789012", + "type": "text", + "x": 500, + "y": 
860, + "width": 420, + "height": 200, + "text": "## Mitigation Steps\n\n1. Killed migration via `pg_terminate_backend`\n2. Restarted payment-api pods to clear stale connections\n3. Verified pool recovery via Datadog connection metrics\n4. Paused all pending migration jobs in scheduler", + "color": "3" + }, + { + "id": "8901234567890123", + "type": "text", + "x": 500, + "y": 1140, + "width": 420, + "height": 280, + "text": "## Follow-up Actions\n\n- [ ] Add CI check: migration batch_size ≤ 1000\n- [ ] Add connection pool monitoring alert (80% utilization for >60s)\n- [ ] Add kill-switch for migration scheduler in ops runbook\n- [ ] Retro: why was the backfill scheduled during peak hours (cron in UTC vs PST)?\n- [ ] Update incident runbook: \"check for active migrations\" step", + "color": "5" } ], "edges": [ @@ -54,6 +130,73 @@ "toNode": "c3d4e5f6a7890123", "toSide": "top", "toEnd": "arrow" + }, + { + "id": "aa11bb22cc33dd44", + "fromNode": "c3d4e5f6a7890123", + "fromSide": "bottom", + "toNode": "1a2b3c4d5e6f7890", + "toSide": "top", + "toEnd": "arrow" + }, + { + "id": "bb22cc33dd44ee55", + "fromNode": "1a2b3c4d5e6f7890", + "fromSide": "bottom", + "toNode": "2b3c4d5e6f789012", + "toSide": "top", + "toEnd": "arrow" + }, + { + "id": "cc33dd44ee55ff66", + "fromNode": "2b3c4d5e6f789012", + "fromSide": "bottom", + "toNode": "3c4d5e6f78901234", + "toSide": "top", + "toEnd": "arrow" + }, + { + "id": "dd44ee55ff667788", + "fromNode": "3c4d5e6f78901234", + "fromSide": "bottom", + "toNode": "4d5e6f7890123456", + "toSide": "top", + "toEnd": "arrow" + }, + { + "id": "ee55ff66778899aa", + "fromNode": "4d5e6f7890123456", + "fromSide": "bottom", + "toNode": "5e6f789012345678", + "toSide": "top", + "toEnd": "arrow" + }, + { + "id": "ff6677889900aabb", + "fromNode": "2b3c4d5e6f789012", + "fromSide": "right", + "toNode": "6f78901234567890", + "toSide": "left", + "toEnd": "arrow", + "label": "details" + }, + { + "id": "0011223344556677", + "fromNode": "3c4d5e6f78901234", + "fromSide": "right", + 
"toNode": "7890123456789012", + "toSide": "left", + "toEnd": "arrow", + "label": "steps taken" + }, + { + "id": "1122334455667788", + "fromNode": "5e6f789012345678", + "fromSide": "right", + "toNode": "8901234567890123", + "toSide": "left", + "toEnd": "arrow", + "label": "next steps" } ] }