--- a/prometheus/alerts.yml
+++ b/prometheus/alerts.yml
@@ -1,7 +1,6 @@
 groups:
   - name: payments_basic
     rules:
-      # This is the only alert we have — clearly insufficient
       - alert: PaymentsAPIDown
         expr: up{job="payments-api"} == 0
         for: 2m
@@ -13,6 +12,140 @@
           description: "{{ $labels.instance }} has been unreachable for more than 2 minutes."
           runbook_url: https://wiki.internal/runbooks/payments-api-down
 
-      # TODO: Need alerts for error rate and latency — these were the blind spots
-      # in the April 8 and April 9 incidents. Rohan asked for these to be added
-      # as part of the new dashboard work.
+  # Request-level and payment-outcome alerts for the payments API.
+  # Each warning/critical pair shares an alert name and differs only in
+  # threshold, "for" duration and severity, so a value above the critical
+  # threshold also satisfies the warning expression.
+  # NOTE(review): the colon-named series used below look like recording rules
+  # defined elsewhere; confirm they exist, otherwise these alerts never fire.
+  - name: payments_reliability
+    rules:
+      # Error rate alerts — would have caught the April 9 incident (12% errors on /charge)
+      - alert: PaymentsHighErrorRate
+        expr: service:http_error_ratio:rate5m{service="payments-api"} > 0.05
+        for: 2m
+        labels:
+          severity: warning
+          team: payments
+        annotations:
+          summary: "Payments API error rate above 5%"
+          description: "Error rate is {{ $value | humanizePercentage }} for the past 2 minutes."
+          runbook_url: https://wiki.internal/runbooks/payments-high-error-rate
+
+      - alert: PaymentsHighErrorRate
+        expr: service:http_error_ratio:rate5m{service="payments-api"} > 0.10
+        for: 1m
+        labels:
+          severity: critical
+          team: payments
+        annotations:
+          summary: "Payments API error rate above 10%"
+          description: "Error rate is {{ $value | humanizePercentage }} for the past minute. Immediate investigation required."
+          runbook_url: https://wiki.internal/runbooks/payments-high-error-rate
+
+      # Latency alerts — would have caught the April 8 incident (P95 at 4.2s vs 500ms SLO)
+      - alert: PaymentsP95LatencyHigh
+        expr: service:http_duration:p95{service="payments-api"} > 0.5
+        for: 3m
+        labels:
+          severity: warning
+          team: payments
+        annotations:
+          summary: "Payments API P95 latency above 500ms SLO"
+          description: "P95 latency is {{ $value | humanizeDuration }} (SLO: 500ms)."
+          runbook_url: https://wiki.internal/runbooks/payments-high-latency
+
+      - alert: PaymentsP95LatencyHigh
+        expr: service:http_duration:p95{service="payments-api"} > 1.0
+        for: 2m
+        labels:
+          severity: critical
+          team: payments
+        annotations:
+          summary: "Payments API P95 latency above 1s"
+          description: "P95 latency is {{ $value | humanizeDuration }}. Severely degraded checkout experience."
+          runbook_url: https://wiki.internal/runbooks/payments-high-latency
+
+      # Payment failure rate — catches provider-side issues
+      # Failed attempts as a share of all attempts (success + failed + timeout).
+      # "or vector(0)" keeps the denominator defined when the success or timeout
+      # series has no samples; without it the whole expression returns nothing.
+      - alert: PaymentsHighFailureRate
+        expr: |
+          sum(payments:failed:rate5m)
+          /
+          (
+            (sum(payments:success:rate5m) or vector(0))
+            + sum(payments:failed:rate5m)
+            + (sum(payments:timeout:rate5m) or vector(0))
+          )
+          > 0.05
+        for: 2m
+        labels:
+          severity: warning
+          team: payments
+        annotations:
+          summary: "Payment failure rate above 5%"
+          description: "{{ $value | humanizePercentage }} of payment attempts are failing."
+          runbook_url: https://wiki.internal/runbooks/payments-high-failure-rate
+
+  # Per-instance CPU, memory and disk alerts, labelled team: platform.
+  # CPU and memory have warning and critical tiers; disk has a critical tier only.
+  - name: host_health
+    rules:
+      - alert: HostHighCPU
+        expr: instance:node_cpu:utilization > 0.7
+        for: 5m
+        labels:
+          severity: warning
+          team: platform
+        annotations:
+          summary: "High CPU on {{ $labels.instance }}"
+          description: "CPU utilization is {{ $value | humanizePercentage }} for 5+ minutes."
+          runbook_url: https://wiki.internal/runbooks/host-high-cpu
+
+      - alert: HostHighCPU
+        expr: instance:node_cpu:utilization > 0.9
+        for: 2m
+        labels:
+          severity: critical
+          team: platform
+        annotations:
+          summary: "Critical CPU on {{ $labels.instance }}"
+          description: "CPU utilization is {{ $value | humanizePercentage }}. Risk of GC pauses and request timeouts."
+          runbook_url: https://wiki.internal/runbooks/host-high-cpu
+
+      - alert: HostHighMemory
+        expr: instance:node_memory:utilization > 0.8
+        for: 5m
+        labels:
+          severity: warning
+          team: platform
+        annotations:
+          summary: "High memory on {{ $labels.instance }}"
+          description: "Memory utilization is {{ $value | humanizePercentage }} for 5+ minutes."
+          runbook_url: https://wiki.internal/runbooks/host-high-memory
+
+      - alert: HostHighMemory
+        expr: instance:node_memory:utilization > 0.95
+        for: 2m
+        labels:
+          severity: critical
+          team: platform
+        annotations:
+          summary: "Critical memory on {{ $labels.instance }}"
+          description: "Memory utilization is {{ $value | humanizePercentage }}. OOM kill risk — likely cause of GC-induced latency spikes."
+          runbook_url: https://wiki.internal/runbooks/host-high-memory
+
+      # Fires when available/size falls below 0.15 on any filesystem whose
+      # fstype is not tmpfs or overlay; $value is the remaining free fraction.
+      - alert: HostDiskSpaceLow
+        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) < 0.15
+        for: 5m
+        labels:
+          severity: critical
+          team: platform
+        annotations:
+          summary: "Disk space low on {{ $labels.instance }} ({{ $labels.mountpoint }})"
+          description: "Only {{ $value | humanizePercentage }} disk space remaining on {{ $labels.mountpoint }}."
+          runbook_url: https://wiki.internal/runbooks/host-disk-space-low