--- a/prometheus/alerts.yml
+++ b/prometheus/alerts.yml
@@ -1,7 +1,6 @@
 groups:
   - name: payments_basic
     rules:
-      # This is the only alert we have — clearly insufficient
       - alert: PaymentsAPIDown
         expr: up{job="payments-api"} == 0
         for: 2m
@@ -13,6 +12,123 @@
           description: "{{ $labels.instance }} has been unreachable for more than 2 minutes."
           runbook_url: https://wiki.internal/runbooks/payments-api-down
 
-      # TODO: Need alerts for error rate and latency — these were the blind spots
-      # in the April 8 and April 9 incidents. Rohan asked for these to be added
-      # as part of the new dashboard work.
+  - name: payments_reliability
+    rules:
+      # Error rate alerts — would have caught the April 9 incident (12% errors on /charge)
+      - alert: PaymentsHighErrorRate  # warning tier; the critical tier below reuses this alert name
+        expr: service:http_error_ratio:rate5m{service="payments-api"} > 0.05  # 0.05 = 5% (see summary); NOTE(review): metric not defined in this view - presumably a recording rule, confirm it exists
+        for: 2m  # condition must hold for 2m before the alert fires
+        labels:
+          severity: warning
+          team: payments
+        annotations:
+          summary: "Payments API error rate above 5%"
+          description: "Error rate is {{ $value | humanizePercentage }} for the past 2 minutes."  # $value is the expr result, a 0-1 ratio
+          runbook_url: https://wiki.internal/runbooks/payments-high-error-rate
+
+      - alert: PaymentsHighErrorRate  # critical tier; same alert name as the warning tier above, told apart by the severity label
+        expr: service:http_error_ratio:rate5m{service="payments-api"} > 0.10  # 0.10 = 10% (see summary)
+        for: 1m  # shorter hold than the warning tier's 2m
+        labels:
+          severity: critical
+          team: payments
+        annotations:
+          summary: "Payments API error rate above 10%"
+          description: "Error rate is {{ $value | humanizePercentage }} for the past minute. Immediate investigation required."  # $value is the expr result, a 0-1 ratio
+          runbook_url: https://wiki.internal/runbooks/payments-high-error-rate
+
+      # Latency alerts — would have caught the April 8 incident (P95 at 4.2s vs 500ms SLO)
+      - alert: PaymentsP95LatencyHigh  # warning tier; the critical tier below reuses this alert name
+        expr: service:http_duration:p95{service="payments-api"} > 0.5  # threshold presumably in seconds (0.5 = 500ms per summary); NOTE(review): metric not defined in this view - confirm the recording rule and its window
+        for: 3m  # condition must hold for 3m before the alert fires
+        labels:
+          severity: warning
+          team: payments
+        annotations:
+          summary: "Payments API P95 latency above 500ms SLO"
+          description: "P95 latency is {{ $value | humanizeDuration }} (SLO: 500ms)."  # $value is the expr result (the current P95)
+          runbook_url: https://wiki.internal/runbooks/payments-high-latency
+
+      - alert: PaymentsP95LatencyHigh  # critical tier; same alert name as the warning tier above, told apart by the severity label
+        expr: service:http_duration:p95{service="payments-api"} > 1.0  # 1.0 = 1s per summary, twice the warning threshold
+        for: 2m  # shorter hold than the warning tier's 3m
+        labels:
+          severity: critical
+          team: payments
+        annotations:
+          summary: "Payments API P95 latency above 1s"
+          description: "P95 latency is {{ $value | humanizeDuration }}. Severely degraded checkout experience."  # $value is the expr result (the current P95)
+          runbook_url: https://wiki.internal/runbooks/payments-high-latency
+
+      # Payment failure rate — catches provider-side issues; "or vector(0)" keeps the ratio defined when a series is absent
+      - alert: PaymentsHighFailureRate  # NOTE(review): timeouts count only in the denominator - confirm that is intended
+        expr: |
+          sum(payments:failed:rate5m) / (
+            (sum(payments:success:rate5m) or vector(0)) + sum(payments:failed:rate5m)
+            + (sum(payments:timeout:rate5m) or vector(0))
+          ) > 0.05
+        for: 2m  # condition must hold for 2m before the alert fires
+        labels:
+          severity: warning
+          team: payments
+        annotations:
+          summary: "Payment failure rate above 5%"
+          description: "{{ $value | humanizePercentage }} of payment attempts are failing."  # $value is failed / (success + failed + timeout)
+          runbook_url: https://wiki.internal/runbooks/payments-high-failure-rate
+
+  - name: host_health
+    rules:
+      - alert: HostHighCPU  # warning tier; the critical tier below reuses this alert name
+        expr: instance:node_cpu:utilization > 0.7  # no label matcher, so every series of this metric is checked; 0.7 presumably means 70% - confirm the metric is a 0-1 ratio
+        for: 5m  # condition must hold for 5m before the alert fires
+        labels:
+          severity: warning
+          team: platform
+        annotations:
+          summary: "High CPU on {{ $labels.instance }}"
+          description: "CPU utilization is {{ $value | humanizePercentage }} for 5+ minutes."  # $value is the expr result for the firing series
+          runbook_url: https://wiki.internal/runbooks/host-high-cpu
+
+      - alert: HostHighCPU  # critical tier; same alert name as the warning tier above, told apart by the severity label
+        expr: instance:node_cpu:utilization > 0.9  # higher threshold than the warning tier's 0.7
+        for: 2m  # shorter hold than the warning tier's 5m
+        labels:
+          severity: critical
+          team: platform
+        annotations:
+          summary: "Critical CPU on {{ $labels.instance }}"
+          description: "CPU utilization is {{ $value | humanizePercentage }}. Risk of GC pauses and request timeouts."  # $value is the expr result for the firing series
+          runbook_url: https://wiki.internal/runbooks/host-high-cpu
+
+      - alert: HostHighMemory  # warning tier; the critical tier below reuses this alert name
+        expr: instance:node_memory:utilization > 0.8  # no label matcher, so every series of this metric is checked; 0.8 presumably means 80% - confirm the metric is a 0-1 ratio
+        for: 5m  # condition must hold for 5m before the alert fires
+        labels:
+          severity: warning
+          team: platform
+        annotations:
+          summary: "High memory on {{ $labels.instance }}"
+          description: "Memory utilization is {{ $value | humanizePercentage }} for 5+ minutes."  # $value is the expr result for the firing series
+          runbook_url: https://wiki.internal/runbooks/host-high-memory
+
+      - alert: HostHighMemory  # critical tier; same alert name as the warning tier above, told apart by the severity label
+        expr: instance:node_memory:utilization > 0.95  # higher threshold than the warning tier's 0.8
+        for: 2m  # shorter hold than the warning tier's 5m
+        labels:
+          severity: critical
+          team: platform
+        annotations:
+          summary: "Critical memory on {{ $labels.instance }}"
+          description: "Memory utilization is {{ $value | humanizePercentage }}. OOM kill risk — likely cause of GC-induced latency spikes."  # $value is the expr result for the firing series
+          runbook_url: https://wiki.internal/runbooks/host-high-memory
+
+      - alert: HostDiskSpaceLow  # single tier only: there is no warning-level counterpart in this view
+        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) < 0.15  # available/size fraction per filesystem, skipping tmpfs and overlay; fires below 15% free
+        for: 5m  # condition must hold for 5m before the alert fires
+        labels:
+          severity: critical
+          team: platform
+        annotations:
+          summary: "Disk space low on {{ $labels.instance }} ({{ $labels.mountpoint }})"
+          description: "Only {{ $value | humanizePercentage }} disk space remaining on {{ $labels.mountpoint }}."  # $value is the free fraction computed by expr
+          runbook_url: https://wiki.internal/runbooks/host-disk-space-low