#!/usr/bin/env python3
"""Job Runner Reliability Analysis — equivalent to MATLAB/Octave numerical pipeline."""

import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime

# ── Load data ──────────────────────────────────────────────────────────────
df = pd.read_csv('runner_metrics.csv')
# Keep the raw timestamp strings BEFORE coercion so invalid rows can be
# repaired without re-reading the CSV from disk for every bad row
# (the original re-parsed the whole file once per NaT row).
_raw_ts = df['timestamp'].astype(str)
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
# Fix rows with invalid timestamps (e.g. 09:60:08) — keep the date portion
# (first 10 chars, 'YYYY-MM-DD') and substitute a 10:00:00 placeholder time.
for i in df[df['timestamp'].isna()].index:
    df.loc[i, 'timestamp'] = pd.to_datetime(_raw_ts.loc[i][:10] + ' 10:00:00')
df['date'] = df['timestamp'].dt.date

# ── 1. Daily reliability trend ────────────────────────────────────────────
print("=" * 70)
print("  JOB RUNNER RELIABILITY REPORT  —  Generated", datetime.now().strftime("%Y-%m-%d %H:%M"))
print("=" * 70)

print("\n── 1. DAILY RELIABILITY TREND ──────────────────────────────────────\n")
print(f"{'Date':<14} {'Jobs':>5} {'Success':>8} {'Rate':>8} {'Mean Lat':>10} {'P95 Lat':>10} {'Mean Queue':>11}")
print("-" * 70)

# Per-day aggregates: job count, success count/rate, latency mean & P95,
# mean queue time. Printed as a table and collected for charting below.
daily = []
for date, grp in df.groupby('date'):
    n = len(grp)
    n_ok = (grp['status'] == 'success').sum()
    rate = n_ok / n * 100
    mean_lat = grp['latency_ms'].mean()
    p95_lat = np.percentile(grp['latency_ms'], 95)
    mean_q = grp['queue_time_ms'].mean()
    daily.append({'date': date, 'jobs': n, 'success': n_ok, 'rate': rate,
                  'mean_lat': mean_lat, 'p95_lat': p95_lat, 'mean_q': mean_q})
    print(f"{str(date):<14} {n:>5} {n_ok:>8} {rate:>7.1f}% {mean_lat:>9.1f}ms {p95_lat:>9.1f}ms {mean_q:>10.1f}ms")

daily_df = pd.DataFrame(daily)

# Trend direction: least-squares slope of the daily success rate.
# np.polyfit needs >= 2 points for a degree-1 fit — with fewer days,
# report a flat (STABLE) trend instead of raising.
x = np.arange(len(daily_df))
if len(daily_df) >= 2:
    slope, intercept = np.polyfit(x, daily_df['rate'].values, 1)
else:
    slope, intercept = 0.0, 0.0  # not enough days to fit a trend
trend_dir = "IMPROVING" if slope > 0 else "DEGRADING" if slope < 0 else "STABLE"
print(f"\nTrend: {trend_dir} (slope = {slope:+.2f}% per day)")

# ── 2. Anomaly detection ─────────────────────────────────────────────────
print("\n── 2. ANOMALY DETECTION (latency > mean + 2*std) ────────────────────\n")

# A job is anomalous when its latency exceeds the global mean by more
# than two standard deviations.
lat_mean = df['latency_ms'].mean()
lat_std = df['latency_ms'].std()
threshold = lat_mean + 2 * lat_std

print(f"Global latency:  mean = {lat_mean:.1f} ms,  std = {lat_std:.1f} ms")
print(f"Anomaly threshold: {threshold:.1f} ms\n")

anomalies = df[df['latency_ms'] > threshold].copy()
if anomalies.empty:
    print("No anomalies detected.")
else:
    print(f"{'Timestamp':<22} {'Job ID':<12} {'Latency':>10} {'Queue':>10} {'Status':<10} {'Worker':<10}")
    print("-" * 76)
    for row in anomalies.itertuples(index=False):
        print(f"{str(row.timestamp):<22} {row.job_id:<12} {row.latency_ms:>9}ms "
              f"{row.queue_time_ms:>9}ms {row.status:<10} {row.worker_node:<10}")

# Apply the same mean + 2σ rule to queue time, reporting only a count.
q_mean = df['queue_time_ms'].mean()
q_std = df['queue_time_ms'].std()
q_threshold = q_mean + 2 * q_std
q_anomalies = df[df['queue_time_ms'] > q_threshold]
print(f"\nQueue time anomalies (>{q_threshold:.1f}ms): {len(q_anomalies)} jobs")

# ── 3. Per-worker breakdown ───────────────────────────────────────────────
print("\n── 3. PER-WORKER NODE BREAKDOWN ────────────────────────────────────\n")
print(f"{'Worker':<12} {'Jobs':>5} {'Success':>8} {'Rate':>8} {'Mean Lat':>10} {'Timeouts':>9} {'Failures':>9}")
print("-" * 65)

# One table row per worker node: volume, outcome counts, success rate,
# and mean latency.
for worker, sub in df.groupby('worker_node'):
    outcome = sub['status'].value_counts()
    n = len(sub)
    n_ok = outcome.get('success', 0)
    pct_ok = n_ok / n * 100
    avg_lat = sub['latency_ms'].mean()
    print(f"{worker:<12} {n:>5} {n_ok:>8} {pct_ok:>7.1f}% {avg_lat:>9.1f}ms "
          f"{outcome.get('timeout', 0):>9} {outcome.get('failure', 0):>9}")

# ── 4. Overall summary ───────────────────────────────────────────────────
print("\n── 4. OVERALL SUMMARY ──────────────────────────────────────────────\n")

# Global outcome counts (total/overall_rate/total_fail are reused by the
# go/no-go section below, so their names are kept).
status_counts = df['status'].value_counts()
total = len(df)
total_ok = status_counts.get('success', 0)
total_to = status_counts.get('timeout', 0)
total_fail = status_counts.get('failure', 0)
overall_rate = total_ok / total * 100

# Latency stats with anomalous jobs (above the 2σ threshold) excluded.
nominal_lat = df.loc[df['latency_ms'] <= threshold, 'latency_ms']
nominal_mean = nominal_lat.mean()
nominal_p95 = np.percentile(nominal_lat, 95)

print(f"  Total jobs:          {total}")
print(f"  Successful:          {total_ok}  ({overall_rate:.1f}%)")
print(f"  Timeouts:            {total_to}  ({total_to/total*100:.1f}%)")
print(f"  Failures:            {total_fail}  ({total_fail/total*100:.1f}%)")
print(f"  Overall success:     {overall_rate:.1f}%")
print(f"  Mean latency (all):  {lat_mean:.1f} ms")
print(f"  P95 latency (all):   {np.percentile(df['latency_ms'], 95):.1f} ms")
print(f"  Mean latency (ok):   {nominal_mean:.1f} ms")
print(f"  P95 latency (ok):    {nominal_p95:.1f} ms")
print(f"  Mean queue time:     {q_mean:.1f} ms")

# ── 5. Trend chart ────────────────────────────────────────────────────────
# Three stacked panels sharing the x-axis: daily success rate (bars),
# daily latency trend (mean + P95 lines), and per-job latency scatter
# colored by job status.
fig, axes = plt.subplots(3, 1, figsize=(10, 10), sharex=True)

# Derive the covered date span from the data rather than hard-coding it,
# so the title stays correct when the input window changes.
d0, d1 = daily_df['date'].min(), daily_df['date'].max()
if (d0.year, d0.month) == (d1.year, d1.month):
    span = f"{d0.strftime('%b')} {d0.day}\u2013{d1.day}, {d1.year}"
else:
    span = f"{d0.strftime('%b')} {d0.day}, {d0.year} \u2013 {d1.strftime('%b')} {d1.day}, {d1.year}"
fig.suptitle(f'Job Runner Reliability Report  ({span})', fontsize=14, fontweight='bold')

dates = daily_df['date']

# 5a — Success rate bars, green when >= 90%, red otherwise.
# NOTE(review): green cutoff (90%) differs from the 95% target line, so a
# 92% day renders green while below target — confirm this is intentional.
ax1 = axes[0]
ax1.bar(dates, daily_df['rate'], color=['#2ecc71' if r >= 90 else '#e74c3c' for r in daily_df['rate']], width=0.6)
ax1.axhline(y=95, color='gray', linestyle='--', alpha=0.6, label='95% target')
ax1.set_ylabel('Success Rate (%)')
ax1.set_ylim(70, 105)
ax1.legend(loc='lower left')
ax1.set_title('Daily Success Rate')
# Value labels just above each bar.
for i, r in enumerate(daily_df['rate']):
    ax1.text(dates.iloc[i], r + 1.5, f'{r:.0f}%', ha='center', fontweight='bold')

# 5b — Daily mean and P95 latency, with the anomaly threshold for context.
ax2 = axes[1]
ax2.plot(dates, daily_df['mean_lat'], 'o-', color='#3498db', linewidth=2, label='Mean')
ax2.plot(dates, daily_df['p95_lat'], 's--', color='#e67e22', linewidth=2, label='P95')
ax2.axhline(y=threshold, color='red', linestyle=':', alpha=0.5, label=f'Anomaly ({threshold:.0f}ms)')
ax2.set_ylabel('Latency (ms)')
ax2.legend()
ax2.set_title('Latency Trend')

# 5c — Scatter of every job, colored by its status.
ax3 = axes[2]
color_map = {'success': '#2ecc71', 'timeout': '#e67e22', 'failure': '#e74c3c'}
for status, grp in df.groupby('status'):
    ax3.scatter(grp['timestamp'], grp['latency_ms'], c=color_map[status],
                label=status, s=50, edgecolors='white', linewidth=0.5, zorder=3)
ax3.axhline(y=threshold, color='red', linestyle=':', alpha=0.5)
ax3.set_ylabel('Latency (ms)')
ax3.set_xlabel('Timestamp')
ax3.legend()
ax3.set_title('Individual Job Latency (colored by status)')

plt.tight_layout()
plt.savefig('runner_reliability_report.png', dpi=150, bbox_inches='tight')
print("\n[Chart saved to runner_reliability_report.png]")

# ── 6. Go / No-Go note ───────────────────────────────────────────────────
print("\n── 5. GO / NO-GO NOTE ─────────────────────────────────────────────\n")

# Collect concrete release risks from the metrics computed above.
risks = []
if overall_rate < 95:
    risks.append(f"Overall success rate ({overall_rate:.1f}%) is below 95% target")
if total_fail > 0:
    risks.append(f"{total_fail} hard failure(s) detected (not just timeouts)")
if slope < -1:
    risks.append(f"Success rate trending downward at {slope:.1f}% per day")

# Flag ANY worker node below 80% success. The original checked only
# 'worker-1' via df[...] filtering, which divides by zero when that node
# has no rows; iterating groupby groups avoids the hazard and generalizes
# the check (output is unchanged when worker-1 is the only low performer).
for worker, grp in df.groupby('worker_node'):
    w_rate = (grp['status'] == 'success').sum() / len(grp) * 100
    if w_rate < 80:
        risks.append(f"{worker} reliability is only {w_rate:.0f}% — investigate node health")

if not risks:
    print("RECOMMENDATION: GO")
    print("All metrics within acceptable thresholds.")
else:
    print("RECOMMENDATION: CONDITIONAL GO — address risks below")
    for i, r in enumerate(risks, 1):
        print(f"  {i}. {r}")
    print(f"\nMitigation: {len(anomalies)} of {total} jobs ({len(anomalies)/total*100:.0f}%) are anomalous.")
    print("If anomalies cluster on a single node, consider draining that node before proceeding.")

print("\n" + "=" * 70)
