import pandas as pd
import numpy as np

# ── 1. Load raw data ────────────────────────────────────────────────
ad_spend = pd.read_csv("ad_spend.csv")
leads = pd.read_csv("leads.csv")
crm = pd.read_csv("crm_accounts.csv")

# ── 2. Assess data quality ──────────────────────────────────────────
_frames = (("ad_spend", ad_spend), ("leads", leads), ("crm", crm))

print("=== DATA QUALITY REPORT ===\n")
for _name, _frame in _frames:
    # Shape and per-column null counts give a quick sanity read per source.
    print(f"--- {_name} ---")
    print(f"  shape: {_frame.shape}")
    print(f"  nulls:\n{_frame.isna().sum().to_string()}\n")

print("Channel values BEFORE normalization:")
for _name, _frame in _frames:
    # Left-pad the label to 11 chars so the lists line up in one column.
    print(f"  {_name + ':':<11}{sorted(_frame['channel'].dropna().unique())}")

# ── 3. Normalize channel names ──────────────────────────────────────
# Canonical channel labels keyed by the stripped, lower-cased raw value.
CHANNEL_MAP = {
    "google": "Google Ads",
    "google ads": "Google Ads",
    "facebook": "Facebook Ads",
    "facebook ads": "Facebook Ads",
    "linkedin": "LinkedIn",
    "email": "Email",
}

def normalize_channel(series: pd.Series) -> pd.Series:
    """Map free-text channel labels to canonical names via CHANNEL_MAP.

    Values are stripped and lower-cased before lookup; nulls stay null.
    Any non-null value absent from CHANNEL_MAP still becomes NaN (as
    before), but is now reported loudly so silent data loss is visible
    in the run log instead of disappearing into the dropna() calls.
    """
    cleaned = series.str.strip().str.lower()
    known = cleaned.dropna()
    unknown = sorted(known[~known.isin(CHANNEL_MAP)].unique())
    if unknown:
        print(f"  WARNING: unmapped channel values (will become NaN): {unknown}")
    return cleaned.map(CHANNEL_MAP)

# Normalize the channel column in every source in one pass.
for _df in (ad_spend, leads, crm):
    _df["channel"] = normalize_channel(_df["channel"])

print("\nChannel values AFTER normalization:")
for _name, _df in (("ad_spend", ad_spend), ("leads", leads), ("crm", crm)):
    # Pad the label to 11 chars to reproduce the aligned column layout.
    print(f"  {_name + ':':<11}{sorted(_df['channel'].dropna().unique())}")

# ── 4. Handle missing values ────────────────────────────────────────
# LinkedIn W16 spend is NaN — interpolate from W15 & W17
ad_spend["spend"] = pd.to_numeric(ad_spend["spend"], errors="coerce")
linkedin_mask = ad_spend["channel"] == "LinkedIn"
# Interpolate in week order so "linear" really averages the adjacent
# weeks regardless of the CSV's physical row order (the original relied
# on the file already being sorted). Assignment aligns on the original
# row index, so only the LinkedIn rows are touched.
linkedin_rows = ad_spend.loc[linkedin_mask].sort_values("week")
ad_spend.loc[linkedin_rows.index, "spend"] = (
    linkedin_rows["spend"].interpolate(method="linear")
)
# Guard the lookup: .values[0] on an empty selection would raise IndexError.
_w16 = ad_spend.loc[linkedin_mask & (ad_spend["week"] == "2026-W16"), "spend"]
if not _w16.empty:
    filled_val = _w16.iloc[0]
    print(f"\nLinkedIn W16 spend interpolated to: ${filled_val:,.2f}")
else:
    filled_val = float("nan")
    print("\nWARNING: no LinkedIn 2026-W16 row found — nothing interpolated")

# Lead L1016 has no channel — flag and exclude from channel metrics
missing_channel_leads = leads[leads["channel"].isna()]
print(f"Leads with missing channel (excluded): {missing_channel_leads['lead_id'].tolist()}")
leads_clean = leads.dropna(subset=["channel"]).copy()

# ── 5. Build qualified-lead counts per channel/week ─────────────────
# Count distinct qualified lead IDs for each (channel, week) pair.
qualified = leads_clean[leads_clean["status"] == "qualified"].copy()
qual_counts = (
    qualified
    .groupby(["channel", "week"], observed=True)["lead_id"]
    .nunique()
    .rename("qualified_leads")
    .reset_index()
)

# ── 6. Build customer & revenue from CRM (closed_won only) ─────────
# CRM has no week column — get it by joining back to leads
won = crm[crm["stage"] == "closed_won"].copy()
# De-duplicate lead_id on the right side of the merge: a lead appearing
# twice in leads_clean would fan out the join and double-count its
# deal_value in the revenue sum below (nunique protects customers, but
# sum has no such protection).
lead_weeks = leads_clean[["lead_id", "week"]].drop_duplicates(subset="lead_id")
won_with_week = won.merge(lead_weeks, on="lead_id", how="left")
# Use the already-normalized channel from crm (not the leads channel)
cust_rev = (
    won_with_week
    .groupby(["channel", "week"], observed=True)
    .agg(
        customers=("account_id", "nunique"),
        revenue=("deal_value", "sum"),
    )
    .reset_index()
)

# ── 7. Merge everything on (channel, week) ──────────────────────────
snapshot = (
    ad_spend[["channel", "week", "spend", "impressions", "clicks"]]
    .merge(qual_counts, on=["channel", "week"], how="left")
    .merge(cust_rev, on=["channel", "week"], how="left")
)
# Channel/weeks with no leads or no wins are genuine zeros, not unknowns.
snapshot = snapshot.fillna({"qualified_leads": 0, "customers": 0, "revenue": 0.0})
for _count_col in ("qualified_leads", "customers"):
    snapshot[_count_col] = snapshot[_count_col].astype(int)

# ── 8. Compute marketing metrics ───────────────────────────────────
def _safe_ratio(num: pd.Series, den) -> np.ndarray:
    """Element-wise num/den rounded to 2 dp; NaN where den is not positive."""
    return np.where(den > 0, (num / den).round(2), np.nan)

# Guard every denominator: the original left cpc and ctr_pct unguarded,
# so a channel/week with zero clicks or impressions emitted inf into the
# snapshot CSV while cpl/cac/avg_deal correctly produced NaN.
snapshot["cpc"] = _safe_ratio(snapshot["spend"], snapshot["clicks"])
snapshot["ctr_pct"] = np.where(
    snapshot["impressions"] > 0,
    ((snapshot["clicks"] / snapshot["impressions"]) * 100).round(2),
    np.nan,
)
snapshot["cpl"] = _safe_ratio(snapshot["spend"], snapshot["qualified_leads"])
snapshot["cac"] = _safe_ratio(snapshot["spend"], snapshot["customers"])
snapshot["avg_deal"] = _safe_ratio(snapshot["revenue"], snapshot["customers"])
# NaN cac compares False against 0, so rows without customers stay NaN.
snapshot["ltv_to_cac"] = _safe_ratio(snapshot["avg_deal"], snapshot["cac"])

# Sort for presentation
snapshot = snapshot.sort_values(["week", "channel"]).reset_index(drop=True)

# ── 9. Channel-level rollup (all weeks) ────────────────────────────
def _add_ratio_metrics(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with the derived ratio columns attached.

    Factors out the five metric formulas that were previously duplicated
    for the per-channel rollup and the TOTAL row, and guards every
    denominator so a zero produces NaN instead of inf in the output CSV.
    """
    out = df.copy()
    out["blended_cac"] = np.where(
        out["total_customers"] > 0,
        (out["total_spend"] / out["total_customers"]).round(2), np.nan)
    out["blended_cpl"] = np.where(
        out["total_qualified_leads"] > 0,
        (out["total_spend"] / out["total_qualified_leads"]).round(2), np.nan)
    out["avg_deal_value"] = np.where(
        out["total_customers"] > 0,
        (out["total_revenue"] / out["total_customers"]).round(2), np.nan)
    out["ltv_to_cac"] = np.where(
        out["blended_cac"] > 0,
        (out["avg_deal_value"] / out["blended_cac"]).round(2), np.nan)
    out["ctr_pct"] = np.where(
        out["total_impressions"] > 0,
        ((out["total_clicks"] / out["total_impressions"]) * 100).round(2), np.nan)
    return out

rollup = (
    snapshot
    .groupby("channel", observed=True)
    .agg(
        total_spend=("spend", "sum"),
        total_impressions=("impressions", "sum"),
        total_clicks=("clicks", "sum"),
        total_qualified_leads=("qualified_leads", "sum"),
        total_customers=("customers", "sum"),
        total_revenue=("revenue", "sum"),
    )
    .reset_index()
)
rollup = _add_ratio_metrics(rollup)

# Grand total row — same base columns, same derived metrics via the helper
totals = _add_ratio_metrics(pd.DataFrame([{
    "channel": "TOTAL",
    "total_spend": rollup["total_spend"].sum(),
    "total_impressions": rollup["total_impressions"].sum(),
    "total_clicks": rollup["total_clicks"].sum(),
    "total_qualified_leads": rollup["total_qualified_leads"].sum(),
    "total_customers": rollup["total_customers"].sum(),
    "total_revenue": rollup["total_revenue"].sum(),
}]))
rollup = pd.concat([rollup, totals], ignore_index=True)

# ── 10. Save outputs ───────────────────────────────────────────────
for _path, _frame in (
    ("marketing_snapshot_weekly.csv", snapshot),
    ("marketing_snapshot_rollup.csv", rollup),
):
    _frame.to_csv(_path, index=False)

# ── 11. Print summary for CFO review ───────────────────────────────
_rule = "=" * 70
print("\n" + _rule)
print("MARKETING SNAPSHOT — CFO REVIEW (W13–W17, 2026)")
print(_rule)

# Render every float as currency in both tables.
_money = "${:,.2f}".format

print("\n── WEEKLY DETAIL ──")
cols_display = ["channel", "week", "spend", "qualified_leads", "customers", "revenue", "cac", "cpl", "ltv_to_cac"]
print(snapshot[cols_display].to_string(index=False, float_format=_money))

print("\n── CHANNEL ROLLUP ──")
print(rollup.to_string(index=False, float_format=_money))

# ── 12. Data quality notes ──────────────────────────────────────────
# Only note 2 interpolates a runtime value; the rest are fixed text, so
# the needless f-prefixes (and the constant {6500}/{7000} placeholders)
# are dropped — output is byte-identical.
print("\n── DATA QUALITY NOTES ──")
print("  1. Channel names normalized across 3 sources (fixed 'Facebook'→'Facebook Ads', case mismatches)")
print(f"  2. LinkedIn W16 spend was missing — interpolated to ${filled_val:,.2f} (avg of W15 $6500 & W17 $7000)")
print("  3. Lead L1016 (W15, qualified) excluded — no source channel recorded")
print("  4. 'pending' leads (L1007 W13, L1019 W15) excluded from qualified counts")
print("  5. 'unqualified' leads (L1003, L1009, L1017, L1029) excluded from qualified counts")

# ── 13. Validation ──────────────────────────────────────────────────
_core_cols = ["spend", "impressions", "clicks", "qualified_leads", "customers", "revenue"]
assert snapshot[_core_cols].isna().sum().sum() == 0, \
    "Unexpected nulls in core metrics"
assert snapshot["spend"].sum() > 0, "Total spend should be positive"
# Soft check — the original `assert ... or True` was vacuous (always
# passed). A customer CAN legitimately close in a later week than the
# lead qualified, so warn rather than crash when the inequality fails.
if not (snapshot["customers"] <= snapshot["qualified_leads"]).all():
    print("  WARNING: some channel/weeks report more customers than qualified leads")
print("\n✓ All validations passed. Files saved:")
print("  → marketing_snapshot_weekly.csv")
print("  → marketing_snapshot_rollup.csv")
