#!/usr/bin/env python3
"""Plot daily Perlmutter GPU utilization as a bar chart for all days in the
local Parquet clone (2026 allocation year so far).

Reproduces the proration logic in duck_util.py but vectorised across all days
in one SQL pass, then renders a matplotlib bar chart.

Run:
    module load python/3.13-26.1.0
    USAGE_STORE=$SCRATCH/nersc-usage python3 plot_util.py \
        --start 2026-01-21 --end 2026-05-14 \
        --out   $SCRATCH/nersc-usage/util-2026.png
"""
import argparse
import os
from datetime import date, timedelta

import duckdb
import matplotlib

matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

STORE = os.environ.get("USAGE_STORE", "/pscratch/sd/w/wbhimji/nersc-usage")


def main():
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--start", required=True, help="YYYY-MM-DD inclusive")
    ap.add_argument("--end",   required=True, help="YYYY-MM-DD inclusive")
    ap.add_argument("--resource", choices=["gpu", "cpu"], default="gpu")
    ap.add_argument("--out",   required=True, help="output PNG path")
    args = ap.parse_args()

    start = date.fromisoformat(args.start)
    end   = date.fromisoformat(args.end)
    if args.resource == "gpu":
        zone, host = "gpu", "perlmutter gpu"
    else:
        zone, host = "cpu", "perlmutter cpu"

    con = duckdb.connect()
    con.execute(f"""
        CREATE OR REPLACE VIEW jobs AS
        SELECT * FROM read_parquet('{STORE}/jobs/**/*.parquet', hive_partitioning=true)
    """)
    con.execute("SET TimeZone = 'US/Pacific'")

    sql = f"""
    WITH RECURSIVE
    days(d) AS (
      SELECT DATE '{start}'
      UNION ALL
      SELECT d + INTERVAL 1 DAY FROM days WHERE d < DATE '{end}'
    ),
    bounds AS (
      SELECT
        d AS day,
        CAST(d AS TIMESTAMP) AT TIME ZONE 'US/Pacific'                       AS win_start,
        CAST(d + INTERVAL 1 DAY AS TIMESTAMP) AT TIME ZONE 'US/Pacific'      AS win_end
      FROM days
    ),
    overlapped AS (
      SELECT
        b.day,
        j."QOS"           AS qos,
        j."ReservationId" AS rid,
        j."JobID"         AS jid,
        j."RawHours"      AS rawhours,
        j."ElapsedSecs"   AS elapsed,
        j."ZoneSize"      AS zonesize,
        CAST(j."Start" AS TIMESTAMP) AT TIME ZONE 'UTC'                                       AS start_ts,
        CAST(j."Start" AS TIMESTAMP) AT TIME ZONE 'UTC' + INTERVAL (j."ElapsedSecs") SECOND   AS end_ts,
        b.win_start, b.win_end
      FROM jobs j
      JOIN bounds b
        ON  CAST(j."Start" AS TIMESTAMP) AT TIME ZONE 'UTC' < b.win_end
        AND CAST(j."Start" AS TIMESTAMP) AT TIME ZONE 'UTC' + INTERVAL (j."ElapsedSecs") SECOND > b.win_start
      WHERE j."ZoneName"  = '{zone}'
        AND j."hostname"  = '{host}'
        AND j."ElapsedSecs" > 0
    ),
    prorated AS (
      SELECT
        day, qos, rid, jid, zonesize, rawhours, elapsed,
        date_diff('second', GREATEST(start_ts, win_start), LEAST(end_ts, win_end)) AS overlap_secs,
        win_start, win_end
      FROM overlapped
    ),
    prorated_hours AS (
      SELECT day, qos, rid, jid, zonesize,
             rawhours * overlap_secs / elapsed AS prorated_rh,
             win_start, win_end
      FROM prorated
      WHERE overlap_secs > 0
    ),
    nonres AS (
      SELECT day, SUM(prorated_rh) AS rh
      FROM prorated_hours WHERE qos <> 'RESERVE' GROUP BY day
    ),
    res_groups AS (
      SELECT day, COALESCE(NULLIF(rid, ''), jid) AS group_key, MAX(prorated_rh) AS prorated_rh
      FROM prorated_hours WHERE qos = 'RESERVE' GROUP BY day, group_key
    ),
    res AS (
      SELECT day, SUM(prorated_rh) AS rh FROM res_groups GROUP BY day
    ),
    cap AS (
      SELECT day, MAX(zonesize) AS capacity FROM prorated_hours GROUP BY day
    ),
    win AS (
      SELECT day, (epoch(win_end) - epoch(win_start)) / 3600.0 AS hours FROM bounds
    )
    SELECT
      b.day                                                AS day,
      COALESCE(nonres.rh, 0) + COALESCE(res.rh, 0)        AS prorated_rh,
      cap.capacity                                         AS capacity,
      win.hours                                            AS day_hours,
      100.0 * (COALESCE(nonres.rh, 0) + COALESCE(res.rh, 0)) / (cap.capacity * win.hours) AS utilization_pct
    FROM bounds b
    LEFT JOIN nonres USING (day)
    LEFT JOIN res    USING (day)
    LEFT JOIN cap    USING (day)
    LEFT JOIN win    USING (day)
    ORDER BY day
    """
    rows = con.execute(sql).fetchall()
    days = [r[0] for r in rows]
    util = [float(r[4]) if r[4] is not None else 0.0 for r in rows]

    # Plot
    fig, ax = plt.subplots(figsize=(14, 5))
    ax.bar(days, util, color="#2e7d32", width=0.85)
    ax.set_ylim(0, 100)
    ax.set_ylabel("Node Hour Utilization (%)")
    ax.set_xlabel("Date")
    ax.set_title(
        f"Daily Perlmutter {args.resource.upper()} Node-Hour Utilization, "
        f"{start} → {end}"
    )
    ax.grid(axis="y", alpha=0.3)
    ax.xaxis.set_major_locator(mdates.WeekdayLocator(byweekday=mdates.MO))
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%b %d"))
    fig.autofmt_xdate()
    fig.tight_layout()
    fig.savefig(args.out, dpi=120)
    print(f"wrote {args.out}")
    print(f"days plotted: {len(days)}  mean util: {sum(util)/len(util):.2f}%")


if __name__ == "__main__":
    main()
