# Patch summary. This is a git unified diff whose line breaks have been collapsed to spaces;
# the text below the header is unchanged.
#
#   * .gitignore               - additionally ignores **/__pycache__.
#   * build_season_schedule.py - adds `import typer`; main() now receives subseason, teams, out,
#                                fetch_time and sleep as typer.Option parameters instead of
#                                building an argparse parser; the entry point becomes
#                                typer.run(main).
#   * compute_ratings.py       - removes `import argparse` and parse_args(); load_games() takes
#                                inp, team_id and final_status explicitly instead of a namespace;
#                                main() receives its settings as typer.Option parameters; the
#                                `sos = games.assign(...)` placeholder is dropped; the local
#                                result frame is renamed from `out` to `out_df` now that `out`
#                                is the output-path parameter.
#   * requirements.txt         - new file pinning typer[all], pandas, numpy, beautifulsoup4 and
#                                requests.
#
# NOTE(review): compute_ratings.py previously exposed the input path as `--in` (dest="inp"). The
#   new parameter is named `inp` and no explicit option name is passed to typer.Option, so the
#   flag presumably becomes `--inp` - confirm against existing invocations.
# NOTE(review): `--team-id` used argparse `choices=["names","slugs"]`. The new parameter is a
#   plain `str`, and load_games() selects the slug columns for any value other than "names", so
#   an unrecognised value no longer looks like it is rejected - confirm this is intended.
# NOTE(review): `no_massey_home_adj` and `fetch_time` are `bool` options without explicit flag
#   names; Typer is documented to generate paired on/off flags for booleans, so the help output
#   presumably gains `--no-no-massey-home-adj` and `--no-fetch-time` - confirm.
# NOTE(review): no hunk here removes an argparse import from build_season_schedule.py; if that
#   file imports argparse it is presumably unused after this change - verify.
# NOTE(review): build_season_schedule.py imports `dateutil` (context line in the first hunk) but
#   requirements.txt does not list it directly. Also verify that the pinned Typer release still
#   defines an `all` extra.
diff --git a/.gitignore b/.gitignore index ae0906e..db975fe 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ /*.csv -/*.numbers \ No newline at end of file +/*.numbers +**/__pycache__ \ No newline at end of file diff --git a/build_season_schedule.py b/build_season_schedule.py index 008d6bc..d6b1046 100644 --- a/build_season_schedule.py +++ b/build_season_schedule.py @@ -30,6 +30,7 @@ from urllib.parse import urlencode import requests from bs4 import BeautifulSoup from dateutil import parser as dtp +import typer # ----------------- logging ----------------- logging.basicConfig( @@ -264,16 +265,14 @@ def fetch_game_time(game_id: str, session: requests.Session) -> Optional[str]: return None # ----------------- build & merge ----------------- -def main(): - ap = argparse.ArgumentParser(description="Build a deduped season schedule with IDs, winners/losers, runs, and times.") - ap.add_argument("--subseason", required=True, help="Subseason ID, e.g. 942425") - ap.add_argument("--teams", required=True, help="Path to teams.json (array with team_id, team_slug, instance_id, teamName)") - ap.add_argument("--out", default="season_schedule.csv", help="Output CSV path") - ap.add_argument("--fetch-time", action="store_true", help="Fetch game time from /game/show/") - ap.add_argument("--sleep", type=float, default=0.35, help="Delay between requests (seconds)") - args = ap.parse_args() - - by_instance, by_slug, by_norm = load_teams(args.teams) +def main( + subseason: str = typer.Option(..., help="Subseason ID, e.g. 
942425"), + teams: str = typer.Option(..., help="Path to teams.json (array with team_id, team_slug, instance_id, teamName)"), + out: str = typer.Option("season_schedule.csv", help="Output CSV path"), + fetch_time: bool = typer.Option(False, help="Fetch game time from /game/show/"), + sleep: float = typer.Option(0.35, help="Delay between requests (seconds)") +): + by_instance, by_slug, by_norm = load_teams(teams) instance_ids = sorted(by_instance.keys()) session = requests.Session() @@ -283,8 +282,8 @@ def main(): raw: List[dict] = [] for i, iid in enumerate(instance_ids, 1): logging.info(f"[{i}/{len(instance_ids)}] Fetching schedule for instance {iid}") - raw.extend(parse_printable(iid, args.subseason, session=session)) - time.sleep(args.sleep) # be polite + raw.extend(parse_printable(iid, subseason, session=session)) + time.sleep(sleep) # be polite def rec_from_instance(iid: str) -> Optional[TeamRec]: return by_instance.get(iid) @@ -407,7 +406,7 @@ def main(): # -------- NEW: fetch game start time from game page -------- time_local = "" - if args.fetch_time and game_id: + if fetch_time and game_id: if game_id in time_cache: tval = time_cache[game_id] else: @@ -415,8 +414,7 @@ def main(): tval = fetch_game_time(game_id, session=session) time_cache[game_id] = tval if tval is None: - # small backoff to be nice if many misses - time.sleep(min(args.sleep * 2, 1.0)) + time.sleep(min(sleep * 2, 1.0)) if tval: time_local = tval @@ -452,13 +450,13 @@ def main(): "loser_slug","loser_instance","loser_id", "location","status","game_id","source_urls", ] - with open(args.out, "w", newline="", encoding="utf-8") as f: + with open(out, "w", newline="", encoding="utf-8") as f: w = csv.DictWriter(f, fieldnames=fieldnames) w.writeheader() for r in out_rows: w.writerow(r) - logging.info(f"Wrote {len(out_rows)} games → {args.out}") + logging.info(f"Wrote {len(out_rows)} games → {out}") if __name__ == "__main__": - main() \ No newline at end of file + typer.run(main) diff --git 
a/compute_ratings.py b/compute_ratings.py index 9d7a59e..434049a 100644 --- a/compute_ratings.py +++ b/compute_ratings.py @@ -19,43 +19,27 @@ Defaults: """ from __future__ import annotations -import argparse import math import numpy as np import pandas as pd +import typer -def parse_args(): - p = argparse.ArgumentParser(description="Power ratings from season_schedule.csv") - p.add_argument("--in", dest="inp", required=True, help="Input CSV (season_schedule.csv)") - p.add_argument("--out", dest="out", required=True, help="Output ratings CSV") - p.add_argument("--team-id", choices=["names","slugs"], default="names", - help="Use team names or slugs as identifiers (default: names)") - p.add_argument("--final-status", default=None, - help="Only include games where status == this value (e.g., 'final'). If omitted, any row with scores is included.") - # Tunables - p.add_argument("--pyexp", type=float, default=1.83, help="Pythagorean exponent") - p.add_argument("--massey-cap", type=float, default=8.0, help="Cap for run margins in Massey") - p.add_argument("--no-massey-home-adj", action="store_true", - help="Disable subtracting estimated home-field runs in Massey") - p.add_argument("--elo-k", type=float, default=24.0, help="Elo K-factor") - p.add_argument("--elo-home", type=float, default=30.0, help="Elo home bonus (points)") - p.add_argument("--elo-mcap", type=float, default=2.0, help="Cap for margin factor ln(|m|+1)") - p.add_argument("--elo-shuffles", type=int, default=20, help="Random shuffles to average Elo") - p.add_argument("--elo-seed", type=int, default=42, help="RNG seed for shuffles") - return p.parse_args() - -def load_games(a) -> pd.DataFrame: - df = pd.read_csv(a.inp) +def load_games( + inp: str, + team_id: str = "names", + final_status: str | None = None, +) -> pd.DataFrame: + df = pd.read_csv(inp) # Choose identifiers - home_id_col = "home_name" if a.team_id == "names" else "home_slug" - away_id_col = "away_name" if a.team_id == "names" else "away_slug" + 
home_id_col = "home_name" if team_id == "names" else "home_slug" + away_id_col = "away_name" if team_id == "names" else "away_slug" for c in [home_id_col, away_id_col, "home_runs", "away_runs"]: if c not in df.columns: raise ValueError(f"Missing required column: {c}") # Optional status filter (helps exclude postponed/canceled) - if a.final_status is not None and "status" in df.columns: - df = df[df["status"].astype(str).str.lower() == str(a.final_status).lower()] + if final_status is not None and "status" in df.columns: + df = df[df["status"].astype(str).str.lower() == str(final_status).lower()] # Keep only games with numeric scores df = df.copy() @@ -173,52 +157,71 @@ def zscore(s: pd.Series) -> pd.Series: mu, sd = s.mean(), s.std(ddof=0) return pd.Series(0.0, index=s.index) if (sd == 0 or np.isnan(sd)) else (s - mu) / sd -def main(): - a = parse_args() - games = load_games(a) +def main( + inp: str = typer.Option(..., help="Input CSV (season_schedule.csv)"), + out: str = typer.Option(..., help="Output ratings CSV"), + team_id: str = typer.Option( + "names", + help="Use team names or slugs as identifiers (default: names)", + show_default=True, + case_sensitive=False, + prompt=False, + ), + final_status: str | None = typer.Option(None, help="Only include games where status == this value (e.g., 'final'). 
If omitted, any row with scores is included."), + pyexp: float = typer.Option(1.83, help="Pythagorean exponent"), + massey_cap: float = typer.Option(8.0, help="Cap for run margins in Massey"), + no_massey_home_adj: bool = typer.Option(False, help="Disable subtracting estimated home-field runs in Massey"), + elo_k: float = typer.Option(24.0, help="Elo K-factor"), + elo_home: float = typer.Option(30.0, help="Elo home bonus (points)"), + elo_mcap: float = typer.Option(2.0, help="Cap for margin factor ln(|m|+1)"), + elo_shuffles: int = typer.Option(20, help="Random shuffles to average Elo"), + elo_seed: int = typer.Option(42, help="RNG seed for shuffles") +): + team_id = team_id.lower() + # Load games + games = load_games(inp, team_id=team_id, final_status=final_status) # Aggregates team = aggregate_team_stats(games) - team["PythagoreanWinPct"] = pythagorean(team["RS"], team["RA"], a.pyexp) + team["PythagoreanWinPct"] = pythagorean(team["RS"], team["RA"], pyexp) # Ratings - massey_r, h_runs = massey(games, cap=a.massey_cap, subtract_home=(not a.no_massey_home_adj)) - sos = ( - games.assign(OppTeam=np.where(True, games["AwayTeam"], games["AwayTeam"])) # placeholder - ) - # Strength of schedule: avg opponent Massey rating faced + massey_r, h_runs = massey(games, cap=massey_cap, subtract_home=not no_massey_home_adj) + + # Strength of schedule opps = {t: [] for t in massey_r.index} for _, r in games.iterrows(): opps[r["HomeTeam"]].append(r["AwayTeam"]) opps[r["AwayTeam"]].append(r["HomeTeam"]) sos_series = pd.Series({t: (float(massey_r[opps[t]].mean()) if opps[t] else 0.0) for t in opps}) - elo_r = elo(games, K=a.elo_k, H=a.elo_home, mcap=a.elo_mcap, shuffles=a.elo_shuffles, seed=a.elo_seed) + elo_r = elo(games, K=elo_k, H=elo_home, mcap=elo_mcap, shuffles=elo_shuffles, seed=elo_seed) # Merge - out = team.set_index("Team") - out["MasseyRating"] = massey_r - out["EloRating"] = elo_r - out["StrengthOfSchedule"] = sos_series + out_df = team.set_index("Team") + 
out_df["MasseyRating"] = massey_r + out_df["EloRating"] = elo_r + out_df["StrengthOfSchedule"] = sos_series # Composite - Z_r, Z_e, Z_p = zscore(out["MasseyRating"]), zscore(out["EloRating"]), zscore(out["PythagoreanWinPct"]) - out["CompositeRating"] = 0.45*Z_r + 0.35*Z_e + 0.20*Z_p + Z_r, Z_e, Z_p = zscore(out_df["MasseyRating"]), zscore(out_df["EloRating"]), zscore(out_df["PythagoreanWinPct"]) + out_df["CompositeRating"] = 0.45*Z_r + 0.35*Z_e + 0.20*Z_p - out = out.reset_index() - out = out[[ + out_df = out_df.reset_index() + out_df = out_df[[ "Team","GP","W","L","T","WinPct","RS","RA","RunDiff", "PythagoreanWinPct","MasseyRating","EloRating","StrengthOfSchedule","CompositeRating" ]].sort_values("CompositeRating", ascending=False) # Round for readability for c in ["WinPct","PythagoreanWinPct","MasseyRating","EloRating","StrengthOfSchedule","CompositeRating"]: - out[c] = out[c].astype(float).round(5) + out_df[c] = out_df[c].astype(float).round(5) - out.to_csv(a.out, index=False) + out_df.to_csv(out, index=False) print(f"Done. Estimated home-field (runs) used in Massey: {h_runs:.3f}") - print(f"Teams ranked: {len(out)} | Games processed: {len(games)}") - print(f"Output -> {a.out}") + print(f"Teams ranked: {len(out_df)} | Games processed: {len(games)}") + print(f"Output -> {out}") if __name__ == "__main__": - main() \ No newline at end of file + typer.run(main) + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..eb8d711 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +typer[all]==0.16.1 +pandas==2.3.2 +numpy==2.3.2 +beautifulsoup4==4.13.5 +requests==2.32.5 \ No newline at end of file