#!/usr/bin/env python3
# build_season_schedule.py
#
# Build a deduped season schedule from SportsEngine team-instance printable pages.
# - Assumes team-instance schedule pages are TEAM-FIRST for scores.
# - Determines home/away using the '@' marker on the opponent cell.
# - Deduplicates primarily by game_id (from /game/show/<id> links), otherwise by a fallback key.
# - Optionally fetches each game's start time from the /game/show/<id> page ("tab_boxscores_content").
#
# Usage:
#   pip install requests beautifulsoup4 python-dateutil typer
#   python build_season_schedule.py --subseason 942425 --teams teams.json --out season_schedule.csv
#   (add --fetch-time to also pull start times from each game page)
#
# Example teams.json (array):
#   [
#     {"teamName":"Carol Stream Cheaties","team_id":"8944347","team_slug":"carol-stream-cheaties","subseason_id":"942425","instance_id":"10119604"},
#     ...
#   ]
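#
# Output columns: date_local, time_local, home_*/away_* (slug, instance, id,
# name), home_runs/away_runs, winner_*/loser_* (slug, instance, id), location,
# status, game_id, source_urls.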

import csv
import json
import logging
import re
import time
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from urllib.parse import urlencode

import requests
import typer
from bs4 import BeautifulSoup
from dateutil import parser as dtp

# ----------------- logging -----------------
logging.basicConfig(
    level=logging.INFO,  # change to DEBUG for verbose tracing
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)

# ----------------- constants -----------------
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) SE-Schedule/1.3 Safari/537.36"
HEADERS = {"User-Agent": UA}
PRINT_BASE = "https://www.csyba.com/schedule/print/team_instance/{iid}"
GAME_BASE = "https://www.csyba.com/game/show/{gid}"
SCORE_RE = re.compile(r"\b(\d+)\s*[-]\s*(\d+)\b")
GAME_LINK_RE = re.compile(r"/game/show/(\d+)")
TIME_RE = re.compile(r"\b(\d{1,2}:\d{2})\s*([ap]\.?m\.?|AM|PM)?\b", re.I)
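
# Worked examples (illustrative strings, not scraped output):
#   SCORE_RE.search("W 7-4").groups()                 -> ("7", "4")
#   GAME_LINK_RE.search("/game/show/8675309").group(1) -> "8675309"
#   TIME_RE.search("6:05 pm").groups()                -> ("6:05", "pm")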

# ----------------- helpers -----------------
def clean(x: str) -> str:
    return re.sub(r"\s+", " ", (x or "")).strip()


def slugify(s: str) -> str:
    s = s.lower()
    s = re.sub(r"[^a-z0-9]+", "-", s).strip("-")
    return s


def norm_name(s: str) -> str:
    s = s.lower()
    s = re.sub(r"[^a-z0-9 ]+", " ", s)
    s = re.sub(r"\b(the|club|team|ll|little league|baseball|softball|youth|athletic|athletics|rec|rec\.)\b", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s
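
# Worked examples (illustrative names):
#   slugify("Carol Stream Cheaties")        -> "carol-stream-cheaties"
#   norm_name("The Wheaton Baseball Club")  -> "wheaton"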


@dataclass(frozen=True)
class TeamRec:
    name: str
    slug: str
    team_id: str
    instance_id: str
    subseason_id: str


def load_teams(teams_path: str) -> Tuple[Dict[str, TeamRec], Dict[str, TeamRec], Dict[str, TeamRec]]:
    """Load lookup tables (keyed by instance_id, slug, and normalized name) from teams.json."""
    with open(teams_path, "r", encoding="utf-8") as f:
        arr = json.load(f)
    by_instance: Dict[str, TeamRec] = {}
    by_slug: Dict[str, TeamRec] = {}
    by_norm: Dict[str, TeamRec] = {}
    for t in arr:
        rec = TeamRec(
            name=str(t["teamName"]),
            slug=str(t["team_slug"]),
            team_id=str(t["team_id"]),
            instance_id=str(t["instance_id"]),
            subseason_id=str(t["subseason_id"]),
        )
        by_instance[rec.instance_id] = rec
        by_slug[rec.slug] = rec
        by_norm[norm_name(rec.name)] = rec
    return by_instance, by_slug, by_norm
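
# For the example teams.json in the header, all three tables resolve to the
# same TeamRec:
#   by_instance["10119604"] is by_slug["carol-stream-cheaties"]
#   is by_norm["carol stream cheaties"]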


def best_match_team(opponent_text: str, by_slug, by_norm) -> Optional[TeamRec]:
    """Match opponent using slug first, then normalized name, then loose containment."""
    s = slugify(opponent_text)
    if s in by_slug:
        return by_slug[s]
    n = norm_name(opponent_text)
    if n in by_norm:
        return by_norm[n]
    for key, rec in by_norm.items():
        if key in n or n in key:
            return rec
    return None
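
# The containment pass is deliberately loose: an opponent cell reading
# "Wheaton Blue" (illustrative) still matches a roster entry whose normalized
# name is "wheaton". First hit wins, so near-duplicate names can collide.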


def runs_from_team_pov(result_flag: str, s_a: str, s_b: str) -> Tuple[Optional[int], Optional[int]]:
    """
    Team-instance pages are TEAM-FIRST: s_a is THIS team's runs, s_b is the opponent's.
    We don't reorder; we only sanity-check against the W/L flag and log mismatches.
    """
    if not (s_a.isdigit() and s_b.isdigit()):
        return None, None
    a, b = int(s_a), int(s_b)
    if result_flag == "W" and a <= b:
        logging.debug(f"Result=W but team_runs<=opp_runs ({a}-{b}); keeping as-is (team-first).")
    if result_flag == "L" and a >= b:
        logging.debug(f"Result=L but team_runs>=opp_runs ({a}-{b}); keeping as-is (team-first).")
    return a, b
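
# Example: a row reading "W 7-4" on a team's own page yields (7, 4) regardless
# of which side was home; home/away orientation is resolved later in the merge.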


# ----------------- HTTP utils -----------------
def get_soup(url: str, session: Optional[requests.Session] = None, timeout: int = 30) -> Optional[BeautifulSoup]:
    try:
        sess = session or requests.Session()
        r = sess.get(url, headers=HEADERS, timeout=timeout)
        r.raise_for_status()
        return BeautifulSoup(r.text, "html.parser")
    except Exception as e:
        logging.error(f"GET failed {url}: {e}")
        return None
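
# Note: get_soup swallows request errors and returns None so that one bad page
# skips a single team (or game time) instead of aborting the whole run.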


# ----------------- scraping -----------------
def parse_printable(instance_id: str, subseason_id: str, session: requests.Session) -> List[dict]:
    """Parse one team-instance printable schedule page into perspective rows."""
    url = PRINT_BASE.format(iid=instance_id) + "?" + urlencode({
        "schedule_type": "index",
        "subseason": subseason_id,
    })
    soup = get_soup(url, session=session)
    if not soup:
        return []
    table = soup.select_one("table")
    if not table:
        logging.warning(f"No table found for team_instance={instance_id}")
        return []
    games = []
    for row_idx, tr in enumerate(table.select("tr")[1:], start=1):
        tds = tr.select("td")
        if len(tds) < 5:
            continue
        # Cells: Date | Result | Opponent | Location | Status
        date_txt = clean(tds[0].get_text(" "))
        result_txt = clean(tds[1].get_text(" "))
        opp_txt = clean(tds[2].get_text(" "))
        loc_txt = clean(tds[3].get_text(" "))
        status_txt = clean(tds[4].get_text(" "))
        # Date → ISO
        try:
            date_iso = dtp.parse(date_txt, fuzzy=True).date().isoformat()
        except Exception:
            date_iso = date_txt
        # Pull a game_id if present (from any link in the row)
        game_id = ""
        for a in tr.select("a[href]"):
            m = GAME_LINK_RE.search(a.get("href", ""))
            if m:
                game_id = m.group(1)
                break
        # Extract W/L/T from the Result cell
        m_res = re.search(r"\b(W|L|T)\b", result_txt, re.I)
        result_flag = m_res.group(1).upper() if m_res else ""
        # Extract score from the Result cell; if missing, also try the Opponent cell
        m_score = SCORE_RE.search(result_txt) or SCORE_RE.search(opp_txt)
        s_a, s_b = (m_score.group(1), m_score.group(2)) if m_score else ("", "")
        # Opponent + home/away flag
        is_away = opp_txt.startswith("@")
        opponent_name = opp_txt.lstrip("@").strip()
        # Compute team/opp runs (TEAM-FIRST orientation)
        team_runs, opp_runs = runs_from_team_pov(result_flag, s_a, s_b)
        logging.debug(
            f"PARSER: inst={instance_id} row={row_idx} date={date_iso} "
            f"res={result_flag} scores=({s_a}-{s_b}) away={is_away} "
            f"→ team_runs={team_runs}, opp_runs={opp_runs}"
        )
        games.append({
            "team_instance": instance_id,
            "game_id": game_id,  # may be empty
            "date": date_iso,
            "result": result_flag,  # W/L/T from THIS TEAM's perspective
            "team_runs": team_runs,
            "opp_runs": opp_runs,
            "opponent_name": opponent_name,
            "is_away": is_away,
            "location": loc_txt,
            "status": status_txt,
            "source_url": url,
        })
    logging.info(f"Team {instance_id}: parsed {len(games)} rows")
    return games
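
# A typical perspective row (illustrative values):
#   {"team_instance": "10119604", "game_id": "8675309", "date": "2025-06-14",
#    "result": "W", "team_runs": 7, "opp_runs": 4, "opponent_name": "Wheaton",
#    "is_away": True, "location": "Field 2", "status": "Final",
#    "source_url": "<printable page URL>"}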


def fetch_game_time(game_id: str, session: requests.Session) -> Optional[str]:
    """
    Fetch the game's local start time from the /game/show/<id> page.
    Looks inside the tab with id 'tab_boxscores_content' but also
    falls back to scanning the page for common time patterns.
    Returns a zero-padded 24h 'HH:MM' string or None if unavailable.
    """
    if not game_id:
        return None
    url = GAME_BASE.format(gid=game_id)
    soup = get_soup(url, session=session, timeout=30)
    if not soup:
        return None
    # Prefer the boxscores tab content
    box = soup.select_one("#tab_boxscores_content") or soup.select_one("#tab_boxscore_content")
    if box:
        text = " ".join(box.stripped_strings)
    else:
        # Fall back to page-wide text (but avoid pulling too much)
        page = soup.select_one("div.page") or soup
        text = " ".join((page.get_text(" ", strip=True) or "")[:4000].split())
    m = TIME_RE.search(text)
    if not m:
        logging.debug(f"TIME: no time found in game {game_id}")
        return None
    hhmm = m.group(1)
    ampm = (m.group(2) or "").lower().replace(".", "")
    try:
        # Normalize to 24h HH:MM
        if ampm:
            dt = datetime.strptime(f"{hhmm} {ampm.upper()}", "%I:%M %p")
        else:
            # no am/pm marker: assume the page already shows 24h time
            dt = datetime.strptime(hhmm, "%H:%M")
        return dt.strftime("%H:%M")
    except Exception:
        # Be forgiving (e.g., "6:00pm" without a space)
        try:
            if ampm:
                dt = datetime.strptime(f"{hhmm}{ampm}", "%I:%M%p")
                return dt.strftime("%H:%M")
        except Exception:
            pass
        logging.debug(f"TIME: could not normalize '{hhmm} {ampm}' for game {game_id}")
        return None
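
# Normalization examples: "6:05 PM" -> "18:05"; "6:05pm" also works because
# TIME_RE captures the digits and the am/pm marker as separate groups. A bare
# "6:05" with no marker is trusted as 24h and becomes "06:05", which may be
# wrong for evening games.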
# ----------------- build & merge -----------------
def main(
subseason: str = typer.Option(..., help="Subseason ID, e.g. 942425"),
teams: str = typer.Option(..., help="Path to teams.json (array with team_id, team_slug, instance_id, teamName)"),
out: str = typer.Option("season_schedule.csv", help="Output CSV path"),
fetch_time: bool = typer.Option(False, help="Fetch game time from /game/show/<id>"),
sleep: float = typer.Option(0.35, help="Delay between requests (seconds)")
):
by_instance, by_slug, by_norm = load_teams(teams)
instance_ids = sorted(by_instance.keys())
session = requests.Session()
session.headers.update(HEADERS)
# Scrape all teams
raw: List[dict] = []
for i, iid in enumerate(instance_ids, 1):
logging.info(f"[{i}/{len(instance_ids)}] Fetching schedule for instance {iid}")
raw.extend(parse_printable(iid, subseason, session=session))
time.sleep(sleep) # be polite
def rec_from_instance(iid: str) -> Optional[TeamRec]:
return by_instance.get(iid)
def match_opponent(text: str) -> Optional[TeamRec]:
return best_match_team(text, by_slug, by_norm)
# Group by game_id if available; otherwise fallback on (date + unordered pair + raw score text if present)
buckets: Dict[str, dict] = {}
fallback_rows = 0
for row in raw:
team_rec = rec_from_instance(row["team_instance"])
if not team_rec:
logging.warning(f"Unknown instance {row['team_instance']}; skipping")
continue
opp_rec = match_opponent(row["opponent_name"])
opp_slug = opp_rec.slug if opp_rec else slugify(row["opponent_name"])
pair = tuple(sorted([team_rec.slug, opp_slug]))
if row["game_id"]:
key = f"id:{row['game_id']}"
else:
runs_sig = ""
if isinstance(row["team_runs"], int) and isinstance(row["opp_runs"], int):
runs_sig = f"{row['team_runs']}-{row['opp_runs']}"
key = f"fb:{row['date']}|{pair[0]}@{pair[1]}|{runs_sig}"
fallback_rows += 1
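        # Resulting keys look like (illustrative):
        #   "id:8675309" or "fb:2025-06-14|carol-stream-cheaties@wheaton|4-7"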
        perspective = {
            "team": team_rec,
            "opp": opp_rec,  # may be None
            "is_away": row["is_away"],
            "team_runs": row["team_runs"],
            "opp_runs": row["opp_runs"],
            "location": row["location"],
            "status": row["status"],
            "source_url": row["source_url"],
            "pair": pair,
            "date": row["date"],
            "game_id": row["game_id"],
        }
        if key not in buckets:
            buckets[key] = {"persp": [perspective], "game_id": row["game_id"]}
        else:
            buckets[key]["persp"].append(perspective)
    if fallback_rows:
        logging.info(f"Used fallback dedupe for {fallback_rows} rows without game_id.")

    # Merge perspectives into a single home/away row
    out_rows = []
    time_cache: Dict[str, Optional[str]] = {}
    for key, bucket in buckets.items():
        p = bucket["persp"]
        date = p[0]["date"]
        game_id = bucket.get("game_id", "")
        # Identify home/away perspectives
        p_home = next((x for x in p if x["is_away"] is False), None)
        p_away = next((x for x in p if x["is_away"] is True), None)
        # Team identities
        home_team = p_home["team"] if p_home else (p_away["opp"] if p_away else None)
        away_team = p_away["team"] if p_away else (p_home["opp"] if p_home else None)

        def pack_team(rec: Optional[TeamRec], fallback_slug: str):
            if rec:
                return rec.slug, rec.instance_id, rec.team_id, rec.name
            return fallback_slug, "", "", fallback_slug.replace("-", " ").title()

        # Prefer runs from the explicit perspective (home if available; otherwise away)
        home_runs = away_runs = None
        if p_home and isinstance(p_home["team_runs"], int) and isinstance(p_home["opp_runs"], int):
            home_runs = p_home["team_runs"]
            away_runs = p_home["opp_runs"]
        elif p_away and isinstance(p_away["team_runs"], int) and isinstance(p_away["opp_runs"], int):
            away_runs = p_away["team_runs"]
            home_runs = p_away["opp_runs"]
        # Fallback: single perspective present but numbers known → place by is_away
        if (home_runs is None or away_runs is None) and p:
            one = p[0]
            if isinstance(one["team_runs"], int) and isinstance(one["opp_runs"], int):
                if one["is_away"]:
                    away_runs, home_runs = one["team_runs"], one["opp_runs"]
                    away_team = one["team"]
                    home_team = one["opp"] if one["opp"] else home_team
                else:
                    home_runs, away_runs = one["team_runs"], one["opp_runs"]
                    home_team = one["team"]
                    away_team = one["opp"] if one["opp"] else away_team
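        # e.g. if only the away team's page carried the score ("L 3-5"), the
        # merged row gets away_runs=3 / home_runs=5 and the home side comes
        # from the opponent match (or from the slug pair if matching failed).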
        # Pack final team identifiers (fallback slug = guess from perspectives)
        guess_home_fallback = (p_home["team"].slug if p_home and p_home["team"] else
                               p_away["opp"].slug if p_away and p_away["opp"] else
                               p[0]["pair"][0])
        guess_away_fallback = (p_away["team"].slug if p_away and p_away["team"] else
                               p_home["opp"].slug if p_home and p_home["opp"] else
                               p[0]["pair"][1])
        home_slug, home_inst, home_id, home_name = pack_team(home_team, guess_home_fallback)
        away_slug, away_inst, away_id, away_name = pack_team(away_team, guess_away_fallback)
        # Winner/loser
        winner_slug = winner_inst = winner_id = loser_slug = loser_inst = loser_id = ""
        if isinstance(home_runs, int) and isinstance(away_runs, int):
            if home_runs > away_runs:
                winner_slug, winner_inst, winner_id = home_slug, home_inst, home_id
                loser_slug, loser_inst, loser_id = away_slug, away_inst, away_id
            elif away_runs > home_runs:
                winner_slug, winner_inst, winner_id = away_slug, away_inst, away_id
                loser_slug, loser_inst, loser_id = home_slug, home_inst, home_id
        # Meta from perspectives
        loc = (p_home["location"] if p_home else "") or (p_away["location"] if p_away else "")
        status = (p_home["status"] if p_home else "") or (p_away["status"] if p_away else "")
        source_urls = sorted({x["source_url"] for x in p})
        # Optionally fetch the game start time from the game page (cached per id)
        time_local = ""
        if fetch_time and game_id:
            if game_id in time_cache:
                tval = time_cache[game_id]
            else:
                logging.debug(f"TIME: fetching game {game_id}")
                tval = fetch_game_time(game_id, session=session)
                time_cache[game_id] = tval
                # be polite between game-page fetches; back off a bit on misses
                time.sleep(min(sleep * 2, 1.0) if tval is None else sleep)
            if tval:
                time_local = tval
        logging.debug(
            f"MERGE: {date} {home_slug}({home_runs}) vs {away_slug}({away_runs}) "
            f"winner={winner_slug or 'TIE'} id={game_id} time={time_local or 'NA'}"
        )
        out_rows.append({
            "date_local": date,
            "time_local": time_local,
            "home_slug": home_slug, "home_instance": home_inst, "home_id": home_id, "home_name": home_name,
            "away_slug": away_slug, "away_instance": away_inst, "away_id": away_id, "away_name": away_name,
            "home_runs": "" if home_runs is None else home_runs,
            "away_runs": "" if away_runs is None else away_runs,
            "winner_slug": winner_slug, "winner_instance": winner_inst, "winner_id": winner_id,
            "loser_slug": loser_slug, "loser_instance": loser_inst, "loser_id": loser_id,
            "location": loc, "status": status,
            "game_id": game_id,
            "source_urls": " ".join(source_urls),
        })

    if not out_rows:
        logging.warning("No games produced.")
        return

    fieldnames = [
        "date_local", "time_local",
        "home_slug", "home_instance", "home_id", "home_name",
        "away_slug", "away_instance", "away_id", "away_name",
        "home_runs", "away_runs",
        "winner_slug", "winner_instance", "winner_id",
        "loser_slug", "loser_instance", "loser_id",
        "location", "status", "game_id", "source_urls",
    ]
    with open(out, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        for r in out_rows:
            w.writerow(r)
    logging.info(f"Wrote {len(out_rows)} games → {out}")


if __name__ == "__main__":
    typer.run(main)