#!/usr/bin/env python3
# build_season_schedule.py
#
# Build a deduped season schedule from SportsEngine team-instance printable pages.
# - Assumes team-instance schedule pages are TEAM-FIRST for scores.
# - Determines home/away using the '@' marker on the opponent cell.
# - Deduplicates primarily by game_id (from /game/show/ links), otherwise by a fallback key.
# - Optionally fetches each game's time from the /game/show/ page ("tab_boxscores_content").
#
# Usage:
#   pip install requests beautifulsoup4 python-dateutil typer
#   python build_season_schedule.py --subseason 942425 --teams teams.json --out season_schedule.csv
#
# Example teams.json (array):
# [
#   {"teamName":"Carol Stream Cheaties","team_id":"8944347","team_slug":"carol-stream-cheaties","subseason_id":"942425","instance_id":"10119604"},
#   ...
# ]

import csv
import json
import logging
import re
import time
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List, Optional
from urllib.parse import urlencode

import requests
import typer
from bs4 import BeautifulSoup
from dateutil import parser as dtp

# ----------------- logging -----------------
logging.basicConfig(
    level=logging.INFO,  # change to DEBUG for verbose tracing
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)

# ----------------- constants -----------------
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) SE-Schedule/1.3 Safari/537.36"
HEADERS = {"User-Agent": UA}

PRINT_BASE = "https://www.csyba.com/schedule/print/team_instance/{iid}"
GAME_BASE = "https://www.csyba.com/game/show/{gid}"

SCORE_RE = re.compile(r"\b(\d+)\s*[–-]\s*(\d+)\b")
GAME_LINK_RE = re.compile(r"/game/show/(\d+)")
TIME_RE = re.compile(r"\b(\d{1,2}:\d{2})\s*([ap]\.?m\.?|AM|PM)?\b", re.I)

# ----------------- helpers -----------------
def clean(x: str) -> str:
    """Collapse whitespace and trim."""
    return re.sub(r"\s+", " ", (x or "")).strip()

def slugify(s: str) -> str:
    s = s.lower()
    s = re.sub(r"[^a-z0-9]+", "-", s).strip("-")
    return s

def norm_name(s: str) -> str:
    """Normalize a team name for fuzzy matching: lowercase, strip punctuation
    and common filler words (club/league/etc.), collapse whitespace."""
    s = s.lower()
    s = re.sub(r"[^a-z0-9 ]+", " ", s)
    s = re.sub(r"\b(the|club|team|ll|little league|baseball|softball|youth|athletic|athletics|rec|rec\.)\b", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

@dataclass(frozen=True)
class TeamRec:
    name: str
    slug: str
    team_id: str
    instance_id: str
    subseason_id: str

def load_teams(teams_path: str):
    """Load the lookup tables (by instance_id, slug, and normalized name) from teams.json."""
    with open(teams_path, "r", encoding="utf-8") as f:
        arr = json.load(f)
    by_instance: Dict[str, TeamRec] = {}
    by_slug: Dict[str, TeamRec] = {}
    by_norm: Dict[str, TeamRec] = {}
    for t in arr:
        rec = TeamRec(
            name=str(t["teamName"]),
            slug=str(t["team_slug"]),
            team_id=str(t["team_id"]),
            instance_id=str(t["instance_id"]),
            subseason_id=str(t["subseason_id"]),
        )
        by_instance[rec.instance_id] = rec
        by_slug[rec.slug] = rec
        by_norm[norm_name(rec.name)] = rec
    return by_instance, by_slug, by_norm

def best_match_team(opponent_text: str, by_slug, by_norm) -> Optional[TeamRec]:
    """Match an opponent using its slug first, then its normalized name, then loose containment."""
    s = slugify(opponent_text)
    if s in by_slug:
        return by_slug[s]
    n = norm_name(opponent_text)
    if n in by_norm:
        return by_norm[n]
    for key, rec in by_norm.items():
        if key in n or n in key:
            return rec
    return None
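
# A rough sketch of how the matching cascade behaves (illustrative names, not
# taken from a real teams.json):
#   slugify("Carol Stream Cheaties")        -> "carol-stream-cheaties"  (exact slug hit)
#   norm_name("The Cheaties Baseball Club") -> "cheaties"               (normalized-name hit)
# Only when both exact lookups miss does the loose-containment pass run, so a
# cell like "Cheaties 12U" can still resolve to a stored "cheaties" key.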
""" if not (s_a.isdigit() and s_b.isdigit()): return None, None a, b = int(s_a), int(s_b) if result_flag == "W" and a <= b: logging.debug(f"Result=W but team_runs<=opp_runs ({a}-{b}); keeping as-is (team-first).") if result_flag == "L" and a >= b: logging.debug(f"Result=L but team_runs>=opp_runs ({a}-{b}); keeping as-is (team-first).") return a, b # ----------------- HTTP utils ----------------- def get_soup(url: str, session: Optional[requests.Session] = None, timeout: int = 30) -> Optional[BeautifulSoup]: try: sess = session or requests.Session() r = sess.get(url, headers=HEADERS, timeout=timeout) r.raise_for_status() return BeautifulSoup(r.text, "html.parser") except Exception as e: logging.error(f"GET failed {url}: {e}") return None # ----------------- scraping ----------------- def parse_printable(instance_id: str, subseason_id: str, session: requests.Session) -> List[dict]: """Parse one team-instance printable schedule page into perspective rows.""" url = PRINT_BASE.format(iid=instance_id) + "?" + urlencode({ "schedule_type": "index", "subseason": subseason_id, }) soup = get_soup(url, session=session) if not soup: return [] table = soup.select_one("table") if not table: logging.warning(f"No table found for team_instance={instance_id}") return [] games = [] for row_idx, tr in enumerate(table.select("tr")[1:], start=1): tds = tr.select("td") if len(tds) < 5: continue # Cells: Date | Result | Opponent | Location | Status date_txt = clean(tds[0].get_text(" ")) result_txt = clean(tds[1].get_text(" ")) opp_txt = clean(tds[2].get_text(" ")) loc_txt = clean(tds[3].get_text(" ")) status_txt = clean(tds[4].get_text(" ")) # Date → ISO try: date_iso = dtp.parse(date_txt, fuzzy=True).date().isoformat() except Exception: date_iso = date_txt # Pull a game_id if present (from any link in the row) game_id = "" for a in tr.select("a[href]"): m = GAME_LINK_RE.search(a.get("href", "")) if m: game_id = m.group(1) break # Extract W/L/T (Result cell) m_res = re.search(r"\b(W|L|T)\b", result_txt, re.I) result_flag = m_res.group(1).upper() if m_res else "" # Extract score from Result cell; if missing, also try Opponent cell m_score = SCORE_RE.search(result_txt) or SCORE_RE.search(opp_txt) s_a, s_b = (m_score.group(1), m_score.group(2)) if m_score else ("", "") # Opponent + home/away flag is_away = opp_txt.startswith("@") opponent_name = opp_txt.lstrip("@").strip() # Compute team/opp runs (TEAM-FIRST orientation) team_runs, opp_runs = runs_from_team_pov(result_flag, s_a, s_b) logging.debug( f"PARSER: inst={instance_id} row={row_idx} date={date_iso} " f"res={result_flag} scores=({s_a}-{s_b}) away={is_away} " f"→ team_runs={team_runs}, opp_runs={opp_runs}" ) games.append({ "team_instance": instance_id, "game_id": game_id, # may be empty "date": date_iso, "result": result_flag, # W/L/T from THIS TEAM's perspective "team_runs": team_runs, "opp_runs": opp_runs, "opponent_name": opponent_name, "is_away": is_away, "location": loc_txt, "status": status_txt, "source_url": url, }) logging.info(f"Team {instance_id}: parsed {len(games)} rows") return games def fetch_game_time(game_id: str, session: requests.Session) -> Optional[str]: """ Fetch the game's local start time from the /game/show/ page. Looks inside the tab with id 'tab_boxscores_content' but also falls back to scanning the page for common time patterns. Returns a zero-padded 24h 'HH:MM' string or None if unavailable. 
""" if not game_id: return None url = GAME_BASE.format(gid=game_id) soup = get_soup(url, session=session, timeout=30) if not soup: return None # Prefer the boxscores tab content box = soup.select_one("#tab_boxscores_content") or soup.select_one("#tab_boxscore_content") text = "" if box: text = " ".join(box.stripped_strings) else: # Fall back to page-wide text (but avoid pulling too much) main = soup.select_one("div.page") or soup text = " ".join((main.get_text(" ", strip=True) or "")[:4000].split()) m = TIME_RE.search(text) if not m: logging.debug(f"TIME: no time found in game {game_id}") return None hhmm = m.group(1) ampm = (m.group(2) or "").lower().replace(".", "") try: # Normalize to 24h HH:MM from datetime import datetime if ampm: dt = datetime.strptime(f"{hhmm} {ampm.upper()}", "%I:%M %p") else: # already 24h-ish dt = datetime.strptime(hhmm, "%H:%M") return dt.strftime("%H:%M") except Exception: # Be forgiving (e.g., "6:00pm" without space) try: from datetime import datetime hhmm2 = hhmm if ampm: dt = datetime.strptime(f"{hhmm2}{ampm}", "%I:%M%p") return dt.strftime("%H:%M") except Exception: logging.debug(f"TIME: could not normalize '{hhmm} {ampm}' for game {game_id}") return None # ----------------- build & merge ----------------- def main( subseason: str = typer.Option(..., help="Subseason ID, e.g. 942425"), teams: str = typer.Option(..., help="Path to teams.json (array with team_id, team_slug, instance_id, teamName)"), out: str = typer.Option("season_schedule.csv", help="Output CSV path"), fetch_time: bool = typer.Option(False, help="Fetch game time from /game/show/"), sleep: float = typer.Option(0.35, help="Delay between requests (seconds)") ): by_instance, by_slug, by_norm = load_teams(teams) instance_ids = sorted(by_instance.keys()) session = requests.Session() session.headers.update(HEADERS) # Scrape all teams raw: List[dict] = [] for i, iid in enumerate(instance_ids, 1): logging.info(f"[{i}/{len(instance_ids)}] Fetching schedule for instance {iid}") raw.extend(parse_printable(iid, subseason, session=session)) time.sleep(sleep) # be polite def rec_from_instance(iid: str) -> Optional[TeamRec]: return by_instance.get(iid) def match_opponent(text: str) -> Optional[TeamRec]: return best_match_team(text, by_slug, by_norm) # Group by game_id if available; otherwise fallback on (date + unordered pair + raw score text if present) buckets: Dict[str, dict] = {} fallback_rows = 0 for row in raw: team_rec = rec_from_instance(row["team_instance"]) if not team_rec: logging.warning(f"Unknown instance {row['team_instance']}; skipping") continue opp_rec = match_opponent(row["opponent_name"]) opp_slug = opp_rec.slug if opp_rec else slugify(row["opponent_name"]) pair = tuple(sorted([team_rec.slug, opp_slug])) if row["game_id"]: key = f"id:{row['game_id']}" else: runs_sig = "" if isinstance(row["team_runs"], int) and isinstance(row["opp_runs"], int): runs_sig = f"{row['team_runs']}-{row['opp_runs']}" key = f"fb:{row['date']}|{pair[0]}@{pair[1]}|{runs_sig}" fallback_rows += 1 perspective = { "team": team_rec, "opp": opp_rec, # may be None "is_away": row["is_away"], "team_runs": row["team_runs"], "opp_runs": row["opp_runs"], "location": row["location"], "status": row["status"], "source_url": row["source_url"], "pair": pair, "date": row["date"], "game_id": row["game_id"], } if key not in buckets: buckets[key] = {"persp": [perspective], "game_id": row["game_id"]} else: buckets[key]["persp"].append(perspective) if fallback_rows: logging.info(f"Used fallback dedupe for {fallback_rows} rows without 
game_id.") # Merge perspectives into a single home/away row out_rows = [] time_cache: Dict[str, Optional[str]] = {} for key, bucket in buckets.items(): p = bucket["persp"] date = p[0]["date"] game_id = bucket.get("game_id", "") # Identify home/away perspectives p_home = next((x for x in p if x["is_away"] is False), None) p_away = next((x for x in p if x["is_away"] is True), None) # Team identities home_team = (p_home["team"] if p_home else (p_away["opp"] if p_away else None)) away_team = (p_away["team"] if p_away else (p_home["opp"] if p_home else None)) def pack_team(rec: Optional[TeamRec], fallback_slug: str): if rec: return rec.slug, rec.instance_id, rec.team_id, rec.name return fallback_slug, "", "", fallback_slug.replace("-", " ").title() # Prefer runs from the explicit perspective (home if available; otherwise away) home_runs = away_runs = None if p_home and isinstance(p_home["team_runs"], int) and isinstance(p_home["opp_runs"], int): home_runs = p_home["team_runs"] away_runs = p_home["opp_runs"] elif p_away and isinstance(p_away["team_runs"], int) and isinstance(p_away["opp_runs"], int): away_runs = p_away["team_runs"] home_runs = p_away["opp_runs"] # Fallback: single perspective present but numbers known → place by is_away if (home_runs is None or away_runs is None) and p: one = p[0] if isinstance(one["team_runs"], int) and isinstance(one["opp_runs"], int): if one["is_away"]: away_runs = one["team_runs"]; home_runs = one["opp_runs"] away_team = one["team"]; home_team = one["opp"] if one["opp"] else home_team else: home_runs = one["team_runs"]; away_runs = one["opp_runs"] home_team = one["team"]; away_team = one["opp"] if one["opp"] else away_team # Pack final team identifiers (fallback slug = guess from perspectives) guess_home_fallback = (p_home["team"].slug if p_home and p_home["team"] else p_away["opp"].slug if p_away and p_away["opp"] else p[0]["pair"][0]) guess_away_fallback = (p_away["team"].slug if p_away and p_away["team"] else p_home["opp"].slug if p_home and p_home["opp"] else p[0]["pair"][1]) home_slug, home_inst, home_id, home_name = pack_team(home_team, guess_home_fallback) away_slug, away_inst, away_id, away_name = pack_team(away_team, guess_away_fallback) # Winner/loser winner_slug = winner_inst = winner_id = loser_slug = loser_inst = loser_id = "" if isinstance(home_runs, int) and isinstance(away_runs, int): if home_runs > away_runs: winner_slug, winner_inst, winner_id = home_slug, home_inst, home_id loser_slug, loser_inst, loser_id = away_slug, away_inst, away_id elif away_runs > home_runs: winner_slug, winner_inst, winner_id = away_slug, away_inst, away_id loser_slug, loser_inst, loser_id = home_slug, home_inst, home_id # Meta from perspectives loc = (p_home["location"] if p_home else "") or (p_away["location"] if p_away else "") status = (p_home["status"] if p_home else "") or (p_away["status"] if p_away else "") source_urls = sorted({x["source_url"] for x in p}) # -------- NEW: fetch game start time from game page -------- time_local = "" if fetch_time and game_id: if game_id in time_cache: tval = time_cache[game_id] else: logging.debug(f"TIME: fetching game {game_id}") tval = fetch_game_time(game_id, session=session) time_cache[game_id] = tval if tval is None: time.sleep(min(sleep * 2, 1.0)) if tval: time_local = tval logging.debug( f"MERGE: {date} {home_slug}({home_runs}) vs {away_slug}({away_runs}) " f"winner={winner_slug or 'TIE'} id={game_id} time={time_local or 'NA'}" ) out_rows.append({ "date_local": date, "time_local": time_local, "home_slug": home_slug, 
"home_instance": home_inst, "home_id": home_id, "home_name": home_name, "away_slug": away_slug, "away_instance": away_inst, "away_id": away_id, "away_name": away_name, "home_runs": "" if home_runs is None else home_runs, "away_runs": "" if away_runs is None else away_runs, "winner_slug": winner_slug, "winner_instance": winner_inst, "winner_id": winner_id, "loser_slug": loser_slug, "loser_instance": loser_inst, "loser_id": loser_id, "location": loc, "status": status, "game_id": game_id, "source_urls": " ".join(source_urls), }) if not out_rows: logging.warning("No games produced.") return fieldnames = [ "date_local","time_local", "home_slug","home_instance","home_id","home_name", "away_slug","away_instance","away_id","away_name", "home_runs","away_runs", "winner_slug","winner_instance","winner_id", "loser_slug","loser_instance","loser_id", "location","status","game_id","source_urls", ] with open(out, "w", newline="", encoding="utf-8") as f: w = csv.DictWriter(f, fieldnames=fieldnames) w.writeheader() for r in out_rows: w.writerow(r) logging.info(f"Wrote {len(out_rows)} games → {out}") if __name__ == "__main__": typer.run(main)