#!/usr/bin/env python3
# build_season_schedule.py
#
# Build a deduped season schedule from SportsEngine team-instance printable pages.
# - Assumes team-instance schedule pages are TEAM-FIRST for scores.
# - Determines home/away using the '@' marker on the opponent cell.
# - Deduplicates primarily by game_id (from /game/show/ links), otherwise by a fallback key.
# - Optionally fetches each game's time from the /game/show/ page ("tab_boxscores_content").
#
# Usage:
#   pip install requests beautifulsoup4 python-dateutil typer
#   python build_season_schedule.py --subseason 942425 --teams teams.json --out season_schedule.csv
#
# Example teams.json (array):
# [
#   {"teamName":"Carol Stream Cheaties","team_id":"8944347","team_slug":"carol-stream-cheaties","subseason_id":"942425","instance_id":"10119604"},
#   ...
# ]

import csv
import json
import logging
import re
import time
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List, Optional
from urllib.parse import urlencode

import requests
import typer
from bs4 import BeautifulSoup
from dateutil import parser as dtp

# ----------------- logging -----------------
logging.basicConfig(
    level=logging.INFO,  # change to DEBUG for verbose tracing
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)

# ----------------- constants -----------------
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) SE-Schedule/1.3 Safari/537.36"
HEADERS = {"User-Agent": UA}  # HTTP headers with custom User-Agent for requests

PRINT_BASE = "https://www.csyba.com/schedule/print/team_instance/{iid}"  # base URL for team-instance printable schedule
GAME_BASE = "https://www.csyba.com/game/show/{gid}"  # base URL for game detail page

# Regular expressions for parsing scores, game links, and time strings
SCORE_RE = re.compile(r"\b(\d+)\s*[–-]\s*(\d+)\b")
GAME_LINK_RE = re.compile(r"/game/show/(\d+)")
TIME_RE = re.compile(r"\b(\d{1,2}:\d{2})\s*([ap]\.?m\.?|AM|PM)?\b", re.I)


# ----------------- helpers -----------------
def clean(x: str) -> str:
    """Normalize whitespace and strip the input string."""
    return re.sub(r"\s+", " ", (x or "")).strip()


def slugify(s: str) -> str:
    """Convert a string to a lowercase slug with words separated by hyphens."""
    s = s.lower()
    s = re.sub(r"[^a-z0-9]+", "-", s).strip("-")
    return s


def norm_name(s: str) -> str:
    """
    Normalize team names by lowercasing, stripping punctuation, and removing
    common filler words like 'the', 'club', 'baseball', to help with loose matching.
    """
    s = s.lower()
    s = re.sub(r"[^a-z0-9 ]+", " ", s)
    s = re.sub(r"\b(the|club|team|ll|little league|baseball|softball|youth|athletic|athletics|rec|rec\.)\b", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s


@dataclass(frozen=True)
class TeamRec:
    """Team record with identifying information."""
    name: str
    slug: str
    team_id: str
    instance_id: str
    subseason_id: str


def load_teams(teams_path: str):
    """
    Load team mapping data from a JSON file.

    Returns dictionaries keyed by instance_id, slug, and normalized name for
    lookups.
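
    Example (illustrative, using the sample entry from the header comment):
        by_instance, by_slug, by_norm = load_teams("teams.json")
        by_instance["10119604"].name              # -> "Carol Stream Cheaties"
        by_slug["carol-stream-cheaties"].team_id  # -> "8944347"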
""" with open(teams_path, "r", encoding="utf-8") as f: arr = json.load(f) by_instance: Dict[str, TeamRec] = {} by_slug: Dict[str, TeamRec] = {} by_norm: Dict[str, TeamRec] = {} for t in arr: rec = TeamRec( name=str(t["teamName"]), slug=str(t["team_slug"]), team_id=str(t["team_id"]), instance_id=str(t["instance_id"]), subseason_id=str(t["subseason_id"]), ) by_instance[rec.instance_id] = rec by_slug[rec.slug] = rec by_norm[norm_name(rec.name)] = rec return by_instance, by_slug, by_norm def best_match_team(opponent_text: str, by_slug, by_norm) -> Optional[TeamRec]: """ Attempt to match the opponent team name to a known team record. Tries slug first, then normalized name exact match, then loose containment matching on normalized names. """ s = slugify(opponent_text) if s in by_slug: return by_slug[s] n = norm_name(opponent_text) if n in by_norm: return by_norm[n] for key, rec in by_norm.items(): if key in n or n in key: return rec return None def runs_from_team_pov(result_flag: str, s_a: str, s_b: str): """ Parse runs scored by team and opponent, assuming team-first order. Validate results with result_flag (W/L/T). """ if not (s_a.isdigit() and s_b.isdigit()): return None, None a, b = int(s_a), int(s_b) if result_flag == "W" and a <= b: logging.debug(f"Result=W but team_runs<=opp_runs ({a}-{b}); keeping as-is (team-first).") if result_flag == "L" and a >= b: logging.debug(f"Result=L but team_runs>=opp_runs ({a}-{b}); keeping as-is (team-first).") return a, b # ----------------- HTTP utils ----------------- def get_soup(url: str, session: Optional[requests.Session] = None, timeout: int = 30) -> Optional[BeautifulSoup]: """ Fetch a URL and return a BeautifulSoup parsed document. Uses a shared requests.Session if provided. """ try: sess = session or requests.Session() r = sess.get(url, headers=HEADERS, timeout=timeout) r.raise_for_status() return BeautifulSoup(r.text, "html.parser") except Exception as e: logging.error(f"GET failed {url}: {e}") return None # ----------------- scraping ----------------- def parse_printable(instance_id: str, subseason_id: str, session: requests.Session) -> List[dict]: """ Download and parse the team-instance printable schedule page, extracting a list of game dictionaries from the perspective of that team. """ url = PRINT_BASE.format(iid=instance_id) + "?" 
    url = PRINT_BASE.format(iid=instance_id) + "?" + urlencode({
        "schedule_type": "index",
        "subseason": subseason_id,
    })
    soup = get_soup(url, session=session)
    if not soup:
        return []

    table = soup.select_one("table")
    if not table:
        logging.warning(f"No table found for team_instance={instance_id}")
        return []

    games = []
    # Skip the header row; iterate over game rows
    for row_idx, tr in enumerate(table.select("tr")[1:], start=1):
        tds = tr.select("td")
        if len(tds) < 5:
            continue

        # Extract text from each relevant cell:
        # Date | Result | Opponent | Location | Status
        date_txt = clean(tds[0].get_text(" "))
        result_txt = clean(tds[1].get_text(" "))
        opp_txt = clean(tds[2].get_text(" "))
        loc_txt = clean(tds[3].get_text(" "))
        status_txt = clean(tds[4].get_text(" "))

        # Parse the date into ISO format (YYYY-MM-DD) if possible
        try:
            date_iso = dtp.parse(date_txt, fuzzy=True).date().isoformat()
        except Exception:
            date_iso = date_txt  # leave raw if parsing fails

        # Find the game ID from any /game/show/ links in the row, if present
        game_id = ""
        for a in tr.select("a[href]"):
            m = GAME_LINK_RE.search(a.get("href", ""))
            if m:
                game_id = m.group(1)
                break

        # Extract the W/L/T indicator from the Result cell
        m_res = re.search(r"\b(W|L|T)\b", result_txt, re.I)
        result_flag = m_res.group(1).upper() if m_res else ""

        # Extract numeric scores from the Result or Opponent cell
        m_score = SCORE_RE.search(result_txt) or SCORE_RE.search(opp_txt)
        s_a, s_b = (m_score.group(1), m_score.group(2)) if m_score else ("", "")

        # Determine whether the game is away based on an '@' prefix in the opponent cell
        is_away = opp_txt.startswith("@")
        opponent_name = opp_txt.lstrip("@").strip()

        # Convert scores to integers with team-first orientation
        team_runs, opp_runs = runs_from_team_pov(result_flag, s_a, s_b)

        logging.debug(
            f"PARSER: inst={instance_id} row={row_idx} date={date_iso} "
            f"res={result_flag} scores=({s_a}-{s_b}) away={is_away} "
            f"→ team_runs={team_runs}, opp_runs={opp_runs}"
        )

        games.append({
            "team_instance": instance_id,
            "game_id": game_id,        # may be empty
            "date": date_iso,
            "result": result_flag,     # W/L/T from THIS TEAM's perspective
            "team_runs": team_runs,
            "opp_runs": opp_runs,
            "opponent_name": opponent_name,
            "is_away": is_away,
            "location": loc_txt,
            "status": status_txt,
            "source_url": url,
        })

    logging.info(f"Team {instance_id}: parsed {len(games)} rows")
    return games


def fetch_game_time(game_id: str, session: requests.Session) -> Optional[str]:
    """
    Fetch the start time of a game from its detail page.

    Looks inside the boxscores tab or scans the page text for time patterns.
    Returns a 24-hour 'HH:MM' string, or None if no time is found.
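
    Example (illustrative): a boxscore showing "7:30 PM" is returned as "19:30".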
""" if not game_id: return None url = GAME_BASE.format(gid=game_id) soup = get_soup(url, session=session, timeout=30) if not soup: return None # Prefer boxscores tab content to search for time string box = soup.select_one("#tab_boxscores_content") or soup.select_one("#tab_boxscore_content") text = "" if box: text = " ".join(box.stripped_strings) else: # Fall back to main page text with length limit to prevent excessive text processing main = soup.select_one("div.page") or soup text = " ".join((main.get_text(" ", strip=True) or "")[:4000].split()) m = TIME_RE.search(text) if not m: logging.debug(f"TIME: no time found in game {game_id}") return None hhmm = m.group(1) ampm = (m.group(2) or "").lower().replace(".", "") try: # Normalize time to 24h format from datetime import datetime if ampm: dt = datetime.strptime(f"{hhmm} {ampm.upper()}", "%I:%M %p") else: dt = datetime.strptime(hhmm, "%H:%M") return dt.strftime("%H:%M") except Exception: # Try forgiving parse if combined time/ampm without space try: from datetime import datetime if ampm: dt = datetime.strptime(f"{hhmm}{ampm}", "%I:%M%p") return dt.strftime("%H:%M") except Exception: logging.debug(f"TIME: could not normalize '{hhmm} {ampm}' for game {game_id}") return None # ----------------- build & merge ----------------- def main( subseason: str = typer.Option(..., help="Subseason ID, e.g. 942425"), teams: str = typer.Option(..., help="Path to teams.json (array with team_id, team_slug, instance_id, teamName)"), out: str = typer.Option("season_schedule.csv", help="Output CSV path"), fetch_time: bool = typer.Option(False, help="Fetch game time from /game/show/"), sleep: float = typer.Option(0.35, help="Delay between requests (seconds)") ): """ Main function to scrape schedules for all teams, merge them, deduplicate entries (primary by game_id), and output a consolidated CSV. Optionally fetches start times per game. 
""" # Load teams data and indexes by_instance, by_slug, by_norm = load_teams(teams) instance_ids = sorted(by_instance.keys()) # Requests session with custom headers session = requests.Session() session.headers.update(HEADERS) # Scrape all team instance printable schedules raw: List[dict] = [] for i, iid in enumerate(instance_ids, 1): logging.info(f"[{i}/{len(instance_ids)}] Fetching schedule for instance {iid}") raw.extend(parse_printable(iid, subseason, session=session)) time.sleep(sleep) # be polite # Helper lookups for team records def rec_from_instance(iid: str) -> Optional[TeamRec]: return by_instance.get(iid) def match_opponent(text: str) -> Optional[TeamRec]: return best_match_team(text, by_slug, by_norm) # Deduplicate buckets keyed by game_id or fallback composite keys buckets: Dict[str, dict] = {} fallback_rows = 0 for row in raw: team_rec = rec_from_instance(row["team_instance"]) if not team_rec: logging.warning(f"Unknown instance {row['team_instance']}; skipping") continue opp_rec = match_opponent(row["opponent_name"]) opp_slug = opp_rec.slug if opp_rec else slugify(row["opponent_name"]) pair = tuple(sorted([team_rec.slug, opp_slug])) if row["game_id"]: key = f"id:{row['game_id']}" else: runs_sig = "" if isinstance(row["team_runs"], int) and isinstance(row["opp_runs"], int): runs_sig = f"{row['team_runs']}-{row['opp_runs']}" key = f"fb:{row['date']}|{pair[0]}@{pair[1]}|{runs_sig}" fallback_rows += 1 # Store perspective of one team's view of the game perspective = { "team": team_rec, "opp": opp_rec, # may be None "is_away": row["is_away"], "team_runs": row["team_runs"], "opp_runs": row["opp_runs"], "location": row["location"], "status": row["status"], "source_url": row["source_url"], "pair": pair, "date": row["date"], "game_id": row["game_id"], } if key not in buckets: buckets[key] = {"persp": [perspective], "game_id": row["game_id"]} else: buckets[key]["persp"].append(perspective) if fallback_rows: logging.info(f"Used fallback dedupe for {fallback_rows} rows without game_id.") out_rows = [] time_cache: Dict[str, Optional[str]] = {} # cache game times to avoid re-fetching # Merge perspectives and produce consolidated rows for key, bucket in buckets.items(): p = bucket["persp"] date = p[0]["date"] game_id = bucket.get("game_id", "") # Try to identify home and away perspectives p_home = next((x for x in p if x["is_away"] is False), None) p_away = next((x for x in p if x["is_away"] is True), None) # Home is the team who is not away, else fallback to the other team's opponent home_team = (p_home["team"] if p_home else (p_away["opp"] if p_away else None)) away_team = (p_away["team"] if p_away else (p_home["opp"] if p_home else None)) def pack_team(rec: Optional[TeamRec], fallback_slug: str): """Pack team record to tuple or fallback to slug-based default values.""" if rec: return rec.slug, rec.instance_id, rec.team_id, rec.name return fallback_slug, "", "", fallback_slug.replace("-", " ").title() # Attempt to get runs from home perspective home_runs = away_runs = None if p_home and isinstance(p_home["team_runs"], int) and isinstance(p_home["opp_runs"], int): home_runs = p_home["team_runs"] away_runs = p_home["opp_runs"] # Otherwise try away perspective with reversed runs elif p_away and isinstance(p_away["team_runs"], int) and isinstance(p_away["opp_runs"], int): away_runs = p_away["team_runs"] home_runs = p_away["opp_runs"] # If runs still missing, guess from first perspective, adjusting for is_away if (home_runs is None or away_runs is None) and p: one = p[0] if 
isinstance(one["team_runs"], int) and isinstance(one["opp_runs"], int): if one["is_away"]: away_runs = one["team_runs"] home_runs = one["opp_runs"] away_team = one["team"] home_team = one["opp"] if one["opp"] else home_team else: home_runs = one["team_runs"] away_runs = one["opp_runs"] home_team = one["team"] away_team = one["opp"] if one["opp"] else away_team # Fallback guesses for home and away slugs if team data missing guess_home_fallback = (p_home["team"].slug if p_home and p_home["team"] else p_away["opp"].slug if p_away and p_away["opp"] else p[0]["pair"][0]) guess_away_fallback = (p_away["team"].slug if p_away and p_away["team"] else p_home["opp"].slug if p_home and p_home["opp"] else p[0]["pair"][1]) home_slug, home_inst, home_id, home_name = pack_team(home_team, guess_home_fallback) away_slug, away_inst, away_id, away_name = pack_team(away_team, guess_away_fallback) # Determine winner and loser slugs based on runs winner_slug = winner_inst = winner_id = loser_slug = loser_inst = loser_id = "" if isinstance(home_runs, int) and isinstance(away_runs, int): if home_runs > away_runs: winner_slug, winner_inst, winner_id = home_slug, home_inst, home_id loser_slug, loser_inst, loser_id = away_slug, away_inst, away_id elif away_runs > home_runs: winner_slug, winner_inst, winner_id = away_slug, away_inst, away_id loser_slug, loser_inst, loser_id = home_slug, home_inst, home_id # Consolidate location and status from home or away perspectives loc = (p_home["location"] if p_home else "") or (p_away["location"] if p_away else "") status = (p_home["status"] if p_home else "") or (p_away["status"] if p_away else "") source_urls = sorted({x["source_url"] for x in p}) # Optionally fetch game start time time_local = "" if fetch_time and game_id: if game_id in time_cache: tval = time_cache[game_id] else: logging.debug(f"TIME: fetching game {game_id}") tval = fetch_game_time(game_id, session=session) time_cache[game_id] = tval # If no time found, wait longer before next request to be polite if tval is None: time.sleep(min(sleep * 2, 1.0)) if tval: time_local = tval logging.debug( f"MERGE: {date} {home_slug}({home_runs}) vs {away_slug}({away_runs}) " f"winner={winner_slug or 'TIE'} id={game_id} time={time_local or 'NA'}" ) # Append consolidated game record for CSV output out_rows.append({ "date_local": date, "time_local": time_local, "home_slug": home_slug, "home_instance": home_inst, "home_id": home_id, "home_name": home_name, "away_slug": away_slug, "away_instance": away_inst, "away_id": away_id, "away_name": away_name, "home_runs": "" if home_runs is None else home_runs, "away_runs": "" if away_runs is None else away_runs, "winner_slug": winner_slug, "winner_instance": winner_inst, "winner_id": winner_id, "loser_slug": loser_slug, "loser_instance": loser_inst, "loser_id": loser_id, "location": loc, "status": status, "game_id": game_id, "source_urls": " ".join(source_urls), }) if not out_rows: logging.warning("No games produced.") return # Define CSV output columns fieldnames = [ "date_local","time_local", "home_slug","home_instance","home_id","home_name", "away_slug","away_instance","away_id","away_name", "home_runs","away_runs", "winner_slug","winner_instance","winner_id", "loser_slug","loser_instance","loser_id", "location","status","game_id","source_urls", ] # Write consolidated game data to CSV with open(out, "w", newline="", encoding="utf-8") as f: w = csv.DictWriter(f, fieldnames=fieldnames) w.writeheader() for r in out_rows: w.writerow(r) logging.info(f"Wrote {len(out_rows)} games → {out}") 

if __name__ == "__main__":
    typer.run(main)