From 5cecc6e280311512b1b54ebfaf8f75c85050bbff Mon Sep 17 00:00:00 2001
From: Tony
Date: Wed, 27 Aug 2025 11:23:48 -0500
Subject: [PATCH] initial commit

---
 .gitignore               |   2 +
 2025-csyba.json          | 376 +++++++++++++++++++++++++++++++
 build_season_schedule.py | 464 +++++++++++++++++++++++++++++++++++++++
 compute_ratings.py       | 224 +++++++++++++++++++
 csyba.py                 |  93 ++++++++
 5 files changed, 1159 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 2025-csyba.json
 create mode 100644 build_season_schedule.py
 create mode 100644 compute_ratings.py
 create mode 100644 csyba.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ae0906e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/*.csv
+/*.numbers
\ No newline at end of file
diff --git a/2025-csyba.json b/2025-csyba.json
new file mode 100644
index 0000000..a5fda7d
--- /dev/null
+++ b/2025-csyba.json
@@ -0,0 +1,376 @@
+[{
+    "teamName": "Carol Stream Cheaties",
+    "team_id": "8944347",
+    "team_slug": "carol-stream-cheaties",
+    "subseason_id": "942425",
+    "instance_id": "10119604",
+    "w": "15",
+    "l": "2",
+    "t": "1",
+    "rf": "139",
+    "ra": "41",
+    "division_record": "10-2-1",
+    "division": "North",
+    "link": "https://www.csyba.com/page/show/8944347-carol-stream-cheaties?subseason=942425"
+  },
+  {
+    "teamName": "Deerfield Dynasty",
+    "team_id": "8944348",
+    "team_slug": "deerfield-dynasty",
+    "subseason_id": "942425",
+    "instance_id": "10119605",
+    "w": "15",
+    "l": "3",
+    "t": "0",
+    "rf": "152",
+    "ra": "52",
+    "division_record": "12-2-0",
+    "division": "North",
+    "link": "https://www.csyba.com/page/show/8944348-deerfield-dynasty?subseason=942425"
+  },
+  {
+    "teamName": "Buffalo Grove Marlins",
+    "team_id": "8944344",
+    "team_slug": "buffalo-grove-marlins",
+    "subseason_id": "942425",
+    "instance_id": "10119601",
+    "w": "15",
+    "l": "3",
+    "t": "0",
+    "rf": "127",
+    "ra": "47",
+    "division_record": "11-2-0",
+    "division": "North",
+    "link": "https://www.csyba.com/page/show/8944344-buffalo-grove-marlins?subseason=942425"
+  },
+  {
+    "teamName": "Buffalo Grove White Sox",
+    "team_id": "8944346",
+    "team_slug": "buffalo-grove-white-sox",
+    "subseason_id": "942425",
+    "instance_id": "10119603",
+    "w": "12",
+    "l": "5",
+    "t": "0",
+    "rf": "140",
+    "ra": "58",
+    "division_record": "10-5-0",
+    "division": "North",
+    "link": "https://www.csyba.com/page/show/8944346-buffalo-grove-white-sox?subseason=942425"
+  },
+  {
+    "teamName": "Arlington Hts Shamrocks",
+    "team_id": "8944342",
+    "team_slug": "arlington-hts-shamrocks",
+    "subseason_id": "942425",
+    "instance_id": "10119599",
+    "w": "9",
+    "l": "9",
+    "t": "3",
+    "rf": "120",
+    "ra": "119",
+    "division_record": "5-8-3",
+    "division": "North",
+    "link": "https://www.csyba.com/page/show/8944342-arlington-hts-shamrocks?subseason=942425"
+  },
+  {
+    "teamName": "Waukegan Alacranes",
+    "team_id": "9024497",
+    "team_slug": "waukegan-alacranes",
+    "subseason_id": "942425",
+    "instance_id": "10185021",
+    "w": "7",
+    "l": "7",
+    "t": "2",
+    "rf": "96",
+    "ra": "88",
+    "division_record": "6-5-2",
+    "division": "North",
+    "link": "https://www.csyba.com/page/show/9024497-waukegan-alacranes?subseason=942425"
+  },
+  {
+    "teamName": "Palatine Pelicans",
+    "team_id": "8944350",
+    "team_slug": "palatine-pelicans",
+    "subseason_id": "942425",
+    "instance_id": "10119607",
+    "w": "6",
+    "l": "10",
+    "t": "2",
+    "rf": "91",
+    "ra": "128",
+    "division_record": "3-9-2",
+    "division": "North",
+    "link": "https://www.csyba.com/page/show/8944350-palatine-pelicans?subseason=942425"
+  },
+  {
+    "teamName": "Buffalo Grove Blue Wahoos",
+    "team_id": "9071622",
+    "team_slug": "buffalo-grove-blue-wahoos",
+    "subseason_id": "942425",
+    "instance_id": "10219990",
+    "w": "5",
+    "l": "10",
+    "t": "1",
+    "rf": "57",
+    "ra": "115",
+    "division_record": "3-8-0",
+    "division": "North",
+    "link": "https://www.csyba.com/page/show/9071622-buffalo-grove-blue-wahoos?subseason=942425"
+  },
+  {
+    "teamName": "Arlington Hts Freeze",
+    "team_id": "8944343",
+    "team_slug": "arlington-hts-freeze",
+    "subseason_id": "942425",
+    "instance_id": "10119600",
+    "w": "6",
+    "l": "13",
+    "t": "0",
+    "rf": "87",
+    "ra": "116",
+    "division_record": "3-11-0",
+    "division": "North",
+    "link": "https://www.csyba.com/page/show/8944343-arlington-hts-freeze?subseason=942425"
+  },
+  {
+    "teamName": "Buffalo Grove Orioles",
+    "team_id": "8944345",
+    "team_slug": "buffalo-grove-orioles",
+    "subseason_id": "942425",
+    "instance_id": "10119602",
+    "w": "2",
+    "l": "16",
+    "t": "0",
+    "rf": "76",
+    "ra": "175",
+    "division_record": "1-12-0",
+    "division": "North",
+    "link": "https://www.csyba.com/page/show/8944345-buffalo-grove-orioles?subseason=942425"
+  },
+  {
+    "teamName": "Dunham Dash",
+    "team_id": "8944355",
+    "team_slug": "dunham-dash",
+    "subseason_id": "942425",
+    "instance_id": "10119611",
+    "w": "12",
+    "l": "3",
+    "t": "0",
+    "rf": "117",
+    "ra": "57",
+    "division_record": "9-0-0",
+    "division": "South",
+    "link": "https://www.csyba.com/page/show/8944355-dunham-dash?subseason=942425"
+  },
+  {
+    "teamName": "Skokie Vikings",
+    "team_id": "8944360",
+    "team_slug": "skokie-vikings",
+    "subseason_id": "942425",
+    "instance_id": "10119616",
+    "w": "9",
+    "l": "6",
+    "t": "1",
+    "rf": "93",
+    "ra": "72",
+    "division_record": "6-3-0",
+    "division": "South",
+    "link": "https://www.csyba.com/page/show/8944360-skokie-vikings?subseason=942425"
+  },
+  {
+    "teamName": "Elmhurst White Sox",
+    "team_id": "8944356",
+    "team_slug": "elmhurst-white-sox",
+    "subseason_id": "942425",
+    "instance_id": "10119612",
+    "w": "4",
+    "l": "3",
+    "t": "0",
+    "rf": "35",
+    "ra": "31",
+    "division_record": "3-2-0",
+    "division": "South",
+    "link": "https://www.csyba.com/page/show/8944356-elmhurst-white-sox?subseason=942425"
+  },
+  {
+    "teamName": "Lombard Expos",
+    "team_id": "8974790",
+    "team_slug": "lombard-expos",
+    "subseason_id": "942425",
+    "instance_id": "10148204",
+    "w": "8",
+    "l": "7",
+    "t": "1",
+    "rf": "97",
+    "ra": "68",
+    "division_record": "5-4-0",
+    "division": "South",
+    "link": "https://www.csyba.com/page/show/8974790-lombard-expos?subseason=942425"
+  },
+  {
+    "teamName": "Chicago Rebels",
+    "team_id": "8974058",
+    "team_slug": "chicago-rebels",
+    "subseason_id": "942425",
+    "instance_id": "10147713",
+    "w": "9",
+    "l": "9",
+    "t": "0",
+    "rf": "104",
+    "ra": "81",
+    "division_record": "6-4-0",
+    "division": "South",
+    "link": "https://www.csyba.com/page/show/8974058-chicago-rebels?subseason=942425"
+  },
+  {
+    "teamName": "Westchester Knights",
+    "team_id": "8944361",
+    "team_slug": "westchester-knights",
+    "subseason_id": "942425",
+    "instance_id": "10119617",
+    "w": "5",
+    "l": "10",
+    "t": "0",
+    "rf": "82",
+    "ra": "155",
+    "division_record": "4-4-0",
+    "division": "South",
+    "link": "https://www.csyba.com/page/show/8944361-westchester-knights?subseason=942425"
+  },
+  {
+    "teamName": "Melrose Park Thorns",
+    "team_id": "9014143",
+    "team_slug": "melrose-park-thorns",
+    "subseason_id": "942425",
+    "instance_id": "10178191",
+    "w": "5",
+    "l": "12",
+    "t": "1",
+    "rf": "106",
+    "ra": "139",
+    "division_record": "3-7-0",
"division": "South", + "link": "https://www.csyba.com/page/show/9014143-melrose-park-thorns?subseason=942425" + }, + { + "teamName": "Bedford Park Bombers", + "team_id": "8944352", + "team_slug": "bedford-park-bombers", + "subseason_id": "942425", + "instance_id": "10119608", + "w": "3", + "l": "12", + "t": "0", + "rf": "48", + "ra": "133", + "division_record": "2-6-0", + "division": "South", + "link": "https://www.csyba.com/page/show/8944352-bedford-park-bombers?subseason=942425" + }, + { + "teamName": "Skokie Classics", + "team_id": "8944359", + "team_slug": "skokie-classics", + "subseason_id": "942425", + "instance_id": "10119615", + "w": "5", + "l": "15", + "t": "1", + "rf": "105", + "ra": "177", + "division_record": "4-6-1", + "division": "South", + "link": "https://www.csyba.com/page/show/8944359-skokie-classics?subseason=942425" + }, + { + "teamName": "Park Ridge White Sox", + "team_id": "8944358", + "team_slug": "park-ridge-white-sox", + "subseason_id": "942425", + "instance_id": "10119614", + "w": "1", + "l": "11", + "t": "3", + "rf": "42", + "ra": "142", + "division_record": "0-6-1", + "division": "South", + "link": "https://www.csyba.com/page/show/8944358-park-ridge-white-sox?subseason=942425" + }, + { + "teamName": "Chicago White Sox", + "team_id": "9002208", + "team_slug": "chicago-white-sox", + "subseason_id": "942425", + "instance_id": "10168648", + "w": "19", + "l": "6", + "t": "0", + "rf": "162", + "ra": "73", + "division_record": "10-4-0", + "division": "CMBA", + "link": "https://www.csyba.com/page/show/9002208-chicago-white-sox?subseason=942425" + }, + { + "teamName": "Chicago Blazers", + "team_id": "9002204", + "team_slug": "chicago-blazers", + "subseason_id": "942425", + "instance_id": "10168644", + "w": "17", + "l": "7", + "t": "0", + "rf": "239", + "ra": "94", + "division_record": "9-4-0", + "division": "CMBA", + "link": "https://www.csyba.com/page/show/9002204-chicago-blazers?subseason=942425" + }, + { + "teamName": "Chicago Electrons", + "team_id": "9002205", + "team_slug": "chicago-electrons", + "subseason_id": "942425", + "instance_id": "10168645", + "w": "16", + "l": "6", + "t": "2", + "rf": "170", + "ra": "112", + "division_record": "9-4-0", + "division": "CMBA", + "link": "https://www.csyba.com/page/show/9002205-chicago-electrons?subseason=942425" + }, + { + "teamName": "Chicago Hounds", + "team_id": "9002206", + "team_slug": "chicago-hounds", + "subseason_id": "942425", + "instance_id": "10168646", + "w": "15", + "l": "11", + "t": "0", + "rf": "182", + "ra": "126", + "division_record": "7-8-0", + "division": "CMBA", + "link": "https://www.csyba.com/page/show/9002206-chicago-hounds?subseason=942425" + }, + { + "teamName": "Chicago Hawks", + "team_id": "9002209", + "team_slug": "chicago-hawks", + "subseason_id": "942425", + "instance_id": "10168649", + "w": "1", + "l": "25", + "t": "2", + "rf": "87", + "ra": "355", + "division_record": "0-15-0", + "division": "CMBA", + "link": "https://www.csyba.com/page/show/9002209-chicago-hawks?subseason=942425" + } +] \ No newline at end of file diff --git a/build_season_schedule.py b/build_season_schedule.py new file mode 100644 index 0000000..008d6bc --- /dev/null +++ b/build_season_schedule.py @@ -0,0 +1,464 @@ +#!/usr/bin/env python3 +# build_season_schedule.py +# +# Build a deduped season schedule from SportsEngine team-instance printable pages. +# - Assumes team-instance schedule pages are TEAM-FIRST for scores. +# - Determines home/away using the '@' marker on the opponent cell. 
+# - Deduplicates primarily by game_id (from /game/show/ links), otherwise by a fallback key. +# - Optionally fetches each game's time from the /game/show/ page ("tab_boxscores_content"). +# +# Usage: +# pip install requests beautifulsoup4 python-dateutil +# python build_season_schedule.py --subseason 942425 --teams teams.json --out season_schedule.csv +# +# Example teams.json (array): +# [ +# {"teamName":"Carol Stream Cheaties","team_id":"8944347","team_slug":"carol-stream-cheaties","subseason_id":"942425","instance_id":"10119604"}, +# ... +# ] + +import argparse +import csv +import json +import logging +import re +import time +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple +from urllib.parse import urlencode + +import requests +from bs4 import BeautifulSoup +from dateutil import parser as dtp + +# ----------------- logging ----------------- +logging.basicConfig( + level=logging.INFO, # change to DEBUG for verbose tracing + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S", +) + +# ----------------- constants ----------------- +UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) SE-Schedule/1.3 Safari/537.36" +HEADERS = {"User-Agent": UA} +PRINT_BASE = "https://www.csyba.com/schedule/print/team_instance/{iid}" +GAME_BASE = "https://www.csyba.com/game/show/{gid}" + +SCORE_RE = re.compile(r"\b(\d+)\s*[–-]\s*(\d+)\b") +GAME_LINK_RE = re.compile(r"/game/show/(\d+)") +TIME_RE = re.compile(r"\b(\d{1,2}:\d{2})\s*([ap]\.?m\.?|AM|PM)?\b", re.I) + +# ----------------- helpers ----------------- +def clean(x: str) -> str: + return re.sub(r"\s+", " ", (x or "")).strip() + +def slugify(s: str) -> str: + s = s.lower() + s = re.sub(r"[^a-z0-9]+", "-", s).strip("-") + return s + +def norm_name(s: str) -> str: + s = s.lower() + s = re.sub(r"[^a-z0-9 ]+", " ", s) + s = re.sub(r"\b(the|club|team|ll|little league|baseball|softball|youth|athletic|athletics|rec|rec\.)\b", " ", s) + s = re.sub(r"\s+", " ", s).strip() + return s + +@dataclass(frozen=True) +class TeamRec: + name: str + slug: str + team_id: str + instance_id: str + subseason_id: str + +def load_teams(teams_path: str): + """Load mapping tables from teams.json you provided.""" + with open(teams_path, "r", encoding="utf-8") as f: + arr = json.load(f) + by_instance: Dict[str, TeamRec] = {} + by_slug: Dict[str, TeamRec] = {} + by_norm: Dict[str, TeamRec] = {} + for t in arr: + rec = TeamRec( + name=str(t["teamName"]), + slug=str(t["team_slug"]), + team_id=str(t["team_id"]), + instance_id=str(t["instance_id"]), + subseason_id=str(t["subseason_id"]), + ) + by_instance[rec.instance_id] = rec + by_slug[rec.slug] = rec + by_norm[norm_name(rec.name)] = rec + return by_instance, by_slug, by_norm + +def best_match_team(opponent_text: str, by_slug, by_norm) -> Optional[TeamRec]: + """Match opponent using slug first, then normalized name, then loose containment.""" + s = slugify(opponent_text) + if s in by_slug: + return by_slug[s] + n = norm_name(opponent_text) + if n in by_norm: + return by_norm[n] + for key, rec in by_norm.items(): + if key in n or n in key: + return rec + return None + +def runs_from_team_pov(result_flag: str, s_a: str, s_b: str): + """ + Team-instance pages are TEAM-FIRST. s_a is THIS team's runs, s_b is opponent runs. + We don't reorder; we only validate with W/L/T if needed. 
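+    # Illustration of the team-first assumption (hypothetical values): a row
+    # reading "W 7-3" on a team's own schedule page yields (7, 3), i.e. 7 runs
+    # for this team and 3 for the opponent, whether the game was home or away.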
+    if not (s_a.isdigit() and s_b.isdigit()):
+        return None, None
+    a, b = int(s_a), int(s_b)
+    if result_flag == "W" and a <= b:
+        logging.debug(f"Result=W but team_runs<=opp_runs ({a}-{b}); keeping as-is (team-first).")
+    if result_flag == "L" and a >= b:
+        logging.debug(f"Result=L but team_runs>=opp_runs ({a}-{b}); keeping as-is (team-first).")
+    return a, b
+
+# ----------------- HTTP utils -----------------
+def get_soup(url: str, session: Optional[requests.Session] = None, timeout: int = 30) -> Optional[BeautifulSoup]:
+    try:
+        sess = session or requests.Session()
+        r = sess.get(url, headers=HEADERS, timeout=timeout)
+        r.raise_for_status()
+        return BeautifulSoup(r.text, "html.parser")
+    except Exception as e:
+        logging.error(f"GET failed {url}: {e}")
+        return None
+
+# ----------------- scraping -----------------
+def parse_printable(instance_id: str, subseason_id: str, session: requests.Session) -> List[dict]:
+    """Parse one team-instance printable schedule page into perspective rows."""
+    url = PRINT_BASE.format(iid=instance_id) + "?" + urlencode({
+        "schedule_type": "index",
+        "subseason": subseason_id,
+    })
+    soup = get_soup(url, session=session)
+    if not soup:
+        return []
+
+    table = soup.select_one("table")
+    if not table:
+        logging.warning(f"No table found for team_instance={instance_id}")
+        return []
+
+    games = []
+    for row_idx, tr in enumerate(table.select("tr")[1:], start=1):
+        tds = tr.select("td")
+        if len(tds) < 5:
+            continue
+
+        # Cells: Date | Result | Opponent | Location | Status
+        date_txt = clean(tds[0].get_text(" "))
+        result_txt = clean(tds[1].get_text(" "))
+        opp_txt = clean(tds[2].get_text(" "))
+        loc_txt = clean(tds[3].get_text(" "))
+        status_txt = clean(tds[4].get_text(" "))
+
+        # Date → ISO
+        try:
+            date_iso = dtp.parse(date_txt, fuzzy=True).date().isoformat()
+        except Exception:
+            date_iso = date_txt
+
+        # Pull a game_id if present (from any link in the row)
+        game_id = ""
+        for a in tr.select("a[href]"):
+            m = GAME_LINK_RE.search(a.get("href", ""))
+            if m:
+                game_id = m.group(1)
+                break
+
+        # Extract W/L/T (Result cell)
+        m_res = re.search(r"\b(W|L|T)\b", result_txt, re.I)
+        result_flag = m_res.group(1).upper() if m_res else ""
+
+        # Extract score from Result cell; if missing, also try Opponent cell
+        m_score = SCORE_RE.search(result_txt) or SCORE_RE.search(opp_txt)
+        s_a, s_b = (m_score.group(1), m_score.group(2)) if m_score else ("", "")
+
+        # Opponent + home/away flag
+        is_away = opp_txt.startswith("@")
+        opponent_name = opp_txt.lstrip("@").strip()
+
+        # Compute team/opp runs (TEAM-FIRST orientation)
+        team_runs, opp_runs = runs_from_team_pov(result_flag, s_a, s_b)
+
+        logging.debug(
+            f"PARSER: inst={instance_id} row={row_idx} date={date_iso} "
+            f"res={result_flag} scores=({s_a}-{s_b}) away={is_away} "
+            f"→ team_runs={team_runs}, opp_runs={opp_runs}"
+        )
+
+        games.append({
+            "team_instance": instance_id,
+            "game_id": game_id,        # may be empty
+            "date": date_iso,
+            "result": result_flag,     # W/L/T from THIS TEAM's perspective
+            "team_runs": team_runs,
+            "opp_runs": opp_runs,
+            "opponent_name": opponent_name,
+            "is_away": is_away,
+            "location": loc_txt,
+            "status": status_txt,
+            "source_url": url,
+        })
+
+    logging.info(f"Team {instance_id}: parsed {len(games)} rows")
+    return games
+
+def fetch_game_time(game_id: str, session: requests.Session) -> Optional[str]:
+    """
+    Fetch the game's local start time from the /game/show/ page.
+    Looks inside the tab with id 'tab_boxscores_content' but also
+    falls back to scanning the page for common time patterns.
+    Returns a zero-padded 24h 'HH:MM' string or None if unavailable.
+    """
+    if not game_id:
+        return None
+    url = GAME_BASE.format(gid=game_id)
+    soup = get_soup(url, session=session, timeout=30)
+    if not soup:
+        return None
+
+    # Prefer the boxscores tab content
+    box = soup.select_one("#tab_boxscores_content") or soup.select_one("#tab_boxscore_content")
+    text = ""
+    if box:
+        text = " ".join(box.stripped_strings)
+    else:
+        # Fall back to page-wide text (but avoid pulling too much)
+        main = soup.select_one("div.page") or soup
+        text = " ".join((main.get_text(" ", strip=True) or "")[:4000].split())
+
+    m = TIME_RE.search(text)
+    if not m:
+        logging.debug(f"TIME: no time found in game {game_id}")
+        return None
+
+    hhmm = m.group(1)
+    ampm = (m.group(2) or "").lower().replace(".", "")
+    try:
+        # Normalize to 24h HH:MM
+        from datetime import datetime
+        if ampm:
+            dt = datetime.strptime(f"{hhmm} {ampm.upper()}", "%I:%M %p")
+        else:
+            # already 24h-ish
+            dt = datetime.strptime(hhmm, "%H:%M")
+        return dt.strftime("%H:%M")
+    except Exception:
+        # Be forgiving (e.g., "6:00pm" without space)
+        try:
+            from datetime import datetime
+            if ampm:
+                dt = datetime.strptime(f"{hhmm}{ampm}", "%I:%M%p")
+                return dt.strftime("%H:%M")
+        except Exception:
+            logging.debug(f"TIME: could not normalize '{hhmm} {ampm}' for game {game_id}")
+        return None
+
+# ----------------- build & merge -----------------
+def main():
+    ap = argparse.ArgumentParser(description="Build a deduped season schedule with IDs, winners/losers, runs, and times.")
+    ap.add_argument("--subseason", required=True, help="Subseason ID, e.g. 942425")
+    ap.add_argument("--teams", required=True, help="Path to teams.json (array with team_id, team_slug, instance_id, teamName)")
+    ap.add_argument("--out", default="season_schedule.csv", help="Output CSV path")
+    ap.add_argument("--fetch-time", action="store_true", help="Fetch game time from /game/show/")
+    ap.add_argument("--sleep", type=float, default=0.35, help="Delay between requests (seconds)")
+    args = ap.parse_args()
+
+    by_instance, by_slug, by_norm = load_teams(args.teams)
+    instance_ids = sorted(by_instance.keys())
+
+    session = requests.Session()
+    session.headers.update(HEADERS)
+
+    # Scrape all teams
+    raw: List[dict] = []
+    for i, iid in enumerate(instance_ids, 1):
+        logging.info(f"[{i}/{len(instance_ids)}] Fetching schedule for instance {iid}")
+        raw.extend(parse_printable(iid, args.subseason, session=session))
+        time.sleep(args.sleep)  # be polite
+
+    def rec_from_instance(iid: str) -> Optional[TeamRec]:
+        return by_instance.get(iid)
+
+    def match_opponent(text: str) -> Optional[TeamRec]:
+        return best_match_team(text, by_slug, by_norm)
+
+    # Group by game_id if available; otherwise fall back on (date + unordered pair + raw score text if present)
+    buckets: Dict[str, dict] = {}
+    fallback_rows = 0
+
+    for row in raw:
+        team_rec = rec_from_instance(row["team_instance"])
+        if not team_rec:
+            logging.warning(f"Unknown instance {row['team_instance']}; skipping")
+            continue
+
+        opp_rec = match_opponent(row["opponent_name"])
+        opp_slug = opp_rec.slug if opp_rec else slugify(row["opponent_name"])
+        pair = tuple(sorted([team_rec.slug, opp_slug]))
+
+        if row["game_id"]:
+            key = f"id:{row['game_id']}"
+        else:
+            runs_sig = ""
+            if isinstance(row["team_runs"], int) and isinstance(row["opp_runs"], int):
+                runs_sig = f"{row['team_runs']}-{row['opp_runs']}"
+            key = f"fb:{row['date']}|{pair[0]}@{pair[1]}|{runs_sig}"
+            fallback_rows += 1
+
+        perspective = {
+            "team": team_rec,
+            "opp": opp_rec,  # may be None
+            "is_away": row["is_away"],
+            "team_runs": row["team_runs"],
+            "opp_runs": row["opp_runs"],
+            "location": row["location"],
+            "status": row["status"],
+            "source_url": row["source_url"],
+            "pair": pair,
+            "date": row["date"],
+            "game_id": row["game_id"],
+        }
+
+        if key not in buckets:
+            buckets[key] = {"persp": [perspective], "game_id": row["game_id"]}
+        else:
+            buckets[key]["persp"].append(perspective)
+
+    if fallback_rows:
+        logging.info(f"Used fallback dedupe for {fallback_rows} rows without game_id.")
+
+    # Merge perspectives into a single home/away row
+    out_rows = []
+    time_cache: Dict[str, Optional[str]] = {}
+
+    for key, bucket in buckets.items():
+        p = bucket["persp"]
+        date = p[0]["date"]
+        game_id = bucket.get("game_id", "")
+
+        # Identify home/away perspectives
+        p_home = next((x for x in p if x["is_away"] is False), None)
+        p_away = next((x for x in p if x["is_away"] is True), None)
+
+        # Team identities
+        home_team = (p_home["team"] if p_home else (p_away["opp"] if p_away else None))
+        away_team = (p_away["team"] if p_away else (p_home["opp"] if p_home else None))
+
+        def pack_team(rec: Optional[TeamRec], fallback_slug: str):
+            if rec:
+                return rec.slug, rec.instance_id, rec.team_id, rec.name
+            return fallback_slug, "", "", fallback_slug.replace("-", " ").title()
+
+        # Prefer runs from the explicit perspective (home if available; otherwise away)
+        home_runs = away_runs = None
+        if p_home and isinstance(p_home["team_runs"], int) and isinstance(p_home["opp_runs"], int):
+            home_runs = p_home["team_runs"]
+            away_runs = p_home["opp_runs"]
+        elif p_away and isinstance(p_away["team_runs"], int) and isinstance(p_away["opp_runs"], int):
+            away_runs = p_away["team_runs"]
+            home_runs = p_away["opp_runs"]
+
+        # Fallback: single perspective present but numbers known → place by is_away
+        if (home_runs is None or away_runs is None) and p:
+            one = p[0]
+            if isinstance(one["team_runs"], int) and isinstance(one["opp_runs"], int):
+                if one["is_away"]:
+                    away_runs = one["team_runs"]; home_runs = one["opp_runs"]
+                    away_team = one["team"]; home_team = one["opp"] if one["opp"] else home_team
+                else:
+                    home_runs = one["team_runs"]; away_runs = one["opp_runs"]
+                    home_team = one["team"]; away_team = one["opp"] if one["opp"] else away_team
+
+        # Pack final team identifiers (fallback slug = guess from perspectives)
+        guess_home_fallback = (p_home["team"].slug if p_home and p_home["team"] else
+                               p_away["opp"].slug if p_away and p_away["opp"] else
+                               p[0]["pair"][0])
+        guess_away_fallback = (p_away["team"].slug if p_away and p_away["team"] else
+                               p_home["opp"].slug if p_home and p_home["opp"] else
+                               p[0]["pair"][1])
+
+        home_slug, home_inst, home_id, home_name = pack_team(home_team, guess_home_fallback)
+        away_slug, away_inst, away_id, away_name = pack_team(away_team, guess_away_fallback)
+
+        # Winner/loser
+        winner_slug = winner_inst = winner_id = loser_slug = loser_inst = loser_id = ""
+        if isinstance(home_runs, int) and isinstance(away_runs, int):
+            if home_runs > away_runs:
+                winner_slug, winner_inst, winner_id = home_slug, home_inst, home_id
+                loser_slug, loser_inst, loser_id = away_slug, away_inst, away_id
+            elif away_runs > home_runs:
+                winner_slug, winner_inst, winner_id = away_slug, away_inst, away_id
+                loser_slug, loser_inst, loser_id = home_slug, home_inst, home_id
+
+        # Meta from perspectives
+        loc = (p_home["location"] if p_home else "") or (p_away["location"] if p_away else "")
+        status = (p_home["status"] if p_home else "") or (p_away["status"] if p_away else "")
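+        # Keep every printable page that contributed a perspective (one or two
+        # URLs) so each merged row can be traced back to its source pages.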
+        source_urls = sorted({x["source_url"] for x in p})
+
+        # -------- NEW: fetch game start time from game page --------
+        time_local = ""
+        if args.fetch_time and game_id:
+            if game_id in time_cache:
+                tval = time_cache[game_id]
+            else:
+                logging.debug(f"TIME: fetching game {game_id}")
+                tval = fetch_game_time(game_id, session=session)
+                time_cache[game_id] = tval
+                if tval is None:
+                    # small backoff to be nice if many misses
+                    time.sleep(min(args.sleep * 2, 1.0))
+            if tval:
+                time_local = tval
+
+        logging.debug(
+            f"MERGE: {date} {home_slug}({home_runs}) vs {away_slug}({away_runs}) "
+            f"winner={winner_slug or 'TIE'} id={game_id} time={time_local or 'NA'}"
+        )
+
+        out_rows.append({
+            "date_local": date,
+            "time_local": time_local,
+            "home_slug": home_slug, "home_instance": home_inst, "home_id": home_id, "home_name": home_name,
+            "away_slug": away_slug, "away_instance": away_inst, "away_id": away_id, "away_name": away_name,
+            "home_runs": "" if home_runs is None else home_runs,
+            "away_runs": "" if away_runs is None else away_runs,
+            "winner_slug": winner_slug, "winner_instance": winner_inst, "winner_id": winner_id,
+            "loser_slug": loser_slug, "loser_instance": loser_inst, "loser_id": loser_id,
+            "location": loc, "status": status,
+            "game_id": game_id,
+            "source_urls": " ".join(source_urls),
+        })
+
+    if not out_rows:
+        logging.warning("No games produced.")
+        return
+
+    fieldnames = [
+        "date_local","time_local",
+        "home_slug","home_instance","home_id","home_name",
+        "away_slug","away_instance","away_id","away_name",
+        "home_runs","away_runs",
+        "winner_slug","winner_instance","winner_id",
+        "loser_slug","loser_instance","loser_id",
+        "location","status","game_id","source_urls",
+    ]
+    with open(args.out, "w", newline="", encoding="utf-8") as f:
+        w = csv.DictWriter(f, fieldnames=fieldnames)
+        w.writeheader()
+        for r in out_rows:
+            w.writerow(r)
+
+    logging.info(f"Wrote {len(out_rows)} games → {args.out}")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/compute_ratings.py b/compute_ratings.py
new file mode 100644
index 0000000..9d7a59e
--- /dev/null
+++ b/compute_ratings.py
@@ -0,0 +1,224 @@
+#!/usr/bin/env python3
+"""
+Rank baseball teams from a season_schedule.csv that has columns:
+date_local,time_local,home_slug,home_instance,home_id,home_name,
+away_slug,away_instance,away_id,away_name,home_runs,away_runs,
+winner_slug,winner_instance,winner_id,loser_slug,loser_instance,loser_id,
+location,status,game_id,source_urls
+
+Output CSV columns (one row per team):
+Team, GP, W, L, T, WinPct, RS, RA, RunDiff, PythagoreanWinPct,
+MasseyRating, EloRating, StrengthOfSchedule, CompositeRating
+
+Defaults:
+- Team identity uses *_name; switch to slugs with --team-id slugs
+- Pythagorean exponent = 1.83
+- Massey caps margins at 8 runs and subtracts estimated home-field runs
+- Elo: start 1500, K=24, home bonus H=30, margin factor ln(|m|+1) capped at 2.0
+- Elo averaged over 20 random shuffles (reduces order dependence)
+"""
+
+from __future__ import annotations
+import argparse
+import numpy as np
+import pandas as pd
+
+def parse_args():
+    p = argparse.ArgumentParser(description="Power ratings from season_schedule.csv")
+    p.add_argument("--in", dest="inp", required=True, help="Input CSV (season_schedule.csv)")
+    p.add_argument("--out", dest="out", required=True, help="Output ratings CSV")
+    p.add_argument("--team-id", choices=["names","slugs"], default="names",
+                   help="Use team names or slugs as identifiers (default: names)")
+    p.add_argument("--final-status", default=None,
+                   help="Only include games where status == this value (e.g., 'final'). If omitted, any row with scores is included.")
+    # Tunables
+    p.add_argument("--pyexp", type=float, default=1.83, help="Pythagorean exponent")
+    p.add_argument("--massey-cap", type=float, default=8.0, help="Cap for run margins in Massey")
+    p.add_argument("--no-massey-home-adj", action="store_true",
+                   help="Disable subtracting estimated home-field runs in Massey")
+    p.add_argument("--elo-k", type=float, default=24.0, help="Elo K-factor")
+    p.add_argument("--elo-home", type=float, default=30.0, help="Elo home bonus (points)")
+    p.add_argument("--elo-mcap", type=float, default=2.0, help="Cap for margin factor ln(|m|+1)")
+    p.add_argument("--elo-shuffles", type=int, default=20, help="Random shuffles to average Elo")
+    p.add_argument("--elo-seed", type=int, default=42, help="RNG seed for shuffles")
+    return p.parse_args()
+
+def load_games(a) -> pd.DataFrame:
+    df = pd.read_csv(a.inp)
+    # Choose identifiers
+    home_id_col = "home_name" if a.team_id == "names" else "home_slug"
+    away_id_col = "away_name" if a.team_id == "names" else "away_slug"
+    for c in [home_id_col, away_id_col, "home_runs", "away_runs"]:
+        if c not in df.columns:
+            raise ValueError(f"Missing required column: {c}")
+
+    # Optional status filter (helps exclude postponed/canceled)
+    if a.final_status is not None and "status" in df.columns:
+        df = df[df["status"].astype(str).str.lower() == str(a.final_status).lower()]
+
+    # Keep only games with numeric scores
+    df = df.copy()
+    df["home_runs"] = pd.to_numeric(df["home_runs"], errors="coerce")
+    df["away_runs"] = pd.to_numeric(df["away_runs"], errors="coerce")
+    df = df.dropna(subset=[home_id_col, away_id_col, "home_runs", "away_runs"])
+
+    # Parse datetime (robust to a missing date_local or time_local column)
+    if "date_local" in df.columns:
+        date = pd.to_datetime(df["date_local"], errors="coerce")
+    else:
+        date = pd.Series(pd.NaT, index=df.index)
+    dt = date
+    if "time_local" in df.columns:
+        # Combine date + time strings; fall back to date-only where time is missing
+        dt = pd.to_datetime(
+            date.dt.strftime("%Y-%m-%d").fillna("") + " " +
+            df["time_local"].fillna("").astype(str),
+            errors="coerce",
+        )
+        dt = dt.fillna(date)
+    df_out = pd.DataFrame({
+        "Date": dt,
+        "HomeTeam": df[home_id_col].astype(str),
+        "AwayTeam": df[away_id_col].astype(str),
+        "HomeRuns": df["home_runs"].astype(int),
+        "AwayRuns": df["away_runs"].astype(int),
+    })
+    df_out["Margin"] = df_out["HomeRuns"] - df_out["AwayRuns"]
+    df_out["Result"] = np.where(df_out["HomeRuns"] > df_out["AwayRuns"], "H",
+                       np.where(df_out["HomeRuns"] < df_out["AwayRuns"], "A", "T"))
+    return df_out.reset_index(drop=True)
+
+def aggregate_team_stats(df: pd.DataFrame) -> pd.DataFrame:
+    teams = pd.Index(sorted(set(df["HomeTeam"]).union(df["AwayTeam"])), name="Team")
+    stats = pd.DataFrame(index=teams, columns=["W","L","T","RS","RA"], data=0)
+    for _, r in df.iterrows():
+        h, a = r["HomeTeam"], r["AwayTeam"]
+        hr, ar = int(r["HomeRuns"]), int(r["AwayRuns"])
+        stats.at[h,"RS"] += hr; stats.at[h,"RA"] += ar
+        stats.at[a,"RS"] += ar; stats.at[a,"RA"] += hr
+        if hr > ar:
+            stats.at[h,"W"] += 1; stats.at[a,"L"] += 1
+        elif hr < ar:
+            stats.at[a,"W"] += 1; stats.at[h,"L"] += 1
+        else:
+            stats.at[h,"T"] += 1; stats.at[a,"T"] += 1
+    stats = stats.astype(int)
+    stats["GP"] = stats["W"] + stats["L"] + stats["T"]
+    stats["WinPct"] = (stats["W"] + 0.5 * stats["T"]) / stats["GP"].replace(0, np.nan)
+    stats["RunDiff"] = stats["RS"] - stats["RA"]
+    return stats.reset_index()
+
+def pythagorean(rs: pd.Series, ra: pd.Series, exp: float) -> pd.Series:
+    rs = rs.clip(lower=0); ra = ra.clip(lower=0)
+    num = np.power(rs, exp); den = num + np.power(ra, exp)
+    with np.errstate(divide="ignore", invalid="ignore"):
+        p = np.where(den > 0, num / den, 0.5)
+    return pd.Series(p, index=rs.index)
+
+def estimate_home_field_runs(df: pd.DataFrame) -> float:
+    return float(df["Margin"].mean()) if len(df) else 0.0
+
+def massey(df: pd.DataFrame, cap: float, subtract_home: bool) -> tuple[pd.Series, float]:
+    teams = sorted(set(df["HomeTeam"]).union(df["AwayTeam"]))
+    idx = {t: i for i, t in enumerate(teams)}
+    y = df["Margin"].astype(float).to_numpy()
+    if cap and cap > 0:
+        y = np.clip(y, -cap, cap)
+    h_est = estimate_home_field_runs(df)
+    if subtract_home:
+        y = y - h_est
+    G, N = len(df), len(teams)
+    A = np.zeros((G+1, N), dtype=float)
+    for r_i, r in enumerate(df.itertuples(index=False)):
+        A[r_i, idx[r.HomeTeam]] = 1.0
+        A[r_i, idx[r.AwayTeam]] = -1.0
+    A[G, :] = 1.0  # sum-to-zero constraint row pins the rating scale
+    y_ext = np.concatenate([y, [0.0]])
+    r_sol, *_ = np.linalg.lstsq(A, y_ext, rcond=None)
+    return pd.Series(r_sol, index=teams), (h_est if subtract_home else 0.0)
+
+def elo_expected(ra: float, rb: float) -> float:
+    return 1.0 / (1.0 + 10.0 ** (-(ra - rb) / 400.0))
+
+def elo_once(df: pd.DataFrame, K: float, H: float, mcap: float, init: dict[str,float]) -> dict[str,float]:
+    ratings = dict(init)
+    for _, r in df.iterrows():
+        h, a = r["HomeTeam"], r["AwayTeam"]
+        hr, ar = int(r["HomeRuns"]), int(r["AwayRuns"])
+        margin = hr - ar
+        Eh = elo_expected(ratings[h] + H, ratings[a])
+        Sh, Sa = (1.0, 0.0) if hr > ar else ((0.0, 1.0) if hr < ar else (0.5, 0.5))
+        M = np.log(abs(margin) + 1.0)
+        if mcap is not None:
+            M = min(M, mcap)
+        ratings[h] += K * M * (Sh - Eh)
+        ratings[a] += K * M * (Sa - (1.0 - Eh))  # expected scores sum to 1
+    return ratings
+
+def elo(df: pd.DataFrame, K=24.0, H=30.0, mcap=2.0, shuffles=20, seed=42) -> pd.Series:
+    teams = sorted(set(df["HomeTeam"]).union(df["AwayTeam"]))
+    base = {t: 1500.0 for t in teams}
+    # Baseline pass in chronological order (Date may be NaT; sort is stable)
+    df0 = df.sort_values(["Date"]).reset_index(drop=True)
+    r_first = elo_once(df0, K, H, mcap, base)
+    rng = np.random.default_rng(seed)
+    vals = {t: [r_first[t]] for t in teams}
+    for _ in range(max(0, shuffles - 1)):
+        idx = np.arange(len(df0)); rng.shuffle(idx)
+        r = elo_once(df0.iloc[idx].reset_index(drop=True), K, H, mcap, base)
+        for t in teams:
+            vals[t].append(r[t])
+    return pd.Series({t: float(np.mean(vals[t])) for t in teams}).sort_index()
+
+def zscore(s: pd.Series) -> pd.Series:
+    mu, sd = s.mean(), s.std(ddof=0)
+    return pd.Series(0.0, index=s.index) if (sd == 0 or np.isnan(sd)) else (s - mu) / sd
+
+def main():
+    a = parse_args()
+    games = load_games(a)
+
+    # Aggregates
+    team = aggregate_team_stats(games)
+    team["PythagoreanWinPct"] = pythagorean(team["RS"], team["RA"], a.pyexp)
+
+    # Ratings
+    massey_r, h_runs = massey(games, cap=a.massey_cap, subtract_home=(not a.no_massey_home_adj))
+
+    # Strength of schedule: average Massey rating of opponents faced
+    opps = {t: [] for t in massey_r.index}
+    for _, r in games.iterrows():
+        opps[r["HomeTeam"]].append(r["AwayTeam"])
+        opps[r["AwayTeam"]].append(r["HomeTeam"])
+    sos_series = pd.Series({t: (float(massey_r[opps[t]].mean()) if opps[t] else 0.0) for t in opps})
+
+    elo_r = elo(games, K=a.elo_k, H=a.elo_home, mcap=a.elo_mcap, shuffles=a.elo_shuffles, seed=a.elo_seed)
+
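+    # The composite blends the three signals after z-scoring them onto a common
+    # scale; the 0.45/0.35/0.20 weights below are fixed in code, not CLI-tunable.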
+    # Merge
+    out = team.set_index("Team")
+    out["MasseyRating"] = massey_r
+    out["EloRating"] = elo_r
+    out["StrengthOfSchedule"] = sos_series
+
+    # Composite
+    Z_r, Z_e, Z_p = zscore(out["MasseyRating"]), zscore(out["EloRating"]), zscore(out["PythagoreanWinPct"])
+    out["CompositeRating"] = 0.45*Z_r + 0.35*Z_e + 0.20*Z_p
+
+    out = out.reset_index()
+    out = out[[
+        "Team","GP","W","L","T","WinPct","RS","RA","RunDiff",
+        "PythagoreanWinPct","MasseyRating","EloRating","StrengthOfSchedule","CompositeRating"
+    ]].sort_values("CompositeRating", ascending=False)
+
+    # Round for readability
+    for c in ["WinPct","PythagoreanWinPct","MasseyRating","EloRating","StrengthOfSchedule","CompositeRating"]:
+        out[c] = out[c].astype(float).round(5)
+
+    out.to_csv(a.out, index=False)
+    print(f"Done. Estimated home-field (runs) used in Massey: {h_runs:.3f}")
+    print(f"Teams ranked: {len(out)} | Games processed: {len(games)}")
+    print(f"Output -> {a.out}")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/csyba.py b/csyba.py
new file mode 100644
index 0000000..36126f4
--- /dev/null
+++ b/csyba.py
@@ -0,0 +1,93 @@
+import requests, re, time, csv, logging
+from bs4 import BeautifulSoup
+from dateutil import parser as dtp
+
+# --- Logging setup ---
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    datefmt="%H:%M:%S"
+)
+
+HEADERS = {"User-Agent": "Mozilla/5.0"}
+SUBSEASON_ID = "942425"
+
+TEAM_INSTANCES = [
+    "10119604","10119605","10119601","10119603","10119599","10185021","10119607",
+    "10219990","10119600","10119602","10119611","10119616","10119612","10148204",
+    "10147713","10119617","10178191","10119608","10119615","10119614","10168648",
+    "10168644","10168645","10168646","10168649"
+]
+
+def clean(x):
+    return re.sub(r"\s+", " ", x or "").strip()
+
+def fetch_team_schedule(iid):
+    url = f"https://www.csyba.com/schedule/print/team_instance/{iid}?schedule_type=index&subseason={SUBSEASON_ID}"
+    try:
+        r = requests.get(url, headers=HEADERS, timeout=30)
+        r.raise_for_status()
+    except Exception as e:
+        logging.error(f"Failed to fetch team {iid}: {e}")
+        return []
+
+    soup = BeautifulSoup(r.text, "html.parser")
+    games = []
+    for tr in soup.select("table tr")[1:]:  # skip header row
+        tds = tr.select("td")
+        if len(tds) < 5:
+            continue
+        date_txt, result_txt, opp_txt, loc_txt, status_txt = [clean(td.get_text(" ")) for td in tds[:5]]
+        # Parse date
+        try:
+            date_iso = dtp.parse(date_txt, fuzzy=True).date().isoformat()
+        except Exception:
+            date_iso = date_txt
+        # Extract result/score (scores on these pages are team-first, not home/away)
+        m_res = re.search(r"\b(W|L|T)\b", result_txt, re.I)
+        result = m_res.group(1).upper() if m_res else ""
+        m_score = re.search(r"(\d+)\s*[-–]\s*(\d+)", result_txt)
+        ts, opp_s = (m_score.group(1), m_score.group(2)) if m_score else ("", "")
+        away_flag = opp_txt.startswith("@")
+        opponent = opp_txt.lstrip("@").strip()
+        games.append({
+            "team_instance": iid,
+            "date": date_iso,
+            "result": result,
+            "score": f"{ts}-{opp_s}" if ts else "",
+            "team_score": ts,
+            "opp_score": opp_s,
+            "opponent": opponent,
+            "is_away": away_flag,
+            "location": loc_txt,
+            "status": status_txt,
+            "source_url": url
+        })
+    logging.info(f"Team {iid}: parsed {len(games)} games")
+    return games
+
+def main():
+    all_games = []
+    for i, iid in enumerate(TEAM_INSTANCES, start=1):
+        logging.info(f"[{i}/{len(TEAM_INSTANCES)}] Fetching schedule for team {iid}")
+        all_games.extend(fetch_team_schedule(iid))
+        time.sleep(0.5)
+
+    # Drop exact repeat rows. Key: (date, sorted(opponent, team_instance), score).
+    # Note this does NOT merge the two perspectives of one game (each team's row
+    # keys differently); build_season_schedule.py does the real cross-team dedupe.
+    unique = {}
+    for g in all_games:
+        key = (g["date"], tuple(sorted([g["opponent"], g["team_instance"]])), g["score"])
+        if key not in unique:
+            unique[key] = g
+    deduped_games = list(unique.values())
+
+    if not deduped_games:
+        logging.warning("No games parsed; nothing to write.")
+        return
+
+    out_file = "season_games.csv"
+    with open(out_file, "w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=deduped_games[0].keys())
+        writer.writeheader()
+        writer.writerows(deduped_games)
+
+    logging.info(f"Finished. {len(all_games)} raw rows → {len(deduped_games)} unique games saved to {out_file}")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file