initial commit

This commit is contained in:
2025-08-27 11:23:48 -05:00
commit 5cecc6e280
5 changed files with 1159 additions and 0 deletions

2
.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
/*.csv
/*.numbers

376
2025-csyba.json Normal file
View File

@@ -0,0 +1,376 @@
[{
"teamName": "Carol Stream Cheaties",
"team_id": "8944347",
"team_slug": "carol-stream-cheaties",
"subseason_id": "942425",
"instance_id": "10119604",
"w": "15",
"l": "2",
"t": "1",
"rf": "139",
"ra": "41",
"division_record": "10-2-1",
"division": "North",
"link": "https://www.csyba.com/page/show/8944347-carol-stream-cheaties?subseason=942425"
},
{
"teamName": "Deerfield Dynasty",
"team_id": "8944348",
"team_slug": "deerfield-dynasty",
"subseason_id": "942425",
"instance_id": "10119605",
"w": "15",
"l": "3",
"t": "0",
"rf": "152",
"ra": "52",
"division_record": "12-2-0",
"division": "North",
"link": "https://www.csyba.com/page/show/8944348-deerfield-dynasty?subseason=942425"
},
{
"teamName": "Buffalo Grove Marlins",
"team_id": "8944344",
"team_slug": "buffalo-grove-marlins",
"subseason_id": "942425",
"instance_id": "10119601",
"w": "15",
"l": "3",
"t": "0",
"rf": "127",
"ra": "47",
"division_record": "11-2-0",
"division": "North",
"link": "https://www.csyba.com/page/show/8944344-buffalo-grove-marlins?subseason=942425"
},
{
"teamName": "Buffalo Grove White Sox",
"team_id": "8944346",
"team_slug": "buffalo-grove-white-sox",
"subseason_id": "942425",
"instance_id": "10119603",
"w": "12",
"l": "5",
"t": "0",
"rf": "140",
"ra": "58",
"division_record": "10-5-0",
"division": "North",
"link": "https://www.csyba.com/page/show/8944346-buffalo-grove-white-sox?subseason=942425"
},
{
"teamName": "Arlington Hts Shamrocks",
"team_id": "8944342",
"team_slug": "arlington-hts-shamrocks",
"subseason_id": "942425",
"instance_id": "10119599",
"w": "9",
"l": "9",
"t": "3",
"rf": "120",
"ra": "119",
"division_record": "5-8-3",
"division": "North",
"link": "https://www.csyba.com/page/show/8944342-arlington-hts-shamrocks?subseason=942425"
},
{
"teamName": "Waukegan Alacranes",
"team_id": "9024497",
"team_slug": "waukegan-alacranes",
"subseason_id": "942425",
"instance_id": "10185021",
"w": "7",
"l": "7",
"t": "2",
"rf": "96",
"ra": "88",
"division_record": "6-5-2",
"division": "North",
"link": "https://www.csyba.com/page/show/9024497-waukegan-alacranes?subseason=942425"
},
{
"teamName": "Palatine Pelicans",
"team_id": "8944350",
"team_slug": "palatine-pelicans",
"subseason_id": "942425",
"instance_id": "10119607",
"w": "6",
"l": "10",
"t": "2",
"rf": "91",
"ra": "128",
"division_record": "3-9-2",
"division": "North",
"link": "https://www.csyba.com/page/show/8944350-palatine-pelicans?subseason=942425"
},
{
"teamName": "Buffalo Grove Blue Wahoos",
"team_id": "9071622",
"team_slug": "buffalo-grove-blue-wahoos",
"subseason_id": "942425",
"instance_id": "10219990",
"w": "5",
"l": "10",
"t": "1",
"rf": "57",
"ra": "115",
"division_record": "3-8-0",
"division": "North",
"link": "https://www.csyba.com/page/show/9071622-buffalo-grove-blue-wahoos?subseason=942425"
},
{
"teamName": "Arlington Hts Freeze",
"team_id": "8944343",
"team_slug": "arlington-hts-freeze",
"subseason_id": "942425",
"instance_id": "10119600",
"w": "6",
"l": "13",
"t": "0",
"rf": "87",
"ra": "116",
"division_record": "3-11-0",
"division": "North",
"link": "https://www.csyba.com/page/show/8944343-arlington-hts-freeze?subseason=942425"
},
{
"teamName": "Buffalo Grove Orioles",
"team_id": "8944345",
"team_slug": "buffalo-grove-orioles",
"subseason_id": "942425",
"instance_id": "10119602",
"w": "2",
"l": "16",
"t": "0",
"rf": "76",
"ra": "175",
"division_record": "1-12-0",
"division": "North",
"link": "https://www.csyba.com/page/show/8944345-buffalo-grove-orioles?subseason=942425"
},
{
"teamName": "Dunham Dash",
"team_id": "8944355",
"team_slug": "dunham-dash",
"subseason_id": "942425",
"instance_id": "10119611",
"w": "12",
"l": "3",
"t": "0",
"rf": "117",
"ra": "57",
"division_record": "9-0-0",
"division": "South",
"link": "https://www.csyba.com/page/show/8944355-dunham-dash?subseason=942425"
},
{
"teamName": "Skokie Vikings",
"team_id": "8944360",
"team_slug": "skokie-vikings",
"subseason_id": "942425",
"instance_id": "10119616",
"w": "9",
"l": "6",
"t": "1",
"rf": "93",
"ra": "72",
"division_record": "6-3-0",
"division": "South",
"link": "https://www.csyba.com/page/show/8944360-skokie-vikings?subseason=942425"
},
{
"teamName": "Elmhurst White Sox",
"team_id": "8944356",
"team_slug": "elmhurst-white-sox",
"subseason_id": "942425",
"instance_id": "10119612",
"w": "4",
"l": "3",
"t": "0",
"rf": "35",
"ra": "31",
"division_record": "3-2-0",
"division": "South",
"link": "https://www.csyba.com/page/show/8944356-elmhurst-white-sox?subseason=942425"
},
{
"teamName": "Lombard Expos",
"team_id": "8974790",
"team_slug": "lombard-expos",
"subseason_id": "942425",
"instance_id": "10148204",
"w": "8",
"l": "7",
"t": "1",
"rf": "97",
"ra": "68",
"division_record": "5-4-0",
"division": "South",
"link": "https://www.csyba.com/page/show/8974790-lombard-expos?subseason=942425"
},
{
"teamName": "Chicago Rebels",
"team_id": "8974058",
"team_slug": "chicago-rebels",
"subseason_id": "942425",
"instance_id": "10147713",
"w": "9",
"l": "9",
"t": "0",
"rf": "104",
"ra": "81",
"division_record": "6-4-0",
"division": "South",
"link": "https://www.csyba.com/page/show/8974058-chicago-rebels?subseason=942425"
},
{
"teamName": "Westchester Knights",
"team_id": "8944361",
"team_slug": "westchester-knights",
"subseason_id": "942425",
"instance_id": "10119617",
"w": "5",
"l": "10",
"t": "0",
"rf": "82",
"ra": "155",
"division_record": "4-4-0",
"division": "South",
"link": "https://www.csyba.com/page/show/8944361-westchester-knights?subseason=942425"
},
{
"teamName": "Melrose Park Thorns",
"team_id": "9014143",
"team_slug": "melrose-park-thorns",
"subseason_id": "942425",
"instance_id": "10178191",
"w": "5",
"l": "12",
"t": "1",
"rf": "106",
"ra": "139",
"division_record": "3-7-0",
"division": "South",
"link": "https://www.csyba.com/page/show/9014143-melrose-park-thorns?subseason=942425"
},
{
"teamName": "Bedford Park Bombers",
"team_id": "8944352",
"team_slug": "bedford-park-bombers",
"subseason_id": "942425",
"instance_id": "10119608",
"w": "3",
"l": "12",
"t": "0",
"rf": "48",
"ra": "133",
"division_record": "2-6-0",
"division": "South",
"link": "https://www.csyba.com/page/show/8944352-bedford-park-bombers?subseason=942425"
},
{
"teamName": "Skokie Classics",
"team_id": "8944359",
"team_slug": "skokie-classics",
"subseason_id": "942425",
"instance_id": "10119615",
"w": "5",
"l": "15",
"t": "1",
"rf": "105",
"ra": "177",
"division_record": "4-6-1",
"division": "South",
"link": "https://www.csyba.com/page/show/8944359-skokie-classics?subseason=942425"
},
{
"teamName": "Park Ridge White Sox",
"team_id": "8944358",
"team_slug": "park-ridge-white-sox",
"subseason_id": "942425",
"instance_id": "10119614",
"w": "1",
"l": "11",
"t": "3",
"rf": "42",
"ra": "142",
"division_record": "0-6-1",
"division": "South",
"link": "https://www.csyba.com/page/show/8944358-park-ridge-white-sox?subseason=942425"
},
{
"teamName": "Chicago White Sox",
"team_id": "9002208",
"team_slug": "chicago-white-sox",
"subseason_id": "942425",
"instance_id": "10168648",
"w": "19",
"l": "6",
"t": "0",
"rf": "162",
"ra": "73",
"division_record": "10-4-0",
"division": "CMBA",
"link": "https://www.csyba.com/page/show/9002208-chicago-white-sox?subseason=942425"
},
{
"teamName": "Chicago Blazers",
"team_id": "9002204",
"team_slug": "chicago-blazers",
"subseason_id": "942425",
"instance_id": "10168644",
"w": "17",
"l": "7",
"t": "0",
"rf": "239",
"ra": "94",
"division_record": "9-4-0",
"division": "CMBA",
"link": "https://www.csyba.com/page/show/9002204-chicago-blazers?subseason=942425"
},
{
"teamName": "Chicago Electrons",
"team_id": "9002205",
"team_slug": "chicago-electrons",
"subseason_id": "942425",
"instance_id": "10168645",
"w": "16",
"l": "6",
"t": "2",
"rf": "170",
"ra": "112",
"division_record": "9-4-0",
"division": "CMBA",
"link": "https://www.csyba.com/page/show/9002205-chicago-electrons?subseason=942425"
},
{
"teamName": "Chicago Hounds",
"team_id": "9002206",
"team_slug": "chicago-hounds",
"subseason_id": "942425",
"instance_id": "10168646",
"w": "15",
"l": "11",
"t": "0",
"rf": "182",
"ra": "126",
"division_record": "7-8-0",
"division": "CMBA",
"link": "https://www.csyba.com/page/show/9002206-chicago-hounds?subseason=942425"
},
{
"teamName": "Chicago Hawks",
"team_id": "9002209",
"team_slug": "chicago-hawks",
"subseason_id": "942425",
"instance_id": "10168649",
"w": "1",
"l": "25",
"t": "2",
"rf": "87",
"ra": "355",
"division_record": "0-15-0",
"division": "CMBA",
"link": "https://www.csyba.com/page/show/9002209-chicago-hawks?subseason=942425"
}
]

464
build_season_schedule.py Normal file
View File

@@ -0,0 +1,464 @@
#!/usr/bin/env python3
# build_season_schedule.py
#
# Build a deduped season schedule from SportsEngine team-instance printable pages.
# - Assumes team-instance schedule pages are TEAM-FIRST for scores.
# - Determines home/away using the '@' marker on the opponent cell.
# - Deduplicates primarily by game_id (from /game/show/<id> links), otherwise by a fallback key.
# - Optionally fetches each game's time from the /game/show/<id> page ("tab_boxscores_content").
#
# Usage:
# pip install requests beautifulsoup4 python-dateutil
# python build_season_schedule.py --subseason 942425 --teams teams.json --out season_schedule.csv
#
# Example teams.json (array):
# [
# {"teamName":"Carol Stream Cheaties","team_id":"8944347","team_slug":"carol-stream-cheaties","subseason_id":"942425","instance_id":"10119604"},
# ...
# ]
import argparse
import csv
import json
import logging
import re
import time
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
from urllib.parse import urlencode
import requests
from bs4 import BeautifulSoup
from dateutil import parser as dtp
# ----------------- logging -----------------
logging.basicConfig(
    level=logging.INFO,  # change to DEBUG for verbose tracing
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)
# ----------------- constants -----------------
# Browser-like User-Agent; sent on every request below.
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) SE-Schedule/1.3 Safari/537.36"
HEADERS = {"User-Agent": UA}
# URL templates: per-team printable schedule page and individual game page.
PRINT_BASE = "https://www.csyba.com/schedule/print/team_instance/{iid}"
GAME_BASE = "https://www.csyba.com/game/show/{gid}"
# "7-3"-style score anywhere in a cell's text.
SCORE_RE = re.compile(r"\b(\d+)\s*[-]\s*(\d+)\b")
# Numeric game id from a /game/show/<id> link.
GAME_LINK_RE = re.compile(r"/game/show/(\d+)")
# Clock time like "6:00 pm" or "18:00"; the am/pm group is optional.
TIME_RE = re.compile(r"\b(\d{1,2}:\d{2})\s*([ap]\.?m\.?|AM|PM)?\b", re.I)
# ----------------- helpers -----------------
def clean(x: str) -> str:
    """Normalize whitespace: collapse internal runs to one space, strip the edges."""
    collapsed = re.sub(r"\s+", " ", x or "")
    return collapsed.strip()
def slugify(s: str) -> str:
    """Lowercase *s* and turn every non-alphanumeric run into a single dash."""
    lowered = s.lower()
    dashed = re.sub(r"[^a-z0-9]+", "-", lowered)
    return dashed.strip("-")
def norm_name(s: str) -> str:
    """Aggressively normalize a team name for fuzzy matching.

    Lowercases, replaces punctuation with spaces, removes generic filler
    words (the/club/team/baseball/...), and collapses whitespace.
    """
    depunctuated = re.sub(r"[^a-z0-9 ]+", " ", s.lower())
    filtered = re.sub(
        r"\b(the|club|team|ll|little league|baseball|softball|youth|athletic|athletics|rec|rec\.)\b",
        " ",
        depunctuated,
    )
    return re.sub(r"\s+", " ", filtered).strip()
@dataclass(frozen=True)
class TeamRec:
    """Immutable record for one team in one subseason (loaded from teams.json)."""
    # Display name, e.g. "Carol Stream Cheaties".
    name: str
    # URL slug, e.g. "carol-stream-cheaties".
    slug: str
    # SportsEngine team id (the number in the team page URL).
    team_id: str
    # Team-instance id used by the printable-schedule endpoint.
    instance_id: str
    # Subseason this instance belongs to.
    subseason_id: str
def load_teams(teams_path: str):
    """Read teams.json and index it three ways.

    Returns (by_instance, by_slug, by_norm): dicts mapping instance id,
    slug, and normalized name to a TeamRec, respectively.
    """
    with open(teams_path, "r", encoding="utf-8") as fh:
        entries = json.load(fh)
    records = [
        TeamRec(
            name=str(entry["teamName"]),
            slug=str(entry["team_slug"]),
            team_id=str(entry["team_id"]),
            instance_id=str(entry["instance_id"]),
            subseason_id=str(entry["subseason_id"]),
        )
        for entry in entries
    ]
    by_instance = {rec.instance_id: rec for rec in records}
    by_slug = {rec.slug: rec for rec in records}
    by_norm = {norm_name(rec.name): rec for rec in records}
    return by_instance, by_slug, by_norm
def best_match_team(opponent_text: str, by_slug, by_norm) -> Optional[TeamRec]:
    """Resolve an opponent string to a TeamRec.

    Tries an exact slug match, then an exact normalized-name match, then a
    loose containment check in either direction; returns None on no match.
    """
    slug_hit = by_slug.get(slugify(opponent_text))
    if slug_hit is not None:
        return slug_hit
    normed = norm_name(opponent_text)
    norm_hit = by_norm.get(normed)
    if norm_hit is not None:
        return norm_hit
    # Last resort: substring containment either way.
    for candidate, rec in by_norm.items():
        if candidate in normed or normed in candidate:
            return rec
    return None
def runs_from_team_pov(result_flag: str, s_a: str, s_b: str):
    """Convert TEAM-FIRST score strings to ints without reordering.

    s_a is THIS team's runs and s_b the opponent's; a disagreement with the
    W/L flag is only logged at DEBUG level, never corrected.
    Returns (None, None) when either score is non-numeric.
    """
    if not s_a.isdigit() or not s_b.isdigit():
        return None, None
    team_runs = int(s_a)
    opp_runs = int(s_b)
    if result_flag == "W" and team_runs <= opp_runs:
        logging.debug(f"Result=W but team_runs<=opp_runs ({team_runs}-{opp_runs}); keeping as-is (team-first).")
    if result_flag == "L" and team_runs >= opp_runs:
        logging.debug(f"Result=L but team_runs>=opp_runs ({team_runs}-{opp_runs}); keeping as-is (team-first).")
    return team_runs, opp_runs
# ----------------- HTTP utils -----------------
def get_soup(url: str, session: Optional[requests.Session] = None, timeout: int = 30) -> Optional[BeautifulSoup]:
    """GET *url* and parse the body with html.parser.

    Uses the given session (or a throwaway one) with the module HEADERS.
    Any failure — network, HTTP status, or parse — is logged and yields None.
    """
    try:
        sess = session if session is not None else requests.Session()
        resp = sess.get(url, headers=HEADERS, timeout=timeout)
        resp.raise_for_status()
        return BeautifulSoup(resp.text, "html.parser")
    except Exception as exc:
        logging.error(f"GET failed {url}: {exc}")
        return None
# ----------------- scraping -----------------
def parse_printable(instance_id: str, subseason_id: str, session: requests.Session) -> List[dict]:
    """Parse one team-instance printable schedule page into perspective rows.

    Each returned dict is ONE team's view of one game (TEAM-FIRST scores);
    the caller later merges the two perspectives of the same game.
    Returns [] if the page cannot be fetched or contains no table.
    """
    url = PRINT_BASE.format(iid=instance_id) + "?" + urlencode({
        "schedule_type": "index",
        "subseason": subseason_id,
    })
    soup = get_soup(url, session=session)
    if not soup:
        return []
    table = soup.select_one("table")
    if not table:
        logging.warning(f"No table found for team_instance={instance_id}")
        return []
    games = []
    # Skip the header row; rows with fewer than 5 cells are not games.
    for row_idx, tr in enumerate(table.select("tr")[1:], start=1):
        tds = tr.select("td")
        if len(tds) < 5:
            continue
        # Cells: Date | Result | Opponent | Location | Status
        date_txt = clean(tds[0].get_text(" "))
        result_txt = clean(tds[1].get_text(" "))
        opp_txt = clean(tds[2].get_text(" "))
        loc_txt = clean(tds[3].get_text(" "))
        status_txt = clean(tds[4].get_text(" "))
        # Date → ISO; keep the raw text if it cannot be parsed.
        try:
            date_iso = dtp.parse(date_txt, fuzzy=True).date().isoformat()
        except Exception:
            date_iso = date_txt
        # Pull a game_id if present (from any link in the row)
        game_id = ""
        for a in tr.select("a[href]"):
            m = GAME_LINK_RE.search(a.get("href", ""))
            if m:
                game_id = m.group(1)
                break
        # Extract W/L/T (Result cell)
        m_res = re.search(r"\b(W|L|T)\b", result_txt, re.I)
        result_flag = m_res.group(1).upper() if m_res else ""
        # Extract score from Result cell; if missing, also try Opponent cell
        m_score = SCORE_RE.search(result_txt) or SCORE_RE.search(opp_txt)
        s_a, s_b = (m_score.group(1), m_score.group(2)) if m_score else ("", "")
        # Opponent + home/away flag ('@' prefix marks an away game)
        is_away = opp_txt.startswith("@")
        opponent_name = opp_txt.lstrip("@").strip()
        # Compute team/opp runs (TEAM-FIRST orientation)
        team_runs, opp_runs = runs_from_team_pov(result_flag, s_a, s_b)
        logging.debug(
            f"PARSER: inst={instance_id} row={row_idx} date={date_iso} "
            f"res={result_flag} scores=({s_a}-{s_b}) away={is_away} "
            f"→ team_runs={team_runs}, opp_runs={opp_runs}"
        )
        games.append({
            "team_instance": instance_id,
            "game_id": game_id,  # may be empty
            "date": date_iso,
            "result": result_flag,  # W/L/T from THIS TEAM's perspective
            "team_runs": team_runs,
            "opp_runs": opp_runs,
            "opponent_name": opponent_name,
            "is_away": is_away,
            "location": loc_txt,
            "status": status_txt,
            "source_url": url,
        })
    logging.info(f"Team {instance_id}: parsed {len(games)} rows")
    return games
def fetch_game_time(game_id: str, session: requests.Session) -> Optional[str]:
    """Fetch the game's local start time from the /game/show/<id> page.

    Looks inside the 'tab_boxscores_content' tab first, then falls back to
    scanning a bounded prefix of the page text for a clock-time pattern.

    Returns a zero-padded 24h 'HH:MM' string, or None when the page cannot
    be fetched, no time pattern is found, or the match cannot be normalized.
    """
    from datetime import datetime  # local: datetime is only needed here

    if not game_id:
        return None
    url = GAME_BASE.format(gid=game_id)
    soup = get_soup(url, session=session, timeout=30)
    if not soup:
        return None
    # Prefer the boxscores tab content (either id spelling is seen in the wild).
    box = soup.select_one("#tab_boxscores_content") or soup.select_one("#tab_boxscore_content")
    if box:
        text = " ".join(box.stripped_strings)
    else:
        # Fall back to page-wide text, truncated to avoid scanning huge pages.
        main = soup.select_one("div.page") or soup
        text = " ".join((main.get_text(" ", strip=True) or "")[:4000].split())
    m = TIME_RE.search(text)
    if not m:
        logging.debug(f"TIME: no time found in game {game_id}")
        return None
    hhmm = m.group(1)
    ampm = (m.group(2) or "").lower().replace(".", "")
    # Normalize to 24h HH:MM. With an am/pm marker, parse as 12h (with a
    # no-space variant as a fallback); otherwise assume it is already 24h.
    candidates = (
        [(f"{hhmm} {ampm.upper()}", "%I:%M %p"), (f"{hhmm}{ampm}", "%I:%M%p")]
        if ampm
        else [(hhmm, "%H:%M")]
    )
    for value, fmt in candidates:
        try:
            return datetime.strptime(value, fmt).strftime("%H:%M")
        except ValueError:
            continue
    # Previously the no-am/pm failure path returned None silently; log it.
    logging.debug(f"TIME: could not normalize '{hhmm} {ampm}' for game {game_id}")
    return None
# ----------------- build & merge -----------------
def main():
    """CLI entry point.

    Scrapes every team's printable schedule, merges the two per-team
    perspectives of each game into one home/away row (deduped by game_id
    when present), optionally fetches start times, and writes a CSV.
    """
    ap = argparse.ArgumentParser(description="Build a deduped season schedule with IDs, winners/losers, runs, and times.")
    ap.add_argument("--subseason", required=True, help="Subseason ID, e.g. 942425")
    ap.add_argument("--teams", required=True, help="Path to teams.json (array with team_id, team_slug, instance_id, teamName)")
    ap.add_argument("--out", default="season_schedule.csv", help="Output CSV path")
    ap.add_argument("--fetch-time", action="store_true", help="Fetch game time from /game/show/<id>")
    ap.add_argument("--sleep", type=float, default=0.35, help="Delay between requests (seconds)")
    args = ap.parse_args()
    by_instance, by_slug, by_norm = load_teams(args.teams)
    instance_ids = sorted(by_instance.keys())
    session = requests.Session()
    session.headers.update(HEADERS)
    # Scrape all teams
    raw: List[dict] = []
    for i, iid in enumerate(instance_ids, 1):
        logging.info(f"[{i}/{len(instance_ids)}] Fetching schedule for instance {iid}")
        raw.extend(parse_printable(iid, args.subseason, session=session))
        time.sleep(args.sleep)  # be polite
    def rec_from_instance(iid: str) -> Optional[TeamRec]:
        # Thin lookup wrapper; returns None for unknown instances.
        return by_instance.get(iid)
    def match_opponent(text: str) -> Optional[TeamRec]:
        # Fuzzy opponent-name resolution against the loaded team tables.
        return best_match_team(text, by_slug, by_norm)
    # Group by game_id if available; otherwise fallback on (date + unordered pair + raw score text if present)
    buckets: Dict[str, dict] = {}
    fallback_rows = 0
    for row in raw:
        team_rec = rec_from_instance(row["team_instance"])
        if not team_rec:
            logging.warning(f"Unknown instance {row['team_instance']}; skipping")
            continue
        opp_rec = match_opponent(row["opponent_name"])
        opp_slug = opp_rec.slug if opp_rec else slugify(row["opponent_name"])
        # Unordered pair of slugs identifies the matchup regardless of perspective.
        pair = tuple(sorted([team_rec.slug, opp_slug]))
        if row["game_id"]:
            key = f"id:{row['game_id']}"
        else:
            runs_sig = ""
            if isinstance(row["team_runs"], int) and isinstance(row["opp_runs"], int):
                runs_sig = f"{row['team_runs']}-{row['opp_runs']}"
            key = f"fb:{row['date']}|{pair[0]}@{pair[1]}|{runs_sig}"
            fallback_rows += 1
        perspective = {
            "team": team_rec,
            "opp": opp_rec,  # may be None
            "is_away": row["is_away"],
            "team_runs": row["team_runs"],
            "opp_runs": row["opp_runs"],
            "location": row["location"],
            "status": row["status"],
            "source_url": row["source_url"],
            "pair": pair,
            "date": row["date"],
            "game_id": row["game_id"],
        }
        if key not in buckets:
            buckets[key] = {"persp": [perspective], "game_id": row["game_id"]}
        else:
            buckets[key]["persp"].append(perspective)
    if fallback_rows:
        logging.info(f"Used fallback dedupe for {fallback_rows} rows without game_id.")
    # Merge perspectives into a single home/away row
    out_rows = []
    time_cache: Dict[str, Optional[str]] = {}
    for key, bucket in buckets.items():
        p = bucket["persp"]
        date = p[0]["date"]
        game_id = bucket.get("game_id", "")
        # Identify home/away perspectives
        p_home = next((x for x in p if x["is_away"] is False), None)
        p_away = next((x for x in p if x["is_away"] is True), None)
        # Team identities
        home_team = (p_home["team"] if p_home else (p_away["opp"] if p_away else None))
        away_team = (p_away["team"] if p_away else (p_home["opp"] if p_home else None))
        def pack_team(rec: Optional[TeamRec], fallback_slug: str):
            # Flatten a TeamRec (or a slug-only guess) into CSV fields.
            if rec:
                return rec.slug, rec.instance_id, rec.team_id, rec.name
            return fallback_slug, "", "", fallback_slug.replace("-", " ").title()
        # Prefer runs from the explicit perspective (home if available; otherwise away)
        home_runs = away_runs = None
        if p_home and isinstance(p_home["team_runs"], int) and isinstance(p_home["opp_runs"], int):
            home_runs = p_home["team_runs"]
            away_runs = p_home["opp_runs"]
        elif p_away and isinstance(p_away["team_runs"], int) and isinstance(p_away["opp_runs"], int):
            away_runs = p_away["team_runs"]
            home_runs = p_away["opp_runs"]
        # Fallback: single perspective present but numbers known → place by is_away
        if (home_runs is None or away_runs is None) and p:
            one = p[0]
            if isinstance(one["team_runs"], int) and isinstance(one["opp_runs"], int):
                if one["is_away"]:
                    away_runs = one["team_runs"]; home_runs = one["opp_runs"]
                    away_team = one["team"]; home_team = one["opp"] if one["opp"] else home_team
                else:
                    home_runs = one["team_runs"]; away_runs = one["opp_runs"]
                    home_team = one["team"]; away_team = one["opp"] if one["opp"] else away_team
        # Pack final team identifiers (fallback slug = guess from perspectives)
        guess_home_fallback = (p_home["team"].slug if p_home and p_home["team"] else
                               p_away["opp"].slug if p_away and p_away["opp"] else
                               p[0]["pair"][0])
        guess_away_fallback = (p_away["team"].slug if p_away and p_away["team"] else
                               p_home["opp"].slug if p_home and p_home["opp"] else
                               p[0]["pair"][1])
        home_slug, home_inst, home_id, home_name = pack_team(home_team, guess_home_fallback)
        away_slug, away_inst, away_id, away_name = pack_team(away_team, guess_away_fallback)
        # Winner/loser (left empty for ties or unknown scores)
        winner_slug = winner_inst = winner_id = loser_slug = loser_inst = loser_id = ""
        if isinstance(home_runs, int) and isinstance(away_runs, int):
            if home_runs > away_runs:
                winner_slug, winner_inst, winner_id = home_slug, home_inst, home_id
                loser_slug, loser_inst, loser_id = away_slug, away_inst, away_id
            elif away_runs > home_runs:
                winner_slug, winner_inst, winner_id = away_slug, away_inst, away_id
                loser_slug, loser_inst, loser_id = home_slug, home_inst, home_id
        # Meta from perspectives (first non-empty wins, home preferred)
        loc = (p_home["location"] if p_home else "") or (p_away["location"] if p_away else "")
        status = (p_home["status"] if p_home else "") or (p_away["status"] if p_away else "")
        source_urls = sorted({x["source_url"] for x in p})
        # -------- NEW: fetch game start time from game page --------
        time_local = ""
        if args.fetch_time and game_id:
            if game_id in time_cache:
                tval = time_cache[game_id]
            else:
                logging.debug(f"TIME: fetching game {game_id}")
                tval = fetch_game_time(game_id, session=session)
                time_cache[game_id] = tval
                if tval is None:
                    # small backoff to be nice if many misses
                    time.sleep(min(args.sleep * 2, 1.0))
            if tval:
                time_local = tval
        logging.debug(
            f"MERGE: {date} {home_slug}({home_runs}) vs {away_slug}({away_runs}) "
            f"winner={winner_slug or 'TIE'} id={game_id} time={time_local or 'NA'}"
        )
        out_rows.append({
            "date_local": date,
            "time_local": time_local,
            "home_slug": home_slug, "home_instance": home_inst, "home_id": home_id, "home_name": home_name,
            "away_slug": away_slug, "away_instance": away_inst, "away_id": away_id, "away_name": away_name,
            "home_runs": "" if home_runs is None else home_runs,
            "away_runs": "" if away_runs is None else away_runs,
            "winner_slug": winner_slug, "winner_instance": winner_inst, "winner_id": winner_id,
            "loser_slug": loser_slug, "loser_instance": loser_inst, "loser_id": loser_id,
            "location": loc, "status": status,
            "game_id": game_id,
            "source_urls": " ".join(source_urls),
        })
    if not out_rows:
        logging.warning("No games produced.")
        return
    fieldnames = [
        "date_local","time_local",
        "home_slug","home_instance","home_id","home_name",
        "away_slug","away_instance","away_id","away_name",
        "home_runs","away_runs",
        "winner_slug","winner_instance","winner_id",
        "loser_slug","loser_instance","loser_id",
        "location","status","game_id","source_urls",
    ]
    with open(args.out, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        for r in out_rows:
            w.writerow(r)
    logging.info(f"Wrote {len(out_rows)} games → {args.out}")


if __name__ == "__main__":
    main()

224
compute_ratings.py Normal file
View File

@@ -0,0 +1,224 @@
#!/usr/bin/env python3
"""
Rank baseball teams from a season_schedule.csv that has columns:
date_local,time_local,home_slug,home_instance,home_id,home_name,
away_slug,away_instance,away_id,away_name,home_runs,away_runs,
winner_slug,winner_instance,winner_id,loser_slug,loser_instance,loser_id,
location,status,game_id,source_urls
Output CSV columns (one row per team):
Team, GP, W, L, T, WinPct, RS, RA, RunDiff, PythagoreanWinPct,
MasseyRating, EloRating, StrengthOfSchedule, CompositeRating
Defaults:
- Team identity uses *_name; switch to slugs with --team-id slugs
- Pythagorean exponent = 1.83
- Massey caps margins at 8 runs and subtracts estimated home-field runs
- Elo: start 1500, K=24, home bonus H=30, margin factor ln(|m|+1) capped at 2.0
- Elo averaged over 20 random shuffles (reduces order dependence)
"""
from __future__ import annotations
import argparse
import math
import numpy as np
import pandas as pd
def parse_args():
    """Parse the command line; see the module docstring for defaults and semantics."""
    p = argparse.ArgumentParser(description="Power ratings from season_schedule.csv")
    p.add_argument("--in", dest="inp", required=True, help="Input CSV (season_schedule.csv)")
    p.add_argument("--out", dest="out", required=True, help="Output ratings CSV")
    p.add_argument("--team-id", choices=["names","slugs"], default="names",
                   help="Use team names or slugs as identifiers (default: names)")
    p.add_argument("--final-status", default=None,
                   help="Only include games where status == this value (e.g., 'final'). If omitted, any row with scores is included.")
    # Tunables
    p.add_argument("--pyexp", type=float, default=1.83, help="Pythagorean exponent")
    p.add_argument("--massey-cap", type=float, default=8.0, help="Cap for run margins in Massey")
    p.add_argument("--no-massey-home-adj", action="store_true",
                   help="Disable subtracting estimated home-field runs in Massey")
    p.add_argument("--elo-k", type=float, default=24.0, help="Elo K-factor")
    p.add_argument("--elo-home", type=float, default=30.0, help="Elo home bonus (points)")
    p.add_argument("--elo-mcap", type=float, default=2.0, help="Cap for margin factor ln(|m|+1)")
    p.add_argument("--elo-shuffles", type=int, default=20, help="Random shuffles to average Elo")
    p.add_argument("--elo-seed", type=int, default=42, help="RNG seed for shuffles")
    return p.parse_args()
def load_games(a) -> pd.DataFrame:
    """Load and filter the schedule CSV into a canonical games frame.

    Returns columns Date, HomeTeam, AwayTeam, HomeRuns, AwayRuns, plus the
    derived Margin (home - away) and Result ('H'/'A'/'T'). Rows missing a
    team identifier or a numeric score are dropped.

    Raises ValueError when a required column is absent.
    """
    df = pd.read_csv(a.inp)
    # Choose identifiers
    home_id_col = "home_name" if a.team_id == "names" else "home_slug"
    away_id_col = "away_name" if a.team_id == "names" else "away_slug"
    for c in [home_id_col, away_id_col, "home_runs", "away_runs"]:
        if c not in df.columns:
            raise ValueError(f"Missing required column: {c}")
    # Optional status filter (helps exclude postponed/canceled)
    if a.final_status is not None and "status" in df.columns:
        df = df[df["status"].astype(str).str.lower() == str(a.final_status).lower()]
    # Keep only games with numeric scores
    df = df.copy()
    df["home_runs"] = pd.to_numeric(df["home_runs"], errors="coerce")
    df["away_runs"] = pd.to_numeric(df["away_runs"], errors="coerce")
    df = df.dropna(subset=[home_id_col, away_id_col, "home_runs", "away_runs"])
    # Parse the date column; missing column → all-NaT series.
    if "date_local" in df.columns:
        date = pd.to_datetime(df["date_local"], errors="coerce")
    else:
        date = pd.Series(pd.NaT, index=df.index)
    dt = date
    if "time_local" in df.columns:
        # BUGFIX: parse the time column only when it exists. Previously
        # pd.to_datetime(df.get("time_local", pd.NaT)).dt.time ran
        # unconditionally and crashed with AttributeError (scalar NaT has
        # no .dt accessor) whenever the column was absent.
        time_of_day = pd.to_datetime(df["time_local"], errors="coerce").dt.time
        # Build a full datetime only where both pieces are present.
        dt = pd.to_datetime(
            date.dt.strftime("%Y-%m-%d").fillna("") + " " +
            pd.Series(time_of_day, index=df.index).astype(str).replace("NaT", ""),
            errors="coerce",
        )
    df_out = pd.DataFrame({
        "Date": dt,
        "HomeTeam": df[home_id_col].astype(str),
        "AwayTeam": df[away_id_col].astype(str),
        "HomeRuns": df["home_runs"].astype(int),
        "AwayRuns": df["away_runs"].astype(int),
    })
    df_out["Margin"] = df_out["HomeRuns"] - df_out["AwayRuns"]
    df_out["Result"] = np.where(df_out["HomeRuns"] > df_out["AwayRuns"], "H",
                                np.where(df_out["HomeRuns"] < df_out["AwayRuns"], "A", "T"))
    return df_out.reset_index(drop=True)
def aggregate_team_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Tally per-team W/L/T, runs scored/allowed, GP, WinPct, and RunDiff.

    Ties count as half a win in WinPct; a team with zero games gets NaN.
    """
    all_teams = sorted(set(df["HomeTeam"]).union(df["AwayTeam"]))
    table = pd.DataFrame(0, index=pd.Index(all_teams, name="Team"),
                         columns=["W", "L", "T", "RS", "RA"])
    for game in df.itertuples(index=False):
        hr = int(game.HomeRuns)
        ar = int(game.AwayRuns)
        table.at[game.HomeTeam, "RS"] += hr
        table.at[game.HomeTeam, "RA"] += ar
        table.at[game.AwayTeam, "RS"] += ar
        table.at[game.AwayTeam, "RA"] += hr
        if hr > ar:
            table.at[game.HomeTeam, "W"] += 1
            table.at[game.AwayTeam, "L"] += 1
        elif hr < ar:
            table.at[game.AwayTeam, "W"] += 1
            table.at[game.HomeTeam, "L"] += 1
        else:
            table.at[game.HomeTeam, "T"] += 1
            table.at[game.AwayTeam, "T"] += 1
    table = table.astype(int)
    table["GP"] = table["W"] + table["L"] + table["T"]
    table["WinPct"] = (table["W"] + 0.5 * table["T"]) / table["GP"].replace(0, np.nan)
    table["RunDiff"] = table["RS"] - table["RA"]
    return table.reset_index()
def pythagorean(rs: pd.Series, ra: pd.Series, exp: float) -> pd.Series:
    """Pythagorean expectation RS^exp / (RS^exp + RA^exp); 0.5 when both totals are zero."""
    scored = rs.clip(lower=0)
    allowed = ra.clip(lower=0)
    num = np.power(scored, exp)
    den = num + np.power(allowed, exp)
    with np.errstate(divide="ignore", invalid="ignore"):
        frac = np.where(den > 0, num / den, 0.5)
    return pd.Series(frac, index=scored.index)
def estimate_home_field_runs(df: pd.DataFrame) -> float:
    """Average home-team margin across all games; 0.0 for an empty frame."""
    if df.empty:
        return 0.0
    return float(df["Margin"].mean())
def massey(df: pd.DataFrame, cap: float, subtract_home: bool) -> tuple[pd.Series, float]:
    """Least-squares Massey ratings from capped (optionally home-adjusted) margins.

    Solves A r = y where each game contributes a +1/-1 row (home/away) and a
    final all-ones row anchors the ratings to sum to zero.
    Returns (ratings indexed by team name, home-field runs subtracted or 0.0).
    """
    teams = sorted(set(df["HomeTeam"]).union(df["AwayTeam"]))
    col = {team: j for j, team in enumerate(teams)}
    margins = df["Margin"].astype(float).to_numpy()
    if cap and cap > 0:
        margins = np.clip(margins, -cap, cap)
    home_runs = estimate_home_field_runs(df)
    if subtract_home:
        margins = margins - home_runs
    n_games = len(df)
    n_teams = len(teams)
    A = np.zeros((n_games + 1, n_teams), dtype=float)
    for g_i, game in enumerate(df.itertuples(index=False)):
        A[g_i, col[game.HomeTeam]] = 1.0
        A[g_i, col[game.AwayTeam]] = -1.0
    # Anchor row: ratings sum to zero so the system is uniquely solvable.
    A[n_games, :] = 1.0
    targets = np.concatenate([margins, [0.0]])
    solution, *_ = np.linalg.lstsq(A, targets, rcond=None)
    return pd.Series(solution, index=teams), (home_runs if subtract_home else 0.0)
def elo_expected(ra: float, rb: float) -> float:
    """Logistic expected score for a player rated *ra* against one rated *rb*."""
    diff = rb - ra
    return 1.0 / (1.0 + 10.0 ** (diff / 400.0))
def elo_once(df: pd.DataFrame, K: float, H: float, mcap: float, init: dict[str, float]) -> dict[str, float]:
    """Run one Elo pass over the games in row order.

    K is the update factor, H the home-advantage bonus (added to the home
    rating only when computing the expectation), and mcap caps the
    ln(|margin|+1) multiplier. *init* is copied, never mutated.
    """
    ratings = dict(init)
    for _, r in df.iterrows():
        h, a = r["HomeTeam"], r["AwayTeam"]
        hr, ar = int(r["HomeRuns"]), int(r["AwayRuns"])
        margin = hr - ar
        Eh = elo_expected(ratings[h] + H, ratings[a])
        # Home score: 1 for a win, 0 for a loss, 0.5 for a tie.
        # (The unused away score Sa from the original is gone; the away
        # update below is the exact zero-sum negation of the home update.)
        Sh = 1.0 if hr > ar else (0.0 if hr < ar else 0.5)
        M = np.log(abs(margin) + 1.0)
        if mcap is not None:
            M = min(M, mcap)
        delta = K * M * (Sh - Eh)
        ratings[h] += delta
        ratings[a] -= delta  # equals K*M*((1-Sh) - (1-Eh)) exactly
    return ratings
def elo(df: pd.DataFrame, K=24.0, H=30.0, mcap=2.0, shuffles=20, seed=42) -> pd.Series:
    """Average Elo over one chronological pass plus (shuffles-1) random orderings.

    Every pass starts from 1500 for all teams; averaging reduces the
    order dependence inherent in sequential Elo updates.
    """
    teams = sorted(set(df["HomeTeam"]).union(df["AwayTeam"]))
    start = {t: 1500.0 for t in teams}
    # Baseline pass in date order (stable sort; NaT dates keep input order).
    ordered = df.sort_values(["Date"]).reset_index(drop=True)
    first = elo_once(ordered, K, H, mcap, start)
    samples = {t: [first[t]] for t in teams}
    rng = np.random.default_rng(seed)
    for _ in range(max(0, shuffles - 1)):
        perm = np.arange(len(ordered))
        rng.shuffle(perm)
        result = elo_once(ordered.iloc[perm].reset_index(drop=True), K, H, mcap, start)
        for t in teams:
            samples[t].append(result[t])
    return pd.Series({t: float(np.mean(samples[t])) for t in teams}).sort_index()
def zscore(s: pd.Series) -> pd.Series:
    """Standardize to mean 0, population std 1; all-zero series when std is 0 or NaN."""
    sd = s.std(ddof=0)
    if sd == 0 or np.isnan(sd):
        return pd.Series(0.0, index=s.index)
    return (s - s.mean()) / sd
def main():
    """CLI entry point: load games, compute ratings, write the ranked CSV.

    Combines three team-strength signals — Massey least-squares ratings,
    shuffle-averaged Elo, and Pythagorean win% — into a z-scored composite,
    then writes one row per team sorted by that composite.
    """
    a = parse_args()
    games = load_games(a)

    # --- Per-team aggregates -------------------------------------------
    team = aggregate_team_stats(games)
    team["PythagoreanWinPct"] = pythagorean(team["RS"], team["RA"], a.pyexp)

    # --- Ratings -------------------------------------------------------
    massey_r, h_runs = massey(games, cap=a.massey_cap,
                              subtract_home=(not a.no_massey_home_adj))
    # (Removed a dead placeholder here that built an unused `sos` frame via
    # a tautological np.where(True, ...) assign.)

    # Strength of schedule: mean Massey rating of every opponent faced.
    # Opponents are appended per game, so repeat matchups count each time.
    opps = {t: [] for t in massey_r.index}
    for _, r in games.iterrows():
        opps[r["HomeTeam"]].append(r["AwayTeam"])
        opps[r["AwayTeam"]].append(r["HomeTeam"])
    sos_series = pd.Series({t: (float(massey_r[opps[t]].mean()) if opps[t] else 0.0)
                            for t in opps})

    elo_r = elo(games, K=a.elo_k, H=a.elo_home, mcap=a.elo_mcap,
                shuffles=a.elo_shuffles, seed=a.elo_seed)

    # --- Merge and composite -------------------------------------------
    out = team.set_index("Team")
    out["MasseyRating"] = massey_r
    out["EloRating"] = elo_r
    out["StrengthOfSchedule"] = sos_series
    # z-score each signal so the three different scales are comparable,
    # then blend with fixed weights.
    Z_r, Z_e, Z_p = zscore(out["MasseyRating"]), zscore(out["EloRating"]), zscore(out["PythagoreanWinPct"])
    out["CompositeRating"] = 0.45*Z_r + 0.35*Z_e + 0.20*Z_p
    out = out.reset_index()
    out = out[[
        "Team","GP","W","L","T","WinPct","RS","RA","RunDiff",
        "PythagoreanWinPct","MasseyRating","EloRating","StrengthOfSchedule","CompositeRating"
    ]].sort_values("CompositeRating", ascending=False)
    # Round for readability
    for c in ["WinPct","PythagoreanWinPct","MasseyRating","EloRating","StrengthOfSchedule","CompositeRating"]:
        out[c] = out[c].astype(float).round(5)
    out.to_csv(a.out, index=False)
    print(f"Done. Estimated home-field (runs) used in Massey: {h_runs:.3f}")
    print(f"Teams ranked: {len(out)} | Games processed: {len(games)}")
    print(f"Output -> {a.out}")
# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

93
csyba.py Normal file
View File

@@ -0,0 +1,93 @@
import requests, re, time, csv, logging
from bs4 import BeautifulSoup
from dateutil import parser as dtp
# --- Logging setup ---
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S"
)
# Minimal browser-like User-Agent sent with every request.
HEADERS = {"User-Agent": "Mozilla/5.0"}
# csyba.com subseason (season) id targeted by every schedule URL below.
SUBSEASON_ID = "942425"
# team_instance ids whose printable schedule pages are scraped, one per team.
TEAM_INSTANCES = [
    "10119604","10119605","10119601","10119603","10119599","10185021","10119607",
    "10219990","10119600","10119602","10119611","10119616","10119612","10148204",
    "10147713","10119617","10178191","10119608","10119615","10119614","10168648",
    "10168644","10168645","10168646","10168649"
]
def clean(x):
    """Collapse runs of whitespace to single spaces and trim; falsy -> ''."""
    return " ".join((x or "").split())
def fetch_team_schedule(iid):
    """Fetch and parse one team's printable schedule page.

    iid is a csyba.com team_instance id. Returns a list of per-game dicts;
    on any fetch/HTTP failure the error is logged and an empty list is
    returned so the overall run can continue.
    """
    url = f"https://www.csyba.com/schedule/print/team_instance/{iid}?schedule_type=index&subseason={SUBSEASON_ID}"
    try:
        r = requests.get(url, headers=HEADERS, timeout=30)
        r.raise_for_status()
    except Exception as e:
        # Best effort: skip this team rather than abort the whole scrape.
        logging.error(f"Failed to fetch team {iid}: {e}")
        return []
    soup = BeautifulSoup(r.text, "html.parser")
    games = []
    for tr in soup.select("table tr")[1:]:  # skip header row
        tds = tr.select("td")
        if len(tds) < 5:
            continue
        date_txt, result_txt, opp_txt, loc_txt, status_txt = [clean(td.get_text(" ")) for td in tds[:5]]
        # Normalize the date to ISO, falling back to the raw cell text.
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit; dateutil raises ValueError/OverflowError on bad input.
        try:
            date_iso = dtp.parse(date_txt, fuzzy=True).date().isoformat()
        except (ValueError, OverflowError):
            date_iso = date_txt
        # W/L/T flag and a "N - M" score embedded in the result cell.
        m_res = re.search(r"\b(W|L|T)\b", result_txt, re.I)
        result = m_res.group(1).upper() if m_res else ""
        m_score = re.search(r"(\d+)\s*-\s*(\d+)", result_txt)
        # NOTE(review): assumes the first number is the home score — the
        # print page may list this team's score first instead; verify.
        hs, as_ = (m_score.group(1), m_score.group(2)) if m_score else ("","")
        away_flag = opp_txt.startswith("@")  # "@Opponent" marks an away game
        opponent = opp_txt.lstrip("@").strip()
        games.append({
            "team_instance": iid,
            "date": date_iso,
            "result": result,
            "score": f"{hs}-{as_}" if hs else "",
            "home_score": hs,
            "away_score": as_,
            "opponent": opponent,
            "is_away": away_flag,
            "location": loc_txt,
            "status": status_txt,
            "source_url": url
        })
    logging.info(f"Team {iid}: parsed {len(games)} games")
    return games
def main():
    """Scrape every team's schedule, dedupe shared games, write season_games.csv."""
    all_games = []
    for i, iid in enumerate(TEAM_INSTANCES, start=1):
        logging.info(f"[{i}/{len(TEAM_INSTANCES)}] Fetching schedule for team {iid}")
        all_games.extend(fetch_team_schedule(iid))
        time.sleep(0.5)  # be polite to the server between requests
    # Deduplicate on (date, sorted {opponent, team_instance}, score): first
    # occurrence wins.
    # NOTE(review): the key pairs an opponent *name* with our own numeric
    # instance id, so the two teams' views of the same game produce
    # different keys and are never collapsed — confirm that is intended.
    unique = {}
    for g in all_games:
        key = (g["date"], tuple(sorted([g["opponent"], g["team_instance"]])), g["score"])
        if key not in unique:
            unique[key] = g
    deduped_games = list(unique.values())
    out_file = "season_games.csv"
    # Guard the empty case: previously deduped_games[0] raised IndexError
    # when every fetch failed (e.g. network down).
    if not deduped_games:
        logging.warning(f"No games parsed; nothing written to {out_file}")
        return
    with open(out_file,"w",newline="",encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=deduped_games[0].keys())
        writer.writeheader()
        writer.writerows(deduped_games)
    logging.info(f"Finished. {len(all_games)} raw rows → {len(deduped_games)} unique games saved to {out_file}")
# Run only when executed as a script, not when imported.
if __name__ == "__main__":
    main()