#!/usr/bin/env python3
# build_season_schedule.py
#
# Build a deduped season schedule from SportsEngine team-instance printable pages.
# - Assumes team-instance schedule pages are TEAM-FIRST for scores.
# - Determines home/away using the '@' marker on the opponent cell.
# - Deduplicates primarily by game_id (from /game/show/<id> links), otherwise by a fallback key.
# - Optionally fetches each game's start time from the /game/show/<id> page ("tab_boxscores_content").
#
# Usage:
#   pip install requests beautifulsoup4 python-dateutil
#   python build_season_schedule.py --subseason 942425 --teams teams.json --out season_schedule.csv
#
# Example teams.json (array):
# [
#   {"teamName":"Carol Stream Cheaties","team_id":"8944347","team_slug":"carol-stream-cheaties","subseason_id":"942425","instance_id":"10119604"},
#   ...
# ]

import argparse
import csv
import json
import logging
import re
import time
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup
from dateutil import parser as dtp

# ----------------- logging -----------------
logging.basicConfig(
    level=logging.INFO,  # change to DEBUG for verbose tracing
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)

# ----------------- constants -----------------
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) SE-Schedule/1.3 Safari/537.36"
HEADERS = {"User-Agent": UA}
PRINT_BASE = "https://www.csyba.com/schedule/print/team_instance/{iid}"
GAME_BASE = "https://www.csyba.com/game/show/{gid}"

SCORE_RE = re.compile(r"\b(\d+)\s*[–-]\s*(\d+)\b")
GAME_LINK_RE = re.compile(r"/game/show/(\d+)")
TIME_RE = re.compile(r"\b(\d{1,2}:\d{2})\s*([ap]\.?m\.?|AM|PM)?\b", re.I)

# ----------------- helpers -----------------
def clean(x: str) -> str:
    """Collapse runs of whitespace and trim."""
    return re.sub(r"\s+", " ", (x or "")).strip()


def slugify(s: str) -> str:
    """Lowercase and reduce to hyphen-separated alphanumeric runs."""
    s = s.lower()
    s = re.sub(r"[^a-z0-9]+", "-", s).strip("-")
    return s


def norm_name(s: str) -> str:
    """Normalize a team name for fuzzy matching: lowercase, strip punctuation and filler words."""
    s = s.lower()
    s = re.sub(r"[^a-z0-9 ]+", " ", s)
    s = re.sub(r"\b(the|club|team|ll|little league|baseball|softball|youth|athletic|athletics|rec|rec\.)\b", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

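# Worked examples of the helpers above (illustrative inputs, not from live data):
#   slugify("Carol Stream Cheaties")                     -> "carol-stream-cheaties"
#   norm_name("The Carol Stream Cheaties Baseball Club") -> "carol stream cheaties"
# Both normalizations feed best_match_team() below: exact slug match first,
# then exact normalized-name match, then loose containment.
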
@dataclass(frozen=True)
class TeamRec:
    name: str
    slug: str
    team_id: str
    instance_id: str
    subseason_id: str

def load_teams(teams_path: str) -> Tuple[Dict[str, TeamRec], Dict[str, TeamRec], Dict[str, TeamRec]]:
    """Load lookup tables (by instance_id, by slug, by normalized name) from teams.json."""
    with open(teams_path, "r", encoding="utf-8") as f:
        arr = json.load(f)
    by_instance: Dict[str, TeamRec] = {}
    by_slug: Dict[str, TeamRec] = {}
    by_norm: Dict[str, TeamRec] = {}
    for t in arr:
        rec = TeamRec(
            name=str(t["teamName"]),
            slug=str(t["team_slug"]),
            team_id=str(t["team_id"]),
            instance_id=str(t["instance_id"]),
            subseason_id=str(t["subseason_id"]),
        )
        by_instance[rec.instance_id] = rec
        by_slug[rec.slug] = rec
        by_norm[norm_name(rec.name)] = rec
    return by_instance, by_slug, by_norm

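# Given the example teams.json entry from the header, the tables line up as:
#   by_instance["10119604"].name             -> "Carol Stream Cheaties"
#   by_slug["carol-stream-cheaties"].team_id -> "8944347"
#   by_norm["carol stream cheaties"]         -> the same TeamRec
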
def best_match_team(opponent_text: str, by_slug, by_norm) -> Optional[TeamRec]:
    """Match opponent using slug first, then normalized name, then loose containment."""
    s = slugify(opponent_text)
    if s in by_slug:
        return by_slug[s]
    n = norm_name(opponent_text)
    if n in by_norm:
        return by_norm[n]
    for key, rec in by_norm.items():
        if key in n or n in key:
            return rec
    return None

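# The containment pass keeps the matcher lenient. A hypothetical opponent cell
# such as "Carol Stream Cheaties (Majors)" misses the slug and exact-name
# lookups but normalizes to "carol stream cheaties majors", which contains the
# known key "carol stream cheaties" and so still resolves to the right TeamRec.
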
def runs_from_team_pov(result_flag: str, s_a: str, s_b: str) -> Tuple[Optional[int], Optional[int]]:
    """
    Team-instance pages are TEAM-FIRST: s_a is THIS team's runs, s_b is the opponent's.
    We never reorder; we only log any mismatch against the W/L/T flag at debug level.
    """
    if not (s_a.isdigit() and s_b.isdigit()):
        return None, None
    a, b = int(s_a), int(s_b)
    if result_flag == "W" and a <= b:
        logging.debug(f"Result=W but team_runs<=opp_runs ({a}-{b}); keeping as-is (team-first).")
    if result_flag == "L" and a >= b:
        logging.debug(f"Result=L but team_runs>=opp_runs ({a}-{b}); keeping as-is (team-first).")
    return a, b

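# Example: a result cell rendered as "L 3-9" yields (3, 9) here, i.e. this team
# scored 3 and the opponent 9; the merge step in main() later maps the pair
# into home/away orientation.
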
# ----------------- HTTP utils -----------------
def get_soup(url: str, session: Optional[requests.Session] = None, timeout: int = 30) -> Optional[BeautifulSoup]:
    """GET a URL and parse it; returns None (after logging) on any failure."""
    try:
        sess = session or requests.Session()
        r = sess.get(url, headers=HEADERS, timeout=timeout)
        r.raise_for_status()
        return BeautifulSoup(r.text, "html.parser")
    except Exception as e:
        logging.error(f"GET failed {url}: {e}")
        return None

# ----------------- scraping -----------------
def parse_printable(instance_id: str, subseason_id: str, session: requests.Session) -> List[dict]:
    """Parse one team-instance printable schedule page into perspective rows."""
    url = PRINT_BASE.format(iid=instance_id) + "?" + urlencode({
        "schedule_type": "index",
        "subseason": subseason_id,
    })
    soup = get_soup(url, session=session)
    if not soup:
        return []

    table = soup.select_one("table")
    if not table:
        logging.warning(f"No table found for team_instance={instance_id}")
        return []

    games = []
    for row_idx, tr in enumerate(table.select("tr")[1:], start=1):  # skip header row
        tds = tr.select("td")
        if len(tds) < 5:
            continue

        # Cells: Date | Result | Opponent | Location | Status
        date_txt = clean(tds[0].get_text(" "))
        result_txt = clean(tds[1].get_text(" "))
        opp_txt = clean(tds[2].get_text(" "))
        loc_txt = clean(tds[3].get_text(" "))
        status_txt = clean(tds[4].get_text(" "))

        # Date → ISO
        try:
            date_iso = dtp.parse(date_txt, fuzzy=True).date().isoformat()
        except Exception:
            date_iso = date_txt

        # Pull a game_id if present (from any link in the row)
        game_id = ""
        for a in tr.select("a[href]"):
            m = GAME_LINK_RE.search(a.get("href", ""))
            if m:
                game_id = m.group(1)
                break

        # Extract W/L/T (Result cell)
        m_res = re.search(r"\b(W|L|T)\b", result_txt, re.I)
        result_flag = m_res.group(1).upper() if m_res else ""

        # Extract score from Result cell; if missing, also try Opponent cell
        m_score = SCORE_RE.search(result_txt) or SCORE_RE.search(opp_txt)
        s_a, s_b = (m_score.group(1), m_score.group(2)) if m_score else ("", "")

        # Opponent + home/away flag ('@' prefix means this team is away)
        is_away = opp_txt.startswith("@")
        opponent_name = opp_txt.lstrip("@").strip()

        # Compute team/opp runs (TEAM-FIRST orientation)
        team_runs, opp_runs = runs_from_team_pov(result_flag, s_a, s_b)

        logging.debug(
            f"PARSER: inst={instance_id} row={row_idx} date={date_iso} "
            f"res={result_flag} scores=({s_a}-{s_b}) away={is_away} "
            f"→ team_runs={team_runs}, opp_runs={opp_runs}"
        )

        games.append({
            "team_instance": instance_id,
            "game_id": game_id,        # may be empty
            "date": date_iso,
            "result": result_flag,     # W/L/T from THIS TEAM's perspective
            "team_runs": team_runs,
            "opp_runs": opp_runs,
            "opponent_name": opponent_name,
            "is_away": is_away,
            "location": loc_txt,
            "status": status_txt,
            "source_url": url,
        })

    logging.info(f"Team {instance_id}: parsed {len(games)} rows")
    return games

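# Illustrative perspective row (hypothetical values) as appended above:
#   {"team_instance": "10119604", "game_id": "12345678", "date": "2025-06-12",
#    "result": "W", "team_runs": 9, "opp_runs": 3, "opponent_name": "Wheaton Rams",
#    "is_away": True, "location": "Community Park #2", "status": "Final",
#    "source_url": "https://www.csyba.com/schedule/print/team_instance/10119604?..."}
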
def fetch_game_time(game_id: str, session: requests.Session) -> Optional[str]:
    """
    Fetch the game's local start time from the /game/show/<id> page.
    Looks inside the tab with id 'tab_boxscores_content' but also
    falls back to scanning the page for common time patterns.
    Returns a zero-padded 24h 'HH:MM' string or None if unavailable.
    """
    if not game_id:
        return None
    url = GAME_BASE.format(gid=game_id)
    soup = get_soup(url, session=session, timeout=30)
    if not soup:
        return None

    # Prefer the boxscores tab content
    box = soup.select_one("#tab_boxscores_content") or soup.select_one("#tab_boxscore_content")
    if box:
        text = " ".join(box.stripped_strings)
    else:
        # Fall back to page-wide text (capped so we don't scan the whole page)
        main = soup.select_one("div.page") or soup
        text = " ".join((main.get_text(" ", strip=True) or "")[:4000].split())

    m = TIME_RE.search(text)
    if not m:
        logging.debug(f"TIME: no time found in game {game_id}")
        return None

    hhmm = m.group(1)
    ampm = (m.group(2) or "").lower().replace(".", "")
    try:
        # Normalize to 24h HH:MM
        if ampm:
            dt = datetime.strptime(f"{hhmm} {ampm.upper()}", "%I:%M %p")
        else:
            # No AM/PM marker; assume the page already shows a 24h time
            dt = datetime.strptime(hhmm, "%H:%M")
        return dt.strftime("%H:%M")
    except Exception:
        # Be forgiving about odd spacing/casing (e.g. "6:00pm" without a space)
        try:
            if ampm:
                dt = datetime.strptime(f"{hhmm}{ampm}", "%I:%M%p")
                return dt.strftime("%H:%M")
        except Exception:
            pass
        logging.debug(f"TIME: could not normalize '{hhmm} {ampm}' for game {game_id}")
        return None

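# Time normalization examples (hypothetical inputs, traced through the code above):
#   "6:05 PM" -> "18:05"    "6:05pm" -> "18:05"    "18:05" -> "18:05"
# A bare "6:05" with no AM/PM marker is taken at face value as "06:05".
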
# ----------------- build & merge -----------------
def main():
    ap = argparse.ArgumentParser(description="Build a deduped season schedule with IDs, winners/losers, runs, and times.")
    ap.add_argument("--subseason", required=True, help="Subseason ID, e.g. 942425")
    ap.add_argument("--teams", required=True, help="Path to teams.json (array with team_id, team_slug, instance_id, teamName)")
    ap.add_argument("--out", default="season_schedule.csv", help="Output CSV path")
    ap.add_argument("--fetch-time", action="store_true", help="Fetch game time from /game/show/<id>")
    ap.add_argument("--sleep", type=float, default=0.35, help="Delay between requests (seconds)")
    args = ap.parse_args()

    by_instance, by_slug, by_norm = load_teams(args.teams)
    instance_ids = sorted(by_instance.keys())

    session = requests.Session()
    session.headers.update(HEADERS)

    # Scrape all teams
    raw: List[dict] = []
    for i, iid in enumerate(instance_ids, 1):
        logging.info(f"[{i}/{len(instance_ids)}] Fetching schedule for instance {iid}")
        raw.extend(parse_printable(iid, args.subseason, session=session))
        time.sleep(args.sleep)  # be polite

    def rec_from_instance(iid: str) -> Optional[TeamRec]:
        return by_instance.get(iid)

    def match_opponent(text: str) -> Optional[TeamRec]:
        return best_match_team(text, by_slug, by_norm)

    # Group by game_id if available; otherwise fall back on
    # (date + unordered team pair + unordered score signature).
    buckets: Dict[str, dict] = {}
    fallback_rows = 0

    for row in raw:
        team_rec = rec_from_instance(row["team_instance"])
        if not team_rec:
            logging.warning(f"Unknown instance {row['team_instance']}; skipping")
            continue

        opp_rec = match_opponent(row["opponent_name"])
        opp_slug = opp_rec.slug if opp_rec else slugify(row["opponent_name"])
        pair = tuple(sorted([team_rec.slug, opp_slug]))

        if row["game_id"]:
            key = f"id:{row['game_id']}"
        else:
            runs_sig = ""
            if isinstance(row["team_runs"], int) and isinstance(row["opp_runs"], int):
                # Sort the runs so both teams' perspectives of one game
                # (e.g. 9-3 vs 3-9) produce the same signature and collapse
                # into a single bucket.
                lo, hi = sorted((row["team_runs"], row["opp_runs"]))
                runs_sig = f"{lo}-{hi}"
            key = f"fb:{row['date']}|{pair[0]}@{pair[1]}|{runs_sig}"
            fallback_rows += 1

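        # Illustrative fallback key (hypothetical slugs/date):
        #   "fb:2025-06-12|carol-stream-cheaties@wheaton-rams|3-9"
        # The "@" is only a separator here; home/away is carried per perspective.
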
        perspective = {
            "team": team_rec,
            "opp": opp_rec,  # may be None
            "is_away": row["is_away"],
            "team_runs": row["team_runs"],
            "opp_runs": row["opp_runs"],
            "location": row["location"],
            "status": row["status"],
            "source_url": row["source_url"],
            "pair": pair,
            "date": row["date"],
            "game_id": row["game_id"],
        }

        if key not in buckets:
            buckets[key] = {"persp": [perspective], "game_id": row["game_id"]}
        else:
            buckets[key]["persp"].append(perspective)

    if fallback_rows:
        logging.info(f"Used fallback dedupe for {fallback_rows} rows without game_id.")

    # Merge perspectives into a single home/away row
    out_rows = []
    time_cache: Dict[str, Optional[str]] = {}

    for key, bucket in buckets.items():
        p = bucket["persp"]
        date = p[0]["date"]
        game_id = bucket.get("game_id", "")

        # Identify home/away perspectives
        p_home = next((x for x in p if x["is_away"] is False), None)
        p_away = next((x for x in p if x["is_away"] is True), None)

        # Team identities
        home_team = (p_home["team"] if p_home else (p_away["opp"] if p_away else None))
        away_team = (p_away["team"] if p_away else (p_home["opp"] if p_home else None))

        def pack_team(rec: Optional[TeamRec], fallback_slug: str):
            if rec:
                return rec.slug, rec.instance_id, rec.team_id, rec.name
            return fallback_slug, "", "", fallback_slug.replace("-", " ").title()

        # Prefer runs from the explicit perspective (home if available; otherwise away)
        home_runs = away_runs = None
        if p_home and isinstance(p_home["team_runs"], int) and isinstance(p_home["opp_runs"], int):
            home_runs = p_home["team_runs"]
            away_runs = p_home["opp_runs"]
        elif p_away and isinstance(p_away["team_runs"], int) and isinstance(p_away["opp_runs"], int):
            away_runs = p_away["team_runs"]
            home_runs = p_away["opp_runs"]

        # Fallback: single perspective present but numbers known → place by is_away
        if (home_runs is None or away_runs is None) and p:
            one = p[0]
            if isinstance(one["team_runs"], int) and isinstance(one["opp_runs"], int):
                if one["is_away"]:
                    away_runs, home_runs = one["team_runs"], one["opp_runs"]
                    away_team = one["team"]
                    home_team = one["opp"] if one["opp"] else home_team
                else:
                    home_runs, away_runs = one["team_runs"], one["opp_runs"]
                    home_team = one["team"]
                    away_team = one["opp"] if one["opp"] else away_team

        # Pack final team identifiers (fallback slug = best guess from perspectives)
        guess_home_fallback = (p_home["team"].slug if p_home and p_home["team"] else
                               p_away["opp"].slug if p_away and p_away["opp"] else
                               p[0]["pair"][0])
        guess_away_fallback = (p_away["team"].slug if p_away and p_away["team"] else
                               p_home["opp"].slug if p_home and p_home["opp"] else
                               p[0]["pair"][1])

        home_slug, home_inst, home_id, home_name = pack_team(home_team, guess_home_fallback)
        away_slug, away_inst, away_id, away_name = pack_team(away_team, guess_away_fallback)

        # Winner/loser (equal runs → tie; winner/loser fields stay empty)
        winner_slug = winner_inst = winner_id = loser_slug = loser_inst = loser_id = ""
        if isinstance(home_runs, int) and isinstance(away_runs, int):
            if home_runs > away_runs:
                winner_slug, winner_inst, winner_id = home_slug, home_inst, home_id
                loser_slug, loser_inst, loser_id = away_slug, away_inst, away_id
            elif away_runs > home_runs:
                winner_slug, winner_inst, winner_id = away_slug, away_inst, away_id
                loser_slug, loser_inst, loser_id = home_slug, home_inst, home_id

        # Meta from perspectives
        loc = (p_home["location"] if p_home else "") or (p_away["location"] if p_away else "")
        status = (p_home["status"] if p_home else "") or (p_away["status"] if p_away else "")
        source_urls = sorted({x["source_url"] for x in p})

        # Fetch game start time from the game page (cached per game_id)
        time_local = ""
        if args.fetch_time and game_id:
            if game_id in time_cache:
                tval = time_cache[game_id]
            else:
                logging.debug(f"TIME: fetching game {game_id}")
                tval = fetch_game_time(game_id, session=session)
                time_cache[game_id] = tval
                if tval is None:
                    # small backoff to be nice if there are many misses
                    time.sleep(min(args.sleep * 2, 1.0))
            if tval:
                time_local = tval

        logging.debug(
            f"MERGE: {date} {home_slug}({home_runs}) vs {away_slug}({away_runs}) "
            f"winner={winner_slug or 'TIE'} id={game_id} time={time_local or 'NA'}"
        )

        out_rows.append({
            "date_local": date,
            "time_local": time_local,
            "home_slug": home_slug, "home_instance": home_inst, "home_id": home_id, "home_name": home_name,
            "away_slug": away_slug, "away_instance": away_inst, "away_id": away_id, "away_name": away_name,
            "home_runs": "" if home_runs is None else home_runs,
            "away_runs": "" if away_runs is None else away_runs,
            "winner_slug": winner_slug, "winner_instance": winner_inst, "winner_id": winner_id,
            "loser_slug": loser_slug, "loser_instance": loser_inst, "loser_id": loser_id,
            "location": loc, "status": status,
            "game_id": game_id,
            "source_urls": " ".join(source_urls),
        })

    if not out_rows:
        logging.warning("No games produced.")
        return

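    # Each output row is one game in home/away orientation, e.g. (hypothetical):
    #   date_local=2025-06-12  time_local=18:05  home=wheaton-rams  away=carol-stream-cheaties
    #   home_runs=3  away_runs=9  winner_slug=carol-stream-cheaties  loser_slug=wheaton-rams
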
    fieldnames = [
        "date_local", "time_local",
        "home_slug", "home_instance", "home_id", "home_name",
        "away_slug", "away_instance", "away_id", "away_name",
        "home_runs", "away_runs",
        "winner_slug", "winner_instance", "winner_id",
        "loser_slug", "loser_instance", "loser_id",
        "location", "status", "game_id", "source_urls",
    ]
    with open(args.out, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        for r in out_rows:
            w.writerow(r)

    logging.info(f"Wrote {len(out_rows)} games → {args.out}")


if __name__ == "__main__":
    main()