#!/usr/bin/env python3
# build_season_schedule.py
#
# Build a deduped season schedule from SportsEngine team-instance printable pages.
# - Assumes team-instance schedule pages are TEAM-FIRST for scores.
# - Determines home/away using the '@' marker on the opponent cell.
# - Deduplicates primarily by game_id (from /game/show/<id> links), otherwise by a fallback key.
# - Optionally fetches each game's start time from the /game/show/<id> page ("tab_boxscores_content").
#
# Usage:
#   pip install requests beautifulsoup4 python-dateutil
#   python build_season_schedule.py --subseason 942425 --teams teams.json --out season_schedule.csv
#
# Example teams.json (array):
# [
#   {"teamName":"Carol Stream Cheaties","team_id":"8944347","team_slug":"carol-stream-cheaties","subseason_id":"942425","instance_id":"10119604"},
#   ...
# ]

import argparse
import csv
import json
import logging
import re
import time
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup
from dateutil import parser as dtp

# ----------------- logging -----------------
logging.basicConfig(
    level=logging.INFO,  # change to DEBUG for verbose tracing
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)

# ----------------- constants -----------------
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) SE-Schedule/1.3 Safari/537.36"
HEADERS = {"User-Agent": UA}
PRINT_BASE = "https://www.csyba.com/schedule/print/team_instance/{iid}"
GAME_BASE = "https://www.csyba.com/game/show/{gid}"

SCORE_RE = re.compile(r"\b(\d+)\s*[–-]\s*(\d+)\b")
GAME_LINK_RE = re.compile(r"/game/show/(\d+)")
TIME_RE = re.compile(r"\b(\d{1,2}:\d{2})\s*([ap]\.?m\.?|AM|PM)?\b", re.I)

# ----------------- helpers -----------------
def clean(x: str) -> str:
    """Collapse runs of whitespace and trim."""
    return re.sub(r"\s+", " ", (x or "")).strip()


def slugify(s: str) -> str:
    """Lowercase and reduce to hyphen-separated alphanumeric runs."""
    s = s.lower()
    s = re.sub(r"[^a-z0-9]+", "-", s).strip("-")
    return s


def norm_name(s: str) -> str:
    """Normalize a team name for fuzzy matching: lowercase, strip punctuation and filler words."""
    s = s.lower()
    s = re.sub(r"[^a-z0-9 ]+", " ", s)
    s = re.sub(r"\b(the|club|team|ll|little league|baseball|softball|youth|athletic|athletics|rec|rec\.)\b", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

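# Worked examples of the helpers above (illustrative inputs, not from live data):
#   slugify("Carol Stream Cheaties")                     -> "carol-stream-cheaties"
#   norm_name("The Carol Stream Cheaties Baseball Club") -> "carol stream cheaties"
# Both normalizations feed best_match_team() below: exact slug match first,
# then exact normalized-name match, then loose containment.
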
@dataclass(frozen=True)
class TeamRec:
    name: str
    slug: str
    team_id: str
    instance_id: str
    subseason_id: str

def load_teams(teams_path: str) -> Tuple[Dict[str, TeamRec], Dict[str, TeamRec], Dict[str, TeamRec]]:
    """Load lookup tables (by instance_id, by slug, by normalized name) from teams.json."""
    with open(teams_path, "r", encoding="utf-8") as f:
        arr = json.load(f)
    by_instance: Dict[str, TeamRec] = {}
    by_slug: Dict[str, TeamRec] = {}
    by_norm: Dict[str, TeamRec] = {}
    for t in arr:
        rec = TeamRec(
            name=str(t["teamName"]),
            slug=str(t["team_slug"]),
            team_id=str(t["team_id"]),
            instance_id=str(t["instance_id"]),
            subseason_id=str(t["subseason_id"]),
        )
        by_instance[rec.instance_id] = rec
        by_slug[rec.slug] = rec
        by_norm[norm_name(rec.name)] = rec
    return by_instance, by_slug, by_norm

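# Given the example teams.json entry from the header, the tables line up as:
#   by_instance["10119604"].name             -> "Carol Stream Cheaties"
#   by_slug["carol-stream-cheaties"].team_id -> "8944347"
#   by_norm["carol stream cheaties"]         -> the same TeamRec
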
def best_match_team(opponent_text: str, by_slug, by_norm) -> Optional[TeamRec]:
    """Match opponent using slug first, then normalized name, then loose containment."""
    s = slugify(opponent_text)
    if s in by_slug:
        return by_slug[s]
    n = norm_name(opponent_text)
    if n in by_norm:
        return by_norm[n]
    for key, rec in by_norm.items():
        if key in n or n in key:
            return rec
    return None

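# The containment pass keeps the matcher lenient. A hypothetical opponent cell
# such as "Carol Stream Cheaties (Majors)" misses the slug and exact-name
# lookups but normalizes to "carol stream cheaties majors", which contains the
# known key "carol stream cheaties" and so still resolves to the right TeamRec.
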
def runs_from_team_pov(result_flag: str, s_a: str, s_b: str) -> Tuple[Optional[int], Optional[int]]:
    """
    Team-instance pages are TEAM-FIRST: s_a is THIS team's runs, s_b is the opponent's.
    We never reorder; we only log any mismatch against the W/L/T flag at debug level.
    """
    if not (s_a.isdigit() and s_b.isdigit()):
        return None, None
    a, b = int(s_a), int(s_b)
    if result_flag == "W" and a <= b:
        logging.debug(f"Result=W but team_runs<=opp_runs ({a}-{b}); keeping as-is (team-first).")
    if result_flag == "L" and a >= b:
        logging.debug(f"Result=L but team_runs>=opp_runs ({a}-{b}); keeping as-is (team-first).")
    return a, b

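# Example: a result cell rendered as "L 3-9" yields (3, 9) here, i.e. this team
# scored 3 and the opponent 9; the merge step in main() later maps the pair
# into home/away orientation.
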
# ----------------- HTTP utils -----------------
def get_soup(url: str, session: Optional[requests.Session] = None, timeout: int = 30) -> Optional[BeautifulSoup]:
    """GET a URL and parse it; returns None (after logging) on any failure."""
    try:
        sess = session or requests.Session()
        r = sess.get(url, headers=HEADERS, timeout=timeout)
        r.raise_for_status()
        return BeautifulSoup(r.text, "html.parser")
    except Exception as e:
        logging.error(f"GET failed {url}: {e}")
        return None

# ----------------- scraping -----------------
def parse_printable(instance_id: str, subseason_id: str, session: requests.Session) -> List[dict]:
    """Parse one team-instance printable schedule page into perspective rows."""
    url = PRINT_BASE.format(iid=instance_id) + "?" + urlencode({
        "schedule_type": "index",
        "subseason": subseason_id,
    })
    soup = get_soup(url, session=session)
    if not soup:
        return []

    table = soup.select_one("table")
    if not table:
        logging.warning(f"No table found for team_instance={instance_id}")
        return []

    games = []
    for row_idx, tr in enumerate(table.select("tr")[1:], start=1):  # skip header row
        tds = tr.select("td")
        if len(tds) < 5:
            continue

        # Cells: Date | Result | Opponent | Location | Status
        date_txt = clean(tds[0].get_text(" "))
        result_txt = clean(tds[1].get_text(" "))
        opp_txt = clean(tds[2].get_text(" "))
        loc_txt = clean(tds[3].get_text(" "))
        status_txt = clean(tds[4].get_text(" "))

        # Date → ISO
        try:
            date_iso = dtp.parse(date_txt, fuzzy=True).date().isoformat()
        except Exception:
            date_iso = date_txt

        # Pull a game_id if present (from any link in the row)
        game_id = ""
        for a in tr.select("a[href]"):
            m = GAME_LINK_RE.search(a.get("href", ""))
            if m:
                game_id = m.group(1)
                break

        # Extract W/L/T (Result cell)
        m_res = re.search(r"\b(W|L|T)\b", result_txt, re.I)
        result_flag = m_res.group(1).upper() if m_res else ""

        # Extract score from Result cell; if missing, also try Opponent cell
        m_score = SCORE_RE.search(result_txt) or SCORE_RE.search(opp_txt)
        s_a, s_b = (m_score.group(1), m_score.group(2)) if m_score else ("", "")

        # Opponent + home/away flag ('@' prefix means this team is away)
        is_away = opp_txt.startswith("@")
        opponent_name = opp_txt.lstrip("@").strip()

        # Compute team/opp runs (TEAM-FIRST orientation)
        team_runs, opp_runs = runs_from_team_pov(result_flag, s_a, s_b)

        logging.debug(
            f"PARSER: inst={instance_id} row={row_idx} date={date_iso} "
            f"res={result_flag} scores=({s_a}-{s_b}) away={is_away} "
            f"→ team_runs={team_runs}, opp_runs={opp_runs}"
        )

        games.append({
            "team_instance": instance_id,
            "game_id": game_id,        # may be empty
            "date": date_iso,
            "result": result_flag,     # W/L/T from THIS TEAM's perspective
            "team_runs": team_runs,
            "opp_runs": opp_runs,
            "opponent_name": opponent_name,
            "is_away": is_away,
            "location": loc_txt,
            "status": status_txt,
            "source_url": url,
        })

    logging.info(f"Team {instance_id}: parsed {len(games)} rows")
    return games

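# Illustrative perspective row (hypothetical values) as appended above:
#   {"team_instance": "10119604", "game_id": "12345678", "date": "2025-06-12",
#    "result": "W", "team_runs": 9, "opp_runs": 3, "opponent_name": "Wheaton Rams",
#    "is_away": True, "location": "Community Park #2", "status": "Final",
#    "source_url": "https://www.csyba.com/schedule/print/team_instance/10119604?..."}
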
def fetch_game_time(game_id: str, session: requests.Session) -> Optional[str]:
    """
    Fetch the game's local start time from the /game/show/<id> page.
    Looks inside the tab with id 'tab_boxscores_content' but also
    falls back to scanning the page for common time patterns.
    Returns a zero-padded 24h 'HH:MM' string or None if unavailable.
    """
    if not game_id:
        return None
    url = GAME_BASE.format(gid=game_id)
    soup = get_soup(url, session=session, timeout=30)
    if not soup:
        return None

    # Prefer the boxscores tab content
    box = soup.select_one("#tab_boxscores_content") or soup.select_one("#tab_boxscore_content")
    if box:
        text = " ".join(box.stripped_strings)
    else:
        # Fall back to page-wide text (capped so we don't scan the whole page)
        main = soup.select_one("div.page") or soup
        text = " ".join((main.get_text(" ", strip=True) or "")[:4000].split())

    m = TIME_RE.search(text)
    if not m:
        logging.debug(f"TIME: no time found in game {game_id}")
        return None

    hhmm = m.group(1)
    ampm = (m.group(2) or "").lower().replace(".", "")
    try:
        # Normalize to 24h HH:MM
        if ampm:
            dt = datetime.strptime(f"{hhmm} {ampm.upper()}", "%I:%M %p")
        else:
            # No AM/PM marker; assume the page already shows a 24h time
            dt = datetime.strptime(hhmm, "%H:%M")
        return dt.strftime("%H:%M")
    except Exception:
        # Be forgiving about odd spacing/casing (e.g. "6:00pm" without a space)
        try:
            if ampm:
                dt = datetime.strptime(f"{hhmm}{ampm}", "%I:%M%p")
                return dt.strftime("%H:%M")
        except Exception:
            pass
        logging.debug(f"TIME: could not normalize '{hhmm} {ampm}' for game {game_id}")
        return None

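# Time normalization examples (hypothetical inputs, traced through the code above):
#   "6:05 PM" -> "18:05"    "6:05pm" -> "18:05"    "18:05" -> "18:05"
# A bare "6:05" with no AM/PM marker is taken at face value as "06:05".
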
# ----------------- build & merge -----------------
def main():
    ap = argparse.ArgumentParser(description="Build a deduped season schedule with IDs, winners/losers, runs, and times.")
    ap.add_argument("--subseason", required=True, help="Subseason ID, e.g. 942425")
    ap.add_argument("--teams", required=True, help="Path to teams.json (array with team_id, team_slug, instance_id, teamName)")
    ap.add_argument("--out", default="season_schedule.csv", help="Output CSV path")
    ap.add_argument("--fetch-time", action="store_true", help="Fetch game time from /game/show/<id>")
    ap.add_argument("--sleep", type=float, default=0.35, help="Delay between requests (seconds)")
    args = ap.parse_args()

    by_instance, by_slug, by_norm = load_teams(args.teams)
    instance_ids = sorted(by_instance.keys())

    session = requests.Session()
    session.headers.update(HEADERS)

    # Scrape all teams
    raw: List[dict] = []
    for i, iid in enumerate(instance_ids, 1):
        logging.info(f"[{i}/{len(instance_ids)}] Fetching schedule for instance {iid}")
        raw.extend(parse_printable(iid, args.subseason, session=session))
        time.sleep(args.sleep)  # be polite

    def rec_from_instance(iid: str) -> Optional[TeamRec]:
        return by_instance.get(iid)

    def match_opponent(text: str) -> Optional[TeamRec]:
        return best_match_team(text, by_slug, by_norm)

    # Group by game_id if available; otherwise fall back on
    # (date + unordered team pair + unordered score signature).
    buckets: Dict[str, dict] = {}
    fallback_rows = 0

    for row in raw:
        team_rec = rec_from_instance(row["team_instance"])
        if not team_rec:
            logging.warning(f"Unknown instance {row['team_instance']}; skipping")
            continue

        opp_rec = match_opponent(row["opponent_name"])
        opp_slug = opp_rec.slug if opp_rec else slugify(row["opponent_name"])
        pair = tuple(sorted([team_rec.slug, opp_slug]))

        if row["game_id"]:
            key = f"id:{row['game_id']}"
        else:
            runs_sig = ""
            if isinstance(row["team_runs"], int) and isinstance(row["opp_runs"], int):
                # Sort the runs so both teams' perspectives of one game
                # (e.g. 9-3 vs 3-9) produce the same signature and collapse
                # into a single bucket.
                lo, hi = sorted((row["team_runs"], row["opp_runs"]))
                runs_sig = f"{lo}-{hi}"
            key = f"fb:{row['date']}|{pair[0]}@{pair[1]}|{runs_sig}"
            fallback_rows += 1

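        # Illustrative fallback key (hypothetical slugs/date):
        #   "fb:2025-06-12|carol-stream-cheaties@wheaton-rams|3-9"
        # The "@" is only a separator here; home/away is carried per perspective.
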
        perspective = {
            "team": team_rec,
            "opp": opp_rec,  # may be None
            "is_away": row["is_away"],
            "team_runs": row["team_runs"],
            "opp_runs": row["opp_runs"],
            "location": row["location"],
            "status": row["status"],
            "source_url": row["source_url"],
            "pair": pair,
            "date": row["date"],
            "game_id": row["game_id"],
        }

        if key not in buckets:
            buckets[key] = {"persp": [perspective], "game_id": row["game_id"]}
        else:
            buckets[key]["persp"].append(perspective)

    if fallback_rows:
        logging.info(f"Used fallback dedupe for {fallback_rows} rows without game_id.")

    # Merge perspectives into a single home/away row
    out_rows = []
    time_cache: Dict[str, Optional[str]] = {}

    for key, bucket in buckets.items():
        p = bucket["persp"]
        date = p[0]["date"]
        game_id = bucket.get("game_id", "")

        # Identify home/away perspectives
        p_home = next((x for x in p if x["is_away"] is False), None)
        p_away = next((x for x in p if x["is_away"] is True), None)

        # Team identities
        home_team = (p_home["team"] if p_home else (p_away["opp"] if p_away else None))
        away_team = (p_away["team"] if p_away else (p_home["opp"] if p_home else None))

        def pack_team(rec: Optional[TeamRec], fallback_slug: str):
            if rec:
                return rec.slug, rec.instance_id, rec.team_id, rec.name
            return fallback_slug, "", "", fallback_slug.replace("-", " ").title()

        # Prefer runs from the explicit perspective (home if available; otherwise away)
        home_runs = away_runs = None
        if p_home and isinstance(p_home["team_runs"], int) and isinstance(p_home["opp_runs"], int):
            home_runs = p_home["team_runs"]
            away_runs = p_home["opp_runs"]
        elif p_away and isinstance(p_away["team_runs"], int) and isinstance(p_away["opp_runs"], int):
            away_runs = p_away["team_runs"]
            home_runs = p_away["opp_runs"]

        # Fallback: single perspective present but numbers known → place by is_away
        if (home_runs is None or away_runs is None) and p:
            one = p[0]
            if isinstance(one["team_runs"], int) and isinstance(one["opp_runs"], int):
                if one["is_away"]:
                    away_runs, home_runs = one["team_runs"], one["opp_runs"]
                    away_team = one["team"]
                    home_team = one["opp"] if one["opp"] else home_team
                else:
                    home_runs, away_runs = one["team_runs"], one["opp_runs"]
                    home_team = one["team"]
                    away_team = one["opp"] if one["opp"] else away_team

        # Pack final team identifiers (fallback slug = best guess from perspectives)
        guess_home_fallback = (p_home["team"].slug if p_home and p_home["team"] else
                               p_away["opp"].slug if p_away and p_away["opp"] else
                               p[0]["pair"][0])
        guess_away_fallback = (p_away["team"].slug if p_away and p_away["team"] else
                               p_home["opp"].slug if p_home and p_home["opp"] else
                               p[0]["pair"][1])

        home_slug, home_inst, home_id, home_name = pack_team(home_team, guess_home_fallback)
        away_slug, away_inst, away_id, away_name = pack_team(away_team, guess_away_fallback)

        # Winner/loser (equal runs → tie; winner/loser fields stay empty)
        winner_slug = winner_inst = winner_id = loser_slug = loser_inst = loser_id = ""
        if isinstance(home_runs, int) and isinstance(away_runs, int):
            if home_runs > away_runs:
                winner_slug, winner_inst, winner_id = home_slug, home_inst, home_id
                loser_slug, loser_inst, loser_id = away_slug, away_inst, away_id
            elif away_runs > home_runs:
                winner_slug, winner_inst, winner_id = away_slug, away_inst, away_id
                loser_slug, loser_inst, loser_id = home_slug, home_inst, home_id

        # Meta from perspectives
        loc = (p_home["location"] if p_home else "") or (p_away["location"] if p_away else "")
        status = (p_home["status"] if p_home else "") or (p_away["status"] if p_away else "")
        source_urls = sorted({x["source_url"] for x in p})

        # Fetch game start time from the game page (cached per game_id)
        time_local = ""
        if args.fetch_time and game_id:
            if game_id in time_cache:
                tval = time_cache[game_id]
            else:
                logging.debug(f"TIME: fetching game {game_id}")
                tval = fetch_game_time(game_id, session=session)
                time_cache[game_id] = tval
                if tval is None:
                    # small backoff to be nice if there are many misses
                    time.sleep(min(args.sleep * 2, 1.0))
            if tval:
                time_local = tval

        logging.debug(
            f"MERGE: {date} {home_slug}({home_runs}) vs {away_slug}({away_runs}) "
            f"winner={winner_slug or 'TIE'} id={game_id} time={time_local or 'NA'}"
        )

        out_rows.append({
            "date_local": date,
            "time_local": time_local,
            "home_slug": home_slug, "home_instance": home_inst, "home_id": home_id, "home_name": home_name,
            "away_slug": away_slug, "away_instance": away_inst, "away_id": away_id, "away_name": away_name,
            "home_runs": "" if home_runs is None else home_runs,
            "away_runs": "" if away_runs is None else away_runs,
            "winner_slug": winner_slug, "winner_instance": winner_inst, "winner_id": winner_id,
            "loser_slug": loser_slug, "loser_instance": loser_inst, "loser_id": loser_id,
            "location": loc, "status": status,
            "game_id": game_id,
            "source_urls": " ".join(source_urls),
        })

    if not out_rows:
        logging.warning("No games produced.")
        return

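    # Each output row is one game in home/away orientation, e.g. (hypothetical):
    #   date_local=2025-06-12  time_local=18:05  home=wheaton-rams  away=carol-stream-cheaties
    #   home_runs=3  away_runs=9  winner_slug=carol-stream-cheaties  loser_slug=wheaton-rams
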
    fieldnames = [
        "date_local", "time_local",
        "home_slug", "home_instance", "home_id", "home_name",
        "away_slug", "away_instance", "away_id", "away_name",
        "home_runs", "away_runs",
        "winner_slug", "winner_instance", "winner_id",
        "loser_slug", "loser_instance", "loser_id",
        "location", "status", "game_id", "source_urls",
    ]
    with open(args.out, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        for r in out_rows:
            w.writerow(r)

    logging.info(f"Wrote {len(out_rows)} games → {args.out}")


if __name__ == "__main__":
    main()