initial commit
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
/*.csv
|
||||||
|
/*.numbers
|
||||||
376
2025-csyba.json
Normal file
376
2025-csyba.json
Normal file
@@ -0,0 +1,376 @@
|
|||||||
|
[{
|
||||||
|
"teamName": "Carol Stream Cheaties",
|
||||||
|
"team_id": "8944347",
|
||||||
|
"team_slug": "carol-stream-cheaties",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10119604",
|
||||||
|
"w": "15",
|
||||||
|
"l": "2",
|
||||||
|
"t": "1",
|
||||||
|
"rf": "139",
|
||||||
|
"ra": "41",
|
||||||
|
"division_record": "10-2-1",
|
||||||
|
"division": "North",
|
||||||
|
"link": "https://www.csyba.com/page/show/8944347-carol-stream-cheaties?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Deerfield Dynasty",
|
||||||
|
"team_id": "8944348",
|
||||||
|
"team_slug": "deerfield-dynasty",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10119605",
|
||||||
|
"w": "15",
|
||||||
|
"l": "3",
|
||||||
|
"t": "0",
|
||||||
|
"rf": "152",
|
||||||
|
"ra": "52",
|
||||||
|
"division_record": "12-2-0",
|
||||||
|
"division": "North",
|
||||||
|
"link": "https://www.csyba.com/page/show/8944348-deerfield-dynasty?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Buffalo Grove Marlins",
|
||||||
|
"team_id": "8944344",
|
||||||
|
"team_slug": "buffalo-grove-marlins",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10119601",
|
||||||
|
"w": "15",
|
||||||
|
"l": "3",
|
||||||
|
"t": "0",
|
||||||
|
"rf": "127",
|
||||||
|
"ra": "47",
|
||||||
|
"division_record": "11-2-0",
|
||||||
|
"division": "North",
|
||||||
|
"link": "https://www.csyba.com/page/show/8944344-buffalo-grove-marlins?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Buffalo Grove White Sox",
|
||||||
|
"team_id": "8944346",
|
||||||
|
"team_slug": "buffalo-grove-white-sox",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10119603",
|
||||||
|
"w": "12",
|
||||||
|
"l": "5",
|
||||||
|
"t": "0",
|
||||||
|
"rf": "140",
|
||||||
|
"ra": "58",
|
||||||
|
"division_record": "10-5-0",
|
||||||
|
"division": "North",
|
||||||
|
"link": "https://www.csyba.com/page/show/8944346-buffalo-grove-white-sox?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Arlington Hts Shamrocks",
|
||||||
|
"team_id": "8944342",
|
||||||
|
"team_slug": "arlington-hts-shamrocks",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10119599",
|
||||||
|
"w": "9",
|
||||||
|
"l": "9",
|
||||||
|
"t": "3",
|
||||||
|
"rf": "120",
|
||||||
|
"ra": "119",
|
||||||
|
"division_record": "5-8-3",
|
||||||
|
"division": "North",
|
||||||
|
"link": "https://www.csyba.com/page/show/8944342-arlington-hts-shamrocks?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Waukegan Alacranes",
|
||||||
|
"team_id": "9024497",
|
||||||
|
"team_slug": "waukegan-alacranes",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10185021",
|
||||||
|
"w": "7",
|
||||||
|
"l": "7",
|
||||||
|
"t": "2",
|
||||||
|
"rf": "96",
|
||||||
|
"ra": "88",
|
||||||
|
"division_record": "6-5-2",
|
||||||
|
"division": "North",
|
||||||
|
"link": "https://www.csyba.com/page/show/9024497-waukegan-alacranes?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Palatine Pelicans",
|
||||||
|
"team_id": "8944350",
|
||||||
|
"team_slug": "palatine-pelicans",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10119607",
|
||||||
|
"w": "6",
|
||||||
|
"l": "10",
|
||||||
|
"t": "2",
|
||||||
|
"rf": "91",
|
||||||
|
"ra": "128",
|
||||||
|
"division_record": "3-9-2",
|
||||||
|
"division": "North",
|
||||||
|
"link": "https://www.csyba.com/page/show/8944350-palatine-pelicans?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Buffalo Grove Blue Wahoos",
|
||||||
|
"team_id": "9071622",
|
||||||
|
"team_slug": "buffalo-grove-blue-wahoos",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10219990",
|
||||||
|
"w": "5",
|
||||||
|
"l": "10",
|
||||||
|
"t": "1",
|
||||||
|
"rf": "57",
|
||||||
|
"ra": "115",
|
||||||
|
"division_record": "3-8-0",
|
||||||
|
"division": "North",
|
||||||
|
"link": "https://www.csyba.com/page/show/9071622-buffalo-grove-blue-wahoos?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Arlington Hts Freeze",
|
||||||
|
"team_id": "8944343",
|
||||||
|
"team_slug": "arlington-hts-freeze",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10119600",
|
||||||
|
"w": "6",
|
||||||
|
"l": "13",
|
||||||
|
"t": "0",
|
||||||
|
"rf": "87",
|
||||||
|
"ra": "116",
|
||||||
|
"division_record": "3-11-0",
|
||||||
|
"division": "North",
|
||||||
|
"link": "https://www.csyba.com/page/show/8944343-arlington-hts-freeze?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Buffalo Grove Orioles",
|
||||||
|
"team_id": "8944345",
|
||||||
|
"team_slug": "buffalo-grove-orioles",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10119602",
|
||||||
|
"w": "2",
|
||||||
|
"l": "16",
|
||||||
|
"t": "0",
|
||||||
|
"rf": "76",
|
||||||
|
"ra": "175",
|
||||||
|
"division_record": "1-12-0",
|
||||||
|
"division": "North",
|
||||||
|
"link": "https://www.csyba.com/page/show/8944345-buffalo-grove-orioles?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Dunham Dash",
|
||||||
|
"team_id": "8944355",
|
||||||
|
"team_slug": "dunham-dash",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10119611",
|
||||||
|
"w": "12",
|
||||||
|
"l": "3",
|
||||||
|
"t": "0",
|
||||||
|
"rf": "117",
|
||||||
|
"ra": "57",
|
||||||
|
"division_record": "9-0-0",
|
||||||
|
"division": "South",
|
||||||
|
"link": "https://www.csyba.com/page/show/8944355-dunham-dash?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Skokie Vikings",
|
||||||
|
"team_id": "8944360",
|
||||||
|
"team_slug": "skokie-vikings",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10119616",
|
||||||
|
"w": "9",
|
||||||
|
"l": "6",
|
||||||
|
"t": "1",
|
||||||
|
"rf": "93",
|
||||||
|
"ra": "72",
|
||||||
|
"division_record": "6-3-0",
|
||||||
|
"division": "South",
|
||||||
|
"link": "https://www.csyba.com/page/show/8944360-skokie-vikings?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Elmhurst White Sox",
|
||||||
|
"team_id": "8944356",
|
||||||
|
"team_slug": "elmhurst-white-sox",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10119612",
|
||||||
|
"w": "4",
|
||||||
|
"l": "3",
|
||||||
|
"t": "0",
|
||||||
|
"rf": "35",
|
||||||
|
"ra": "31",
|
||||||
|
"division_record": "3-2-0",
|
||||||
|
"division": "South",
|
||||||
|
"link": "https://www.csyba.com/page/show/8944356-elmhurst-white-sox?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Lombard Expos",
|
||||||
|
"team_id": "8974790",
|
||||||
|
"team_slug": "lombard-expos",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10148204",
|
||||||
|
"w": "8",
|
||||||
|
"l": "7",
|
||||||
|
"t": "1",
|
||||||
|
"rf": "97",
|
||||||
|
"ra": "68",
|
||||||
|
"division_record": "5-4-0",
|
||||||
|
"division": "South",
|
||||||
|
"link": "https://www.csyba.com/page/show/8974790-lombard-expos?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Chicago Rebels",
|
||||||
|
"team_id": "8974058",
|
||||||
|
"team_slug": "chicago-rebels",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10147713",
|
||||||
|
"w": "9",
|
||||||
|
"l": "9",
|
||||||
|
"t": "0",
|
||||||
|
"rf": "104",
|
||||||
|
"ra": "81",
|
||||||
|
"division_record": "6-4-0",
|
||||||
|
"division": "South",
|
||||||
|
"link": "https://www.csyba.com/page/show/8974058-chicago-rebels?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Westchester Knights",
|
||||||
|
"team_id": "8944361",
|
||||||
|
"team_slug": "westchester-knights",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10119617",
|
||||||
|
"w": "5",
|
||||||
|
"l": "10",
|
||||||
|
"t": "0",
|
||||||
|
"rf": "82",
|
||||||
|
"ra": "155",
|
||||||
|
"division_record": "4-4-0",
|
||||||
|
"division": "South",
|
||||||
|
"link": "https://www.csyba.com/page/show/8944361-westchester-knights?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Melrose Park Thorns",
|
||||||
|
"team_id": "9014143",
|
||||||
|
"team_slug": "melrose-park-thorns",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10178191",
|
||||||
|
"w": "5",
|
||||||
|
"l": "12",
|
||||||
|
"t": "1",
|
||||||
|
"rf": "106",
|
||||||
|
"ra": "139",
|
||||||
|
"division_record": "3-7-0",
|
||||||
|
"division": "South",
|
||||||
|
"link": "https://www.csyba.com/page/show/9014143-melrose-park-thorns?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Bedford Park Bombers",
|
||||||
|
"team_id": "8944352",
|
||||||
|
"team_slug": "bedford-park-bombers",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10119608",
|
||||||
|
"w": "3",
|
||||||
|
"l": "12",
|
||||||
|
"t": "0",
|
||||||
|
"rf": "48",
|
||||||
|
"ra": "133",
|
||||||
|
"division_record": "2-6-0",
|
||||||
|
"division": "South",
|
||||||
|
"link": "https://www.csyba.com/page/show/8944352-bedford-park-bombers?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Skokie Classics",
|
||||||
|
"team_id": "8944359",
|
||||||
|
"team_slug": "skokie-classics",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10119615",
|
||||||
|
"w": "5",
|
||||||
|
"l": "15",
|
||||||
|
"t": "1",
|
||||||
|
"rf": "105",
|
||||||
|
"ra": "177",
|
||||||
|
"division_record": "4-6-1",
|
||||||
|
"division": "South",
|
||||||
|
"link": "https://www.csyba.com/page/show/8944359-skokie-classics?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Park Ridge White Sox",
|
||||||
|
"team_id": "8944358",
|
||||||
|
"team_slug": "park-ridge-white-sox",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10119614",
|
||||||
|
"w": "1",
|
||||||
|
"l": "11",
|
||||||
|
"t": "3",
|
||||||
|
"rf": "42",
|
||||||
|
"ra": "142",
|
||||||
|
"division_record": "0-6-1",
|
||||||
|
"division": "South",
|
||||||
|
"link": "https://www.csyba.com/page/show/8944358-park-ridge-white-sox?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Chicago White Sox",
|
||||||
|
"team_id": "9002208",
|
||||||
|
"team_slug": "chicago-white-sox",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10168648",
|
||||||
|
"w": "19",
|
||||||
|
"l": "6",
|
||||||
|
"t": "0",
|
||||||
|
"rf": "162",
|
||||||
|
"ra": "73",
|
||||||
|
"division_record": "10-4-0",
|
||||||
|
"division": "CMBA",
|
||||||
|
"link": "https://www.csyba.com/page/show/9002208-chicago-white-sox?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Chicago Blazers",
|
||||||
|
"team_id": "9002204",
|
||||||
|
"team_slug": "chicago-blazers",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10168644",
|
||||||
|
"w": "17",
|
||||||
|
"l": "7",
|
||||||
|
"t": "0",
|
||||||
|
"rf": "239",
|
||||||
|
"ra": "94",
|
||||||
|
"division_record": "9-4-0",
|
||||||
|
"division": "CMBA",
|
||||||
|
"link": "https://www.csyba.com/page/show/9002204-chicago-blazers?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Chicago Electrons",
|
||||||
|
"team_id": "9002205",
|
||||||
|
"team_slug": "chicago-electrons",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10168645",
|
||||||
|
"w": "16",
|
||||||
|
"l": "6",
|
||||||
|
"t": "2",
|
||||||
|
"rf": "170",
|
||||||
|
"ra": "112",
|
||||||
|
"division_record": "9-4-0",
|
||||||
|
"division": "CMBA",
|
||||||
|
"link": "https://www.csyba.com/page/show/9002205-chicago-electrons?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Chicago Hounds",
|
||||||
|
"team_id": "9002206",
|
||||||
|
"team_slug": "chicago-hounds",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10168646",
|
||||||
|
"w": "15",
|
||||||
|
"l": "11",
|
||||||
|
"t": "0",
|
||||||
|
"rf": "182",
|
||||||
|
"ra": "126",
|
||||||
|
"division_record": "7-8-0",
|
||||||
|
"division": "CMBA",
|
||||||
|
"link": "https://www.csyba.com/page/show/9002206-chicago-hounds?subseason=942425"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"teamName": "Chicago Hawks",
|
||||||
|
"team_id": "9002209",
|
||||||
|
"team_slug": "chicago-hawks",
|
||||||
|
"subseason_id": "942425",
|
||||||
|
"instance_id": "10168649",
|
||||||
|
"w": "1",
|
||||||
|
"l": "25",
|
||||||
|
"t": "2",
|
||||||
|
"rf": "87",
|
||||||
|
"ra": "355",
|
||||||
|
"division_record": "0-15-0",
|
||||||
|
"division": "CMBA",
|
||||||
|
"link": "https://www.csyba.com/page/show/9002209-chicago-hawks?subseason=942425"
|
||||||
|
}
|
||||||
|
]
|
||||||
464
build_season_schedule.py
Normal file
464
build_season_schedule.py
Normal file
@@ -0,0 +1,464 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# build_season_schedule.py
|
||||||
|
#
|
||||||
|
# Build a deduped season schedule from SportsEngine team-instance printable pages.
|
||||||
|
# - Assumes team-instance schedule pages are TEAM-FIRST for scores.
|
||||||
|
# - Determines home/away using the '@' marker on the opponent cell.
|
||||||
|
# - Deduplicates primarily by game_id (from /game/show/<id> links), otherwise by a fallback key.
|
||||||
|
# - Optionally fetches each game's time from the /game/show/<id> page ("tab_boxscores_content").
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# pip install requests beautifulsoup4 python-dateutil
|
||||||
|
# python build_season_schedule.py --subseason 942425 --teams teams.json --out season_schedule.csv
|
||||||
|
#
|
||||||
|
# Example teams.json (array):
|
||||||
|
# [
|
||||||
|
# {"teamName":"Carol Stream Cheaties","team_id":"8944347","team_slug":"carol-stream-cheaties","subseason_id":"942425","instance_id":"10119604"},
|
||||||
|
# ...
|
||||||
|
# ]
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import csv
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
from urllib.parse import urlencode
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from dateutil import parser as dtp
|
||||||
|
|
||||||
|
# ----------------- logging -----------------
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO, # change to DEBUG for verbose tracing
|
||||||
|
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||||
|
datefmt="%H:%M:%S",
|
||||||
|
)
|
||||||
|
|
||||||
|
# ----------------- constants -----------------
|
||||||
|
UA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) SE-Schedule/1.3 Safari/537.36"
|
||||||
|
HEADERS = {"User-Agent": UA}
|
||||||
|
PRINT_BASE = "https://www.csyba.com/schedule/print/team_instance/{iid}"
|
||||||
|
GAME_BASE = "https://www.csyba.com/game/show/{gid}"
|
||||||
|
|
||||||
|
SCORE_RE = re.compile(r"\b(\d+)\s*[–-]\s*(\d+)\b")
|
||||||
|
GAME_LINK_RE = re.compile(r"/game/show/(\d+)")
|
||||||
|
TIME_RE = re.compile(r"\b(\d{1,2}:\d{2})\s*([ap]\.?m\.?|AM|PM)?\b", re.I)
|
||||||
|
|
||||||
|
# ----------------- helpers -----------------
|
||||||
|
def clean(x: str) -> str:
    """Collapse all whitespace runs in *x* to single spaces and trim the ends.

    A falsy *x* (None or "") is treated as the empty string.
    """
    text = x or ""
    return re.sub(r"\s+", " ", text).strip()
|
||||||
|
|
||||||
|
def slugify(s: str) -> str:
    """Lowercase *s* and collapse every non-alphanumeric run into one hyphen.

    Leading/trailing hyphens are stripped, matching SportsEngine-style slugs.
    """
    lowered = s.lower()
    return re.sub(r"[^a-z0-9]+", "-", lowered).strip("-")
|
||||||
|
|
||||||
|
def norm_name(s: str) -> str:
    """Normalize a team name for fuzzy matching.

    Lowercases, replaces punctuation with spaces, drops common filler words
    (the/club/team/baseball/...), then collapses whitespace.
    """
    text = s.lower()
    text = re.sub(r"[^a-z0-9 ]+", " ", text)
    text = re.sub(r"\b(the|club|team|ll|little league|baseball|softball|youth|athletic|athletics|rec|rec\.)\b", " ", text)
    return re.sub(r"\s+", " ", text).strip()
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class TeamRec:
    """One team's identifiers within a subseason (immutable, hashable).

    Field values come straight from teams.json entries (see module header).
    """
    name: str          # display name ("teamName" in teams.json)
    slug: str          # URL slug used in team page links ("team_slug")
    team_id: str       # team id ("team_id")
    instance_id: str   # team-instance id; drives the printable-schedule URL
    subseason_id: str  # subseason this record belongs to
|
||||||
|
|
||||||
|
def load_teams(teams_path: str):
    """Load mapping tables from teams.json you provided.

    Returns three dicts of TeamRec keyed by instance_id, by slug, and by
    normalized team name, in that order.
    """
    with open(teams_path, "r", encoding="utf-8") as fh:
        entries = json.load(fh)

    by_instance: Dict[str, TeamRec] = {}
    by_slug: Dict[str, TeamRec] = {}
    by_norm: Dict[str, TeamRec] = {}

    for entry in entries:
        rec = TeamRec(
            name=str(entry["teamName"]),
            slug=str(entry["team_slug"]),
            team_id=str(entry["team_id"]),
            instance_id=str(entry["instance_id"]),
            subseason_id=str(entry["subseason_id"]),
        )
        by_instance[rec.instance_id] = rec
        by_slug[rec.slug] = rec
        by_norm[norm_name(rec.name)] = rec

    return by_instance, by_slug, by_norm
|
||||||
|
|
||||||
|
def best_match_team(opponent_text: str, by_slug, by_norm) -> Optional[TeamRec]:
    """Match opponent using slug first, then normalized name, then loose containment."""
    slug_key = slugify(opponent_text)
    if slug_key in by_slug:
        return by_slug[slug_key]

    norm_key = norm_name(opponent_text)
    if norm_key in by_norm:
        return by_norm[norm_key]

    # Last resort: accept the first known team whose normalized name contains,
    # or is contained by, the opponent's normalized name.
    for candidate, rec in by_norm.items():
        if candidate in norm_key or norm_key in candidate:
            return rec
    return None
|
||||||
|
|
||||||
|
def runs_from_team_pov(result_flag: str, s_a: str, s_b: str):
    """
    Team-instance pages are TEAM-FIRST. s_a is THIS team's runs, s_b is opponent runs.
    We don't reorder; we only validate with W/L/T if needed.

    Returns (team_runs, opp_runs) as ints, or (None, None) when either score
    string is not purely numeric.
    """
    if not (s_a.isdigit() and s_b.isdigit()):
        return None, None
    team_runs, opp_runs = int(s_a), int(s_b)
    # Sanity-log (never swap) scores that disagree with the W/L flag.
    if result_flag == "W" and team_runs <= opp_runs:
        logging.debug(f"Result=W but team_runs<=opp_runs ({team_runs}-{opp_runs}); keeping as-is (team-first).")
    if result_flag == "L" and team_runs >= opp_runs:
        logging.debug(f"Result=L but team_runs>=opp_runs ({team_runs}-{opp_runs}); keeping as-is (team-first).")
    return team_runs, opp_runs
|
||||||
|
|
||||||
|
# ----------------- HTTP utils -----------------
|
||||||
|
def get_soup(url: str, session: Optional[requests.Session] = None, timeout: int = 30) -> Optional[BeautifulSoup]:
    """GET *url* and return the parsed HTML, or None on any failure (logged)."""
    try:
        http = session or requests.Session()
        resp = http.get(url, headers=HEADERS, timeout=timeout)
        resp.raise_for_status()
        return BeautifulSoup(resp.text, "html.parser")
    except Exception as e:
        # Swallow network/parse errors so one bad page doesn't abort the run.
        logging.error(f"GET failed {url}: {e}")
        return None
|
||||||
|
|
||||||
|
# ----------------- scraping -----------------
|
||||||
|
def parse_printable(instance_id: str, subseason_id: str, session: requests.Session) -> List[dict]:
    """Parse one team-instance printable schedule page into perspective rows.

    Each returned dict describes one game from THIS team's point of view
    (team-first score orientation); the caller later merges the two
    perspectives of the same game.
    """
    url = PRINT_BASE.format(iid=instance_id) + "?" + urlencode({
        "schedule_type": "index",
        "subseason": subseason_id,
    })
    soup = get_soup(url, session=session)
    if not soup:
        return []

    # Printable pages contain a single schedule table; first row is the header.
    table = soup.select_one("table")
    if not table:
        logging.warning(f"No table found for team_instance={instance_id}")
        return []

    games = []
    for row_idx, tr in enumerate(table.select("tr")[1:], start=1):
        tds = tr.select("td")
        if len(tds) < 5:
            # Skip separator/short rows that don't carry the 5 expected cells.
            continue

        # Cells: Date | Result | Opponent | Location | Status
        date_txt = clean(tds[0].get_text(" "))
        result_txt = clean(tds[1].get_text(" "))
        opp_txt = clean(tds[2].get_text(" "))
        loc_txt = clean(tds[3].get_text(" "))
        status_txt = clean(tds[4].get_text(" "))

        # Date → ISO (fall back to the raw cell text if unparseable)
        try:
            date_iso = dtp.parse(date_txt, fuzzy=True).date().isoformat()
        except Exception:
            date_iso = date_txt

        # Pull a game_id if present (from any /game/show/<id> link in the row)
        game_id = ""
        for a in tr.select("a[href]"):
            m = GAME_LINK_RE.search(a.get("href", ""))
            if m:
                game_id = m.group(1)
                break

        # Extract W/L/T (Result cell)
        m_res = re.search(r"\b(W|L|T)\b", result_txt, re.I)
        result_flag = m_res.group(1).upper() if m_res else ""

        # Extract score from Result cell; if missing, also try Opponent cell
        m_score = SCORE_RE.search(result_txt) or SCORE_RE.search(opp_txt)
        s_a, s_b = (m_score.group(1), m_score.group(2)) if m_score else ("", "")

        # Opponent + home/away flag ('@' prefix marks an away game)
        is_away = opp_txt.startswith("@")
        opponent_name = opp_txt.lstrip("@").strip()

        # Compute team/opp runs (TEAM-FIRST orientation)
        team_runs, opp_runs = runs_from_team_pov(result_flag, s_a, s_b)

        logging.debug(
            f"PARSER: inst={instance_id} row={row_idx} date={date_iso} "
            f"res={result_flag} scores=({s_a}-{s_b}) away={is_away} "
            f"→ team_runs={team_runs}, opp_runs={opp_runs}"
        )

        games.append({
            "team_instance": instance_id,
            "game_id": game_id,  # may be empty
            "date": date_iso,
            "result": result_flag,  # W/L/T from THIS TEAM's perspective
            "team_runs": team_runs,
            "opp_runs": opp_runs,
            "opponent_name": opponent_name,
            "is_away": is_away,
            "location": loc_txt,
            "status": status_txt,
            "source_url": url,
        })

    logging.info(f"Team {instance_id}: parsed {len(games)} rows")
    return games
|
||||||
|
|
||||||
|
def fetch_game_time(game_id: str, session: requests.Session) -> Optional[str]:
    """
    Fetch the game's local start time from the /game/show/<id> page.
    Looks inside the tab with id 'tab_boxscores_content' but also
    falls back to scanning the page for common time patterns.
    Returns a zero-padded 24h 'HH:MM' string or None if unavailable.
    """
    if not game_id:
        return None
    url = GAME_BASE.format(gid=game_id)
    soup = get_soup(url, session=session, timeout=30)
    if not soup:
        return None

    # Prefer the boxscores tab content (two id spellings observed in the wild)
    box = soup.select_one("#tab_boxscores_content") or soup.select_one("#tab_boxscore_content")
    text = ""
    if box:
        text = " ".join(box.stripped_strings)
    else:
        # Fall back to page-wide text (but avoid pulling too much)
        main = soup.select_one("div.page") or soup
        text = " ".join((main.get_text(" ", strip=True) or "")[:4000].split())

    m = TIME_RE.search(text)
    if not m:
        logging.debug(f"TIME: no time found in game {game_id}")
        return None

    hhmm = m.group(1)
    # am/pm marker normalized to "am"/"pm" (dots stripped), or "" if absent
    ampm = (m.group(2) or "").lower().replace(".", "")
    try:
        # Normalize to 24h HH:MM
        from datetime import datetime
        if ampm:
            dt = datetime.strptime(f"{hhmm} {ampm.upper()}", "%I:%M %p")
        else:
            # already 24h-ish
            dt = datetime.strptime(hhmm, "%H:%M")
        return dt.strftime("%H:%M")
    except Exception:
        # Be forgiving (e.g., "6:00pm" without space)
        try:
            from datetime import datetime
            hhmm2 = hhmm
            if ampm:
                dt = datetime.strptime(f"{hhmm2}{ampm}", "%I:%M%p")
                return dt.strftime("%H:%M")
        except Exception:
            logging.debug(f"TIME: could not normalize '{hhmm} {ampm}' for game {game_id}")
        return None
|
||||||
|
|
||||||
|
# ----------------- build & merge -----------------
|
||||||
|
def main():
    """Scrape all team schedules, dedupe perspectives into games, write CSV.

    Pipeline: load teams.json → fetch each team's printable schedule →
    bucket rows by game_id (or a date/pair/score fallback key) → merge the
    home/away perspectives of each game → optionally fetch start times →
    write one row per game to --out.
    """
    ap = argparse.ArgumentParser(description="Build a deduped season schedule with IDs, winners/losers, runs, and times.")
    ap.add_argument("--subseason", required=True, help="Subseason ID, e.g. 942425")
    ap.add_argument("--teams", required=True, help="Path to teams.json (array with team_id, team_slug, instance_id, teamName)")
    ap.add_argument("--out", default="season_schedule.csv", help="Output CSV path")
    ap.add_argument("--fetch-time", action="store_true", help="Fetch game time from /game/show/<id>")
    ap.add_argument("--sleep", type=float, default=0.35, help="Delay between requests (seconds)")
    args = ap.parse_args()

    by_instance, by_slug, by_norm = load_teams(args.teams)
    instance_ids = sorted(by_instance.keys())

    session = requests.Session()
    session.headers.update(HEADERS)

    # Scrape all teams
    raw: List[dict] = []
    for i, iid in enumerate(instance_ids, 1):
        logging.info(f"[{i}/{len(instance_ids)}] Fetching schedule for instance {iid}")
        raw.extend(parse_printable(iid, args.subseason, session=session))
        time.sleep(args.sleep)  # be polite

    def rec_from_instance(iid: str) -> Optional[TeamRec]:
        # Resolve a team-instance id back to its TeamRec.
        return by_instance.get(iid)

    def match_opponent(text: str) -> Optional[TeamRec]:
        # Fuzzy-match an opponent cell's text to a known team.
        return best_match_team(text, by_slug, by_norm)

    # Group by game_id if available; otherwise fallback on (date + unordered pair + raw score text if present)
    buckets: Dict[str, dict] = {}
    fallback_rows = 0

    for row in raw:
        team_rec = rec_from_instance(row["team_instance"])
        if not team_rec:
            logging.warning(f"Unknown instance {row['team_instance']}; skipping")
            continue

        opp_rec = match_opponent(row["opponent_name"])
        opp_slug = opp_rec.slug if opp_rec else slugify(row["opponent_name"])
        # Unordered team pair — same key regardless of which side reported it.
        pair = tuple(sorted([team_rec.slug, opp_slug]))

        if row["game_id"]:
            key = f"id:{row['game_id']}"
        else:
            runs_sig = ""
            if isinstance(row["team_runs"], int) and isinstance(row["opp_runs"], int):
                runs_sig = f"{row['team_runs']}-{row['opp_runs']}"
            key = f"fb:{row['date']}|{pair[0]}@{pair[1]}|{runs_sig}"
            fallback_rows += 1

        perspective = {
            "team": team_rec,
            "opp": opp_rec,  # may be None
            "is_away": row["is_away"],
            "team_runs": row["team_runs"],
            "opp_runs": row["opp_runs"],
            "location": row["location"],
            "status": row["status"],
            "source_url": row["source_url"],
            "pair": pair,
            "date": row["date"],
            "game_id": row["game_id"],
        }

        if key not in buckets:
            buckets[key] = {"persp": [perspective], "game_id": row["game_id"]}
        else:
            buckets[key]["persp"].append(perspective)

    if fallback_rows:
        logging.info(f"Used fallback dedupe for {fallback_rows} rows without game_id.")

    # Merge perspectives into a single home/away row
    out_rows = []
    time_cache: Dict[str, Optional[str]] = {}  # game_id → fetched time (memoized)

    for key, bucket in buckets.items():
        p = bucket["persp"]
        date = p[0]["date"]
        game_id = bucket.get("game_id", "")

        # Identify home/away perspectives
        p_home = next((x for x in p if x["is_away"] is False), None)
        p_away = next((x for x in p if x["is_away"] is True), None)

        # Team identities
        home_team = (p_home["team"] if p_home else (p_away["opp"] if p_away else None))
        away_team = (p_away["team"] if p_away else (p_home["opp"] if p_home else None))

        def pack_team(rec: Optional[TeamRec], fallback_slug: str):
            # Flatten a TeamRec (or slug-only guess) into CSV identifier columns.
            if rec:
                return rec.slug, rec.instance_id, rec.team_id, rec.name
            return fallback_slug, "", "", fallback_slug.replace("-", " ").title()

        # Prefer runs from the explicit perspective (home if available; otherwise away)
        home_runs = away_runs = None
        if p_home and isinstance(p_home["team_runs"], int) and isinstance(p_home["opp_runs"], int):
            home_runs = p_home["team_runs"]
            away_runs = p_home["opp_runs"]
        elif p_away and isinstance(p_away["team_runs"], int) and isinstance(p_away["opp_runs"], int):
            away_runs = p_away["team_runs"]
            home_runs = p_away["opp_runs"]

        # Fallback: single perspective present but numbers known → place by is_away
        if (home_runs is None or away_runs is None) and p:
            one = p[0]
            if isinstance(one["team_runs"], int) and isinstance(one["opp_runs"], int):
                if one["is_away"]:
                    away_runs = one["team_runs"]; home_runs = one["opp_runs"]
                    away_team = one["team"]; home_team = one["opp"] if one["opp"] else home_team
                else:
                    home_runs = one["team_runs"]; away_runs = one["opp_runs"]
                    home_team = one["team"]; away_team = one["opp"] if one["opp"] else away_team

        # Pack final team identifiers (fallback slug = guess from perspectives)
        guess_home_fallback = (p_home["team"].slug if p_home and p_home["team"] else
                               p_away["opp"].slug if p_away and p_away["opp"] else
                               p[0]["pair"][0])
        guess_away_fallback = (p_away["team"].slug if p_away and p_away["team"] else
                               p_home["opp"].slug if p_home and p_home["opp"] else
                               p[0]["pair"][1])

        home_slug, home_inst, home_id, home_name = pack_team(home_team, guess_home_fallback)
        away_slug, away_inst, away_id, away_name = pack_team(away_team, guess_away_fallback)

        # Winner/loser (all blank for ties or unknown scores)
        winner_slug = winner_inst = winner_id = loser_slug = loser_inst = loser_id = ""
        if isinstance(home_runs, int) and isinstance(away_runs, int):
            if home_runs > away_runs:
                winner_slug, winner_inst, winner_id = home_slug, home_inst, home_id
                loser_slug, loser_inst, loser_id = away_slug, away_inst, away_id
            elif away_runs > home_runs:
                winner_slug, winner_inst, winner_id = away_slug, away_inst, away_id
                loser_slug, loser_inst, loser_id = home_slug, home_inst, home_id

        # Meta from perspectives (home's values win when both present)
        loc = (p_home["location"] if p_home else "") or (p_away["location"] if p_away else "")
        status = (p_home["status"] if p_home else "") or (p_away["status"] if p_away else "")
        source_urls = sorted({x["source_url"] for x in p})

        # -------- NEW: fetch game start time from game page --------
        time_local = ""
        if args.fetch_time and game_id:
            if game_id in time_cache:
                tval = time_cache[game_id]
            else:
                logging.debug(f"TIME: fetching game {game_id}")
                tval = fetch_game_time(game_id, session=session)
                time_cache[game_id] = tval
                if tval is None:
                    # small backoff to be nice if many misses
                    time.sleep(min(args.sleep * 2, 1.0))
            if tval:
                time_local = tval

        logging.debug(
            f"MERGE: {date} {home_slug}({home_runs}) vs {away_slug}({away_runs}) "
            f"winner={winner_slug or 'TIE'} id={game_id} time={time_local or 'NA'}"
        )

        out_rows.append({
            "date_local": date,
            "time_local": time_local,
            "home_slug": home_slug, "home_instance": home_inst, "home_id": home_id, "home_name": home_name,
            "away_slug": away_slug, "away_instance": away_inst, "away_id": away_id, "away_name": away_name,
            "home_runs": "" if home_runs is None else home_runs,
            "away_runs": "" if away_runs is None else away_runs,
            "winner_slug": winner_slug, "winner_instance": winner_inst, "winner_id": winner_id,
            "loser_slug": loser_slug, "loser_instance": loser_inst, "loser_id": loser_id,
            "location": loc, "status": status,
            "game_id": game_id,
            "source_urls": " ".join(source_urls),
        })

    if not out_rows:
        logging.warning("No games produced.")
        return

    fieldnames = [
        "date_local","time_local",
        "home_slug","home_instance","home_id","home_name",
        "away_slug","away_instance","away_id","away_name",
        "home_runs","away_runs",
        "winner_slug","winner_instance","winner_id",
        "loser_slug","loser_instance","loser_id",
        "location","status","game_id","source_urls",
    ]
    with open(args.out, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=fieldnames)
        w.writeheader()
        for r in out_rows:
            w.writerow(r)

    logging.info(f"Wrote {len(out_rows)} games → {args.out}")


if __name__ == "__main__":
    main()
|
||||||
224
compute_ratings.py
Normal file
224
compute_ratings.py
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Rank baseball teams from a season_schedule.csv that has columns:
|
||||||
|
date_local,time_local,home_slug,home_instance,home_id,home_name,
|
||||||
|
away_slug,away_instance,away_id,away_name,home_runs,away_runs,
|
||||||
|
winner_slug,winner_instance,winner_id,loser_slug,loser_instance,loser_id,
|
||||||
|
location,status,game_id,source_urls
|
||||||
|
|
||||||
|
Output CSV columns (one row per team):
|
||||||
|
Team, GP, W, L, T, WinPct, RS, RA, RunDiff, PythagoreanWinPct,
|
||||||
|
MasseyRating, EloRating, StrengthOfSchedule, CompositeRating
|
||||||
|
|
||||||
|
Defaults:
|
||||||
|
- Team identity uses *_name; switch to slugs with --team-id slugs
|
||||||
|
- Pythagorean exponent = 1.83
|
||||||
|
- Massey caps margins at 8 runs and subtracts estimated home-field runs
|
||||||
|
- Elo: start 1500, K=24, home bonus H=30, margin factor ln(|m|+1) capped at 2.0
|
||||||
|
- Elo averaged over 20 random shuffles (reduces order dependence)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
import argparse
|
||||||
|
import math
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
def parse_args():
    """Define and parse the command-line interface for the ratings script.

    Returns an ``argparse.Namespace`` with I/O paths, the team-identifier
    mode, an optional status filter, and the model tunables (Pythagorean
    exponent, Massey margin cap, Elo parameters).
    """
    ap = argparse.ArgumentParser(description="Power ratings from season_schedule.csv")

    # Required I/O.
    ap.add_argument("--in", dest="inp", required=True, help="Input CSV (season_schedule.csv)")
    ap.add_argument("--out", dest="out", required=True, help="Output ratings CSV")

    # Identification and filtering.
    ap.add_argument(
        "--team-id",
        choices=["names", "slugs"],
        default="names",
        help="Use team names or slugs as identifiers (default: names)",
    )
    ap.add_argument(
        "--final-status",
        default=None,
        help="Only include games where status == this value (e.g., 'final'). If omitted, any row with scores is included.",
    )

    # Model tunables.
    ap.add_argument("--pyexp", type=float, default=1.83, help="Pythagorean exponent")
    ap.add_argument("--massey-cap", type=float, default=8.0, help="Cap for run margins in Massey")
    ap.add_argument(
        "--no-massey-home-adj",
        action="store_true",
        help="Disable subtracting estimated home-field runs in Massey",
    )
    ap.add_argument("--elo-k", type=float, default=24.0, help="Elo K-factor")
    ap.add_argument("--elo-home", type=float, default=30.0, help="Elo home bonus (points)")
    ap.add_argument("--elo-mcap", type=float, default=2.0, help="Cap for margin factor ln(|m|+1)")
    ap.add_argument("--elo-shuffles", type=int, default=20, help="Random shuffles to average Elo")
    ap.add_argument("--elo-seed", type=int, default=42, help="RNG seed for shuffles")

    return ap.parse_args()
|
||||||
|
|
||||||
|
def load_games(a) -> pd.DataFrame:
    """Load season_schedule.csv and return one tidy row per completed game.

    Parameters
    ----------
    a : argparse.Namespace
        Must provide ``inp`` (input CSV path), ``team_id`` ("names" or
        "slugs") and ``final_status`` (optional status filter value).

    Returns
    -------
    pd.DataFrame
        Columns: Date, HomeTeam, AwayTeam, HomeRuns, AwayRuns,
        Margin (home minus away) and Result ("H"/"A"/"T").

    Raises
    ------
    ValueError
        If a required identifier or score column is missing.
    """
    df = pd.read_csv(a.inp)

    # Choose identifier columns (display names vs. URL slugs).
    home_id_col = "home_name" if a.team_id == "names" else "home_slug"
    away_id_col = "away_name" if a.team_id == "names" else "away_slug"
    for c in [home_id_col, away_id_col, "home_runs", "away_runs"]:
        if c not in df.columns:
            raise ValueError(f"Missing required column: {c}")

    # Optional status filter (helps exclude postponed/canceled).
    if a.final_status is not None and "status" in df.columns:
        df = df[df["status"].astype(str).str.lower() == str(a.final_status).lower()]

    # Keep only games with numeric scores.
    df = df.copy()
    df["home_runs"] = pd.to_numeric(df["home_runs"], errors="coerce")
    df["away_runs"] = pd.to_numeric(df["away_runs"], errors="coerce")
    df = df.dropna(subset=[home_id_col, away_id_col, "home_runs", "away_runs"])

    # Parse datetime, robust to either field being absent.
    # BUG FIX: the old code ran
    #   pd.to_datetime(df.get("time_local", pd.NaT), ...).dt.time
    # unconditionally; when the column is absent, df.get returns scalar NaT
    # and the .dt accessor raises AttributeError. Only touch time_local when
    # the column actually exists.
    if "date_local" in df.columns:
        game_date = pd.to_datetime(df["date_local"], errors="coerce")
    else:
        game_date = pd.Series(pd.NaT, index=df.index)

    dt = game_date
    if "time_local" in df.columns:
        # Build a full datetime only where both pieces are present;
        # unparseable combinations coerce back to NaT.
        time_txt = (
            pd.to_datetime(df["time_local"], errors="coerce")
            .dt.time.astype(str)
            .replace("NaT", "")
        )
        dt = pd.to_datetime(
            game_date.dt.strftime("%Y-%m-%d").fillna("") + " " + time_txt,
            errors="coerce",
        )

    df_out = pd.DataFrame({
        "Date": dt,
        "HomeTeam": df[home_id_col].astype(str),
        "AwayTeam": df[away_id_col].astype(str),
        "HomeRuns": df["home_runs"].astype(int),
        "AwayRuns": df["away_runs"].astype(int),
    })
    df_out["Margin"] = df_out["HomeRuns"] - df_out["AwayRuns"]
    df_out["Result"] = np.where(df_out["HomeRuns"] > df_out["AwayRuns"], "H",
                                np.where(df_out["HomeRuns"] < df_out["AwayRuns"], "A", "T"))
    return df_out.reset_index(drop=True)
|
||||||
|
|
||||||
|
def aggregate_team_stats(df: pd.DataFrame) -> pd.DataFrame:
    """Tally per-team W/L/T, runs scored/allowed, games played and win pct.

    Ties count as half a win in WinPct; teams with zero games get NaN
    WinPct (division by NaN-replaced zero GP).
    """
    all_teams = pd.Index(sorted(set(df["HomeTeam"]).union(df["AwayTeam"])), name="Team")
    tbl = pd.DataFrame(0, index=all_teams, columns=["W", "L", "T", "RS", "RA"])

    for game in df.itertuples(index=False):
        home, away = game.HomeTeam, game.AwayTeam
        h_runs, a_runs = int(game.HomeRuns), int(game.AwayRuns)

        # Runs for/against accumulate symmetrically.
        tbl.at[home, "RS"] += h_runs
        tbl.at[home, "RA"] += a_runs
        tbl.at[away, "RS"] += a_runs
        tbl.at[away, "RA"] += h_runs

        if h_runs > a_runs:
            tbl.at[home, "W"] += 1
            tbl.at[away, "L"] += 1
        elif h_runs < a_runs:
            tbl.at[away, "W"] += 1
            tbl.at[home, "L"] += 1
        else:
            tbl.at[home, "T"] += 1
            tbl.at[away, "T"] += 1

    tbl = tbl.astype(int)
    tbl["GP"] = tbl["W"] + tbl["L"] + tbl["T"]
    tbl["WinPct"] = (tbl["W"] + 0.5 * tbl["T"]) / tbl["GP"].replace(0, np.nan)
    tbl["RunDiff"] = tbl["RS"] - tbl["RA"]
    return tbl.reset_index()
|
||||||
|
|
||||||
|
def pythagorean(rs: pd.Series, ra: pd.Series, exp: float) -> pd.Series:
    """Pythagorean expected win fraction: rs**exp / (rs**exp + ra**exp).

    Negative inputs are clipped to zero; a team with zero runs both for
    and against gets a neutral 0.5.
    """
    scored = rs.clip(lower=0)
    allowed = ra.clip(lower=0)
    top = np.power(scored, exp)
    bottom = top + np.power(allowed, exp)
    # Silence the 0/0 warning; those entries are replaced with 0.5 below.
    with np.errstate(divide="ignore", invalid="ignore"):
        frac = np.where(bottom > 0, top / bottom, 0.5)
    return pd.Series(frac, index=scored.index)
|
||||||
|
|
||||||
|
def estimate_home_field_runs(df: pd.DataFrame) -> float:
    """Average home-minus-away margin across all games; 0.0 with no games."""
    if len(df) == 0:
        return 0.0
    return float(df["Margin"].mean())
|
||||||
|
|
||||||
|
def massey(df: pd.DataFrame, cap: float, subtract_home: bool) -> tuple[pd.Series, float]:
    """Solve least-squares Massey ratings from per-game run margins.

    The overdetermined system has one row per game (+1 for the home team,
    -1 for the away team, the margin on the right-hand side) plus one
    extra row constraining the ratings to sum to zero. Margins may be
    capped at +/-cap, and the estimated home-field advantage may be
    subtracted first. Returns (ratings, home_runs_subtracted).
    """
    team_list = sorted(set(df["HomeTeam"]).union(df["AwayTeam"]))
    col_of = {name: j for j, name in enumerate(team_list)}

    margins = df["Margin"].astype(float).to_numpy()
    if cap and cap > 0:
        margins = np.clip(margins, -cap, cap)

    hfa = estimate_home_field_runs(df)
    if subtract_home:
        margins = margins - hfa

    n_games, n_teams = len(df), len(team_list)
    design = np.zeros((n_games + 1, n_teams), dtype=float)
    for row_i, g in enumerate(df.itertuples(index=False)):
        design[row_i, col_of[g.HomeTeam]] = 1.0
        design[row_i, col_of[g.AwayTeam]] = -1.0
    design[n_games, :] = 1.0  # sum-to-zero anchor row

    rhs = np.concatenate([margins, [0.0]])
    solution, *_ = np.linalg.lstsq(design, rhs, rcond=None)
    return pd.Series(solution, index=team_list), (hfa if subtract_home else 0.0)
|
||||||
|
|
||||||
|
def elo_expected(ra: float, rb: float) -> float:
    """Logistic expected score for a player rated *ra* against *rb*."""
    diff = (ra - rb) / 400.0
    return 1.0 / (1.0 + 10.0 ** (-diff))
|
||||||
|
|
||||||
|
def elo_once(df: pd.DataFrame, K: float, H: float, mcap: float, init: dict[str,float]) -> dict[str,float]:
    """Run one sequential Elo pass over the games in *df* row order.

    K is the update step, H the home-advantage bonus (rating points), and
    mcap an optional ceiling on the ln(|margin| + 1) multiplier. *init*
    is copied, not mutated; the updated ratings dict is returned.
    """
    current = dict(init)
    for _, g in df.iterrows():
        home, away = g["HomeTeam"], g["AwayTeam"]
        h_score, a_score = int(g["HomeRuns"]), int(g["AwayRuns"])

        exp_home = elo_expected(current[home] + H, current[away])

        # Actual home score: win 1, loss 0, tie 0.5.
        if h_score > a_score:
            act_home = 1.0
        elif h_score < a_score:
            act_home = 0.0
        else:
            act_home = 0.5

        # Margin-of-victory multiplier, optionally capped.
        mult = np.log(abs(h_score - a_score) + 1.0)
        if mcap is not None:
            mult = min(mult, mcap)

        current[home] += K * mult * (act_home - exp_home)
        current[away] += K * mult * ((1.0 - act_home) - (1.0 - exp_home))
    return current
|
||||||
|
|
||||||
|
def elo(df: pd.DataFrame, K=24.0, H=30.0, mcap=2.0, shuffles=20, seed=42) -> pd.Series:
    """Average Elo ratings over a chronological pass plus random shuffles.

    Sequential Elo depends on game order; averaging one date-sorted pass
    with (shuffles - 1) random orderings damps that dependence. All teams
    start at 1500.
    """
    all_teams = sorted(set(df["HomeTeam"]).union(df["AwayTeam"]))
    start = {t: 1500.0 for t in all_teams}

    # Chronological baseline (NaT dates are fine; the sort is stable).
    ordered = df.sort_values(["Date"]).reset_index(drop=True)
    first_pass = elo_once(ordered, K, H, mcap, start)
    samples = {t: [first_pass[t]] for t in all_teams}

    rng = np.random.default_rng(seed)
    for _ in range(max(0, shuffles - 1)):
        order = np.arange(len(ordered))
        rng.shuffle(order)
        result = elo_once(ordered.iloc[order].reset_index(drop=True), K, H, mcap, start)
        for t in all_teams:
            samples[t].append(result[t])

    return pd.Series({t: float(np.mean(samples[t])) for t in all_teams}).sort_index()
|
||||||
|
|
||||||
|
def zscore(s: pd.Series) -> pd.Series:
    """Standardize *s* (population std); constant or empty series -> zeros."""
    mean = s.mean()
    std = s.std(ddof=0)
    if std == 0 or np.isnan(std):
        return pd.Series(0.0, index=s.index)
    return (s - mean) / std
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: load games, compute all ratings, write the output CSV."""
    a = parse_args()
    games = load_games(a)

    # Per-team aggregates (W/L/T, runs, win pct) plus Pythagorean expectation.
    team = aggregate_team_stats(games)
    team["PythagoreanWinPct"] = pythagorean(team["RS"], team["RA"], a.pyexp)

    # Margin-based least-squares ratings.
    massey_r, h_runs = massey(games, cap=a.massey_cap, subtract_home=(not a.no_massey_home_adj))

    # Strength of schedule: average Massey rating of the opponents faced.
    # (BUG FIX: removed a dead `sos = games.assign(OppTeam=np.where(True, ...))`
    # placeholder that computed nothing and was never read.)
    opps = {t: [] for t in massey_r.index}
    for _, r in games.iterrows():
        opps[r["HomeTeam"]].append(r["AwayTeam"])
        opps[r["AwayTeam"]].append(r["HomeTeam"])
    sos_series = pd.Series({t: (float(massey_r[opps[t]].mean()) if opps[t] else 0.0) for t in opps})

    elo_r = elo(games, K=a.elo_k, H=a.elo_home, mcap=a.elo_mcap, shuffles=a.elo_shuffles, seed=a.elo_seed)

    # Merge every rating onto the aggregate table, keyed by team.
    out = team.set_index("Team")
    out["MasseyRating"] = massey_r
    out["EloRating"] = elo_r
    out["StrengthOfSchedule"] = sos_series

    # Composite: weighted blend of z-scored Massey, Elo and Pythagorean.
    Z_r, Z_e, Z_p = zscore(out["MasseyRating"]), zscore(out["EloRating"]), zscore(out["PythagoreanWinPct"])
    out["CompositeRating"] = 0.45 * Z_r + 0.35 * Z_e + 0.20 * Z_p

    out = out.reset_index()
    out = out[[
        "Team", "GP", "W", "L", "T", "WinPct", "RS", "RA", "RunDiff",
        "PythagoreanWinPct", "MasseyRating", "EloRating", "StrengthOfSchedule", "CompositeRating"
    ]].sort_values("CompositeRating", ascending=False)

    # Round for readability.
    for c in ["WinPct", "PythagoreanWinPct", "MasseyRating", "EloRating", "StrengthOfSchedule", "CompositeRating"]:
        out[c] = out[c].astype(float).round(5)

    out.to_csv(a.out, index=False)
    print(f"Done. Estimated home-field (runs) used in Massey: {h_runs:.3f}")
    print(f"Teams ranked: {len(out)} | Games processed: {len(games)}")
    print(f"Output -> {a.out}")


if __name__ == "__main__":
    main()
|
||||||
93
csyba.py
Normal file
93
csyba.py
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
import requests, re, time, csv, logging
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from dateutil import parser as dtp
|
||||||
|
|
||||||
|
# --- Logging setup ---
# Timestamped INFO-level logging for the whole scrape run.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S"
)

# Minimal browser-like User-Agent; sent with every request.
HEADERS = {"User-Agent": "Mozilla/5.0"}
# Subseason identifier appended to every schedule URL.
SUBSEASON_ID = "942425"

# team_instance ids to scrape; each drives one schedule-print page fetch
# in fetch_team_schedule(). Presumably one id per club in the league —
# verify against the site's team pages.
TEAM_INSTANCES = [
    "10119604","10119605","10119601","10119603","10119599","10185021","10119607",
    "10219990","10119600","10119602","10119611","10119616","10119612","10148204",
    "10147713","10119617","10178191","10119608","10119615","10119614","10168648",
    "10168644","10168645","10168646","10168649"
]
|
||||||
|
|
||||||
|
def clean(x):
    """Collapse any whitespace runs to single spaces and trim; None -> ''."""
    text = x or ""
    return re.sub(r"\s+", " ", text).strip()
|
||||||
|
|
||||||
|
def fetch_team_schedule(iid):
    """Fetch and parse one team's printable schedule page.

    Parameters
    ----------
    iid : str
        Site team_instance id used to build the schedule-print URL.

    Returns
    -------
    list[dict]
        One dict per parsed schedule row; an empty list if the page
        could not be fetched.
    """
    url = f"https://www.csyba.com/schedule/print/team_instance/{iid}?schedule_type=index&subseason={SUBSEASON_ID}"
    try:
        r = requests.get(url, headers=HEADERS, timeout=30)
        r.raise_for_status()
    except Exception as e:
        # Best-effort scrape: log and skip this team rather than abort the run.
        logging.error(f"Failed to fetch team {iid}: {e}")
        return []

    soup = BeautifulSoup(r.text, "html.parser")
    games = []
    for tr in soup.select("table tr")[1:]:  # skip header
        tds = tr.select("td")
        if len(tds) < 5:
            continue
        date_txt, result_txt, opp_txt, loc_txt, status_txt = [clean(td.get_text(" ")) for td in tds[:5]]

        # Parse date; keep the raw text when unparseable.
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. dateutil raises ValueError (its
        # ParserError subclass) or OverflowError on bad input.
        try:
            date_iso = dtp.parse(date_txt, fuzzy=True).date().isoformat()
        except (ValueError, OverflowError):
            date_iso = date_txt

        # Extract W/L/T marker and the "X-Y" score from the result cell.
        m_res = re.search(r"\b(W|L|T)\b", result_txt, re.I)
        result = m_res.group(1).upper() if m_res else ""
        m_score = re.search(r"(\d+)\s*[-–]\s*(\d+)", result_txt)
        hs, as_ = (m_score.group(1), m_score.group(2)) if m_score else ("","")

        # "@" prefix on the opponent marks an away game.
        away_flag = opp_txt.startswith("@")
        opponent = opp_txt.lstrip("@").strip()
        games.append({
            "team_instance": iid,
            "date": date_iso,
            "result": result,
            "score": f"{hs}-{as_}" if hs else "",
            # NOTE(review): "home_score"/"away_score" are just the first and
            # second numbers of the result cell; whether the first is truly
            # the home team's (vs. this team's) score depends on the site's
            # formatting — verify against a known game.
            "home_score": hs,
            "away_score": as_,
            "opponent": opponent,
            "is_away": away_flag,
            "location": loc_txt,
            "status": status_txt,
            "source_url": url
        })
    logging.info(f"Team {iid}: parsed {len(games)} games")
    return games
|
||||||
|
|
||||||
|
def main():
    """Scrape every team's schedule, dedupe shared games, write season_games.csv."""
    all_games = []
    for i, iid in enumerate(TEAM_INSTANCES, start=1):
        logging.info(f"[{i}/{len(TEAM_INSTANCES)}] Fetching schedule for team {iid}")
        all_games.extend(fetch_team_schedule(iid))
        time.sleep(0.5)  # be polite to the server

    # Deduplicate: each game appears on both teams' schedules.
    # key = (date, sorted pair, score).
    # NOTE(review): the pair mixes this team's instance *id* with the
    # opponent's *name*, so the two copies of one game produce different
    # keys and may not actually collapse — verify against real output.
    unique = {}
    for g in all_games:
        key = (g["date"], tuple(sorted([g["opponent"], g["team_instance"]])), g["score"])
        if key not in unique:
            unique[key] = g
    deduped_games = list(unique.values())

    # BUG FIX: the old code unconditionally read deduped_games[0].keys(),
    # which raised IndexError when every fetch failed. Bail out early.
    if not deduped_games:
        logging.warning("No games scraped; nothing to write.")
        return

    out_file = "season_games.csv"
    with open(out_file,"w",newline="",encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=deduped_games[0].keys())
        writer.writeheader()
        writer.writerows(deduped_games)

    logging.info(f"Finished. {len(all_games)} raw rows → {len(deduped_games)} unique games saved to {out_file}")


if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user