initial commit

2025-08-27 11:23:48 -05:00
commit 5cecc6e280
5 changed files with 1159 additions and 0 deletions

csyba.py Normal file

@@ -0,0 +1,93 @@
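"""Scrape each team's printable schedule from csyba.com and write one deduplicated CSV."""
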
import csv
import logging
import re
import time

import requests
from bs4 import BeautifulSoup
from dateutil import parser as dtp

# --- Logging setup ---
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
datefmt="%H:%M:%S"
)

HEADERS = {"User-Agent": "Mozilla/5.0"}
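
# IDs lifted from the schedule URLs (SportsEngine-style "subseason" and
# "team_instance" parts); one team-instance ID per team page to scrape.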
SUBSEASON_ID = "942425"
TEAM_INSTANCES = [
"10119604","10119605","10119601","10119603","10119599","10185021","10119607",
"10219990","10119600","10119602","10119611","10119616","10119612","10148204",
"10147713","10119617","10178191","10119608","10119615","10119614","10168648",
"10168644","10168645","10168646","10168649"
]
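
# Collapse whitespace runs to single spaces and trim; tolerates None.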
def clean(x):
    return re.sub(r"\s+", " ", x or "").strip()
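
# Fetch one team's printable schedule page and parse every game row it lists.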
def fetch_team_schedule(iid):
url = f"https://www.csyba.com/schedule/print/team_instance/{iid}?schedule_type=index&subseason={SUBSEASON_ID}"
try:
r = requests.get(url, headers=HEADERS, timeout=30)
r.raise_for_status()
except Exception as e:
logging.error(f"Failed to fetch team {iid}: {e}")
return []
soup = BeautifulSoup(r.text,"html.parser")
games = []
for tr in soup.select("table tr")[1:]: # skip header
tds = tr.select("td")
if len(tds) < 5:
continue
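        # Column order on the print page: date, result, opponent, location, status.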
date_txt, result_txt, opp_txt, loc_txt, status_txt = [clean(td.get_text(" ")) for td in tds[:5]]
# parse date
try:
date_iso = dtp.parse(date_txt, fuzzy=True).date().isoformat()
        except (ValueError, OverflowError):
            date_iso = date_txt  # keep the raw text when the date will not parse
# extract result/score
m_res = re.search(r"\b(W|L|T)\b", result_txt, re.I)
result = m_res.group(1).upper() if m_res else ""
        m_score = re.search(r"(\d+)\s*-\s*(\d+)", result_txt)
        hs, as_ = (m_score.group(1), m_score.group(2)) if m_score else ("", "")
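        # Assumption: the first number is the home score, the second the away score.
        # A leading "@" on the opponent cell marks an away game.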
away_flag = opp_txt.startswith("@")
opponent = opp_txt.lstrip("@").strip()
games.append({
"team_instance": iid,
"date": date_iso,
"result": result,
"score": f"{hs}-{as_}" if hs else "",
"home_score": hs,
"away_score": as_,
"opponent": opponent,
"is_away": away_flag,
"location": loc_txt,
"status": status_txt,
"source_url": url
})
logging.info(f"Team {iid}: parsed {len(games)} games")
return games
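
# Crawl all teams, merge their rows, dedupe repeated games, and write the CSV.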
def main():
all_games = []
for i, iid in enumerate(TEAM_INSTANCES, start=1):
logging.info(f"[{i}/{len(TEAM_INSTANCES)}] Fetching schedule for team {iid}")
all_games.extend(fetch_team_schedule(iid))
time.sleep(0.5)
    # Deduplicate on key = (date, unordered {team_instance, opponent} pair, score).
    # Note: team_instance is a numeric ID while opponent is a name, so this only
    # collapses repeated rows, not mirrored entries from two teams' schedules.
unique = {}
for g in all_games:
key = (g["date"], tuple(sorted([g["opponent"], g["team_instance"]])), g["score"])
if key not in unique:
unique[key] = g
    deduped_games = list(unique.values())
    if not deduped_games:
        logging.warning("No games parsed; nothing to write.")
        return
    out_file = "season_games.csv"
    with open(out_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=deduped_games[0].keys())
        writer.writeheader()
        writer.writerows(deduped_games)
logging.info(f"Finished. {len(all_games)} raw rows → {len(deduped_games)} unique games saved to {out_file}")

if __name__ == "__main__":
    main()