"""Scrape CSYBA team schedules for one subseason and write a deduplicated CSV."""
import requests, re, time, csv, logging
from bs4 import BeautifulSoup
from dateutil import parser as dtp

# --- Logging setup ---
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S"
)

HEADERS = {"User-Agent": "Mozilla/5.0"}
SUBSEASON_ID = "942425"
TEAM_INSTANCES = [
    "10119604", "10119605", "10119601", "10119603", "10119599", "10185021", "10119607",
    "10219990", "10119600", "10119602", "10119611", "10119616", "10119612", "10148204",
    "10147713", "10119617", "10178191", "10119608", "10119615", "10119614", "10168648",
    "10168644", "10168645", "10168646", "10168649",
]


def clean(x):
    """Collapse runs of whitespace and trim; tolerates None."""
    return re.sub(r"\s+", " ", x or "").strip()


def fetch_team_schedule(iid):
    """Fetch and parse the printable schedule page for one team instance."""
    url = (
        f"https://www.csyba.com/schedule/print/team_instance/{iid}"
        f"?schedule_type=index&subseason={SUBSEASON_ID}"
    )
    try:
        r = requests.get(url, headers=HEADERS, timeout=30)
        r.raise_for_status()
    except requests.RequestException as e:
        logging.error(f"Failed to fetch team {iid}: {e}")
        return []

    soup = BeautifulSoup(r.text, "html.parser")
    games = []
    for tr in soup.select("table tr")[1:]:  # skip the header row
        tds = tr.select("td")
        if len(tds) < 5:
            continue
        date_txt, result_txt, opp_txt, loc_txt, status_txt = [
            clean(td.get_text(" ")) for td in tds[:5]
        ]

        # Parse the date into ISO format; fall back to the raw text if unparseable.
        try:
            date_iso = dtp.parse(date_txt, fuzzy=True).date().isoformat()
        except (ValueError, OverflowError):  # dateutil's ParserError subclasses ValueError
            date_iso = date_txt

        # Extract the W/L/T result and the score (hyphen- or en-dash-separated).
        m_res = re.search(r"\b(W|L|T)\b", result_txt, re.I)
        result = m_res.group(1).upper() if m_res else ""
        m_score = re.search(r"(\d+)\s*[-–]\s*(\d+)", result_txt)
        hs, as_ = (m_score.group(1), m_score.group(2)) if m_score else ("", "")

        # A leading "@" on the opponent cell marks an away game.
        away_flag = opp_txt.startswith("@")
        opponent = opp_txt.lstrip("@").strip()

        games.append({
            "team_instance": iid,
            "date": date_iso,
            "result": result,
            "score": f"{hs}-{as_}" if hs else "",
            "home_score": hs,
            "away_score": as_,
            "opponent": opponent,
            "is_away": away_flag,
            "location": loc_txt,
            "status": status_txt,
            "source_url": url,
        })
    logging.info(f"Team {iid}: parsed {len(games)} games")
    return games


def main():
    all_games = []
    for i, iid in enumerate(TEAM_INSTANCES, start=1):
        logging.info(f"[{i}/{len(TEAM_INSTANCES)}] Fetching schedule for team {iid}")
        all_games.extend(fetch_team_schedule(iid))
        time.sleep(0.5)  # be polite to the server between requests

    # Deduplicate on (date, sorted {team_instance, opponent}, score). Note this only
    # removes exact repeats within a team's own schedule: the same game seen from the
    # opposing team's page has a different (instance ID, opponent name) pair, so full
    # cross-team dedup would need a name -> instance-ID mapping.
    unique = {}
    for g in all_games:
        key = (g["date"], tuple(sorted([g["opponent"], g["team_instance"]])), g["score"])
        if key not in unique:
            unique[key] = g
    deduped_games = list(unique.values())

    # Guard against an empty result set so DictWriter never indexes an empty list.
    if not deduped_games:
        logging.warning("No games parsed; nothing to write.")
        return

    out_file = "season_games.csv"
    with open(out_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=deduped_games[0].keys())
        writer.writeheader()
        writer.writerows(deduped_games)
    logging.info(
        f"Finished. {len(all_games)} raw rows → {len(deduped_games)} unique games "
        f"saved to {out_file}"
    )


if __name__ == "__main__":
    main()