Remove redundant docstrings and clean comments in compute_ratings.py
- Simplify function docstrings for load_games, aggregate_team_stats, and others
- Keep key explanatory comments concise within code blocks
- Maintain overall code clarity while reducing verbosity
@@ -29,20 +29,8 @@ def load_games(
     team_id: str = "names",
     final_status: str | None = None,
 ) -> pd.DataFrame:
-    """
-    Load input CSV (season_schedule.csv) into a cleaned DataFrame with consistent columns.
-
-    Parameters:
-    - inp: CSV path to read
-    - team_id: 'names' or 'slugs' to identify teams
-    - final_status: if given, filter rows with status matching this (e.g. 'final')
-
-    Returns:
-    DataFrame with columns Date, HomeTeam, AwayTeam, HomeRuns, AwayRuns, Margin, Result
-    """
     df = pd.read_csv(inp)
     # Choose identifiers
-    # Determine team ID columns based on input param
     home_id_col = "home_name" if team_id == "names" else "home_slug"
     away_id_col = "away_name" if team_id == "names" else "away_slug"
     for c in [home_id_col, away_id_col, "home_runs", "away_runs"]:
@@ -50,31 +38,27 @@ def load_games(
             raise ValueError(f"Missing required column: {c}")
 
     # Optional status filter (helps exclude postponed/canceled)
-    # Filter for final_status if provided to exclude e.g. postponed games
     if final_status is not None and "status" in df.columns:
         df = df[df["status"].astype(str).str.lower() == str(final_status).lower()]
 
-    # Convert run columns to numeric, drop rows with missing runs or teams
+    # Keep only games with numeric scores
     df = df.copy()
     df["home_runs"] = pd.to_numeric(df["home_runs"], errors="coerce")
     df["away_runs"] = pd.to_numeric(df["away_runs"], errors="coerce")
     df = df.dropna(subset=[home_id_col, away_id_col, "home_runs", "away_runs"])
 
-    # Parse datetime by combining date_local and time_local if possible
+    # Parse datetime (robust to missing either field)
     date = pd.to_datetime(df.get("date_local", pd.NaT), errors="coerce")
     time = pd.to_datetime(df.get("time_local", pd.NaT), errors="coerce").dt.time
+    # Combine when possible
     dt = date
     if "time_local" in df.columns:
-        # Build datetime where both date and time present
+        # build datetime only where both present
        dt = pd.to_datetime(
             date.dt.strftime("%Y-%m-%d").fillna("") + " " +
             pd.Series(time).astype(str).replace("NaT",""),
             errors="coerce"
         )
-    # Construct cleaned DataFrame with fixed column names
     df_out = pd.DataFrame({
         "Date": dt,
         "HomeTeam": df[home_id_col].astype(str),
@@ -82,35 +66,19 @@ def load_games(
         "HomeRuns": df["home_runs"].astype(int),
         "AwayRuns": df["away_runs"].astype(int),
     })
-    # Margin is difference in runs (home - away)
     df_out["Margin"] = df_out["HomeRuns"] - df_out["AwayRuns"]
-    # Result: 'H' if home win, 'A' if away win, 'T' for tie
     df_out["Result"] = np.where(df_out["HomeRuns"] > df_out["AwayRuns"], "H",
                                 np.where(df_out["HomeRuns"] < df_out["AwayRuns"], "A", "T"))
     return df_out.reset_index(drop=True)
 
 def aggregate_team_stats(df: pd.DataFrame) -> pd.DataFrame:
-    """
-    Aggregate game-level data into team-level season stats: wins, losses, ties, runs scored,
-    runs allowed, games played, win percentage, and run differential.
-
-    Parameters:
-    - df: DataFrame with game results
-
-    Returns:
-    DataFrame indexed by Team with aggregated stats
-    """
-    # Collect all team names from home and away columns
     teams = pd.Index(sorted(set(df["HomeTeam"]).union(df["AwayTeam"])), name="Team")
-    # Initialize stats DataFrame with W/L/T/RS/RA all zero
     stats = pd.DataFrame(index=teams, columns=["W","L","T","RS","RA"], data=0)
     for _, r in df.iterrows():
         h, a = r["HomeTeam"], r["AwayTeam"]
         hr, ar = int(r["HomeRuns"]), int(r["AwayRuns"])
-        # Update runs scored and allowed for both teams
         stats.at[h,"RS"] += hr; stats.at[h,"RA"] += ar
         stats.at[a,"RS"] += ar; stats.at[a,"RA"] += hr
-        # Update win/loss/tie counts
         if hr > ar:
             stats.at[h,"W"] += 1; stats.at[a,"L"] += 1
         elif hr < ar:
@@ -118,57 +86,22 @@ def aggregate_team_stats(df: pd.DataFrame) -> pd.DataFrame:
         else:
             stats.at[h,"T"] += 1; stats.at[a,"T"] += 1
     stats = stats.astype(int)
-    # Games played
     stats["GP"] = stats["W"] + stats["L"] + stats["T"]
-    # Win percentage with ties counting as half a win
     stats["WinPct"] = (stats["W"] + 0.5 * stats["T"]) / stats["GP"].replace(0, np.nan)
-    # Run differential (runs scored - runs allowed)
     stats["RunDiff"] = stats["RS"] - stats["RA"]
     return stats.reset_index()
 
 def pythagorean(rs: pd.Series, ra: pd.Series, exp: float) -> pd.Series:
-    """
-    Compute Pythagorean expectation for winning percentage:
-    RS^exp / (RS^exp + RA^exp), handling zero or missing runs.
-
-    Parameters:
-    - rs: runs scored
-    - ra: runs allowed
-    - exp: exponent (typically ~1.83 for baseball)
-
-    Returns:
-    Series of expected win percentages
-    """
     rs = rs.clip(lower=0); ra = ra.clip(lower=0)
     num = np.power(rs, exp); den = num + np.power(ra, exp)
     with np.errstate(divide="ignore", invalid="ignore"):
-        p = np.where(den > 0, num / den, 0.5) # handle zero denominator as 0.5 (neutral)
+        p = np.where(den > 0, num / den, 0.5)
     return pd.Series(p, index=rs.index)
 
 def estimate_home_field_runs(df: pd.DataFrame) -> float:
-    """
-    Estimate home-field advantage in runs as the average margin (home_runs - away_runs).
-    Useful for adjusting rating systems to neutralize advantage.
-
-    Returns:
-    Float average home-field runs advantage.
-    """
     return float(df["Margin"].mean()) if len(df) else 0.0
 
 def massey(df: pd.DataFrame, cap: float, subtract_home: bool) -> tuple[pd.Series, float]:
-    """
-    Calculate Massey ratings (simple linear system) for teams using margins of victory.
-    Optionally caps margins and subtracts estimated home field runs.
-
-    Parameters:
-    - df: games DataFrame with HomeTeam, AwayTeam, Margin columns
-    - cap: maximum absolute margin value to use (run cap)
-    - subtract_home: whether to subtract estimated home field runs advantage
-
-    Returns:
-    Tuple of (ratings Series indexed by team, estimated home-run advantage float)
-    """
     teams = sorted(set(df["HomeTeam"]).union(df["AwayTeam"]))
     idx = {t:i for i,t in enumerate(teams)}
     y = df["Margin"].astype(float).to_numpy()
@@ -178,107 +111,48 @@ def massey(df: pd.DataFrame, cap: float, subtract_home: bool) -> tuple[pd.Series
     if subtract_home:
         y = y - h_est
     G, N = len(df), len(teams)
-    # Construct design matrix A with +1 for home, -1 for away per game, plus normalization row
     A = np.zeros((G+1, N), dtype=float)
     for r_i, r in enumerate(df.itertuples(index=False)):
         A[r_i, idx[r.HomeTeam]] = 1.0
         A[r_i, idx[r.AwayTeam]] = -1.0
-    # Normalize ratings sum to zero for uniqueness
     A[G, :] = 1.0
     y_ext = np.concatenate([y, [0.0]])
-    # Solve least squares for ratings vector
     r_sol, *_ = np.linalg.lstsq(A, y_ext, rcond=None)
     return pd.Series(r_sol, index=teams), (h_est if subtract_home else 0.0)
 
 def elo_expected(ra: float, rb: float) -> float:
-    """
-    Compute Elo expected probability (expected score) for player A.
-
-    Parameters:
-    - ra: rating of player A
-    - rb: rating of player B
-
-    Returns:
-    Probability player A wins
-    """
     return 1.0 / (1.0 + 10.0 ** (-(ra - rb) / 400.0))
 
 def elo_once(df: pd.DataFrame, K: float, H: float, mcap: float, init: dict[str,float]) -> dict[str,float]:
-    """
-    Perform one pass of Elo rating updates across the games in chronological order.
-
-    Parameters:
-    - df: DataFrame with games (must have HomeTeam, AwayTeam, HomeRuns, AwayRuns)
-    - K: Elo K-factor (adjustment multiplier)
-    - H: home field bonus in points
-    - mcap: cap for margin of victory factor ln(|margin| + 1)
-    - init: dict of initial ratings by team
-
-    Returns:
-    Updated dict of Elo ratings after processing games.
-    """
     ratings = dict(init)
     for _, r in df.iterrows():
         h, a = r["HomeTeam"], r["AwayTeam"]
         hr, ar = int(r["HomeRuns"]), int(r["AwayRuns"])
         margin = hr - ar
-        # Calculate expected win probability for home team (with home advantage added)
         Eh = elo_expected(ratings[h] + H, ratings[a])
-        # Actual game result scores (1 for win, 0 for loss, 0.5 tie)
         Sh, Sa = (1.0, 0.0) if hr > ar else ((0.0, 1.0) if hr < ar else (0.5, 0.5))
-        # Margin factor based on logarithm of absolute margin plus one
         M = np.log(abs(margin) + 1.0)
         if mcap is not None:
             M = min(M, mcap)
-        # Elo rating update, scaled by margin factor and difference between actual and expected score
         ratings[h] += K * M * (Sh - Eh)
         ratings[a] += K * M * ((1.0 - Sh) - (1.0 - Eh))
     return ratings
 
 def elo(df: pd.DataFrame, K=24.0, H=30.0, mcap=2.0, shuffles=20, seed=42) -> pd.Series:
-    """
-    Compute Elo ratings averaged over multiple random shuffle orders of games
-    to reduce order dependency of sequential Elo updates.
-
-    Parameters:
-    - df: games DataFrame sorted by Date
-    - K: Elo K-factor
-    - H: home field advantage bonus
-    - mcap: margin factor cap
-    - shuffles: number of random game orders to compute Elo over
-    - seed: RNG seed for reproducibility
-
-    Returns:
-    Series of Elo ratings indexed by team
-    """
     teams = sorted(set(df["HomeTeam"]).union(df["AwayTeam"]))
-    base = {t: 1500.0 for t in teams} # initial Elo ratings
+    base = {t: 1500.0 for t in teams}
     df0 = df.sort_values(["Date"]).reset_index(drop=True)
-    # Elo with original date order (baseline)
     r_first = elo_once(df0, K, H, mcap, base)
-    # Initialize RNG
     rng = np.random.default_rng(seed)
     vals = {t: [r_first[t]] for t in teams}
-    # Compute Elo over randomized orderings for averaging
     for _ in range(max(0, shuffles-1)):
         idx = np.arange(len(df0)); rng.shuffle(idx)
         r = elo_once(df0.iloc[idx].reset_index(drop=True), K, H, mcap, base)
         for t in teams:
             vals[t].append(r[t])
-    # Average ratings across runs for each team
     return pd.Series({t: float(np.mean(vals[t])) for t in teams}).sort_index()
 
 def zscore(s: pd.Series) -> pd.Series:
-    """
-    Calculate z-score (standard score) for a pandas Series.
-
-    Parameters:
-    - s: input Series
-
-    Returns:
-    Series normalized to mean=0 and std=1; zeros if std=0.
-    """
     mu, sd = s.mean(), s.std(ddof=0)
     return pd.Series(0.0, index=s.index) if (sd == 0 or np.isnan(sd)) else (s - mu) / sd
@@ -302,68 +176,47 @@ def main(
     elo_shuffles: int = typer.Option(20, help="Random shuffles to average Elo"),
     elo_seed: int = typer.Option(42, help="RNG seed for shuffles")
 ):
-    """
-    Main entry point:
-
-    Loads input games, computes aggregate stats, Pythagorean expectation,
-    Massey ratings, Elo ratings (averaged over shuffles), Strength of Schedule,
-    and an overall CompositeRating combining these metrics.
-
-    Outputs a CSV file with rankings and stats.
-    """
     team_id = team_id.lower()
-    # Load cleaned games DataFrame
+    # Load games
     games = load_games(inp, team_id=team_id, final_status=final_status)
 
-    # Compute aggregated team-level statistics from games
+    # Aggregates
     team = aggregate_team_stats(games)
-    # Calculate Pythagorean expected winning percentage
     team["PythagoreanWinPct"] = pythagorean(team["RS"], team["RA"], pyexp)
 
-    # Calculate Massey ratings and get estimated home field runs
+    # Ratings
     massey_r, h_runs = massey(games, cap=massey_cap, subtract_home=not no_massey_home_adj)
 
-    # Calculate Strength of Schedule as average Massey rating of opponents
+    # Strength of schedule
     opps = {t: [] for t in massey_r.index}
     for _, r in games.iterrows():
         opps[r["HomeTeam"]].append(r["AwayTeam"])
         opps[r["AwayTeam"]].append(r["HomeTeam"])
     sos_series = pd.Series({t: (float(massey_r[opps[t]].mean()) if opps[t] else 0.0) for t in opps})
 
-    # Compute Elo ratings with multiple shuffles for stability
     elo_r = elo(games, K=elo_k, H=elo_home, mcap=elo_mcap, shuffles=elo_shuffles, seed=elo_seed)
 
-    # Merge all metrics into a single DataFrame
+    # Merge
     out_df = team.set_index("Team")
     out_df["MasseyRating"] = massey_r
     out_df["EloRating"] = elo_r
     out_df["StrengthOfSchedule"] = sos_series
 
-    # Composite rating: weighted Z-score combination of Massey, Elo, and Pythagorean
+    # Composite
     Z_r, Z_e, Z_p = zscore(out_df["MasseyRating"]), zscore(out_df["EloRating"]), zscore(out_df["PythagoreanWinPct"])
     out_df["CompositeRating"] = 0.45*Z_r + 0.35*Z_e + 0.20*Z_p
 
     out_df = out_df.reset_index()
-    # Select columns and sort teams by CompositeRating descending
     out_df = out_df[[
         "Team","GP","W","L","T","WinPct","RS","RA","RunDiff",
         "PythagoreanWinPct","MasseyRating","EloRating","StrengthOfSchedule","CompositeRating"
     ]].sort_values("CompositeRating", ascending=False)
 
-    # Round numeric columns for neatness
+    # Round for readability
     for c in ["WinPct","PythagoreanWinPct","MasseyRating","EloRating","StrengthOfSchedule","CompositeRating"]:
         out_df[c] = out_df[c].astype(float).round(5)
 
-    # Write to output CSV
     out_df.to_csv(out, index=False)
-    # Output summary info
     print(f"Done. Estimated home-field (runs) used in Massey: {h_runs:.3f}")
     print(f"Teams ranked: {len(out_df)} | Games processed: {len(games)}")
     print(f"Output -> {out}")