Files
baseball-db/convert_to_sportspress/utils.py
2023-12-31 14:28:02 -06:00

263 lines
9.8 KiB
Python

import csv
import re
from typing import List, Dict
from dateutil import parser
from pathlib import Path
from rich.console import Console
from rich.table import Table
def normalize_header_key(key: str) -> str:
    """
    Map a raw CSV column name onto the canonical key used by this module.

    The name is lower-cased and stripped of surrounding whitespace, then
    looked up in a table of known aliases. Names without an alias are
    returned in their lower-cased, stripped form.

    Args:
        key (str): The column name as it appears in the CSV header.
    Returns:
        str: The canonical key.
    """
    cleaned = key.lower().strip()
    aliases = {
        "away": "visitor",
        "results": "results",
        "final score": "results",
        "venue": "field",
        "location": "field",
        "result": "results",
        "w": "win",
        "l": "loss",
        "t": "tie",
        "div": "division",
        "rf": "runs_for",
        "runs": "runs_against",
    }
    if cleaned in aliases:
        return aliases[cleaned]
    return cleaned
def validate_csv_header(header: List[str]) -> bool:
    """
    Check that a CSV header provides every column this module requires.

    Each header entry is normalized with normalize_header_key before the
    comparison, so aliased column names are accepted.

    Args:
        header (List[str]): The column names from the CSV header row.
    Returns:
        bool: True when date, time, field, visitor, home and results are all present.
    """
    available = {normalize_header_key(column) for column in header}
    missing = {"date", "time", "field", "visitor", "home", "results"} - available
    return not missing
def read_csv(file_path: Path) -> List[dict]:
    """
    Read a CSV file into a list of row dictionaries with normalized keys.

    Column names are passed through normalize_header_key and cell values are
    stripped of surrounding whitespace.

    Args:
        file_path (Path): Location of the CSV file to read.
    Returns:
        List[dict]: One dictionary per data row, keyed by normalized column name.
    """
    rows = []
    with open(file_path, "r", newline="") as csvfile:
        for raw_row in csv.DictReader(csvfile):
            cleaned_row = {}
            for key, value in raw_row.items():
                if key is None:
                    # DictReader collects cells beyond the header width under
                    # the key None; they have no column name to map, so skip them.
                    continue
                # DictReader fills cells missing from a short row with None;
                # treat those as empty strings instead of failing on .strip().
                cleaned_row[normalize_header_key(key)] = (value or "").strip()
            rows.append(cleaned_row)
    return rows
def write_csv(file_path: Path, data: List[dict]) -> None:
    """
    Write a list of row dictionaries to a CSV file with a header row.

    The columns are the union of the keys of all rows, in first-seen order,
    so rows that carry keys the first row lacks are still written. Cells a
    row does not provide are left empty. An empty ``data`` list produces an
    empty file.

    Args:
        file_path (Path): Location of the CSV file to create or overwrite.
        data (List[dict]): The rows to write.
    Returns:
        None
    """
    # dict.fromkeys keeps insertion order, giving a stable de-duplicated column list.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))
    with open(file_path, "w", newline="") as csvfile:
        if not data:
            # Nothing to derive a header from; leave the file empty.
            return
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
def parse_score(score_str: str, reverse_order: bool = False) -> Dict[str, int]:
    """
    Parse a score string and extract home and visitor scores.

    The first "<digits>-<digits>" pair found anywhere in the string is taken
    as the score. Text before it is returned under "pre" and text after it
    under "post" (each only when non-empty). When the string holds no score,
    the whole text (if any) is returned under "post".

    Args:
        score_str (str): The score string containing somewhere "visitor-home".
        reverse_order (bool, optional): If True, the order of the scores is reversed (home first).
            Defaults to False.
    Returns:
        Dict[str, int]: A dictionary that always has "has_result" (bool). When a
            score was found it also has the runs for/against and the outcome
            ("win", "loss", "tie" or "forfeit") of both sides.
    Raises:
        ValueError: If the string cannot be matched, e.g. it spans several lines.
    """
    # The score group and the trailing text are optional *together*, so the
    # lazy "pre" group is forced to grow until a score is found (or the end of
    # the string is reached). Making them independently optional would let the
    # pattern succeed with an empty "pre" and no score for any input that does
    # not start with the score.
    regex = re.compile(
        r"^(?P<pre>.*?)(?:(?P<runs_first>\d+)-(?P<runs_second>\d+)(?P<post>.*))?$"
    )
    match = regex.match(score_str)
    if match is None:
        raise ValueError("Invalid score format")

    score = {}
    if match.group("runs_first") is None:
        # No score present: report the free text under "post".
        if match.group("pre"):
            score["post"] = match.group("pre")
        score["has_result"] = False
        return score

    if match.group("pre"):
        score["pre"] = match.group("pre")
    if match.group("post"):
        score["post"] = match.group("post")
    score["has_result"] = True

    runs_first = int(match.group("runs_first"))
    runs_second = int(match.group("runs_second"))
    if reverse_order:
        home_runs, visitor_runs = runs_first, runs_second
    else:
        home_runs, visitor_runs = runs_second, runs_first
    score.update({
        "home_runs_for": home_runs, "visitor_runs_for": visitor_runs,
        "home_runs_against": visitor_runs, "visitor_runs_against": home_runs,
    })

    # A forfeit is only recognised in the text that follows the score.
    forfeited = "forfeit" in score.get("post", "")
    if home_runs > visitor_runs:
        score["home_outcome"] = "win"
        score["visitor_outcome"] = "forfeit" if forfeited else "loss"
    elif home_runs < visitor_runs:
        score["home_outcome"] = "forfeit" if forfeited else "loss"
        score["visitor_outcome"] = "win"
    else:
        score["home_outcome"] = "tie"
        score["visitor_outcome"] = "tie"
    return score
def is_visitor_home_order_reversed(header: List[str]) -> bool:
    """
    Tell whether the header lists 'home' ahead of 'visitor'.

    The expected convention is visitor first, home second; a header that
    puts 'home' first is considered reversed.

    Args:
        header (List[str]): The list of header keys; must contain both 'visitor' and 'home'.
    Returns:
        bool: True if 'home' appears before 'visitor', False otherwise.
    Raises:
        ValueError: If 'visitor' or 'home' is not in the header.
    """
    visitor_position = header.index('visitor')
    home_position = header.index('home')
    return home_position < visitor_position
def process_data(data: List[Dict], visitor_home_order_reversed = False) -> List[Dict]:
    """
    Enrich each row in place with its parsed score and a combined datetime.

    For every row, the "results" text is run through parse_score and the
    returned keys are merged into the row; the "date" and "time" values are
    then joined with a space and parsed into row["datetime"].

    Args:
        data (List[Dict]): Rows containing at least "results", "date" and "time".
        visitor_home_order_reversed (bool, optional): Passed to parse_score as its
            reverse_order flag. Defaults to False.
    Returns:
        List[Dict]: The same list object, with its rows updated.
    Raises:
        parser.ParserError: Propagated unchanged when the date/time text cannot be parsed.
    """
    for game in data:
        game.update(parse_score(game["results"], visitor_home_order_reversed))
        timestamp_text = f"{game['date']} {game['time']}"
        game['datetime'] = parser.parse(timestamp_text)
    return data
def aggregate_teams(data: List[Dict[str, str]]) -> List[Dict[str, int]]:
    """
    Build a per-team standings table from processed game rows.

    Rows whose "has_result" is falsy are ignored. For every other row both
    the home and the visitor team get a game played, their runs for/against
    added, and - when their outcome is "win", "loss" or "tie" - that counter
    incremented. Any other outcome value only counts towards games played
    and runs.

    Args:
        data (List[Dict[str, str]]): Game rows as produced by process_data.
    Returns:
        List[Dict[str, int]]: One dictionary per team with the keys "team", "win",
            "loss", "tie", "gp", "runs_for" and "runs_against", ordered by wins,
            highest first.
    """
    counted_outcomes = ("win", "loss", "tie")
    standings = {}
    for game in data:
        if not game["has_result"]:
            continue
        # Home is handled before visitor so teams enter the table in the
        # order they are first encountered.
        for side in ("home", "visitor"):
            record = standings.setdefault(
                game[side],
                {"win": 0, "loss": 0, "tie": 0, "gp": 0, "runs_for": 0, "runs_against": 0},
            )
            record["gp"] += 1
            outcome = game[f"{side}_outcome"]
            if outcome in counted_outcomes:
                record[outcome] += 1
            record["runs_for"] += game[f"{side}_runs_for"]
            record["runs_against"] += game[f"{side}_runs_against"]
    # Flatten the mapping into a list of rows, one per team.
    table = [{"team": name, **record} for name, record in standings.items()]
    # Most wins first; the sort is stable, so ties keep first-seen order.
    table.sort(key=lambda entry: entry["win"], reverse=True)
    return table
def write_sportspress_csv(data: List[Dict], file_path: Path, only_with_outcome:bool = False):
    """
    Writes sports event data to a CSV file in a specific format.

    After the header row, every event produces two rows: the first carries the
    date, time, venue, home team, home results and home outcome; the second
    leaves date, time and venue blank and carries the visitor team, visitor
    results and visitor outcome. The results cell is a "|"-joined list of
    innings 1-10, total runs, errors and hits, with missing values left empty.

    Parameters:
    - data (List[Dict]): List of dictionaries where each dictionary represents a sports event.
      Each needs "datetime" (a datetime), "home" and "visitor"; "has_result" is
      required when only_with_outcome is True.
    - file_path (Path): The Path object representing the file path where the CSV file will be created.
    - only_with_outcome (bool, optional): If True, only events with outcomes will be included in the CSV. Default is False.
    Returns:
    None
    Example:
    >>> data = [...] # List of dictionaries representing sports events
    >>> file_path = Path("output.csv")
    >>> write_sportspress_csv(data, file_path)
    """
    def results_cell(row, side):
        # Build the "|"-separated results cell for one side ("home"/"visitor").
        keys = [f"{side}_runs_for_inning_{inning}" for inning in range(1, 11)]
        keys += [f"{side}_runs_for", f"{side}_errors", f"{side}_hits"]
        return "|".join(str(row.get(key, "")) for key in keys)

    # newline="" lets the csv writer control line endings itself; without it
    # text-mode newline translation can emit "\r\r\n" on Windows.
    with file_path.open('w', newline='') as output_csv_file:
        writer = csv.writer(output_csv_file)
        # NOTE(review): the header names eight columns, but each data row below
        # has six cells (no Format/Season values) - confirm which layout the
        # importer expects before changing either side.
        fieldnames = [
            "Format", #Competitive or Friendly
            # "Competition",
            "Season",
            # "Date Format",
            "Date",
            "Time",
            "Venue",
            "Team",
            "Results",
            "Outcome",
            # "Players",
            # "Performance",
        ]
        # Write the header
        writer.writerow(fieldnames)
        # Write the data
        for row in data:
            if only_with_outcome and not row['has_result']:
                continue
            # Home row: carries the event's date, time and venue.
            writer.writerow(
                [
                    row["datetime"].strftime("%Y/%m/%d"),
                    row["datetime"].strftime("%H:%M"),
                    row.get("field", ""),
                    row["home"],
                    results_cell(row, "home"),
                    row.get("home_outcome"),
                ]
            )
            # Visitor row: date, time and venue are left blank.
            writer.writerow(
                [
                    "",
                    "",
                    "",
                    row["visitor"],
                    results_cell(row, "visitor"),
                    row.get("visitor_outcome"),
                ]
            )