# baseball-db/convert_to_sportspress/utils.py

import csv
import re
from typing import List, Dict
from dateutil import parser
from pathlib import Path
from rich.console import Console
from rich.table import Table

def normalize_header_key(key: str) -> str:
    """
    Translate a raw CSV column name into the key name used by this module.

    The name is lower-cased and stripped of surrounding whitespace, then
    looked up in a table of known aliases.

    Args:
        key (str): The column name as it appears in the CSV header.

    Returns:
        str: The alias target when the cleaned name is a known alias,
            otherwise the cleaned name itself.
    """
    cleaned = key.lower().strip()
    aliases = {
        "away": "visitor",
        "results": "results",
        "final score": "results",
        "venue": "field",
        "location": "field",
        "result": "results",
        "w": "win",
        "l": "loss",
        "t": "tie",
        "div": "division",
        "rf": "runs_for",
        "runs": "runs_against",
    }
    if cleaned in aliases:
        return aliases[cleaned]
    return cleaned

def validate_csv_header(header: List[str]) -> bool:
    """
    Check that a CSV header provides every column this module requires.

    Each header entry is passed through normalize_header_key before the
    comparison, so aliases of a required column satisfy the check.

    Args:
        header (List[str]): The column names read from the CSV header row.

    Returns:
        bool: True when "date", "time", "field", "visitor", "home" and
            "results" are all present after normalization, False otherwise.
    """
    available = {normalize_header_key(column) for column in header}
    for required in ("date", "time", "field", "visitor", "home", "results"):
        if required not in available:
            return False
    return True

def read_csv(file_path: Path) -> List[dict]:
    """
    Read a CSV file into a list of row dictionaries with normalized keys.

    Column names are passed through normalize_header_key and every cell
    value is stripped of surrounding whitespace.

    Ragged rows are tolerated: a row shorter than the header gets an empty
    string for each missing cell, and cells beyond the last header column
    are dropped.

    Args:
        file_path (Path): Location of the CSV file to read.

    Returns:
        List[dict]: One dictionary per data row, in file order.
    """
    data = []
    with open(file_path, "r", newline="") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            normalized_row = {}
            for key, value in row.items():
                # csv.DictReader collects surplus cells in a list stored
                # under the key None; they belong to no column, so skip them.
                if key is None:
                    continue
                # Cells missing from a short row are reported as None.
                normalized_row[normalize_header_key(key)] = (value or "").strip()
            data.append(normalized_row)
    return data

def write_csv(file_path: Path, data: List[dict]) -> None:
    """
    Write a list of row dictionaries to a CSV file with a header row.

    The columns are the union of the keys of all rows, in order of first
    appearance, so rows that carry keys the first row lacks can still be
    written. A row missing a column gets an empty cell for it.

    Args:
        file_path (Path): Destination file; it is created or overwritten.
        data (List[dict]): The rows to write. When empty, an empty file is
            written because there are no column names to emit.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    fieldnames = list(dict.fromkeys(key for row in data for key in row))
    with open(file_path, "w", newline="") as csvfile:
        if not fieldnames:
            return
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

# First "<digits>-<digits>" pair found anywhere in a results string.
_SCORE_PATTERN = re.compile(r"(?P<runs_first>\d+)-(?P<runs_second>\d+)")


def parse_score(score_str: str, reverse_order: bool = False) -> Dict[str, object]:
    """
    Parse a results string and extract home and visitor scores.

    The first "<digits>-<digits>" pair found anywhere in the string is taken
    as the score, so surrounding text such as "Final 3-7" or "7-0 forfeit"
    is accepted.

    Args:
        score_str (str): The results string, containing somewhere
            "visitor-home" (or "home-visitor" when reverse_order is True).
        reverse_order (bool, optional): If True, the order of the scores is
            reversed (home first). Defaults to False.

    Returns:
        Dict[str, object]: Always contains "has_result" (bool).

            When no score is found, a non-empty input string is returned
            unchanged under "post".

            When a score is found, the dictionary also contains
            "home_runs_for", "home_runs_against", "visitor_runs_for" and
            "visitor_runs_against" (int), plus "home_outcome" and
            "visitor_outcome" ("win", "loss", "tie" or "forfeit"). Non-empty
            text before and after the score is returned under "pre" and
            "post". If that text mentions "forfeit" (case-insensitive), the
            losing side's outcome is "forfeit" instead of "loss".
    """
    score = {}
    found = _SCORE_PATTERN.search(score_str)

    if found is None:
        if score_str:
            score["post"] = score_str
        score["has_result"] = False
        return score

    pre = score_str[:found.start()]
    post = score_str[found.end():]
    if pre:
        score["pre"] = pre
    if post:
        score["post"] = post

    score["has_result"] = True
    runs_first = int(found.group("runs_first"))
    runs_second = int(found.group("runs_second"))
    if reverse_order:
        home_runs, visitor_runs = runs_first, runs_second
    else:
        home_runs, visitor_runs = runs_second, runs_first

    score.update({
        "home_runs_for": home_runs,
        "visitor_runs_for": visitor_runs,
        "home_runs_against": visitor_runs,
        "visitor_runs_against": home_runs,
    })

    # A forfeit note may sit on either side of the score and in any case.
    forfeited = "forfeit" in f"{pre} {post}".lower()
    loser_outcome = "forfeit" if forfeited else "loss"

    if home_runs > visitor_runs:
        score["home_outcome"] = "win"
        score["visitor_outcome"] = loser_outcome
    elif home_runs < visitor_runs:
        score["home_outcome"] = loser_outcome
        score["visitor_outcome"] = "win"
    else:
        score["home_outcome"] = "tie"
        score["visitor_outcome"] = "tie"

    return score

def is_visitor_home_order_reversed(header: List[str]) -> bool:
    """
    Report whether the header lists 'home' ahead of 'visitor'.

    The conventional column order puts the visitor first and home second;
    a header with home first is considered reversed.

    Args:
        header (List[str]): The list of header keys; it must contain both
            'visitor' and 'home', otherwise list.index raises ValueError.

    Returns:
        bool: True if 'home' appears before 'visitor', False otherwise.
    """
    visitor_position = header.index('visitor')
    home_position = header.index('home')
    return home_position < visitor_position

def process_data(data: List[Dict], visitor_home_order_reversed=False) -> List[Dict]:
    """
    Enrich each row in place with its parsed score and a combined datetime.

    For every row, the fields returned by parse_score for row["results"] are
    merged into the row, and row["datetime"] is set to the result of parsing
    "<date> <time>" with dateutil.

    Args:
        data (List[Dict]): Rows containing "results", "date" and "time".
        visitor_home_order_reversed (bool, optional): Forwarded to
            parse_score as its reverse_order argument. Defaults to False.

    Returns:
        List[Dict]: The same list object, with its rows updated.

    Raises:
        parser.ParserError: Propagated unchanged when the date/time text
            cannot be parsed.
    """
    for game in data:
        game.update(parse_score(game["results"], visitor_home_order_reversed))
        timestamp_text = f"{game['date']} {game['time']}"
        game['datetime'] = parser.parse(timestamp_text)

    return data

def aggregate_teams(data: List[Dict[str, str]]) -> List[Dict[str, int]]:
    """
    Build a per-team summary of games played, outcomes and runs.

    Rows whose "has_result" value is falsy are skipped. Every other row
    counts as one game played for both its home and its visitor team.

    Args:
        data (List[Dict[str, str]]): Processed game rows providing "home",
            "visitor", "has_result" and the "<side>_outcome",
            "<side>_runs_for" and "<side>_runs_against" fields.

    Returns:
        List[Dict[str, int]]: One dictionary per team with the keys "team",
            "win", "loss", "tie", "gp", "runs_for" and "runs_against",
            ordered by number of wins, highest first.
    """
    tallied_outcomes = ("win", "loss", "tie")
    standings = {}

    for game in data:
        if not game["has_result"]:
            continue

        # Home is handled before visitor so that teams are first recorded in
        # the same order as they appear in the data.
        for side in ("home", "visitor"):
            record = standings.setdefault(
                game[side],
                {"win": 0, "loss": 0, "tie": 0, "gp": 0, "runs_for": 0, "runs_against": 0},
            )
            record["gp"] += 1

            # NOTE(review): any other outcome value (e.g. "forfeit") adds to
            # "gp" but to none of win/loss/tie - confirm this is intended.
            outcome = game[f"{side}_outcome"]
            if outcome in tallied_outcomes:
                record[outcome] += 1

            record["runs_for"] += game[f"{side}_runs_for"]
            record["runs_against"] += game[f"{side}_runs_against"]

    # Flatten the mapping into one dictionary per team.
    summary = []
    for team, record in standings.items():
        summary.append({"team": team, **record})

    # Most wins first; teams with equal wins keep their first-seen order.
    summary.sort(key=lambda entry: entry["win"], reverse=True)

    return summary

def write_sportspress_csv(data: List[Dict], file_path: Path, only_with_outcome:bool = False):
    """
    Writes sports event data to a CSV file in a specific format.

    Parameters:
    - data (List[Dict]): List of dictionaries where each dictionary represents a sports event.
    - file_path (Path): The Path object representing the file path where the CSV file will be created.
    - only_with_outcome (bool, optional): If True, only events with outcomes will be included in the CSV. Default is False.

    Returns:
    None

    Example:
    >>> data = [...]  # List of dictionaries representing sports events
    >>> file_path = Path("output.csv")
    >>> write_sportspress_csv(data, file_path)
    """

    with file_path.open('w') as output_csv_file:
        writer = csv.writer(output_csv_file)

        fieldnames = [
            "Format", #Competitive or Friendly
            # "Competition",
            "Season",
            # "Date Format",
            "Date",
            "Time",
            "Venue",
            "Team",
            "Results",
            "Outcome",
            # "Players",
            # "Performance",
        ]

        # Write the header
        writer.writerow(fieldnames)

        # Write the data
        for row in data:
            if only_with_outcome and not row['has_result']:
                continue
            writer.writerow(
                [
                    row["datetime"].strftime("%Y/%m/%d"),
                    row["datetime"].strftime("%H:%M"),
                    row.get("field", ""),
                    row["home"],
                    "|".join([str(row.get(k,"")) for k in [
                        "home_runs_for_inning_1",
                        "home_runs_for_inning_2",
                        "home_runs_for_inning_3",
                        "home_runs_for_inning_4",
                        "home_runs_for_inning_5",
                        "home_runs_for_inning_6",
                        "home_runs_for_inning_7",
                        "home_runs_for_inning_8",
                        "home_runs_for_inning_9",
                        "home_runs_for_inning_10",
                        "home_runs_for",
                        "home_errors",
                        "home_hits"
                    ]]),
                    row.get("home_outcome")
                    ]
                )
            writer.writerow(
                [
                    "",
                    "",
                    "",
                    row["visitor"],
                    "|".join([str(row.get(k,"")) for k in [
                        "visitor_runs_for_inning_1",
                        "visitor_runs_for_inning_2",
                        "visitor_runs_for_inning_3",
                        "visitor_runs_for_inning_4",
                        "visitor_runs_for_inning_5",
                        "visitor_runs_for_inning_6",
                        "visitor_runs_for_inning_7",
                        "visitor_runs_for_inning_8",
                        "visitor_runs_for_inning_9",
                        "visitor_runs_for_inning_10",
                        "visitor_runs_for",
                        "visitor_errors",
                        "visitor_hits"
                    ]]),
                    row.get("visitor_outcome")
                    ]
                )