Source code for ufcscraper.fight_scraper

"""
This module defines classes for scraping fight and round data from the UFCStats
website.

Classes:
    FightScraper: Inherits from `BaseScraper` and is responsible for scraping
    detailed fight statistics, such as fighter information, results, referees,
    and more. The data is stored in a CSV file named `fight_data.csv`. It also
    interacts with the `RoundsHandler` to scrape and store round-specific
    statistics.

    RoundsHandler: Inherits from `BaseFileHandler` and manages the collection
    and storage of round-specific fight data. The data is saved in a CSV file
    named `round_data.csv`. It handles statistics like strikes, takedowns,
    control time, and more.
"""

from __future__ import annotations

from abc import ABC
import csv
import logging
import re
from typing import TYPE_CHECKING
import pandas as pd

from ufcscraper.base import BaseScraper, BaseFileHandler
from ufcscraper.event_scraper import EventScraper, UpcomingEventScraper
from ufcscraper.fighter_scraper import FighterScraper
from ufcscraper.utils import links_to_soups

if TYPE_CHECKING:  # pragma: no cover
    import bs4
    from typing import Any, Dict, List, Tuple

logger = logging.getLogger(__name__)


[docs]
class BaseFightScraper(BaseScraper, ABC):
    """Base class for fight scrapers.

    This class provides the basic functionality to scrape fight data from the
    UFCStats website. It should be inherited by specific fight scraper classes.
    """
    # Scraper class used to list events and to resolve the fight URLs of each
    # event (see `get_fight_urls`).
    event_scraper = EventScraper


[docs]
    @classmethod
    def url_from_id(cls, id_: str) -> str:
        """Constructs the fight URL using the fight ID.

        Args:
            id_: The unique identifier for the fight.

        Returns:
            The full URL to the fight's details page on UFCStats.
        """
        return f"{cls.web_url}/fight-details/{id_}"

    

[docs]
    def get_fight_urls(self, get_all_events: bool = False) -> List[str]:
        """Retrieves URLs of all fights from UFCStats.

        The event IDs are read from the data of a freshly created
        `event_scraper` instance; each event is then resolved to its fight
        URLs through that instance.

        Args:
            get_all_events: If False, only gets URLs for fights from events
                not already present in this scraper's own data.

        Returns:
            A list of URLs for fights.
        """
        logger.info("Scraping fight links...")

        logger.info("Opening event information to extract event urls...")
        event_scraper_instance = self.event_scraper(
            self.data_folder, self.n_sessions, self.delay
        )
        event_ids = event_scraper_instance.data["event_id"].unique().tolist()

        # Remove events for which information is already extracted.
        if not get_all_events:
            # Build the lookup once: recomputing the unique IDs for every
            # candidate turned this filter into a quadratic list scan.
            scraped_event_ids = set(self.data["event_id"].unique().tolist())
            event_ids = [id_ for id_ in event_ids if id_ not in scraped_event_ids]

        event_urls: List[str] = list(map(self.event_scraper.url_from_id, event_ids))
        fight_urls = event_scraper_instance.get_fight_urls_from_event_urls(event_urls)

        logger.info(f"Got {len(fight_urls)} fight links...")
        return list(fight_urls)



[docs]
    @staticmethod
    def get_fighters(
        fight_details: bs4.element.ResultSet, fight_soup: bs4.BeautifulSoup
    ) -> Tuple[str, str]:
        """Extracts fighter IDs from the fight details.

        The fighter profile links are read from the first two entries of
        `fight_details`. If that fails, the links are taken from the
        ``a.b-fight-details__person-link`` anchors of the whole page instead.

        Args:
            fight_details: A ResultSet containing fight detail information.
            fight_soup: The BeautifulSoup object containing the fight page.

        Returns:
            A tuple containing the IDs of the two fighters.
        """
        try:
            fighter_urls = (
                fight_details[0].select("a.b-link.b-link_style_black")[0]["href"],
                fight_details[1].select("a.b-link.b-link_style_black")[0]["href"],
            )
        # Only parse failures trigger the fallback; a bare ``except`` would
        # also have swallowed KeyboardInterrupt / SystemExit.
        except (IndexError, KeyError, AttributeError, TypeError):  # pragma: no cover
            person_links = fight_soup.select("a.b-fight-details__person-link")
            fighter_urls = (person_links[0]["href"], person_links[1]["href"])

        fighter_1, fighter_2 = map(FighterScraper.id_from_url, fighter_urls)

        return fighter_1, fighter_2


    # Checks if fight is title fight

[docs]
    @staticmethod
    def get_title_fight(fight_type: bs4.element.ResultSet) -> str:
        """Determines if the fight is a title fight.

        Args:
            fight_type: A ResultSet containing fight type information.

        Returns:
            'T' if it's a title fight, 'F' otherwise.
        """
        if "Title" in fight_type[0].text:
            return "T"
        else:
            return "F"


    # Scrapes weight class of fight

[docs]
    @staticmethod
    def get_weight_class(fight_type: bs4.element.ResultSet) -> str:
        """Extracts the weight class of the fight.

        Args:
            fight_type: A ResultSet containing fight type information.

        Returns:
            The weight class of the fight, or '' if not found.
        """
        if "Light Heavyweight" in fight_type[0].text.strip():
            return "Light Heavyweight"

        elif "Women" in fight_type[0].text.strip():
            return "Women's " + re.findall(r"\w*weight", fight_type[0].text.strip())[0]

        elif "Catch Weight" in fight_type[0].text.strip():
            return "Catch Weight"

        elif "Open Weight" in fight_type[0].text.strip():
            return "Open Weight"

        else:
            try:
                return re.findall(r"\w*weight", fight_type[0].text.strip())[0]
            except:
                return ""



    

[docs]
class FightScraper(BaseFightScraper):
    """Scrapes fight data from the UFCStats website.

    This class inherits from `BaseFightScraper` and handles scraping detailed
    fight statistics including fighters, referees, results, and more. It
    saves the scraped data into two CSV files: one for fights and one for
    rounds (through the companion class `RoundsHandler`).
    """

    # Column name -> dtype of the fight table. The key order is the column
    # order of the empty `data` frame built below.
    dtypes: Dict[str, type | pd.core.arrays.integer.Int64Dtype] = {
        "fight_id": str,
        "event_id": str,
        "referee": str,
        "fighter_1": str,
        "fighter_2": str,
        "winner": str,
        "num_rounds": pd.Int64Dtype(),
        "title_fight": str,
        "weight_class": str,
        "gender": str,
        "result": str,
        "result_details": str,
        "finish_round": pd.Int64Dtype(),
        "finish_time": str,
        "time_format": str,
        "scores_1": pd.Int64Dtype(),
        "scores_2": pd.Int64Dtype(),
    }
    sort_fields = ["event_id", "fight_id"]
    # Empty frame with one typed column per `dtypes` entry.
    data = pd.DataFrame({col: pd.Series(dtype=dt) for col, dt in dtypes.items()})
    filename = "fight_data.csv"

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Set up the scraper together with its companion `RoundsHandler`.

        Args:
            *args: Positional arguments forwarded to the base class.
            **kwargs: Keyword arguments forwarded to the base class.
        """
        super().__init__(*args, **kwargs)

        # The rounds handler is created on the same data folder as this
        # scraper.
        handler = RoundsHandler(self.data_folder)
        self.rounds_handler = handler


[docs]
    def scrape_fights(self, get_all_events: bool = False) -> None:
        """Scrapes fight data and saves it to CSV files.

        This method scrapes fight details and round statistics. It appends
        the fight details to this scraper's data file and the round
        statistics to the data file of `self.rounds_handler`, then removes
        duplicated rows from both files.

        A fight whose page cannot be parsed is logged and skipped; nothing is
        written for it.

        Args:
            get_all_events: If False, only scrapes fights from events not
                already scraped.
        """
        existing_urls = set(map(self.url_from_id, self.data["fight_id"]))
        ufcstats_fight_urls = self.get_fight_urls(get_all_events)
        urls_to_scrape = set(ufcstats_fight_urls) - existing_urls

        logger.info("Opening round information to scrape stats")
        # Reuse the handler created in __init__ (it is also the one used for
        # de-duplication below) instead of building a second instance.
        rounds_handler = self.rounds_handler

        logger.info(f"Scraping {len(urls_to_scrape)} fights...")

        # newline="" is what the csv module requires for files handed to
        # csv.writer; without it extra "\r" characters are written on
        # platforms that translate line endings.
        with (
            open(self.data_file, "a", newline="") as f_fights,
            open(rounds_handler.data_file, "a+", newline="") as f_rounds,
        ):
            writer_fights = csv.writer(f_fights)
            writer_rounds = csv.writer(f_rounds)

            for i, (url, soup) in enumerate(
                links_to_soups(list(urls_to_scrape), self.n_sessions, self.delay)
            ):
                try:
                    overview = soup.select("i.b-fight-details__text-item")
                    select_result = soup.select("i.b-fight-details__text-item_first")
                    select_result_details = soup.select("p.b-fight-details__text")
                    # Used both for the fighter links and the per-round stats.
                    fight_details = soup.select("p.b-fight-details__table-text")
                    fight_type = soup.select("i.b-fight-details__fight-title")
                    win_lose = soup.select("i.b-fight-details__person-status")

                    if soup.h2 is not None:
                        event_id = self.event_scraper.id_from_url(
                            str(soup.h2.select("a.b-link")[0]["href"])
                        )
                    else:
                        raise TypeError("Couldn't find header in the soup.")

                    referee = self.get_referee(overview)
                    fighter_1, fighter_2 = self.get_fighters(fight_details, soup)
                    time_format = overview[2].text.split(":")[1].strip()
                    # First character of the time format is the scheduled
                    # number of rounds; "N" means no number is given.
                    num_rounds = time_format[0].strip()
                    num_rounds = str(int(num_rounds)) if num_rounds != "N" else ""
                    title_fight = self.get_title_fight(fight_type)
                    weight_class = self.get_weight_class(fight_type)
                    gender = self.get_gender(fight_type)
                    result, result_details = self.get_result(
                        select_result, select_result_details
                    )
                    finish_round = int(overview[0].text.split(":")[1].strip())
                    finish_time = re.findall(r"\d:\d\d", overview[1].text)[0]
                    winner = self.get_winner(fighter_1, fighter_2, win_lose)
                    fight_id = self.id_from_url(url)
                    scores_1, scores_2 = self.get_scores(
                        overview, select_result, select_result_details
                    )

                    # Correctly assign winner, in UFCStats winner is the scores_2
                    # always...
                    # I also need to flip in case of tie (right score for the higher
                    # ranked)
                    if winner != fighter_2:
                        scores_1, scores_2 = scores_2, scores_1

                    # Collect every round row before touching the files so a
                    # parsing error cannot leave a partially written fight.
                    round_rows = []
                    for j, fighter_id in enumerate((fighter_1, fighter_2)):
                        for round_ in range(1, finish_round + 1):
                            stats = rounds_handler.get_stats(
                                fight_details,
                                fighter=j,
                                round_=round_,
                                finish_round=finish_round,
                            )
                            round_rows.append((fight_id, fighter_id, round_) + stats)

                    # I am saving first the rounds and then the fights
                    # in case of error the fight doesn't count as scraped
                    writer_rounds.writerows(round_rows)

                    writer_fights.writerow(
                        [
                            fight_id,
                            event_id,
                            referee.strip(),
                            fighter_1,
                            fighter_2,
                            winner.strip(),
                            num_rounds,
                            title_fight,
                            weight_class,
                            gender,
                            result.strip(),
                            result_details.strip(),
                            finish_round,
                            finish_time.strip(),
                            time_format.strip(),
                            scores_1,
                            scores_2,
                        ]
                    )

                    logger.info(f"Scraped {i+1}/{len(urls_to_scrape)} fights...")
                except Exception as e:
                    logger.error(f"Error saving data from url: {url}\nError: {e}")

        self.remove_duplicates_from_file()
        self.rounds_handler.remove_duplicates_from_file()



[docs]
    @staticmethod
    def get_referee(overview: bs4.element.ResultSet) -> str:
        """Extracts the referee's name from the fight overview.

        Args:
            overview: A ResultSet containing fight overview information.

        Returns:
            The referee's name, or '' if not found.
        """
        try:
            return overview[3].text.split(":")[1]
        except:
            return ""


    # Scrape name of winner

[docs]
    @staticmethod
    def get_winner(
        fighter_1: str, fighter_2: str, win_lose: bs4.element.ResultSet
    ) -> str:
        """Determines the winner of the fight based on the win/lose status.

        Args:
            fighter_1: The ID of the first fighter.
            fighter_2: The ID of the second fighter.
            win_lose: A ResultSet containing win/lose status for the fighters.

        Returns:
            The ID of the winner, or 'Draw' if it's a draw, or 'NC if no contest
            or '' if not determined.
        """
        fighter_1_result = win_lose[0].text.strip()
        fighter_2_result = win_lose[1].text.strip()

        if fighter_1_result == "D" and fighter_2_result == "D":
            return "Draw"
        elif fighter_1_result == "NC" and fighter_2_result == "NC":
            return "NC"
        elif fighter_1_result == "W":
            return fighter_1
        elif fighter_2_result == "W":
            return fighter_2
        else:
            return ""


    # Checks gender of fight

[docs]
    @staticmethod
    def get_gender(fight_type: bs4.element.ResultSet) -> str:
        """Determines the gender of the fight.

        Args:
            fight_type: A ResultSet containing fight type information.

        Returns:
            'F' if it's a women's fight, 'M' otherwise.
        """
        if "Women" in fight_type[0].text:
            return "F"
        else:
            return "M"


    # Scrapes the way the fight ended (e.g. KO, decision, etc.)

[docs]
    @staticmethod
    def get_result(
        select_result: bs4.element.ResultSet,
        select_result_details: bs4.element.ResultSet,
    ) -> Tuple[str, str]:
        """
        Extracts the result and details of the fight.

        Args:
            select_result: A ResultSet containing the fight result.
            select_result_details: A ResultSet containing additional result details.

        Returns:
            A tuple with the result type and result details.
        """
        if "Decision" in select_result[0].text.split(":")[1]:
            return (
                select_result[0].text.split(":")[1].split()[0],
                select_result[0].text.split(":")[1].split()[-1],
            )
        else:
            result = select_result[0].text.split(":")[1]
            result_details = select_result_details[1].text.split(":")[-1]

            if result_details.count("-") >= 3:
                # This is the case of an overturned decision where the
                # - appearing at least three times is the score '29 - 28'
                # for the three judges (+ maybe an extra term in the
                # description)
                return (
                    result,
                    " ".join(result_details.split(".")[-4].split("-")[-2].split()[:-3]),
                )

            return result, result_details



[docs]
    @staticmethod
    def get_scores(
        overview: bs4.element.ResultSet,
        select_result: bs4.element.ResultSet,
        select_result_details: bs4.element.ResultSet,
    ) -> Tuple[str, str]:
        """
        Extracts the scores of the fight if they the fight went the distance.

        Args:
            overview: A ResultSet containing the fight overview.
            select_result: A ResultSet containing the fight result.

        Returns:
            A tuple with the scores of the fight. As str to be
            written to the CSV file.

        """
        result_details = select_result_details[1].text.split(":")[-1]

        if ("Decision" in select_result[0].text.split(":")[1]) or (
            result_details.count("-") >= 3
        ):
            # Initialize a list to hold the extracted scores
            scores = []

            # Define the regex pattern for capturing the scores (e.g., 27 - 30, 28 - 29, etc.)
            score_pattern = re.compile(r"(\d{1,2})\s*-\s*(\d{1,2})\.")

            # Iterate over the selected elements and check for score patterns
            for detail in overview:
                text = detail.get_text(strip=True)
                matches = score_pattern.findall(text)  # Find all matches in the text
                for match in matches:
                    scores.append(match)  # Append each found score to the list

            if len(scores) > 0:
                scores1 = 0
                scores2 = 0
                for s1, s2 in scores:
                    scores1 += int(s1)
                    scores2 += int(s2)

                return str(scores1), str(scores2)

        return "", ""




[docs]
class UpcomingFightScraper(BaseFightScraper):
    """Scrapes fight data for upcoming events from the UFCStats website.

    This class inherits from `BaseFightScraper` and is specifically designed
    to scrape fight data for upcoming events. It uses the
    `UpcomingEventScraper` to get event URLs and then scrapes fight details
    from those events.
    """
    # Column name -> dtype of the upcoming-fight table. The key order is the
    # column order of the empty `data` frame built below.
    dtypes: Dict[str, type] = {
        "fight_id": str,
        "event_id": str,
        "fighter_1": str,
        "fighter_2": str,
        "title_fight": str,
        "weight_class": str,
    }
    sort_fields = ["event_id", "fight_id"]
    # Empty frame with one typed column per `dtypes` entry.
    data = pd.DataFrame({col: pd.Series(dtype=dt) for col, dt in dtypes.items()})
    filename = "upcoming_fight_data.csv"
    # Overrides the base-class event scraper so only upcoming events are used.
    event_scraper = UpcomingEventScraper


[docs]
    def scrape_fights(self) -> None:
        """Scrapes fight data and saves it to CSV files.

        This method synchronises the stored table with the fights currently
        listed for the events: stored fights that are no longer listed are
        removed, and newly listed fights are scraped and appended to the data
        file. Duplicated rows are removed from the file at the end.

        A fight whose page cannot be parsed is logged and skipped.
        """
        existing_urls = set(map(self.url_from_id, self.data["fight_id"]))
        ufcstats_fight_urls = set(self.get_fight_urls(get_all_events=True))

        urls_to_scrape = ufcstats_fight_urls - existing_urls
        urls_to_remove = existing_urls - ufcstats_fight_urls

        if urls_to_remove:
            logger.info(f"Removing {len(urls_to_remove)} outdated fight URLs...")
            # `remove_rows_from_table` filters on the "fight_id" column, so
            # the URLs must be converted back to IDs; passing the URLs
            # themselves never matched any row.
            self.remove_rows_from_table(
                [self.id_from_url(url) for url in urls_to_remove]
            )

        logger.info(f"Scraping {len(urls_to_scrape)} fights...")

        # newline="" is what the csv module requires for files handed to
        # csv.writer.
        with open(self.data_file, "a", newline="") as f_fights:
            writer = csv.writer(f_fights)
            for i, (url, soup) in enumerate(
                links_to_soups(list(urls_to_scrape), self.n_sessions, self.delay)
            ):
                try:
                    fight_details = soup.select("p.b-fight-details__table-text")
                    fight_type = soup.select("i.b-fight-details__fight-title")

                    if soup.h2 is not None:
                        event_id = self.event_scraper.id_from_url(
                            str(soup.h2.select("a.b-link")[0]["href"])
                        )
                    else:
                        raise TypeError("Couldn't find header in the soup.")

                    fighter_1, fighter_2 = self.get_fighters(fight_details, soup)
                    title_fight = self.get_title_fight(fight_type)
                    fight_id = self.id_from_url(url)
                    weight_class = self.get_weight_class(fight_type)

                    writer.writerow(
                        [fight_id, event_id, fighter_1, fighter_2, title_fight, weight_class]
                    )

                    logger.info(f"Scraped {i+1}/{len(urls_to_scrape)} fights...")
                except Exception as e:
                    logger.error(f"Error saving data from url: {url}\nError: {e}")

        self.remove_duplicates_from_file()



[docs]
    def remove_rows_from_table(self, fight_ids: List[str]) -> None:
        """Drop the given fights from the table and rewrite the data file.

        Args:
            fight_ids: Values of the "fight_id" column whose rows are removed.
        """
        is_removed = self.data["fight_id"].isin(fight_ids)
        self.data = self.data[~is_removed]

        # Persist the filtered table, then let the shared clean-up run on it.
        self.data.to_csv(self.data_file, index=False)
        self.remove_duplicates_from_file()




[docs]
class RoundsHandler(BaseFileHandler):
    """Handles the manipulation and storage of round statistics.

    This class inherits from `BaseFileHandler` and manages round-specific
    statistics, including strikes, takedowns, and control time. It formats
    and saves the data to a CSV file.
    """

    # Column name -> dtype of the round table. After the three key columns,
    # the remaining 22 columns follow the order of the tuple returned by
    # `get_stats`: each "<x>_att" column precedes its "<x>_succ" column.
    dtypes: Dict[str, type | pd.core.arrays.integer.Int64Dtype] = {
        "fight_id": str,
        "fighter_id": str,
        "round": pd.Int64Dtype(),
        "knockdowns": pd.Int64Dtype(),
        "strikes_att": pd.Int64Dtype(),  # If not stated otherwise they are significant
        "strikes_succ": pd.Int64Dtype(),
        "head_strikes_att": pd.Int64Dtype(),
        "head_strikes_succ": pd.Int64Dtype(),
        "body_strikes_att": pd.Int64Dtype(),
        "body_strikes_succ": pd.Int64Dtype(),
        "leg_strikes_att": pd.Int64Dtype(),
        "leg_strikes_succ": pd.Int64Dtype(),
        "distance_strikes_att": pd.Int64Dtype(),
        "distance_strikes_succ": pd.Int64Dtype(),
        "ground_strikes_att": pd.Int64Dtype(),
        "ground_strikes_succ": pd.Int64Dtype(),
        "clinch_strikes_att": pd.Int64Dtype(),
        "clinch_strikes_succ": pd.Int64Dtype(),
        "total_strikes_att": pd.Int64Dtype(),  # significant and not significant
        "total_strikes_succ": pd.Int64Dtype(),
        "takedown_att": pd.Int64Dtype(),
        "takedown_succ": pd.Int64Dtype(),
        "submission_att": pd.Int64Dtype(),
        "reversals": pd.Int64Dtype(),
        "ctrl_time": str,
    }
    sort_fields = ["fight_id", "fighter_id", "round"]
    # Empty frame with one typed column per `dtypes` entry.
    data = pd.DataFrame({col: pd.Series(dtype=dt) for col, dt in dtypes.items()})
    filename = "round_data.csv"


[docs]
    @staticmethod
    def get_stats(
        fight_stats: bs4.element.ResultSet, fighter: int, round_: int, finish_round: int
    ) -> Tuple[str, ...]:
        """
        Extracts round statistics for a specific fighter in a given fight.

        Args:
            fight_stats: A ResultSet containing fight statistics.
            fighter: The index of the fighter (0 or 1).
            round_: The round number.
            finish_round: The total number of rounds.

        Returns:
            A tuple of statistics for the specified fighter in the given round.
            Returns "" for all fields if an error occurs.

        Raises:
            ValueError: If `fighter` is not 0 or 1.
        """
        if fighter not in (0, 1):
            raise ValueError(f"fighter must be 0 or 1, not {fighter}")

        shift_general = 20 * round_
        shift_striking = 20 * (finish_round + 1) + 18 * (round_)

        if fighter == 1:
            shift_general += 1
            shift_striking += 1
        try:
            data = (
                fight_stats[2 + shift_general].text,  # knockdowns
                fight_stats[2 + shift_striking].text.split(" of ")[
                    1
                ],  # Significant strikes
                fight_stats[2 + shift_striking].text.split(" of ")[0],
                fight_stats[6 + shift_striking].text.split(" of ")[1],  # Head
                fight_stats[6 + shift_striking].text.split(" of ")[0],
                fight_stats[8 + shift_striking].text.split(" of ")[1],  # Body
                fight_stats[8 + shift_striking].text.split(" of ")[0],
                fight_stats[10 + shift_striking].text.split(" of ")[1],  # Leg
                fight_stats[10 + shift_striking].text.split(" of ")[0],
                fight_stats[12 + shift_striking].text.split(" of ")[1],  # Distance
                fight_stats[12 + shift_striking].text.split(" of ")[0],
                fight_stats[16 + shift_striking].text.split(" of ")[1],  # Ground
                fight_stats[16 + shift_striking].text.split(" of ")[0],
                fight_stats[14 + shift_striking].text.split(" of ")[1],  # Clinch
                fight_stats[14 + shift_striking].text.split(" of ")[0],
                fight_stats[8 + shift_general].text.split(" of ")[1],  # Total strikes
                fight_stats[8 + shift_general].text.split(" of ")[0],
                fight_stats[10 + shift_general].text.split(" of ")[1],  # Takedown
                fight_stats[10 + shift_general].text.split(" of ")[0],
                fight_stats[14 + shift_general].text,  # Submission attempts
                fight_stats[16 + shift_general].text,  # Reversals
                fight_stats[18 + shift_general].text,  # Control time
            )

            return tuple(datum.strip() for datum in data)
        except:
            return ("",) * 22