Source code for ufcscraper.fight_scraper

"""
This module defines classes for scraping fight and round data from the UFCStats
website.

Classes:
    FightScraper: Inherits from `BaseScraper` and is responsible for scraping
    detailed fight statistics, such as fighter information, results, referees,
    and more. The data is stored in a CSV file named `fight_data.csv`. It also
    interacts with the `RoundsHandler` to scrape and store round-specific
    statistics.

    RoundsHandler: Inherits from `BaseFileHandler` and manages the collection
    and storage of round-specific fight data. The data is saved in a CSV file
    named `round_data.csv`. It handles statistics like strikes, takedowns,
    control time, and more.
"""

from __future__ import annotations

from abc import ABC
import csv
import logging
import re
from typing import TYPE_CHECKING
import pandas as pd

from ufcscraper.base import BaseScraper, BaseFileHandler
from ufcscraper.event_scraper import EventScraper, UpcomingEventScraper
from ufcscraper.fighter_scraper import FighterScraper
from ufcscraper.utils import links_to_soups

if TYPE_CHECKING:  # pragma: no cover
    import bs4
    from typing import Any, Dict, List, Tuple

logger = logging.getLogger(__name__)

class BaseFightScraper(BaseScraper, ABC):
    """Base class for fight scrapers.

    Provides the functionality shared by the fight scrapers for UFCStats:
    building fight URLs, collecting the fight links of the events that still
    have to be processed, and parsing fighters, title-fight flag and weight
    class out of a fight page. It is meant to be inherited by the specific
    fight scraper classes.

    Attributes:
        event_scraper: Event scraper class used to list event IDs, build
            event URLs and collect fight links. Subclasses may override it.
    """

    event_scraper = EventScraper

    @classmethod
    def url_from_id(cls, id_: str) -> str:
        """Constructs the fight URL using the fight ID.

        Args:
            id_: The unique identifier for the fight.

        Returns:
            The full URL to the fight's details page, built from
            ``cls.web_url``.
        """
        return f"{cls.web_url}/fight-details/{id_}"

    def get_fight_urls(self, get_all_events: bool = False) -> List[str]:
        """Retrieves the fight URLs of the known events.

        The event IDs are read from the data of an ``event_scraper``
        instance; the fight links are then collected through its
        ``get_fight_urls_from_event_urls`` method.

        Args:
            get_all_events: If False, events whose ID already appears in
                ``self.data["event_id"]`` are skipped.

        Returns:
            A list of URLs for fights.
        """
        logger.info("Scraping fight links...")
        logger.info("Opening event information to extract event urls...")

        event_scraper_instance = self.event_scraper(
            self.data_folder, self.n_sessions, self.delay
        )
        event_ids = event_scraper_instance.data["event_id"].unique().tolist()

        # Remove events for which information is extracted
        if not get_all_events:
            # Build the lookup once; a set keeps each membership test O(1)
            # instead of recomputing the unique IDs for every event.
            scraped_event_ids = set(self.data["event_id"].unique().tolist())
            event_ids = [id_ for id_ in event_ids if id_ not in scraped_event_ids]

        event_urls: List[str] = list(map(self.event_scraper.url_from_id, event_ids))
        fight_urls = event_scraper_instance.get_fight_urls_from_event_urls(event_urls)

        logger.info(f"Got {len(fight_urls)} fight links...")
        return list(fight_urls)

    @staticmethod
    def get_fighters(
        fight_details: bs4.element.ResultSet, fight_soup: bs4.BeautifulSoup
    ) -> Tuple[str, str]:
        """Extracts fighter IDs from the fight details.

        The fighter links are read from the first two entries of
        ``fight_details``. If they cannot be found there, the person links of
        the whole page are used instead.

        Args:
            fight_details: A ResultSet containing fight detail information.
            fight_soup: The BeautifulSoup object containing the fight page.

        Returns:
            A tuple containing the IDs of the two fighters.
        """
        try:
            fighters = (
                fight_details[0].select("a.b-link.b-link_style_black")[0]["href"],
                fight_details[1].select("a.b-link.b-link_style_black")[0]["href"],
            )
        except (IndexError, KeyError):  # pragma: no cover
            # Missing cell, missing link or missing href: fall back to the
            # person links found anywhere on the page.
            person_links = fight_soup.select("a.b-fight-details__person-link")
            fighters = (person_links[0]["href"], person_links[1]["href"])

        fighter_1, fighter_2 = map(FighterScraper.id_from_url, fighters)
        return fighter_1, fighter_2

    @staticmethod
    def get_title_fight(fight_type: bs4.element.ResultSet) -> str:
        """Determines if the fight is a title fight.

        Args:
            fight_type: A ResultSet containing fight type information.

        Returns:
            'T' if the fight type text contains "Title", 'F' otherwise.
        """
        return "T" if "Title" in fight_type[0].text else "F"

    @staticmethod
    def get_weight_class(fight_type: bs4.element.ResultSet) -> str:
        """Extracts the weight class of the fight.

        Args:
            fight_type: A ResultSet containing fight type information.

        Returns:
            The weight class of the fight, or '' if not found. Women's
            classes are prefixed with "Women's ". A women's bout whose title
            contains no "...weight" word (for instance a catch weight bout)
            is reported with the generic "Catch Weight" / "Open Weight"
            label, or '' if neither is present, instead of raising.
        """
        text = fight_type[0].text.strip()
        # Matches lowercase "...weight" words such as "Lightweight"; it does
        # not match "Catch Weight" / "Open Weight" (capital W).
        weight_words = re.findall(r"\w*weight", text)

        if "Light Heavyweight" in text:
            return "Light Heavyweight"
        if "Women" in text and weight_words:
            return "Women's " + weight_words[0]
        if "Catch Weight" in text:
            return "Catch Weight"
        if "Open Weight" in text:
            return "Open Weight"
        return weight_words[0] if weight_words else ""
class FightScraper(BaseFightScraper):
    """Scrapes fight data from the UFCStats website.

    This class inherits from `BaseFightScraper` and handles scraping detailed
    fight statistics including fighters, referees, results, and more. It
    saves the scraped data into two CSV files: one for fights and one for
    rounds (through the companion class `RoundsHandler`).
    """

    dtypes: Dict[str, type | pd.core.arrays.integer.Int64Dtype] = {
        "fight_id": str,
        "event_id": str,
        "referee": str,
        "fighter_1": str,
        "fighter_2": str,
        "winner": str,
        "num_rounds": pd.Int64Dtype(),
        "title_fight": str,
        "weight_class": str,
        "gender": str,
        "result": str,
        "result_details": str,
        "finish_round": pd.Int64Dtype(),
        "finish_time": str,
        "time_format": str,
        "scores_1": pd.Int64Dtype(),
        "scores_2": pd.Int64Dtype(),
    }
    sort_fields = ["event_id", "fight_id"]
    data = pd.DataFrame({col: pd.Series(dtype=dt) for col, dt in dtypes.items()})
    filename = "fight_data.csv"

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Initializes the FightScraper and the companion RoundsHandler.

        Args:
            *args: Additional positional arguments passed to the base class.
            **kwargs: Additional keyword arguments passed to the base class.
        """
        super().__init__(*args, **kwargs)
        self.rounds_handler = RoundsHandler(self.data_folder)

    def scrape_fights(self, get_all_events: bool = False) -> None:
        """Scrapes fight data and saves it to CSV files.

        Fight details are appended to this scraper's data file and the
        per-round statistics to the data file of ``self.rounds_handler``.
        A fight whose page cannot be parsed is logged and skipped. Once all
        pages are processed, duplicates are removed from both files.

        Args:
            get_all_events: If False, only scrapes fights from events not
                already scraped.
        """
        existing_urls = set(map(self.url_from_id, self.data["fight_id"]))
        ufcstats_fight_urls = self.get_fight_urls(get_all_events)
        urls_to_scrape = set(ufcstats_fight_urls) - existing_urls

        logger.info("Opening round information to scrape stats")
        # Reuse the handler created in __init__ so the file that is appended
        # to is the same one that is de-duplicated at the end.
        rounds_handler = self.rounds_handler

        logger.info(f"Scraping {len(urls_to_scrape)} fights...")

        # newline="" is what the csv module requires of files handed to
        # csv.writer; without it rows end in "\r\r\n" on Windows.
        with (
            open(self.data_file, "a", newline="") as f_fights,
            open(rounds_handler.data_file, "a+", newline="") as f_rounds,
        ):
            writer_fights = csv.writer(f_fights)
            writer_rounds = csv.writer(f_rounds)

            for i, (url, soup) in enumerate(
                links_to_soups(list(urls_to_scrape), self.n_sessions, self.delay)
            ):
                try:
                    overview = soup.select("i.b-fight-details__text-item")
                    select_result = soup.select("i.b-fight-details__text-item_first")
                    select_result_details = soup.select("p.b-fight-details__text")
                    fight_details = soup.select("p.b-fight-details__table-text")
                    fight_type = soup.select("i.b-fight-details__fight-title")
                    win_lose = soup.select("i.b-fight-details__person-status")

                    if soup.h2 is not None:
                        event_id = self.event_scraper.id_from_url(
                            str(soup.h2.select("a.b-link")[0]["href"])
                        )
                    else:
                        raise TypeError("Couldn't find header in the soup.")

                    referee = self.get_referee(overview)
                    fighter_1, fighter_2 = self.get_fighters(fight_details, soup)
                    time_format = overview[2].text.split(":")[1].strip()
                    num_rounds = self._parse_num_rounds(time_format)
                    title_fight = self.get_title_fight(fight_type)
                    weight_class = self.get_weight_class(fight_type)
                    gender = self.get_gender(fight_type)
                    result, result_details = self.get_result(
                        select_result, select_result_details
                    )
                    finish_round = int(overview[0].text.split(":")[1].strip())
                    finish_time = re.findall(r"\d:\d\d", overview[1].text)[0]
                    winner = self.get_winner(fighter_1, fighter_2, win_lose)
                    fight_id = self.id_from_url(url)
                    scores_1, scores_2 = self.get_scores(
                        overview, select_result, select_result_details
                    )

                    # Original author's note: on UFCStats the winner's score
                    # always comes out as scores_2, so the pair is flipped
                    # unless fighter_2 is the winner. This also flips ties
                    # and no contests (right score for the higher ranked).
                    if winner != fighter_2:
                        scores_1, scores_2 = scores_2, scores_1

                    # The rounds are saved before the fight on purpose: if
                    # anything fails the fight row is never written, so the
                    # fight does not count as scraped and is retried later.
                    # The round stats live in the same cells as the fight
                    # details, so that selection is reused.
                    for j, fighter_id in enumerate((fighter_1, fighter_2)):
                        for round_ in range(1, finish_round + 1):
                            stats = rounds_handler.get_stats(
                                fight_details,
                                fighter=j,
                                round_=round_,
                                finish_round=finish_round,
                            )
                            writer_rounds.writerow(
                                (fight_id, fighter_id, round_) + stats
                            )

                    writer_fights.writerow(
                        [
                            fight_id,
                            event_id,
                            referee.strip(),
                            fighter_1,
                            fighter_2,
                            winner.strip(),
                            num_rounds,
                            title_fight,
                            weight_class,
                            gender,
                            result.strip(),
                            result_details.strip(),
                            finish_round,
                            finish_time.strip(),
                            time_format.strip(),
                            scores_1,
                            scores_2,
                        ]
                    )

                    logger.info(f"Scraped {i+1}/{len(urls_to_scrape)} fights...")
                except Exception as e:
                    logger.error(f"Error saving data from url: {url}\nError: {e}")

        self.remove_duplicates_from_file()
        self.rounds_handler.remove_duplicates_from_file()

    @staticmethod
    def _parse_num_rounds(time_format: str) -> str:
        """Returns the scheduled number of rounds encoded in a time format.

        Args:
            time_format: The stripped time-format text of the fight page.

        Returns:
            The leading digit of ``time_format`` as a string, or '' when the
            text is empty or does not start with a digit (e.g. it starts
            with "N", as handled by the original code).
        """
        leading = time_format[:1]
        return leading if leading.isascii() and leading.isdigit() else ""

    @staticmethod
    def get_referee(overview: bs4.element.ResultSet) -> str:
        """Extracts the referee's name from the fight overview.

        Args:
            overview: A ResultSet containing fight overview information.

        Returns:
            The text after the first ':' of the fourth overview item (not
            stripped), or '' if that item or the ':' is missing.
        """
        try:
            return overview[3].text.split(":")[1]
        except IndexError:
            return ""

    @staticmethod
    def get_winner(
        fighter_1: str, fighter_2: str, win_lose: bs4.element.ResultSet
    ) -> str:
        """Determines the winner of the fight based on the win/lose status.

        Args:
            fighter_1: The ID of the first fighter.
            fighter_2: The ID of the second fighter.
            win_lose: A ResultSet containing win/lose status for the fighters.

        Returns:
            The ID of the winner, 'Draw' if both statuses are "D", 'NC' if
            both are "NC" (no contest), or '' if not determined.
        """
        fighter_1_result = win_lose[0].text.strip()
        fighter_2_result = win_lose[1].text.strip()

        if fighter_1_result == "D" and fighter_2_result == "D":
            return "Draw"
        if fighter_1_result == "NC" and fighter_2_result == "NC":
            return "NC"
        if fighter_1_result == "W":
            return fighter_1
        if fighter_2_result == "W":
            return fighter_2
        return ""

    @staticmethod
    def get_gender(fight_type: bs4.element.ResultSet) -> str:
        """Determines the gender of the fight.

        Args:
            fight_type: A ResultSet containing fight type information.

        Returns:
            'F' if the fight type text contains "Women", 'M' otherwise.
        """
        return "F" if "Women" in fight_type[0].text else "M"

    @staticmethod
    def get_result(
        select_result: bs4.element.ResultSet,
        select_result_details: bs4.element.ResultSet,
    ) -> Tuple[str, str]:
        """Extracts the result and details of the fight.

        Args:
            select_result: A ResultSet containing the fight result (e.g. KO,
                decision, etc.).
            select_result_details: A ResultSet containing additional result
                details.

        Returns:
            A tuple with the result type and result details. For decisions
            these are the first and last word of the result text.
        """
        result = select_result[0].text.split(":")[1]

        if "Decision" in result:
            words = result.split()
            return words[0], words[-1]

        result_details = select_result_details[1].text.split(":")[-1]
        if result_details.count("-") >= 3:
            # This is the case of an overturned decision where the
            # - appearing at least three times is the score '29 - 28'
            # for the three judges (+ maybe an extra term in the
            # description)
            return (
                result,
                " ".join(result_details.split(".")[-4].split("-")[-2].split()[:-3]),
            )

        return result, result_details

    @staticmethod
    def get_scores(
        overview: bs4.element.ResultSet,
        select_result: bs4.element.ResultSet,
        select_result_details: bs4.element.ResultSet,
    ) -> Tuple[str, str]:
        """Extracts the scores of the fight if the fight went the distance.

        Scores are only looked for when the result is a decision, or when
        the result details contain at least three '-' (an overturned
        decision, see `get_result`).

        Args:
            overview: A ResultSet containing the fight overview.
            select_result: A ResultSet containing the fight result.
            select_result_details: A ResultSet containing additional result
                details.

        Returns:
            A tuple with the scores of the fight, each one being the sum of
            that side's score over all the score cards found. As str to be
            written to the CSV file; ('', '') if no scores apply.
        """
        result_details = select_result_details[1].text.split(":")[-1]

        if ("Decision" in select_result[0].text.split(":")[1]) or (
            result_details.count("-") >= 3
        ):
            # Pattern capturing the scores (e.g., 27 - 30., 28 - 29., etc.)
            score_pattern = re.compile(r"(\d{1,2})\s*-\s*(\d{1,2})\.")

            scores = []
            for detail in overview:
                scores.extend(score_pattern.findall(detail.get_text(strip=True)))

            if scores:
                scores1 = sum(int(s1) for s1, _ in scores)
                scores2 = sum(int(s2) for _, s2 in scores)
                return str(scores1), str(scores2)

        return "", ""
class UpcomingFightScraper(BaseFightScraper):
    """Scrapes fight data for upcoming events from the UFCStats website.

    This class inherits from `BaseFightScraper` and is specifically designed
    to scrape fight data for upcoming events. It uses the
    `UpcomingEventScraper` to get event URLs and then scrapes fight details
    from those events.
    """

    dtypes: Dict[str, type] = {
        "fight_id": str,
        "event_id": str,
        "fighter_1": str,
        "fighter_2": str,
        "title_fight": str,
        "weight_class": str,
    }
    sort_fields = ["event_id", "fight_id"]
    data = pd.DataFrame({col: pd.Series(dtype=dt) for col, dt in dtypes.items()})
    filename = "upcoming_fight_data.csv"
    event_scraper = UpcomingEventScraper

    def scrape_fights(self) -> None:
        """Scrapes fight data and saves it to a CSV file.

        Stored fights that are no longer listed for the upcoming events are
        removed first; the fights not stored yet are then scraped and
        appended to the data file. A fight whose page cannot be parsed is
        logged and skipped. Duplicates are removed from the file at the end.
        """
        existing_urls = set(map(self.url_from_id, self.data["fight_id"]))
        ufcstats_fight_urls = set(self.get_fight_urls(get_all_events=True))

        urls_to_scrape = ufcstats_fight_urls - existing_urls
        urls_to_remove = existing_urls - ufcstats_fight_urls

        if urls_to_remove:
            logger.info(f"Removing {len(urls_to_remove)} outdated fight URLs...")
            # remove_rows_from_table filters on the fight_id column, so the
            # URLs must be converted back to IDs; passing the URLs themselves
            # never matches any row.
            self.remove_rows_from_table(
                [self.id_from_url(url) for url in urls_to_remove]
            )

        logger.info(f"Scraping {len(urls_to_scrape)} fights...")

        # newline="" is what the csv module requires of files handed to
        # csv.writer; without it rows end in "\r\r\n" on Windows.
        with open(self.data_file, "a", newline="") as f_fights:
            writer = csv.writer(f_fights)

            for i, (url, soup) in enumerate(
                links_to_soups(list(urls_to_scrape), self.n_sessions, self.delay)
            ):
                try:
                    fight_details = soup.select("p.b-fight-details__table-text")
                    fight_type = soup.select("i.b-fight-details__fight-title")

                    if soup.h2 is not None:
                        event_id = self.event_scraper.id_from_url(
                            str(soup.h2.select("a.b-link")[0]["href"])
                        )
                    else:
                        raise TypeError("Couldn't find header in the soup.")

                    fighter_1, fighter_2 = self.get_fighters(fight_details, soup)
                    title_fight = self.get_title_fight(fight_type)
                    fight_id = self.id_from_url(url)
                    weight_class = self.get_weight_class(fight_type)

                    writer.writerow(
                        [
                            fight_id,
                            event_id,
                            fighter_1,
                            fighter_2,
                            title_fight,
                            weight_class,
                        ]
                    )

                    logger.info(f"Scraped {i+1}/{len(urls_to_scrape)} fights...")
                except Exception as e:
                    logger.error(f"Error saving data from url: {url}\nError: {e}")

        self.remove_duplicates_from_file()

    def remove_rows_from_table(self, fight_ids: List[str]) -> None:
        """Removes rows from the fight data table based on fight IDs.

        The filtered table replaces ``self.data`` and is written back to the
        data file, after which duplicates are removed from the file.

        Args:
            fight_ids: A list of fight IDs (values of the ``fight_id``
                column, not URLs) to be removed from the data.
        """
        self.data = self.data[~self.data["fight_id"].isin(fight_ids)]
        self.data.to_csv(self.data_file, index=False)
        self.remove_duplicates_from_file()
class RoundsHandler(BaseFileHandler):
    """Handles the manipulation and storage of round statistics.

    This class inherits from `BaseFileHandler` and manages round-specific
    statistics, including strikes, takedowns, and control time. It formats
    and saves the data to a CSV file.
    """

    dtypes: Dict[str, type | pd.core.arrays.integer.Int64Dtype] = {
        "fight_id": str,
        "fighter_id": str,
        "round": pd.Int64Dtype(),
        "knockdowns": pd.Int64Dtype(),
        "strikes_att": pd.Int64Dtype(),  # If not stated otherwise they are significant
        "strikes_succ": pd.Int64Dtype(),
        "head_strikes_att": pd.Int64Dtype(),
        "head_strikes_succ": pd.Int64Dtype(),
        "body_strikes_att": pd.Int64Dtype(),
        "body_strikes_succ": pd.Int64Dtype(),
        "leg_strikes_att": pd.Int64Dtype(),
        "leg_strikes_succ": pd.Int64Dtype(),
        "distance_strikes_att": pd.Int64Dtype(),
        "distance_strikes_succ": pd.Int64Dtype(),
        "ground_strikes_att": pd.Int64Dtype(),
        "ground_strikes_succ": pd.Int64Dtype(),
        "clinch_strikes_att": pd.Int64Dtype(),
        "clinch_strikes_succ": pd.Int64Dtype(),
        "total_strikes_att": pd.Int64Dtype(),  # significant and not significant
        "total_strikes_succ": pd.Int64Dtype(),
        "takedown_att": pd.Int64Dtype(),
        "takedown_succ": pd.Int64Dtype(),
        "submission_att": pd.Int64Dtype(),
        "reversals": pd.Int64Dtype(),
        "ctrl_time": str,
    }
    sort_fields = ["fight_id", "fighter_id", "round"]
    data = pd.DataFrame({col: pd.Series(dtype=dt) for col, dt in dtypes.items()})
    filename = "round_data.csv"

    @staticmethod
    def get_stats(
        fight_stats: bs4.element.ResultSet, fighter: int, round_: int, finish_round: int
    ) -> Tuple[str, ...]:
        """Extracts round statistics for a specific fighter in a given fight.

        The statistics are read by position from ``fight_stats``. Two
        offsets are used: a "general" one advancing 20 cells per round, and
        a "striking" one that starts after ``20 * (finish_round + 1)`` cells
        and advances 18 cells per round. The second fighter's cells sit one
        position after the first fighter's.
        NOTE(review): presumably these are the totals and significant-strikes
        tables of the fight page, each led by a whole-fight row; confirm
        against the page layout.

        Args:
            fight_stats: A ResultSet containing fight statistics.
            fighter: The index of the fighter (0 or 1).
            round_: The round number.
            finish_round: The round in which the fight finished; it fixes
                where the striking cells start.

        Returns:
            A tuple of 22 stripped strings, in the order of the stat columns
            of ``dtypes`` (everything after ``round``); "X of Y" cells
            contribute the attempted value (Y) followed by the successful
            one (X). Returns "" for all fields if a cell is missing or does
            not have the expected format.

        Raises:
            ValueError: If `fighter` is not 0 or 1.
        """
        if fighter not in (0, 1):
            raise ValueError(f"fighter must be 0 or 1, not {fighter}")

        shift_general = 20 * round_
        shift_striking = 20 * (finish_round + 1) + 18 * (round_)

        if fighter == 1:
            shift_general += 1
            shift_striking += 1

        def plain(index: int) -> str:
            # Text of a single-valued cell.
            return fight_stats[index].text

        def attempted_landed(index: int) -> Tuple[str, str]:
            # Splits an "X of Y" cell and returns (Y, X): attempted first.
            landed_of_attempted = fight_stats[index].text.split(" of ")
            return landed_of_attempted[1], landed_of_attempted[0]

        try:
            data = (
                plain(2 + shift_general),  # Knockdowns
                *attempted_landed(2 + shift_striking),  # Significant strikes
                *attempted_landed(6 + shift_striking),  # Head
                *attempted_landed(8 + shift_striking),  # Body
                *attempted_landed(10 + shift_striking),  # Leg
                *attempted_landed(12 + shift_striking),  # Distance
                *attempted_landed(16 + shift_striking),  # Ground
                *attempted_landed(14 + shift_striking),  # Clinch
                *attempted_landed(8 + shift_general),  # Total strikes
                *attempted_landed(10 + shift_general),  # Takedown
                plain(14 + shift_general),  # Submission attempts
                plain(16 + shift_general),  # Reversals
                plain(18 + shift_general),  # Control time
            )
            return tuple(datum.strip() for datum in data)
        except (IndexError, AttributeError):
            # One empty string per stat column (22 of them).
            return ("",) * 22