Source code for ufcscraper.fighter_scraper

"""
This module defines a `FighterScraper` class for scraping and processing fighter
data from UFCStats.

The `FighterScraper` class inherits from the `BaseScraper` class and is designed
to retrieve detailed information about UFC fighters, including personal details,
physical attributes, and fight records. The scraped data is processed and saved
into a CSV file for later analysis. The module also provides methods for parsing
and converting specific attributes like height, weight, reach, and more from the
scraped HTML content.
"""

from __future__ import annotations

import csv
import datetime
import logging
from typing import TYPE_CHECKING

import pandas as pd

from ufcscraper.base import BaseScraper
from ufcscraper.utils import links_to_soups

if TYPE_CHECKING:  # pragma: no cover
    import bs4
    from typing import Dict, List

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


class FighterScraper(BaseScraper):
    """Scrapes and stores fighter data from UFCStats.

    This class handles scraping fighter details from UFCStats, including
    personal information, physical attributes, and fight records. The data
    is saved to a CSV file for further analysis.

    Attributes such as ``web_url``, ``data_file``, ``n_sessions``, ``delay``
    and the methods ``id_from_url`` / ``remove_duplicates_from_file`` are not
    defined here; they are expected to come from ``BaseScraper``.
    """

    # Column name -> dtype of the fighter table. "datetime64[ns]" is given as
    # a dtype string, hence ``str`` in the annotation.
    dtypes: Dict[str, type | str | pd.core.arrays.integer.Int64Dtype] = {
        "fighter_id": str,
        "fighter_f_name": str,
        "fighter_l_name": str,
        "fighter_nickname": str,
        "fighter_height_cm": float,
        "fighter_weight_lbs": float,
        "fighter_reach_cm": float,
        "fighter_stance": str,
        "fighter_dob": "datetime64[ns]",
        "fighter_w": pd.Int64Dtype(),
        "fighter_l": pd.Int64Dtype(),
        "fighter_d": pd.Int64Dtype(),
        "fighter_nc_dq": pd.Int64Dtype(),
    }
    # NOTE(review): presumably consumed by BaseScraper when sorting the
    # table -- confirm against the base class.
    sort_fields = ["fighter_l_name", "fighter_f_name", "fighter_id"]
    # Empty frame with one typed column per entry in ``dtypes``.
    data = pd.DataFrame({col: pd.Series(dtype=dt) for col, dt in dtypes.items()})
    filename = "fighter_data.csv"

    @classmethod
    def url_from_id(cls, id_: str) -> str:
        """Constructs the URL for a fighter's details page based on their ID.

        Args:
            id_: The fighter's unique identifier.

        Returns:
            The URL for the fighter's details page.
        """
        return f"{cls.web_url}/fighter-details/{id_}"

    def scrape_fighters(self) -> None:
        """Scrapes fighter details from URLs and saves the data to a CSV file.

        Fighter URLs whose IDs are already present in ``self.data`` are
        skipped. Every remaining page is parsed into one CSV row and appended
        to ``self.data_file``. A failure on a single fighter is logged and
        does not stop the run.
        """
        existing_urls = set(map(self.url_from_id, self.data["fighter_id"]))
        urls_to_scrape = set(self.get_fighter_urls()) - existing_urls
        total = len(urls_to_scrape)

        logger.info(f"Scraping {total} fighters...")

        # newline="" is required by the csv module so the writer controls
        # line endings itself (otherwise blank rows appear on Windows).
        with open(self.data_file, "a+", newline="") as f:
            writer = csv.writer(f)
            for i, (url, soup) in enumerate(
                links_to_soups(list(urls_to_scrape), self.n_sessions, self.delay)
            ):
                # Deliberately broad: one malformed page must not abort the
                # whole scrape, so any error is logged and the loop goes on.
                try:
                    writer.writerow(self._build_row(url, soup))
                    logger.info(f"Scraped {i+1}/{total} fighters...")
                except Exception as e:
                    logger.error(f"Error saving data from url: {url}\nError: {e}")

        # Run only after the file has been closed (and therefore flushed).
        self.remove_duplicates_from_file()

    def _build_row(self, url: str, soup: bs4.BeautifulSoup) -> List[str]:
        """Builds one CSV row, in ``dtypes`` column order, from a fighter page.

        Args:
            url: URL of the fighter page; the fighter ID is derived from it.
            soup: Parsed HTML of the fighter page.

        Returns:
            The row values as strings ("" for unavailable attributes).

        Raises:
            Exception: Any lookup/parsing error caused by an unexpected page
                layout propagates to the caller.
        """
        name = soup.select("span")[0].text.split()
        nickname = soup.select("p.b-content__Nickname")[0]
        details = soup.select("li.b-list__box-list-item")
        record_text = soup.select("span.b-content__title-record")[0].text

        wins, losses, draws, nc_dq = self._parse_record(record_text)

        return [
            self.id_from_url(url),
            name[0].strip(),
            self.parse_l_name(name).strip(),
            self.parse_nickname(nickname).strip(),
            self.parse_height(details[0]),
            self.parse_weight(details[1]),
            self.parse_reach(details[2]),
            self.parse_stance(details[3]),
            self.parse_dob(details[4]),
            wins,
            losses,
            draws,
            nc_dq,
        ]

    @staticmethod
    def _parse_record(record_text: str) -> List[str]:
        """Splits a record such as ``"Record: 10-2-0 (1 NC)"`` into its parts.

        Args:
            record_text: Text of the record element, ``"<label>: W-L-D"``
                optionally followed by a parenthesised no-contest count.

        Returns:
            ``[wins, losses, draws, nc_dq]`` as strings; ``nc_dq`` is "" when
            no parenthesised part is present.
        """
        fields = record_text.split(":")[1].strip().split("-")
        # The last field is either "D" or "D (N NC)". Partitioning on "("
        # keeps multi-digit draw / no-contest counts intact.
        draws_part, _, nc_part = fields[-1].partition("(")
        nc_tokens = nc_part.split()
        nc_dq = nc_tokens[0] if nc_tokens else ""
        return [fields[0], fields[1], draws_part.strip(), nc_dq]

    def add_name_column(self) -> None:
        """Adds a combined name column to the DataFrame.

        The new ``fighter_name`` column is the first name, a space and the
        last name (missing last names are treated as ""), with surrounding
        whitespace stripped.
        """
        first_names = self.data["fighter_f_name"]
        last_names = self.data["fighter_l_name"].fillna("")
        self.data["fighter_name"] = (first_names + " " + last_names).str.strip()

    def get_fighter_urls(self) -> List[str]:
        """Retrieves the URLs for fighter profiles.

        One listing page per letter a-z is fetched and the fighter links are
        collected from each page that was retrieved.

        Returns:
            A list of URLs to fighter profiles.
        """
        logger.info("Scraping fighter links...")

        # Search fighters by letter
        urls = [
            f"{self.web_url}/statistics/fighters?char={letter}&page=all"
            for letter in "abcdefghijklmnopqrstuvwxyz"
        ]

        # NOTE(review): unlike scrape_fighters, self.delay is not passed
        # here -- confirm whether that is intentional.
        soups = [result[1] for result in links_to_soups(urls, self.n_sessions)]

        # Collect fighter URLs from each page; every third "a.b-link" anchor
        # starting from the second one is taken.
        fighter_urls = [
            str(link.get("href"))
            for soup in soups
            if soup is not None
            for link in soup.select("a.b-link")[1::3]
        ]

        logger.info(f"Got {len(fighter_urls)} urls...")
        return fighter_urls

    @staticmethod
    def parse_l_name(name: List[str]) -> str:
        """Parses the last name from a list of name parts.

        Everything after the first part is treated as the last name, so
        names of any length keep their surname.

        Args:
            name: List of name parts.

        Returns:
            The parsed last name, or "" if there is no part after the first.
        """
        return " ".join(name[1:])

    @staticmethod
    def parse_nickname(nickname: bs4.element.Tag) -> str:
        """Parses the fighter's nickname.

        Args:
            nickname: BeautifulSoup tag containing the nickname.

        Returns:
            The parsed nickname, or "" if not available.
        """
        # A whitespace-only tag (e.g. "\n") strips down to "".
        return nickname.text.strip()

    @staticmethod
    def parse_height(height: bs4.element.Tag) -> str:
        """Parses and converts fighter's height from feet and inches to cm.

        Args:
            height: BeautifulSoup tag containing the height in feet and
                inches, e.g. ``Height: 5' 11"``.

        Returns:
            The height in centimeters rounded to two decimals, or "" if not
            available.

        Raises:
            ValueError: If the feet or inches part is not an integer.
        """
        height_text = height.text.split(":")[1].strip()
        if "--" in height_text:
            return ""

        feet_text, _, inches_text = height_text.partition("'")
        total_inches = int(feet_text) * 12 + int(inches_text.strip().strip('"'))
        # Rounded like parse_reach so no float noise ends up in the CSV.
        return str(round(total_inches * 2.54, 2))

    @staticmethod
    def parse_reach(reach: bs4.element.Tag) -> str:
        """Parses and converts fighter's reach from inches to cm.

        Args:
            reach: BeautifulSoup tag containing the reach in inches.

        Returns:
            The reach in centimeters rounded to two decimals, or "" if not
            available.

        Raises:
            ValueError: If the reach is not numeric.
        """
        reach_text = reach.text.split(":")[1]
        if "--" in reach_text:
            return ""

        # float() also accepts fractional inches; whole numbers give the
        # same result as integer parsing.
        inches = float(reach_text.strip().strip('"'))
        return str(round(inches * 2.54, 2))

    @staticmethod
    def parse_weight(weight_element: bs4.element.Tag) -> str:
        """Parses the fighter's weight.

        Args:
            weight_element: BeautifulSoup tag containing the weight.

        Returns:
            The weight in pounds, or "" if not available.
        """
        weight_text = weight_element.text.split(":")[1]
        if "--" in weight_text:
            return ""

        # Keep only the number, dropping the trailing unit.
        return weight_text.split()[0].strip()

    @staticmethod
    def parse_stance(stance: bs4.element.Tag) -> str:
        """Parses the fighter's stance.

        Args:
            stance: BeautifulSoup tag containing the stance.

        Returns:
            The stance, or "" if not available.
        """
        return stance.text.split(":")[1].strip()

    @staticmethod
    def parse_dob(dob: bs4.element.Tag) -> str:
        """Parses and formats the fighter's date of birth.

        Args:
            dob: BeautifulSoup tag containing the date of birth, formatted
                like ``DOB: Jul 22, 1987``.

        Returns:
            The date of birth in YYYY-MM-DD format, or "" if not available.

        Raises:
            ValueError: If the date does not match ``"%b %d, %Y"``.
        """
        dob_text = dob.text.split(":")[1].strip()
        if dob_text == "--":
            return ""

        parsed = datetime.datetime.strptime(dob_text, "%b %d, %Y")
        return parsed.date().isoformat()