"""
This module defines a `FighterScraper` class for scraping and processing fighter
data from UFCStats.
The `FighterScraper` class inherits from the `BaseScraper` class and is designed
to retrieve detailed information about UFC fighters, including personal details,
physical attributes, and fight records. The scraped data is processed and saved
into a CSV file for later analysis. The module also provides methods for parsing
and converting specific attributes like height, weight, reach, and more from the
scraped HTML content.
"""
from __future__ import annotations
import csv
import datetime
import logging
from typing import TYPE_CHECKING
import pandas as pd
from ufcscraper.base import BaseScraper
from ufcscraper.utils import links_to_soups
if TYPE_CHECKING: # pragma: no cover
import bs4
from typing import Dict, List
logger = logging.getLogger(__name__)
[docs]
class FighterScraper(BaseScraper):
    """Scrapes and stores fighter data from UFCStats.

    This class handles scraping fighter details from UFCStats, including
    personal information, physical attributes, and fight records. The data
    is saved to a CSV file for further analysis.

    Attributes:
        dtypes: Maps every column of the fighter table to the dtype used
            when building the (initially empty) ``data`` frame.
        sort_fields: Column names, last name first, then first name, then
            fighter id. NOTE(review): presumably consumed by ``BaseScraper``
            to order rows -- confirm against the base class.
        data: Empty DataFrame with one typed column per ``dtypes`` entry.
        filename: File name of the CSV holding the fighter table.
    """

    # Values are Python types, pandas extension dtypes, or dtype strings
    # (see "fighter_dob"), hence `str` is part of the value annotation.
    dtypes: Dict[str, type | str | pd.core.arrays.integer.Int64Dtype] = {
        "fighter_id": str,
        "fighter_f_name": str,
        "fighter_l_name": str,
        "fighter_nickname": str,
        "fighter_height_cm": float,
        "fighter_weight_lbs": float,
        "fighter_reach_cm": float,
        "fighter_stance": str,
        "fighter_dob": "datetime64[ns]",
        # Nullable integers: the record counters may be missing in a row.
        "fighter_w": pd.Int64Dtype(),
        "fighter_l": pd.Int64Dtype(),
        "fighter_d": pd.Int64Dtype(),
        "fighter_nc_dq": pd.Int64Dtype(),
    }
    sort_fields = ["fighter_l_name", "fighter_f_name", "fighter_id"]
    data = pd.DataFrame({col: pd.Series(dtype=dt) for col, dt in dtypes.items()})
    filename = "fighter_data.csv"
[docs]
@classmethod
def url_from_id(cls, id_: str) -> str:
    """Build the fighter-details page URL for a given fighter ID.

    Args:
        id_: The fighter's unique identifier.

    Returns:
        The class's ``web_url`` followed by ``/fighter-details/`` and the ID.
    """
    base_url = cls.web_url
    return f"{base_url}/fighter-details/{id_}"
[docs]
def scrape_fighters(self) -> None:
    """Scrape every fighter not yet stored and append the rows to the CSV file.

    The fighter URLs listed on UFCStats are compared with the URLs derived
    from the ``fighter_id`` column of ``self.data``; only the missing ones
    are fetched. Each fetched page is parsed into one CSV row. A page that
    fails to parse is logged and skipped so that one bad page does not
    abort the whole run. Afterwards duplicates are removed from the file.
    """
    existing_urls = set(map(self.url_from_id, self.data["fighter_id"]))
    ufcstats_fighter_urls = self.get_fighter_urls()
    urls_to_scrape = set(ufcstats_fighter_urls) - existing_urls
    total = len(urls_to_scrape)

    logger.info(f"Scraping {total} fighters...")

    # newline="" is what the csv module requires of files it writes to
    # (otherwise extra blank rows appear on platforms that translate "\n").
    # The explicit encoding keeps non-ASCII fighter names writable
    # regardless of the platform's default locale encoding.
    with open(self.data_file, "a+", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)

        for i, (url, soup) in enumerate(
            links_to_soups(list(urls_to_scrape), self.n_sessions, self.delay)
        ):
            try:
                name = soup.select("span")[0].text.split()
                nickname = soup.select("p.b-content__Nickname")[0]
                details = soup.select("li.b-list__box-list-item")
                # Text after the colon looks like "W-L-D" or "W-L-D (N NC)".
                record = (
                    soup.select("span.b-content__title-record")[0]
                    .text.split(":")[1]
                    .strip()
                    .split("-")
                )

                f_name = name[0].strip()
                l_name = self.parse_l_name(name).strip()
                nickname_str = self.parse_nickname(nickname).strip()
                height = self.parse_height(details[0])
                weight = self.parse_weight(details[1])
                reach = self.parse_reach(details[2])
                stance = self.parse_stance(details[3])
                dob = self.parse_dob(details[4])

                wins = record[0]
                losses = record[1]
                # The last chunk holds the draws, optionally followed by a
                # parenthesised no-contest count. Splitting on "(" keeps
                # multi-digit counts intact instead of taking one character.
                draws_text, _, extra_text = record[-1].partition("(")
                draws = draws_text.strip()
                extra_parts = extra_text.split()
                nc_dq = extra_parts[0] if extra_parts else ""

                writer.writerow(
                    [
                        self.id_from_url(url),
                        f_name,
                        l_name,
                        nickname_str,
                        height,
                        weight,
                        reach,
                        stance,
                        dob,
                        wins,
                        losses,
                        draws,
                        nc_dq,
                    ]
                )

                logger.info(f"Scraped {i+1}/{total} fighters...")
            except Exception as e:
                # Deliberate best effort: log the failing URL and continue.
                logger.error(f"Error saving data from url: {url}\nError: {e}")

    # Runs after the file has been closed, so all rows are flushed to disk.
    self.remove_duplicates_from_file()
[docs]
def add_name_column(self) -> None:
    """Add a ``fighter_name`` column holding the fighter's full name.

    The full name is the first name, a space and the last name. A missing
    last name is treated as an empty string, and surrounding whitespace is
    stripped from the result.
    """
    first_names = self.data["fighter_f_name"]
    last_names = self.data["fighter_l_name"].fillna("")
    full_names = first_names + " " + last_names
    self.data["fighter_name"] = full_names.str.strip()
[docs]
def get_fighter_urls(self) -> List[str]:
    """Collect the profile URLs of all fighters listed on UFCStats.

    One listing page is requested per letter of the alphabet; pages that
    come back as ``None`` are skipped.

    Returns:
        A list of URLs to fighter profiles.
    """
    logger.info("Scraping fighter links...")

    # The fighter listing is searched one initial letter at a time.
    listing_urls = []
    for letter in "abcdefghijklmnopqrstuvwxyz":
        listing_urls.append(
            f"{self.web_url}/statistics/fighters?char={letter}&page=all"
        )

    pages = [result[1] for result in links_to_soups(listing_urls, self.n_sessions)]

    # Only every third "a.b-link" anchor, starting with the second one,
    # is kept from each listing page.
    fighter_urls = [
        str(anchor.get("href"))
        for page in pages
        if page is not None
        for anchor in page.select("a.b-link")[1::3]
    ]

    logger.info(f"Got {len(fighter_urls)} urls...")
    return fighter_urls
[docs]
@staticmethod
def parse_l_name(name: List[str]) -> str:
    """Return the last name: every name part after the first, space-joined.

    The first part is the one stored as the first name, so the remainder is
    the last name no matter how many parts it has. Names with five or more
    parts therefore keep their surname instead of collapsing to "".

    Args:
        name: List of name parts.

    Returns:
        The parsed last name, or "" when there is at most one name part.
    """
    return " ".join(name[1:])
[docs]
@staticmethod
def parse_nickname(nickname: bs4.element.Tag) -> str:
    """Extract the fighter's nickname from its tag.

    Args:
        nickname: BeautifulSoup tag containing the nickname.

    Returns:
        The nickname without surrounding whitespace, or "" if not available.
    """
    # A tag holding only a newline strips down to "", so a single strip
    # covers both the "no nickname" and the regular case.
    return nickname.text.strip()
[docs]
@staticmethod
def parse_height(height: bs4.element.Tag) -> str:
    """Parse a height given in feet and inches and convert it to centimeters.

    Args:
        height: BeautifulSoup tag containing the height in feet and inches,
            with the value after a colon (e.g. ``Height: 5' 11"``).

    Returns:
        The height in centimeters rounded to two decimals, or "" if not
        available (the value is shown as "--").

    Raises:
        IndexError: If the tag text contains no colon.
        ValueError: If the feet or inches part is not an integer.
    """
    height_text = height.text.split(":")[1].strip()
    if "--" in height_text:
        return ""

    # Everything before the apostrophe is the feet value; reading the whole
    # field (not just its first character) keeps the parse correct even if
    # the value is padded or has more than one digit.
    feet_text, _, inches_text = height_text.partition("'")
    feet = int(feet_text.strip())
    inches = int(inches_text.strip().strip('"'))

    # Round like parse_reach does so float noise never reaches the CSV.
    return str(round((feet * 12 + inches) * 2.54, 2))
[docs]
@staticmethod
def parse_reach(reach: bs4.element.Tag) -> str:
    """Parse a reach given in inches and convert it to centimeters.

    Args:
        reach: BeautifulSoup tag containing the reach in inches.

    Returns:
        The reach in centimeters rounded to two decimals, or "" if not
        available.
    """
    raw_value = reach.text.split(":")[1]
    if "--" not in raw_value:
        # Drop surrounding whitespace first, then the inch mark.
        inches = int(raw_value.strip().strip('"'))
        return str(round(inches * 2.54, 2))
    return ""
[docs]
@staticmethod
def parse_weight(weight_element: bs4.element.Tag) -> str:
    """Extract the fighter's weight from its tag.

    Args:
        weight_element: BeautifulSoup tag containing the weight.

    Returns:
        The first whitespace-separated token after the colon (the weight
        figure), or "" if not available.
    """
    raw_value = weight_element.text.split(":")[1]
    # "--" marks a missing weight; otherwise keep only the leading figure.
    return "" if "--" in raw_value else raw_value.split()[0].strip()
[docs]
@staticmethod
def parse_stance(stance: bs4.element.Tag) -> str:
    """Extract the fighter's stance from its tag.

    Args:
        stance: BeautifulSoup tag containing the stance.

    Returns:
        The stance without surrounding whitespace, or "" if not available.
    """
    # An empty value strips to "", so no separate empty-string branch is
    # needed.
    return stance.text.split(":")[1].strip()
[docs]
@staticmethod
def parse_dob(dob: bs4.element.Tag) -> str:
    """Parse the fighter's date of birth and reformat it.

    Args:
        dob: BeautifulSoup tag containing the date of birth, written like
            "Jul 19, 1987" after a colon.

    Returns:
        The date of birth in YYYY-MM-DD format, or "" if not available.
    """
    raw_value = dob.text.split(":")[1].strip()
    if raw_value != "--":
        parsed = datetime.datetime.strptime(raw_value, "%b %d, %Y")
        # The ISO form of the date part is exactly the YYYY-MM-DD prefix.
        return parsed.date().isoformat()
    return ""