"""
This module contains the `EventScraper` class, which is responsible for scraping
event data from the UFCStats website.
The `EventScraper` class inherits from `BaseScraper` and provides functionality
to retrieve and process event details such as event name, date, city, state, and
country. The scraped data is stored in a CSV file (`event_data.csv`) and can be
used for further analysis.
"""
from __future__ import annotations
import csv
import datetime
import logging
from typing import TYPE_CHECKING, List
import pandas as pd
from ufcscraper.base import BaseScraper
from ufcscraper.utils import link_to_soup, links_to_soups
if TYPE_CHECKING: # pragma: no cover
from typing import Dict, List
logger = logging.getLogger(__name__)
[docs]
class EventScraper(BaseScraper):
"""Scrapes event data from the UFCStats website.
This class handles scraping event details such as event name, date, city,
state, and country, and stores them in a CSV file. It inherits basic
scraping functionality from `BaseScraper`.
"""
dtypes: Dict[str, type | pd.core.arrays.integer.Int64Dtype] = {
"event_id": str,
"event_name": str,
"event_date": "datetime64[ns]",
"event_city": str,
"event_state": str,
"event_country": str,
}
sort_fields = ["event_date", "event_name"]
data = pd.DataFrame({col: pd.Series(dtype=dt) for col, dt in dtypes.items()})
filename = "event_data.csv"
event_type = "completed"
[docs]
@classmethod
def url_from_id(cls, id_: str) -> str:
"""Constructs the event URL using the event ID.
Args:
id_: The unique identifier for the event.
Returns:
The full URL to the event's details page on UFCStats.
"""
return f"{cls.web_url}/event-details/{id_}"
[docs]
def scrape_events(self) -> None:
"""Scrapes event data and saves it to a CSV file.
This method compares existing event URLs with those available on the
UFCStats website, scrapes details of new events, and appends them to
the CSV file. Logs the progress and any errors encountered.
"""
existing_urls = set(map(self.url_from_id, self.data["event_id"]))
ufcstats_event_urls = self.get_event_urls()
urls_to_scrape = set(ufcstats_event_urls) - existing_urls
logger.info(f"Scraping {len(urls_to_scrape)} events...")
with open(self.data_file, "a+") as f:
writer = csv.writer(f)
i = 0
for i, (url, soup) in enumerate(
links_to_soups(list(urls_to_scrape), self.n_sessions, self.delay)
):
try:
full_location = (
soup.select("li")[4].text.split(":")[1].strip().split(",")
)
event_name = soup.select("h2")[0].text
event_date = str(
datetime.datetime.strptime(
soup.select("li")[3].text.split(":")[-1].strip(),
"%B %d, %Y",
)
)
event_city = full_location[0]
event_country = full_location[-1]
# Check event location contains state details
if len(full_location) > 2:
event_state = full_location[1]
else:
event_state = ""
writer.writerow(
[
self.id_from_url(url),
event_name.strip(),
event_date[0:10],
event_city.strip(),
event_state.strip(),
event_country.strip(),
]
)
logger.info(f"Scraped {i+1}/{len(urls_to_scrape)} events...")
except Exception as e:
logger.error(f"Error saving data from url: {url}\nError: {e}")
self.remove_duplicates_from_file()
[docs]
def get_event_urls(self) -> List[str]:
"""Retrieves the URLs of all completed events from UFCStats.
This method scrapes the UFCStats website for event URLs that contain
the keyword 'event-details'. It returns a list of these URLs.
Returns:
A list of URLs for completed events.
"""
logger.info("Scraping event links...")
soup = link_to_soup(
f"{self.web_url}/statistics/events/{self.event_type}?page=all"
)
# Adds href to list if href contains a link with keyword 'event-details'
event_urls = [
item.get("href")
for item in soup.find_all("a")
if type(item.get("href")) == str and "event-details" in item.get("href")
]
logger.info(f"Got {len(event_urls)} event links...")
return event_urls
[docs]
def get_fight_urls_from_event_urls(self, event_urls: List[str]) -> List[str]:
"""Extracts fight URLs from a list of event URLs.
Args:
event_urls: A list of event URLs from which to extract fight URLs.
Returns:
A list of fight URLs extracted from the provided event URLs.
"""
fight_urls = set()
i = 1
for _, soup in links_to_soups(event_urls, self.n_sessions):
for item in soup.find_all("a", class_="b-flag b-flag_style_green"):
fight_urls.add(item.get("href"))
for item in soup.find_all("a", class_="b-flag b-flag_style_bordered"):
fight_urls.add(item.get("href"))
print(f"Scraped {i}/{len(event_urls)} events...", end="\r")
i += 1
return list(fight_urls)
[docs]
class UpcomingEventScraper(EventScraper):
filename = "upcoming_event_data.csv"
event_type = "upcoming"
[docs]
def get_fight_urls_from_event_urls(self, event_urls: List[str]) -> List[str]:
fight_urls = set()
i = 1
for _, soup in links_to_soups(event_urls, self.n_sessions):
for item in soup.find_all("a", class_="b-link b-link_style_black"):
if "View" in item.get_text() and "Matchup" in item.get_text():
fight_urls.add(item.get("data-link"))
print(f"Scraped {i}/{len(event_urls)} events...", end="\r")
i += 1
return list(fight_urls)