Source code for ufcscraper.base

"""Base classes for the UFC scraper.

This module defines the BaseFileHandler, BaseScraper and BaseHTMLReader
classes, which are meant to be inherited by specific scraper, HTML reader
or file handler modules.
"""

from __future__ import annotations

import csv
import logging
from datetime import datetime
from pathlib import Path
from typing import TYPE_CHECKING
from abc import ABC

import pandas as pd


if TYPE_CHECKING:  # pragma: no cover
    from typing import Dict, List, Optional

logger = logging.getLogger(__name__)


class BaseFileHandler(ABC):
    """Base class for file handlers associated with a CSV table.

    This class provides the basic functionality to manage data stored in a
    CSV file: checking that the file exists, initializing it with a header
    row if it is missing, removing duplicate rows, and loading the content
    into a pandas DataFrame.

    Attributes:
        dtypes: A dictionary mapping column names to their data types.
            Columns whose dtype equals ``"datetime64[ns]"`` are parsed as
            dates when the file is read.
        sort_fields: A list of column names used for sorting the data.
        data_folder: The folder where the CSV file is stored.
        filename: The name of the CSV file. This should be defined in
            subclasses.
        data: A pandas DataFrame that holds the data loaded from the CSV
            file.
    """

    dtypes: Dict[str, type | pd.core.arrays.integer.Int64Dtype]
    sort_fields: List[str]
    data_folder: Path
    filename: str
    data = pd.DataFrame([])

    def __init__(
        self,
        data_folder: Path | str,
    ):
        """Initializes the BaseFileHandler with the specified data folder.

        The CSV file is created if it does not exist yet, and its content is
        loaded into ``self.data``.

        Args:
            data_folder (Path | str): The folder where the CSV file is
                stored or will be created.
        """
        self.data_folder = Path(data_folder)
        self.data_file: Path = self.data_folder / self.filename

        self.check_data_file()
        self.load_data()

    def check_data_file(self) -> None:
        """Checks if the CSV file exists in the specified data folder.

        If the file does not exist, it is created with a single header row
        holding the column names taken from ``dtypes``. The status of the
        file (new or existing) is reported through the module logger.
        """
        if self.data_file.is_file():
            logger.info("Using existing file:\n\t%s", self.data_file)
            return

        with open(self.data_file, "w", newline="", encoding="UTF8") as f:
            csv.writer(f).writerow(self.dtypes.keys())

        logger.info("Using new file:\n\t%s", self.data_file)

    def _read_deduplicated(self) -> pd.DataFrame:
        """Reads the CSV file and returns its rows without duplicates.

        Date columns cannot be passed to ``read_csv`` through ``dtype``, so
        the entries of ``dtypes`` are split into columns to be parsed as
        dates and columns that get an explicit dtype.

        Returns:
            The content of the CSV file with duplicate rows dropped.
        """
        date_columns = []
        non_date_types = {}
        for col, dtype in self.dtypes.items():
            if dtype == "datetime64[ns]":
                date_columns.append(col)
            else:
                non_date_types[col] = dtype

        return pd.read_csv(
            self.data_file, dtype=non_date_types, parse_dates=date_columns
        ).drop_duplicates()

    def remove_duplicates_from_file(self) -> None:
        """Removes duplicate rows from the CSV file.

        This method reads the CSV file, removes any duplicate rows, sorts
        the remaining rows by ``sort_fields`` and saves the result back to
        the same file. The in-memory ``data`` attribute is not modified.
        """
        data = self._read_deduplicated()
        data = data.sort_values(by=self.sort_fields).reset_index(drop=True)
        data.to_csv(self.data_file, index=False)

    def load_data(self) -> None:
        """Loads the data from the CSV file into the `data` DataFrame.

        This method reads the CSV file, removes duplicates, and stores the
        result in the `data` attribute for further processing. The file
        itself is left untouched.
        """
        self.data = self._read_deduplicated()
class BaseScraper(BaseFileHandler):
    """Base class for web scrapers associated with a CSV file.

    This class provides basic functionality for scraping data from a
    website and storing it in a CSV file. It holds default settings for
    web scraping such as the base URL, the number of concurrent sessions,
    and the delay between requests.

    Attributes:
        web_url: The base URL for the website to scrape.
        n_sessions: Number of concurrent sessions for scraping.
        delay: Delay between requests to avoid being blocked.
    """

    web_url: str = "http://www.ufcstats.com"
    n_sessions: int = 1  # these are the defaults
    delay: float = 0.1

    def __init__(
        self,
        data_folder: Path | str,
        n_sessions: Optional[int] = None,
        delay: Optional[float] = None,
    ):
        """Initializes the BaseScraper with the specified parameters.

        Args:
            data_folder: The folder where the CSV file is stored or will be
                created.
            n_sessions: Number of concurrent sessions for scraping. If not
                provided (or 0), defaults to the class attribute.
            delay: Delay between requests to avoid being blocked. If not
                provided, defaults to the class attribute. An explicit
                ``0`` is honoured and disables the delay.
        """
        super().__init__(data_folder)

        # A falsy session count (None or 0) keeps falling back to the class
        # default, exactly as before.
        self.n_sessions = n_sessions or self.n_sessions
        # Only fall back when the caller passed nothing: ``delay=0`` is a
        # meaningful value and must not be replaced by the default.
        self.delay = delay if delay is not None else self.delay

    @staticmethod
    def id_from_url(url: str) -> str:
        """Extracts and returns the ID from a given URL.

        The ID is the last path segment of the URL; any number of trailing
        slashes is ignored.

        Args:
            url: The URL from which to extract the ID.

        Returns:
            The extracted ID as a string. An empty string is returned when
            the URL is empty or consists only of slashes.
        """
        return url.rstrip("/").split("/")[-1]
class BaseHTMLReader(BaseFileHandler):
    """Base class for HTML readers associated with a CSV file.

    On top of the CSV handling inherited from BaseFileHandler, this class
    keeps track of an HTML file on disk, records the time it was last
    modified, and offers a method to read its content as text.
    """

    def __init__(self, html_file: Path | str, data_folder: Path | str):
        """Initializes the BaseHTMLReader with an HTML file and a data folder.

        Args:
            html_file (Path | str): The path to the HTML file to read.
            data_folder (Path | str): The folder where the CSV file is
                stored or will be created.
        """
        # The CSV side is set up first; the HTML file is only inspected
        # afterwards.
        super().__init__(data_folder)

        self.html_file = Path(html_file)

        # Modification time of the HTML file, as a naive local datetime.
        modified_at = self.html_file.stat().st_mtime
        self.html_datetime = datetime.fromtimestamp(modified_at)

    def read_html(self) -> str:
        """Reads the HTML content from the specified HTML file.

        Returns:
            str: The HTML content as a string.
        """
        content = self.html_file.read_text()
        return content