# Source code for ufcscraper.base

"""Base modules for ufc scraper

This module defines BaseFileHandler and BaseScraper classes,
meant to be inherited by specific scraper or file handler
modules.
"""

from __future__ import annotations

import csv
import logging
from datetime import datetime
from pathlib import Path
from typing import TYPE_CHECKING
from abc import ABC

import pandas as pd


if TYPE_CHECKING:  # pragma: no cover
    from typing import Dict, List, Optional

logger = logging.getLogger(__name__)



# [docs]
class BaseFileHandler(ABC):
    """Base class for file handlers backed by a single CSV table.

    The handler makes sure the CSV file exists (creating it with a header row
    taken from ``dtypes`` when it is missing), loads it into a pandas
    DataFrame with duplicate rows dropped, and can rewrite the file
    de-duplicated and sorted.

    Subclasses are expected to define ``dtypes``, ``sort_fields`` and
    ``filename``.

    Attributes:
        dtypes: Mapping of column name to the dtype used when reading the
            CSV. Columns whose dtype equals ``"datetime64[ns]"`` are parsed
            as dates instead of being passed as a dtype.
        sort_fields: Column names used to sort the rows when the file is
            rewritten by ``remove_duplicates_from_file``.
        data_folder: The folder where the CSV file is stored.
        filename: The name of the CSV file. This should be defined in
            subclasses.
        data_file: Full path of the CSV file (``data_folder / filename``).
        data: A pandas DataFrame that holds the data loaded from the CSV file.
    """

    dtypes: Dict[str, type | str | pd.core.arrays.integer.Int64Dtype]
    sort_fields: List[str]
    data_folder: Path
    filename: str

    # Class-level placeholder; every instance replaces it in ``load_data``.
    data = pd.DataFrame([])

    def __init__(
        self,
        data_folder: Path | str,
    ):
        """Initializes the handler and loads the CSV file.

        Args:
            data_folder: The folder where the CSV file is stored or will be
                created.
        """
        self.data_folder = Path(data_folder)
        self.data_file: Path = self.data_folder / self.filename

        self.check_data_file()
        self.load_data()

    def check_data_file(self) -> None:
        """Makes sure the CSV file exists, creating it if necessary.

        When the file is missing, its parent folder is created if needed and
        a new file containing only the header row (the keys of ``dtypes``)
        is written. Whether a new or an existing file is used is logged.
        """
        if self.data_file.is_file():
            logger.info("Using existing file:\n\t%s", self.data_file)
            return

        # Without this, opening the file below fails with FileNotFoundError
        # whenever the data folder has not been created yet.
        self.data_file.parent.mkdir(parents=True, exist_ok=True)
        with open(self.data_file, "w", newline="", encoding="UTF8") as f:
            csv.writer(f).writerow(self.dtypes.keys())

        logger.info("Using new file:\n\t%s", self.data_file)

    def _read_data_file(self) -> pd.DataFrame:
        """Reads the CSV file with the declared dtypes, dropping duplicates.

        Returns:
            The de-duplicated content of the CSV file.
        """
        date_columns = []
        plain_dtypes = {}
        for column, dtype in self.dtypes.items():
            # Datetime columns cannot be requested through ``dtype``; they
            # are handed to ``parse_dates`` instead.
            if dtype == "datetime64[ns]":
                date_columns.append(column)
            else:
                plain_dtypes[column] = dtype

        return pd.read_csv(
            self.data_file, dtype=plain_dtypes, parse_dates=date_columns
        ).drop_duplicates()

    def remove_duplicates_from_file(self) -> None:
        """Rewrites the CSV file without duplicate rows.

        The file is read, duplicate rows are dropped, the remaining rows are
        sorted by ``sort_fields`` and the result is written back to the same
        file. The in-memory ``data`` attribute is not modified.
        """
        data = self._read_data_file()
        data = data.sort_values(by=self.sort_fields).reset_index(drop=True)
        data.to_csv(self.data_file, index=False)

    def load_data(self) -> None:
        """Loads the CSV file into the ``data`` attribute.

        Duplicate rows are dropped from the loaded DataFrame; the file on
        disk is left untouched.
        """
        self.data = self._read_data_file()





# [docs]
class BaseScraper(BaseFileHandler):
    """Base class for web scrapers associated with a CSV file.

    Extends the CSV file handling of the base class with default scraping
    settings: the base URL, the number of concurrent sessions and the delay
    between requests. The defaults are class attributes and can be
    overridden per instance through the constructor.

    Attributes:
        web_url: The base URL for the website to scrape.
        n_sessions: Number of concurrent sessions for scraping.
        delay: Delay between requests to avoid being blocked.
    """

    web_url: str = "http://www.ufcstats.com"
    n_sessions: int = 1  # these are the defaults
    delay: float = 0.1

    def __init__(
        self,
        data_folder: Path | str,
        n_sessions: Optional[int] = None,
        delay: Optional[float] = None,
    ):
        """Initializes the BaseScraper with the specified parameters.

        Args:
            data_folder: The folder where the CSV file is stored or will be
                created.
            n_sessions: Number of concurrent sessions for scraping. If not
                provided (or 0), defaults to the class attribute.
            delay: Delay between requests to avoid being blocked. If not
                provided, defaults to the class attribute. An explicit ``0``
                is honoured and disables the delay.
        """
        super().__init__(data_folder)
        # A session count of 0 is not meaningful, so any falsy value keeps
        # falling back to the class default.
        self.n_sessions = n_sessions or self.n_sessions
        # ``delay or self.delay`` would silently discard an explicit 0, so
        # only ``None`` selects the default.
        self.delay = self.delay if delay is None else delay

    @staticmethod
    def id_from_url(url: str) -> str:
        """Extracts and returns the ID from a given URL.

        The ID is the last path segment of the URL; any trailing slashes are
        ignored.

        Args:
            url: The URL from which to extract the ID.

        Returns:
            The extracted ID as a string. An empty string is returned when
            the URL is empty or consists only of slashes.
        """
        return url.rstrip("/").rsplit("/", 1)[-1]





# [docs]
class BaseHTMLReader(BaseFileHandler):
    """Base class for HTML readers associated with a CSV file.

    Combines the CSV file handling of the base class with access to a local
    HTML file: the file's path and last-modification time are recorded on
    construction and its content can be read as text.

    Attributes:
        html_file: Path of the HTML file to read.
        html_datetime: Last-modification time of the HTML file, as a naive
            datetime in local time.
    """

    def __init__(self, html_file: Path | str, data_folder: Path | str):
        """Initializes the BaseHTMLReader with the specified HTML file and data folder.

        Args:
            html_file: The path to the HTML file to read.
            data_folder: The folder where the CSV file is stored or will be
                created.

        Raises:
            FileNotFoundError: If ``html_file`` does not exist.
        """
        # Inspect the HTML file first so that a missing file fails before the
        # base class creates or loads the CSV file.
        self.html_file = Path(html_file)
        self.html_datetime = datetime.fromtimestamp(self.html_file.stat().st_mtime)
        super().__init__(data_folder)

    def read_html(self) -> str:
        """Reads the HTML content from the specified HTML file.

        The file is decoded as UTF-8 rather than with the platform default
        encoding, so the result does not depend on the machine's locale.

        Returns:
            The HTML content as a string.
        """
        return self.html_file.read_text(encoding="utf-8")