Source code for loaders.loading_utils

"""
Loading utilities for data file operations.

This module provides functions to load data from CSV, TXT and Excel files.
"""

# Third-party packages
import pandas as pd

# Local imports
from i18n import t
from utils import (
    DataLoadError,
    get_logger,
    validate_dataframe,
    validate_data_format,
    validate_file_path,
)

# Module-level logger obtained from the project's get_logger helper; used by
# every reader function in this module.
logger = get_logger(__name__)


def csv_reader(file_path: str) -> pd.DataFrame:
    """
    Load data from a CSV file.

    The path is first checked with ``validate_file_path``. The file is then
    parsed with ``pandas.read_csv`` and the resulting frame is passed through
    ``validate_dataframe`` and ``validate_data_format`` before being returned.

    Args:
        file_path: Path to the CSV file

    Returns:
        DataFrame with the CSV data, treating 'no' as NaN values

    Raises:
        FileNotFoundError: If file does not exist
        DataLoadError: If the file is empty, cannot be parsed, or any other
            exception is raised while reading or validating it. The
            underlying exception is attached as ``__cause__``.
    """
    logger.info(t("log.loading_csv_file", path=file_path))

    # Validate file exists. This call sits outside the try block, so whatever
    # it raises reaches the caller unchanged rather than being wrapped.
    validate_file_path(file_path)

    try:
        data = pd.read_csv(file_path, na_values=["no"])
        validate_dataframe(data)
        validate_data_format(data)
        logger.info(
            t("log.successfully_loaded_csv", rows=len(data), columns=len(data.columns))
        )
        return data
    except pd.errors.EmptyDataError as e:
        logger.error(t("log.csv_file_empty", path=file_path))
        # Chain explicitly so the pandas error is preserved as the cause.
        raise DataLoadError(t("error.csv_file_empty", path=file_path)) from e
    except pd.errors.ParserError as e:
        logger.error(t("log.csv_parsing_error", path=file_path, error=str(e)))
        raise DataLoadError(t("error.csv_parsing_error", error=str(e))) from e
    except Exception as e:
        # Catch-all boundary: this also wraps anything raised by the
        # validate_* helpers above, reporting it as an unexpected error.
        logger.error(
            t("log.unexpected_error_reading_csv", path=file_path, error=str(e)),
            exc_info=True,
        )
        raise DataLoadError(t("error.unexpected_loading_csv", error=str(e))) from e
def txt_reader(file_path: str) -> pd.DataFrame:
    """
    Load data from a text file (whitespace or tab separated).

    Uses pandas read_csv with sep=None (delimiter sniffing) so that
    tab-separated and space-separated values are detected automatically.
    The path is checked with ``validate_file_path`` before reading, and the
    parsed frame is passed through ``validate_dataframe`` and
    ``validate_data_format`` before being returned.

    Args:
        file_path: Path to the text file.

    Returns:
        DataFrame with the text file data, treating 'no' as NaN values.

    Raises:
        FileNotFoundError: If file does not exist
        DataLoadError: If the file is empty, cannot be parsed, or any other
            exception is raised while reading or validating it. The
            underlying exception is attached as ``__cause__``.
    """
    logger.info(t("log.loading_txt_file", path=file_path))

    # Runs outside the try block, so whatever it raises reaches the caller
    # unchanged rather than being wrapped in DataLoadError.
    validate_file_path(file_path)

    try:
        # sep=None triggers Python engine's delimiter sniffing (tab, space, etc.)
        data = pd.read_csv(file_path, sep=None, engine="python", na_values=["no"])
        validate_dataframe(data)
        validate_data_format(data)
        logger.info(
            t("log.successfully_loaded_txt", rows=len(data), columns=len(data.columns))
        )
        return data
    # NOTE(review): the handlers below reuse the CSV message keys; presumably
    # intentional because the file is parsed with read_csv -- confirm.
    except pd.errors.EmptyDataError as e:
        logger.error(t("log.csv_file_empty", path=file_path))
        # Chain explicitly so the pandas error is preserved as the cause.
        raise DataLoadError(t("error.csv_file_empty", path=file_path)) from e
    except pd.errors.ParserError as e:
        logger.error(t("log.csv_parsing_error", path=file_path, error=str(e)))
        raise DataLoadError(t("error.csv_parsing_error", error=str(e))) from e
    except Exception as e:
        # Catch-all boundary: this also wraps anything raised by the
        # validate_* helpers above, reporting it as an unexpected error.
        logger.error(
            t("log.unexpected_error_reading_csv", path=file_path, error=str(e)),
            exc_info=True,
        )
        raise DataLoadError(t("error.unexpected_loading_csv", error=str(e))) from e
def excel_reader(file_path: str) -> pd.DataFrame:
    """
    Load data from an Excel file (.xlsx).

    The path is checked with ``validate_file_path`` before reading. The file
    is parsed with ``pandas.read_excel`` using its default arguments, and the
    resulting frame is passed through ``validate_dataframe`` and
    ``validate_data_format`` before being returned.

    Args:
        file_path: Path to the Excel file

    Returns:
        DataFrame with the Excel data

    Raises:
        FileNotFoundError: If file does not exist
        DataLoadError: If any exception is raised while reading or validating
            the file. The underlying exception is attached as ``__cause__``.
    """
    logger.info(t("log.loading_excel_file", path=file_path))

    # Validate file exists. This call sits outside the try block, so whatever
    # it raises reaches the caller unchanged rather than being wrapped.
    validate_file_path(file_path)

    try:
        # No na_values override is passed here, so 'no' cells are not
        # converted to NaN by this reader.
        data = pd.read_excel(file_path)
        validate_dataframe(data)
        validate_data_format(data)
        logger.info(
            t(
                "log.successfully_loaded_excel",
                rows=len(data),
                columns=len(data.columns),
            )
        )
        return data
    except Exception as e:
        # Single catch-all: read errors and validation errors alike are
        # logged with traceback and re-raised as DataLoadError.
        logger.error(
            t("log.error_reading_excel", path=file_path, error=str(e)), exc_info=True
        )
        # Chain explicitly so the original error is preserved as the cause.
        raise DataLoadError(t("error.loading_excel_file", error=str(e))) from e