Source code for data_io

import pathlib
import re
import os

from typing import List, Optional, Union

# ---

from common.logging_facilities import logi, logd

# ---

import pandas as pd

def read_from_file(path, file_format='feather', sample: Optional[float] = None, sample_seed: int = 23, filter_query: Optional[str] = None):
    r"""
    Read a DataFrame from a file and optionally sample and filter it.

    Parameters
    ----------
    path
        Location of the file; handed unchanged to the pandas reader.
    file_format : str
        Either ``'feather'`` (read with `pd.read_feather`) or ``'hdf'``
        (read with `pd.read_hdf`).
    sample : Optional[float]
        If truthy, passed as ``frac`` to `DataFrame.sample` to draw a random
        subset of the rows. ``None`` or ``0`` disables sampling.
    sample_seed : int
        Passed as ``random_state`` to `DataFrame.sample`.
    filter_query : Optional[str]
        If truthy, passed to `DataFrame.query`; applied after sampling.

    Returns
    -------
    pd.DataFrame
        The loaded (and possibly sampled and filtered) data.

    Raises
    ------
    ValueError
        If `file_format` is not one of the supported formats.
    Exception
        If reading, sampling or filtering fails; the original error is
        attached as the cause.
    """
    # One reader per supported format; everything after the read is shared.
    readers = {
        'feather': pd.read_feather,
        'hdf': pd.read_hdf,
    }
    if file_format not in readers:
        # Previously an unknown format fell through and returned None, which
        # only surfaced later as an unrelated error at the call site.
        raise ValueError(
            f'Unsupported file format: {file_format!r} (expected one of {sorted(readers)})'
        )

    try:
        data = readers[file_format](path)
        if sample:
            logi(f'sampling {sample*100}% of data from {path}')
            data = data.sample(frac=sample, random_state=sample_seed)
        if filter_query:
            logi(f'filtering data with the query expression "{filter_query}"')
            data = data.query(filter_query)
    except Exception as e:
        # Keep the generic exception type and message callers already see, but
        # chain the original error so its traceback is not lost.
        raise Exception(f'Could not read from: {path}\n{e}') from e
    return data
class DataSet:
    r"""
    A set of input files selected by one or more path regular expressions.

    On construction every given path regex is expanded into the list of
    files below its static base directory whose relative path matches.

    Parameters
    ----------
    data_path : Union[List[str], str]
        A single path regular expression or a list of them.

    Raises
    ------
    Exception
        If no file matches any of the given expressions.
    """

    def __init__(self, data_path: Union[List[str], str]):
        self.data_path = data_path
        self.common_root, self.data_files = self.expand_data_path()
        if not self.data_files:
            raise Exception(f'No input files for path(s): {data_path}\n')

    def get_data_path(self) -> Union[List[str], str]:
        r"""Return the path expression(s) exactly as passed to the constructor."""
        return self.data_path

    def get_file_list(self) -> List[str]:
        r"""Return the list of files that matched the path expression(s)."""
        return self.data_files

    def get_common_root(self) -> str:
        r"""Return the directory shared by the base paths of all expressions."""
        return self.common_root

    def expand_data_path(self):
        r"""
        Expand `self.data_path` into the matching files.

        Returns
        -------
        Tuple[str, List[str]]
            The common root directory and the list of matching files. For a
            list of expressions, the files of all expressions are concatenated
            in the order the expressions were given.
        """
        if not isinstance(self.data_path, list):
            common_root, file_list = self.evaluate_regex_path(self.data_path)
            return common_root, file_list

        file_list = []
        base_paths = []
        for entry in self.data_path:
            base_path, files = self.evaluate_regex_path(entry)
            file_list.extend(files)
            base_paths.append(base_path)
        return self._common_root(base_paths), file_list

    @staticmethod
    def _common_root(base_paths: List[str]) -> str:
        r"""Return the longest directory path shared by all `base_paths`."""
        if not base_paths:
            return ''
        try:
            # commonpath compares whole path components. commonprefix works
            # character-wise and would turn 'data/run1' + 'data/run2' into the
            # non-existent 'data/run'.
            return os.path.commonpath(base_paths)
        except ValueError:
            # Raised for a mix of absolute and relative paths (or different
            # drives): there is no shared directory.
            return ''

    @staticmethod
    def evaluate_regex_path(data_path: str):
        r"""
        Take the given regular expression to generate a list of paths that match it.

        Parameters
        ----------
        data_path : str
            The regular expression that is used to select files.

        Returns
        -------
        Tuple[str, List[str]]
            The static base directory of the expression and the list of files
            below it whose path relative to that directory matches the
            remainder of the expression.
        """
        data_files = []
        common_root = DataSet.find_base_path_in_regex(data_path)
        logd(f"determined common root path for input_files: {common_root=}")

        # Only the part of the expression below the static root is used as a
        # regex; it is matched against paths relative to that root.
        relative_regex = pathlib.Path(data_path).relative_to(common_root)
        logd(f"searching for files with regex {relative_regex=} in root path")
        regex = re.compile(str(relative_regex))

        for root, _dirs, files in os.walk(common_root):
            for file in files:
                file_full_path = os.path.join(root, file)
                relative_path = pathlib.Path(file_full_path).relative_to(common_root)
                if regex.search(str(relative_path)):
                    logd(f"adding {file_full_path=}")
                    data_files.append(file_full_path)

        return str(common_root), data_files

    @staticmethod
    def find_base_path_in_regex(path_regex: str) -> pathlib.Path:
        r"""
        Takes a regular expression for a file path and determines the static
        part of the given path regex.

        Parameters
        ----------
        path_regex : str
            The regex pattern for the file path.

        Returns
        -------
        pathlib.Path
            The parent directory of the leading literal part of the expression,
            or ``.`` if the expression does not start with literal text.
        """
        # Pattern to match literal text within the regex.
        # This pattern looks for sequences of characters that are not special regex symbols.
        # It will also match on literals that need to be escaped in a regex, such as parentheses
        # in the path, but those can then be treated as part of the regex.
        base_path_pattern = re.compile(r'[a-zA-Z0-9_\-/]+')

        # Only a literal run at the very start can be a base directory: a run
        # found later in the expression is not a prefix of the path, so the
        # relative_to() call in evaluate_regex_path would fail on it.
        leading_literal = base_path_pattern.match(path_regex)
        if leading_literal is None:
            # No static prefix (empty expression or one that starts with a
            # regex metacharacter): search from the current directory.
            return pathlib.Path('.')

        # The last component of the literal run may be a partial file name
        # (e.g. 'run_' in 'run_.*'), so the parent directory is the root.
        return pathlib.Path(leading_literal.group(0)).parent