"""Provides a fast way to load data and preprocess it"""
from typing import List, Optional, Tuple
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml
from yaml.loader import SafeLoader
from beexai.utils.path import get_path
[docs]
class LoadData:
"""Load data from a csv file and return a dataframe
Attributes:
path (str): path to the csv file
Methods:
load_csv: load the csv file
Args:
path (str): path to the csv file
"""
def __init__(self, path: str):
self.path = get_path(path)
[docs]
def load_csv(self, keep_index: bool = False) -> pd.DataFrame:
"""Load the csv file
Args:
keep_index (bool): whether to keep the index or not. Defaults to False.
Returns:
pd.DataFrame: dataframe
"""
if keep_index:
return pd.read_csv(self.path, index_col=0)
return pd.read_csv(self.path)
[docs]
class Preprocessor:
"""Preprocess the data by deleting entries, adding new columns,
converting to datetime and adding date infos.
Attributes:
df (pd.DataFrame): input dataframe
encoder (object): encoder to use
target_col (str): target column name
datetime_cols (list): list of datetime columns
values_to_delete (list): list of tuples (col_name,value to delete)
add_cols (list): list of tuples (new_col_name,new_col_value,
cast_to_type)
cols_to_delete (list): list of columns to delete
Methods:
delete_entries: delete entries from the dataframe
add_entries: add new colums to the dataframe with new values.
These values are combinations of existing columns.
convert_to_datetime: convert columns to datetime
add_date_infos: add year, month, day and hour to the dataframe
preprocess: preprocess the data
save_cleaned_data: save the cleaned data
Args:
df (pd.DataFrame): input dataframe
target_col (str, optional): target column name. Defaults to None.
values_to_delete (List[Tuple[str,str]], optional):
(col_name,value to delete) values to delete from the dataframe.
Defaults to None.
datetime_cols (list, optional): columns to convert to
datetime. Defaults to None.
add_cols (list, optional):
(new_col_name,new_col_value,cast_to_type) columns to add
to the dataframe. Defaults to None.
cols_to_delete (list, optional): columns to delete from
the dataframe. Defaults to None.
"""
def __init__(
self,
df: pd.DataFrame,
target_col: Optional[str] = None,
values_to_delete: Optional[List[Tuple[str, str]]] = None,
datetime_cols: Optional[List[str]] = None,
add_cols: Optional[List[Tuple[str, str, str]]] = None,
cols_to_delete: Optional[List[str]] = None,
):
self.df = df
self.target_col = target_col
self.datetime_cols = datetime_cols
self.values_to_delete = values_to_delete
self.add_cols = add_cols
self.cols_to_delete = cols_to_delete
[docs]
def delete_entries(
self, df: pd.DataFrame, values_to_delete: List[Tuple[str, str]]
) -> pd.DataFrame:
"""Delete entries from the dataframe
Args:
df (pd.DataFrame): input dataframe
values_to_delete (list): list of tuples (col_name,value to delete)
Returns:
pd.DataFrame: dataframe without the specified values
"""
for col, value in values_to_delete:
df = df[df[col] != value]
return df
[docs]
def add_entries(
self, df: pd.DataFrame, add_cols: List[Tuple[str, str, str]]
) -> pd.DataFrame:
"""Add new colums to the dataframe with new values.
These values are combinations of existing columns.
Args:
df (pd.DataFrame): input dataframe
add_cols (list): list of tuples
(new_col_name,new_col_value,cast_to_type)
Returns:
pd.DataFrame: dataframe with new columns
"""
for new_col_name, new_col_value, cast_to_type in add_cols:
df[new_col_name] = new_col_value
if cast_to_type is not None:
df[new_col_name] = df[new_col_name].astype(cast_to_type)
return df
[docs]
def convert_to_datetime(self, df: pd.DataFrame) -> pd.DataFrame:
"""Convert columns to datetime
Args:
df (pd.DataFrame): dataframe
Returns:
pd.DataFrame: dataframe with datetime columns
"""
for col in self.datetime_cols:
df[col] = pd.to_datetime(df[col])
return df
[docs]
def add_date_infos(self, df: pd.DataFrame, col: str) -> pd.DataFrame:
"""Add year, month, day and hour to the dataframe
Args:
df (pd.DataFrame): dataframe
col (str): column name
Returns:
pd.DataFrame: dataframe with new columns
"""
for attr in ["year", "month", "day", "hour"]:
if getattr(df[col].dt, attr).isnull().sum() == 0:
df[col + "_" + attr] = getattr(df[col].dt, attr)
return df
[docs]
def preprocess(self) -> pd.DataFrame:
"""Preprocess the data
Returns:
pd.DataFrame: dataframe
"""
df = self.df
if self.datetime_cols is not None:
df = self.convert_to_datetime(df)
for col in self.datetime_cols:
df = self.add_date_infos(df, col)
df = df.drop(col, axis=1)
if self.add_cols is not None:
df = self.add_entries(df, self.add_cols)
if self.values_to_delete is not None:
df = self.delete_entries(df, self.values_to_delete)
df.dropna(inplace=True)
if "Unnamed: 0" in df.columns:
df.drop("Unnamed: 0", axis=1, inplace=True)
if self.cols_to_delete is not None:
df = df.drop(self.cols_to_delete, axis=1)
return df
[docs]
def save_cleaned_data(self, df: pd.DataFrame, path: str) -> None:
"""Save the cleaned data
Args:
df (pd.DataFrame): dataframe
path (str): path to save the dataframe
"""
path = get_path(path, check_dir=True)
df.to_csv(path, index=False)
[docs]
class DataCleaner:
"""Clean the data by removing correlated features
Attributes:
df (pd.DataFrame): input dataframe
target_col (str): target column name
corr_threshold (float): correlation threshold
Methods:
compute_correlation_matrix: compute the correlation matrix
plot_corr_matrix: plot the correlation matrix
remove_correlated_features: remove correlated features from the
dataframe with a threshold
clean_data: clean the data
Args:
df (pd.DataFrame): input dataframe
target_col (str): target column name
corr_threshold (float, optional): correlation threshold.
Defaults to 0.7.
"""
def __init__(self, df, target_col: str, corr_threshold: float = 0.7):
self.df = df
self.corr_threshold = corr_threshold
self.target_col = target_col
[docs]
def compute_correlation_matrix(self, df: pd.DataFrame) -> pd.DataFrame:
"""Compute the correlation matrix
Args:
df (pd.DataFrame): dataframe
Returns:
pd.DataFrame: correlation matrix
"""
return df.corr().abs()
[docs]
def plot_corr_matrix(self, df: pd.DataFrame) -> None:
"""Plot the correlation matrix
Args:
df (pd.DataFrame): dataframe
"""
plt.figure(figsize=(10, 10))
sns.heatmap(df.corr(), annot=True)
plt.show()
[docs]
def clean_data(self) -> pd.DataFrame:
"""Clean the data
Returns:
pd.DataFrame: dataframe
"""
df = self.remove_correlated_features(self.df)
return df
[docs]
def fast_load(
config_path: str,
values_to_delete: Optional[List[Tuple[str, str]]] = None,
adding_cols: Optional[List[Tuple[str, str, str]]] = None,
keep_corr_features: bool = True,
) -> List:
"""Provides a fast way to load data and preprocess it
Args:
config_path (str): path to the config file
values_to_delete (list, optional): list of tuples
(col_name,value to delete). Defaults to None.
adding_cols (list, optional): list of tuples
(col_name,fun_to_add,cast_to_type). Defaults to None.
keep_corr_features (bool, optional): whether to keep correlated
features or not. Defaults to True.
Returns:
list: a list containing the data, the target column name,
the task and the data_cleaner object
"""
config_path = get_path(config_path)
with open(config_path, encoding="utf-8") as f:
config = yaml.load(f, Loader=SafeLoader)
path = config["path"]
target_col = config["target_col"]
task = config["task"]
cleaned_data_path = config["cleaned_data_path"]
if "datetime_cols" not in config.keys():
datetime_cols = None
else:
datetime_cols = config["datetime_cols"]
if "cols_to_delete" not in config.keys():
cols_to_delete = None
else:
cols_to_delete = config["cols_to_delete"]
data_loader = LoadData(path=path)
data = data_loader.load_csv()
if adding_cols is not None:
add_cols_copy = adding_cols.copy()
add_cols_copy = [
(col_name, fun(data), cast_to_type)
for col_name, fun, cast_to_type in adding_cols
]
else:
add_cols_copy = None
preprocessor = Preprocessor(
data,
target_col,
values_to_delete,
datetime_cols,
add_cols_copy,
cols_to_delete,
)
data = preprocessor.preprocess()
if not keep_corr_features:
data_cleaner = DataCleaner(data, target_col)
data = data_cleaner.clean_data()
else:
data_cleaner = None
preprocessor.save_cleaned_data(data, cleaned_data_path)
return {
"data": data,
"target_col": target_col,
"task": task,
"data_cleaner": data_cleaner,
}.values()
[docs]
def load_data(
from_cleaned: bool,
config_path: str,
values_to_delete: Optional[List[Tuple[str, str]]] = None,
add_list: Optional[List[Tuple[str, str, str]]] = None,
keep_corr_features: bool = True,
) -> List:
"""Load data from a config file
Args:
from_cleaned (bool): whether to load the data directly from the
cleaned data or not
config_path (str): path to the config file
values_to_delete (list, optional): list of tuples
(col_name,value to delete). Defaults to None.
add_list (list, optional): list of tuples
(col_name,fun_to_add,cast_to_type). Defaults to None.
keep_corr_features (bool, optional): whether to keep correlated
features or not. Defaults to True.
Returns:
list: a list containing the data, the target column name,
the task and the data_cleaner object
"""
config_path = get_path(config_path)
with open(config_path, encoding="utf-8") as f:
config = yaml.load(f, Loader=SafeLoader)
if from_cleaned:
path = get_path(config["cleaned_data_path"])
data = pd.read_csv(path)
return {
"data": data,
"target_col": config["target_col"],
"task": config["task"],
"data_cleaner": None,
}.values()
data, target_col, task, data_cleaner = fast_load(
config_path, values_to_delete, add_list, keep_corr_features
)
return {
"data": data,
"target_col": target_col,
"task": task,
"data_cleaner": data_cleaner,
}.values()