"""Training models and evaluating their performance."""
from typing import Callable, Optional, Tuple, Union
import joblib
import numpy as np
import pandas as pd
import torch
from sklearn.ensemble import (GradientBoostingClassifier,
GradientBoostingRegressor,
HistGradientBoostingClassifier,
HistGradientBoostingRegressor,
RandomForestClassifier, RandomForestRegressor)
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import (accuracy_score, f1_score,
mean_absolute_percentage_error,
mean_squared_error, r2_score)
from sklearn.model_selection import (GridSearchCV, KFold, RandomizedSearchCV,
cross_validate)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from xgboost import XGBClassifier, XGBRegressor
from beexai.training.models import NeuralNetwork, NNModel
from beexai.utils.convert import convert_to_numpy
from beexai.utils.path import get_path
[docs]
class Trainer:
"""Trainer class
Attributes:
models (dict): dictionary of available models
model_name (str): name of the model
model_params (dict): parameters of the model
task (str): task to perform
device (str): device to use
model (callable): model object
Methods:
cross_val: cross validation for the model
train: train the model
get_metrics: get the metrics of the model
save_model: save the model
load_model: load the model
Args:
model_name (str): Name of the model from models dict. Must be one of
'LogisticRegression', 'LinearRegression', 'DecisionTreeClassifier',
'RandomForestClassifier', 'GradientBoostingClassifier', 'XGBClassifier',
'DecisionTreeRegressor', 'RandomForestRegressor', 'GradientBoostingRegressor',
'XGBRegressor', 'NeuralNetwork', 'HistGradientBoostingClassifier',
'HistGradientBoostingRegressor'
task (str): "classification" or "regression".
model_params (dict): Parameters for the model
device (str): device to use. Defaults to "cpu".
"""
def __init__(
self,
model_name: str,
task: str,
model_params: Optional[dict] = None,
device: str = "cpu",
):
assert task in [
"classification",
"regression",
], f"Task must be either classification or regression, got {task}"
self.models = {
"LogisticRegression": LogisticRegression,
"LinearRegression": LinearRegression,
"DecisionTreeClassifier": DecisionTreeClassifier,
"RandomForestClassifier": RandomForestClassifier,
"GradientBoostingClassifier": GradientBoostingClassifier,
"XGBClassifier": XGBClassifier,
"DecisionTreeRegressor": DecisionTreeRegressor,
"RandomForestRegressor": RandomForestRegressor,
"GradientBoostingRegressor": GradientBoostingRegressor,
"XGBRegressor": XGBRegressor,
"NeuralNetwork": NeuralNetwork,
"HistGradientBoostingClassifier": HistGradientBoostingClassifier,
"HistGradientBoostingRegressor": HistGradientBoostingRegressor,
}
self.model_name = model_name
self.model_params = model_params if model_params is not None else {}
self.task = task
self.device = device
assert (
self.model_name in self.models
), f"Model name must be one of {self.models.keys()}"
if model_name == "NeuralNetwork":
self.model = NNModel(**self.model_params, device=device, task=task).to(
device
)
else:
self.model = self.models[model_name](**self.model_params)
[docs]
def cross_val(
self,
x_train: pd.DataFrame,
y_train: pd.DataFrame,
param_grid: Optional[dict] = None,
scoring: Optional[str] = None,
kfold: Union[int, KFold] = 5,
search_type: str = "grid",
) -> Callable:
"""Cross validation for the model
Args:
x_train (pd.DataFrame): train set
y_train (pd.DataFrame): target
param_grid (dict, optional): grid search parameters. Defaults to None.
scoring (str, optional): scoring metric. Defaults to None.
kfold (Union[int, KFold], optional): number of folds or kfold object. Defaults to 5.
search_type (str, optional): "grid" or "random". Defaults to "grid".
Returns:
callable: best model
"""
assert search_type in ["grid", "random"]
if self.model_name == "NeuralNetwork":
x_train_copy = x_train.values
y_train_copy = y_train.values
if self.task == "classification":
y_train_copy = y_train_copy.astype(np.int64)
else:
y_train_copy = y_train_copy.astype(np.float32)
else:
x_train_copy = x_train
y_train_copy = y_train
if not isinstance(param_grid, type(None)):
if self.model_name == "NeuralNetwork":
self.model.set_params(train_split=False, verbose=0)
if search_type == "grid":
grid_search = GridSearchCV(
self.model,
param_grid=param_grid,
cv=kfold,
scoring=scoring,
n_jobs=-1,
)
if search_type == "random":
grid_search = RandomizedSearchCV(
self.model,
param_distributions=param_grid,
cv=kfold,
scoring=scoring,
n_jobs=-1,
n_iter=10,
)
grid_search.fit(x_train_copy, y_train_copy)
self.model = grid_search.best_estimator_
scores = cross_validate(
self.model,
x_train_copy,
y_train_copy,
cv=kfold,
scoring=scoring,
return_estimator=True,
)
print(f"Best estimator: {grid_search.best_estimator_}")
print(f"Best score: {scores['test_score'].mean()}")
return self.model, grid_search.best_params_
scores = cross_validate(
self.model,
x_train,
y_train,
cv=kfold,
scoring=scoring,
return_estimator=True,
)
print(f"Best score: {scores['test_score'].mean()}")
return self.model
[docs]
def train(
self,
x_train: Union[pd.DataFrame, np.ndarray, torch.Tensor],
y_train: Union[pd.DataFrame, np.ndarray, torch.Tensor],
learning_rate: float = 0.005,
epochs: int = 1000,
loss_file: Optional[str] = None,
x_val: Optional[Union[pd.DataFrame, np.ndarray, torch.Tensor]] = None,
y_val: Optional[Union[pd.DataFrame, np.ndarray, torch.Tensor]] = None,
) -> Callable:
"""Perform training on the whole training set.
Args:
x_train (pd.DataFrame): x_train
y_train (pd.DataFrame): y_train
learning_rate (float, optional): learning rate. Defaults to 0.005.
epochs (int, optional): number of epochs. Defaults to 1000.
loss_file (str, optional): path to save the loss plot. Defaults to None.
x_val (pd.DataFrame, optional): validation set. Defaults to None.
y_val (pd.DataFrame, optional): validation target. Defaults to None.
Returns:
callable: trained model
"""
if self.model_name == "NeuralNetwork":
if not isinstance(x_train, np.ndarray):
x_train_copy = x_train.values
else:
x_train_copy = x_train
if not isinstance(y_train, np.ndarray):
y_train_copy = y_train.values
else:
y_train_copy = y_train
if self.task == "classification":
y_train_copy = y_train_copy.astype(np.longlong)
else:
y_train_copy = y_train_copy.astype(np.float32)
self.model = self.model.fit(
x_train_copy,
y_train_copy,
learning_rate=learning_rate,
epochs=epochs,
loss_file=loss_file,
x_val=x_val,
y_val=y_val,
)
else:
self.model.fit(x_train, y_train)
return self.model
[docs]
def get_metrics(self, x: pd.DataFrame, y: pd.DataFrame) -> dict:
"""Get metrics for the model. Accuracy and f1 score for
classification, mse and r2 score for regression.
Args:
x (pd.DataFrame): test set
y (pd.DataFrame): target
Raises:
Exception: Task must be either classification or regression
Returns:
dict: dictionary of metrics
"""
if self.model_name == "NeuralNetwork" and not isinstance(x, np.ndarray):
x_copy = x.values
y_copy = y.values.squeeze()
else:
x_copy = x
y_copy = y.squeeze()
with torch.no_grad():
pred = self.model.predict(x_copy)
pred = convert_to_numpy(pred)
if self.task == "classification":
metrics = {
"accuracy": accuracy_score(pred, y_copy),
"f1 score": f1_score(pred, y_copy, average="weighted"),
}
if self.task == "regression":
metrics = {
"mse": mean_squared_error(pred, y_copy),
"rmse": np.sqrt(mean_squared_error(pred, y_copy)),
"mape": mean_absolute_percentage_error(pred, y_copy),
"r2 score": r2_score(pred, y_copy),
}
return metrics
[docs]
def save_model(self, path: str):
"""Save the model"""
path = get_path(path, check_dir=True)
is_nn = self.model_name == "NeuralNetwork"
if is_nn:
torch.save(self.model.state_dict(), path)
else:
joblib.dump(self.model, path)
[docs]
def load_model(self, path: str):
"""Load the model"""
path = get_path(path)
is_nn = self.model_name == "NeuralNetwork"
if is_nn:
self.model.load_state_dict(torch.load(path))
self.model.eval()
self.model.to(self.device)
else:
self.model = joblib.load(path)
[docs]
def test_all_models(
task: str,
x_train: pd.DataFrame,
x_test: pd.DataFrame,
y_train: pd.DataFrame,
y_test: pd.DataFrame,
params_dict: Optional[dict] = None,
) -> None:
"""Train and test all models on the whole training set
Args:
task (str): "classification" or "regression"
x_train (pd.DataFrame): train set
x_test (pd.DataFrame): test set
y_train (pd.DataFrame): train target
y_test (pd.DataFrame): test target
params_dict (dict, optional): parameters for each model. Defaults to None.
"""
params_dict = {} if params_dict is None else params_dict
for model_name in params_dict.keys():
print(f"Testing model: {model_name}")
model = Trainer(model_name, task, params_dict[model_name])
model.train(x_train, y_train)
with torch.no_grad():
pred = model.model.predict(x_test)
if task == "classification":
print(f"Accuracy: {accuracy_score(pred, y_test)}")
print(f"F1 score: \n{f1_score(pred, y_test, average=None)}")
print("\n")
elif task == "regression":
print(f"MSE: {mean_squared_error(pred, y_test)}")
print(f"R2 Score: {r2_score(pred, y_test)}")
print("\n")
[docs]
def grid_search_all_models(
x_train: pd.DataFrame,
y_train: pd.DataFrame,
task: str,
params_dict: Optional[dict] = None,
params_grid_dict: Optional[dict] = None,
scoring: Optional[str] = None,
kfold: Union[int, KFold] = 5,
search_type: str = "grid",
) -> Tuple[dict, dict]:
"""Grid search for all models
Args:
x_train (pd.DataFrame): x_train
y_train (pd.DataFrame): y_train
task (str): "classification" or "regression"
params_dict (dict, optional): parameters for each model. Defaults to None.
params_grid_dict (dict, optional): grid search parameters for each model.
Defaults to None.
scoring (str, optional): scoring metric. Defaults to None.
kfold (Union[int, KFold], optional): kfold object. Defaults to 5.
search_type (str, optional): "grid" or "random". Defaults to "grid".
Returns:
Tuple[dict, dict]: best models and best parameters
"""
best_models = {}
best_params = {}
params_dict = {} if params_dict is None else params_dict
params_grid_dict = {} if params_grid_dict is None else params_grid_dict
if params_dict == {}:
params_dict = params_grid_dict.copy()
for model_name in params_dict.keys():
model = Trainer(model_name, task, params_dict[model_name])
model, params = model.cross_val(
x_train,
y_train,
param_grid=params_grid_dict[model_name],
scoring=scoring,
kfold=kfold,
search_type=search_type,
)
best_models[model_name] = model
best_params[model_name] = params
print(f"Best params for {model_name}: {params}")
return best_models, best_params