# Source code for src.train

import pandas as pd
import os
import time
import pickle

from sklearn.preprocessing import MinMaxScaler

from helper import set_all_seeds
from utils import split_dataset, compute_metrics, pkl_to_df, cross_entropy
from constants import (
    PATH_TO_DATA,
    PATH_TO_DATA_TEST,
    PATH_TO_DATA_TRAIN,
    PATH_TO_DATA_VAL,
    alt_spec_features,
)
from models_wrapper import RUMBoost, TasteNet, GBDT, DNN
from rumboost.datasets import load_preprocess_LPMC


def train(args):
    """
    Train the specified model.

    Loads the dataset selected by ``args.dataset``, min-max scales the
    features (scaler fitted on the training split only), builds the model
    selected by ``args.model``, fits it and evaluates it on the test split.
    When ``args.save_model`` is set, the results CSV and the fitted model are
    written under ``args.outpath``.

    The run is skipped entirely when the results CSV for this
    dataset/model/functional configuration already exists in ``args.outpath``.

    Parameters
    ----------
    args : object with attributes (e.g. an ``argparse.Namespace``)
        Attributes read here: ``outpath``, ``dataset`` ("SwissMetro",
        "easySHARE" or "LPMC"), ``model`` ("RUMBoost", "TasteNet", "GBDT" or
        "DNN"), ``functional_intercept``, ``functional_params``, ``seed``,
        ``optimal_hyperparams``, ``save_model`` and, for easySHARE without
        optimal hyperparameters, ``train_size`` and ``val_size``. The object
        is also forwarded to the model wrappers.

        ``args`` is mutated: ``outpath`` gets a default when empty, and when
        optimal hyperparameters are loaded every key of the pickled dict is
        copied onto ``args`` together with the iteration/epoch settings
        derived from ``best_iteration``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``args.dataset`` or ``args.model`` is not one of the supported
        names, or if the LPMC cross-validation folds are empty when a
        validation split is required.
    """
    if not args.outpath:
        args.outpath = f"results/{args.dataset}/{args.model}/"

    def out_file(stem, ext):
        # Built at call time so the name always reflects the current values
        # of args.outpath / functional_intercept / functional_params.
        return (
            args.outpath
            + f"{stem}_fi{args.functional_intercept}_fp{args.functional_params}{ext}"
        )

    # create the output directory if it does not exist
    os.makedirs(args.outpath, exist_ok=True)

    if os.path.exists(out_file("results_dict", ".csv")):
        print(
            f"Results for dataset {args.dataset}, model {args.model}, "
            f"func_int {args.functional_intercept}, "
            f"func_params {args.functional_params} already exist. Skipping..."
        )
        return

    # set the random seed for reproducibility
    set_all_seeds(args.seed)

    # flatten the per-alternative feature lists (duplicates are kept on purpose,
    # the GBDT / DNN branches de-duplicate explicitly)
    all_alt_spec_features = [
        feature
        for alt_features in alt_spec_features[args.dataset].values()
        for feature in alt_features
    ]

    # load the data
    if args.dataset == "SwissMetro":
        data_train = pkl_to_df(PATH_TO_DATA_TRAIN[args.dataset])
        data_val = pkl_to_df(PATH_TO_DATA_VAL[args.dataset])
        data_test = pkl_to_df(PATH_TO_DATA_TEST[args.dataset])
        if args.optimal_hyperparams:
            # final training uses train + validation together
            data_train = pd.concat([data_train, data_val], axis=0)
        columns = data_train.columns
        features = [col for col in columns if col not in ["CHOICE"]]
        target = "CHOICE"
        socio_demo_chars = [
            col
            for col in columns
            if col not in all_alt_spec_features and col not in ["CHOICE"]
        ]
        num_classes = 3
    elif args.dataset == "easySHARE":
        if args.optimal_hyperparams:
            data_train = pd.read_csv(PATH_TO_DATA_TRAIN[args.dataset])
            data_test = pd.read_csv(PATH_TO_DATA_TEST[args.dataset])
            columns = data_train.columns
        else:
            # single file, split into train / val / test further below
            data = pd.read_csv(PATH_TO_DATA[args.dataset])
            columns = data.columns
        non_feature_cols = ["mergeid", "hhid", "coupleid", "depression_scale"]
        features = [col for col in columns if col not in non_feature_cols]
        target = "depression_scale"
        socio_demo_chars = [
            col
            for col in columns
            if col not in all_alt_spec_features and col not in non_feature_cols
        ]
        num_classes = 13
    elif args.dataset == "LPMC":
        data_train, data_test, folds = load_preprocess_LPMC(PATH_TO_DATA[args.dataset])
        if not args.optimal_hyperparams:
            # Fold indices are positional indices into the FULL training frame,
            # so always index the untouched frame rather than an already
            # reduced one.
            full_train = data_train
            data_val = None
            for i, (train_idx, val_idx) in enumerate(folds):
                # NOTE(review): this guard visits folds 0 and 1, so the split
                # of fold 1 is the one kept when it exists - confirm whether
                # the first fold was intended instead.
                if i > 1:
                    break
                data_val = full_train.iloc[val_idx]
                data_train = full_train.iloc[train_idx]
            if data_val is None:
                raise ValueError(
                    "LPMC folds are empty; cannot build a validation split."
                )
        non_feature_cols = ["choice", "household_id"]
        features = [col for col in data_train.columns if col not in non_feature_cols]
        target = "choice"
        socio_demo_chars = [
            col
            for col in data_train.columns
            if col not in all_alt_spec_features and col not in non_feature_cols
        ]
        num_classes = 4
    else:
        raise ValueError(
            f"Unknown dataset {args.dataset!r}; "
            "expected 'SwissMetro', 'easySHARE' or 'LPMC'."
        )

    # split data
    # Explicit copies: the scaling step below assigns into these frames, which
    # must not be (or be flagged as) views of the loaded data.
    if args.optimal_hyperparams:
        X_train, y_train = data_train[features].copy(), data_train[target]
        X_test, y_test = data_test[features].copy(), data_test[target]
        X_val, y_val = None, None
    elif args.dataset == "easySHARE":
        X_train, y_train, X_val, y_val, X_test, y_test = split_dataset(
            data,
            target,
            features,
            train_size=args.train_size,
            val_size=args.val_size,
            groups=data["hhid"],
            random_state=args.seed,
        )
    else:
        X_train, y_train = data_train[features].copy(), data_train[target]
        X_test, y_test = data_test[features].copy(), data_test[target]
        X_val, y_val = data_val[features].copy(), data_val[target]

    # scale the features: fit on train only, then apply to test (and val)
    scaled_cols = all_alt_spec_features + socio_demo_chars
    scaler = MinMaxScaler()
    X_train[scaled_cols] = scaler.fit_transform(X_train[scaled_cols])
    X_test[scaled_cols] = scaler.transform(X_test[scaled_cols])
    if not args.optimal_hyperparams:
        X_val[scaled_cols] = scaler.transform(X_val[scaled_cols])

    optimal_hyperparams = None
    if args.optimal_hyperparams:
        # load the optimal hyperparameters for the model
        try:
            opt_hyperparams_path = f"results/{args.dataset}/{args.model}/best_params_fi{args.functional_intercept}_fp{args.functional_params}.pkl"
            # NOTE(security): pickle.load executes arbitrary code from the
            # file; only load hyperparameter files produced by this project.
            with open(opt_hyperparams_path, "rb") as f:
                optimal_hyperparams = pickle.load(f)
            if "layer_sizes" in optimal_hyperparams:
                # stored as a comma-separated string, e.g. "64,32"
                optimal_hyperparams["layer_sizes"] = [
                    int(size) for size in optimal_hyperparams["layer_sizes"].split(",")
                ]
            if "learning_rate" not in optimal_hyperparams:
                optimal_hyperparams["learning_rate"] = 1
            # overwrite the command-line values with the tuned ones
            vars(args).update(optimal_hyperparams)
        except FileNotFoundError:
            print(
                f"Optimal hyperparameters not found for {args.model}. Using default hyperparameters."
            )
            optimal_hyperparams = None

    # True only when tuned hyperparameters were requested AND actually loaded
    use_tuned = bool(args.optimal_hyperparams and optimal_hyperparams)

    if args.model == "RUMBoost":
        if use_tuned:
            # train for exactly the tuned number of iterations, no early stop
            args.num_iterations = int(optimal_hyperparams["best_iteration"])
            args.early_stopping_rounds = None
        model = RUMBoost(
            alt_spec_features=alt_spec_features[args.dataset],
            socio_demo_chars=socio_demo_chars,
            num_classes=num_classes,
            args=args,
        )
        save_path = out_file("model", ".json")
    elif args.model == "TasteNet":
        if use_tuned:
            # patience == num_epochs effectively disables early stopping
            args.num_epochs = int(optimal_hyperparams["best_iteration"])
            args.patience = args.num_epochs
        model = TasteNet(
            alt_spec_features=alt_spec_features[args.dataset],
            socio_demo_chars=socio_demo_chars,
            num_classes=num_classes,
            num_latent_vals=1 if args.dataset == "easySHARE" else None,
            args=args,
        )
        save_path = out_file("model", ".pth")
    elif args.model == "GBDT":
        if use_tuned:
            args.num_iterations = int(optimal_hyperparams["best_iteration"])
            if args.num_iterations == 0:
                args.num_iterations = 3000  # falls back to default
            args.early_stopping_rounds = None
        if args.functional_intercept:
            features = list(set(all_alt_spec_features)) + socio_demo_chars
        else:
            features = list(set(all_alt_spec_features))
        model = GBDT(
            alt_spec_features=features,
            socio_demo_chars=socio_demo_chars,
            num_classes=num_classes,
            args=args,
        )
        save_path = out_file("model", ".pkl")
    elif args.model == "DNN":
        if use_tuned:
            # patience == num_epochs effectively disables early stopping
            args.num_epochs = int(optimal_hyperparams["best_iteration"])
            args.patience = args.num_epochs
        if args.functional_intercept:
            features = list(set(all_alt_spec_features)) + socio_demo_chars
        else:
            features = list(set(all_alt_spec_features))
        print(f"Features: {features}")
        model = DNN(
            alt_spec_features=features,
            socio_demo_chars=socio_demo_chars,
            num_classes=num_classes,
            args=args,
        )
        save_path = out_file("model", ".pth")
    else:
        raise ValueError(
            f"Unknown model {args.model!r}; "
            "expected 'RUMBoost', 'TasteNet', 'GBDT' or 'DNN'."
        )

    model.build_dataloader(X_train, y_train, X_val, y_val)

    # fit the model
    start_time = time.time()
    best_train_loss, best_val_loss = model.fit()
    end_time = time.time()

    # test the model
    preds, binary_preds, labels = model.predict(X_test)
    if args.dataset == "easySHARE":
        mae_test, loss_test, emae_test = compute_metrics(
            preds, binary_preds, labels, y_test
        )
    else:
        # only the cross-entropy is computed for the other datasets;
        # the MAE-type metrics are reported as 0
        loss_test = cross_entropy(y_test, preds)
        mae_test = 0
        emae_test = 0

    print(f"Best Train Loss: {best_train_loss}, Best Val Loss: {best_val_loss}")
    print(f"Test MAE: {mae_test}, Test Loss: {loss_test}, Test EMAE: {emae_test}")

    results_dict = {
        "train_loss": best_train_loss,
        "val_loss": best_val_loss,
        "train_time": end_time - start_time,
        "mae_test": mae_test,
        "loss_test": loss_test,
        "emae_test": emae_test,
    }

    if args.save_model:
        # save the results
        pd.DataFrame(results_dict, index=[0]).to_csv(out_file("results_dict", ".csv"))
        model.save_model(save_path)