Group 30: Datatouille
Since this process is very time-consuming, it is run once and the results are saved so they can be imported later.
def get_hiper_params():
    # Wrapped in a function so the tuned hyperparameters can be imported from other notebooks
    return {
        'decision_tree': {'criterion': 'gini',
                          'max_features': 0.30000000000000004,
                          'max_depth': 8.75,
                          'min_samples_split': 0.1,
                          'min_samples_leaf': 0.0001},
        'random_forest': {'n_estimators': 200,
                          'criterion': 'entropy',
                          'max_features': 0.2,
                          'max_depth': 16.5,
                          'min_samples_split': 0.1,
                          'min_samples_leaf': 0.1},
        # AUC: 0.8695
        'xgboost': {'learning_rate': 0.1,
                    'objective': 'binary:logistic',
                    'n_estimators': 16,
                    'scale_pos_weight': 2,
                    'max_depth': 4,
                    'min_child_weight': 5,
                    'gamma': 0.0,
                    'colsample_bytree': 0.7500000000000001,
                    'subsample': 0.7,
                    'colsample_bylevel': 0.65},
        'knn': {'n_neighbors': 21, 'weights': 'uniform', 'n_jobs': -1},
        # AUC: 0.8700
        'lightgbm': {'objective': 'binary',
                     'num_leaves': 36,
                     'n_estimators': 70,
                     'min_split_gain': 0.01,
                     'min_child_weight': 5.00001,
                     'max_depth': 4,
                     'learning_rate': 0.05,
                     'lambda_l2': 0,
                     'feature_fraction': 0.7000000000000001,
                     'bagging_fraction': 1.0},
        'catboost': {'eval_metric': 'AUC',
                     'iterations': 678,
                     'random_strength': 42,
                     'learning_rate': 0.01,
                     'depth': 1,
                     'l2_leaf_reg': 2},
        'gradient_boosting': {'max_leaf_nodes': None,
                              'min_weight_fraction_leaf': 0,
                              'learning_rate': 0.1,
                              'max_features': 1,
                              'min_samples_split': 1.0,
                              'min_samples_leaf': 0.1,
                              'max_depth': 1.0,
                              'n_estimators': 1,
                              'subsample': 0.8,
                              'loss': 'deviance',
                              'warm_start': False,
                              'presort': 'auto'},
        'neuralnetwork': {'hidden_layer_sizes': (4, 4),
                          'activation': 'relu',
                          'alpha': 0.0001,
                          'beta_1': 0.05,
                          'beta_2': 0.86,
                          'early_stopping': False,
                          'epsilon': 1e-08,
                          'learning_rate': 'constant',
                          'solver': 'adam',
                          'validation_fraction': 0.15}
    }
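A minimal sketch of how another notebook can reuse these saved hyperparameters (the module name hyperparameter_search is hypothetical; it should match this notebook's actual filename):
import nbimporter  # lets Python import .ipynb notebooks as regular modules
from hyperparameter_search import get_hiper_params  # hypothetical notebook name
lightgbm_params = get_hiper_params()['lightgbm']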
import nbimporter # pip install nbimporter
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import submission_framework as SF  # project helper notebook, importable thanks to nbimporter above
df_users = pd.read_csv('data/user-features.csv',low_memory=False).set_index('person')
df_y = pd.read_csv('data/labels_training_set.csv').groupby('person').sum()
columnas_a_mano = [ 'total_checkouts',
'total_conversions',
'total_events',
'total_sessions',
'total_session_checkout',
'total_session_conversion',
'total_events_ad_session',
'total_ad_sessions',
'avg_events_per_session',
'avg_events_per_ad_session',
'percentage_session_ad',
'has_checkout',
'has_conversion',
'total_viewed_products_month_1',
'total_checkouts_month_1',
'total_conversions_month_1',
'total_events_month_1',
'total_sessions_month_1',
'total_session_checkouts_month_1',
'total_session_conversions_month_1',
'total_events_ad_session_month_1',
'total_ad_sessions_month_1',
'has_checkout_month_1',
'has_conversion_month_1',
'total_viewed_products_month_2',
'total_checkouts_month_2',
'total_conversions_month_2',
'total_events_month_2',
'total_sessions_month_2',
'total_session_checkouts_month_2',
'total_session_conversions_month_2',
'total_events_ad_session_month_2',
'total_ad_sessions_month_2',
'has_checkout_month_2',
'has_conversion_month_2',
'total_viewed_products_month_3',
'total_checkouts_month_3',
'total_conversions_month_3',
'total_events_month_3',
'total_sessions_month_3',
'total_session_checkouts_month_3',
'total_session_conversions_month_3',
'total_events_ad_session_month_3',
'total_ad_sessions_month_3',
'has_checkout_month_3',
'has_conversion_month_3',
'total_viewed_products_month_4',
'total_checkouts_month_4',
'total_conversions_month_4',
'total_events_month_4',
'total_sessions_month_4',
'total_session_checkouts_month_4',
'total_session_conversions_month_4',
'total_events_ad_session_month_4',
'total_ad_sessions_month_4',
'has_checkout_month_4',
'has_conversion_month_4',
'total_viewed_products_month_5',
'total_checkouts_month_5',
'total_conversions_month_5',
'total_events_month_5',
'total_sessions_month_5',
'total_session_checkouts_month_5',
'total_session_conversions_month_5',
'total_events_ad_session_month_5',
'total_ad_sessions_month_5',
'has_checkout_month_5',
'has_conversion_month_5',
'total_viewed_products_months_1_to_4',
'total_checkouts_months_1_to_4',
'total_conversions_months_1_to_4',
'total_events_months_1_to_4',
'total_sessions_months_1_to_4',
'total_session_checkouts_months_1_to_4',
'total_session_conversions_months_1_to_4',
'total_events_ad_session_months_1_to_4',
'total_ad_sessions_months_1_to_4',
'has_checkout_months_1_to_4',
'has_conversion_months_1_to_4',
'total_viewed_products_lw',
'total_checkouts_lw',
'total_conversions_lw',
'total_events_lw',
'total_sessions_lw',
'total_session_checkouts_lw',
'total_session_conversions_lw',
'total_events_ad_session_lw',
'total_ad_sessions_lw',
'has_checkout_lw',
'has_conversion_lw',
'amount_of_months_that_has_bought',
'timestamp_last_event',
'timestamp_last_checkout',
'timestamp_last_conversion',
'timestamp_last_viewed_product',
'days_to_last_event',
'days_to_last_checkout',
'days_to_last_conversion',
'days_to_last_viewed_product',
'doy_last_event',
'dow_last_event',
'dom_last_event',
'woy_last_event',
'doy_last_checkout',
'dow_last_checkout',
'dom_last_checkout',
'woy_last_checkout',
'doy_last_conversion',
'dow_last_conversion',
'dom_last_conversion',
'woy_last_conversion',
'doy_last_viewed_product',
'dow_last_viewed_product',
'dom_last_viewed_product',
'woy_last_viewed_product',
'last_conversion_sku',
'last_conversion_price',
'percentage_last_week_activity',
'percentage_last_month_activity',
'days_between_last_event_and_checkout',
'percentage_regular_celphones_activity',
'var_viewed',
'conversion_gt_media'
]
def find_best_params(df_x, df_y, orig_model_with_name, default_params, list_of_progressive_params,
                     columns=None, seed=0, cv=5, normalize=False):
    """Finds the best hyperparameters with grid search, but sequentially.
    That is, instead of trying 3 variables with 5 values each (125 models per CV fold), we can first
    search two variables with 5 values each (25 models) and then, fixing the best of those, search
    the remaining 5 values (30 models in total).
    It is certainly not optimal, but it saves valuable time."""
    orig_model_name, orig_model = orig_model_with_name
    orig_model_name += '_GSF'
    acc_params = {}
    for params_grid in list_of_progressive_params:
        print(f"Best Params So Far: {default_params} {acc_params}\n\n")
        init_params = dict(default_params, **acc_params)
        # Only pass random_state when the estimator advertises it via get_params()
        # (KNeighborsClassifier, for example, does not accept it)
        if 'random_state' in orig_model().get_params():
            init_params['random_state'] = seed
        model_new = GridSearchCV(orig_model(**init_params), params_grid, cv=cv, verbose=1,
                                 scoring='roc_auc', n_jobs=4)
        model_with_name = (orig_model_name, model_new)
        model, auc = SF.full_framework_wrapper(df_x, df_y, model_with_name, columns=columns, normalize=normalize)
        acc_params.update(model.best_params_)
    default_params.update(acc_params)
    return default_params
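For reference, a rough count of fits for the example in the docstring (3 hyperparameters, 5 candidate values each, 5-fold CV):
full_grid_fits = 5 ** 3 * 5          # 625 fits for one exhaustive grid search
progressive_fits = (5 * 5 + 5) * 5   # 150 fits for the two-stage progressive search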
def find_best_params_random(df_x, df_y, orig_model_with_name, default_params, params_grid,
                            n_iter=15, columns=None, seed=0, cv=5):
    orig_model_name, orig_model = orig_model_with_name
    orig_model_name += '_RS'
    model_new = RandomizedSearchCV(orig_model(**default_params, random_state=seed), params_grid,
                                   n_iter=n_iter, cv=cv, verbose=1, scoring='roc_auc', n_jobs=4)
    model_with_name = (orig_model_name, model_new)
    model, auc = SF.full_framework_wrapper(df_x, df_y, model_with_name, columns=columns)
    return model.best_params_
def find_best_params_gridsearch(df_x, df_y, orig_model_with_name, default_params, params_grid,
                                columns=None, seed=42, cv=5):
    orig_model_name, orig_model = orig_model_with_name
    orig_model_name += '_GS'
    model_new = GridSearchCV(orig_model(**default_params), params_grid, cv=cv, verbose=1,
                             scoring='roc_auc', n_jobs=4)
    model_with_name = (orig_model_name, model_new)
    model, auc = SF.full_framework_wrapper(df_x, df_y, model_with_name, columns=columns)
    return model.best_params_
from sklearn.tree import DecisionTreeClassifier, export_graphviz
list_of_progressive_params = [{'criterion':['gini','entropy']},
{'max_features': np.arange(0.1,0.8,0.1)},
{'max_depth': np.linspace(1, 32, 5, endpoint=True)},
{'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True)},
{'min_samples_leaf': np.arange(0.0001,0.5,0.1)}
]
model_with_name = ('decision_tree', DecisionTreeClassifier)
best_params_decision_tree = find_best_params(df_users,df_y,model_with_name, {},list_of_progressive_params, columns=columnas_a_mano)
best_params_decision_tree
list_of_progressive_params = {'criterion':['gini','entropy'],
'max_features': np.arange(0.1,0.8,0.1),
'max_depth': np.linspace(1, 32, 5, endpoint=True),
'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
'min_samples_leaf': np.arange(0.0001,0.5,0.1)
}
model_with_name = ('decision_tree', DecisionTreeClassifier)
best_params_decision_tree = find_best_params_random(df_users,df_y,model_with_name, {}, list_of_progressive_params, n_iter=100, columns=columnas_a_mano)
best_params_decision_tree
list_of_progressive_params = {'criterion':['gini','entropy'],
'max_features': np.arange(0.1,0.8,0.1),
'max_depth': np.linspace(1, 32, 5, endpoint=True),
'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
'min_samples_leaf': np.arange(0.0001,0.5,0.1)
}
model_with_name = ('decision_tree', DecisionTreeClassifier)
best_params_decision_tree = find_best_params_gridsearch(df_users,df_y,model_with_name, {}, list_of_progressive_params, columns=columnas_a_mano)
best_params_decision_tree
from sklearn.ensemble import RandomForestClassifier
list_of_progressive_params = [{'n_estimators':[1, 2, 4, 8, 16, 32, 64, 100, 200]},
{'criterion':['gini','entropy']},
{'max_features': np.arange(0.1,0.4,0.1)},
{'max_depth': np.linspace(1, 32, 3, endpoint=True)},
{'min_samples_split': np.arange(0.1, 1.0, 0.1)},
{'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True)}
]
model_with_name = ('random_forest', RandomForestClassifier)
best_params_random_forest = find_best_params(df_users,df_y,model_with_name, {},list_of_progressive_params, columns=columnas_a_mano)
best_params_random_forest
import xgboost as xgb #conda install -c conda-forge xgboost
# Please keep this: it scored very well and is not easily reproducible
"""
0.8695
{'learning_rate': 0.1,
'objective': 'binary:logistic',
'n_estimators': 16,
'scale_pos_weight': 2,
'max_depth': 4,
'min_child_weight': 5,
'gamma': 0.0,
'colsample_bytree': 0.7500000000000001,
'subsample': 0.7,
'colsample_bylevel': 0.65}
"""
list_of_progressive_params = [
{'objective': ['binary:logistic'],'learning_rate':np.arange(0.1,0.5,0.1)},
{'n_estimators':np.arange(16,116,15)},
{'scale_pos_weight':np.arange(2,6,1)},
{'max_depth':np.arange(4,12,1),'min_child_weight':np.arange(1,10,1)},
{'gamma':np.arange(0,0.5,0.1)},
{'subsample':np.arange(0.6,1,0.1),'colsample_bytree':np.arange(0.6,0.91,0.05)},
{'colsample_bylevel':np.arange(0.6,0.91,0.05)}#,
# {'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]} # This made results much worse, and Luis said it was not important
]
model_with_name = ('xgboost', xgb.XGBClassifier)
best_params_xgboost = find_best_params(df_users,df_y,model_with_name,{}, list_of_progressive_params, columns=columnas_a_mano)
best_params_xgboost
list_of_progressive_params = [
{'objective': ['binary:logistic','reg:linear'],'learning_rate':np.arange(0.1,0.5,0.1)},
{'n_estimators':np.arange(16,116,15)},
{'scale_pos_weight':np.arange(2,6,1)},
{'max_depth':np.arange(4,12,1),'min_child_weight':np.arange(1,10,1)},
{'gamma':np.arange(0,0.5,0.1)},
{'subsample':np.arange(0.6,1,0.1),'colsample_bytree':np.arange(0.6,0.91,0.05)},
{'colsample_bylevel':np.arange(0.6,0.91,0.05)}#,
# {'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]} # This made results much worse, and Luis said it was not important
]
model_with_name = ('xgboost', xgb.XGBClassifier)
# Note: df_users_norm is the normalized feature matrix (see the MinMaxScaler snippet further below)
best_params_xgboost = find_best_params(df_users_norm, df_y, model_with_name, {}, list_of_progressive_params, columns=columnas_a_mano)
best_params_xgboost
from sklearn.neighbors import KNeighborsClassifier
list_of_progressive_params = [
{'n_neighbors': np.arange(1,30)},
{'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']},
{'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']}
]
model_with_name = ('knn', KNeighborsClassifier)
best_params_knn = find_best_params(df_users, df_y, model_with_name, {},list_of_progressive_params, seed=-1, normalize=True)
best_params_knn
import lightgbm as lgb #conda install -c conda-forge lightgbm
"""
0.8688
{'objective': 'binary',
'learning_rate': 0.01,
'n_estimators': 190,
'num_leaves': 27,
'feature_fraction': 0.9000000000000001,
'bagging_fraction': 0.8,
'max_depth': 4,
'lambda_l2': 2,
'min_split_gain': 0.01,
'min_child_weight': 10.00001}
"""
list_of_progressive_params = [{'objective':['binary']},
{'learning_rate':[0.005,0.01,0.05,0.1,0.3]},
{'n_estimators':np.arange(25,200,15)},
{'num_leaves': np.arange(24, 45,3)},
{'feature_fraction': np.arange(0.1, 0.91, 0.2)},
{'bagging_fraction': np.arange(0.8, 1.01, 0.1)},
{'max_depth': np.arange(3, 12, 1)},
#{'lambda_l1': np.arange(0, 5)}, # This hurt the score a lot
{'lambda_l2': np.arange(0, 3)},
{'min_split_gain': [0.001, 0.01, 0.1]},
{'min_child_weight': [1e-05]+np.arange(5, 11)}  # note: list + array broadcasts (adds 1e-05 elementwise), giving 5.00001 ... 10.00001 rather than appending 1e-05
]
model_with_name = ('lightgbm', lgb.LGBMClassifier)
best_params_lightgbm= find_best_params(df_users,df_y,model_with_name,{}, list_of_progressive_params)
best_params_lightgbm
list_of_progressive_params = {'objective':['binary'],
'learning_rate':[0.005,0.01,0.05,0.1,0.3],
'n_estimators':np.arange(25,200,15),
'num_leaves': np.arange(24, 45,3),
'feature_fraction': np.arange(0.1, 0.91, 0.2),
'bagging_fraction': np.arange(0.8, 1.01, 0.1),
'max_depth': np.arange(3, 12, 1),
'lambda_l2': np.arange(0, 3),
'min_split_gain': [0.001, 0.01, 0.1],
'min_child_weight': [1e-05]+np.arange(5, 11)  # note: list + array broadcasts (adds 1e-05 elementwise), giving 5.00001 ... 10.00001 rather than appending 1e-05
}
model_with_name = ('lightgbm', lgb.LGBMClassifier)
best_params_lightgbm_random = find_best_params_random(df_users,df_y,model_with_name, {}, list_of_progressive_params, n_iter=100)
best_params_lightgbm_random
# Last best parameters
# AUC: 0.0700 with the full dataset
# AUC: 0.8711 with the hand-picked columns
params = {'objective': 'binary',
'num_leaves': 36,
'n_estimators': 70,
'min_split_gain': 0.01,
'min_child_weight': 5.00001,
'max_depth': 4,
'learning_rate': 0.05,
'lambda_l2': 0,
'feature_fraction': 0.7000000000000001,
'bagging_fraction': 1.0}
model_with_name = ('lightgbm', lgb.LGBMClassifier(**params))
SF.full_framework_wrapper(df_users, df_y, model_with_name)
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
list_of_progressive_params = [{'hidden_layer_sizes':[(4,7), (4,4), (4,3,2)]},
{'activation':['relu', 'logistic']},
{'alpha':[1e-06,1e-05,1e-04,1e-03,1e-02,1e-01,1]},
{'beta_1':[0.7,0.91,0.05]},   # literal candidate values (0.05 is a candidate, not a step size)
{'beta_2':[0.75, 0.86, 0.05]},
{'early_stopping':[False]},
{'epsilon':[1e-07,1e-08]},
{'learning_rate':['constant', 'adaptive']},
{'solver':['adam', 'lbfgs']},
{'validation_fraction':np.arange(0.15,0.26,0.05)}
]
model_with_name = ('neuralnetwork', MLPClassifier)
"""
min_max_scaler = preprocessing.MinMaxScaler()
df_users_norm = pd.DataFrame(min_max_scaler.fit_transform(df_users.values))
df_users_norm.columns = df_users.columns
df_users_norm.index = df_users.index
"""
best_params_neuralnetwork = find_best_params(df_users, df_y, model_with_name, {}, list_of_progressive_params, normalize=True)
best_params_neuralnetwork
import catboost as cb #conda install -c conda-forge catboost
list_of_progressive_params = [{'random_strength':[42],'eval_metric':['AUC'],'iterations': [80, 100,256,465,678,1000]},
{'learning_rate':[0.01,0.05,0.1,0.3]},
{'depth':np.arange(1,12,1)},
{'l2_leaf_reg':np.arange(2,10,1)},
]
model_with_name = ('catboost', cb.CatBoostClassifier)
best_params_catboost = find_best_params(df_users,df_y,model_with_name,{'verbose':True}, list_of_progressive_params, cv=2,columns=columnas_a_mano)
best_params_catboost
from sklearn.ensemble import GradientBoostingClassifier as GBC
list_of_progressive_params = [
{'max_leaf_nodes': [None]},
{'min_weight_fraction_leaf': [0]},
{'learning_rate': [0.1]},
{'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True)},
{'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True)},
{'max_features' : list(range(1,len(columnas_a_mano)))},
{'max_depth': np.linspace(1, 32, 32, endpoint=True)},
{'n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 200]},
{'subsample': np.arange(0.8, 1)},  # note: with the default step of 1 this grid only contains 0.8
{'loss': ['deviance']},
{'warm_start': [False]},
{'presort': ['auto']}
]
model_with_name = ('gradient_boosting', GBC)
best_params_boosting = find_best_params(df_users, df_y, model_with_name, {}, list_of_progressive_params, columns=columnas_a_mano)
best_params_boosting
GridSearchCV and RandomizedSearchCV not only find the best hyperparameters, they also cross-validate the models, which is why it is worth keeping the fitted model once the run finishes instead of necessarily training it again.
import pickle
import lightgbm as lgb #conda install -c conda-forge lightgbm
def find_best_params_random(df_x, df_y, orig_model_with_name, default_params, params_grid,
                            n_iter=15, columns=None, seed=0, cv=5):
    # Redefined here to return the fitted RandomizedSearchCV object itself (not just best_params_),
    # so the cross-validated model can be pickled and reused without retraining.
    orig_model_name, orig_model = orig_model_with_name
    orig_model_name += '_RS'
    model_new = RandomizedSearchCV(orig_model(**default_params, random_state=seed), params_grid,
                                   n_iter=n_iter, cv=cv, verbose=1, scoring='roc_auc', n_jobs=4)
    model_with_name = (orig_model_name, model_new)
    model, auc = SF.full_framework_wrapper(df_x, df_y, model_with_name, columns=columns)
    return model
list_of_progressive_params = {'n_estimators':np.arange(25,200,15),
'max_depth': np.arange(3, 12, 1)}
model_with_name = ('lightgbm', lgb.LGBMClassifier)
best_model_lightgbm = find_best_params_random(df_users,df_y,model_with_name,{}, list_of_progressive_params)
best_model_lightgbm
# Save it:
with open('lightgbm-rg-2.0-8718.pickle.dat', 'wb') as model_file:
    pickle.dump(best_model_lightgbm, model_file)
# Load it:
with open('lightgbm-rg-2.0-8718.pickle.dat', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
loaded_model
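A quick sketch of how the reloaded search object can be used directly (assuming df_users contains the same feature columns used during the search; the fitted RandomizedSearchCV refits its best estimator by default, so it predicts without retraining):
print(loaded_model.best_params_, loaded_model.best_score_)
probabilities = loaded_model.predict_proba(df_users)[:, 1]  # estimated probability of the positive label per user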