Group 30: Datatouille
Since this process is very time-consuming, it is run once and the results are saved so they can be imported later.
def get_hiper_params():
    # Wrapped in a function so the tuned hyperparameters can be imported from other notebooks
    return {
        'decision_tree': {'criterion': 'gini',
                          'max_features': 0.30000000000000004,
                          'max_depth': 8.75,
                          'min_samples_split': 0.1,
                          'min_samples_leaf': 0.0001},
        'random_forest': {'n_estimators': 200,
                          'criterion': 'entropy',
                          'max_features': 0.2,
                          'max_depth': 16.5,
                          'min_samples_split': 0.1,
                          'min_samples_leaf': 0.1},
        # AUC: 0.8695
        'xgboost': {'learning_rate': 0.1,
                    'objective': 'binary:logistic',
                    'n_estimators': 16,
                    'scale_pos_weight': 2,
                    'max_depth': 4,
                    'min_child_weight': 5,
                    'gamma': 0.0,
                    'colsample_bytree': 0.7500000000000001,
                    'subsample': 0.7,
                    'colsample_bylevel': 0.65},
        'knn': {'n_neighbors': 21, 'weights': 'uniform', 'n_jobs': -1},
        # AUC: 0.8700
        'lightgbm': {'objective': 'binary',
                     'num_leaves': 36,
                     'n_estimators': 70,
                     'min_split_gain': 0.01,
                     'min_child_weight': 5.00001,
                     'max_depth': 4,
                     'learning_rate': 0.05,
                     'lambda_l2': 0,
                     'feature_fraction': 0.7000000000000001,
                     'bagging_fraction': 1.0},
        'catboost': {'eval_metric': 'AUC',
                     'iterations': 678,
                     'random_strength': 42,
                     'learning_rate': 0.01,
                     'depth': 1,
                     'l2_leaf_reg': 2},
        'gradient_boosting': {'max_leaf_nodes': None,
                              'min_weight_fraction_leaf': 0,
                              'learning_rate': 0.1,
                              'max_features': 1,
                              'min_samples_split': 1.0,
                              'min_samples_leaf': 0.1,
                              'max_depth': 1.0,
                              'n_estimators': 1,
                              'subsample': 0.8,
                              'loss': 'deviance',
                              'warm_start': False,
                              'presort': 'auto'},
        'neuralnetwork': {'hidden_layer_sizes': (4, 4),
                          'activation': 'relu',
                          'alpha': 0.0001,
                          'beta_1': 0.05,
                          'beta_2': 0.86,
                          'early_stopping': False,
                          'epsilon': 1e-08,
                          'learning_rate': 'constant',
                          'solver': 'adam',
                          'validation_fraction': 0.15}
    }
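A minimal sketch of how another notebook can reuse these saved hyperparameters (the module name hyperparameter_search is hypothetical; it should match this notebook's actual filename):
import nbimporter  # lets Python import .ipynb notebooks as regular modules
from hyperparameter_search import get_hiper_params  # hypothetical notebook name
lightgbm_params = get_hiper_params()['lightgbm']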
import nbimporter # pip install nbimporter
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import submission_framework as SF  # project helper notebook, importable thanks to nbimporter above
df_users = pd.read_csv('data/user-features.csv',low_memory=False).set_index('person')
df_y = pd.read_csv('data/labels_training_set.csv').groupby('person').sum()
columnas_a_mano = [ 'total_checkouts',
'total_conversions',
'total_events',
'total_sessions',
'total_session_checkout',
'total_session_conversion',
'total_events_ad_session',
'total_ad_sessions',
'avg_events_per_session',
'avg_events_per_ad_session',
'percentage_session_ad',
'has_checkout',
'has_conversion',
'total_viewed_products_month_1',
'total_checkouts_month_1',
'total_conversions_month_1',
'total_events_month_1',
'total_sessions_month_1',
'total_session_checkouts_month_1',
'total_session_conversions_month_1',
'total_events_ad_session_month_1',
'total_ad_sessions_month_1',
'has_checkout_month_1',
'has_conversion_month_1',
'total_viewed_products_month_2',
'total_checkouts_month_2',
'total_conversions_month_2',
'total_events_month_2',
'total_sessions_month_2',
'total_session_checkouts_month_2',
'total_session_conversions_month_2',
'total_events_ad_session_month_2',
'total_ad_sessions_month_2',
'has_checkout_month_2',
'has_conversion_month_2',
'total_viewed_products_month_3',
'total_checkouts_month_3',
'total_conversions_month_3',
'total_events_month_3',
'total_sessions_month_3',
'total_session_checkouts_month_3',
'total_session_conversions_month_3',
'total_events_ad_session_month_3',
'total_ad_sessions_month_3',
'has_checkout_month_3',
'has_conversion_month_3',
'total_viewed_products_month_4',
'total_checkouts_month_4',
'total_conversions_month_4',
'total_events_month_4',
'total_sessions_month_4',
'total_session_checkouts_month_4',
'total_session_conversions_month_4',
'total_events_ad_session_month_4',
'total_ad_sessions_month_4',
'has_checkout_month_4',
'has_conversion_month_4',
'total_viewed_products_month_5',
'total_checkouts_month_5',
'total_conversions_month_5',
'total_events_month_5',
'total_sessions_month_5',
'total_session_checkouts_month_5',
'total_session_conversions_month_5',
'total_events_ad_session_month_5',
'total_ad_sessions_month_5',
'has_checkout_month_5',
'has_conversion_month_5',
'total_viewed_products_months_1_to_4',
'total_checkouts_months_1_to_4',
'total_conversions_months_1_to_4',
'total_events_months_1_to_4',
'total_sessions_months_1_to_4',
'total_session_checkouts_months_1_to_4',
'total_session_conversions_months_1_to_4',
'total_events_ad_session_months_1_to_4',
'total_ad_sessions_months_1_to_4',
'has_checkout_months_1_to_4',
'has_conversion_months_1_to_4',
'total_viewed_products_lw',
'total_checkouts_lw',
'total_conversions_lw',
'total_events_lw',
'total_sessions_lw',
'total_session_checkouts_lw',
'total_session_conversions_lw',
'total_events_ad_session_lw',
'total_ad_sessions_lw',
'has_checkout_lw',
'has_conversion_lw',
'amount_of_months_that_has_bought',
'timestamp_last_event',
'timestamp_last_checkout',
'timestamp_last_conversion',
'timestamp_last_viewed_product',
'days_to_last_event',
'days_to_last_checkout',
'days_to_last_conversion',
'days_to_last_viewed_product',
'doy_last_event',
'dow_last_event',
'dom_last_event',
'woy_last_event',
'doy_last_checkout',
'dow_last_checkout',
'dom_last_checkout',
'woy_last_checkout',
'doy_last_conversion',
'dow_last_conversion',
'dom_last_conversion',
'woy_last_conversion',
'doy_last_viewed_product',
'dow_last_viewed_product',
'dom_last_viewed_product',
'woy_last_viewed_product',
'last_conversion_sku',
'last_conversion_price',
'percentage_last_week_activity',
'percentage_last_month_activity',
'days_between_last_event_and_checkout',
'percentage_regular_celphones_activity',
'var_viewed',
'conversion_gt_media'
]
def find_best_params(df_x, df_y, orig_model_with_name, default_params, list_of_progressive_params,
                     columns=None, seed=0, cv=5, normalize=False):
    """Finds the best hyperparameters with grid search, but sequentially.
    That is, instead of trying 3 variables with 5 values each (125 models per CV fold), we can first
    search two variables with 5 values each (25 models) and then, fixing the best of those, search
    the remaining 5 values (30 models in total).
    It is certainly not optimal, but it saves valuable time."""
    orig_model_name, orig_model = orig_model_with_name
    orig_model_name += '_GSF'
    acc_params = {}
    for params_grid in list_of_progressive_params:
        print(f"Best Params So Far: {default_params} {acc_params}\n\n")
        init_params = dict(default_params, **acc_params)
        # Only pass random_state when the estimator advertises it via get_params()
        # (KNeighborsClassifier, for example, does not accept it)
        if 'random_state' in orig_model().get_params():
            init_params['random_state'] = seed
        model_new = GridSearchCV(orig_model(**init_params), params_grid, cv=cv, verbose=1,
                                 scoring='roc_auc', n_jobs=4)
        model_with_name = (orig_model_name, model_new)
        model, auc = SF.full_framework_wrapper(df_x, df_y, model_with_name, columns=columns, normalize=normalize)
        acc_params.update(model.best_params_)
    default_params.update(acc_params)
    return default_params
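For reference, a rough count of fits for the example in the docstring (3 hyperparameters, 5 candidate values each, 5-fold CV):
full_grid_fits = 5 ** 3 * 5          # 625 fits for one exhaustive grid search
progressive_fits = (5 * 5 + 5) * 5   # 150 fits for the two-stage progressive search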
def find_best_params_random(df_x, df_y, orig_model_with_name, default_params, params_grid,
                            n_iter=15, columns=None, seed=0, cv=5):
    orig_model_name, orig_model = orig_model_with_name
    orig_model_name += '_RS'
    model_new = RandomizedSearchCV(orig_model(**default_params, random_state=seed), params_grid,
                                   n_iter=n_iter, cv=cv, verbose=1, scoring='roc_auc', n_jobs=4)
    model_with_name = (orig_model_name, model_new)
    model, auc = SF.full_framework_wrapper(df_x, df_y, model_with_name, columns=columns)
    return model.best_params_
def find_best_params_gridsearch(df_x, df_y, orig_model_with_name, default_params, params_grid,
                                columns=None, seed=42, cv=5):
    orig_model_name, orig_model = orig_model_with_name
    orig_model_name += '_GS'
    model_new = GridSearchCV(orig_model(**default_params), params_grid, cv=cv, verbose=1,
                             scoring='roc_auc', n_jobs=4)
    model_with_name = (orig_model_name, model_new)
    model, auc = SF.full_framework_wrapper(df_x, df_y, model_with_name, columns=columns)
    return model.best_params_
from sklearn.tree import DecisionTreeClassifier, export_graphviz
list_of_progressive_params = [{'criterion':['gini','entropy']},
{'max_features': np.arange(0.1,0.8,0.1)},
{'max_depth': np.linspace(1, 32, 5, endpoint=True)},
{'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True)},
{'min_samples_leaf': np.arange(0.0001,0.5,0.1)}
]
model_with_name = ('decision_tree', DecisionTreeClassifier)
best_params_decision_tree = find_best_params(df_users,df_y,model_with_name, {},list_of_progressive_params, columns=columnas_a_mano)
best_params_decision_tree
list_of_progressive_params = {'criterion':['gini','entropy'],
'max_features': np.arange(0.1,0.8,0.1),
'max_depth': np.linspace(1, 32, 5, endpoint=True),
'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
'min_samples_leaf': np.arange(0.0001,0.5,0.1)
}
model_with_name = ('decision_tree', DecisionTreeClassifier)
best_params_decision_tree = find_best_params_random(df_users,df_y,model_with_name, {}, list_of_progressive_params, n_iter=100, columns=columnas_a_mano)
best_params_decision_tree
list_of_progressive_params = {'criterion':['gini','entropy'],
'max_features': np.arange(0.1,0.8,0.1),
'max_depth': np.linspace(1, 32, 5, endpoint=True),
'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
'min_samples_leaf': np.arange(0.0001,0.5,0.1)
}
model_with_name = ('decision_tree', DecisionTreeClassifier)
best_params_decision_tree = find_best_params_gridsearch(df_users,df_y,model_with_name, {}, list_of_progressive_params, columns=columnas_a_mano)
best_params_decision_tree
from sklearn.ensemble import RandomForestClassifier
list_of_progressive_params = [{'n_estimators':[1, 2, 4, 8, 16, 32, 64, 100, 200]},
{'criterion':['gini','entropy']},
{'max_features': np.arange(0.1,0.4,0.1)},
{'max_depth': np.linspace(1, 32, 3, endpoint=True)},
{'min_samples_split': np.arange(0.1, 1.0, 0.1)},
{'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True)}
]
model_with_name = ('random_forest', RandomForestClassifier)
best_params_random_forest = find_best_params(df_users,df_y,model_with_name, {},list_of_progressive_params, columns=columnas_a_mano)
best_params_random_forest
import xgboost as xgb #conda install -c conda-forge xgboost
# Please keep this: it scored very well and is not easily reproducible
"""
0.8695
{'learning_rate': 0.1,
'objective': 'binary:logistic',
'n_estimators': 16,
'scale_pos_weight': 2,
'max_depth': 4,
'min_child_weight': 5,
'gamma': 0.0,
'colsample_bytree': 0.7500000000000001,
'subsample': 0.7,
'colsample_bylevel': 0.65}
"""
list_of_progressive_params = [
{'objective': ['binary:logistic'],'learning_rate':np.arange(0.1,0.5,0.1)},
{'n_estimators':np.arange(16,116,15)},
{'scale_pos_weight':np.arange(2,6,1)},
{'max_depth':np.arange(4,12,1),'min_child_weight':np.arange(1,10,1)},
{'gamma':np.arange(0,0.5,0.1)},
{'subsample':np.arange(0.6,1,0.1),'colsample_bytree':np.arange(0.6,0.91,0.05)},
{'colsample_bylevel':np.arange(0.6,0.91,0.05)}#,
# {'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]} # This made results much worse, and Luis said it was not important
]
model_with_name = ('xgboost', xgb.XGBClassifier)
best_params_xgboost = find_best_params(df_users,df_y,model_with_name,{}, list_of_progressive_params, columns=columnas_a_mano)
best_params_xgboost
list_of_progressive_params = [
{'objective': ['binary:logistic','reg:linear'],'learning_rate':np.arange(0.1,0.5,0.1)},
{'n_estimators':np.arange(16,116,15)},
{'scale_pos_weight':np.arange(2,6,1)},
{'max_depth':np.arange(4,12,1),'min_child_weight':np.arange(1,10,1)},
{'gamma':np.arange(0,0.5,0.1)},
{'subsample':np.arange(0.6,1,0.1),'colsample_bytree':np.arange(0.6,0.91,0.05)},
{'colsample_bylevel':np.arange(0.6,0.91,0.05)}#,
# {'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]} # This made results much worse, and Luis said it was not important
]
model_with_name = ('xgboost', xgb.XGBClassifier)
# Note: df_users_norm is the normalized feature matrix (see the MinMaxScaler snippet further below)
best_params_xgboost = find_best_params(df_users_norm, df_y, model_with_name, {}, list_of_progressive_params, columns=columnas_a_mano)
best_params_xgboost
from sklearn.neighbors import KNeighborsClassifier
list_of_progressive_params = [
{'n_neighbors': np.arange(1,30)},
{'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']},
{'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']}
]
model_with_name = ('knn', KNeighborsClassifier)
best_params_knn = find_best_params(df_users, df_y, model_with_name, {},list_of_progressive_params, seed=-1, normalize=True)
best_params_knn
import lightgbm as lgb #conda install -c conda-forge lightgbm
"""
0.8688
{'objective': 'binary',
'learning_rate': 0.01,
'n_estimators': 190,
'num_leaves': 27,
'feature_fraction': 0.9000000000000001,
'bagging_fraction': 0.8,
'max_depth': 4,
'lambda_l2': 2,
'min_split_gain': 0.01,
'min_child_weight': 10.00001}
"""
list_of_progressive_params = [{'objective':['binary']},
{'learning_rate':[0.005,0.01,0.05,0.1,0.3]},
{'n_estimators':np.arange(25,200,15)},
{'num_leaves': np.arange(24, 45,3)},
{'feature_fraction': np.arange(0.1, 0.91, 0.2)},
{'bagging_fraction': np.arange(0.8, 1.01, 0.1)},
{'max_depth': np.arange(3, 12, 1)},
#{'lambda_l1': np.arange(0, 5)}, # This hurt the score a lot
{'lambda_l2': np.arange(0, 3)},
{'min_split_gain': [0.001, 0.01, 0.1]},
{'min_child_weight': [1e-05]+np.arange(5, 11)}  # note: list + array broadcasts (adds 1e-05 elementwise), giving 5.00001 ... 10.00001 rather than appending 1e-05
]
model_with_name = ('lightgbm', lgb.LGBMClassifier)
best_params_lightgbm= find_best_params(df_users,df_y,model_with_name,{}, list_of_progressive_params)
best_params_lightgbm
list_of_progressive_params = {'objective':['binary'],
'learning_rate':[0.005,0.01,0.05,0.1,0.3],
'n_estimators':np.arange(25,200,15),
'num_leaves': np.arange(24, 45,3),
'feature_fraction': np.arange(0.1, 0.91, 0.2),
'bagging_fraction': np.arange(0.8, 1.01, 0.1),
'max_depth': np.arange(3, 12, 1),
'lambda_l2': np.arange(0, 3),
'min_split_gain': [0.001, 0.01, 0.1],
'min_child_weight': [1e-05]+np.arange(5, 11)  # note: list + array broadcasts (adds 1e-05 elementwise), giving 5.00001 ... 10.00001 rather than appending 1e-05
}
model_with_name = ('lightgbm', lgb.LGBMClassifier)
best_params_lightgbm_random = find_best_params_random(df_users,df_y,model_with_name, {}, list_of_progressive_params, n_iter=100)
best_params_lightgbm_random
# Last best parameters
# AUC: 0.0700 with the full dataset
# AUC: 0.8711 with the hand-picked columns
params = {'objective': 'binary',
'num_leaves': 36,
'n_estimators': 70,
'min_split_gain': 0.01,
'min_child_weight': 5.00001,
'max_depth': 4,
'learning_rate': 0.05,
'lambda_l2': 0,
'feature_fraction': 0.7000000000000001,
'bagging_fraction': 1.0}
model_with_name = ('lightgbm', lgb.LGBMClassifier(**params))
SF.full_framework_wrapper(df_users, df_y, model_with_name)
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
list_of_progressive_params = [{'hidden_layer_sizes':[(4,7), (4,4), (4,3,2)]},
{'activation':['relu', 'logistic']},
{'alpha':[1e-06,1e-05,1e-04,1e-03,1e-02,1e-01,1]},
{'beta_1':[0.7,0.91,0.05]},   # literal candidate values (0.05 is a candidate, not a step size)
{'beta_2':[0.75, 0.86, 0.05]},
{'early_stopping':[False]},
{'epsilon':[1e-07,1e-08]},
{'learning_rate':['constant', 'adaptive']},
{'solver':['adam', 'lbfgs']},
{'validation_fraction':np.arange(0.15,0.26,0.05)}
]
model_with_name = ('neuralnetwork', MLPClassifier)
"""
min_max_scaler = preprocessing.MinMaxScaler()
df_users_norm = pd.DataFrame(min_max_scaler.fit_transform(df_users.values))
df_users_norm.columns = df_users.columns
df_users_norm.index = df_users.index
"""
best_params_neuralnetwork = find_best_params(df_users, df_y, model_with_name, {}, list_of_progressive_params, normalize=True)
best_params_neuralnetwork
import catboost as cb #conda install -c conda-forge catboost
list_of_progressive_params = [{'random_strength':[42],'eval_metric':['AUC'],'iterations': [80, 100,256,465,678,1000]},
{'learning_rate':[0.01,0.05,0.1,0.3]},
{'depth':np.arange(1,12,1)},
{'l2_leaf_reg':np.arange(2,10,1)},
]
model_with_name = ('catboost', cb.CatBoostClassifier)
best_params_catboost = find_best_params(df_users,df_y,model_with_name,{'verbose':True}, list_of_progressive_params, cv=2,columns=columnas_a_mano)
best_params_catboost
from sklearn.ensemble import GradientBoostingClassifier as GBC
list_of_progressive_params = [
{'max_leaf_nodes': [None]},
{'min_weight_fraction_leaf': [0]},
{'learning_rate': [0.1]},
{'min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True)},
{'min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True)},
{'max_features' : list(range(1,len(columnas_a_mano)))},
{'max_depth': np.linspace(1, 32, 32, endpoint=True)},
{'n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 200]},
{'subsample': np.arange(0.8, 1)},  # note: with the default step of 1 this grid only contains 0.8
{'loss': ['deviance']},
{'warm_start': [False]},
{'presort': ['auto']}
]
model_with_name = ('gradient_boosting', GBC)
best_params_boosting = find_best_params(df_users, df_y, model_with_name, {}, list_of_progressive_params, columns=columnas_a_mano)
best_params_boosting
GridSearchCV and RandomizedSearchCV not only find the best hyperparameters, they also cross-validate the models, which is why it is worth keeping the fitted model once the run finishes instead of necessarily training it again.
import pickle
import lightgbm as lgb #conda install -c conda-forge lightgbm
def find_best_params_random(df_x, df_y, orig_model_with_name, default_params, params_grid,
                            n_iter=15, columns=None, seed=0, cv=5):
    # Redefined here to return the fitted RandomizedSearchCV object itself (not just best_params_),
    # so the cross-validated model can be pickled and reused without retraining.
    orig_model_name, orig_model = orig_model_with_name
    orig_model_name += '_RS'
    model_new = RandomizedSearchCV(orig_model(**default_params, random_state=seed), params_grid,
                                   n_iter=n_iter, cv=cv, verbose=1, scoring='roc_auc', n_jobs=4)
    model_with_name = (orig_model_name, model_new)
    model, auc = SF.full_framework_wrapper(df_x, df_y, model_with_name, columns=columns)
    return model
list_of_progressive_params = {'n_estimators':np.arange(25,200,15),
'max_depth': np.arange(3, 12, 1)}
model_with_name = ('lightgbm', lgb.LGBMClassifier)
best_model_lightgbm = find_best_params_random(df_users,df_y,model_with_name,{}, list_of_progressive_params)
best_model_lightgbm
# Save it:
with open('lightgbm-rg-2.0-8718.pickle.dat', 'wb') as model_file:
    pickle.dump(best_model_lightgbm, model_file)
# Load it:
with open('lightgbm-rg-2.0-8718.pickle.dat', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
loaded_model
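A quick sketch of how the reloaded search object can be used directly (assuming df_users contains the same feature columns used during the search; the fitted RandomizedSearchCV refits its best estimator by default, so it predicts without retraining):
print(loaded_model.best_params_, loaded_model.best_score_)
probabilities = loaded_model.predict_proba(df_users)[:, 1]  # estimated probability of the positive label per user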