Grupo 30: Datatouille
http://fdelmazo.github.io/7506-Datos/
Continuing the research on the company Trocafone begun in TP1, the goal is to determine the probability that a user of the site makes a conversion within the given period.
Notebooks, in order of execution and reading:
TP1 and its annex --> Getting familiar with the dataset and exploring it.
Investigación Previa --> Building on the work from TP1, we dig further into the data, looking for what can be reused.
Creación de Dataframes --> As part of the feature engineering, new dataframes are built with information on the site's products and on how the site is accessed (brands, operating systems, etc.).
Feature Engineering --> Search for attributes of the users whose conversion we want to predict.
Feature Selection --> Search for the most favorable combination of features.
Parameter Tuning --> Search for the best hyperparameters for each ML algorithm.
Submission Framework --> Small framework for building the label submissions.
TP2 (this notebook) --> With all of the above in place, using the dataframes with every attribute found, the classification algorithms are defined and applied, the models are trained, conversions are predicted, and finally the label submissions are assembled.
Before starting, set your credentials (username and token).
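A minimal sketch of one way to do that from inside the notebook, assuming the standard Kaggle API environment variables (the values are placeholders):
import os
# Placeholder credentials -- replace with your own. The kaggle CLI also
# accepts a ~/.kaggle/kaggle.json file with the same two fields.
os.environ['KAGGLE_USERNAME'] = 'your_username'
os.environ['KAGGLE_KEY'] = 'your_token'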
# !unzip -q -o data/events_up_to_01062018.zip -d data
# !pip install kaggle
# !pip install nbimporter
# !pip install ggplot
# !pip install hdbscan
# !conda install -y -c conda-forge xgboost
# !conda install -y -c conda-forge lightgbm
# !conda install -y -c conda-forge catboost
import nbimporter # pip install nbimporter
import pandas as pd
import numpy as np
import time
import calendar
from itertools import combinations
import random
from time import sleep
from parameter_tuning import get_hiper_params
from feature_selection import get_feature_selection
import submission_framework as SF
seed = 42
random.seed(seed)  # also seed the stdlib RNG so the random feature samples below are reproducible
hiper_params = get_hiper_params()
feature_selection = get_feature_selection()
df_users = pd.read_csv('data/user-features.csv', low_memory=False).set_index('person')  # one row of features per user
df_y = pd.read_csv('data/labels_training_set.csv').groupby('person').sum()  # collapse duplicate label rows per user
display(df_users.head(), df_y.head())
posibilidades_algoritmos = []  # accumulates (name, model) tuples to evaluate against every feature set
from sklearn.tree import DecisionTreeClassifier, export_graphviz
model_name = 'decision_tree'
params = hiper_params[model_name]
model = DecisionTreeClassifier(**params,random_state=seed)
model_with_name = (model_name,model)
SF.full_framework_wrapper(df_users,df_y,model_with_name)
posibilidades_algoritmos.append(model_with_name)
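full_framework_wrapper is defined in the Submission Framework notebook; from its call sites here it takes the user features, the labels and a (name, model) tuple, trains the model, and reports its AUC. Purely as an assumption about that contract, a minimal sketch (hypothetical, not the real implementation in submission_framework):
# Hypothetical sketch of the wrapper's contract -- an assumption, not the real code.
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
def full_framework_wrapper_sketch(df_users, df_y, model_with_name, columns=None, normalize=False):
    name, model = model_with_name
    X = df_users[columns] if columns else df_users  # empty list -> full dataframe
    X = X.loc[df_y.index]                           # keep only the labeled users
    y = df_y.iloc[:, 0]                             # assumes a single label column
    if normalize:
        X = pd.DataFrame(StandardScaler().fit_transform(X), index=X.index, columns=X.columns)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    model.fit(X_train, y_train)
    auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    print(f'{name}: {auc:.4f} AUC')
    return model, auc
The real wrapper also handles the ensemble tuples and the submit/all_in flags seen further down.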
from sklearn.ensemble import RandomForestClassifier
model_name = 'random_forest'
params = hiper_params[model_name]
model = RandomForestClassifier(**params,random_state=seed)
model_with_name = (model_name,model)
SF.full_framework_wrapper(df_users,df_y,model_with_name)
posibilidades_algoritmos.append(model_with_name)
import xgboost as xgb #conda install -c conda-forge xgboost
model_name = 'xgboost'
params = hiper_params[model_name]
model = xgb.XGBClassifier(**params,random_state=seed)
model_with_name = (model_name,model)
SF.full_framework_wrapper(df_users,df_y,model_with_name)
posibilidades_algoritmos.append(model_with_name)
from sklearn.neighbors import KNeighborsClassifier
model_name = 'knn'
params = hiper_params[model_name]
K = params['n_neighbors']
model_name = f'KNN{K}'
model = KNeighborsClassifier(**params)
model_with_name = (model_name,model)
SF.full_framework_wrapper(df_users,df_y,model_with_name, normalize=True)
posibilidades_algoritmos.append(model_with_name)
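KNN decides by distance, so features on huge scales (e.g. raw timestamps) would drown out the binary ones; that is what the normalize=True flag guards against. A sketch of that kind of rescaling, assuming min-max scaling (the scaler the framework actually uses may differ):
# Sketch: rescale every feature to [0, 1] so no column dominates the distance metric
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_users_norm = pd.DataFrame(scaler.fit_transform(df_users),
                             index=df_users.index, columns=df_users.columns)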
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB,ComplementNB
model_name = 'naive_bayes_Gaussian'
model = GaussianNB(var_smoothing=1e-9)  # scikit-learn's default smoothing, made explicit
model_with_name = (model_name,model)
SF.full_framework_wrapper(df_users,df_y,model_with_name)
posibilidades_algoritmos.append(model_with_name)
model_name = 'naive_bayes_Bernoulli'
model = BernoulliNB()
model_with_name = (model_name,model)
SF.full_framework_wrapper(df_users,df_y,model_with_name)
posibilidades_algoritmos.append(model_with_name)
model_name = 'naive_bayes_Multinomial'
model = MultinomialNB()
model_with_name = (model_name,model)
SF.full_framework_wrapper(df_users,df_y,model_with_name)
posibilidades_algoritmos.append(model_with_name)
model_name = 'naive_bayes_Complement'
model = ComplementNB()
model_with_name = (model_name,model)
SF.full_framework_wrapper(df_users,df_y,model_with_name)
posibilidades_algoritmos.append(model_with_name)
import lightgbm as lgb #conda install -c conda-forge lightgbm
model_name = 'lightgbm'
params = hiper_params[model_name]
model = lgb.LGBMClassifier(**params)
model_with_name = (model_name,model)
SF.full_framework_wrapper(df_users,df_y, model_with_name)
posibilidades_algoritmos.append(model_with_name)
from sklearn.neural_network import MLPClassifier
model_name = 'neuralnetwork'
params = hiper_params[model_name]
model = MLPClassifier(**params)
model_with_name = (model_name, model)
# Works only with normalized data
SF.full_framework_wrapper(df_users, df_y, model_with_name, normalize=True)
posibilidades_algoritmos.append(model_with_name)
from sklearn.ensemble import GradientBoostingClassifier as GBC
model_name = 'gradient_boosting'
params = hiper_params[model_name]
model = GBC(**params)
model_with_name = (model_name,model)
SF.full_framework_wrapper(df_users,df_y,model_with_name)
posibilidades_algoritmos.append(model_with_name)
from sklearn.ensemble import AdaBoostClassifier
model_name = 'adaboost'
model = AdaBoostClassifier()
model_with_name = (model_name,model)
SF.full_framework_wrapper(df_users,df_y,model_with_name)
posibilidades_algoritmos.append(model_with_name)
# Very expensive in runtime; the results don't justify it
# import catboost as cb #conda install -c conda-forge catboost
# model_name = 'catboost'
# params = hiper_params[model_name]
# model = cb.CatBoostClassifier(**params,verbose=False)
# model_with_name = (model_name,model)
# SF.full_framework_wrapper(df_users,df_y,model_with_name)
# posibilidades_algoritmos.append(model_with_name)
# Dismal results
# from sklearn.linear_model import LogisticRegression
# model_name = 'logistic_regresion'
# model = LogisticRegression(solver='lbfgs')
# model_with_name = (model_name,model)
# SF.full_framework_wrapper(df_users,df_y,model_with_name)
# posibilidades_algoritmos.append(model_with_name)
We run all of the algorithms defined above over those combinations, ensembles included, in search of each one's best hyperparameter combination.
Finally, every algorithm, at its best configuration, is run against every feature set defined, in search of the best universal pairing.
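The '_hard'/'_soft' suffixes used below name the two classic voting schemes: hard voting takes the majority vote of the predicted classes, while soft voting averages the predicted probabilities. As an illustration of how such (name, estimator) tuples can be fused (scikit-learn's VotingClassifier is real; wiring it to the framework's tuples is an assumption):
# Illustration: fusing (name, model) pairs into a single voting ensemble
from sklearn.ensemble import VotingClassifier
def build_voting(named_models, voting='soft'):
    # named_models: list of (name, estimator) pairs, as in posibilidades_algoritmos
    return VotingClassifier(estimators=named_models, voting=voting)
# e.g. build_voting([posibilidades_algoritmos[0], posibilidades_algoritmos[1]], 'soft')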
print([x[0] for x in posibilidades_algoritmos])
posibilidades_algoritmos_y_ensambles = posibilidades_algoritmos[:]
from sklearn.ensemble import BaggingClassifier
# Exclude the algorithms that are already bagging-based and the ones that take too long
EXCLUDED = ['catboost', 'neuralnetwork', f'KNN{K}', 'random_forest']
def bagging(posibilidades):
    posibilidades = list(filter(lambda x: x[0] not in EXCLUDED, posibilidades))
    baggins = [] # Frodo
    for n, m in posibilidades:
        baggins.append((n + '_bagging', BaggingClassifier(m)))
    return baggins
posibilidades_algoritmos_y_ensambles += bagging(posibilidades_algoritmos)
EXCLUDED = ['catboost', 'neuralnetwork']
def ensamblar_algoritmos(posibilidades, n, tipo):
    # Builds every n-way combination of models as a named ensemble
    # (the name template below assumes n=2, i.e. pairs)
    posibilidades = list(filter(lambda x: x[0] not in EXCLUDED, posibilidades))
    result = list(combinations(posibilidades, n))
    result_names = [f'{x[0][0]}+{x[1][0]}_{tipo}' for x in result]
    return list(zip(result_names, result))

def ensambles_a_mano(posibilidades, nombres, tipo):
    # Picks out the combination that contains exactly the models named in `nombres`
    result = list(combinations(posibilidades, len(nombres)))
    for r in result:
        if not r: continue
        names = [x[0] for x in r]
        if all([x in names for x in nombres]):
            result_names = '+'.join(nombres)
            if tipo == 'both':
                return [(result_names + '_hard', r), (result_names + '_soft', r)]
            result_names += f"_{tipo}"
            return [(result_names, r)]
posibilidades_algoritmos_y_ensambles += ensambles_a_mano(posibilidades_algoritmos,['random_forest','lightgbm','neuralnetwork'],'soft')
posibilidades_algoritmos_y_ensambles += ensambles_a_mano(posibilidades_algoritmos,['random_forest','xgboost','naive_bayes_Bernoulli'],'soft')
# Adds every possible pairwise ensemble.
# Beyond a few runs to get an idea of what works and what doesn't,
# it is useless day to day: pure brute force, and far too slow
# posibilidades_algoritmos_y_ensambles += ensamblar_algoritmos(posibilidades_algoritmos, 2, 'soft')
print([x[0] for x in posibilidades_algoritmos_y_ensambles])
posibilidades_features = {
'Cumulative Importance':feature_selection['best_features_progresivo'],
'Forward Selection':feature_selection['best_features_forward'],
'Backward Elimination':feature_selection['best_features_backward'],
'Stepwise Regression':feature_selection['best_features_stepwise'],
'Full Dataframe':[],
}
todo_junto = [x for f in posibilidades_features.values() for x in f]
intersec = list(set([x for x in todo_junto if todo_junto.count(x) >= 2]))  # features picked by at least two methods
posibilidades_features['Feature Intersection'] = intersec
posibilidades_features['Seleccion a Mano (Boj)'] = ['total_checkouts_month_5',
'timestamp_last_checkout',
'timestamp_last_event',
'has_checkout_month_5',
'total_checkouts',
'days_to_last_event',
'total_checkouts_lw',
'total_checkouts_months_1_to_4',
'total_conversions',
'total_session_conversion',
'total_events',
'total_sessions',
'avg_events_per_session',
'total_session_checkout',
'has_checkout'
]
posibilidades_features['Seleccion a Mano (Souto)'] = ['dow_last_conversion',
'has_conversion_lw', 'total_conversions_month_4',
'total_session_checkout', 'doy_last_conversion', 'timestamp_last_event',
'dow_last_checkout', 'total_checkouts', 'has_checkout', 'doy_last_checkout',
'has_checkout_month_1', 'timestamp_last_checkout', 'total_sessions',
'woy_last_event', 'has_checkout_month_5', 'avg_events_per_session']
posibilidades_features['Seleccion a Mano (Chortas)'] = [
'dow_last_conversion',
'timestamp_last_event',
'timestamp_last_checkout',
'timestamp_last_conversion',
'timestamp_last_viewed_product',
'days_to_last_event',
'days_to_last_checkout',
'days_to_last_conversion',
'days_to_last_viewed_product',
'total_brand_listings_lw',
'total_viewed_products_lw',
'total_checkouts_lw',
'total_conversions_lw',
'total_events_lw',
'total_sessions_lw',
'total_session_checkouts_lw',
'total_session_conversions_lw',
'total_events_ad_session_lw',
'total_ad_sessions_lw',
'has_checkout_lw',
'has_conversion_lw',
'percentage_last_week_activity',
'percentage_last_week_brand_listings',
'percentage_last_week_viewed_products',
'percentage_last_week_checkouts',
'percentage_last_week_conversions',
'amount_of_months_that_has_bought'
]
posibilidades_features['Seleccion a Mano (FdM)'] = [ 'total_checkouts',
'total_conversions',
'total_events',
'total_sessions',
'total_session_checkout',
'total_session_conversion',
'total_events_ad_session',
'total_ad_sessions',
'avg_events_per_session',
'avg_events_per_ad_session',
'percentage_session_ad',
'has_checkout',
'has_conversion',
'total_viewed_products_month_1',
'total_checkouts_month_1',
'total_conversions_month_1',
'total_events_month_1',
'total_sessions_month_1',
'total_session_checkouts_month_1',
'total_session_conversions_month_1',
'total_events_ad_session_month_1',
'total_ad_sessions_month_1',
'has_checkout_month_1',
'has_conversion_month_1',
'total_viewed_products_month_2',
'total_checkouts_month_2',
'total_conversions_month_2',
'total_events_month_2',
'total_sessions_month_2',
'total_session_checkouts_month_2',
'total_session_conversions_month_2',
'total_events_ad_session_month_2',
'total_ad_sessions_month_2',
'has_checkout_month_2',
'has_conversion_month_2',
'total_viewed_products_month_3',
'total_checkouts_month_3',
'total_conversions_month_3',
'total_events_month_3',
'total_sessions_month_3',
'total_session_checkouts_month_3',
'total_session_conversions_month_3',
'total_events_ad_session_month_3',
'total_ad_sessions_month_3',
'has_checkout_month_3',
'has_conversion_month_3',
'total_viewed_products_month_4',
'total_checkouts_month_4',
'total_conversions_month_4',
'total_events_month_4',
'total_sessions_month_4',
'total_session_checkouts_month_4',
'total_session_conversions_month_4',
'total_events_ad_session_month_4',
'total_ad_sessions_month_4',
'has_checkout_month_4',
'has_conversion_month_4',
'total_viewed_products_month_5',
'total_checkouts_month_5',
'total_conversions_month_5',
'total_events_month_5',
'total_sessions_month_5',
'total_session_checkouts_month_5',
'total_session_conversions_month_5',
'total_events_ad_session_month_5',
'total_ad_sessions_month_5',
'has_checkout_month_5',
'has_conversion_month_5',
'total_viewed_products_months_1_to_4',
'total_checkouts_months_1_to_4',
'total_conversions_months_1_to_4',
'total_events_months_1_to_4',
'total_sessions_months_1_to_4',
'total_session_checkouts_months_1_to_4',
'total_session_conversions_months_1_to_4',
'total_events_ad_session_months_1_to_4',
'total_ad_sessions_months_1_to_4',
'has_checkout_months_1_to_4',
'has_conversion_months_1_to_4',
'total_viewed_products_lw',
'total_checkouts_lw',
'total_conversions_lw',
'total_events_lw',
'total_sessions_lw',
'total_session_checkouts_lw',
'total_session_conversions_lw',
'total_events_ad_session_lw',
'total_ad_sessions_lw',
'has_checkout_lw',
'has_conversion_lw',
'amount_of_months_that_has_bought',
'timestamp_last_event',
'timestamp_last_checkout',
'timestamp_last_conversion',
'timestamp_last_viewed_product',
'days_to_last_event',
'days_to_last_checkout',
'days_to_last_conversion',
'days_to_last_viewed_product',
'doy_last_event',
'dow_last_event',
'dom_last_event',
'woy_last_event',
'doy_last_checkout',
'dow_last_checkout',
'dom_last_checkout',
'woy_last_checkout',
'doy_last_conversion',
'dow_last_conversion',
'dom_last_conversion',
'woy_last_conversion',
'doy_last_viewed_product',
'dow_last_viewed_product',
'dom_last_viewed_product',
'woy_last_viewed_product',
'last_conversion_sku',
'last_conversion_price',
'percentage_last_week_activity',
'percentage_last_month_activity',
'days_between_last_event_and_checkout',
'percentage_regular_celphones_activity',
'var_viewed',
'conversion_gt_media'
]
cant_features = 30
# Two random samples of features serve as a baseline sanity check for the selections above
posibilidades_features[f'{cant_features} Random Sample'] = random.sample(df_users.columns.tolist(), cant_features)
posibilidades_features[f'{cant_features} Random Sample 2'] = random.sample(df_users.columns.tolist(), cant_features)
print([x for x in posibilidades_features])
resultados = []
global_time = 0
for forma, features in posibilidades_features.items():
    global_start = time.process_time()
    print("{: ^100}\n{: ^100s}".format(f"Took: {global_time:.2f}s", '-----------------------'))
    print(f'{forma}:\n')
    print(f'{features}\n\n')
    for nombre, algoritmo in posibilidades_algoritmos_y_ensambles:
        # Normalize only for the distance- and gradient-sensitive models (KNN, neural network),
        # and only when they run solo rather than inside an ensemble
        norm = ('NN' in nombre or 'neuralnetwork' in nombre) and ('+' not in nombre)
        print('\t * ', end='')
        model_with_name = (f'{nombre}', algoritmo)
        start = time.process_time()
        model, auc = SF.full_framework_wrapper(df_users, df_y, model_with_name, columns=features, normalize=norm)
        end = time.process_time()
        print(f'\t\t Took: {end-start:.2f}s')
        resultados.append((auc, forma, (nombre, algoritmo), features))
    global_end = time.process_time()
    global_time = global_end - global_start
resultados.sort(reverse=True)
display([(x[0], x[1], x[2][0]) for x in resultados])
For the final submission, training is done on all of X (not just X_train).
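That is, once the champion is chosen, it is refit on every labeled user instead of only the training split (presumably what the all_in=True flag below triggers), trading the honest holdout score for more training data. A minimal sketch of the idea, under that assumption:
# Sketch of the all-in refit (an assumption about what all_in=True does)
from sklearn.base import clone
def fit_all_in(model, X, y):
    final_model = clone(model)  # fresh copy of the chosen estimator
    final_model.fit(X, y)       # fit on ALL labeled data, not just X_train
    return final_model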
max_auc, campeon_forma, (campeon_nombre, campeon_algoritmo), campeon_features = resultados[0]
display(f"Mejor Apuesta: {campeon_nombre} ({max_auc:.4f} AUC) - Features: {campeon_forma}")
display(f"Features: {campeon_features}")
print(f"{campeon_nombre} - {campeon_forma} - {max_auc:.4f}")
norm = ('NN' in campeon_nombre or 'neuralnetwork' in campeon_nombre) and ('+' not in campeon_nombre)
campeon_model, campeon_auc, csv_name, campeon_message = SF.full_framework_wrapper(df_users,
df_y,
(campeon_nombre,campeon_algoritmo),
columns=campeon_features,
submit=True,
all_in=True,
normalize=norm)
#!kaggle competitions submit -f {csv_name} -m "{campeon_message}" trocafone
# Burn n submissions end to end
# for resultado in resultados:
#     print(f"\n\n{resultado[2][0]} - {resultado[1]} - {resultado[0]:.4f}\n\n")
#     max_auc, campeon_forma, (campeon_nombre, campeon_algoritmo), campeon_features = resultado
#     norm = ('NN' in campeon_nombre or 'neuralnetwork' in campeon_nombre) and ('+' not in campeon_nombre)
#     campeon_model, campeon_auc, csv_name, campeon_message = SF.full_framework_wrapper(df_users,
#                                                                                       df_y,
#                                                                                       (campeon_nombre, campeon_algoritmo),
#                                                                                       columns=campeon_features,
#                                                                                       submit=True,
#                                                                                       all_in=True,
#                                                                                       normalize=norm)
#     !kaggle competitions submit -f {csv_name} -m "{campeon_message}" trocafone
#     sleep(10)
#     print()
#!kaggle competitions leaderboard -d trocafone
#!unzip -o trocafone.zip
#print('\n\nLast Best Score')
#!cat trocafone-publicleaderboard.csv | grep Datatouille | tail -n 1 | awk '{split($0,a,","); print "\t Fecha: " a[3] ; print "\t Porcentaje: " a[4]}'