Group 30: Datatouille
This notebook defines a few simple I/O helper functions to assemble the prediction submissions for the course project.
Usage:
Build the matrix X and the vector y for training
Split them to generate the training and test sets
Run the ML algorithm (it must return a DataFrame with person as the index and label as its only column)
Compute the prediction metrics (accuracy, AUC, AUCPR) to measure performance
Predict the probabilities
Print relevant information about the run
Save the result as a CSV
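A minimal end-to-end sketch of this flow, assuming df_x (features indexed by person) and df_y (a single label column indexed by person) are already loaded, and using RandomForestClassifier purely as an illustrative model with the full_framework_wrapper defined below:

from sklearn.ensemble import RandomForestClassifier

# Illustrative only: any sklearn-style classifier works here.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
model, auc = full_framework_wrapper(df_x, df_y, ('random_forest', rf),
                                    test_size=0.34, seed=42, verbosity=1)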
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score, average_precision_score
from sklearn import preprocessing
import numpy as np
import pandas as pd
import os.path
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
def assert_equals(x, y):
    if not (x == y):
        print(f'{x} does not equal {y}')
        return False
    return True
def df_label_xor(df1, df2):
    # Anti-join on the index: keep only the rows present in exactly one of the two DataFrames.
    merged = df1.merge(df2, how='outer', left_index=True, right_index=True, indicator=True)
    merged = merged.query('_merge != "both"')
    return merged
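For illustration, a toy sketch of what df_label_xor returns (the index values and column names here are made up):

left = pd.DataFrame({'f': [1, 2]}, index=['a', 'b'])
right = pd.DataFrame({'label': [0]}, index=['b'])
df_label_xor(left, right)  # keeps only row 'a', the one missing from `right`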
def fr1_extract_X_y(df, df_y, normalize=False):
    if not assert_equals(len(df), 38829): return
    if not assert_equals(len(df_y), 19414): return
    data = df.merge(df_y, how='inner', left_index=True, right_index=True)
    if not assert_equals(len(data), 19414): return
    X = data.drop('label', axis=1).values
    if normalize:
        min_max_scaler = preprocessing.MinMaxScaler()
        X = min_max_scaler.fit_transform(X)
    # Take the labels from the merged frame so y stays row-aligned with X.
    y = data['label'].values
    return X, y
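One caveat worth flagging: the MinMaxScaler above is fit on all labeled rows before the split, so test-fold statistics leak into the scaling, and fr5_extract_X_to_predict below passes raw values to the model, so with normalize=True the submission rows are never scaled. A leak-free sketch (variable names illustrative):

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
scaler = preprocessing.MinMaxScaler().fit(X_train)  # statistics from the training fold only
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)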
def fr2_train_test_split(X, y, seed, test_size):
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_size,
                                                        stratify=y,
                                                        random_state=seed)
    return X_train, X_test, y_train, y_test
def fr4_metric_score(X_test, y_test, model, model_name):
    y_pred = model.predict(X_test)
    if not assert_equals(len(y_test), len(y_pred)): return
    accuracy = accuracy_score(y_test, y_pred.round())
    # needs_threshold=True makes the scorer use decision_function/predict_proba
    # scores rather than hard predictions.
    auc = make_scorer(roc_auc_score, needs_threshold=True)(model, X_test, y_test)
    aucpr = make_scorer(average_precision_score, needs_threshold=True)(model, X_test, y_test)
    return accuracy, auc, aucpr
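The same two ranking metrics can also be computed directly from the positive-class probabilities; a minimal equivalent sketch, assuming the model exposes predict_proba:

y_score = model.predict_proba(X_test)[:, 1]  # score of the positive class
auc = roc_auc_score(y_test, y_score)
aucpr = average_precision_score(y_test, y_score)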
def fr5_extract_X_to_predict(df, df_y, model):
    if not assert_equals(len(df), 38829): return
    if not assert_equals(len(df_y), 19414): return
    # The rows without a label are exactly the ones left to predict.
    data = df_label_xor(df, df_y)
    data = data.drop(['label', '_merge'], axis=1)
    if not assert_equals(len(data), 19415): return
    # Keep only the probability of the positive class.
    predictions = model.predict_proba(data.values)[:, 1]
    return data, predictions
def fr6_print_information(df, model, X_to_predict, with_features_importance):
    if not with_features_importance:
        return None
    # Only meaningful for models exposing feature_importances_ (e.g. tree ensembles).
    feature_importances = pd.DataFrame(model.feature_importances_,
                                       index=df.columns,
                                       columns=['importance'])
    return feature_importances.sort_values('importance', ascending=False)
def fr7_train_final_model(algorithm, X, y):
return algorithm.fit(X, y)
def fr8_to_csv(df, predictions, name, auc):
    name = name.replace('+', '-')
    # Work on a copy so the caller's DataFrame is not mutated.
    submission = df.copy()
    submission['label'] = predictions
    submission = submission['label']
    if not assert_equals(len(submission), 19415): return
    name_csv = f'submission-{name}-{auc:.4f}.csv'
    submission.to_csv(name_csv, header=True)
    return name_csv, submission
def get_feature_importances(df_x, df_y, model_with_name,
columns=None,
normalize=False,
test_size=0.34, seed=42):
    if not columns: columns = df_x.columns.tolist()
    model_df_x = df_x[columns]
    model_df_y = df_y
    model_name, model = model_with_name
    X, y = fr1_extract_X_y(model_df_x, model_df_y, normalize)
    X_train, X_test, y_train, y_test = fr2_train_test_split(X, y, seed, test_size)
    model.fit(X_train, y_train)
    X_to_predict, predictions = fr5_extract_X_to_predict(model_df_x, model_df_y, model)
    return fr6_print_information(model_df_x, model, X_to_predict, True)
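A quick usage sketch, again assuming df_x and df_y are preloaded and using a tree ensemble so that feature_importances_ exists:

from sklearn.ensemble import RandomForestClassifier

importances = get_feature_importances(df_x, df_y,
                                      ('random_forest', RandomForestClassifier(random_state=42)))
importances.head(10)  # ten most important features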
def full_framework_normal(df_x, df_y,
model_with_name,
columns=None,
normalize=False,
test_size=0.34, seed=42,
verbosity=0,
all_in=False,
n_ensamble=0,
submit=False):
    if not columns: columns = df_x.columns.tolist()
    model_df_x = df_x[columns]
    model_df_y = df_y
    model_name, model = model_with_name
    if n_ensamble: model_name += f'_ensamble_{n_ensamble}'
    if normalize: model_name += '_normalized'
    if all_in: model_name += '_all_in'
    X, y = fr1_extract_X_y(model_df_x, model_df_y, normalize)
    X_train, X_test, y_train, y_test = fr2_train_test_split(X, y, seed, test_size)
    model.fit(X_train, y_train)
    accuracy, auc, aucpr = fr4_metric_score(X_test, y_test, model, model_name)
    if all_in:
        model.fit(X, y)
X_to_predict, predictions = fr5_extract_X_to_predict(model_df_x, model_df_y, model)
    if n_ensamble != 0:
        total_predictions = 0
        accuracy, auc, aucpr = 0, 0, 0
        for i in range(n_ensamble):
            print(f'Ensemble iteration {i+1} of {n_ensamble}')
            # Use a different split seed on each iteration so the averaged runs differ.
            X_train, X_test, y_train, y_test = fr2_train_test_split(X, y, seed + i, test_size)
            model.fit(X_train, y_train)
            accuracy_tmp, auc_tmp, aucpr_tmp = fr4_metric_score(X_test, y_test, model, model_name)
            accuracy += accuracy_tmp
            auc += auc_tmp
            aucpr += aucpr_tmp
            if all_in: model = model.fit(X, y)
            X_to_predict, predictions = fr5_extract_X_to_predict(model_df_x, model_df_y, model)
            total_predictions += predictions
        predictions = total_predictions / n_ensamble
        accuracy /= n_ensamble
        auc /= n_ensamble
        aucpr /= n_ensamble
    if verbosity >= 0:
        print(f'Model: {model_name} - AUC: {auc:.4f} - AUCPR: {aucpr:.4f} - Accuracy: {accuracy:.4f}')
    if verbosity >= 1:
        print(f'{columns}')
if submit:
csv_name, submission = fr8_to_csv(X_to_predict, predictions, model_name, auc)
message = f"{model_name} - {model.get_params()} - {columns}"
display(csv_name)
display(message)
return model, auc, csv_name, message
return model, auc
def full_framework_ensemble(df_x, df_y, models_with_name,
columns=None,
normalize=False,
test_size=0.34, seed=42,
verbosity=0,
all_in=False,
n_ensamble=0,
submit=False):
    if not columns: columns = df_x.columns.tolist()
    model_df_x = df_x[columns]
    model_df_y = df_y
    models_name, models = models_with_name
    if n_ensamble: models_name += f'_ensamble_{n_ensamble}'
    if normalize: models_name += '_normalized'
    if all_in: models_name += '_all_in'
    X, y = fr1_extract_X_y(model_df_x, model_df_y, normalize)
    X_train, X_test, y_train, y_test = fr2_train_test_split(X, y, seed, test_size)
    # VotingClassifier with voting='soft' averages predicted probabilities;
    # with voting='hard' it takes a majority vote on predicted labels.
    params = {'flatten_transform': None, 'n_jobs': None, 'voting': 'soft', 'weights': None}
ensemble = VotingClassifier(estimators=models, **params)
ensemble = ensemble.fit(X_train, y_train)
    auc = cross_val_score(ensemble, X_train, y_train, cv=5, scoring='roc_auc')
    accuracy = cross_val_score(ensemble, X_train, y_train, cv=5, scoring='accuracy')
    # NOTE: taking the max over the CV folds is optimistic; np.mean is the usual summary.
    auc = max(auc)
    accuracy = max(accuracy)
if all_in:
ensemble.fit(X,y)
X_to_predict, predictions = fr5_extract_X_to_predict(model_df_x, model_df_y, ensemble)
    if n_ensamble != 0:
        max_seed = seed
        accuracy, auc = 0, 0
        for i in range(n_ensamble):
            print(f'Ensemble iteration {i+1} of {n_ensamble}')
            # Try a different split seed on each iteration and keep the best run's CV scores.
            tmp_seed = seed + i
            X_train, X_test, y_train, y_test = fr2_train_test_split(X, y, tmp_seed, test_size)
            ensemble.fit(X_train, y_train)
            auc_tmp = cross_val_score(ensemble, X_train, y_train, cv=5, scoring='roc_auc')
            accuracy_tmp = cross_val_score(ensemble, X_train, y_train, cv=5, scoring='accuracy')
            if np.mean(auc_tmp) > np.mean(auc):
                auc = auc_tmp
                accuracy = accuracy_tmp
                max_seed = tmp_seed
        auc = np.mean(auc)
        accuracy = np.mean(accuracy)
        if all_in: ensemble = ensemble.fit(X, y)
        X_to_predict, predictions = fr5_extract_X_to_predict(model_df_x, model_df_y, ensemble)
        print(f'Best seed: {max_seed}')
print(f'Model: {models_name} - AUC: {auc:.4f} - Accuracy: {accuracy:.4f}')
if verbosity >=1:
print(f'{columns}')
if submit:
csv_name, submission = fr8_to_csv(X_to_predict, predictions, models_name, auc)
message = f"{models_name} - {ensemble.get_params()} - {columns}"
display(csv_name)
display(message)
return ensemble, auc, csv_name, message
return ensemble, auc
def full_framework_wrapper(df_x, df_y,
model_with_name,
columns=None,
normalize=False,
test_size=0.34, seed=42,
verbosity=0,
all_in=False,
n_ensamble=0,
submit=False):
    model_name, model = model_with_name
    # A '+' in the model name signals a voting ensemble, in which case the second
    # element of the pair is the estimators list for VotingClassifier.
    if '+' in model_name:
        return full_framework_ensemble(df_x, df_y, model_with_name, columns, normalize,
                                       test_size, seed, verbosity, all_in, n_ensamble, submit)
    return full_framework_normal(df_x, df_y, model_with_name, columns, normalize,
                                 test_size, seed, verbosity, all_in, n_ensamble, submit)
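Finally, a sketch of the ensemble path, assuming the same preloaded df_x and df_y; the '+' in the name makes the wrapper dispatch to full_framework_ensemble, and the second element of the pair is the estimators list that VotingClassifier expects (the model choices here are illustrative):

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

estimators = [('rf', RandomForestClassifier(random_state=42)),
              ('lr', LogisticRegression(max_iter=1000))]
ensemble, auc = full_framework_wrapper(df_x, df_y, ('rf+lr', estimators))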