Grupo 30: Datatouille
En este notebook se definen unas simples funciones de I/O para armar las postulaciones de predicciones del trabajo práctico.
Uso:
Crea la matriz X y el vector y para entrenar
Se dividen los datos (split) para generar los sets de entrenamiento y de prueba
Se ejecuta el algoritmo de ML (debe devolver un dataframe con person en el índice y label como única columna)
Se calculan las métricas de la predicción sobre el set de prueba (accuracy, AUC y AUCPR)
Se predicen las probabilidades
Se ve información relevante de la ejecución
Se guarda como un csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer, roc_auc_score, average_precision_score
from sklearn import preprocessing
import numpy as np
import pandas as pd
import os.path
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
def assert_equals(x, y):
    """Check that two values compare equal, reporting a mismatch on stdout.

    Despite the name, nothing is raised: when ``x == y`` is falsy a message
    is printed and ``False`` is returned so the caller can decide to bail out.

    Args:
        x: First value.
        y: Second value.

    Returns:
        bool: ``True`` when the values compare equal, ``False`` otherwise.
    """
    if x == y:
        return True
    print(f'{x} no equivale a {y}')
    return False
def df_label_xor(df1, df2):
    """Return the rows whose index value appears in only one of the frames.

    Both frames are outer-joined on their indexes with pandas' merge
    indicator enabled; rows flagged as present in both inputs are dropped.

    Args:
        df1: Left DataFrame.
        df2: Right DataFrame.

    Returns:
        DataFrame: The joined columns of both inputs plus the ``_merge``
        indicator column, restricted to rows found on one side only.
    """
    joined = df1.merge(
        df2,
        how='outer',
        left_index=True,
        right_index=True,
        indicator=True,
    )
    present_on_one_side = joined['_merge'] != 'both'
    return joined[present_on_one_side]
def fr1_extract_X_y(df, df_y, normalize=False):
    """Build the training matrix ``X`` and the target vector ``y``.

    ``df`` (features) is inner-joined with ``df_y`` (labels) on the index, so
    only labelled rows are kept.  Both ``X`` and ``y`` are read from that
    joined frame, which guarantees that row ``i`` of ``X`` corresponds to
    ``y[i]`` even when ``df`` and ``df_y`` list their rows in different order.

    The expected row counts (38829 feature rows, 19414 labelled rows) are
    hard-coded sanity checks.

    Args:
        df: Feature DataFrame indexed by the same key as ``df_y``.
        df_y: DataFrame whose ``label`` column holds the targets.
        normalize: When true, scale every feature with ``MinMaxScaler``
            (fitted on the labelled rows only).

    Returns:
        tuple | None: ``(X, y)`` as numpy arrays, or ``None`` when one of the
        size checks fails (a message is printed in that case).
    """
    if not assert_equals(len(df), 38829):
        return
    if not assert_equals(len(df_y), 19414):
        return

    data = df.merge(df_y, how='inner', left_index=True, right_index=True)
    if not assert_equals(len(data), 19414):
        return

    X = data.drop('label', axis=1).values

    if normalize:
        X = preprocessing.MinMaxScaler().fit_transform(X)

    # Take the labels from the joined frame (not from df_y directly) so they
    # stay aligned with the rows of X.
    y = data['label'].values
    return X, y
def fr2_train_test_split(X, y, seed, test_size):
    """Split ``X``/``y`` into stratified train and test partitions.

    Args:
        X: Feature matrix.
        y: Target vector; also used as the stratification key.
        seed: Value forwarded as ``random_state`` to make the split repeatable.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    partitions = train_test_split(
        X,
        y,
        stratify=y,
        random_state=seed,
        test_size=test_size,
    )
    X_train, X_test, y_train, y_test = partitions

    # Flatten the training targets in place to a 1-D vector.
    y_train.shape = (y_train.shape[0],)
    return X_train, X_test, y_train, y_test
def fr4_metric_score(X_test, y_test, model, model_name):
    """Score a fitted model on the held-out set.

    Accuracy is computed from the rounded output of ``model.predict``.
    ROC-AUC and average precision need continuous scores, so they use
    ``model.decision_function`` when the model exposes it and otherwise
    column 1 of ``model.predict_proba``.  The scores are computed once and
    passed straight to the metric functions, which avoids the
    ``make_scorer(..., needs_threshold=True)`` argument that newer
    scikit-learn releases deprecate and remove.

    Args:
        X_test: Held-out feature matrix.
        y_test: Held-out binary targets.
        model: Fitted estimator.
        model_name: Unused; kept for interface compatibility.

    Returns:
        tuple | None: ``(accuracy, auc, aucpr)``, or ``None`` when the number
        of predictions does not match ``len(y_test)`` (a message is printed).
    """
    y_pred = model.predict(X_test)
    if not assert_equals(len(y_test), len(y_pred)):
        return
    accuracy = accuracy_score(y_test, y_pred.round())

    if hasattr(model, 'decision_function'):
        scores = model.decision_function(X_test)
    else:
        scores = model.predict_proba(X_test)[:, 1]

    auc = roc_auc_score(y_test, scores)
    aucpr = average_precision_score(y_test, scores)
    return accuracy, auc, aucpr
def fr5_extract_X_to_predict(df, df_y, model):
    """Select the unlabelled rows and predict a probability for each one.

    The rows to predict are those found in only one of ``df`` / ``df_y``
    (see ``df_label_xor``), with the ``label`` and ``_merge`` columns removed.
    The expected row counts are hard-coded sanity checks.

    Args:
        df: Full feature DataFrame.
        df_y: DataFrame with the known labels.
        model: Fitted estimator exposing ``predict_proba``.

    Returns:
        tuple | None: ``(data, probabilities)`` where ``data`` is the feature
        frame of the rows to predict and ``probabilities`` is a 1-D array
        holding element 1 of each ``predict_proba`` row; ``None`` when a size
        check fails (a message is printed).
    """
    if not assert_equals(len(df), 38829):
        return
    if not assert_equals(len(df_y), 19414):
        return

    unlabelled = df_label_xor(df, df_y).drop(['label', '_merge'], axis=1)
    if not assert_equals(len(unlabelled), 19415):
        return

    probabilities = model.predict_proba(unlabelled.values)
    second_column = np.array([row[1] for row in probabilities])

    return unlabelled, second_column
def fr6_print_information(df, model, X_to_predict, with_features_importance):
    """Return the model's feature importances, most important first.

    Args:
        df: DataFrame whose columns name the features the model was fitted on.
        model: Fitted estimator exposing ``feature_importances_``.
        X_to_predict: Unused.
        with_features_importance: When falsy nothing is computed.

    Returns:
        DataFrame | None: One ``importance`` column indexed by feature name
        and sorted in descending order, or ``None`` when
        ``with_features_importance`` is falsy.
    """
    if with_features_importance:
        importances = pd.DataFrame(
            model.feature_importances_,
            index=df.columns,
            columns=['importance'],
        )
        return importances.sort_values('importance', ascending=False)
    return None
    
    
def fr7_train_final_model(algorithm, X, y):
    """Fit ``algorithm`` on ``(X, y)`` and return whatever ``fit`` returns."""
    fitted = algorithm.fit(X, y)
    return fitted
    
    
def fr8_to_csv(df, predictions, name, auc):
    """Write the predictions as a submission CSV.

    The file is named ``submission-<name>-<auc>.csv`` (``+`` in ``name`` is
    replaced by ``-``, ``auc`` is formatted with four decimals) and contains
    the index of ``df`` plus a single ``label`` column.

    Args:
        df: DataFrame whose index identifies the predicted rows.  It is not
            modified.
        predictions: One predicted value per row of ``df``.
        name: Model name used in the file name.
        auc: Score embedded in the file name.

    Returns:
        tuple | None: ``(csv_file_name, submission_series)``, or ``None`` when
        the submission does not have the expected 19415 rows (a message is
        printed and no file is written).
    """
    name = name.replace('+', '-')

    # assign() works on a copy, so the caller's frame does not silently gain
    # a 'label' column.
    submission = df.assign(label=predictions)['label']

    if not assert_equals(len(submission), 19415):
        return

    name_csv = f'submission-{name}-{auc:.4f}.csv'
    submission.to_csv(name_csv, header=True)
    return name_csv, submission
def get_feature_importances(df_x, df_y, model_with_name,
                            columns=None,
                            normalize=False,
                            test_size=0.34, seed=42):
    """Fit the model on the training split and return its feature importances.

    Args:
        df_x: Feature DataFrame.
        df_y: DataFrame with the known labels.
        model_with_name: ``(name, model)`` pair; the model must expose
            ``feature_importances_`` after fitting.
        columns: Feature columns to use.  ``None`` or an empty collection
            selects every column of ``df_x``; array-likes such as a pandas
            Index are accepted.
        normalize: Forwarded to ``fr1_extract_X_y``.
        test_size: Forwarded to ``fr2_train_test_split``.
        seed: Random state of the train/test split.

    Returns:
        DataFrame: Feature importances sorted in descending order.
    """
    # An explicit check instead of `not columns`, whose truth value is
    # ambiguous for pandas Index / numpy array arguments.
    if columns is None or len(columns) == 0:
        columns = df_x.columns.tolist()
    model_df_x = df_x[columns]
    model_df_y = df_y
    model_name, model = model_with_name

    X, y = fr1_extract_X_y(model_df_x, model_df_y, normalize)
    X_train, X_test, y_train, y_test = fr2_train_test_split(X, y, seed, test_size)

    model.fit(X_train, y_train)
    X_to_predict, predictions = fr5_extract_X_to_predict(model_df_x, model_df_y, model)

    return fr6_print_information(model_df_x, model, X_to_predict, True)
def full_framework_normal(df_x, df_y,
                          model_with_name,
                          columns=None,
                          normalize=False,
                          test_size=0.34, seed=42,
                          verbosity=0,
                          all_in=False,
                          n_ensamble=0,
                          submit=False):
    """Run the whole pipeline for a single model.

    Steps: build ``X``/``y``, split, fit on the training split, score on the
    test split, optionally refit on all labelled data (``all_in``), predict
    the unlabelled rows and optionally write the submission CSV.

    When ``n_ensamble`` is non-zero the fit/score/predict cycle is repeated
    that many times; the metrics and the predictions are averaged over the
    repetitions.

    Args:
        df_x: Feature DataFrame.
        df_y: DataFrame with the known labels.
        model_with_name: ``(name, model)`` pair.
        columns: Feature columns to use.  ``None`` or an empty collection
            selects every column of ``df_x``; array-likes such as a pandas
            Index are accepted.
        normalize: Forwarded to ``fr1_extract_X_y``.
        test_size: Forwarded to ``fr2_train_test_split``.
        seed: Random state of the train/test split.
        verbosity: ``>= 0`` prints the metrics line, ``>= 1`` also prints the
            column list.
        all_in: Refit on every labelled row before predicting.
        n_ensamble: Number of fit/predict repetitions to average (0 disables).
        submit: Write the CSV and return the extended tuple.

    Returns:
        tuple: ``(model, auc)``, or ``(model, auc, csv_name, message)`` when
        ``submit`` is true.
    """
    # An explicit check instead of `not columns`, whose truth value is
    # ambiguous for pandas Index / numpy array arguments.
    if columns is None or len(columns) == 0:
        columns = df_x.columns.tolist()
    model_df_x = df_x[columns]
    model_df_y = df_y

    model_name, model = model_with_name
    if n_ensamble: model_name += f'_ensamble_{n_ensamble}'
    if normalize: model_name += '_normalized'
    if all_in: model_name += '_all_in'

    X, y = fr1_extract_X_y(model_df_x, model_df_y, normalize)
    X_train, X_test, y_train, y_test = fr2_train_test_split(X, y, seed, test_size)

    model.fit(X_train, y_train)
    accuracy, auc, aucpr = fr4_metric_score(X_test, y_test, model, model_name)
    if all_in:
        model.fit(X, y)
    X_to_predict, predictions = fr5_extract_X_to_predict(model_df_x, model_df_y, model)

    if n_ensamble != 0:
        total_predictions = 0
        # NOTE(review): tmp_seed is updated every iteration but never handed
        # to the model or to the split, so the repetitions differ only if the
        # model itself is non-deterministic -- confirm the intent.
        tmp_seed = seed
        accuracy, auc, aucpr = 0, 0, 0
        for i in range(n_ensamble):
            print(f'Iteración {i+1} de ensamble de {n_ensamble}')
            tmp_seed = tmp_seed + i

            model.fit(X_train, y_train)
            accuracy_tmp, auc_tmp, aucpr_tmp = fr4_metric_score(X_test, y_test, model, model_name)
            accuracy += accuracy_tmp
            auc += auc_tmp
            aucpr += aucpr_tmp

            if all_in: model = model.fit(X, y)
            X_to_predict, predictions = fr5_extract_X_to_predict(model_df_x, model_df_y, model)
            total_predictions += predictions
        # Average the accumulated predictions and metrics.
        predictions = total_predictions / n_ensamble
        accuracy /= n_ensamble
        auc /= n_ensamble
        aucpr /= n_ensamble

    if verbosity >= 0:
        print(f'Model: {model_name} - AUC: {auc:.4f} - AUCPR:{aucpr:.4f} - Accuracy: {accuracy:.4f} ')
    if verbosity >= 1:
        print(f'{columns}')

    if submit:
        csv_name, submission = fr8_to_csv(X_to_predict, predictions, model_name, auc)
        message = f"{model_name} - {model.get_params()} - {columns}"
        # `display` is not defined in this file; presumably the IPython
        # notebook builtin -- verify before running outside a notebook.
        display(csv_name)
        display(message)
        return model, auc, csv_name, message

    return model, auc
def full_framework_ensemble(df_x, df_y, models_with_name,
                            columns=None,
                            normalize=False,
                            test_size=0.34, seed=42,
                            verbosity=0,
                            all_in=False,
                            n_ensamble=0,
                            submit=False):
    """Run the whole pipeline for a soft-voting ensemble of several models.

    The estimators are wrapped in a ``VotingClassifier`` with soft voting,
    which is fitted on the training split and evaluated with 5-fold
    cross-validation on that same split.

    Args:
        df_x: Feature DataFrame.
        df_y: DataFrame with the known labels.
        models_with_name: ``(name, estimators)`` pair; ``estimators`` is
            passed as the ``estimators`` argument of ``VotingClassifier``.
        columns: Feature columns to use.  ``None`` or an empty collection
            selects every column of ``df_x``; array-likes such as a pandas
            Index are accepted.
        normalize: Forwarded to ``fr1_extract_X_y``.
        test_size: Forwarded to ``fr2_train_test_split``.
        seed: Random state of the train/test split.
        verbosity: ``>= 1`` also prints the column list.
        all_in: Refit on every labelled row before predicting.
        n_ensamble: Number of fit/evaluate repetitions (0 disables).
        submit: Write the CSV and return the extended tuple.

    Returns:
        tuple: ``(ensemble, auc)``, or ``(ensemble, auc, csv_name, message)``
        when ``submit`` is true.
    """
    # An explicit check instead of `not columns`, whose truth value is
    # ambiguous for pandas Index / numpy array arguments.
    if columns is None or len(columns) == 0:
        columns = df_x.columns.tolist()
    model_df_x = df_x[columns]
    model_df_y = df_y
    models_name, models = models_with_name
    if n_ensamble: models_name += f'_ensamble_{n_ensamble}'
    if normalize: models_name += '_normalized'
    if all_in: models_name += '_all_in'

    X, y = fr1_extract_X_y(model_df_x, model_df_y, normalize)
    X_train, X_test, y_train, y_test = fr2_train_test_split(X, y, seed, test_size)

    # Soft voting == averaging predicted probabilities
    # Hard voting == majority voting
    # `flatten_transform` is left at its default: it only shapes the output
    # of transform(), and passing None is rejected by the parameter
    # validation of recent scikit-learn releases.  n_jobs and weights are
    # also left at their defaults (None).
    ensemble = VotingClassifier(estimators=models, voting='soft')
    ensemble = ensemble.fit(X_train, y_train)
    auc = cross_val_score(ensemble, X_train, y_train, cv=5, scoring='roc_auc')
    accuracy = cross_val_score(ensemble, X_train, y_train, cv=5, scoring='accuracy')
    # NOTE(review): this branch reports the best fold, while the n_ensamble
    # branch below reports the fold mean -- confirm which one is intended.
    auc = max(auc)
    accuracy = max(accuracy)

    if all_in:
        ensemble.fit(X, y)
    X_to_predict, predictions = fr5_extract_X_to_predict(model_df_x, model_df_y, ensemble)

    if n_ensamble != 0:
        # NOTE(review): tmp_seed is only tracked and printed; it is never
        # handed to the ensemble or to the split -- confirm the intent.
        tmp_seed = seed
        max_seed = seed
        accuracy, auc = 0, 0
        for i in range(n_ensamble):
            print(f'Iteración {i+1} de ensamble de {n_ensamble}')
            tmp_seed = tmp_seed + i

            ensemble.fit(X_train, y_train)
            auc_tmp = cross_val_score(ensemble, X_train, y_train, cv=5, scoring='roc_auc')
            accuracy_tmp = cross_val_score(ensemble, X_train, y_train, cv=5, scoring='accuracy')
            # Keep the scores of the iteration with the best mean AUC so far.
            if np.mean(auc_tmp) > np.mean(auc):
                auc = auc_tmp
                accuracy = accuracy_tmp
                max_seed = tmp_seed
            if all_in: ensemble = ensemble.fit(X, y)
            auc = np.mean(auc)
            accuracy = np.mean(accuracy)
            # The predictions kept are those of the last iteration.
            X_to_predict, predictions = fr5_extract_X_to_predict(model_df_x, model_df_y, ensemble)
        print(f'El mejor seed: {max_seed}')

    print(f'Model: {models_name} - AUC: {auc:.4f} - Accuracy: {accuracy:.4f}')
    if verbosity >= 1:
        print(f'{columns}')

    if submit:
        csv_name, submission = fr8_to_csv(X_to_predict, predictions, models_name, auc)
        message = f"{models_name} - {ensemble.get_params()} - {columns}"
        # `display` is not defined in this file; presumably the IPython
        # notebook builtin -- verify before running outside a notebook.
        display(csv_name)
        display(message)
        return ensemble, auc, csv_name, message

    return ensemble, auc
def full_framework_wrapper(df_x, df_y,
                           model_with_name,
                           columns=None,
                           normalize=False,
                           test_size=0.34, seed=42,
                           verbosity=0,
                           all_in=False,
                           n_ensamble=0,
                           submit=False):
    """Dispatch to the single-model or the ensemble pipeline.

    A ``+`` in the model name selects ``full_framework_ensemble``; any other
    name selects ``full_framework_normal``.  All arguments are forwarded
    unchanged and the selected pipeline's result is returned as is.
    """
    model_name, _ = model_with_name
    if '+' in model_name:
        framework = full_framework_ensemble
    else:
        framework = full_framework_normal

    return framework(
        df_x, df_y, model_with_name,
        columns, normalize, test_size, seed,
        verbosity, all_in, n_ensamble, submit,
    )