[75.06 / 95.58] Organización de Datos
Trabajo Práctico 2: Machine Learning

Submission Framework

Grupo 30: Datatouille

http://fdelmazo.github.io/7506-Datos/

En este notebook se definen unas simples funciones de I/O para armar las postulaciones de predicciones del trabajo práctico.

Uso:

  1. Se crean la matriz X y el vector y para entrenar

  2. Se hace el split para generar los sets de entrenamiento y de prueba

  3. Se ejecuta el algoritmo de ML (debe devolver un dataframe con person en el índice y labels como única columna)

  4. Se obtienen las distintas métricas de la predicción para obtener la precisión

  5. Se predicen las probabilidades

  6. Se ve información relevante de la ejecución

  7. Se guarda como un csv

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer, roc_auc_score, average_precision_score
from sklearn import preprocessing
import numpy as np
import pandas as pd
import os.path
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
In [2]:
def assert_equals(x,y):
    """Soft equality check that reports a mismatch instead of raising.

    Returns True when ``x == y``. Otherwise prints a message showing both
    values and returns False, so callers can bail out with a plain ``return``.
    """
    if x == y:
        return True

    print(f'{x} no equivale a {y}')
    return False

def df_label_xor(df1, df2):
    """Return the rows whose index appears in exactly one of the two frames.

    Both frames are outer-joined on their indexes with pandas' merge
    indicator enabled; rows flagged as present in both inputs are dropped.
    The result keeps the columns of both frames plus the ``_merge`` column.
    """
    joined = df1.merge(df2, how='outer', left_index=True, right_index=True, indicator=True)
    only_one_side = joined['_merge'] != 'both'
    return joined[only_one_side]
In [3]:
def fr1_extract_X_y(df, df_y, normalize=False):
    """Build the training matrix ``X`` and the target vector ``y``.

    ``df`` (features) and ``df_y`` (labels, with a ``label`` column) are
    inner-joined on their indexes, so only labelled rows are kept.

    Args:
        df: feature DataFrame; must have exactly 38829 rows.
        df_y: label DataFrame; must have exactly 19414 rows, all of which
            must be present in ``df``.
        normalize: when true, ``X`` is rescaled with ``MinMaxScaler``.

    Returns:
        ``(X, y)`` as NumPy arrays, or ``None`` (after printing the mismatch)
        when any of the hard-coded size checks fails.
    """
    if not assert_equals(len(df), 38829): return
    if not assert_equals(len(df_y), 19414): return

    data = df.merge(df_y, how='inner', left_index=True, right_index=True)
    if not assert_equals(len(data), 19414): return

    X = data.drop('label', axis=1).values

    if normalize:
        # NOTE(review): the scaler is fit on every labelled row (before any
        # train/test split) and is not returned, so callers cannot apply the
        # same scaling to the rows they later predict on.
        X = preprocessing.MinMaxScaler().fit_transform(X)

    # Take the target from the merged frame, not from df_y directly: the
    # merged rows are not guaranteed to be in df_y's order, and y[i] must be
    # the label of X[i].
    y = data['label'].values
    return X, y


def fr2_train_test_split(X, y, seed, test_size):
    """Split ``X``/``y`` into train and test parts, stratified on ``y``.

    ``seed`` is used as the ``random_state`` of the split and ``test_size``
    is forwarded unchanged.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y,
        random_state=seed,
    )
    X_train, X_test, y_train, y_test = parts

    # Force the training target to be one-dimensional (in place).
    n_train = y_train.shape[0]
    y_train.shape = n_train
    return X_train, X_test, y_train, y_test


def fr4_metric_score(X_test, y_test, model, model_name):
    """Score a fitted model on the hold-out set.

    Args:
        X_test: hold-out feature matrix.
        y_test: hold-out labels.
        model: fitted estimator; ``predict`` is called here and the scorers
            call it again through the scikit-learn scorer protocol.
        model_name: accepted for interface compatibility; not used.

    Returns:
        ``(accuracy, auc, aucpr)``, or ``None`` (after printing the mismatch)
        if the number of predictions differs from ``len(y_test)``.
    """
    predicted = model.predict(X_test)
    if not assert_equals(len(y_test), len(predicted)):
        return None

    # Predictions are rounded before computing accuracy.
    acc_value = accuracy_score(y_test, predicted.round())

    # NOTE(review): `needs_threshold` is believed to be deprecated in recent
    # scikit-learn releases in favour of `response_method` — confirm against
    # the installed version before upgrading.
    roc_scorer = make_scorer(roc_auc_score, needs_threshold=True)
    roc_value = roc_scorer(model, X_test, y_test)

    pr_scorer = make_scorer(average_precision_score, needs_threshold=True)
    pr_value = pr_scorer(model, X_test, y_test)

    return acc_value, roc_value, pr_value


def fr5_extract_X_to_predict(df, df_y, model):
    """Select the unlabelled rows and predict their positive-class probability.

    The rows to predict are those whose index is in exactly one of ``df`` and
    ``df_y`` (see ``df_label_xor``), with the ``label`` and ``_merge`` columns
    removed.

    Args:
        df: feature DataFrame; must have exactly 38829 rows.
        df_y: label DataFrame; must have exactly 19414 rows.
        model: fitted estimator exposing ``predict_proba``.

    Returns:
        ``(data, probabilities)`` where ``data`` is the DataFrame of rows to
        predict and ``probabilities`` is a 1-D array holding column 1 of
        ``predict_proba`` for each of those rows. Returns ``None`` (after
        printing the mismatch) when any hard-coded size check fails.
    """
    if not assert_equals(len(df), 38829): return
    if not assert_equals(len(df_y), 19414): return

    data = df_label_xor(df, df_y).drop(['label', '_merge'], axis=1)

    if not assert_equals(len(data), 19415): return

    # NOTE(review): the raw feature values are passed to the model here; if
    # the model was trained on min-max-scaled features the inputs will not be
    # on the same scale — confirm with the caller.
    probabilities = np.asarray(model.predict_proba(data.values))

    # Column 1 is the probability of the second class; slicing replaces the
    # former per-row Python loop.
    return data, probabilities[:, 1]


def fr6_print_information(df, model, X_to_predict, with_features_importance):
    """Return the model's feature importances, most important first.

    Args:
        df: DataFrame whose columns label the importances.
        model: fitted estimator exposing ``feature_importances_``.
        X_to_predict: accepted for interface compatibility; not used.
        with_features_importance: when falsy, nothing is computed.

    Returns:
        A DataFrame indexed by column name with a single ``importance``
        column sorted in descending order, or ``None`` when
        ``with_features_importance`` is falsy.
    """
    if with_features_importance:
        table = pd.DataFrame(
            model.feature_importances_,
            index=df.columns,
            columns=['importance'],
        )
        return table.sort_values('importance', ascending=False)

    return None
    
    
def fr7_train_final_model(algorithm, X, y):
    """Fit ``algorithm`` on ``(X, y)`` and return whatever ``fit`` returns."""
    fitted = algorithm.fit(X, y)
    return fitted
    
    
def fr8_to_csv(df, predictions, name, auc):
    """Write the predictions as a submission CSV.

    Args:
        df: DataFrame whose index identifies the predicted rows. It is only
            read; the caller's frame is no longer modified.
        predictions: one prediction per row of ``df``.
        name: model name used in the file name; every ``+`` is replaced by
            ``-``.
        auc: score embedded in the file name with four decimals.

    Returns:
        ``(name_csv, submission)`` where ``submission`` is the ``label``
        Series that was written, or ``None`` (after printing the mismatch and
        without writing anything) when it does not have exactly 19415 rows.
    """
    name = name.replace('+','-')

    # Build a standalone Series instead of adding a 'label' column to `df`,
    # which would leak a side effect into the caller's DataFrame.
    submission = pd.Series(predictions, index=df.index, name='label')

    if not assert_equals(len(submission), 19415): return

    name_csv = f'submission-{name}-{auc:.4f}.csv'
    submission.to_csv(name_csv, header=True)
    return name_csv, submission
In [4]:
def get_feature_importances(df_x, df_y, model_with_name, 
                           columns=None,
                           normalize=False, 
                           test_size=0.34, seed=42):
    """Fit a model on the training split and return its feature importances.

    Args:
        df_x: feature DataFrame.
        df_y: label DataFrame.
        model_with_name: ``(name, model)`` pair; only the model is used.
        columns: columns of ``df_x`` to keep; falsy means all of them.
        normalize: forwarded to ``fr1_extract_X_y``.
        test_size: forwarded to ``fr2_train_test_split``.
        seed: forwarded to ``fr2_train_test_split``.

    Returns:
        The importance table produced by ``fr6_print_information``.
    """
    if not columns:
        columns = df_x.columns.tolist()
    features = df_x[columns]
    labels = df_y

    _name, estimator = model_with_name

    X, y = fr1_extract_X_y(features, labels, normalize)
    split = fr2_train_test_split(X, y, seed, test_size)
    X_train, _X_test, y_train, _y_test = split

    estimator.fit(X_train, y_train)

    # The predictions are discarded; only the frame of rows to predict is
    # forwarded to fr6_print_information.
    rows_to_predict, _predictions = fr5_extract_X_to_predict(features, labels, estimator)

    return fr6_print_information(features, estimator, rows_to_predict, True)
In [5]:
def full_framework_normal(df_x, df_y, 
                           model_with_name, 
                           columns=None,
                           normalize=False, 
                           test_size=0.34, seed=42, 
                           verbosity=0, 
                           all_in=False,
                           n_ensamble=0,
                           submit=False):
    """Train, score and optionally export predictions for a single model.

    Args:
        df_x: feature DataFrame; restricted to ``columns`` before use.
        df_y: label DataFrame, forwarded to the ``fr*`` helpers.
        model_with_name: ``(name, model)`` pair.
        columns: columns of ``df_x`` to keep; falsy means all of them.
        normalize: forwarded to ``fr1_extract_X_y``; also appends
            ``_normalized`` to the model name.
        test_size: forwarded to ``fr2_train_test_split``.
        seed: forwarded to ``fr2_train_test_split``.
        verbosity: ``>= 0`` prints the metric summary, ``>= 1`` also prints
            the columns used.
        all_in: when true, the model is refit on the full ``(X, y)`` after
            scoring and before predicting; also appends ``_all_in`` to the
            model name.
        n_ensamble: when non-zero, the fit/score/predict cycle is repeated
            that many times and predictions and metrics are averaged; also
            appends ``_ensamble_<n>`` to the model name.
        submit: when true, a CSV is written through ``fr8_to_csv``.

    Returns:
        ``(model, auc)``, or ``(model, auc, csv_name, message)`` when
        ``submit`` is true.
    """
        
    if not columns: columns=df_x.columns.tolist()
    model_df_x = df_x[columns]   
    model_df_y = df_y

    
    model_name,model = model_with_name   
    # The name suffixes record the options used; the name ends up in the
    # printed summary and in the CSV file name.
    if n_ensamble: model_name+=f'_ensamble_{n_ensamble}'
    if normalize: model_name+='_normalized'
    if all_in: model_name+='_all_in'
    
    X, y = fr1_extract_X_y(model_df_x, model_df_y, normalize)
    X_train, X_test, y_train, y_test = fr2_train_test_split(X, y, seed, test_size)
        
    # Single-run path: fit on the training split and score on the hold-out.
    # NOTE(review): when n_ensamble != 0 the metrics and predictions computed
    # here are overwritten by the loop below.
    model.fit(X_train,y_train)
    accuracy, auc, aucpr = fr4_metric_score(X_test, y_test, model, model_name)

    if all_in:
        # Metrics above come from the train-only fit; predictions below come
        # from the model refit on every labelled row.
        model.fit(X,y)

    X_to_predict, predictions = fr5_extract_X_to_predict(model_df_x, model_df_y, model)
    
    if not n_ensamble == 0:
        total_predictions = 0
        tmp_seed = seed
        accuracy, auc, aucpr = 0,0,0
        for i in range(n_ensamble):
            print(f'Iteración {i+1} de ensamble de {n_ensamble}')
            # NOTE(review): tmp_seed is updated on every iteration but never
            # passed to the model or to the split, so every iteration refits
            # on the same X_train/y_train.
            tmp_seed = tmp_seed + i
            
            model.fit(X_train,y_train)
            accuracy_tmp, auc_tmp, aucpr_tmp = fr4_metric_score(X_test, y_test, model, model_name)
            accuracy += accuracy_tmp
            auc += auc_tmp
            aucpr += aucpr_tmp
              
            if all_in: model = model.fit(X,y)
            X_to_predict, predictions = fr5_extract_X_to_predict(model_df_x, model_df_y, model)
            total_predictions += predictions

        # Average the accumulated predictions and metrics over the iterations.
        predictions = total_predictions / n_ensamble
        accuracy /= n_ensamble
        auc /= n_ensamble    
        aucpr /= n_ensamble    
                
    if verbosity >=0: 
        print(f'Model: {model_name} - AUC: {auc:.4f} - AUCPR:{aucpr:.4f} - Accuracy: {accuracy:.4f} ')
    if verbosity >=1:
        print(f'{columns}')
        
    if submit:
        csv_name, submission = fr8_to_csv(X_to_predict, predictions, model_name, auc)
        message = f"{model_name} - {model.get_params()} - {columns}"
        # `display` is not defined or imported in this file; presumably the
        # IPython/Jupyter builtin — confirm before running outside a notebook.
        display(csv_name)
        display(message)
        return model, auc, csv_name, message
    
    return model, auc
In [1]:
def full_framework_ensemble(df_x, df_y, models_with_name, 
                           columns=None,
                           normalize=False, 
                           test_size=0.34, seed=42, 
                           verbosity=0, 
                           all_in=False,
                           n_ensamble=0,
                           submit=False):
    """Train, score and optionally export predictions for a soft-voting ensemble.

    Args:
        df_x: feature DataFrame; restricted to ``columns`` before use.
        df_y: label DataFrame, forwarded to the ``fr*`` helpers.
        models_with_name: ``(name, estimators)`` pair; ``estimators`` is
            passed as the ``estimators`` argument of ``VotingClassifier``.
        columns: columns of ``df_x`` to keep; falsy means all of them.
        normalize: forwarded to ``fr1_extract_X_y``; also appends
            ``_normalized`` to the ensemble name.
        test_size: forwarded to ``fr2_train_test_split``.
        seed: forwarded to ``fr2_train_test_split``.
        verbosity: ``>= 1`` also prints the columns used; the metric summary
            is always printed.
        all_in: when true, the ensemble is refit on the full ``(X, y)``
            before predicting; also appends ``_all_in`` to the name.
        n_ensamble: when non-zero, the fit/cross-validate/predict cycle is
            repeated that many times; also appends ``_ensamble_<n>`` to the
            name.
        submit: when true, a CSV is written through ``fr8_to_csv``.

    Returns:
        ``(ensemble, auc)``, or ``(ensemble, auc, csv_name, message)`` when
        ``submit`` is true.
    """

    if not columns: columns=df_x.columns.tolist()
    model_df_x = df_x[columns]   
    model_df_y = df_y

    models_name,models = models_with_name   
    if n_ensamble: models_name+=f'_ensamble_{n_ensamble}'
    if normalize: models_name+='_normalized'
    if all_in: models_name+='_all_in'
            
    X, y = fr1_extract_X_y(model_df_x, model_df_y, normalize)
    # NOTE(review): X_test and y_test are never used below; all metrics come
    # from 5-fold cross-validation on the training split.
    X_train, X_test, y_train, y_test = fr2_train_test_split(X, y, seed, test_size)
    
    # VotingClassifier with voting='soft' == averaging
    # VotingClassifier with voting='hard' == majority voting
    params = {'flatten_transform': None, 'n_jobs': None, 'voting': 'soft', 'weights': None}
    ensemble = VotingClassifier(estimators=models, **params)          
    ensemble = ensemble.fit(X_train, y_train)
    auc = cross_val_score(ensemble, X_train, y_train, cv=5, scoring='roc_auc')
    accuracy = cross_val_score(ensemble, X_train, y_train, cv=5, scoring='accuracy')
    # NOTE(review): in this single-run path the best fold is reported, not
    # the mean over folds; the loop below reports means instead.
    auc = max(auc)
    accuracy = max(accuracy)
    
    if all_in:
        ensemble.fit(X,y)

    X_to_predict, predictions = fr5_extract_X_to_predict(model_df_x, model_df_y, ensemble)
    
    if not n_ensamble == 0:
        total_predictions = 0
        tmp_seed = seed
        max_seed = seed
        accuracy, auc = 0,0
        for i in range(n_ensamble):
            print(f'Iteración {i+1} de ensamble de {n_ensamble}')
            # NOTE(review): tmp_seed is never passed to the ensemble, the
            # split or the cross-validation; it is only recorded as max_seed.
            tmp_seed = tmp_seed + i
            
            ensemble.fit(X_train,y_train)
            auc_tmp = cross_val_score(ensemble, X_train, y_train, cv=5, scoring='roc_auc')
            accuracy_tmp = cross_val_score(ensemble, X_train, y_train, cv=5, scoring='accuracy')
            # Keep the scores of the iteration with the highest mean AUC so far.
            if np.mean(auc_tmp) > np.mean(auc): 
                auc = auc_tmp
                accuracy = accuracy_tmp
                max_seed = tmp_seed
            if all_in: ensemble = ensemble.fit(X,y)
            # Collapse to scalars so the next comparison and the final print
            # work on the best mean values.
            auc = np.mean(auc)
            accuracy = np.mean(accuracy)
            # NOTE(review): predictions are overwritten on every iteration, so
            # the final ones come from the last iteration, not necessarily
            # from the best-scoring one; total_predictions is never updated.
            X_to_predict, predictions = fr5_extract_X_to_predict(model_df_x, model_df_y, ensemble)

        print(f'El mejor seed: {max_seed}')
                
    print(f'Model: {models_name} - AUC: {auc:.4f} - Accuracy: {accuracy:.4f}')
    if verbosity >=1:
        print(f'{columns}')
        
    if submit:
        csv_name, submission = fr8_to_csv(X_to_predict, predictions, models_name, auc)
        message = f"{models_name} - {ensemble.get_params()} - {columns}"
        # `display` is not defined or imported in this file; presumably the
        # IPython/Jupyter builtin — confirm before running outside a notebook.
        display(csv_name)
        display(message)
        return ensemble, auc, csv_name, message
    
    return ensemble, auc
In [7]:
def full_framework_wrapper(df_x, df_y, 
                           model_with_name, 
                           columns=None,
                           normalize=False, 
                           test_size=0.34, seed=42, 
                           verbosity=0, 
                           all_in=False,
                           n_ensamble=0,
                           submit=False):
    """Dispatch to the single-model or the ensemble framework.

    A ``+`` in the name part of ``model_with_name`` selects
    ``full_framework_ensemble``; any other name selects
    ``full_framework_normal``. All arguments are forwarded unchanged and the
    selected function's return value is returned as is.
    """
    name, _payload = model_with_name

    # A '+' in the name marks a combination of several models.
    runner = full_framework_ensemble if '+' in name else full_framework_normal

    return runner(
        df_x,
        df_y,
        model_with_name,
        columns,
        normalize,
        test_size,
        seed,
        verbosity,
        all_in,
        n_ensamble,
        submit,
    )