[75.06 / 95.58] Organización de Datos
Trabajo Práctico 2: Machine Learning

Feature Engineering¶

Grupo 30: Datatouille

http://fdelmazo.github.io/7506-Datos/

En este notebook se buscan atributos nuevos para concatenar al set de datos original, así pudiendo armar un modelo predictivo más robusto y eficiente.

import pandas as pd
import numpy as np
import sklearn.cluster as cluster
from sklearn.feature_extraction import FeatureHasher
import hdbscan

df_events = pd.read_csv('./data/events_up_to_01062018.csv', low_memory=False)
df_sessions = pd.read_csv('./data/sessions.csv', low_memory=False)
df_brands = pd.read_csv('./data/brands.csv')
df_os = pd.read_csv('./data/os.csv')
df_browsers = pd.read_csv('./data/browsers.csv')
df_prices = pd.read_csv('./data/prices.csv')

df = df_events.merge(df_sessions, how='left', left_index=True, right_index=True)
df = df.merge(df_browsers, how='left', on='browser_version')
df = df.merge(df_os, how='left', on='operating_system_version')
df = df.merge(df_brands, how='left', on='model')
df = df.merge(df_prices, how='left', on='sku')

df.columns

Index(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version', 'time_diff_min',
       'session_id', 'session_total_events', 'session_cumno', 'session_first',
       'session_last', 'session_total_conversions', 'session_has_conversion',
       'session_total_checkouts', 'session_has_checkout', 'session_ad',
       'session_timestamp_first', 'session_timestamp_last', 'browser',
       'operating_system', 'brand', 'precio_reales'],
      dtype='object')

with pd.option_context('display.max_column',0):
  display(df.sample(5))

df['timestamp'] = pd.to_datetime(df['timestamp'])
df['session_timestamp_first'] = pd.to_datetime(df['session_timestamp_first'])
df['session_timestamp_last'] = pd.to_datetime(df['session_timestamp_last'])
df['month_number'] = df['timestamp'].dt.month

df['is_brand_listing'] = df['event'] == 'brand listing'
df['is_viewed_product'] = df['event'] == 'viewed product'
df['is_conversion'] = df['event'] == 'conversion'
df['is_checkout'] = df['event'] == 'checkout'
df['session_checkout_first'] = df['session_first'] & df['session_has_checkout']
df['session_conversion_first'] = df['session_first'] & df['session_has_conversion']
df['session_ad_first'] = df['session_first'] & df['session_ad']
df['session_ad_checkout_event'] = df['session_first'] & df['session_checkout_first']
df['session_ad_conversion_event'] = df['session_first'] & df['session_conversion_first']

df.columns

Index(['timestamp', 'event', 'person', 'url', 'sku', 'model', 'condition',
       'storage', 'color', 'skus', 'search_term', 'staticpage',
       'campaign_source', 'search_engine', 'channel', 'new_vs_returning',
       'city', 'region', 'country', 'device_type', 'screen_resolution',
       'operating_system_version', 'browser_version', 'time_diff_min',
       'session_id', 'session_total_events', 'session_cumno', 'session_first',
       'session_last', 'session_total_conversions', 'session_has_conversion',
       'session_total_checkouts', 'session_has_checkout', 'session_ad',
       'session_timestamp_first', 'session_timestamp_last', 'browser',
       'operating_system', 'brand', 'precio_reales', 'month_number',
       'is_brand_listing', 'is_viewed_product', 'is_conversion', 'is_checkout',
       'session_checkout_first', 'session_conversion_first',
       'session_ad_first', 'session_ad_checkout_event',
       'session_ad_conversion_event'],
      dtype='object')

Suma total de eventos¶

Checkouts y conversiones en total por usuario.

A motivos prácticos, en todo este notebook con eventos nos referimos solamente a:

Checkout: Un usuario ingresa al checkout de compra de un producto
Conversión: Un usuario realiza una conversión comprando un producto.

udf_tmp1 = df.groupby('person').agg({'is_brand_listing':'sum',
                                     'is_viewed_product':'sum', 
                                     'is_checkout':'sum', 
                                     'is_conversion':'sum', 
                                     'event':'count',
                                     'session_first':'sum', 'session_total_conversions':'sum',
                                     'session_checkout_first':'sum',
                                     'session_conversion_first':'sum',
                                     'session_ad':'sum',
                                     'session_ad_first':'sum'})

# Cosa rara que aparece sola
del udf_tmp1['session_total_conversions']

udf_tmp1.columns = ['total_brand_listings',
                    'total_viewed_products',
                    'total_checkouts',
                    'total_conversions',
                    'total_events',
                    'total_sessions',
                    'total_session_checkout',
                    'total_session_conversion',
                    'total_events_ad_session',
                    'total_ad_sessions']


udf_tmp1['avg_events_per_session'] = (udf_tmp1['total_events'] / udf_tmp1['total_sessions']).replace([np.inf, -np.inf], np.nan).fillna(0)
udf_tmp1['avg_events_per_ad_session'] = (udf_tmp1['total_events_ad_session'] / udf_tmp1['total_ad_sessions']).replace([np.inf, -np.inf], np.nan).fillna(0)
udf_tmp1['percentage_session_ad'] = udf_tmp1['total_ad_sessions'] / udf_tmp1['total_sessions']
udf_tmp1['percentage_session_conversion'] = udf_tmp1['total_session_conversion'] / udf_tmp1['total_sessions']

udf_tmp1['has_checkout'] = (udf_tmp1['total_checkouts'] > 0).astype('int')
udf_tmp1['has_conversion'] = (udf_tmp1['total_conversions'] > 0).astype('int')


udf_tmp1.head()

Cantidad de eventos por mes¶

Checkouts y conversiones por usuario por mes

udf_tmp2 = df['person'].drop_duplicates().to_frame().set_index('person')

for i in range(1,6):
    gb = df[df['month_number'] == i].groupby('person')
    udf_tmp2i = gb.agg({'is_viewed_product':'sum',
                                     'is_checkout':'sum', 
                                     'is_conversion':'sum', 
                                     'event':'count',
                                     'session_first':'sum',                                                                         'session_total_conversions':'sum',
                                     'session_checkout_first':'sum',
                                     'session_conversion_first':'sum',
                                     'session_ad':'sum',
                                     'session_ad_first':'sum'})
    
    # Cosa rara que aparece sola
    del udf_tmp2i['session_total_conversions']

    udf_tmp2i.columns = ['total_viewed_products_month_{}'.format(i),
                    'total_checkouts_month_{}'.format(i),
                    'total_conversions_month_{}'.format(i),
                    'total_events_month_{}'.format(i),
                    'total_sessions_month_{}'.format(i),
                    'total_session_checkouts_month_{}'.format(i),
                    'total_session_conversions_month_{}'.format(i),
                    'total_events_ad_session_month_{}'.format(i),
                    'total_ad_sessions_month_{}'.format(i)]

    
    udf_tmp2i['has_checkout_month_{}'.format(i)] = (udf_tmp2i['total_checkouts_month_{}'.format(i)] > 0).astype('int')
    udf_tmp2i['has_conversion_month_{}'.format(i)] = (udf_tmp2i['total_conversions_month_{}'.format(i)] > 0).astype('int')

    udf_tmp2 = udf_tmp2.merge(udf_tmp2i, how='outer', left_index=True, right_index=True)

udf_tmp2 = udf_tmp2.fillna(0)

udf_tmp2.head()

Eventos sin contar mayo¶

Checkouts y conversiones por usuario sin contar mayo (último mes registrado)

gb = df[df['month_number'] != 5].groupby('person')

udf_tmp3 = gb.agg({'is_viewed_product':'sum',
                   'is_checkout':'sum', 
                   'is_conversion':'sum', 
                   'event':'count',
                   'session_first':'sum',                                                                         
                   'session_total_conversions':'sum',
                   'session_checkout_first':'sum',
                   'session_conversion_first':'sum',
                   'session_ad':'sum',
                   'session_ad_first':'sum'})
    
# Cosa rara que aparece sola
del udf_tmp3['session_total_conversions']

udf_tmp3.columns = ['total_viewed_products_months_1_to_4',
                    'total_checkouts_months_1_to_4',
                    'total_conversions_months_1_to_4',
                    'total_events_months_1_to_4',
                    'total_sessions_months_1_to_4',
                    'total_session_checkouts_months_1_to_4',
                    'total_session_conversions_months_1_to_4',
                    'total_events_ad_session_months_1_to_4',
                    'total_ad_sessions_months_1_to_4']


udf_tmp3['has_checkout_months_1_to_4'] = (udf_tmp3['total_checkouts_months_1_to_4'] > 0).astype('int')
udf_tmp3['has_conversion_months_1_to_4'] = (udf_tmp3['total_conversions_months_1_to_4'] > 0).astype('int')


udf_tmp3.head()

Eventos en última semana¶

Checkouts y conversiones por usuario en la última semana registrada

gb = df[df['timestamp'] >= pd.to_datetime('2018-05-23')].groupby('person')

udf_tmp4 = gb.agg({'is_brand_listing':'sum',
                   'is_viewed_product':'sum', 
                   'is_checkout':'sum', 
                   'is_conversion':'sum', 
                   'event':'count',
                   'session_first':'sum',                                                                         
                   'session_total_conversions':'sum',
                   'session_checkout_first':'sum',
                   'session_conversion_first':'sum',
                   'session_ad':'sum',
                   'session_ad_first':'sum'})
    
# Cosa rara que aparece sola
del udf_tmp4['session_total_conversions']

udf_tmp4.columns = ['total_brand_listings_lw',
                    'total_viewed_products_lw',
                    'total_checkouts_lw',
                    'total_conversions_lw',
                    'total_events_lw',
                    'total_sessions_lw',
                    'total_session_checkouts_lw',
                    'total_session_conversions_lw',
                    'total_events_ad_session_lw',
                    'total_ad_sessions_lw']


udf_tmp4['has_checkout_lw'] = (udf_tmp4['total_checkouts_lw'] > 0).astype('int')
udf_tmp4['has_conversion_lw'] = (udf_tmp4['total_conversions_lw'] > 0).astype('int')

udf_tmp4 = udf_tmp4.merge(udf_tmp1[['total_events']], how='left', on='person')

udf_tmp4['percentage_last_week_activity'] = udf_tmp4['total_events_lw'] / udf_tmp4['total_events']
udf_tmp4['percentage_last_week_brand_listings'] = udf_tmp4['total_brand_listings_lw'] / udf_tmp4['total_events']
udf_tmp4['percentage_last_week_viewed_products'] = udf_tmp4['total_viewed_products_lw'] / udf_tmp4['total_events']
udf_tmp4['percentage_last_week_checkouts'] = udf_tmp4['total_checkouts_lw'] / udf_tmp4['total_events']
udf_tmp4['percentage_last_week_conversions'] = udf_tmp4['total_conversions_lw'] / udf_tmp4['total_events']

del udf_tmp4['total_events']

udf_tmp4.head()

Distribución mensual de las conversiones¶

Cuan esparcidas (en meses) estan las conversiones de los usuarios

udf_tmp5 = udf_tmp2['has_conversion_month_1']
for i in range(2,6):
    udf_tmp5 = udf_tmp5 + udf_tmp2['has_conversion_month_{}'.format(i)]
    
udf_tmp5 = udf_tmp5.to_frame()
udf_tmp5.columns = ['amount_of_months_that_has_bought']

for i in range(6):
    print('Users that have bought in {} different months: {}'.format(i, len(udf_tmp5[udf_tmp5['amount_of_months_that_has_bought'] >= i])))

udf_tmp5.head()

Users that have bought in 0 different months: 38829
Users that have bought in 1 different months: 4293
Users that have bought in 2 different months: 330
Users that have bought in 3 different months: 34
Users that have bought in 4 different months: 5
Users that have bought in 5 different months: 0

Informacion de los últimos eventos registrados por usuario¶

Información como días hasta la última conversión, que día de la semana fue esta y demás.

(En esta celda por evento nos referimos a cualquier tipo de evento)

df_event = df.groupby('person').agg({'timestamp':'max'})
df_event.columns = ['timestamp_last_event']

df_checkout = df[df['event'] == 'checkout']
df_checkout = df_checkout.groupby('person').agg({'timestamp': 'max'})
df_checkout.columns = ['timestamp_last_checkout']

df_conversion = df[df['event'] == 'conversion']
df_conversion = df_conversion.groupby('person').agg({'timestamp': 'max'})
df_conversion.columns = ['timestamp_last_conversion']

df_viewed_product = df[df['event'] == 'viewed_product']
df_viewed_product = df_viewed_product.groupby('person').agg({'timestamp': 'max'})
df_viewed_product.columns = ['timestamp_last_viewed_product']

df_timelapse = df_event.merge(df_checkout, how='outer', on='person')
df_timelapse = df_timelapse.merge(df_conversion, how='outer', on='person')
df_timelapse = df_timelapse.merge(df_viewed_product, how='outer', on='person')

#Fecha arbitraria cuando el usuario nunca hizo checkout/conversion/viewed_product
df_timelapse = df_timelapse.fillna(pd.to_datetime('2018-01-01')) 

df_timelapse.head()

udf_tmp6 = df_timelapse.loc[:]

udf_tmp6['days_to_last_event'] = pd.to_datetime('2018-06-01').dayofyear - df_timelapse['timestamp_last_event'].dt.dayofyear

udf_tmp6['days_to_last_checkout'] = df_timelapse['timestamp_last_event'].dt.dayofyear - df_timelapse['timestamp_last_checkout'].dt.dayofyear
udf_tmp6['days_to_last_conversion'] = df_timelapse['timestamp_last_event'].dt.dayofyear - df_timelapse['timestamp_last_conversion'].dt.dayofyear
udf_tmp6['days_to_last_viewed_product'] = df_timelapse['timestamp_last_event'].dt.dayofyear - df_timelapse['timestamp_last_viewed_product'].dt.dayofyear

udf_tmp6.head()

udf_tmp6['doy_last_event'] = udf_tmp6['timestamp_last_event'].dt.dayofyear
udf_tmp6['dow_last_event'] = udf_tmp6['timestamp_last_event'].dt.dayofweek
udf_tmp6['dom_last_event'] = udf_tmp6['timestamp_last_event'].dt.day
udf_tmp6['woy_last_event'] = udf_tmp6['timestamp_last_event'].dt.weekofyear

udf_tmp6['doy_last_checkout'] = udf_tmp6['timestamp_last_checkout'].dt.dayofyear
udf_tmp6['dow_last_checkout'] = udf_tmp6['timestamp_last_checkout'].dt.dayofweek
udf_tmp6['dom_last_checkout'] = udf_tmp6['timestamp_last_checkout'].dt.day
udf_tmp6['woy_last_checkout'] = udf_tmp6['timestamp_last_checkout'].dt.weekofyear

udf_tmp6['doy_last_conversion'] = udf_tmp6['timestamp_last_conversion'].dt.dayofyear
udf_tmp6['dow_last_conversion'] = udf_tmp6['timestamp_last_conversion'].dt.dayofweek
udf_tmp6['dom_last_conversion'] = udf_tmp6['timestamp_last_conversion'].dt.day
udf_tmp6['woy_last_conversion'] = udf_tmp6['timestamp_last_conversion'].dt.weekofyear

udf_tmp6['doy_last_viewed_product'] = udf_tmp6['timestamp_last_viewed_product'].dt.dayofyear
udf_tmp6['dow_last_viewed_product'] = udf_tmp6['timestamp_last_viewed_product'].dt.dayofweek
udf_tmp6['dom_last_viewed_product'] = udf_tmp6['timestamp_last_viewed_product'].dt.day
udf_tmp6['woy_last_viewed_product'] = udf_tmp6['timestamp_last_viewed_product'].dt.weekofyear

udf_tmp6 = udf_tmp6.astype('int')
udf_tmp6.head()

Precios de la ultima conversion realizada por el usuario¶

df_prices.head()

udf_tmp7 = df[df['event'] == 'conversion']
udf_tmp7.set_index('person', inplace=True)
udf_tmp7 = udf_tmp7.groupby('person').agg({'timestamp': 'max','sku':'max'})
udf_tmp7.reset_index(inplace=True)
udf_tmp7 = udf_tmp7.merge(df_prices, how='inner', on='sku')
udf_tmp7['sku'] =  udf_tmp7['sku'].astype('int')
udf_tmp7 = udf_tmp7.rename(columns={'precio_reales': 'last_conversion_price', 'sku': 'last_conversion_sku'})

udf_tmp7.set_index('person',inplace=True)
del udf_tmp7['timestamp']
udf_tmp7['last_conversion_price'] =  udf_tmp7['last_conversion_price'].astype('float')
udf_tmp7.head()

Porcentaje de la actividad de la ultima semana¶

Cantidad de eventos de la ultima semana sobre el total

Porcentaje de la actividad del ultimo mes¶

Cantidad de eventos del último mes sobre el total

udf_tmp9 = df.groupby('person').agg({'event': 'count'})
udf_tmp9 = udf_tmp9.rename(columns = {'event': 'total_events'})

df_last_month = df[df['timestamp'] > pd.to_datetime('2018-05-01')]

df_last_month_event = df_last_month.groupby('person').agg({'event': 'count'})
df_last_month_event = df_last_month_event.rename(columns = {'event': 'events_last_month'})

df_last_month_conversion = df_last_month[df_last_month['event'] == 'conversion']
df_last_month_conversion = df_last_month_conversion.groupby('person').agg({'event': 'count'})
df_last_month_conversion = df_last_month_conversion.rename(columns = {'event': 'conversions_last_month'})

df_last_month_checkout = df_last_month[df_last_month['event'] == 'checkout']
df_last_month_checkout = df_last_month_checkout.groupby('person').agg({'event': 'count'})
df_last_month_checkout = df_last_month_checkout.rename(columns = {'event': 'checkouts_last_month'})

df_last_month_viewed_product = df_last_month[df_last_month['event'] == 'viewed product']
df_last_month_viewed_product = df_last_month_viewed_product.groupby('person').agg({'event': 'count'})
df_last_month_viewed_product = df_last_month_viewed_product.rename(columns = {'event': 'viewed_products_last_month'})

udf_tmp9 = udf_tmp9.merge(df_last_month_event, how='outer', on='person')
udf_tmp9 = udf_tmp9.merge(df_last_month_conversion, how='outer', on='person')
udf_tmp9 = udf_tmp9.merge(df_last_month_checkout, how='outer', on='person')
udf_tmp9 = udf_tmp9.merge(df_last_month_viewed_product, how='outer', on='person')

udf_tmp9 = udf_tmp9.fillna(0) #Si no tuvo actividad en la última semana el porcentaje es 0

udf_tmp9['percentage_last_month_activity'] = udf_tmp9['events_last_month'] / udf_tmp9['total_events']
udf_tmp9['percentage_last_month_conversions'] = udf_tmp9['conversions_last_month'] / udf_tmp9['total_events']
udf_tmp9['percentage_last_month_checkouts'] = udf_tmp9['checkouts_last_month'] / udf_tmp9['total_events']
udf_tmp9['percentage_last_month_viewed_products'] = udf_tmp9['viewed_products_last_month'] / udf_tmp9['total_events']

del udf_tmp9['total_events']
del udf_tmp9['events_last_month']
del udf_tmp9['conversions_last_month']
del udf_tmp9['checkouts_last_month']
del udf_tmp9['viewed_products_last_month']

udf_tmp9.head()

Días entre el último checkout y última actividad¶

Se guardan los días entre el último evento del usuario y el último checkout

df_last_checkout = df[df['event'] == 'checkout']
df_last_checkout = df_last_checkout.groupby('person').agg({'timestamp': 'max'})
df_last_checkout = df_last_checkout.rename(columns = {'timestamp': 'timestamp_last_checkout'})
df_last_event = df.groupby('person').agg({'timestamp': 'max'})
df_last_event = df_last_event.rename(columns = {'timestamp': 'timestamp_last_event'})

udf_tmp10 = df_last_checkout.merge(df_last_event, how='outer', on='person')
udf_tmp10['days_between_last_event_and_checkout'] = udf_tmp10['timestamp_last_event'].dt.dayofyear - udf_tmp10['timestamp_last_checkout'].dt.dayofyear
udf_tmp10 = udf_tmp10.fillna(180) #Se utiliza el tope de tiempo si no tuvo una conversión
udf_tmp10['days_between_last_event_and_checkout'] = udf_tmp10['days_between_last_event_and_checkout'].astype('int')
del udf_tmp10['timestamp_last_event']
del udf_tmp10['timestamp_last_checkout']

udf_tmp10.head()

Estados de celulares¶

Utilizando la lógica de que hay empresas que compran celulares en mal estado y en una primera instancia (obviamente) lo ven, se agrega una columna que indique porcentaje de celulares en estado Bom - Sem Touch ID vs Bom sobre todos los celulares vistos

df_viewed_product = df[df['event'] == 'viewed product']

df_regular_phones = df_viewed_product[(df_viewed_product['condition'] == 'Bom') | (df_viewed_product['condition'] == 'Bom - Sem Touch ID')]
df_regular_phones = df_regular_phones.groupby('person').agg({'condition': 'count'})
df_regular_phones = df_regular_phones.rename(columns = {'condition': 'amount_regular_phones'})

df_all_phones = df_viewed_product.groupby('person').agg({'condition': 'count'})
df_all_phones = df_all_phones.rename(columns = {'condition': 'amount_total_phones'})

udf_tmp11 = df_regular_phones.merge(df_all_phones, how='outer', on='person')

udf_tmp11['percentage_regular_celphones_activity'] = udf_tmp11['amount_regular_phones'] / udf_tmp11['amount_total_phones']

del udf_tmp11['amount_regular_phones']
del udf_tmp11['amount_total_phones']

udf_tmp11.head()

Varianza logarítmica de productos vistos¶

udf_tmp12 = df[df['event'] == 'viewed product']
udf_tmp12 = udf_tmp12.groupby('person').agg({'precio_reales': 'var'})
udf_tmp12['precio_reales'] = np.log(udf_tmp12['precio_reales']) 
udf_tmp12['precio_reales'] = udf_tmp12['precio_reales'].replace([np.inf, -np.inf], 0)
udf_tmp12 = udf_tmp12.rename(columns={'precio_reales': 'var_viewed'})
udf_tmp12.head()

/home/delmazo/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:3: RuntimeWarning: divide by zero encountered in log
  This is separate from the ipykernel package so we can avoid doing imports until

¿El usuario compró más de la media?¶

conversions = df[df['event'] == 'conversion']
monto_total = np.sum(conversions['precio_reales'])
cantidad = conversions['precio_reales'].count()
media = monto_total/cantidad

udf_tmp13 = conversions.groupby('person').agg({'precio_reales': 'max'})
udf_tmp13['conversion_gt_media'] = (udf_tmp13['precio_reales'] > media).astype('int')
udf_tmp13['conversion_gt_media'] = udf_tmp13['conversion_gt_media'].astype('int')

del udf_tmp13['precio_reales']

udf_tmp13.head()

¿Cuantas veces vio el último modelo que compró?¶

En un primer lugar se guarda en un df cuantas veces vio cada modelo cada usuario

udf_tmp14 = df[df['event'] == 'viewed product']
udf_tmp14 = udf_tmp14.groupby('person')['model'].value_counts()
udf_tmp14 = udf_tmp14.to_frame()
udf_tmp14 = udf_tmp14.rename(columns={'model':'cant_viewed'})
udf_tmp14 = udf_tmp14.unstack()
udf_tmp14 = udf_tmp14.fillna(0)

udf_tmp14 = udf_tmp14.xs('cant_viewed', axis=1, drop_level=True)

udf_tmp14_prima = udf_tmp14.copy()
udf_tmp14_prima['total_viewed_products'] = udf_tmp14.sum(axis=1)
udf_tmp14_prima['total_max_viewed_product'] = udf_tmp14.max(axis=1)
udf_tmp14_prima['percentage_max_viewed_product'] = udf_tmp14_prima['total_max_viewed_product'] / udf_tmp14_prima['total_viewed_products']
udf_tmp14_prima = udf_tmp14_prima[[
    'total_viewed_products', 
    'total_max_viewed_product',
    'percentage_max_viewed_product'
]]

del udf_tmp14_prima['total_viewed_products']

display(udf_tmp14.head())
udf_tmp14_prima.head()

udf_tmp14_bis = df[df['event'] == 'conversion']
udf_tmp14_bis.set_index('person', inplace=True)
udf_tmp14_bis = udf_tmp14_bis.groupby('person')['timestamp','sku','model'].agg({'timestamp': 'max','sku':'max'})

udf_tmp14_bis = udf_tmp14_bis.xs('timestamp', axis=1, drop_level=True)

del udf_tmp14_bis['timestamp']
del udf_tmp14_bis['sku']

udf_tmp14_bis.head()

udf_tmp14_merge = udf_tmp14_bis.merge(udf_tmp14, how='inner', on='person')

models = list(udf_tmp14_merge['model'])
columns = list(udf_tmp14_merge.columns)
cant_viewed_model_last_conversion = []

for i in range(udf_tmp14_merge.shape[0]):
    actual_model = models[i]
    position_model = columns.index(actual_model)
    cant = udf_tmp14_merge.iloc[i,position_model]
    cant_viewed_model_last_conversion.append(cant)

udf_tmp14_merge['cant_viewed_model_last_conversion'] = cant_viewed_model_last_conversion
udf_tmp14 = udf_tmp14_merge[['cant_viewed_model_last_conversion']]
udf_tmp14['cant_viewed_model_last_conversion'] = udf_tmp14['cant_viewed_model_last_conversion'].astype('int')

udf_tmp14.head()

/home/delmazo/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app

¿Cuántas veces vio la última marca que compró?¶

En un primer lugar se guarda en un df cuantas veces vio cada marca cada usuario

udf_tmp15 = df[df['event'] == 'viewed product']
udf_tmp15 = udf_tmp15.groupby('person')['brand'].value_counts()
udf_tmp15 = udf_tmp15.to_frame()
udf_tmp15 = udf_tmp15.rename(columns={'model':'cant_viewed'})
udf_tmp15 = udf_tmp15.unstack()
udf_tmp15 = udf_tmp15.fillna(0)

udf_tmp15 = udf_tmp15.xs('brand', axis=1, drop_level=True)
udf_tmp15.head()

udf_tmp15_bis = df[df['event'] == 'conversion']
udf_tmp15_bis.set_index('person', inplace=True)
udf_tmp15_bis = udf_tmp15_bis.groupby('person')['timestamp','sku','brand'].agg({'timestamp': 'max','sku':'max'})

udf_tmp15_bis = udf_tmp15_bis.xs('timestamp', axis=1, drop_level=True)

del udf_tmp15_bis['timestamp']
del udf_tmp15_bis['sku']

udf_tmp15_bis.head()

udf_tmp15_merge = udf_tmp15_bis.merge(udf_tmp15, how='inner', on='person')

brands = list(udf_tmp15_merge['brand'])
columns = list(udf_tmp15_merge.columns)
cant_viewed_brand_last_conversion = []

for i in range(udf_tmp15_merge.shape[0]):
    actual_brand = brands[i]
    position_brand = columns.index(actual_brand)
    cant = udf_tmp15_merge.iloc[i,position_brand]
    cant_viewed_brand_last_conversion.append(cant)

udf_tmp15_merge['cant_viewed_brand_last_conversion'] = cant_viewed_brand_last_conversion
udf_tmp15 = udf_tmp15_merge[['cant_viewed_brand_last_conversion']]
udf_tmp15['cant_viewed_brand_last_conversion'] = udf_tmp15['cant_viewed_brand_last_conversion'].astype('int')

udf_tmp15.head()

/home/delmazo/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app

Comportamiento en sesiones de las últimas semanas¶

tmp = df[df['session_timestamp_first'] > pd.to_datetime('2018-05-23')]
gb = tmp.groupby('person')

udf_tmp16 = gb.agg({
    'session_first':'sum',
})

udf_tmp16.columns = ['total_sessions_last_week']

udf_tmp16 = udf_tmp16.merge(udf_tmp1[['total_sessions']], how='right', on='person')

udf_tmp16['ratio_sessions_last_week_over_total'] = udf_tmp16['total_sessions_last_week'] / udf_tmp16['total_sessions']
udf_tmp16['has_event_last_week'] = 1

del udf_tmp16['total_sessions']

display(udf_tmp16.head())

¿Cuántas veces entró cada usuario a las static pages?¶

udf_tmp17 = df[(df['staticpage'] == 'FaqEcommerce') | (df['staticpage'] == 'CustomerService')]
udf_tmp17 = udf_tmp17.groupby('person')['staticpage'].value_counts()
udf_tmp17 = udf_tmp17.to_frame()
udf_tmp17 = udf_tmp17.rename(columns={'staticpage':'cant_staticpage'})
udf_tmp17 = udf_tmp17.unstack()
udf_tmp17 = udf_tmp17.fillna(0)

udf_tmp17 = udf_tmp17.xs('cant_staticpage', axis=1, drop_level=True)
udf_tmp17.rename(columns={'CustomerService':'cant_visitas_customer_service','FaqEcommerce':'cant_visitas_faq_ecommerce'},inplace=True)
udf_tmp17.head()

¿Cuántas veces vio el modelo que más vio la última semana?¶

udf_tmp50 = df[(df['timestamp'] >= pd.to_datetime('2018-05-23')) & (df['event'] == 'viewed product')]
udf_tmp50 = udf_tmp50.groupby('person')['model'].value_counts()
udf_tmp50 = udf_tmp50.to_frame()
udf_tmp50 = udf_tmp50.rename(columns={'model':'cant_viewed'})
udf_tmp50 = udf_tmp50.unstack()
udf_tmp50 = udf_tmp50.fillna(0)

udf_tmp50 = udf_tmp50.xs('cant_viewed', axis=1, drop_level=True)

udf_tmp50_prima = udf_tmp50.copy()
udf_tmp50_prima['total_last_week_viewed_products'] = udf_tmp50.sum(axis=1)
udf_tmp50_prima['total_last_week_max_viewed_model'] = udf_tmp50.max(axis=1)
udf_tmp50_prima['percentage_last_week_max_viewed_model'] = \
 udf_tmp50_prima['total_last_week_max_viewed_model'] / udf_tmp50_prima['total_last_week_viewed_products']
udf_tmp50_prima = udf_tmp50_prima[[
    'total_last_week_max_viewed_model',
    'percentage_last_week_max_viewed_model'
]]

display(udf_tmp50.head())
display(udf_tmp50_prima.head())

¿Cuántas veces vio la marca que más vio la última semana?¶

udf_tmp51 = df[(df['timestamp'] >= pd.to_datetime('2018-05-23')) & (df['event'] == 'viewed product')]
udf_tmp51 = udf_tmp51.groupby('person')['brand'].value_counts()
udf_tmp51 = udf_tmp51.to_frame()
udf_tmp51 = udf_tmp51.rename(columns={'brand':'cant_viewed'})
udf_tmp51 = udf_tmp51.unstack()
udf_tmp51 = udf_tmp51.fillna(0)

udf_tmp51 = udf_tmp51.xs('cant_viewed', axis=1, drop_level=True)

udf_tmp51_prima = udf_tmp51.copy()
udf_tmp51_prima['total_last_week_viewed_products'] = udf_tmp51.sum(axis=1)
udf_tmp51_prima['total_last_week_max_viewed_brand'] = udf_tmp51.max(axis=1)
udf_tmp51_prima['percentage_last_week_max_viewed_brand'] = \
 udf_tmp51_prima['total_last_week_max_viewed_brand'] / udf_tmp51_prima['total_last_week_viewed_products']
udf_tmp51_prima = udf_tmp51_prima[[
    'total_last_week_max_viewed_brand',
    'percentage_last_week_max_viewed_brand'
]]


display(udf_tmp51.head())
display(udf_tmp51_prima.head())

Marca más vista¶

# One hot encoding casero
udf_tmp52 = df[df['event'] == 'viewed product'].groupby('person')['brand'].value_counts().to_frame().unstack().fillna(0)
udf_tmp52 = udf_tmp52['brand']
udf_tmp52.columns = 'most_viewed_brand_is_' + udf_tmp52.columns

udf_tmp52['most_viewed'] = udf_tmp52.max(axis=1)

for c in udf_tmp52.columns:
    udf_tmp52[c] = udf_tmp52[c] - udf_tmp52['most_viewed']
    
del udf_tmp52['most_viewed']

udf_tmp52 = (udf_tmp52 >= 0).astype('int')

udf_tmp52

Marca más vista la última semana¶

# One hot encoding casero
udf_tmp53 = df[(df['timestamp'] > pd.to_datetime('2018-05-23')) & (df['event'] == 'viewed product')].groupby('person')['brand'].value_counts().to_frame().unstack().fillna(0)
udf_tmp53 = udf_tmp53['brand']
udf_tmp53.columns = 'most_viewed_brand_lw_is_' + udf_tmp53.columns

udf_tmp53['most_viewed'] = udf_tmp53.max(axis=1)

for c in udf_tmp53.columns:
    udf_tmp53[c] = udf_tmp53[c] - udf_tmp53['most_viewed']
    
del udf_tmp53['most_viewed']

udf_tmp53 = (udf_tmp53 >= 0).astype('int')

udf_tmp53

Acciones Últimas 2 semanas¶

tmp = df[df['session_timestamp_first'] > pd.to_datetime('2018-05-16')]
gb = tmp.groupby('person')

udf_tmp54 = gb.agg({
    'session_first':'sum',
})

udf_tmp54.columns = ['total_sessions_l2w']

udf_tmp54 = udf_tmp54.merge(udf_tmp1[['total_sessions']], how='right', on='person')

udf_tmp54['ratio_sessions_l2w_over_total'] = udf_tmp54['total_sessions_l2w'] / udf_tmp54['total_sessions']

del udf_tmp54['total_sessions']

udf_tmp54 = udf_tmp54.fillna(0)

display(udf_tmp54.head())

gb = df[df['timestamp'] > pd.to_datetime('2018-05-16')].groupby('person')

udf_tmp55 = gb.agg({'is_brand_listing':'sum',
                   'is_viewed_product':'sum', 
                   'is_checkout':'sum', 
                   'is_conversion':'sum', 
                   'event':'count',
                   'session_first':'sum',                                                                         
                   'session_total_conversions':'sum',
                   'session_checkout_first':'sum',
                   'session_conversion_first':'sum',
                   'session_ad':'sum',
                   'session_ad_first':'sum'})
    
# Cosa rara que aparece sola
del udf_tmp55['session_total_conversions']

udf_tmp55.columns = ['total_brand_listings_l2w',
                    'total_viewed_products_l2w',
                    'total_checkouts_l2w',
                    'total_conversions_l2w',
                    'total_events_l2w',
                    'total_sessions_l2w',
                    'total_session_checkouts_l2w',
                    'total_session_conversions_l2w',
                    'total_events_ad_session_l2w',
                    'total_ad_sessions_l2w']


udf_tmp55['has_checkout_l2w'] = (udf_tmp55['total_checkouts_l2w'] > 0).astype('int')
udf_tmp55['has_conversion_l2w'] = (udf_tmp55['total_conversions_l2w'] > 0).astype('int')

udf_tmp55 = udf_tmp55.merge(udf_tmp1[['total_events']], how='left', on='person')

udf_tmp55['percentage_l2w_week_activity'] = udf_tmp55['total_events_l2w'] / udf_tmp55['total_events']
udf_tmp55['percentage_l2w_brand_listings'] = udf_tmp55['total_brand_listings_l2w'] / udf_tmp55['total_events']
udf_tmp55['percentage_l2w_viewed_products'] = udf_tmp55['total_viewed_products_l2w'] / udf_tmp55['total_events']
udf_tmp55['percentage_l2w_checkouts'] = udf_tmp55['total_checkouts_l2w'] / udf_tmp55['total_events']
udf_tmp55['percentage_l2w_conversions'] = udf_tmp55['total_conversions_l2w'] / udf_tmp55['total_events']

del udf_tmp55['total_events']

udf_tmp55.head()

Se guarda todo en user-features.csv

udf = udf_tmp1
udf = udf.merge(udf_tmp2, how='outer', on='person')
udf = udf.merge(udf_tmp3, how='outer', on='person') # No se encuentra utilidad, pero mejora un poco
udf = udf.merge(udf_tmp4, how='outer', on='person')
udf = udf.merge(udf_tmp5, how='outer', on='person')
udf = udf.merge(udf_tmp6, how='outer', on='person')
udf = udf.merge(udf_tmp7, how='outer', on='person')
# udf = udf.merge(udf_tmp8, how='outer', on='person') # Lo saqué, ahora se calcula con udf_tmp4
udf = udf.merge(udf_tmp9, how='outer', on='person')
udf = udf.merge(udf_tmp10, how='outer', on='person')
udf = udf.merge(udf_tmp11, how='outer', on='person')
udf = udf.merge(udf_tmp12, how='outer', on='person') 
udf = udf.merge(udf_tmp13, how='outer', on='person')
#udf = udf.merge(udf_tmp14, how='outer', on='person')
udf = udf.merge(udf_tmp14_prima, how='outer', on='person')
udf = udf.merge(udf_tmp15, how='outer', on='person')
udf = udf.merge(udf_tmp16, how='outer', on='person')
udf = udf.merge(udf_tmp17, how='outer', on='person')
#udf = udf.merge(udf_tmp50, how='outer', on='person')
udf = udf.merge(udf_tmp50_prima, how='outer', on='person')
#udf = udf.merge(udf_tmp51, how='outer', on='person')
udf = udf.merge(udf_tmp51_prima, how='outer', on='person')
udf = udf.merge(udf_tmp52, how='outer', on='person')
udf = udf.merge(udf_tmp53, how='outer', on='person')
udf = udf.merge(udf_tmp54, how='outer', on='person')
udf = udf.merge(udf_tmp55, how='outer', on='person')

udf = udf.fillna(0)
display(udf.head())
display(udf.columns.tolist())

['total_brand_listings',
 'total_viewed_products',
 'total_checkouts',
 'total_conversions',
 'total_events',
 'total_sessions',
 'total_session_checkout',
 'total_session_conversion',
 'total_events_ad_session',
 'total_ad_sessions',
 'avg_events_per_session',
 'avg_events_per_ad_session',
 'percentage_session_ad',
 'percentage_session_conversion',
 'has_checkout',
 'has_conversion',
 'total_viewed_products_month_1',
 'total_checkouts_month_1',
 'total_conversions_month_1',
 'total_events_month_1',
 'total_sessions_month_1',
 'total_session_checkouts_month_1',
 'total_session_conversions_month_1',
 'total_events_ad_session_month_1',
 'total_ad_sessions_month_1',
 'has_checkout_month_1',
 'has_conversion_month_1',
 'total_viewed_products_month_2',
 'total_checkouts_month_2',
 'total_conversions_month_2',
 'total_events_month_2',
 'total_sessions_month_2',
 'total_session_checkouts_month_2',
 'total_session_conversions_month_2',
 'total_events_ad_session_month_2',
 'total_ad_sessions_month_2',
 'has_checkout_month_2',
 'has_conversion_month_2',
 'total_viewed_products_month_3',
 'total_checkouts_month_3',
 'total_conversions_month_3',
 'total_events_month_3',
 'total_sessions_month_3',
 'total_session_checkouts_month_3',
 'total_session_conversions_month_3',
 'total_events_ad_session_month_3',
 'total_ad_sessions_month_3',
 'has_checkout_month_3',
 'has_conversion_month_3',
 'total_viewed_products_month_4',
 'total_checkouts_month_4',
 'total_conversions_month_4',
 'total_events_month_4',
 'total_sessions_month_4',
 'total_session_checkouts_month_4',
 'total_session_conversions_month_4',
 'total_events_ad_session_month_4',
 'total_ad_sessions_month_4',
 'has_checkout_month_4',
 'has_conversion_month_4',
 'total_viewed_products_month_5',
 'total_checkouts_month_5',
 'total_conversions_month_5',
 'total_events_month_5',
 'total_sessions_month_5',
 'total_session_checkouts_month_5',
 'total_session_conversions_month_5',
 'total_events_ad_session_month_5',
 'total_ad_sessions_month_5',
 'has_checkout_month_5',
 'has_conversion_month_5',
 'total_viewed_products_months_1_to_4',
 'total_checkouts_months_1_to_4',
 'total_conversions_months_1_to_4',
 'total_events_months_1_to_4',
 'total_sessions_months_1_to_4',
 'total_session_checkouts_months_1_to_4',
 'total_session_conversions_months_1_to_4',
 'total_events_ad_session_months_1_to_4',
 'total_ad_sessions_months_1_to_4',
 'has_checkout_months_1_to_4',
 'has_conversion_months_1_to_4',
 'total_brand_listings_lw',
 'total_viewed_products_lw',
 'total_checkouts_lw',
 'total_conversions_lw',
 'total_events_lw',
 'total_sessions_lw',
 'total_session_checkouts_lw',
 'total_session_conversions_lw',
 'total_events_ad_session_lw',
 'total_ad_sessions_lw',
 'has_checkout_lw',
 'has_conversion_lw',
 'percentage_last_week_activity',
 'percentage_last_week_brand_listings',
 'percentage_last_week_viewed_products',
 'percentage_last_week_checkouts',
 'percentage_last_week_conversions',
 'amount_of_months_that_has_bought',
 'timestamp_last_event',
 'timestamp_last_checkout',
 'timestamp_last_conversion',
 'timestamp_last_viewed_product',
 'days_to_last_event',
 'days_to_last_checkout',
 'days_to_last_conversion',
 'days_to_last_viewed_product',
 'doy_last_event',
 'dow_last_event',
 'dom_last_event',
 'woy_last_event',
 'doy_last_checkout',
 'dow_last_checkout',
 'dom_last_checkout',
 'woy_last_checkout',
 'doy_last_conversion',
 'dow_last_conversion',
 'dom_last_conversion',
 'woy_last_conversion',
 'doy_last_viewed_product',
 'dow_last_viewed_product',
 'dom_last_viewed_product',
 'woy_last_viewed_product',
 'last_conversion_sku',
 'last_conversion_price',
 'percentage_last_month_activity',
 'percentage_last_month_conversions',
 'percentage_last_month_checkouts',
 'percentage_last_month_viewed_products',
 'days_between_last_event_and_checkout',
 'percentage_regular_celphones_activity',
 'var_viewed',
 'conversion_gt_media',
 'total_max_viewed_product',
 'percentage_max_viewed_product',
 'cant_viewed_brand_last_conversion',
 'total_sessions_last_week',
 'ratio_sessions_last_week_over_total',
 'has_event_last_week',
 'cant_visitas_customer_service',
 'cant_visitas_faq_ecommerce',
 'total_last_week_max_viewed_model',
 'percentage_last_week_max_viewed_model',
 'total_last_week_max_viewed_brand',
 'percentage_last_week_max_viewed_brand',
 'most_viewed_brand_is_asus',
 'most_viewed_brand_is_ipad',
 'most_viewed_brand_is_iphone',
 'most_viewed_brand_is_lenovo',
 'most_viewed_brand_is_lg',
 'most_viewed_brand_is_motorola',
 'most_viewed_brand_is_quantum',
 'most_viewed_brand_is_samsung',
 'most_viewed_brand_is_sony',
 'most_viewed_brand_lw_is_asus',
 'most_viewed_brand_lw_is_ipad',
 'most_viewed_brand_lw_is_iphone',
 'most_viewed_brand_lw_is_lenovo',
 'most_viewed_brand_lw_is_lg',
 'most_viewed_brand_lw_is_motorola',
 'most_viewed_brand_lw_is_quantum',
 'most_viewed_brand_lw_is_samsung',
 'most_viewed_brand_lw_is_sony',
 'total_sessions_l2w_x',
 'ratio_sessions_l2w_over_total',
 'total_brand_listings_l2w',
 'total_viewed_products_l2w',
 'total_checkouts_l2w',
 'total_conversions_l2w',
 'total_events_l2w',
 'total_sessions_l2w_y',
 'total_session_checkouts_l2w',
 'total_session_conversions_l2w',
 'total_events_ad_session_l2w',
 'total_ad_sessions_l2w',
 'has_checkout_l2w',
 'has_conversion_l2w',
 'percentage_l2w_week_activity',
 'percentage_l2w_brand_listings',
 'percentage_l2w_viewed_products',
 'percentage_l2w_checkouts',
 'percentage_l2w_conversions']

Clustering¶

# K-Means
udf['kmeans_3'] = cluster.KMeans(n_clusters=3).fit_predict(udf)
udf['kmeans_5'] = cluster.KMeans(n_clusters=5).fit_predict(udf)
udf['kmeans_6'] = cluster.KMeans(n_clusters=6).fit_predict(udf)
udf['kmeans_15'] = cluster.KMeans(n_clusters=15).fit_predict(udf)
udf['kmeans_25'] = cluster.KMeans(n_clusters=25).fit_predict(udf)

# HDBScan
# Deja columnas negativas, entonces varios algoritmos no pueden hacer uso de ella
# udf['hdbscan_22'] = hdbscan.HDBSCAN(min_cluster_size=22).fit_predict(udf)

udf.reset_index().to_csv('data/user-features.csv', index=False)

udf.head()

# Confirmamos no haber perdido datos en el medio
assert(len(udf)==len(df['person'].unique()))
display(len(udf))

38829

users = pd.read_csv("data/user-features.csv", low_memory=False)
with pd.option_context('display.max_column',0):
    display(users.head())
    display(users.shape)
    
len(users.columns)

(38829, 189)

189

	timestamp	event	person	url	sku	model	condition	storage	color	skus	search_term	staticpage	campaign_source	search_engine	channel	new_vs_returning	city	region	country	device_type	screen_resolution	operating_system_version	browser_version	time_diff_min	session_id	session_total_events	session_cumno	session_first	session_last	session_has_conversion	session_total_checkouts	session_has_checkout	session_ad	session_timestamp_first	session_timestamp_last	browser	operating_system	brand	precio_reales
1573168	2018-05-30 03:49:18	viewed product	7adf932c	NaN	382.0	Samsung Galaxy S5	Excelente	16GB	Preto	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.166667	3	16	10	False	False	False	1.0	True	False	2018-05-30 03:47:48	2018-05-30 03:49:31	NaN	NaN	samsung	619.0
1087065	2018-05-27 13:43:28	viewed product	dd7add03	NaN	6314.0	Samsung Galaxy Note 5	Muito Bom	32GB	Preto	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.033333	4	4	2	False	False	False	1.0	True	True	2018-05-27 13:43:26	2018-05-27 13:43:51	NaN	NaN	samsung	2329.0
2329107	2018-05-19 16:04:08	visited site	0996ef9b	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Paid	New	Belo Horizonte	Minas Gerais	Brazil	Smartphone	640x360	Android 5.1	Chrome Mobile 39	0.000000	0	4	2	False	False	False	1.0	True	True	2018-05-19 16:04:08	2018-05-19 16:04:18	chrome mobile	android	NaN	NaN
1904437	2018-05-18 17:47:31	generic listing	38183d00	NaN	NaN	NaN	NaN	NaN	NaN	6594,6651,7239,7225,2820,6706,480,12604,7252,1...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.016667	5	17	6	False	False	False	0.0	False	True	2018-05-18 17:41:17	2018-05-18 18:03:39	NaN	NaN	NaN	NaN
231090	2018-05-25 19:45:31	search engine hit	36fc008b	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Google	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	60.383333	19	10	0	True	False	False	0.0	False	False	2018-05-25 19:45:31	2018-05-25 20:08:26	NaN	NaN	NaN	NaN

	total_brand_listings	total_viewed_products	total_checkouts	total_conversions	total_events	total_sessions	total_session_checkout	total_session_conversion	total_events_ad_session	total_ad_sessions	avg_events_per_session	avg_events_per_ad_session	percentage_session_ad	percentage_session_conversion	has_checkout	has_conversion
person
0008ed71	0.0	0.0	3.0	0.0	6	3.0	3.0	0.0	0.0	0.0	2.000000	0.0	0.000000	0.0	1	0
00091926	25.0	372.0	2.0	0.0	448	34.0	2.0	0.0	54.0	9.0	13.176471	6.0	0.264706	0.0	1	0
00091a7a	5.0	3.0	0.0	0.0	10	1.0	0.0	0.0	10.0	1.0	10.000000	10.0	1.000000	0.0	0	0
000ba417	24.0	153.0	6.0	1.0	206	5.0	4.0	1.0	0.0	0.0	41.200000	0.0	0.000000	0.2	1	1
000c79fe	0.0	3.0	1.0	0.0	17	1.0	1.0	0.0	17.0	1.0	17.000000	17.0	1.000000	0.0	1	0

	total_viewed_products_month_1	total_checkouts_month_1	total_conversions_month_1	total_events_month_1	total_sessions_month_1	total_session_checkouts_month_1	total_session_conversions_month_1	total_events_ad_session_month_1	total_ad_sessions_month_1	has_checkout_month_1	...	total_checkouts_month_5	total_conversions_month_5	total_events_month_5	total_sessions_month_5	total_session_checkouts_month_5	total_session_conversions_month_5	total_events_ad_session_month_5	total_ad_sessions_month_5	has_checkout_month_5	has_conversion_month_5
person
0008ed71	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	3.0	0.0	6.0	3.0	3.0	0.0	0.0	0.0	1.0	0.0
00091926	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	2.0	0.0	448.0	34.0	2.0	0.0	54.0	9.0	1.0	0.0
00091a7a	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
000ba417	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	6.0	1.0	206.0	5.0	4.0	1.0	0.0	0.0	1.0	1.0
000c79fe	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	1.0	0.0	17.0	1.0	1.0	0.0	17.0	1.0	1.0	0.0

	total_viewed_products_months_1_to_4	total_checkouts_months_1_to_4	total_conversions_months_1_to_4	total_events_months_1_to_4	total_sessions_months_1_to_4	total_session_checkouts_months_1_to_4	total_session_conversions_months_1_to_4	total_events_ad_session_months_1_to_4	total_ad_sessions_months_1_to_4	has_checkout_months_1_to_4	has_conversion_months_1_to_4
person
00091a7a	3.0	0.0	0.0	10	1.0	0.0	0.0	10.0	1.0	0	0
0019e639	66.0	11.0	1.0	181	7.0	3.0	1.0	118.0	6.0	1	1
001ca5ee	26.0	0.0	0.0	42	10.0	0.0	0.0	4.0	1.0	0	0
001dfc31	6.0	0.0	0.0	18	3.0	0.0	0.0	11.0	2.0	0	0
001e9aea	16.0	3.0	0.0	49	5.0	2.0	0.0	49.0	5.0	1	0

	total_brand_listings_lw	total_viewed_products_lw	total_checkouts_lw	total_conversions_lw	total_events_lw	total_sessions_lw	total_session_checkouts_lw	total_session_conversions_lw	total_events_ad_session_lw	total_ad_sessions_lw	has_checkout_lw	has_conversion_lw	percentage_last_week_activity	percentage_last_week_brand_listings	percentage_last_week_viewed_products	percentage_last_week_checkouts	percentage_last_week_conversions
person
00091926	3.0	63.0	1.0	0.0	83	8.0	1.0	0.0	29.0	5.0	1	0	0.185268	0.006696	0.140625	0.002232	0.000000
000ba417	9.0	115.0	3.0	1.0	138	2.0	2.0	1.0	0.0	0.0	1	1	0.669903	0.043689	0.558252	0.014563	0.004854
000c79fe	0.0	3.0	1.0	0.0	17	1.0	1.0	0.0	17.0	1.0	1	0	1.000000	0.000000	0.176471	0.058824	0.000000
000e4d9e	9.0	100.0	0.0	0.0	117	8.0	0.0	0.0	0.0	0.0	0	0	0.284672	0.021898	0.243309	0.000000	0.000000
0010e89a	0.0	1.0	1.0	0.0	4	1.0	1.0	0.0	4.0	1.0	1	0	1.000000	0.000000	0.250000	0.250000	0.000000

	timestamp_last_event	timestamp_last_checkout	timestamp_last_conversion	timestamp_last_viewed_product
person
0008ed71	2018-05-17 16:28:37	2018-05-17 16:28:37	2018-01-01 00:00:00	2018-01-01
00091926	2018-05-31 19:52:03	2018-05-26 00:30:47	2018-01-01 00:00:00	2018-01-01
00091a7a	2018-03-26 14:56:58	2018-01-01 00:00:00	2018-01-01 00:00:00	2018-01-01
000ba417	2018-05-26 13:09:22	2018-05-26 13:03:02	2018-05-26 13:09:22	2018-01-01
000c79fe	2018-05-29 00:38:07	2018-05-29 00:29:51	2018-01-01 00:00:00	2018-01-01

	timestamp_last_event	timestamp_last_checkout	timestamp_last_conversion	timestamp_last_viewed_product	days_to_last_event	days_to_last_checkout	days_to_last_conversion	days_to_last_viewed_product	doy_last_event	dow_last_event	...	dom_last_checkout	woy_last_checkout	doy_last_conversion	dow_last_conversion	dom_last_conversion	woy_last_conversion	doy_last_viewed_product	dow_last_viewed_product	dom_last_viewed_product	woy_last_viewed_product
person
0008ed71	1526574517000000000	1526574517000000000	1514764800000000000	1514764800000000000	15	0	136	136	137	3	...	17	20	1	0	1	1	1	0	1	1
00091926	1527796323000000000	1527294647000000000	1514764800000000000	1514764800000000000	1	5	150	150	151	3	...	26	21	1	0	1	1	1	0	1	1
00091a7a	1522076218000000000	1514764800000000000	1514764800000000000	1514764800000000000	67	84	84	84	85	0	...	1	1	1	0	1	1	1	0	1	1
000ba417	1527340162000000000	1527339782000000000	1527340162000000000	1514764800000000000	6	0	0	145	146	5	...	26	21	146	5	26	21	1	0	1	1
000c79fe	1527554287000000000	1527553791000000000	1514764800000000000	1514764800000000000	3	0	148	148	149	1	...	29	22	1	0	1	1	1	0	1	1

	sku	precio_reales
0	9288.0	749.0
1	304.0	1679.0
2	6888.0	2329.0
3	11890.0	2469.0
4	7517.0	1939.0

	last_conversion_sku	last_conversion_price
person
000ba417	7631	2469.0
5ce12b07	7631	2469.0
66006845	7631	2469.0
6cfd9cf2	7631	2469.0
bf7365e2	7631	2469.0

	percentage_regular_celphones_activity
person
00091926	0.274194
00091a7a	0.333333
000ba417	0.718954
000c79fe	1.000000
000e4d9e	0.365782

	var_viewed
person
00091926	12.733277
00091a7a	11.990401
000ba417	12.878211
000c79fe	0.000000
000e4d9e	13.378811

model	Asus Live	Asus Zenfone 2	Asus Zenfone 2 Deluxe	Asus Zenfone 2 Laser	Asus Zenfone 3 Max 32 GB	Asus Zenfone 3 Max 16 GB	Asus Zenfone 5	Asus Zenfone 6	Asus Zenfone Go	Asus Zenfone Selfie	...	iPhone 6	iPhone 6 Plus	iPhone 6S	iPhone 6S Plus	iPhone 7	iPhone 7 Plus	iPhone 8	iPhone 8 Plus	iPhone SE	iPhone X
person
00091926	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	5.0	41.0	94.0	51.0	45.0	9.0	0.0	2.0	3.0	1.0
00091a7a	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0
000ba417	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0
000c79fe	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	3.0	0.0	0.0	0.0	0.0	0.0
000e4d9e	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	1.0	0.0	1.0	0.0	0.0	1.0	11.0	0.0

	model
person
000ba417	Motorola Moto G3 4G
001001be	iPhone 5s
001804a2	iPhone 6
0019e639	Samsung Galaxy Win Duos
001b0bf9	Samsung Galaxy S8 Plus

brand	asus	ipad	iphone	lenovo	lg	motorola	quantum	samsung	sony
person
00091926	0.0	1.0	251.0	1.0	2.0	55.0	0.0	61.0	1.0
00091a7a	0.0	0.0	3.0	0.0	0.0	0.0	0.0	0.0	0.0
000ba417	0.0	0.0	8.0	0.0	4.0	35.0	0.0	105.0	1.0
000c79fe	0.0	0.0	3.0	0.0	0.0	0.0	0.0	0.0	0.0
000e4d9e	0.0	0.0	21.0	0.0	1.0	2.0	0.0	300.0	15.0

	brand
person
000ba417	motorola
001001be	iphone
001804a2	iphone
0019e639	samsung
001b0bf9	samsung

	most_viewed_brand_is_asus	most_viewed_brand_is_ipad	most_viewed_brand_is_iphone	most_viewed_brand_is_lenovo	most_viewed_brand_is_lg	most_viewed_brand_is_motorola	most_viewed_brand_is_quantum	most_viewed_brand_is_samsung	most_viewed_brand_is_sony
person
00091926	0	0	1	0	0	0	0	0	0
00091a7a	0	0	1	0	0	0	0	0	0
000ba417	0	0	0	0	0	0	0	1	0
000c79fe	0	0	1	0	0	0	0	0	0
000e4d9e	0	0	0	0	0	0	0	1	0
000e619d	0	0	0	0	0	0	0	1	0
001001be	0	0	1	0	0	0	0	0	0
0010e89a	0	0	0	0	0	0	0	1	0
0016c4b5	0	0	0	0	0	0	0	1	0
001802e4	0	0	1	0	0	0	0	0	0
0019e639	0	0	0	0	0	1	0	0	0
001a2273	0	0	0	0	0	0	0	1	0
001b0bf9	0	0	0	0	0	0	0	1	0
001ca5ee	0	0	1	0	0	0	0	0	0
001dfc31	0	0	1	0	0	0	0	0	0
001e9aea	0	0	1	0	0	0	0	0	0
0020152e	0	0	0	0	0	0	0	1	0
00202cdb	0	0	1	0	0	0	0	0	0
0022965d	0	0	0	0	0	0	0	1	0
0027574e	0	0	0	0	0	0	0	1	0
00295d74	0	0	0	0	0	0	0	1	0
002aea56	0	0	1	0	0	0	0	0	0
002b0188	0	0	0	0	0	0	0	1	0
002e74b2	0	0	1	0	0	0	0	0	0
002ed810	0	0	0	0	0	0	0	1	0
002fb981	0	0	1	0	0	0	0	0	0
00317e49	0	0	1	0	0	0	0	0	0
0031c75f	0	0	0	0	0	1	0	0	0
00344873	0	0	0	0	0	0	0	1	0
003476c0	0	0	1	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...
ffc670b3	0	0	1	0	0	0	0	0	0
ffcaae19	0	0	0	0	0	0	0	1	0
ffcaeaae	0	0	0	0	0	1	0	0	0
ffccd5a1	0	0	0	0	0	0	0	1	0
ffcf952b	0	0	0	0	0	0	0	1	0
ffd41214	0	0	1	0	0	0	0	0	0
ffd62616	0	0	0	0	0	0	0	1	0
ffd9756a	0	0	1	0	0	0	0	0	0

	most_viewed_brand_lw_is_asus	most_viewed_brand_lw_is_ipad	most_viewed_brand_lw_is_iphone	most_viewed_brand_lw_is_lenovo	most_viewed_brand_lw_is_lg	most_viewed_brand_lw_is_motorola	most_viewed_brand_lw_is_quantum	most_viewed_brand_lw_is_samsung	most_viewed_brand_lw_is_sony
person
00091926	0	0	1	0	0	0	0	0	0
000ba417	0	0	0	0	0	0	0	1	0
000c79fe	0	0	1	0	0	0	0	0	0
000e4d9e	0	0	0	0	0	0	0	1	0
0010e89a	0	0	0	0	0	0	0	1	0
0016c4b5	0	0	0	0	0	0	0	1	0
001802e4	0	0	1	0	0	0	0	0	0
001ca5ee	0	0	1	0	0	0	0	0	0
001dfc31	0	0	1	0	0	0	0	0	0
0020152e	0	0	0	0	0	0	0	1	0
0022965d	0	0	0	0	0	0	0	1	0
00295d74	0	0	0	0	0	0	0	1	0
002e74b2	0	0	1	0	0	0	0	0	0
002ed810	0	0	0	0	0	0	0	1	0
002fb981	0	0	1	0	0	0	0	0	0
0031c75f	0	0	0	0	0	1	0	0	0
003847fc	0	0	0	0	0	0	0	1	0
0038a117	0	0	0	0	0	0	0	1	0
0041fec6	0	0	0	0	0	0	0	1	0
004ed8ba	0	0	0	0	0	0	0	1	0
0050d971	0	0	0	0	0	0	0	1	0
00546e55	0	0	0	0	0	0	0	1	0
0059a996	0	0	0	0	0	0	0	1	0
005e06ef	0	0	0	0	0	0	0	1	0
0061cbe8	0	0	0	0	0	0	0	1	0
006549da	0	0	1	0	0	0	0	0	0
00665dda	0	0	0	0	1	0	0	0	0
006bb312	0	1	0	0	0	0	0	0	0
006e1534	0	0	1	0	0	0	0	0	0
00736d10	0	0	1	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...
ff8787ad	0	0	1	0	0	0	0	0	0
ff8798aa	0	0	0	0	0	0	0	1	0
ff884f1a	0	0	0	0	0	0	0	1	0
ff89d18f	0	0	1	0	0	0	0	0	0
ff8a5f4a	0	0	0	0	0	0	0	1	0
ff8a9723	0	0	0	0	0	1	0	0	0
ff97d56f	0	0	1	0	0	0	0	0	0
ff97e281	0	0	0	0	0	0	0	1	0

	total_sessions_last_week	ratio_sessions_last_week_over_total	has_event_last_week
person
00091926	8.0	0.235294	1
000ba417	2.0	0.400000	1
000c79fe	1.0	1.000000	1
000e4d9e	8.0	0.615385	1
0010e89a	1.0	1.000000	1

brand	asus	ipad	iphone	lenovo	lg	motorola	quantum	samsung	sony
person
00091926	0.0	0.0	56.0	0.0	0.0	4.0	0.0	3.0	0.0
000ba417	0.0	0.0	6.0	0.0	4.0	35.0	0.0	69.0	1.0
000c79fe	0.0	0.0	3.0	0.0	0.0	0.0	0.0	0.0	0.0
000e4d9e	0.0	0.0	2.0	0.0	0.0	0.0	0.0	98.0	0.0
0010e89a	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0

[75.06 / 95.58] Organización de Datos Trabajo Práctico 2: Machine Learning