Group 30: Datatouille
In this notebook we look for new features to concatenate to the original dataset, so that a more robust and efficient predictive model can be built.
import pandas as pd
import numpy as np
import sklearn.cluster as cluster
from sklearn.feature_extraction import FeatureHasher
import hdbscan
df_events = pd.read_csv('./data/events_up_to_01062018.csv', low_memory=False)
df_sessions = pd.read_csv('./data/sessions.csv', low_memory=False)
df_brands = pd.read_csv('./data/brands.csv')
df_os = pd.read_csv('./data/os.csv')
df_browsers = pd.read_csv('./data/browsers.csv')
df_prices = pd.read_csv('./data/prices.csv')
df = df_events.merge(df_sessions, how='left', left_index=True, right_index=True)
df = df.merge(df_browsers, how='left', on='browser_version')
df = df.merge(df_os, how='left', on='operating_system_version')
df = df.merge(df_brands, how='left', on='model')
df = df.merge(df_prices, how='left', on='sku')
df.columns
with pd.option_context('display.max_columns', 0):
    display(df.sample(5))
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['session_timestamp_first'] = pd.to_datetime(df['session_timestamp_first'])
df['session_timestamp_last'] = pd.to_datetime(df['session_timestamp_last'])
df['month_number'] = df['timestamp'].dt.month
df['is_brand_listing'] = df['event'] == 'brand listing'
df['is_viewed_product'] = df['event'] == 'viewed product'
df['is_conversion'] = df['event'] == 'conversion'
df['is_checkout'] = df['event'] == 'checkout'
df['session_checkout_first'] = df['session_first'] & df['session_has_checkout']
df['session_conversion_first'] = df['session_first'] & df['session_has_conversion']
df['session_ad_first'] = df['session_first'] & df['session_ad']
df['session_ad_checkout_event'] = df['session_first'] & df['session_checkout_first']
df['session_ad_conversion_event'] = df['session_first'] & df['session_conversion_first']
df.columns
Total checkouts and conversions per user.
For practical purposes, throughout this notebook by events we refer only to:
Checkout: a user enters the purchase checkout for a product.
Conversion: a user completes a conversion by purchasing a product.
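As a quick sanity check (a minimal sketch, not part of the original pipeline), the exact event labels can be inspected before filtering on them, since the flags built below depend on the strings matching exactly:
# Hypothetical check: confirm how event types are spelled in the data
df['event'].value_counts()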
udf_tmp1 = df.groupby('person').agg({'is_brand_listing':'sum',
'is_viewed_product':'sum',
'is_checkout':'sum',
'is_conversion':'sum',
'event':'count',
'session_first':'sum', 'session_total_conversions':'sum',
'session_checkout_first':'sum',
'session_conversion_first':'sum',
'session_ad':'sum',
'session_ad_first':'sum'})
# Odd column that shows up on its own in the aggregation; drop it
del udf_tmp1['session_total_conversions']
udf_tmp1.columns = ['total_brand_listings',
'total_viewed_products',
'total_checkouts',
'total_conversions',
'total_events',
'total_sessions',
'total_session_checkout',
'total_session_conversion',
'total_events_ad_session',
'total_ad_sessions']
udf_tmp1['avg_events_per_session'] = (udf_tmp1['total_events'] / udf_tmp1['total_sessions']).replace([np.inf, -np.inf], np.nan).fillna(0)
udf_tmp1['avg_events_per_ad_session'] = (udf_tmp1['total_events_ad_session'] / udf_tmp1['total_ad_sessions']).replace([np.inf, -np.inf], np.nan).fillna(0)
udf_tmp1['percentage_session_ad'] = udf_tmp1['total_ad_sessions'] / udf_tmp1['total_sessions']
udf_tmp1['percentage_session_conversion'] = udf_tmp1['total_session_conversion'] / udf_tmp1['total_sessions']
udf_tmp1['has_checkout'] = (udf_tmp1['total_checkouts'] > 0).astype('int')
udf_tmp1['has_conversion'] = (udf_tmp1['total_conversions'] > 0).astype('int')
udf_tmp1.head()
Checkouts and conversions per user, per month
udf_tmp2 = df['person'].drop_duplicates().to_frame().set_index('person')
for i in range(1, 6):
    gb = df[df['month_number'] == i].groupby('person')
    udf_tmp2i = gb.agg({'is_viewed_product':'sum',
                        'is_checkout':'sum',
                        'is_conversion':'sum',
                        'event':'count',
                        'session_first':'sum',
                        'session_total_conversions':'sum',
                        'session_checkout_first':'sum',
                        'session_conversion_first':'sum',
                        'session_ad':'sum',
                        'session_ad_first':'sum'})
    # Odd column that shows up on its own in the aggregation; drop it
    del udf_tmp2i['session_total_conversions']
    udf_tmp2i.columns = ['total_viewed_products_month_{}'.format(i),
                         'total_checkouts_month_{}'.format(i),
                         'total_conversions_month_{}'.format(i),
                         'total_events_month_{}'.format(i),
                         'total_sessions_month_{}'.format(i),
                         'total_session_checkouts_month_{}'.format(i),
                         'total_session_conversions_month_{}'.format(i),
                         'total_events_ad_session_month_{}'.format(i),
                         'total_ad_sessions_month_{}'.format(i)]
    udf_tmp2i['has_checkout_month_{}'.format(i)] = (udf_tmp2i['total_checkouts_month_{}'.format(i)] > 0).astype('int')
    udf_tmp2i['has_conversion_month_{}'.format(i)] = (udf_tmp2i['total_conversions_month_{}'.format(i)] > 0).astype('int')
    udf_tmp2 = udf_tmp2.merge(udf_tmp2i, how='outer', left_index=True, right_index=True)
udf_tmp2 = udf_tmp2.fillna(0)
udf_tmp2.head()
Checkouts and conversions per user, excluding May (the last recorded month)
gb = df[df['month_number'] != 5].groupby('person')
udf_tmp3 = gb.agg({'is_viewed_product':'sum',
'is_checkout':'sum',
'is_conversion':'sum',
'event':'count',
'session_first':'sum',
'session_total_conversions':'sum',
'session_checkout_first':'sum',
'session_conversion_first':'sum',
'session_ad':'sum',
'session_ad_first':'sum'})
# Odd column that shows up on its own in the aggregation; drop it
del udf_tmp3['session_total_conversions']
udf_tmp3.columns = ['total_viewed_products_months_1_to_4',
'total_checkouts_months_1_to_4',
'total_conversions_months_1_to_4',
'total_events_months_1_to_4',
'total_sessions_months_1_to_4',
'total_session_checkouts_months_1_to_4',
'total_session_conversions_months_1_to_4',
'total_events_ad_session_months_1_to_4',
'total_ad_sessions_months_1_to_4']
udf_tmp3['has_checkout_months_1_to_4'] = (udf_tmp3['total_checkouts_months_1_to_4'] > 0).astype('int')
udf_tmp3['has_conversion_months_1_to_4'] = (udf_tmp3['total_conversions_months_1_to_4'] > 0).astype('int')
udf_tmp3.head()
Checkouts and conversions per user during the last recorded week
gb = df[df['timestamp'] >= pd.to_datetime('2018-05-23')].groupby('person')
udf_tmp4 = gb.agg({'is_brand_listing':'sum',
'is_viewed_product':'sum',
'is_checkout':'sum',
'is_conversion':'sum',
'event':'count',
'session_first':'sum',
'session_total_conversions':'sum',
'session_checkout_first':'sum',
'session_conversion_first':'sum',
'session_ad':'sum',
'session_ad_first':'sum'})
# Odd column that shows up on its own in the aggregation; drop it
del udf_tmp4['session_total_conversions']
udf_tmp4.columns = ['total_brand_listings_lw',
'total_viewed_products_lw',
'total_checkouts_lw',
'total_conversions_lw',
'total_events_lw',
'total_sessions_lw',
'total_session_checkouts_lw',
'total_session_conversions_lw',
'total_events_ad_session_lw',
'total_ad_sessions_lw']
udf_tmp4['has_checkout_lw'] = (udf_tmp4['total_checkouts_lw'] > 0).astype('int')
udf_tmp4['has_conversion_lw'] = (udf_tmp4['total_conversions_lw'] > 0).astype('int')
udf_tmp4 = udf_tmp4.merge(udf_tmp1[['total_events']], how='left', on='person')
udf_tmp4['percentage_last_week_activity'] = udf_tmp4['total_events_lw'] / udf_tmp4['total_events']
udf_tmp4['percentage_last_week_brand_listings'] = udf_tmp4['total_brand_listings_lw'] / udf_tmp4['total_events']
udf_tmp4['percentage_last_week_viewed_products'] = udf_tmp4['total_viewed_products_lw'] / udf_tmp4['total_events']
udf_tmp4['percentage_last_week_checkouts'] = udf_tmp4['total_checkouts_lw'] / udf_tmp4['total_events']
udf_tmp4['percentage_last_week_conversions'] = udf_tmp4['total_conversions_lw'] / udf_tmp4['total_events']
del udf_tmp4['total_events']
udf_tmp4.head()
How spread out (in months) the users' conversions are
udf_tmp5 = udf_tmp2['has_conversion_month_1']
for i in range(2, 6):
    udf_tmp5 = udf_tmp5 + udf_tmp2['has_conversion_month_{}'.format(i)]
udf_tmp5 = udf_tmp5.to_frame()
udf_tmp5.columns = ['amount_of_months_that_has_bought']
for i in range(6):
    print('Users that have bought in at least {} different months: {}'.format(i, len(udf_tmp5[udf_tmp5['amount_of_months_that_has_bought'] >= i])))
udf_tmp5.head()
Information such as the number of days until the last conversion, which day of the week it occurred on, and so on.
(In this cell, by event we mean any type of event.)
df_event = df.groupby('person').agg({'timestamp':'max'})
df_event.columns = ['timestamp_last_event']
df_checkout = df[df['event'] == 'checkout']
df_checkout = df_checkout.groupby('person').agg({'timestamp': 'max'})
df_checkout.columns = ['timestamp_last_checkout']
df_conversion = df[df['event'] == 'conversion']
df_conversion = df_conversion.groupby('person').agg({'timestamp': 'max'})
df_conversion.columns = ['timestamp_last_conversion']
df_viewed_product = df[df['event'] == 'viewed product']
df_viewed_product = df_viewed_product.groupby('person').agg({'timestamp': 'max'})
df_viewed_product.columns = ['timestamp_last_viewed_product']
df_timelapse = df_event.merge(df_checkout, how='outer', on='person')
df_timelapse = df_timelapse.merge(df_conversion, how='outer', on='person')
df_timelapse = df_timelapse.merge(df_viewed_product, how='outer', on='person')
# Arbitrary date used when the user never had a checkout / conversion / viewed product
df_timelapse = df_timelapse.fillna(pd.to_datetime('2018-01-01'))
df_timelapse.head()
udf_tmp6 = df_timelapse.copy()
udf_tmp6['days_to_last_event'] = pd.to_datetime('2018-06-01').dayofyear - df_timelapse['timestamp_last_event'].dt.dayofyear
udf_tmp6['days_to_last_checkout'] = df_timelapse['timestamp_last_event'].dt.dayofyear - df_timelapse['timestamp_last_checkout'].dt.dayofyear
udf_tmp6['days_to_last_conversion'] = df_timelapse['timestamp_last_event'].dt.dayofyear - df_timelapse['timestamp_last_conversion'].dt.dayofyear
udf_tmp6['days_to_last_viewed_product'] = df_timelapse['timestamp_last_event'].dt.dayofyear - df_timelapse['timestamp_last_viewed_product'].dt.dayofyear
udf_tmp6.head()
udf_tmp6['doy_last_event'] = udf_tmp6['timestamp_last_event'].dt.dayofyear
udf_tmp6['dow_last_event'] = udf_tmp6['timestamp_last_event'].dt.dayofweek
udf_tmp6['dom_last_event'] = udf_tmp6['timestamp_last_event'].dt.day
udf_tmp6['woy_last_event'] = udf_tmp6['timestamp_last_event'].dt.weekofyear
udf_tmp6['doy_last_checkout'] = udf_tmp6['timestamp_last_checkout'].dt.dayofyear
udf_tmp6['dow_last_checkout'] = udf_tmp6['timestamp_last_checkout'].dt.dayofweek
udf_tmp6['dom_last_checkout'] = udf_tmp6['timestamp_last_checkout'].dt.day
udf_tmp6['woy_last_checkout'] = udf_tmp6['timestamp_last_checkout'].dt.weekofyear
udf_tmp6['doy_last_conversion'] = udf_tmp6['timestamp_last_conversion'].dt.dayofyear
udf_tmp6['dow_last_conversion'] = udf_tmp6['timestamp_last_conversion'].dt.dayofweek
udf_tmp6['dom_last_conversion'] = udf_tmp6['timestamp_last_conversion'].dt.day
udf_tmp6['woy_last_conversion'] = udf_tmp6['timestamp_last_conversion'].dt.weekofyear
udf_tmp6['doy_last_viewed_product'] = udf_tmp6['timestamp_last_viewed_product'].dt.dayofyear
udf_tmp6['dow_last_viewed_product'] = udf_tmp6['timestamp_last_viewed_product'].dt.dayofweek
udf_tmp6['dom_last_viewed_product'] = udf_tmp6['timestamp_last_viewed_product'].dt.day
udf_tmp6['woy_last_viewed_product'] = udf_tmp6['timestamp_last_viewed_product'].dt.weekofyear
udf_tmp6 = udf_tmp6.astype('int')
udf_tmp6.head()
df_prices.head()
udf_tmp7 = df[df['event'] == 'conversion']
udf_tmp7.set_index('person', inplace=True)
udf_tmp7 = udf_tmp7.groupby('person').agg({'timestamp': 'max','sku':'max'})
udf_tmp7.reset_index(inplace=True)
udf_tmp7 = udf_tmp7.merge(df_prices, how='inner', on='sku')
udf_tmp7['sku'] = udf_tmp7['sku'].astype('int')
udf_tmp7 = udf_tmp7.rename(columns={'precio_reales': 'last_conversion_price', 'sku': 'last_conversion_sku'})
udf_tmp7.set_index('person',inplace=True)
del udf_tmp7['timestamp']
udf_tmp7['last_conversion_price'] = udf_tmp7['last_conversion_price'].astype('float')
udf_tmp7.head()
Number of events in the last week over the total.
Number of events in the last month over the total.
udf_tmp9 = df.groupby('person').agg({'event': 'count'})
udf_tmp9 = udf_tmp9.rename(columns = {'event': 'total_events'})
df_last_month = df[df['timestamp'] > pd.to_datetime('2018-05-01')]
df_last_month_event = df_last_month.groupby('person').agg({'event': 'count'})
df_last_month_event = df_last_month_event.rename(columns = {'event': 'events_last_month'})
df_last_month_conversion = df_last_month[df_last_month['event'] == 'conversion']
df_last_month_conversion = df_last_month_conversion.groupby('person').agg({'event': 'count'})
df_last_month_conversion = df_last_month_conversion.rename(columns = {'event': 'conversions_last_month'})
df_last_month_checkout = df_last_month[df_last_month['event'] == 'checkout']
df_last_month_checkout = df_last_month_checkout.groupby('person').agg({'event': 'count'})
df_last_month_checkout = df_last_month_checkout.rename(columns = {'event': 'checkouts_last_month'})
df_last_month_viewed_product = df_last_month[df_last_month['event'] == 'viewed product']
df_last_month_viewed_product = df_last_month_viewed_product.groupby('person').agg({'event': 'count'})
df_last_month_viewed_product = df_last_month_viewed_product.rename(columns = {'event': 'viewed_products_last_month'})
udf_tmp9 = udf_tmp9.merge(df_last_month_event, how='outer', on='person')
udf_tmp9 = udf_tmp9.merge(df_last_month_conversion, how='outer', on='person')
udf_tmp9 = udf_tmp9.merge(df_last_month_checkout, how='outer', on='person')
udf_tmp9 = udf_tmp9.merge(df_last_month_viewed_product, how='outer', on='person')
udf_tmp9 = udf_tmp9.fillna(0) # If the user had no activity in the last month, the share is 0
udf_tmp9['percentage_last_month_activity'] = udf_tmp9['events_last_month'] / udf_tmp9['total_events']
udf_tmp9['percentage_last_month_conversions'] = udf_tmp9['conversions_last_month'] / udf_tmp9['total_events']
udf_tmp9['percentage_last_month_checkouts'] = udf_tmp9['checkouts_last_month'] / udf_tmp9['total_events']
udf_tmp9['percentage_last_month_viewed_products'] = udf_tmp9['viewed_products_last_month'] / udf_tmp9['total_events']
del udf_tmp9['total_events']
del udf_tmp9['events_last_month']
del udf_tmp9['conversions_last_month']
del udf_tmp9['checkouts_last_month']
del udf_tmp9['viewed_products_last_month']
udf_tmp9.head()
Store the number of days between the user's last event and their last checkout.
df_last_checkout = df[df['event'] == 'checkout']
df_last_checkout = df_last_checkout.groupby('person').agg({'timestamp': 'max'})
df_last_checkout = df_last_checkout.rename(columns = {'timestamp': 'timestamp_last_checkout'})
df_last_event = df.groupby('person').agg({'timestamp': 'max'})
df_last_event = df_last_event.rename(columns = {'timestamp': 'timestamp_last_event'})
udf_tmp10 = df_last_checkout.merge(df_last_event, how='outer', on='person')
udf_tmp10['days_between_last_event_and_checkout'] = udf_tmp10['timestamp_last_event'].dt.dayofyear - udf_tmp10['timestamp_last_checkout'].dt.dayofyear
udf_tmp10['days_between_last_event_and_checkout'] = udf_tmp10['days_between_last_event_and_checkout'].fillna(180) # Use the time cap when the user never had a checkout
udf_tmp10['days_between_last_event_and_checkout'] = udf_tmp10['days_between_last_event_and_checkout'].astype('int')
del udf_tmp10['timestamp_last_event']
del udf_tmp10['timestamp_last_checkout']
udf_tmp10.head()
Following the logic that some companies buy phones in poor condition and, naturally, view them first, we add a column with the share of phones viewed in 'Bom' or 'Bom - Sem Touch ID' condition over all phones viewed.
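Before filtering, a quick look at the condition labels (a minimal sketch, not in the original notebook) confirms how they are spelled in the data:
# Hypothetical check of the condition labels used in the filter below
df.loc[df['event'] == 'viewed product', 'condition'].value_counts()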
df_viewed_product = df[df['event'] == 'viewed product']
df_regular_phones = df_viewed_product[(df_viewed_product['condition'] == 'Bom') | (df_viewed_product['condition'] == 'Bom - Sem Touch ID')]
df_regular_phones = df_regular_phones.groupby('person').agg({'condition': 'count'})
df_regular_phones = df_regular_phones.rename(columns = {'condition': 'amount_regular_phones'})
df_all_phones = df_viewed_product.groupby('person').agg({'condition': 'count'})
df_all_phones = df_all_phones.rename(columns = {'condition': 'amount_total_phones'})
udf_tmp11 = df_regular_phones.merge(df_all_phones, how='outer', on='person')
udf_tmp11['percentage_regular_celphones_activity'] = udf_tmp11['amount_regular_phones'] / udf_tmp11['amount_total_phones']
del udf_tmp11['amount_regular_phones']
del udf_tmp11['amount_total_phones']
udf_tmp11.head()
udf_tmp12 = df[df['event'] == 'viewed product']
udf_tmp12 = udf_tmp12.groupby('person').agg({'precio_reales': 'var'})
udf_tmp12['precio_reales'] = np.log(udf_tmp12['precio_reales'])
udf_tmp12['precio_reales'] = udf_tmp12['precio_reales'].replace([np.inf, -np.inf], 0)
udf_tmp12 = udf_tmp12.rename(columns={'precio_reales': 'var_viewed'})
udf_tmp12.head()
conversions = df[df['event'] == 'conversion']
monto_total = np.sum(conversions['precio_reales'])
cantidad = conversions['precio_reales'].count()
media = monto_total/cantidad
udf_tmp13 = conversions.groupby('person').agg({'precio_reales': 'max'})
udf_tmp13['conversion_gt_media'] = (udf_tmp13['precio_reales'] > media).astype('int')
del udf_tmp13['precio_reales']
udf_tmp13.head()
First, a DataFrame stores how many times each user viewed each model.
udf_tmp14 = df[df['event'] == 'viewed product']
udf_tmp14 = udf_tmp14.groupby('person')['model'].value_counts()
udf_tmp14 = udf_tmp14.to_frame()
udf_tmp14 = udf_tmp14.rename(columns={'model':'cant_viewed'})
udf_tmp14 = udf_tmp14.unstack()
udf_tmp14 = udf_tmp14.fillna(0)
udf_tmp14 = udf_tmp14.xs('cant_viewed', axis=1, drop_level=True)
udf_tmp14_prima = udf_tmp14.copy()
udf_tmp14_prima['total_viewed_products'] = udf_tmp14.sum(axis=1)
udf_tmp14_prima['total_max_viewed_product'] = udf_tmp14.max(axis=1)
udf_tmp14_prima['percentage_max_viewed_product'] = udf_tmp14_prima['total_max_viewed_product'] / udf_tmp14_prima['total_viewed_products']
udf_tmp14_prima = udf_tmp14_prima[[
'total_viewed_products',
'total_max_viewed_product',
'percentage_max_viewed_product'
]]
del udf_tmp14_prima['total_viewed_products']
display(udf_tmp14.head())
udf_tmp14_prima.head()
udf_tmp14_bis = df[df['event'] == 'conversion']
# Keep, for each user, the model of their most recent conversion
udf_tmp14_bis = (udf_tmp14_bis.sort_values('timestamp')
                              .groupby('person')[['timestamp', 'sku', 'model']]
                              .last())
del udf_tmp14_bis['timestamp']
del udf_tmp14_bis['sku']
udf_tmp14_bis.head()
udf_tmp14_merge = udf_tmp14_bis.merge(udf_tmp14, how='inner', on='person')
models = list(udf_tmp14_merge['model'])
columns = list(udf_tmp14_merge.columns)
cant_viewed_model_last_conversion = []
for i in range(udf_tmp14_merge.shape[0]):
    actual_model = models[i]
    position_model = columns.index(actual_model)
    cant = udf_tmp14_merge.iloc[i, position_model]
    cant_viewed_model_last_conversion.append(cant)
udf_tmp14_merge['cant_viewed_model_last_conversion'] = cant_viewed_model_last_conversion
udf_tmp14 = udf_tmp14_merge[['cant_viewed_model_last_conversion']].astype('int')
udf_tmp14.head()
First, a DataFrame stores how many times each user viewed each brand.
udf_tmp15 = df[df['event'] == 'viewed product']
udf_tmp15 = udf_tmp15.groupby('person')['brand'].value_counts()
udf_tmp15 = udf_tmp15.to_frame()
udf_tmp15 = udf_tmp15.rename(columns={'brand':'cant_viewed'})
udf_tmp15 = udf_tmp15.unstack()
udf_tmp15 = udf_tmp15.fillna(0)
udf_tmp15 = udf_tmp15.xs('cant_viewed', axis=1, drop_level=True)
udf_tmp15.head()
udf_tmp15_bis = df[df['event'] == 'conversion']
# Keep, for each user, the brand of their most recent conversion
udf_tmp15_bis = (udf_tmp15_bis.sort_values('timestamp')
                              .groupby('person')[['timestamp', 'sku', 'brand']]
                              .last())
del udf_tmp15_bis['timestamp']
del udf_tmp15_bis['sku']
udf_tmp15_bis.head()
udf_tmp15_merge = udf_tmp15_bis.merge(udf_tmp15, how='inner', on='person')
brands = list(udf_tmp15_merge['brand'])
columns = list(udf_tmp15_merge.columns)
cant_viewed_brand_last_conversion = []
for i in range(udf_tmp15_merge.shape[0]):
    actual_brand = brands[i]
    position_brand = columns.index(actual_brand)
    cant = udf_tmp15_merge.iloc[i, position_brand]
    cant_viewed_brand_last_conversion.append(cant)
udf_tmp15_merge['cant_viewed_brand_last_conversion'] = cant_viewed_brand_last_conversion
udf_tmp15 = udf_tmp15_merge[['cant_viewed_brand_last_conversion']].astype('int')
udf_tmp15.head()
tmp = df[df['session_timestamp_first'] > pd.to_datetime('2018-05-23')]
gb = tmp.groupby('person')
udf_tmp16 = gb.agg({
'session_first':'sum',
})
udf_tmp16.columns = ['total_sessions_last_week']
udf_tmp16 = udf_tmp16.merge(udf_tmp1[['total_sessions']], how='right', on='person')
udf_tmp16['ratio_sessions_last_week_over_total'] = udf_tmp16['total_sessions_last_week'] / udf_tmp16['total_sessions']
udf_tmp16['has_event_last_week'] = udf_tmp16['total_sessions_last_week'].notnull().astype('int')
del udf_tmp16['total_sessions']
display(udf_tmp16.head())
udf_tmp17 = df[(df['staticpage'] == 'FaqEcommerce') | (df['staticpage'] == 'CustomerService')]
udf_tmp17 = udf_tmp17.groupby('person')['staticpage'].value_counts()
udf_tmp17 = udf_tmp17.to_frame()
udf_tmp17 = udf_tmp17.rename(columns={'staticpage':'cant_staticpage'})
udf_tmp17 = udf_tmp17.unstack()
udf_tmp17 = udf_tmp17.fillna(0)
udf_tmp17 = udf_tmp17.xs('cant_staticpage', axis=1, drop_level=True)
udf_tmp17.rename(columns={'CustomerService':'cant_visitas_customer_service','FaqEcommerce':'cant_visitas_faq_ecommerce'},inplace=True)
udf_tmp17.head()
udf_tmp50 = df[(df['timestamp'] >= pd.to_datetime('2018-05-23')) & (df['event'] == 'viewed product')]
udf_tmp50 = udf_tmp50.groupby('person')['model'].value_counts()
udf_tmp50 = udf_tmp50.to_frame()
udf_tmp50 = udf_tmp50.rename(columns={'model':'cant_viewed'})
udf_tmp50 = udf_tmp50.unstack()
udf_tmp50 = udf_tmp50.fillna(0)
udf_tmp50 = udf_tmp50.xs('cant_viewed', axis=1, drop_level=True)
udf_tmp50_prima = udf_tmp50.copy()
udf_tmp50_prima['total_last_week_viewed_products'] = udf_tmp50.sum(axis=1)
udf_tmp50_prima['total_last_week_max_viewed_model'] = udf_tmp50.max(axis=1)
udf_tmp50_prima['percentage_last_week_max_viewed_model'] = \
udf_tmp50_prima['total_last_week_max_viewed_model'] / udf_tmp50_prima['total_last_week_viewed_products']
udf_tmp50_prima = udf_tmp50_prima[[
'total_last_week_max_viewed_model',
'percentage_last_week_max_viewed_model'
]]
display(udf_tmp50.head())
display(udf_tmp50_prima.head())
udf_tmp51 = df[(df['timestamp'] >= pd.to_datetime('2018-05-23')) & (df['event'] == 'viewed product')]
udf_tmp51 = udf_tmp51.groupby('person')['brand'].value_counts()
udf_tmp51 = udf_tmp51.to_frame()
udf_tmp51 = udf_tmp51.rename(columns={'brand':'cant_viewed'})
udf_tmp51 = udf_tmp51.unstack()
udf_tmp51 = udf_tmp51.fillna(0)
udf_tmp51 = udf_tmp51.xs('cant_viewed', axis=1, drop_level=True)
udf_tmp51_prima = udf_tmp51.copy()
udf_tmp51_prima['total_last_week_viewed_products'] = udf_tmp51.sum(axis=1)
udf_tmp51_prima['total_last_week_max_viewed_brand'] = udf_tmp51.max(axis=1)
udf_tmp51_prima['percentage_last_week_max_viewed_brand'] = \
udf_tmp51_prima['total_last_week_max_viewed_brand'] / udf_tmp51_prima['total_last_week_viewed_products']
udf_tmp51_prima = udf_tmp51_prima[[
'total_last_week_max_viewed_brand',
'percentage_last_week_max_viewed_brand'
]]
display(udf_tmp51.head())
display(udf_tmp51_prima.head())
# Hand-rolled one-hot encoding
udf_tmp52 = df[df['event'] == 'viewed product'].groupby('person')['brand'].value_counts().to_frame().unstack().fillna(0)
udf_tmp52 = udf_tmp52['brand']
udf_tmp52.columns = 'most_viewed_brand_is_' + udf_tmp52.columns
udf_tmp52['most_viewed'] = udf_tmp52.max(axis=1)
for c in udf_tmp52.columns:
    udf_tmp52[c] = udf_tmp52[c] - udf_tmp52['most_viewed']
del udf_tmp52['most_viewed']
udf_tmp52 = (udf_tmp52 >= 0).astype('int')
udf_tmp52
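An equivalent, more compact sketch (not used in this notebook; on ties it keeps a single brand via idxmax, whereas the approach above flags every tied brand):
# Hypothetical alternative: take each user's most-viewed brand and dummy-encode it
most_viewed_brand = (df[df['event'] == 'viewed product']
                     .dropna(subset=['brand'])
                     .groupby('person')['brand']
                     .agg(lambda s: s.value_counts().idxmax()))
pd.get_dummies(most_viewed_brand, prefix='most_viewed_brand_is').head()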
# Hand-rolled one-hot encoding
udf_tmp53 = df[(df['timestamp'] > pd.to_datetime('2018-05-23')) & (df['event'] == 'viewed product')].groupby('person')['brand'].value_counts().to_frame().unstack().fillna(0)
udf_tmp53 = udf_tmp53['brand']
udf_tmp53.columns = 'most_viewed_brand_lw_is_' + udf_tmp53.columns
udf_tmp53['most_viewed'] = udf_tmp53.max(axis=1)
for c in udf_tmp53.columns:
    udf_tmp53[c] = udf_tmp53[c] - udf_tmp53['most_viewed']
del udf_tmp53['most_viewed']
udf_tmp53 = (udf_tmp53 >= 0).astype('int')
udf_tmp53
tmp = df[df['session_timestamp_first'] > pd.to_datetime('2018-05-16')]
gb = tmp.groupby('person')
udf_tmp54 = gb.agg({
'session_first':'sum',
})
udf_tmp54.columns = ['total_sessions_l2w']
udf_tmp54 = udf_tmp54.merge(udf_tmp1[['total_sessions']], how='right', on='person')
udf_tmp54['ratio_sessions_l2w_over_total'] = udf_tmp54['total_sessions_l2w'] / udf_tmp54['total_sessions']
del udf_tmp54['total_sessions']
udf_tmp54 = udf_tmp54.fillna(0)
display(udf_tmp54.head())
gb = df[df['timestamp'] > pd.to_datetime('2018-05-16')].groupby('person')
udf_tmp55 = gb.agg({'is_brand_listing':'sum',
'is_viewed_product':'sum',
'is_checkout':'sum',
'is_conversion':'sum',
'event':'count',
'session_first':'sum',
'session_total_conversions':'sum',
'session_checkout_first':'sum',
'session_conversion_first':'sum',
'session_ad':'sum',
'session_ad_first':'sum'})
# Odd column that shows up on its own in the aggregation; drop it
del udf_tmp55['session_total_conversions']
udf_tmp55.columns = ['total_brand_listings_l2w',
'total_viewed_products_l2w',
'total_checkouts_l2w',
'total_conversions_l2w',
'total_events_l2w',
'total_sessions_l2w',
'total_session_checkouts_l2w',
'total_session_conversions_l2w',
'total_events_ad_session_l2w',
'total_ad_sessions_l2w']
udf_tmp55['has_checkout_l2w'] = (udf_tmp55['total_checkouts_l2w'] > 0).astype('int')
udf_tmp55['has_conversion_l2w'] = (udf_tmp55['total_conversions_l2w'] > 0).astype('int')
udf_tmp55 = udf_tmp55.merge(udf_tmp1[['total_events']], how='left', on='person')
udf_tmp55['percentage_l2w_week_activity'] = udf_tmp55['total_events_l2w'] / udf_tmp55['total_events']
udf_tmp55['percentage_l2w_brand_listings'] = udf_tmp55['total_brand_listings_l2w'] / udf_tmp55['total_events']
udf_tmp55['percentage_l2w_viewed_products'] = udf_tmp55['total_viewed_products_l2w'] / udf_tmp55['total_events']
udf_tmp55['percentage_l2w_checkouts'] = udf_tmp55['total_checkouts_l2w'] / udf_tmp55['total_events']
udf_tmp55['percentage_l2w_conversions'] = udf_tmp55['total_conversions_l2w'] / udf_tmp55['total_events']
del udf_tmp55['total_events']
udf_tmp55.head()
Everything is saved to user-features.csv.
udf = udf_tmp1
udf = udf.merge(udf_tmp2, how='outer', on='person')
udf = udf.merge(udf_tmp3, how='outer', on='person') # No clear usefulness found, but it improves results a bit
udf = udf.merge(udf_tmp4, how='outer', on='person')
udf = udf.merge(udf_tmp5, how='outer', on='person')
udf = udf.merge(udf_tmp6, how='outer', on='person')
udf = udf.merge(udf_tmp7, how='outer', on='person')
# udf = udf.merge(udf_tmp8, how='outer', on='person') # Removed; it is now computed with udf_tmp4
udf = udf.merge(udf_tmp9, how='outer', on='person')
udf = udf.merge(udf_tmp10, how='outer', on='person')
udf = udf.merge(udf_tmp11, how='outer', on='person')
udf = udf.merge(udf_tmp12, how='outer', on='person')
udf = udf.merge(udf_tmp13, how='outer', on='person')
#udf = udf.merge(udf_tmp14, how='outer', on='person')
udf = udf.merge(udf_tmp14_prima, how='outer', on='person')
udf = udf.merge(udf_tmp15, how='outer', on='person')
udf = udf.merge(udf_tmp16, how='outer', on='person')
udf = udf.merge(udf_tmp17, how='outer', on='person')
#udf = udf.merge(udf_tmp50, how='outer', on='person')
udf = udf.merge(udf_tmp50_prima, how='outer', on='person')
#udf = udf.merge(udf_tmp51, how='outer', on='person')
udf = udf.merge(udf_tmp51_prima, how='outer', on='person')
udf = udf.merge(udf_tmp52, how='outer', on='person')
udf = udf.merge(udf_tmp53, how='outer', on='person')
udf = udf.merge(udf_tmp54, how='outer', on='person')
udf = udf.merge(udf_tmp55, how='outer', on='person')
udf = udf.fillna(0)
display(udf.head())
display(udf.columns.tolist())
# K-Means
udf['kmeans_3'] = cluster.KMeans(n_clusters=3).fit_predict(udf)
udf['kmeans_5'] = cluster.KMeans(n_clusters=5).fit_predict(udf)
udf['kmeans_6'] = cluster.KMeans(n_clusters=6).fit_predict(udf)
udf['kmeans_15'] = cluster.KMeans(n_clusters=15).fit_predict(udf)
udf['kmeans_25'] = cluster.KMeans(n_clusters=25).fit_predict(udf)
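# Optional sketch (not part of the original pipeline): K-Means is scale-sensitive, so raw counts
# and the nanosecond-sized timestamp columns dominate the Euclidean distances. Standardizing
# first (assumed here via sklearn's StandardScaler) gives every feature comparable weight.
# The result is kept in a local variable so the saved features stay identical to the originals.
from sklearn.preprocessing import StandardScaler
base_features = udf.drop(columns=['kmeans_3', 'kmeans_5', 'kmeans_6', 'kmeans_15', 'kmeans_25'])
kmeans_5_scaled = cluster.KMeans(n_clusters=5).fit_predict(StandardScaler().fit_transform(base_features))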
# HDBSCAN
# It leaves negative labels (noise is marked as -1), so several algorithms cannot make use of the column
# udf['hdbscan_22'] = hdbscan.HDBSCAN(min_cluster_size=22).fit_predict(udf)
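# Hypothetical workaround (not run on the full data here): shifting the labels by one maps
# noise (-1) to 0 and clusters to 1..k, so every value stays non-negative. Shown on a toy array:
toy_labels = np.array([-1, 0, 0, 2, -1, 1])
toy_labels_nonneg = toy_labels + 1  # noise -> 0, clusters -> 1..k
toy_labels_nonneg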
udf.reset_index().to_csv('data/user-features.csv', index=False)
udf.head()
# Confirm that no users were lost along the way
assert len(udf) == len(df['person'].unique())
display(len(udf))
users = pd.read_csv("data/user-features.csv", low_memory=False)
with pd.option_context('display.max_columns', 0):
    display(users.head())
display(users.shape)
len(users.columns)