Grupo 30: Datatouille
En este notebook, que es un re-trabajo sobre el Notebook Anexo
del TP1 se buscan features del set de datos y se crean nuevos atributos a partir de ellos. Estos se almacenan en nuevos archivos .csv
para luego ser concatenados al set de datos principal.
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
df = pd.read_csv('./data/events_up_to_01062018.csv', low_memory=False)
df['timestamp'] = pd.to_datetime(df['timestamp'])
print("len: {}".format(len(df)))
df.head()
Marcas de celulares
df['model'].unique()
checked = ['iphone', 'samsung', 'motorola', 'lenovo', 'sony', 'lg', 'ipad', 'asus', 'quantum', 'blackberry']
model_parsed = df['model'].dropna().map(lambda x: x.lower())
model_parsed = model_parsed.map(lambda x: x.split())
def find_brand(model):
for str in model:
if str in checked:
return str
return "other"
df['brand'] = model_parsed.map(find_brand)
df['brand'] = df['brand'].astype('category')
df[['model', 'brand']].dropna().drop_duplicates().to_csv('data/brands.csv', index=False)
brands_csv = pd.read_csv("data/brands.csv", low_memory=False)
display(brands_csv.head())
display(brands_csv.shape)
Sistemas operativos de celulares
df['operating_system_version'].unique()
checked = ['windows', 'android', 'linux', 'mac', 'ios', 'ubuntu', 'chrome os', 'tizen', 'blackberry', 'other']
os_version_parsed = df['operating_system_version'].dropna().map(lambda x: x.lower())
def find_os(os_version):
for os in checked:
if os in os_version:
return os
return "another"
df['operating_system'] = os_version_parsed.map(find_os)
df['operating_system'] = df['operating_system'].astype('category')
# Chequeamos cuantos os quedaron con el nombre 'another' (idealmente, ninguno)
df[['operating_system_version', 'operating_system']].dropna().head(10)
df[df['operating_system'] == 'another'][['operating_system_version', 'operating_system']].head(10)
df['operating_system'].value_counts()
df[['operating_system_version', 'operating_system']].dropna().drop_duplicates().to_csv('data/os.csv', index=False)
os_csv = pd.read_csv("data/os.csv", low_memory=False)
display(os_csv.head())
display(os_csv.shape)
Exploradores de internet desde los cuales se accedió al sitio
df['browser_version'].unique().tolist()
checked = ['mobile safari', 'chrome mobile', 'ie mobile', 'firefox mobile', 'edge mobile', 'opera mobile',
'mobile', 'blackberry os', 'blackberry webkit', 'chrome', 'android', 'opera', 'ie', 'firefox',
'facebook', 'samsung', 'chromium', 'edge', 'yandex', 'uc',
'other', 'safari', 'puffin', 'maxthon', 'vivaldi', 'pinterest']
browser_version_parsed = df['browser_version'].dropna().map(lambda x: x.lower())
def find_browser(browser_version):
for browser in checked:
if browser in browser_version:
return browser
return "other"
df['browser'] = browser_version_parsed.map(find_browser)
df['browser'] = df['browser'].astype('category')
df[['browser_version', 'browser']].dropna()
display(df['browser'].value_counts().head())
df[['browser_version', 'browser']].head()
df[df['browser'] == 'other']['browser_version']
df[['browser_version', 'browser']].dropna().drop_duplicates().to_csv('data/browsers.csv', index=False)
browsers_csv = pd.read_csv("data/browsers.csv", low_memory=False)
display(browsers_csv.head())
display(browsers_csv.shape)
Eventos por usuario
Definimos una sesión como una serie de eventos por usuario, los cuales están todos con menos de 30 minutos de inactividad entre el actual y el anterior.
funnel = df
funnel = funnel.sort_values(['person', 'timestamp'])
funnel.head()
funnel['time_diff'] = funnel.groupby('person')['timestamp'].diff()
funnel['time_diff'] = funnel['time_diff'].fillna(0)
funnel['time_diff_min'] = funnel['time_diff'] / np.timedelta64(1, 'm')
funnel[['person', 'event', 'timestamp', 'time_diff', 'time_diff_min']].head()
THRESHOLD = 30 # minutos
funnel['new_session'] = funnel['time_diff_min'] > THRESHOLD
funnel['session_id'] = funnel.groupby('person')['new_session'].cumsum()
funnel['session_id'] = funnel['session_id'].astype('int')
funnel[['person', 'new_session', 'session_id']].head(8)
gb = funnel.groupby(['person', 'session_id'])
funnel['session_cumno'] = gb.cumcount()
funnel['session_total_events'] = pd.Series(np.repeat(gb.size(), gb.size().values)).values
funnel['session_first'] = funnel['session_cumno'] == 0
funnel['session_last'] = funnel['session_cumno'] == (-1+funnel['session_total_events'])
cols = ['person', 'timestamp', 'time_diff_min', \
'session_id', 'event', 'session_total_events', \
'session_cumno', 'session_first', 'session_last']
funnel[cols].head()
funnel['is_conversion'] = funnel['event'] == 'conversion'
gb = funnel.groupby(['person', 'session_id'])['is_conversion']
funnel['session_total_conversions'] = pd.Series(np.repeat(gb.sum(), gb.size().values)).values
funnel['session_has_conversion'] = funnel['session_total_conversions'] > 0
funnel.head()
funnel['is_checkout'] = funnel['event'] == 'checkout'
gb = funnel.groupby(['person', 'session_id'])['is_checkout']
funnel['session_total_checkouts'] = pd.Series(np.repeat(gb.sum(), gb.size().values)).values
funnel['session_has_checkout'] = funnel['session_total_checkouts'] > 0
funnel.head()
funnel['ad_origin'] = (funnel['event'] == 'ad campaign hit') & ((funnel['session_first']) | (funnel['session_cumno'] == 1))
gb = funnel.groupby(['person', 'session_id'])['ad_origin']
funnel['session_ad'] = pd.Series(np.repeat(gb.sum().values, gb.size().values)).values
funnel['session_ad'] = funnel['session_ad'] > 0
funnel.head()
gb = funnel.groupby(['person', 'session_id'])['timestamp']
funnel['session_timestamp_first'] = pd.Series(np.repeat(gb.min().values, gb.size().values)).values
funnel['session_timestamp_last'] = pd.Series(np.repeat(gb.max().values, gb.size().values)).values
funnel.head()
funnel[cols].head()
cols_csv = ['time_diff_min', \
'session_id', 'session_total_events', \
'session_cumno', 'session_first', 'session_last', \
'session_total_conversions', 'session_has_conversion', \
'session_total_checkouts', 'session_has_checkout', 'session_ad',
'session_timestamp_first', 'session_timestamp_last']
funnel = funnel.sort_index()
funnel[cols].head()
cols_check_ord = ['person', 'timestamp', 'time_diff_min', \
'session_id', 'session_total_events', \
'session_cumno', 'session_first', 'session_last', \
'session_total_conversions', 'session_has_conversion', \
'session_total_checkouts', 'session_has_checkout', 'session_ad',
'session_timestamp_first', 'session_timestamp_last']
funnel.sort_values(['person', 'timestamp'])[cols_check_ord].head()
funnel[cols_csv].to_csv('data/sessions.csv', index=False)
sessions_csv = pd.read_csv("data/sessions.csv", low_memory=False)
display(sessions_csv.head())
display(sessions_csv.shape)
mapping_conditions = {'Bom':15,
'Muito Bom': 6,
'Excelente': 5,
'Bom - Sem Touch ID':17,
'Novo':18,
np.nan:0
}
df_tmp = df.loc[df['model'].notnull()][['sku','model','color','storage','condition']]
df_tmp['to_search'] = df_tmp['model'] + ' ' + df_tmp['storage'].astype(str) + ' ' + df_tmp['color']
df_tmp['condition_n'] = df_tmp['condition'].transform(lambda x: mapping_conditions[x])
df_tmp['to_search+condition_n'] = df_tmp['to_search'] + ' ' + df_tmp['condition_n'].astype(str)
df_tmp = df_tmp.drop_duplicates()
df_tmp = df_tmp.dropna()
display(df_tmp.head())
dic_modelos_condiciones = {}
modelos = df_tmp['to_search'].tolist()
condiciones = df_tmp['condition_n'].tolist()
for m,c in zip(modelos,condiciones):
if m in dic_modelos_condiciones:
dic_modelos_condiciones[m].append(c)
else: dic_modelos_condiciones[m] = [c]
def request_soup(url):
try:
req = requests.get(url)
html = req.content
soup = BeautifulSoup(html, "lxml")
except requests.exceptions.ConnectionError:
raise ConnectionError(f"Connection with {url} refused.")
return soup
BASE = 'https://www.trocafone.com/comprar/list?'
MODELO = 'q'
CONDICION = 'condition'
precios = {}
i=1
for modelo, condiciones in dic_modelos_condiciones.items():
print(f'Iteración {i} de {len(dic_modelos_condiciones)}:')
modelo_s = modelo.replace(' ','+')
for condicion in condiciones:
url = f'{BASE}{MODELO}={modelo_s}&{CONDICION}={condicion}'
print(f'\t * {url}: ',end='')
soup = request_soup(url)
precio = soup.find(class_='price-value')
if precio and precio.string:
precio = precio.string
else: precio = 0
precio.replace('.','').replace(',','.') # (pasar de 1.2900,00 a 12900.00, para que Python se lleve bien con el float)
print(f'${precio}')
precios[f'{modelo} {condicion}'] = precio
i+=1
orden = df_tmp['to_search+condition_n']
precios_ordenado = []
for modelo in orden:
precios_ordenado.append(precios[modelo])
df_tmp['precio_reales'] = precios_ordenado
df_tmp[['sku','precio_reales']].to_csv('data/prices.csv', index=False)
prices_csv = pd.read_csv("data/prices.csv", low_memory=False)
display(prices_csv.head())
display(prices_csv.shape)