Date: 19 AUG
Preparation and Brief information
import numpy as np
import pandas as pd
from sklearn import preprocessing, metrics
import lightgbm as lgb
# Set up code checking
from learntools.core import binder
binder.bind(globals())
from learntools.feature_engineering.ex3 import *
# Load the sampled click log, parsing the click timestamps as datetimes
click_data = pd.read_csv(
    '../input/feature-engineering-data/train_sample.csv',
    parse_dates=['click_time'],
)
click_times = click_data['click_time']

# Split every timestamp into day/hour/minute/second columns stored as uint8
time_parts = {
    part: getattr(click_times.dt, part).astype('uint8')
    for part in ('day', 'hour', 'minute', 'second')
}
clicks = click_data.assign(**time_parts)

# Replace each categorical column with integer labels; every column gets
# its own freshly fitted encoder
cat_features = ['ip', 'app', 'device', 'os', 'channel']
for feature in cat_features:
    label_encoder = preprocessing.LabelEncoder()
    encoded_column = label_encoder.fit_transform(clicks[feature])
    clicks[feature] = encoded_column
def get_data_splits(dataframe, valid_fraction=0.1):
    """Split the data chronologically into train, validation and test sets.

    The rows are sorted by ``click_time``.  The final
    ``int(len(dataframe) * valid_fraction)`` rows form the test set, the
    same number of rows immediately before them form the validation set,
    and everything earlier is the training set.

    Args:
        dataframe: DataFrame with a sortable ``click_time`` column.
        valid_fraction: Fraction of the rows used for validation and,
            separately, for test.

    Returns:
        Tuple ``(train, valid, test)`` of DataFrames in time order.
    """
    dataframe = dataframe.sort_values('click_time')
    total_rows = len(dataframe)
    valid_rows = int(total_rows * valid_fraction)

    # Work with explicit, non-negative positions.  Negative-offset slicing
    # misbehaves when valid_rows is 0: ``[:-0]`` is empty and ``[-0:]`` is
    # the whole frame, which would put every row in the test set.
    test_start = max(total_rows - valid_rows, 0)
    valid_start = max(total_rows - 2 * valid_rows, 0)

    train = dataframe.iloc[:valid_start]
    # valid size == test size, last two sections of the data
    valid = dataframe.iloc[valid_start:test_start]
    test = dataframe.iloc[test_start:]
    return train, valid, test
def train_model(train, valid, test=None, feature_cols=None):
    """Fit a LightGBM binary classifier and report its validation AUC.

    Args:
        train: Training DataFrame; must contain an ``is_attributed`` column.
        valid: Validation DataFrame, used for early stopping and scoring.
        test: Optional hold-out DataFrame; it is scored only when given.
        feature_cols: Columns used as model inputs.  When ``None``, every
            column of ``train`` except ``click_time``, ``attributed_time``
            and ``is_attributed`` is used.

    Returns:
        ``(booster, valid_score)`` when ``test`` is ``None``, otherwise
        ``(booster, valid_score, test_score)``.  Scores are ROC AUC values.
    """
    target = 'is_attributed'
    if feature_cols is None:
        non_features = ['click_time', 'attributed_time', target]
        feature_cols = train.columns.drop(non_features)

    params = {'num_leaves': 64, 'objective': 'binary',
              'metric': 'auc', 'seed': 7}
    max_rounds = 1000

    train_set = lgb.Dataset(train[feature_cols], label=train[target])
    valid_set = lgb.Dataset(valid[feature_cols], label=valid[target])

    print("Training model. Hold on a minute to see the validation score")
    # NOTE(review): the `early_stopping_rounds` / `verbose_eval` keywords are
    # presumably only accepted by older LightGBM releases -- confirm against
    # the installed version.
    booster = lgb.train(params, train_set, max_rounds,
                        valid_sets=[valid_set],
                        early_stopping_rounds=20, verbose_eval=False)

    valid_pred = booster.predict(valid[feature_cols])
    valid_score = metrics.roc_auc_score(valid[target], valid_pred)
    print(f"Validation AUC score: {valid_score}")

    # Without a test set only the validation score is reported
    if test is None:
        return booster, valid_score

    test_pred = booster.predict(test[feature_cols])
    test_score = metrics.roc_auc_score(test[target], test_pred)
    return booster, valid_score, test_score
# Baseline: train on the timestamp parts and label-encoded categoricals only
print("Baseline model score")
baseline_splits = get_data_splits(clicks)
train, valid, test = baseline_splits
# The returned booster/score tuple is not needed; the score is printed
_ = train_model(train=train, valid=valid)
P01 Add interaction features
Hint: The easiest way to loop through the pairs is with itertools.combinations. Once you have that working, for each pair of columns, convert them to strings; then you can join them with the +
operator. It’s usually good to join with a symbol like _ in between to ensure unique values. Now you should have a column of new categorical values; you can label encode those and add them to the DataFrame.
# Imported explicitly here: the visible import section of this file never
# imports itertools, so the loop below would otherwise depend on another
# module's wildcard import happening to provide it.
import itertools

cat_features = ['ip', 'app', 'device', 'os', 'channel']

# One label-encoded column per unordered pair of categorical features,
# sharing the row index of `clicks`
interactions = pd.DataFrame(index=clicks.index)
for col1, col2 in itertools.combinations(cat_features, 2):
    new_col_name = '_'.join([col1, col2])

    # Convert to strings and combine; the "_" separator keeps value pairs
    # such as (1, 23) and (12, 3) distinct
    new_values = clicks[col1].map(str) + "_" + clicks[col2].map(str)

    # A fresh encoder per pair turns the combined strings into integer labels
    encoder = preprocessing.LabelEncoder()
    interactions[new_col_name] = encoder.fit_transform(new_values)
P02 Generating numerical features
Hint: You can get a rolling time window using .rolling(), but first you need to convert the index to a time series. The current row is included in the window, but we want to count all the events before the current row, so be sure to adjust the count.
Number of events in the past six hours
def count_past_events(series, time_window='6H'):
    """Count, for each event, the earlier events inside a trailing window.

    Args:
        series: Series of event timestamps.
            NOTE(review): time-based rolling presumably needs these sorted
            in ascending order -- confirm against callers.
        time_window: Window length handed to ``Series.rolling``.

    Returns:
        Series indexed by the timestamps of ``series``; each value is the
        rolling count for that window minus one.
    """
    # Flip the series so the timestamps form the index that rolling() uses
    events_by_time = pd.Series(series.index, index=series)
    window_counts = events_by_time.rolling(time_window).count()
    # The window includes the current event, so take it back off
    return window_counts - 1
# Load the saved past-six-hour event counts from the Parquet file
# (presumably precomputed with count_past_events -- confirm)
past_events_path = '../input/feature-engineering-data/past_6hr_events.pqt'
past_events = pd.read_parquet(past_events_path)
clicks['ip_past_6hr_counts'] = past_events

# Re-split and retrain so the printed validation score includes the new column
new_splits = get_data_splits(clicks)
train, valid, test = new_splits
_ = train_model(train=train, valid=valid)
P03 Features from future information
Should you use future events or not?
Solution:
In general, you shouldn’t use information from the future. When you’re using models like this in a real-world scenario you won’t have data from the future. Your model’s score will likely be higher when training and testing on historical data, but it will overestimate the performance on real data.
I should note that using future data will improve the score on Kaggle competition test data, but avoid it when building machine learning products.
In short: a score measured with features built from future data is inaccurate.
P04 Time since last event
Hint: Try using the .diff() method on a time series.
def time_diff(series):
    """Return the seconds elapsed since the previous timestamp in `series`.

    The first element has no predecessor, so its value is missing (NaN).
    """
    gaps = series.diff()
    return gaps.dt.total_seconds()
# Seconds since the previous click, computed separately within each ip group
click_times_by_ip = clicks.groupby('ip')['click_time']
timedeltas = click_times_by_ip.transform(time_diff)
P05 Number of previous app downloads
Hint:
Here you want a window that always starts at the first row but expands as you get further in the data.
You can use the .expanding() method for this.
Also, the current row is included in the window, so you’ll need to subtract that off as well.
def previous_attributions(series):
    """Return the running total of `series` over the rows before each row.

    The expanding window needs at least two observations, so the first
    element of the result is missing (NaN).
    """
    running_total = series.expanding(min_periods=2).sum()
    # Remove the current row's own value so only earlier rows are counted
    return running_total - series
P06 Tree-based vs Neural Network Models
Solution:
The features themselves will work for either model. However, numerical inputs to neural networks need to be standardized first. That is, the features need to be scaled such that they have 0 mean and a standard deviation of 1. This can be done using sklearn.preprocessing.StandardScaler.