https://tianchi.aliyun.com/forum/postDetail?spm=5176.12586969.1002.6.6b17329eVBTIwN&postId=333161

时间特征
# Time features: year, month, day, hour, weekday, quarter, weekend flag, etc.
def get_time_feature(df, col, keep=False):
    """Add calendar feature columns derived from a datetime column.

    New columns (each prefixed with ``<col>_``): year, month, day, hour,
    weekofyear (ISO week number), dayofweek, is_wknd, quarter,
    is_month_start, is_month_end.

    :param df: input DataFrame (not modified; a copy is returned)
    :param col: name of the datetime-like column
    :param keep: whether to keep the original time column in the result
    :return: a new DataFrame with the added feature columns
    """
    df_copy = df.copy()
    prefix = col + "_"
    df_copy[col] = pd.to_datetime(df_copy[col])
    df_copy[prefix + 'year'] = df_copy[col].dt.year
    df_copy[prefix + 'month'] = df_copy[col].dt.month
    df_copy[prefix + 'day'] = df_copy[col].dt.day
    df_copy[prefix + 'hour'] = df_copy[col].dt.hour
    # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week is the supported equivalent (ISO 8601 week number).
    df_copy[prefix + 'weekofyear'] = df_copy[col].dt.isocalendar().week.astype(int)
    df_copy[prefix + 'dayofweek'] = df_copy[col].dt.dayofweek
    # dayofweek // 4 flags Fri/Sat/Sun as 1 (original author's choice);
    # use // 5 instead if the weekend should be Sat/Sun only.
    df_copy[prefix + 'is_wknd'] = df_copy[col].dt.dayofweek // 4
    df_copy[prefix + 'quarter'] = df_copy[col].dt.quarter
    df_copy[prefix + 'is_month_start'] = df_copy[col].dt.is_month_start.astype(int)
    df_copy[prefix + 'is_month_end'] = df_copy[col].dt.is_month_end.astype(int)
    if keep:
        return df_copy
    return df_copy.drop([col], axis=1)


# Example usage. The original executed this unconditionally, which raises
# NameError because no module-level `df` exists in this post's code:
# df = get_time_feature(df, "time_col")
lag特征
# Lag feature: the value of the same entity at an earlier time step
# (e.g. the same unit's usage yesterday, the day before, or last week).
def make_lag_feature(df, keys=None, val='qty', lag=1):
    """Return *val* shifted by *lag* rows within each group defined by *keys*.

    Generalized from the original loose statements, which hard-coded
    ``keys``/``val``/``lag``, discarded the computed Series, and referenced
    an undefined module-level ``df``.

    :param df: input DataFrame
    :param keys: grouping columns (default ``['unit']``)
    :param val: name of the value column to lag
    :param lag: number of rows to shift within each group
    :return: a Series aligned with ``df`` (NaN where no history exists)
    """
    keys = ['unit'] if keys is None else keys
    return df.groupby(keys)[val].transform(lambda x: x.shift(lag))
滑动窗口统计特征:历史时间窗口内的统计值
# Rolling-window statistics over each group's history.
def make_rolling_features(df, keys=None, val='qty', window=7):
    """Return rolling (mean, std) of *val* within each group defined by *keys*.

    The mean uses a triangular weighting window (``win_type="triang"``,
    which requires scipy); the std is the plain sample standard deviation.
    Both require at least 3 observations (``min_periods=3``).

    Generalized from the original loose statements, which hard-coded
    ``keys``/``val``/``window``, discarded both results, and referenced an
    undefined module-level ``df``.

    :param df: input DataFrame
    :param keys: grouping columns (default ``['unit']``)
    :param val: name of the value column
    :param window: rolling window length in rows
    :return: tuple ``(rolling_mean, rolling_std)`` of Series aligned with df
    """
    keys = ['unit'] if keys is None else keys
    grouped = df.groupby(keys)[val]
    roll_mean = grouped.transform(
        lambda x: x.rolling(window=window, min_periods=3, win_type="triang").mean())
    roll_std = grouped.transform(
        lambda x: x.rolling(window=window, min_periods=3).std())
    return roll_mean, roll_std
指数加权移动平均
# Exponentially weighted moving average of the lagged history per group.
def make_ewm_feature(df, keys=None, val='qty', lag=1, alpha=0.95):
    """Return the EWM mean of *val*, shifted by *lag*, within each group.

    Shifting before applying ``ewm`` keeps the feature free of the current
    row's value (no target leakage when *val* is the label).

    Generalized from the original loose statements, which hard-coded the
    parameters and referenced ``df_temp`` — a name inconsistent with the
    ``df`` used by the sibling snippets and undefined here.

    :param df: input DataFrame
    :param keys: grouping columns (default ``['unit']``)
    :param val: name of the value column
    :param lag: rows to shift before smoothing
    :param alpha: EWM smoothing factor (higher = more weight on recent rows)
    :return: a Series aligned with ``df``
    """
    keys = ['unit'] if keys is None else keys
    return df.groupby(keys)[val].transform(
        lambda x: x.shift(lag).ewm(alpha=alpha).mean())
标签的转化
当label的分布范围差异较大时,可以尝试做变换后再进行训练,训练完模型后,对测试集预测后再逆操作回来。
# Label transform: when the target spans orders of magnitude, train on
# log1p(label) — valid only for non-negative labels — and invert the model's
# predictions with expm1. Training on the transformed label is more stable.
def log1p_transform(values):
    """Return ``log1p(values)`` for use as a training label.

    :param values: array-like of non-negative label values
    :return: the element-wise ``log1p`` of *values*
    :raises ValueError: if any value is negative (log1p would yield
        NaN/-inf and silently corrupt training)
    """
    # Explicit raise replaces the original bare assert, which disappears
    # under ``python -O``.
    if np.min(values) < 0:
        raise ValueError("log1p transform requires non-negative labels")
    return np.log1p(values)


def log1p_inverse(pred):
    """Invert :func:`log1p_transform` on model predictions via ``expm1``."""
    return np.expm1(pred)
验证集的切分
时序预测问题验证集的切分常常采用按时间切分的方式,可以用sklearn中的TimeSeriesSplit。在验证集上获得模型的最佳迭代轮数之后,再用全量数据重新训练。重新训练的时候,迭代轮次可以是之前迭代轮次的k倍。k的参考值=全量数据样本量/除去验证集的数据样本量。
# Reference code: time-ordered CV with TimeSeriesSplit, then retrain on all
# data with ~1.2x the best iteration count found on the validation fold.
# NOTE(review): `train`, `test`, `used_features`, `target`, `target_log`,
# `lgb`, `params`, `N_round`, `Verbose`, `Early_Stopping_Rounds`, `MAEs`,
# `mean_absolute_error`, and `np` are assumed to be defined elsewhere —
# this snippet is not runnable standalone.
from sklearn.model_selection import TimeSeriesSplit
import gc

ts_folds = TimeSeriesSplit(n_splits = 5)
# Only the last (most recent) split is used for validation; the earlier
# folds are skipped.
for fold_n, (train_index, valid_index) in enumerate(ts_folds.split(train[used_features])):
    if fold_n in [0, 1, 2, 3]:
        continue
    print('Training with validation')
    trn_data = lgb.Dataset(train[used_features].iloc[train_index], label=train[target].iloc[train_index], categorical_feature="")
    val_data = lgb.Dataset(train[used_features].iloc[valid_index], label=train[target].iloc[valid_index], categorical_feature="")
    # NOTE(review): `verbose_eval` and `early_stopping_rounds` keyword
    # arguments were removed from lgb.train in lightgbm 4.x in favour of
    # callbacks — confirm the lightgbm version this targets.
    clf = lgb.train(params, trn_data, num_boost_round=N_round, valid_sets=[trn_data, val_data], verbose_eval=Verbose,
                    early_stopping_rounds=Early_Stopping_Rounds)
    val = clf.predict(train[used_features].iloc[valid_index])
    # If the label was log1p-transformed for training, invert with expm1
    # before scoring so the MAE is reported on the original scale.
    if target_log:
        mae_ = mean_absolute_error(np.expm1(train.iloc[valid_index][target]), np.expm1(val))
    else:
        mae_ = mean_absolute_error(train.iloc[valid_index][target], val)
    print('MAE: {}'.format(mae_))
    MAEs.append(mae_)

# Retrain on the full data set; the iteration count is scaled up (here by
# 1.2) roughly in proportion to the extra data now available.
print("ReTraining on all data")
gc.enable()
del trn_data, val_data
gc.collect()
Best_iteration = clf.best_iteration
print("Best_iteration: ", Best_iteration)
trn_data = lgb.Dataset(train[used_features], label=train[target], categorical_feature="")
clf = lgb.train(params, trn_data, num_boost_round=int(Best_iteration * 1.2),
                valid_sets=[trn_data], verbose_eval=Verbose)
pred = clf.predict(test[used_features])
