数据科学特征工程
agg是一个聚合函数,使用指定轴上的一个或多个操作进行聚合。通过agg函数,可以同时对多列进行提取特征,非常适合用于特征工程。

内置的聚合函数

在Pandas内部支持了13种聚合函数,可以在分组之后进行使用:

  • **mean()**:分组均值
  • **sum()**:分组求和
  • **size()**:分组元素个数(包含缺失值)
  • **count()**:分组非缺失值个数
  • **std()**:分组标准差
  • **var()**:分组方差
  • **sem()**:分组均值标准误(standard error of the mean)
  • **describe()**:分组描述
  • **first()**:分组第一个元素
  • **last()**:分组最后一个元素
  • **nth()**:分组第N个元素
  • **min()**:分组最小值
  • **max()**:分组最大值

案例如下,有多种使用方式可供选择:

  1. # 定义模型
  2. df = pd.DataFrame({'group':[1,1,2,2],
  3. 'values':[4,1,1,2],
  4. 'values2':[0,1,1,2]
  5. })
  6. # 分组对两列求均值
  7. df.groupby('group').mean()
  8. # 分组对两列求均值、标准差
  9. df.groupby('group').agg([np.mean,np.std])
  10. # 分组对两列分别聚合
  11. df.groupby('group').agg(
  12. {'values':['mean','median'],
  13. 'values2':['mean','std']}
  14. )

自定义聚合函数

如果在Pandas内部的聚合函数不满足要求,也可以自定义聚合函数搭配使用

median

  1. def median(x):
  2. return np.median(x)

variation_coefficient

  1. def variation_coefficient(x):
  2. mean = np.mean(x)
  3. if mean != 0:
  4. return np.std(x) / mean
  5. else:
  6. return np.nan

variance

  1. def variance(x):
  2. return np.var(x)

skewness

  1. def skewness(x):
  2. if not isinstance(x, pd.Series):
  3. x = pd.Series(x)
  4. return pd.Series.skew(x)

kurtosis

  1. def kurtosis(x):
  2. if not isinstance(x, pd.Series):
  3. x = pd.Series(x)
  4. return pd.Series.kurtosis(x)

standard_deviation

  1. def standard_deviation(x):
  2. return np.std(x)

large_standard_deviation

  1. def large_standard_deviation(x):
  2. if (np.max(x)-np.min(x)) == 0:
  3. return np.nan
  4. else:
  5. return np.std(x)/(np.max(x)-np.min(x))

variation_coefficient

  1. def variation_coefficient(x):
  2. mean = np.mean(x)
  3. if mean != 0:
  4. return np.std(x) / mean
  5. else:
  6. return np.nan

variance_std_ratio

  1. def variance_std_ratio(x):
  2. y = np.var(x)
  3. if y != 0:
  4. return y/np.sqrt(y)
  5. else:
  6. return np.nan

ratio_beyond_r_sigma

  1. def ratio_beyond_r_sigma(x, r):
  2. if x.size == 0:
  3. return np.nan
  4. else:
  5. return np.sum(np.abs(x - np.mean(x)) > r * np.asarray(np.std(x))) / x.size

range_ratio

  1. def range_ratio(x):
  2. mean_median_difference = np.abs(np.mean(x) - np.median(x))
  3. max_min_difference = np.max(x) - np.min(x)
  4. if max_min_difference == 0:
  5. return np.nan
  6. else:
  7. return mean_median_difference / max_min_difference

has_duplicate_max

  1. def has_duplicate_max(x):
  2. return np.sum(x == np.max(x)) >= 2

has_duplicate_min

  1. def has_duplicate_min(x):
  2. return np.sum(x == np.min(x)) >= 2

has_duplicate

  1. def has_duplicate(x):
  2. return x.size != np.unique(x).size

count_duplicate_max

  1. def count_duplicate_max(x):
  2. return np.sum(x == np.max(x))

count_duplicate_min

  1. def count_duplicate_min(x):
  2. return np.sum(x == np.min(x))

count_duplicate

  1. def count_duplicate(x):
  2. return x.size - np.unique(x).size

sum_values

  1. def sum_values(x):
  2. if len(x) == 0:
  3. return 0
  4. return np.sum(x)

log_return

  1. def log_return(list_stock_prices):
  2. return np.log(list_stock_prices).diff()

realized_volatility

  1. def realized_volatility(series):
  2. return np.sqrt(np.sum(series**2))

realized_abs_skew

  1. def realized_abs_skew(series):
  2. return np.power(np.abs(np.sum(series**3)),1/3)

realized_skew

  1. def realized_skew(series):
  2. return np.sign(np.sum(series**3))*np.power(np.abs(np.sum(series**3)),1/3)

realized_vol_skew

  1. def realized_vol_skew(series):
  2. return np.power(np.abs(np.sum(series**6)),1/6)

realized_quarticity

  1. def realized_quarticity(series):
  2. return np.power(np.sum(series**4),1/4)

count_unique

  1. def count_unique(series):
  2. return len(np.unique(series))

count

  1. def count(series):
  2. return series.size

maximum_drawdown

  1. def maximum_drawdown(series):
  2. series = np.asarray(series)
  3. if len(series)<2:
  4. return 0
  5. k = series[np.argmax(np.maximum.accumulate(series) - series)]
  6. i = np.argmax(np.maximum.accumulate(series) - series)
  7. if len(series[:i])<1:
  8. return np.NaN
  9. else:
  10. j = np.max(series[:i])
  11. return j-k

maximum_drawup

  1. def maximum_drawup(series):
  2. series = np.asarray(series)
  3. if len(series)<2:
  4. return 0
  5. series = - series
  6. k = series[np.argmax(np.maximum.accumulate(series) - series)]
  7. i = np.argmax(np.maximum.accumulate(series) - series)
  8. if len(series[:i])<1:
  9. return np.NaN
  10. else:
  11. j = np.max(series[:i])
  12. return j-k

drawdown_duration

  1. def drawdown_duration(series):
  2. series = np.asarray(series)
  3. if len(series)<2:
  4. return 0
  5. k = np.argmax(np.maximum.accumulate(series) - series)
  6. i = np.argmax(np.maximum.accumulate(series) - series)
  7. if len(series[:i]) == 0:
  8. j=k
  9. else:
  10. j = np.argmax(series[:i])
  11. return k-j

drawup_duration

  1. def drawup_duration(series):
  2. series = np.asarray(series)
  3. if len(series)<2:
  4. return 0
  5. series=-series
  6. k = np.argmax(np.maximum.accumulate(series) - series)
  7. i = np.argmax(np.maximum.accumulate(series) - series)
  8. if len(series[:i]) == 0:
  9. j=k
  10. else:
  11. j = np.argmax(series[:i])
  12. return k-j

max_over_min

  1. def max_over_min(series):
  2. if len(series)<2:
  3. return 0
  4. if np.min(series) == 0:
  5. return np.nan
  6. return np.max(series)/np.min(series)

mean_n_absolute_max

  1. def mean_n_absolute_max(x, number_of_maxima = 1):
  2. """ Calculates the arithmetic mean of the n absolute maximum values of the time series."""
  3. assert (
  4. number_of_maxima > 0
  5. ), f" number_of_maxima={number_of_maxima} which is not greater than 1"
  6. n_absolute_maximum_values = np.sort(np.absolute(x))[-number_of_maxima:]
  7. return np.mean(n_absolute_maximum_values) if len(x) > number_of_maxima else np.NaN

count_above

  1. def count_above(x, t):
  2. if len(x)==0:
  3. return np.nan
  4. else:
  5. return np.sum(x >= t) / len(x)

count_below

  1. def count_below(x, t):
  2. if len(x)==0:
  3. return np.nan
  4. else:
  5. return np.sum(x <= t) / len(x)

number_peaks

def number_peaks(x, n):
    """Count values strictly greater than all of their n neighbours on both sides.

    x: array-like series; n: neighbourhood radius.
    NOTE(review): relies on a helper ``_roll`` defined elsewhere in the file —
    presumably a circular shift similar to ``np.roll``; confirm before reuse.
    """
    x_reduced = x[n:-n]  # interior points that have n neighbours on each side
    res = None
    for i in range(1, n + 1):
        # strictly greater than the neighbour i positions away on one side
        result_first = x_reduced > _roll(x, i)[n:-n]
        if res is None:
            res = result_first
        else:
            res &= result_first
        # ... and greater than the neighbour i positions away on the other side
        res &= x_reduced > _roll(x, -i)[n:-n]
    return np.sum(res)

mean_abs_change

  1. def mean_abs_change(x):
  2. return np.mean(np.abs(np.diff(x)))

mean_change

  1. def mean_change(x):
  2. x = np.asarray(x)
  3. return (x[-1] - x[0]) / (len(x) - 1) if len(x) > 1 else np.NaN

mean_second_derivative_central

  1. def mean_second_derivative_central(x):
  2. x = np.asarray(x)
  3. return (x[-1] - x[-2] - x[1] + x[0]) / (2 * (len(x) - 2)) if len(x) > 2 else np.NaN

root_mean_square

  1. def root_mean_square(x):
  2. return np.sqrt(np.mean(np.square(x))) if len(x) > 0 else np.NaN

absolute_sum_of_changes

  1. def absolute_sum_of_changes(x):
  2. return np.sum(np.abs(np.diff(x)))

longest_strike_below_mean

def longest_strike_below_mean(x):
    """Length of the longest run of consecutive values below the mean; 0 if empty.

    NOTE(review): depends on ``_get_length_sequences_where`` defined elsewhere
    in the file — presumably it returns the lengths of the True runs; confirm.
    """
    if not isinstance(x, (np.ndarray, pd.Series)):
        x = np.asarray(x)
    return np.max(_get_length_sequences_where(x < np.mean(x))) if x.size > 0 else 0

longest_strike_above_mean

def longest_strike_above_mean(x):
    """Length of the longest run of consecutive values above the mean; 0 if empty.

    NOTE(review): depends on ``_get_length_sequences_where`` defined elsewhere
    in the file — presumably it returns the lengths of the True runs; confirm.
    """
    if not isinstance(x, (np.ndarray, pd.Series)):
        x = np.asarray(x)
    return np.max(_get_length_sequences_where(x > np.mean(x))) if x.size > 0 else 0

count_above_mean

  1. def count_above_mean(x):
  2. m = np.mean(x)
  3. return np.where(x > m)[0].size

count_below_mean

  1. def count_below_mean(x):
  2. m = np.mean(x)
  3. return np.where(x < m)[0].size

last_location_of_maximum

  1. def last_location_of_maximum(x):
  2. x = np.asarray(x)
  3. return 1.0 - np.argmax(x[::-1]) / len(x) if len(x) > 0 else np.NaN

first_location_of_maximum

  1. def first_location_of_maximum(x):
  2. if not isinstance(x, (np.ndarray, pd.Series)):
  3. x = np.asarray(x)
  4. return np.argmax(x) / len(x) if len(x) > 0 else np.NaN

last_location_of_minimum

  1. def last_location_of_minimum(x):
  2. x = np.asarray(x)
  3. return 1.0 - np.argmin(x[::-1]) / len(x) if len(x) > 0 else np.NaN

first_location_of_minimum

  1. def first_location_of_minimum(x):
  2. if not isinstance(x, (np.ndarray, pd.Series)):
  3. x = np.asarray(x)
  4. return np.argmin(x) / len(x) if len(x) > 0 else np.NaN

percentage_of_reoccurring_values_to_all_values

  1. def percentage_of_reoccurring_values_to_all_values(x):
  2. if len(x) == 0:
  3. return np.nan
  4. unique, counts = np.unique(x, return_counts=True)
  5. if counts.shape[0] == 0:
  6. return 0
  7. return np.sum(counts > 1) / float(counts.shape[0])

percentage_of_reoccurring_datapoints_to_all_datapoints

  1. def percentage_of_reoccurring_datapoints_to_all_datapoints(x):
  2. if len(x) == 0:
  3. return np.nan
  4. if not isinstance(x, pd.Series):
  5. x = pd.Series(x)
  6. value_counts = x.value_counts()
  7. reoccuring_values = value_counts[value_counts > 1].sum()
  8. if np.isnan(reoccuring_values):
  9. return 0
  10. return reoccuring_values / x.size

sum_of_reoccurring_values

  1. def sum_of_reoccurring_values(x):
  2. unique, counts = np.unique(x, return_counts=True)
  3. counts[counts < 2] = 0
  4. counts[counts > 1] = 1
  5. return np.sum(counts * unique)

sum_of_reoccurring_data_points

  1. def sum_of_reoccurring_data_points(x):
  2. unique, counts = np.unique(x, return_counts=True)
  3. counts[counts < 2] = 0
  4. return np.sum(counts * unique)

ratio_value_number_to_time_series_length

  1. def ratio_value_number_to_time_series_length(x):
  2. if not isinstance(x, (np.ndarray, pd.Series)):
  3. x = np.asarray(x)
  4. if x.size == 0:
  5. return np.nan
  6. return np.unique(x).size / x.size

abs_energy

  1. def abs_energy(x):
  2. if not isinstance(x, (np.ndarray, pd.Series)):
  3. x = np.asarray(x)
  4. return np.dot(x, x)

quantile

  1. def quantile(x, q):
  2. if len(x) == 0:
  3. return np.NaN
  4. return np.quantile(x, q)

number_crossing_m

  1. def number_crossing_m(x, m):
  2. if not isinstance(x, (np.ndarray, pd.Series)):
  3. x = np.asarray(x)
  4. # From https://stackoverflow.com/questions/3843017/efficiently-detect-sign-changes-in-python
  5. positive = x > m
  6. return np.where(np.diff(positive))[0].size

absolute_maximum

  1. def absolute_maximum(x):
  2. return np.max(np.absolute(x)) if len(x) > 0 else np.NaN

value_count

  1. def value_count(x, value):
  2. if not isinstance(x, (np.ndarray, pd.Series)):
  3. x = np.asarray(x)
  4. if np.isnan(value):
  5. return np.isnan(x).sum()
  6. else:
  7. return x[x == value].size

range_count

  1. def range_count(x, min, max):
  2. return np.sum((x >= min) & (x < max))

mean_diff

  1. def mean_diff(x):
  2. return np.nanmean(np.diff(x.values))