一、分组

GroupBy对象

· groupby函数中的参数:

  1. as_index的作用:控制聚合输出是否以组标签作为索引,默认为True,即分组键成为(可能分层的)行索引;若为False,则结果另加一列默认的整数索引,分组键作为普通列保留在结果中。
  2. 但是这两种输出有什么区别呢?=== 区别在于:分组所依据的那一列是否作为结果的索引列。
  3. sort_values的作用:按选定的一列数据从小到大排序(ascending=True为默认值);默认返回排序后的新对象,若要直接修改原对象需传入inplace=True。
  1. import pandas as pd
  2. import numpy as np
  3. import matplotlib.pyplot as plt
  4. import matplotlib.pyplot
  5. %matplotlib inline
  1. dict_obj = {'key1' : ['a', 'b', 'a', 'b',
  2. 'a', 'b', 'a', 'a'],
  3. 'key2' : ['one', 'one', 'two', 'three',
  4. 'two', 'two', 'one', 'three'],
  5. 'data1': np.random.randn(8),
  6. 'data2': np.random.randn(8)}
  7. df_obj = pd.DataFrame(dict_obj)
  8. print (df_obj)
  1. key1 key2 data1 data2
  2. 0 a one -0.147612 -0.348087
  3. 1 b one -0.992986 0.902458
  4. 2 a two 0.547541 -0.310040
  5. 3 b three 0.458871 -1.895392
  6. 4 a two 1.224041 0.220150
  7. 5 b two -0.200124 -1.562237
  8. 6 a one 1.539144 -0.758716
  9. 7 a three 0.385845 0.074309
  1. '''1. dataframe根据key2进行分组'''
  2. print(df_obj.groupby('key2')['key1'].count())
  3. print (type(df_obj.groupby('key1')))
  4. #没有可视化的输出
  1. key2
  2. one 3
  3. three 2
  4. two 3
  5. Name: key1, dtype: int64
  6. <class 'pandas.core.groupby.generic.DataFrameGroupBy'>
  1. '''2. 指定列根据key1进行分组'''
  2. print (type(df_obj['data1'].groupby(df_obj['key1'])))
  1. <class 'pandas.core.groupby.generic.SeriesGroupBy'>
  1. # 分组运算
  2. grouped1 = df_obj.groupby('key1',as_index=False)
  3. print (grouped1.mean())
  4. grouped2 = df_obj['data1'].groupby(df_obj['key1'])#指定某一列的数据在该索引下进行分组并且加以聚合
  5. print (grouped2.mean())
  1. key1 data1 data2
  2. 0 a 0.709792 -0.224477
  3. 1 b -0.244746 -0.851723
  4. key1
  5. a 0.709792
  6. b -0.244746
  7. Name: data1, dtype: float64
  1. '''3. 按自定义key分组,列表'''
  2. self_def_key = [1, 1, 2, 2, 2, 1, 1, 1]
  3. df_obj.groupby(self_def_key).mean()
data1 data2
1 0.116853 -0.338455
2 0.743484 -0.661761
  1. df_obj
key1 key2 data1 data2
0 a one -0.147612 -0.348087
1 b one -0.992986 0.902458
2 a two 0.547541 -0.310040
3 b three 0.458871 -1.895392
4 a two 1.224041 0.220150
5 b two -0.200124 -1.562237
6 a one 1.539144 -0.758716
7 a three 0.385845 0.074309
  1. '''4. 按多个列多层分组 = = = 通过列表'''
  2. grouped2 = df_obj.groupby(['key1', 'key2'],as_index=False)
  3. print (grouped2.mean())
  4. print('--------比较asindex的差异-------')
  5. grouped2 = df_obj.groupby(['key1', 'key2'],as_index=True)
  6. print (grouped2.mean())
  1. key1 key2 data1 data2
  2. 0 a one 0.695766 -0.553401
  3. 1 a three 0.385845 0.074309
  4. 2 a two 0.885791 -0.044945
  5. 3 b one -0.992986 0.902458
  6. 4 b three 0.458871 -1.895392
  7. 5 b two -0.200124 -1.562237
  8. --------比较asindex的差异-------
  9. data1 data2
  10. key1 key2
  11. a one 0.695766 -0.553401
  12. three 0.385845 0.074309
  13. two 0.885791 -0.044945
  14. b one -0.992986 0.902458
  15. three 0.458871 -1.895392
  16. two -0.200124 -1.562237
  1. # 多层分组时,索引层级按传入key的先后顺序排列===配合as_index=True,所选取的分组列会成为多层索引,这才是与as_index=False的区别之处
  2. grouped3 = df_obj.groupby(['key2', 'key1'])
  3. print (grouped3.mean())
  4. print ('=============================================')
  5. '''PS:如果想按照列进行分组聚合运算 === unstack===也可以通过转置'''
  6. print (grouped3.mean().unstack())
  1. data1 data2
  2. key2 key1
  3. one a 0.695766 -0.553401
  4. b -0.992986 0.902458
  5. three a 0.385845 0.074309
  6. b 0.458871 -1.895392
  7. two a 0.885791 -0.044945
  8. b -0.200124 -1.562237
  9. =============================================
  10. data1 data2
  11. key1 a b a b
  12. key2
  13. one 0.695766 -0.992986 -0.553401 0.902458
  14. three 0.385845 0.458871 0.074309 -1.895392
  15. two 0.885791 -0.200124 -0.044945 -1.562237

GroupBy对象遍历迭代

  1. grouped1
  1. <pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001AF5B5F9088>
  1. # 单层分组
  2. print(grouped1.head(5))
  3. print("------------------------------------分割线------------------------------------------")
  4. for group_name, group_data in grouped1:
  5. print (group_name)
  6. print (group_data['data1'])
  1. key1 key2 data1 data2
  2. 0 a one -0.147612 -0.348087
  3. 1 b one -0.992986 0.902458
  4. 2 a two 0.547541 -0.310040
  5. 3 b three 0.458871 -1.895392
  6. 4 a two 1.224041 0.220150
  7. 5 b two -0.200124 -1.562237
  8. 6 a one 1.539144 -0.758716
  9. 7 a three 0.385845 0.074309
  10. ------------------------------------分割线------------------------------------------
  11. a
  12. 0 -0.147612
  13. 2 0.547541
  14. 4 1.224041
  15. 6 1.539144
  16. 7 0.385845
  17. Name: data1, dtype: float64
  18. b
  19. 1 -0.992986
  20. 3 0.458871
  21. 5 -0.200124
  22. Name: data1, dtype: float64
  1. # 多层分组
  2. for group_name, group_data in grouped2:
  3. print (group_name)
  4. print (group_data)
  1. ('a', 'one')
  2. key1 key2 data1 data2
  3. 0 a one -0.147612 -0.348087
  4. 6 a one 1.539144 -0.758716
  5. ('a', 'three')
  6. key1 key2 data1 data2
  7. 7 a three 0.385845 0.074309
  8. ('a', 'two')
  9. key1 key2 data1 data2
  10. 2 a two 0.547541 -0.31004
  11. 4 a two 1.224041 0.22015
  12. ('b', 'one')
  13. key1 key2 data1 data2
  14. 1 b one -0.992986 0.902458
  15. ('b', 'three')
  16. key1 key2 data1 data2
  17. 3 b three 0.458871 -1.895392
  18. ('b', 'two')
  19. key1 key2 data1 data2
  20. 5 b two -0.200124 -1.562237
  1. # GroupBy对象转换list
  2. print(grouped1.mean())
  3. list(grouped1)
  1. key1 data1 data2
  2. 0 a 0.709792 -0.224477
  3. 1 b -0.244746 -0.851723
  4. [('a', key1 key2 data1 data2
  5. 0 a one -0.147612 -0.348087
  6. 2 a two 0.547541 -0.310040
  7. 4 a two 1.224041 0.220150
  8. 6 a one 1.539144 -0.758716
  9. 7 a three 0.385845 0.074309), ('b', key1 key2 data1 data2
  10. 1 b one -0.992986 0.902458
  11. 3 b three 0.458871 -1.895392
  12. 5 b two -0.200124 -1.562237)]
  1. # GroupBy对象转换dict
  2. dict(list(grouped1))
  1. {'a': key1 key2 data1 data2
  2. 0 a one -0.147612 -0.348087
  3. 2 a two 0.547541 -0.310040
  4. 4 a two 1.224041 0.220150
  5. 6 a one 1.539144 -0.758716
  6. 7 a three 0.385845 0.074309, 'b': key1 key2 data1 data2
  7. 1 b one -0.992986 0.902458
  8. 3 b three 0.458871 -1.895392
  9. 5 b two -0.200124 -1.562237}
  1. # 按列分组
  2. print (df_obj.dtypes)
  3. # 按数据类型分组
  4. df_obj.groupby(df_obj.dtypes, axis=1).size()
  5. df_obj.groupby(df_obj.dtypes, axis=1).sum()
  1. key1 object
  2. key2 object
  3. data1 float64
  4. data2 float64
  5. dtype: object

其他分组方法

  1. 其实传入列表也是指定分组键的一种方式
    ===传入多个键组成的列表时,结果一般带有多层索引
  1. df_obj2 = pd.DataFrame(np.random.randint(1, 10, (5,5)),
  2. columns=['a', 'b', 'c', 'd', 'e'],
  3. index=['A', 'B', 'C', 'D', 'E'])
  4. df_obj2.ix[1, 1:4] = np.NaN
  5. df_obj2
  1. C:\Users\wztli\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: FutureWarning:
  2. .ix is deprecated. Please use
  3. .loc for label based indexing or
  4. .iloc for positional indexing
  5. See the documentation here:
  6. http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  7. after removing the cwd from sys.path.
a b c d e
A 4 2.0 6.0 5.0 9
B 5 NaN NaN NaN 6
C 2 3.0 8.0 6.0 3
D 9 5.0 6.0 5.0 9
E 4 1.0 6.0 2.0 1
  1. 通过字典分组
  1. # 通过字典分组
  2. mapping_dict = {'A':'python', 'B':'python', 'C':'java', 'D':'C', 'E':'java'}
  3. #df_obj2.groupby(mapping_dict, axis=1).size()
  4. #df_obj2.groupby(mapping_dict, axis=1).count() # 非NaN的个数
  5. print(df_obj2.groupby(mapping_dict, axis=0).sum())
  1. a b c d e
  2. C 9 5.0 6.0 5.0 9
  3. java 6 4.0 14.0 8.0 4
  4. python 9 2.0 6.0 5.0 15
  1. 通过函数分组
  1. # 通过函数分组
  2. df_obj3 = pd.DataFrame(np.random.randint(1, 10, (5,5)),
  3. columns=['a', 'b', 'c', 'd', 'e'],
  4. index=['AA', 'BBB', 'CC', 'D', 'EE'])
  5. #df_obj3
  6. def group_key(idx):
  7. """
  8. idx 为列索引或行索引
  9. """
  10. #return idx
  11. return len(idx)
  12. df_obj3.groupby(group_key).size()
  13. # 以上自定义函数等价于
  14. #df_obj3.groupby(len).size()
  1. 1 1
  2. 2 3
  3. 3 1
  4. dtype: int64
  1. 通过层级索引级别分组
  1. # 通过索引级别分组
  2. columns = pd.MultiIndex.from_arrays([['Python', 'Java', 'Python', 'Java', 'Python'],
  3. ['A', 'A', 'B', 'C', 'B']], names=['language', 'index'])
  4. df_obj4 = pd.DataFrame(np.random.randint(1, 10, (5, 5)), columns=columns)
  5. df_obj4
language Python Java Python Java Python
index A A B C B
0 4 6 8 8 4
1 1 3 2 3 5
2 3 1 1 5 6
3 2 9 3 1 9
4 4 1 5 6 6
  1. # 根据language进行分组
  2. df_obj4.groupby(level='language', axis=1).sum()
  3. df_obj4.groupby(level='index', axis=1).sum()
index A B C
0 10 12 8
1 4 7 3
2 4 7 5
3 11 12 1
4 5 11 6

二、聚合

  1. dict_obj = {'key1' : ['a', 'b', 'a', 'b',
  2. 'a', 'b', 'a', 'a'],
  3. 'key2' : ['one', 'one', 'two', 'three',
  4. 'two', 'two', 'one', 'three'],
  5. 'data1': np.random.randint(1,10, 8),
  6. 'data2': np.random.randint(1,10, 8)}
  7. df_obj5 = pd.DataFrame(dict_obj)
  8. print (df_obj5)
  1. key1 key2 data1 data2
  2. 0 a one 9 4
  3. 1 b one 6 7
  4. 2 a two 9 4
  5. 3 b three 9 6
  6. 4 a two 6 2
  7. 5 b two 3 3
  8. 6 a one 1 1
  9. 7 a three 2 6

内置的聚合函数

  1. df_obj5
key1 key2 data1 data2
0 a one 9 4
1 b one 6 7
2 a two 9 4
3 b three 9 6
4 a two 6 2
5 b two 3 3
6 a one 1 1
7 a three 2 6
  1. # 内置的聚合函数
  2. #print (df_obj5.groupby('key1').sum())
  3. #print (df_obj5.groupby('key1').max())
  4. #print (df_obj5.groupby('key1').min())
  5. print (df_obj5.groupby('key1').mean())
  6. #print (df_obj5.groupby('key1').size())
  7. #print (df_obj5.groupby('key1').count())
  8. #print (df_obj5.groupby('key1').describe())
  9. '''
  10. count:分组中非NA的值
  11. std:标准差
  12. var:方差
  13. median:非NA中的中位数
  14. mean:非NA的平均值
  15. 25%||50%||75%是什么意思==不造?
  16. '''
  1. data1 data2
  2. key1
  3. a 5.4 3.400000
  4. b 6.0 5.333333
  5. '\ncount:分组中非NA的值\nstd:标准差\nvar:方差\nmedian:非NA中的中位数\nmean:非NA的平均值\n25%||50%||75%是什么意思==不造?\n'

自定义聚合函数

  1. # 自定义聚合函数
  2. def peak_range(df):
  3. """
  4. 返回数值范围
  5. """
  6. #print type(df) #参数为索引所对应的记录
  7. return df.max() - df.min()
  8. print (df_obj5.groupby('key1').agg(peak_range))
  9. #print df_obj.groupby('key1').agg(lambda df : df.max() - df.min())
  10. #默认列名就是函数名。
  1. data1 data2
  2. key1
  3. a 8 5
  4. b 6 4
  1. 同时应用多个聚合函数:agg
  1. # 同时应用多个聚合函数:agg
  2. print (df_obj.groupby('key1').agg(['mean', 'std', 'count']))
  1. data1 data2
  2. mean std count mean std count
  3. key1
  4. a 0.709792 0.674293 5 -0.224477 0.385674 5
  5. b -0.244746 0.726957 3 -0.851723 1.528271 3
  1. print (df_obj.groupby('key1').agg(['mean', 'std', 'count', ('range', peak_range)])) # 通过元组提供新的列名
  1. data1 data2
  2. mean std count range mean std count range
  3. key1
  4. a 0.709792 0.674293 5 1.686756 -0.224477 0.385674 5 0.978865
  5. b -0.244746 0.726957 3 1.451857 -0.851723 1.528271 3 2.797850
  1. # 每列作用不同的聚合函数
  2. dict_mapping = {'data1':'mean',
  3. 'data2':'sum'}
  4. print (df_obj.groupby('key1').agg(dict_mapping))
  1. data1 data2
  2. key1
  3. a 0.709792 -1.122384
  4. b -0.244746 -2.555170
  1. dict_mapping = {'data1':['mean','max'],
  2. 'data2':'sum'}
  3. print (df_obj.groupby('key1').agg(dict_mapping))
  1. data1 data2
  2. mean max sum
  3. key1
  4. a 0.709792 1.539144 -1.122384
  5. b -0.244746 0.458871 -2.555170

三、分组运算

  1. import pandas as pd
  2. import numpy as np

分组和对齐

  1. s1 = pd.Series(range(10, 20), index = range(10))
  2. s2 = pd.Series(range(20, 25), index = range(5))
  3. print ('s1: ' )
  4. print (s1)
  5. print('===========================')
  6. print ('s2: ')
  7. print (s2)
  1. s1:
  2. 0 10
  3. 1 11
  4. 2 12
  5. 3 13
  6. 4 14
  7. 5 15
  8. 6 16
  9. 7 17
  10. 8 18
  11. 9 19
  12. dtype: int64
  13. ===========================
  14. s2:
  15. 0 20
  16. 1 21
  17. 2 22
  18. 3 23
  19. 4 24
  20. dtype: int64
  1. # Series 对齐运算
  2. s1 + s2
  3. print(s1+s2)
  1. 0 30.0
  2. 1 32.0
  3. 2 34.0
  4. 3 36.0
  5. 4 38.0
  6. 5 NaN
  7. 6 NaN
  8. 7 NaN
  9. 8 NaN
  10. 9 NaN
  11. dtype: float64
  1. df1 = pd.DataFrame(np.ones((2,2)), columns = ['a', 'b'])
  2. df2 = pd.DataFrame(np.ones((3,3)), columns = ['a', 'b', 'c'])
  3. print ('df1: ')
  4. print (df1)
  5. print ('=================')
  6. print ('df2: ')
  7. print (df2)
  1. df1:
  2. a b
  3. 0 1.0 1.0
  4. 1 1.0 1.0
  5. =================
  6. df2:
  7. a b c
  8. 0 1.0 1.0 1.0
  9. 1 1.0 1.0 1.0
  10. 2 1.0 1.0 1.0
  1. # DataFrame对齐操作
  2. print(df1 + df2)
  1. a b c
  2. 0 2.0 2.0 NaN
  3. 1 2.0 2.0 NaN
  4. 2 NaN NaN NaN

① 常用运算函数

  1. # 填充未对齐的数据进行运算
  2. print(s1.add(s2, fill_value = -1))
  1. 0 30.0
  2. 1 32.0
  3. 2 34.0
  4. 3 36.0
  5. 4 38.0
  6. 5 14.0
  7. 6 15.0
  8. 7 16.0
  9. 8 17.0
  10. 9 18.0
  11. dtype: float64
  1. df1.sub(df2, fill_value = 2.)
  2. #sub函数
a b c
0 0.0 0.0 1.0
1 0.0 0.0 1.0
2 1.0 1.0 1.0
  1. # 填充NaN
  2. s3 = s1 + s2
  3. print (s3)
  1. 0 30.0
  2. 1 32.0
  3. 2 34.0
  4. 3 36.0
  5. 4 38.0
  6. 5 NaN
  7. 6 NaN
  8. 7 NaN
  9. 8 NaN
  10. 9 NaN
  11. dtype: float64
  1. s3_filled = s3.fillna(-1)
  2. print (s3)
  1. 0 30.0
  2. 1 32.0
  3. 2 34.0
  4. 3 36.0
  5. 4 38.0
  6. 5 NaN
  7. 6 NaN
  8. 7 NaN
  9. 8 NaN
  10. 9 NaN
  11. dtype: float64
  1. df3 = df1 + df2
  2. print (df3)
  1. a b c
  2. 0 2.0 2.0 NaN
  3. 1 2.0 2.0 NaN
  4. 2 NaN NaN NaN
  1. df3.fillna(100, inplace = True)
  2. print (df3)
  1. a b c
  2. 0 2.0 2.0 100.0
  3. 1 2.0 2.0 100.0
  4. 2 100.0 100.0 100.0

统计计算VS聚合运算

  1. df_obj1 = pd.DataFrame(np.random.randn(5,4), columns = ['a', 'b', 'c', 'd'])
  2. print(df_obj1)
  1. a b c d
  2. 0 -0.542708 0.201376 1.111431 1.784324
  3. 1 0.583422 0.231096 -2.801967 0.568497
  4. 2 -0.577329 -1.668581 -0.842126 1.803080
  5. 3 -0.128431 -1.769619 2.089983 0.209761
  6. 4 0.493981 -1.571405 0.690019 -0.215292
  1. print(df_obj1.sum(axis=1))
  2. print('=====================================')
  3. print(df_obj1.max())
  4. print('=====================================')
  5. print(df_obj1.min(axis=1))
  1. 0 2.554423
  2. 1 -1.418952
  3. 2 -1.284956
  4. 3 0.401694
  5. 4 -0.602698
  6. dtype: float64
  7. =====================================
  8. a 0.583422
  9. b 0.231096
  10. c 2.089983
  11. d 1.803080
  12. dtype: float64
  13. =====================================
  14. 0 -0.542708
  15. 1 -2.801967
  16. 2 -1.668581
  17. 3 -1.769619
  18. 4 -1.571405
  19. dtype: float64

数据分组运算

  1. # 分组运算后保持shape
  2. dict_obj = {'key1' : ['a', 'b', 'a', 'b',
  3. 'a', 'b', 'a', 'a'],
  4. 'key2' : ['one', 'one', 'two', 'three',
  5. 'two', 'two', 'one', 'three'],
  6. 'data1': np.random.randint(1, 10, 8),
  7. 'data2': np.random.randint(1, 10, 8)}
  8. df_obj = pd.DataFrame(dict_obj)
  9. df_obj
key1 key2 data1 data2
0 a one 4 3
1 b one 4 4
2 a two 9 6
3 b three 8 2
4 a two 3 3
5 b two 6 2
6 a one 4 1
7 a three 2 2
  1. # 按key1分组后,计算data1,data2的统计信息======并附加到原始表格中
  2. k1_sum = df_obj.groupby('key1').sum().add_prefix('sum_')
  3. print(k1_sum)
  4. print('================================')
  5. print(df_obj)
  1. sum_data1 sum_data2
  2. key1
  3. a 22 15
  4. b 18 8
  5. ================================
  6. key1 key2 data1 data2
  7. 0 a one 4 3
  8. 1 b one 4 4
  9. 2 a two 9 6
  10. 3 b three 8 2
  11. 4 a two 3 3
  12. 5 b two 6 2
  13. 6 a one 4 1
  14. 7 a three 2 2
  1. merge方法
  1. # 方法1,使用merge
  2. pd.merge(df_obj, k1_sum, left_on='key1', right_index=True)
key1 key2 data1 data2 sum_data1 sum_data2
0 a one 4 3 22 15
2 a two 9 6 22 15
4 a two 3 3 22 15
6 a one 4 1 22 15
7 a three 2 2 22 15
1 b one 4 4 18 8
3 b three 8 2 18 8
5 b two 6 2 18 8
  1. transform方法
  1. # 方法2,使用transform
  2. k1_sum_tf = df_obj.groupby('key1').transform(np.sum).add_prefix('sum_')
  3. df_obj[k1_sum_tf.columns] = k1_sum_tf
  4. df_obj
key1 key2 data1 data2 sum_key2 sum_data1 sum_data2
0 a one 4 3 onetwotwoonethree 22 15
1 b one 4 4 onethreetwo 18 8
2 a two 9 6 onetwotwoonethree 22 15
3 b three 8 2 onethreetwo 18 8
4 a two 3 3 onetwotwoonethree 22 15
5 b two 6 2 onethreetwo 18 8
6 a one 4 1 onetwotwoonethree 22 15
7 a three 2 2 onetwotwoonethree 22 15
  1. 自定义函数
  1. # 自定义函数传入transform
  2. def diff_mean(s):
  3. """
  4. 返回数据与均值的差值
  5. """
  6. return s - s.mean()
  7. df_obj.groupby('key1').transform(diff_mean)
data1 data2 sum_data1 sum_data2
0 -0.4 0.000000 0 0
1 -2.0 1.333333 0 0
2 4.6 3.000000 0 0
3 2.0 -0.666667 0 0
4 -1.4 0.000000 0 0
5 0.0 -0.666667 0 0
6 -0.4 -2.000000 0 0
7 -2.4 -1.000000 0 0
  1. dataset_path = './data/starcraft.csv'
  2. df_data = pd.read_csv(dataset_path, usecols=['LeagueIndex', 'Age', 'HoursPerWeek',
  3. 'TotalHours', 'APM'])
  • apply
  1. def top_n(df, n=3, column='APM'):
  2. """
  3. 返回每个分组按 column 的 top n 数据
  4. """
  5. return df.sort_values(by=column, ascending=False)[:n]
  6. df_data.groupby('LeagueIndex').apply(top_n)
LeagueIndex Age HoursPerWeek TotalHours APM
LeagueIndex
1 2214 1 20.0 12.0 730.0 172.9530
2246 1 27.0 8.0 250.0 141.6282
1753 1 20.0 28.0 100.0 139.6362
2 3062 2 20.0 6.0 100.0 179.6250
3229 2 16.0 24.0 110.0 156.7380
1520 2 29.0 6.0 250.0 151.6470
3 1557 3 22.0 6.0 200.0 226.6554
484 3 19.0 42.0 450.0 220.0692
2883 3 16.0 8.0 800.0 208.9500
4 2688 4 26.0 24.0 990.0 249.0210
1759 4 16.0 6.0 75.0 229.9122
2637 4 23.0 24.0 650.0 227.2272
5 3277 5 18.0 16.0 950.0 372.6426
93 5 17.0 36.0 720.0 335.4990
202 5 37.0 14.0 800.0 327.7218
6 734 6 16.0 28.0 730.0 389.8314
2746 6 16.0 28.0 4000.0 350.4114
1810 6 21.0 14.0 730.0 323.2506
7 3127 7 23.0 42.0 2000.0 298.7952
104 7 21.0 24.0 1000.0 286.4538
1654 7 18.0 98.0 700.0 236.0316
8 3393 8 NaN NaN NaN 375.8664
3373 8 NaN NaN NaN 364.8504
3372 8 NaN NaN NaN 355.3518
  1. # apply函数接收的参数会传入自定义的函数中
  2. df_data.groupby('LeagueIndex').apply(top_n, n=2, column='Age')
LeagueIndex Age HoursPerWeek TotalHours APM
LeagueIndex
1 3146 1 40.0 12.0 150.0 38.5590
3040 1 39.0 10.0 500.0 29.8764
2 920 2 43.0 10.0 730.0 86.0586
2437 2 41.0 4.0 200.0 54.2166
3 1258 3 41.0 14.0 800.0 77.6472
2972 3 40.0 10.0 500.0 60.5970
4 1696 4 44.0 6.0 500.0 89.5266
1729 4 39.0 8.0 500.0 86.7246
5 202 5 37.0 14.0 800.0 327.7218
2745 5 37.0 18.0 1000.0 123.4098
6 3069 6 31.0 8.0 800.0 133.1790
2706 6 31.0 8.0 700.0 66.9918
7 2813 7 26.0 36.0 1300.0 188.5512
1992 7 26.0 24.0 1000.0 219.6690
8 3340 8 NaN NaN NaN 189.7404
3341 8 NaN NaN NaN 287.8128
  • 不把分组键加入结果索引:group_keys=False
  1. df_data.groupby('LeagueIndex', group_keys=False).apply(top_n)
LeagueIndex Age HoursPerWeek TotalHours APM
2214 1 20.0 12.0 730.0 172.9530
2246 1 27.0 8.0 250.0 141.6282
1753 1 20.0 28.0 100.0 139.6362
3062 2 20.0 6.0 100.0 179.6250
3229 2 16.0 24.0 110.0 156.7380
1520 2 29.0 6.0 250.0 151.6470
1557 3 22.0 6.0 200.0 226.6554
484 3 19.0 42.0 450.0 220.0692
2883 3 16.0 8.0 800.0 208.9500
2688 4 26.0 24.0 990.0 249.0210
1759 4 16.0 6.0 75.0 229.9122
2637 4 23.0 24.0 650.0 227.2272
3277 5 18.0 16.0 950.0 372.6426
93 5 17.0 36.0 720.0 335.4990
202 5 37.0 14.0 800.0 327.7218
734 6 16.0 28.0 730.0 389.8314
2746 6 16.0 28.0 4000.0 350.4114
1810 6 21.0 14.0 730.0 323.2506
3127 7 23.0 42.0 2000.0 298.7952
104 7 21.0 24.0 1000.0 286.4538
1654 7 18.0 98.0 700.0 236.0316
3393 8 NaN NaN NaN 375.8664
3373 8 NaN NaN NaN 364.8504
3372 8 NaN NaN NaN 355.3518