规整数据:连接、合并、重构、转换etc


三、数据规整-连接-含索引

  • 数据连接 merge
  1. import pandas as pd
  2. import numpy as np
  1. df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
  2. 'data1' : ['sfd','fdsf','we',24,3253,234,23]})
  3. df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
  4. 'data2' : np.random.randint(0,10,3)})
  5. print (df_obj1)
  6. print (df_obj2)
  1. key data1
  2. 0 b sfd
  3. 1 b fdsf
  4. 2 a we
  5. 3 c 24
  6. 4 a 3253
  7. 5 a 234
  8. 6 b 23
  9. key data2
  10. 0 a 0
  11. 1 b 6
  12. 2 d 6

merge和on

默认将重叠列的列名作为“外键”进行连接

  1. pd.merge(df_obj1, df_obj2)
key data1 data2
0 b sfd 6
1 b fdsf 6
2 b 23 6
3 a we 0
4 a 3253 0
5 a 234 0
  1. # on显示指定“外键”
  2. pd.merge(df_obj1, df_obj2, on='key')
key data1 data2
0 b sfd 6
1 b fdsf 6
2 b 23 6
3 a we 0
4 a 3253 0
5 a 234 0

left-on和right-on

  1. # left_on,right_on分别指定左侧数据和右侧数据的“外键”
  2. # 更改列名
  3. df_obj1 = df_obj1.rename(columns={'key':'key1'})
  4. df_obj2 = df_obj2.rename(columns={'key':'key2'})
  1. print(df_obj1)
  2. print(df_obj2)
  1. key1 data1
  2. 0 b sfd
  3. 1 b fdsf
  4. 2 a we
  5. 3 c 24
  6. 4 a 3253
  7. 5 a 234
  8. 6 b 23
  9. key2 data2
  10. 0 a 0
  11. 1 b 6
  12. 2 d 6
  1. pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2')
key1 data1 key2 data2
0 b sfd b 6
1 b fdsf b 6
2 b 23 b 6
3 a we a 0
4 a 3253 a 0
5 a 234 a 0

how

  1. # “外连接”
  2. pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='outer')
key1 data1 key2 data2
0 b sfd b 6.0
1 b fdsf b 6.0
2 b 23 b 6.0
3 a we a 0.0
4 a 3253 a 0.0
5 a 234 a 0.0
6 c 24 NaN NaN
7 NaN NaN d 6.0
  1. # 左连接
  2. pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='left')
key1 data1 key2 data2
0 b sfd b 6.0
1 b fdsf b 6.0
2 a we a 0.0
3 c 24 NaN NaN
4 a 3253 a 0.0
5 a 234 a 0.0
6 b 23 b 6.0
  1. # 右连接
  2. pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='right')
key1 data1 key2 data2
0 b sfd b 6
1 b fdsf b 6
2 b 23 b 6
3 a we a 0
4 a 3253 a 0
5 a 234 a 0
6 NaN NaN d 6

处理重复列名suffixes

  1. # 处理重复列名
  2. df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
  3. 'data' : np.random.randint(0,10,7)})
  4. df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
  5. 'data' : np.random.randint(0,10,3)})
  6. pd.merge(df_obj1, df_obj2, on='key', suffixes=('_left', '_right'))
key data_left data_right
0 b 9 1
1 b 1 1
2 b 6 1
3 a 7 1
4 a 3 1
5 a 4 1
  1. # 按索引连接
  2. df_obj3 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
  3. 'data1' : np.random.randint(0,10,7)})
  4. df_obj4 = pd.DataFrame({'data2' : np.random.randint(0,10,3)}, index=['a', 'b', 'd'])
  1. print(df_obj3)
  2. print(df_obj4)
  1. key data1
  2. 0 b 7
  3. 1 b 4
  4. 2 a 1
  5. 3 c 9
  6. 4 a 2
  7. 5 a 9
  8. 6 b 7
  9. data2
  10. a 9
  11. b 4
  12. d 0
  1. pd.merge(df_obj3, df_obj4, left_on='key', right_index=True)
key data1 data2
0 b 7 4
1 b 4 4
6 b 7 4
2 a 1 9
4 a 2 9
5 a 9 9

按索引连接right_index

  1. # 按索引连接
  2. df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
  3. 'data1' : np.random.randint(0,10,7)})
  4. df_obj2 = pd.DataFrame({'data2' : np.random.randint(0,10,3)}, index=['a', 'b', 'd'])
  1. print(df_obj1)
  2. print(df_obj2)
  1. key data1
  2. 0 b 0
  3. 1 b 2
  4. 2 a 7
  5. 3 c 3
  6. 4 a 1
  7. 5 a 1
  8. 6 b 6
  9. data2
  10. a 2
  11. b 1
  12. d 1
  1. pd.merge(df_obj1, df_obj2, left_on='key', right_index=True)
key data1 data2
0 b 0 1
1 b 2 1
6 b 6 1
2 a 7 2
4 a 1 2
5 a 1 2

四、数据合并

  • 数据合并 concat
  • 按索引连接===right_index
  1. import numpy as np
  2. import pandas as pd

numpy的concat

  1. arr1 = np.random.randint(0, 10, (3, 4))
  2. arr2 = np.random.randint(0, 10, (3, 4))
  3. print (arr1)
  4. print (arr2)
  1. [[6 0 3 2]
  2. [5 7 9 8]
  3. [5 8 0 3]]
  4. [[6 5 7 9]
  5. [0 1 0 0]
  6. [1 1 1 7]]
  1. np.concatenate([arr1, arr2])
  1. array([[6, 0, 3, 2],
  2. [5, 7, 9, 8],
  3. [5, 8, 0, 3],
  4. [6, 5, 7, 9],
  5. [0, 1, 0, 0],
  6. [1, 1, 1, 7]])
  1. np.concatenate([arr1, arr2], axis=1)
  1. array([[6, 0, 3, 2, 6, 5, 7, 9],
  2. [5, 7, 9, 8, 0, 1, 0, 0],
  3. [5, 8, 0, 3, 1, 1, 1, 7]])

series上的concat

  1. # index 没有重复的情况
  2. ser_obj1 = pd.Series(np.random.randint(0, 10, 5), index=range(0,5))
  3. ser_obj2 = pd.Series(np.random.randint(0, 10, 4), index=range(5,9))
  4. ser_obj3 = pd.Series(np.random.randint(0, 10, 3), index=range(9,12))
  1. pd.concat([ser_obj1, ser_obj2, ser_obj3])
  1. 0 0
  2. 1 4
  3. 2 5
  4. 3 1
  5. 4 9
  6. 5 7
  7. 6 8
  8. 7 5
  9. 8 0
  10. 9 5
  11. 10 9
  12. 11 0
  13. dtype: int32
  1. pd.concat([ser_obj1, ser_obj2, ser_obj3], axis=1)
0 1 2
0 0.0 NaN NaN
1 4.0 NaN NaN
2 5.0 NaN NaN
3 1.0 NaN NaN
4 9.0 NaN NaN
5 NaN 7.0 NaN
6 NaN 8.0 NaN
7 NaN 5.0 NaN
8 NaN 0.0 NaN
9 NaN NaN 5.0
10 NaN NaN 9.0
11 NaN NaN 0.0
  1. # index 有重复的情况
  2. ser_obj1 = pd.Series(np.random.randint(0, 10, 5), index=range(5))
  3. ser_obj2 = pd.Series(np.random.randint(0, 10, 4), index=range(4))
  4. ser_obj3 = pd.Series(np.random.randint(0, 10, 3), index=range(3))
  5. print (ser_obj1)
  6. print (ser_obj2)
  7. print (ser_obj3)
  1. 0 5
  2. 1 3
  3. 2 0
  4. 3 8
  5. 4 3
  6. dtype: int32
  7. 0 5
  8. 1 3
  9. 2 2
  10. 3 1
  11. dtype: int32
  12. 0 5
  13. 1 8
  14. 2 6
  15. dtype: int32
  1. pd.concat([ser_obj1, ser_obj2, ser_obj3])
  1. 0 5
  2. 1 3
  3. 2 0
  4. 3 8
  5. 4 3
  6. 0 5
  7. 1 3
  8. 2 2
  9. 3 1
  10. 0 5
  11. 1 8
  12. 2 6
  13. dtype: int32
  1. pd.concat([ser_obj1, ser_obj2, ser_obj3], axis=1, join='inner')
0 1 2
0 5 5 5
1 3 3 8
2 0 2 6

dataframe上的concat

  1. df_obj1 = pd.DataFrame(np.random.randint(0, 10, (3, 2)), index=['a', 'b', 'c'],
  2. columns=['A', 'B'])
  3. df_obj2 = pd.DataFrame(np.random.randint(0, 10, (2, 2)), index=['a', 'b'],
  4. columns=['C', 'D'])
  5. print (df_obj1)
  6. print (df_obj2)
  1. A B
  2. a 4 3
  3. b 8 1
  4. c 6 3
  5. C D
  6. a 1 3
  7. b 8 2
  1. pd.concat([df_obj1, df_obj2])
  1. C:\Users\wztli\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
  2. of pandas will change to not sort by default.
  3. To accept the future behavior, pass 'sort=False'.
  4. To retain the current behavior and silence the warning, pass 'sort=True'.
  5. """Entry point for launching an IPython kernel.
A B C D
a 4.0 3.0 NaN NaN
b 8.0 1.0 NaN NaN
c 6.0 3.0 NaN NaN
a NaN NaN 1.0 3.0
b NaN NaN 8.0 2.0
  1. pd.concat([df_obj1, df_obj2], axis=1)
  1. C:\Users\wztli\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
  2. of pandas will change to not sort by default.
  3. To accept the future behavior, pass 'sort=False'.
  4. To retain the current behavior and silence the warning, pass 'sort=True'.
  5. """Entry point for launching an IPython kernel.
A B C D
a 4 3 1.0 3.0
b 8 1 8.0 2.0
c 6 3 NaN NaN

五、数据重构

  1. import numpy as np
  2. import pandas as pd

stack

  1. df_obj = pd.DataFrame(np.random.randint(0,10, (5,2)), columns=['data1', 'data2'])
  2. df_obj
data1 data2
0 0 4
1 6 2
2 9 8
3 7 0
4 3 1
  1. stacked = df_obj.stack()
  2. print (stacked)
  1. 0 data1 0
  2. data2 4
  3. 1 data1 6
  4. data2 2
  5. 2 data1 9
  6. data2 8
  7. 3 data1 7
  8. data2 0
  9. 4 data1 3
  10. data2 1
  11. dtype: int32
  1. print (type(stacked))
  2. print (type(stacked.index))
  1. <class 'pandas.core.series.Series'>
  2. <class 'pandas.core.indexes.multi.MultiIndex'>

unstack

  1. # 默认操作内层索引
  2. stacked.unstack()
data1 data2
0 0 4
1 6 2
2 9 8
3 7 0
4 3 1
  1. # 通过level指定操作索引的级别
  2. stacked.unstack(level=0)
0 1 2 3 4
data1 0 6 9 7 3
data2 4 2 8 0 1

六、数据转换

  1. import numpy as np
  2. import pandas as pd

重复数据duplicates函数

  1. df_obj = pd.DataFrame({'data1' : ['a'] * 4 + ['b'] * 4,
  2. 'data2' : np.random.randint(0, 4, 8)})
  3. df_obj
data1 data2
0 a 3
1 a 2
2 a 2
3 a 1
4 b 0
5 b 2
6 b 2
7 b 1
  1. df_obj.duplicated()
  1. 0 False
  2. 1 False
  3. 2 True
  4. 3 False
  5. 4 False
  6. 5 False
  7. 6 True
  8. 7 False
  9. dtype: bool
  1. df_obj.drop_duplicates()
data1 data2
0 a 3
1 a 2
3 a 1
4 b 0
5 b 2
7 b 1
  1. df_obj.drop_duplicates('data2')
data1 data2
0 a 3
1 a 2
3 a 1
4 b 0

map函数

  1. ser_obj = pd.Series(np.random.randint(0,10,10))
  2. ser_obj
  1. 0 1
  2. 1 9
  3. 2 1
  4. 3 2
  5. 4 7
  6. 5 2
  7. 6 4
  8. 7 5
  9. 8 4
  10. 9 6
  11. dtype: int32
  1. ser_obj.map(lambda x : x ** 2)
  1. 0 1
  2. 1 81
  3. 2 1
  4. 3 4
  5. 4 49
  6. 5 4
  7. 6 16
  8. 7 25
  9. 8 16
  10. 9 36
  11. dtype: int64

数据替换repalce

  1. # 替换单个值
  2. ser_obj.replace(0, -100)
  1. 0 1
  2. 1 9
  3. 2 1
  4. 3 2
  5. 4 7
  6. 5 2
  7. 6 4
  8. 7 5
  9. 8 4
  10. 9 6
  11. dtype: int32
  1. # 替换多个值
  2. ser_obj.replace([0, 2], -100)
  1. 0 1
  2. 1 9
  3. 2 1
  4. 3 -100
  5. 4 7
  6. 5 -100
  7. 6 4
  8. 7 5
  9. 8 4
  10. 9 6
  11. dtype: int32
  1. # 替换多个值
  2. ser_obj.replace([0, 2], [-100, -200])
  1. 0 1
  2. 1 9
  3. 2 1
  4. 3 -200
  5. 4 7
  6. 5 -200
  7. 6 4
  8. 7 5
  9. 8 4
  10. 9 6
  11. dtype: int64
  1. ser_obj.map(lambda x : x ** 2)
  2. #### 3. 数据替换repalce
  3. # 替换单个值
  4. ser_obj.replace(0, -100)
  5. # 替换多个值
  6. ser_obj.replace([0, 2], -100)
  7. # 替换多个值
  8. ser_obj.replace([0, 2], [-100, -200])
  1. 0 1
  2. 1 9
  3. 2 1
  4. 3 -200
  5. 4 7
  6. 5 -200
  7. 6 4
  8. 7 5
  9. 8 4
  10. 9 6
  11. dtype: int64