数据合并
合并 — concat
import numpy as npimport pandas as pd
#合并两个seriess1 = pd.Series(['a', 'b'])s2 = pd.Series(['c', 'd'])pd.concat([s1, s2])
0 a1 b0 c1 ddtype: object
# 忽略存在得index,并且重新设定index值pd.concat([s1,s2],ignore_index=True)
0 a1 b2 c3 ddtype: object
# 增加复合索引 pd.concat([s1,s2],keys=['s1','s2'])
s1 0 a 1 bs2 0 c 1 ddtype: object
# 合并两个DataFrame df1 = pd.DataFrame([['a', 1], ['b', 2]],columns= ['letter', 'number']) df2 = pd.DataFrame([['c', 3], ['d', 4]],columns= ['letter', 'number']) df1 df2 pd.concat([df1, df2])
|
letter |
number |
| 0 |
a |
1 |
| 1 |
b |
2 |
| 0 |
c |
3 |
| 1 |
d |
4 |
# 将DataFrame具有重叠列的对象合并,缺失值将填充NaN值 df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']], columns=['letter', 'number', 'animal'])pd.concat([df1,df3],sort=False)
|
letter |
number |
animal |
| 0 |
a |
1 |
NaN |
| 1 |
b |
2 |
NaN |
| 0 |
c |
3 |
cat |
| 1 |
d |
4 |
dog |
# 合并DataFrame具有重叠列,并仅通过传递inner给join关键字参数 返回那些共享的对象pd.concat([df1, df3], join="inner")
|
letter |
number |
| 0 |
a |
1 |
| 1 |
b |
2 |
| 0 |
c |
3 |
| 1 |
d |
4 |
# DataFrame传入,沿x轴水平合并对象axis=1 df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],columns=['animal', 'name']) pd.concat([df1, df4], axis=1)
|
letter |
number |
animal |
name |
| 0 |
a |
1 |
bird |
polly |
| 1 |
b |
2 |
monkey |
george |
#纵向合并:纵向堆叠,需要区分情况。#横向合并:横向堆叠,即将两个表在X轴向拼接在一起,可以使用concat函数完成
join
DataFrame.join(other, on=None, how='left', lsuffix='', rsuffix='', sort=False)
df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})df
|
key |
A |
| 0 |
K0 |
A0 |
| 1 |
K1 |
A1 |
| 2 |
K2 |
A2 |
| 3 |
K3 |
A3 |
| 4 |
K4 |
A4 |
| 5 |
K5 |
A5 |
other = pd.DataFrame({'key': ['K0', 'K1', 'K2'], 'B': ['B0', 'B1', 'B2']})other
|
key |
B |
| 0 |
K0 |
B0 |
| 1 |
K1 |
B1 |
| 2 |
K2 |
B2 |
df.join(other, how='inner',lsuffix='_caller', rsuffix='_other')
|
key_caller |
A |
key_other |
B |
| 0 |
K0 |
A0 |
K0 |
B0 |
| 1 |
K1 |
A1 |
K1 |
B1 |
| 2 |
K2 |
A2 |
K2 |
B2 |
merge
pd.merge(left, right, how='inner', on=None, left_on=None,right_on=None,left_index=False, right_index=False)
#参数名称 说明#left 接收DataFrame,左表,无默认。#right 接收DataFrame或Series。右表,无默认。#how {‘left’, ‘right’, ‘outer’, ‘inner’}, default ‘inner’#on 接收string或sequence。表示外键字段名。#left_on 关联操作时左表中的关联字段名。#right_on 关联操作时右表中的关联字段名。#left_index 是否将左表的index作为连接主键。(False)#right_index 是否将右表的index作为连接主键。(False)#sort 是否根据连接键对合并后的数据进行排序。(False)#suffixes tuple。合并后左表与右表重叠列名的别名尾缀。默认为('x', 'y')。