导入库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
创建数据
使用pd.Series创建Series对象
s = pd.Series([1,3,5,np.nan,6,8])
s
>>>
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
通过numpy的array数据来创建DataFrame对象
dates = pd.date_range('20130101', periods=6)
dates
>>>
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
print(df)
>>>
A B C D
2013-01-01 0.342275 -0.333060 -0.294502 1.808311
2013-01-02 -0.010251 -0.322083 -0.992557 -0.960891
2013-01-03 -0.344072 -1.185725 0.674009 -0.716058
2013-01-04 -0.235446 -1.721794 -1.265767 0.242253
2013-01-05 3.074955 1.848873 1.813445 -0.795627
2013-01-06 -0.039975 1.090794 -0.605099 -1.111459
通过字典创建DataFrame对象
df2 = pd.DataFrame({ 'A' : 1.,
    'B' : pd.Timestamp('20130102'),
    'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
    'D' : np.array([3] * 4,dtype='int32'),
    'E' : pd.Categorical(["test","train","test","train"]),
    'F' : 'foo' })
print(df2)
>>>
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo
print(df2.dtypes)
>>>
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
print(dir(df2))
>>>
返回一个列表(包含df2的全部属性名和方法名,此处省略)
数据查看
基本方法,务必掌握,更多查看数据的相关方法可以参阅官方文档
下面分别是查看数据的顶部和尾部的方法
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
print(df.head(2)) # 查找头部的前两个数据,head()默认是前5个
>>>
A B C D
2013-01-01 0.342275 -0.333060 -0.294502 1.808311
2013-01-02 -0.010251 -0.322083 -0.992557 -0.960891
print(df.tail(3)) # 查找尾部的3个数据
>>>
A B C D
2013-01-04 -0.235446 -1.721794 -1.265767 0.242253
2013-01-05 3.074955 1.848873 1.813445 -0.795627
2013-01-06 -0.039975 1.090794 -0.605099 -1.111459
查看DataFrame对象的索引,列名,数据信息
print(df.index) # 查找数据的索引
>>>
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
print(df.columns) # 查找数据的列名
>>>
Index(['A', 'B', 'C', 'D'], dtype='object')
print(df.values)
>>>
[[ 0.34227537, -0.33306022, -0.29450173, 1.80831125],
[-0.01025096, -0.3220833 , -0.99255656, -0.96089093],
[-0.34407203, -1.18572491, 0.67400852, -0.71605802],
[-0.2354458 , -1.7217938 , -1.26576668, 0.24225255],
[ 3.07495472, 1.84887323, 1.81344527, -0.79562727],
[-0.0399747 , 1.0907938 , -0.60509926, -1.11145858]]
描述性统计
print(df.describe())
>>>
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.896009 0.507334 0.520300 -0.604691
std 1.226597 0.871341 1.047377 1.338247
min -0.654441 -0.562629 -1.176366 -2.093140
25% -0.189769 -0.114563 0.228438 -1.450522
50% 1.394002 0.541984 0.574028 -0.900380
75% 1.781452 0.899909 0.927062 0.030616
max 2.049583 1.836860 1.992128 1.558711
数据转置
print(df.T)
>>>
2013-01-01 2013-01-02 2013-01-03 2013-01-04 2013-01-05 2013-01-06
A 0.106123 -0.102469 2.113397 0.231633 -0.483121 -0.433374
B 0.313585 -0.490534 1.031808 -0.209964 -0.679466 -0.562606
C -0.030473 0.006813 -2.225570 -0.246044 0.066495 -0.371361
D 0.416181 -1.344376 -1.508693 -1.496924 -0.236305 0.136705
根据列名排序
df.sort_index(axis=1, ascending=False) # 默认是True(升序),降序是False
>>>
D C B A
2013-01-01 -1.743265 -2.436682 -0.126209 -0.909977
2013-01-02 -1.237405 -0.479658 0.289598 0.386831
2013-01-03 1.222879 -0.611954 0.466847 -0.964970
2013-01-04 3.709656 -0.859897 0.626873 1.386637
2013-01-05 0.372616 -0.668504 -0.824841 -0.577940
2013-01-06 -0.172015 1.126194 -0.763443 -0.343243
根据B列数值排序
print(df.sort_values(by='B'))
>>>
A B C D
2013-01-06 1.693180 -1.613240 -0.146807 -1.550596
2013-01-01 0.156765 -1.120396 0.447468 -1.014416
2013-01-03 0.775879 -1.093397 0.227219 0.169062
2013-01-05 -1.279614 -1.088507 -0.327012 -0.519709
2013-01-02 0.936605 -0.299676 -1.370837 -0.425901
2013-01-04 0.812881 0.881192 -0.350331 -2.037380
数据选取
官方建议使用经过优化的pandas数据访问方法.at、.iat、.loc和.iloc;.ix只能在较早的pandas版本中使用(自0.20.0起被弃用,1.0.0起已移除)
5分钟学会Pandas中iloc/loc/ix区别
使用[ ]选取数据
选取单列数据,等效于df.A:
print(df['A']) # 相当于df.A
>>>
2013-01-01 0.342275
2013-01-02 -0.010251
2013-01-03 -0.344072
2013-01-04 -0.235446
2013-01-05 3.074955
2013-01-06 -0.039975
Freq: D, Name: A, dtype: float64
按行选取数据,使用[]
print(df[0:3]) # 打印前3行数据,按位置切片,前闭后开(包含下标0,不包含下标3)
>>>
A B C D
2013-01-01 -1.889500 -1.413149 -0.039584 0.031551
2013-01-02 1.480268 0.108239 -0.005645 0.536260
2013-01-03 1.385717 0.227386 -0.098316 1.056272
print(df['20130102':'20130104']) # 按标签切片,首尾两端都包含
>>>
A B C D
2013-01-02 -0.175396 -0.608281 1.472997 -0.842902
2013-01-03 1.073921 0.536321 -1.062791 0.778709
2013-01-04 0.144927 -0.107287 -0.594705 0.644814
通过标签选取数据
print(df.loc[dates[0]]) # 选取第一行数据
>>>
A 0.342275
B -0.333060
C -0.294502
D 1.808311
Name: 2013-01-01 00:00:00, dtype: float64
print(df.loc[:,['A','B']])
>>>
A B
2013-01-01 1.105010 -0.320929
2013-01-02 -1.204395 -0.570691
2013-01-03 -0.786688 0.424701
2013-01-04 -0.121843 0.127801
2013-01-05 -0.035029 0.293037
2013-01-06 -0.603599 -1.931956
print(df.loc['20130102':'20130104',['A','B']])
>>>
A B
2013-01-02 0.467955 -2.148467
2013-01-03 0.887625 1.035388
2013-01-04 0.055645 -0.018191
print(df.loc['20130102',['A','B']])
>>>
A -1.138735
B -0.100542
Name: 2013-01-02 00:00:00, dtype: float64
print(df.loc[dates[0],'A'])
>>>
0.7391924458347098
print(df.at[dates[0],'A'])
>>>
0.7391924458347098
通过位置选取数据
print(df.iloc[3]) # 获取下标为3(第4行)的数据
>>>
A -0.381260
B 0.868501
C -1.668756
D 0.839632
Name: 2013-01-04 00:00:00, dtype: float64
print(df.iloc[3:5, 0:2]) # 前闭后开:行取下标3到4,列取下标0到1
>>>
A B
2013-01-04 0.482456 -1.216927
2013-01-05 1.008627 0.427897
print(df.iloc[[1,2,4],[0,2]]) # 取下标为1,2,4的行、下标为0,2的列
>>>
A C
2013-01-02 1.112508 0.969343
2013-01-03 -0.164053 -1.322557
2013-01-05 -1.073691 -0.356547
print(df.iloc[1:3]) # 取下标从1到3(3取不到)的行
>>>
A B C D
2013-01-02 0.800642 -0.504769 0.519685 1.978916
2013-01-03 0.137714 0.540270 0.374199 -1.224552
print(df.iloc[:, 1:3]) # 取全部的行,列取1到3(3取不到)
>>>
B C
2013-01-01 -0.333060 -0.294502
2013-01-02 -0.322083 -0.992557
2013-01-03 -1.185725 0.674009
2013-01-04 -1.721794 -1.265767
2013-01-05 1.848873 1.813445
2013-01-06 1.090794 -0.605099
print(df.iloc[1, 1]) # 获取单个值坐标为1,1
>>>
1.265304932997343
print(df.iat[1, 1]) # 获取单个值坐标为1,1
>>>
-0.4083246435680777
使用布尔索引
print(df[df.A>0]) # 取A列大于0的行
>>>
A B C D
2013-01-01 0.342275 -0.333060 -0.294502 1.808311
2013-01-05 3.074955 1.848873 1.813445 -0.795627
print(df[df>0]) # 取全部大于0的数据,不满足条件的位置显示为NaN
>>>
A B C D
2013-01-01 NaN 0.362070 NaN 2.037941
2013-01-02 1.500178 NaN 0.168748 0.620532
2013-01-03 NaN NaN 0.922889 NaN
2013-01-04 1.698255 0.025913 NaN NaN
2013-01-05 0.152086 0.348645 0.237132 0.868915
2013-01-06 NaN 0.050824 0.075995 NaN
df2 = df.copy() # 复制df
df2['E'] = ['one', 'one','two','three','four','three'] # df2新增1列
print(df2)
>>>
A B C D E
2013-01-01 -0.897222 -1.714759 0.358384 -1.475133 one
2013-01-02 -1.707707 -0.444518 -2.838489 -2.436182 one
2013-01-03 -0.955428 0.005758 -0.264125 -0.045104 two
2013-01-04 1.037277 0.255815 0.180912 -0.311802 three
2013-01-05 1.631085 3.236270 -0.039909 -0.280554 four
2013-01-06 1.758670 1.209860 0.948103 0.129601 three
print(df2[df2['E'].isin(['two','four'])]) # 取df2中E列的值为two或four的每一行
>>>
A B C D E
2013-01-03 -0.269128 -0.624533 -1.616405 -0.678576 two
2013-01-05 0.203549 -0.853705 -0.523561 1.429644 four
数据可视化
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) # 创建对象
print(ts.head())
ts = ts.cumsum() # 累加
