一、Pandas数据结构

  1. import pandas as pd

Series

  1. 通过list构建Series
  1. ser_obj = pd.Series(range(10, 20,2))
  2. print (type(ser_obj))
  3. print(ser_obj)
  1. <class 'pandas.core.series.Series'>
  2. 0 10
  3. 1 12
  4. 2 14
  5. 3 16
  6. 4 18
  7. dtype: int64
  1. # 获取数据
  2. print (ser_obj.values)
  3. # 获取索引
  4. print (ser_obj.index)
  5. #范围索引数据类型
  6. # 预览数据
  7. print (ser_obj.head(3))
  8. #默认输出五行
  1. [10 12 14 16 18]
  2. RangeIndex(start=0, stop=5, step=1)
  3. 0 10
  4. 1 12
  5. 2 14
  6. dtype: int64
  1. 通过dict构建Series
  1. year_data = {2001: 17.8, 2002: 20.1, 2003: 16.5,2004:324,2423:243}
  2. ser_obj2 = pd.Series(year_data)
  3. print (ser_obj2.head(2))
  4. print (ser_obj2.index)
  5. print(ser_obj2)
  1. 2001 17.8
  2. 2002 20.1
  3. dtype: float64
  4. Int64Index([2001, 2002, 2003, 2004, 2423], dtype='int64')
  5. 2001 17.8
  6. 2002 20.1
  7. 2003 16.5
  8. 2004 324.0
  9. 2423 243.0
  10. dtype: float64
  1. # name属性【【【【【出问题了!!!】】】】】
  2. ser_obj2.name = '钱'
  3. ser_obj2.index.name = 'year'
  4. print (ser_obj2.head())
  1. year
  2. 2001 17.8
  3. 2002 20.1
  4. 2003 16.5
  5. 2004 324.0
  6. 2423 243.0
  7. Name: 钱, dtype: float64

DataFrame

  1. 通过ndarray构建DataFrame
  1. import numpy as np
  2. array = np.random.rand(5,4)
  3. print (array)
  4. df_obj = pd.DataFrame(array,columns=['a','b','c','d'])
  5. print (df_obj.head())
  6. print(df_obj.sort_values(by='a', ascending=False))
  1. [[0.23496522 0.92258429 0.36447462 0.52634697]
  2. [0.73743514 0.88175941 0.48944212 0.4173522 ]
  3. [0.21214568 0.57148666 0.59496072 0.49490723]
  4. [0.7458542 0.74743907 0.70475157 0.28130394]
  5. [0.43805937 0.90300134 0.00730653 0.68203725]]
  6. a b c d
  7. 0 0.234965 0.922584 0.364475 0.526347
  8. 1 0.737435 0.881759 0.489442 0.417352
  9. 2 0.212146 0.571487 0.594961 0.494907
  10. 3 0.745854 0.747439 0.704752 0.281304
  11. 4 0.438059 0.903001 0.007307 0.682037
  12. a b c d
  13. 3 0.745854 0.747439 0.704752 0.281304
  14. 1 0.737435 0.881759 0.489442 0.417352
  15. 4 0.438059 0.903001 0.007307 0.682037
  16. 0 0.234965 0.922584 0.364475 0.526347
  17. 2 0.212146 0.571487 0.594961 0.494907
  1. 通过dict构建DataFrame
  1. #一个键值对就相当于一列!!但是具体到字典里面的值所用到的一些函数还是不能很清楚
  2. dict_data = {'A': 1.,
  3. 'B': pd.Timestamp('20161217'),
  4. 'C': pd.Series(1, index=list(range(4)),dtype='float32'),
  5. 'D': np.array([3] * 4,dtype='int32'),
  6. 'E' : pd.Categorical(["Python","Java","C++","C#"]),
  7. 'F' : 'ChinaHadoop' }
  8. df_obj2 = pd.DataFrame(dict_data)
  9. print (df_obj2.head())
  1. A B C D E F
  2. 0 1.0 2016-12-17 1.0 3 Python ChinaHadoop
  3. 1 1.0 2016-12-17 1.0 3 Java ChinaHadoop
  4. 2 1.0 2016-12-17 1.0 3 C++ ChinaHadoop
  5. 3 1.0 2016-12-17 1.0 3 C# ChinaHadoop
  1. # 增加列
  2. df_obj2['G'] = df_obj2['D'] + 4
  3. print (df_obj2.head())
  4. xxx = pd.DataFrame(df_obj2,columns=['A','B','C','D','E','F','G','H'],index=[0,1,2,3,4])
  5. print(xxx)
  1. A B C D E F G
  2. 0 1.0 2016-12-17 1.0 3 Python ChinaHadoop 7
  3. 1 1.0 2016-12-17 1.0 3 Java ChinaHadoop 7
  4. 2 1.0 2016-12-17 1.0 3 C++ ChinaHadoop 7
  5. 3 1.0 2016-12-17 1.0 3 C# ChinaHadoop 7
  6. A B C D E F G H
  7. 0 1.0 2016-12-17 1.0 3.0 Python ChinaHadoop 7.0 NaN
  8. 1 1.0 2016-12-17 1.0 3.0 Java ChinaHadoop 7.0 NaN
  9. 2 1.0 2016-12-17 1.0 3.0 C++ ChinaHadoop 7.0 NaN
  10. 3 1.0 2016-12-17 1.0 3.0 C# ChinaHadoop 7.0 NaN
  11. 4 NaN NaT NaN NaN NaN NaN NaN NaN

Index

  1. print (type(ser_obj.index))
  2. print (type(df_obj2.index))
  3. print (df_obj2.index)
  1. <class 'pandas.core.indexes.range.RangeIndex'>
  2. <class 'pandas.core.indexes.numeric.Int64Index'>
  3. Int64Index([0, 1, 2, 3], dtype='int64')
  1. # 索引对象不可变
  2. df_obj2.index[0] = 2
  1. ---------------------------------------------------------------------------
  2. TypeError Traceback (most recent call last)
  3. <ipython-input-10-6367894e76d8> in <module>
  4. 1 # 索引对象不可变
  5. ----> 2 df_obj2.index[0] = 2
  6. ~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in __setitem__(self, key, value)
  7. 4258
  8. 4259 def __setitem__(self, key, value):
  9. -> 4260 raise TypeError("Index does not support mutable operations")
  10. 4261
  11. 4262 def __getitem__(self, key):
  12. TypeError: Index does not support mutable operations

二、Pandas数据操作

常用函数总结

·shape 获取数据的尺寸

  1. 获得df的size:df.shape
  2. 获得df中的行数:df.shape[0]
  3. 获得df中的列数: df.shape[1]
  4. 获得行索引信息:df.index
  5. 获得列索引信息:df.columns

·values 获得df中的值===中文没用

  1. df.values === 以列表的形式展现出来,去除了索引===dataframe类型数据转换成array类型

·set_index和reset_index

  1. reset_index可以还原索引,重新变为默认的整型索引
  2. DataFrame.reset_index(level=None, drop=False, inplace=False, col_level=0, col_fill='')
  3. level控制了具体要还原的哪个等级的索引
  4. drop为False则索引列会被还原为普通列,否则会丢失
  5. set_index方法,设置单索引和复合索引抑或是添加索引。
  6. DataFrame.set_index(keys, drop=True, append=False, inplace=False, verify_integrity=False)
  7. append添加新索引,drop为False,inplace为True时,索引将会还原为列

·iterrows()遍历DataFrame中的数据

  1. for index,row in df.iterrows():

·split(sep,n,expand=False)

  1. sep表示用于分割的字符;n表示最多分割多少次(得到最多n+1列);expand表示是否展开为数据框,True输出DataFrame,False输出Series
  2. 字段拆分:是指按照固定的字符,拆分已有字符串
  1. import pandas as pd
  2. import numpy as np

匿名函数应用

  1. # Numpy ufunc 函数
  2. df = pd.DataFrame(np.random.randn(5,4) - 1)
  3. print (df)
  4. print (np.abs(df))
  1. 0 1 2 3
  2. 0 0.624016 -2.695175 -1.211426 -0.386151
  3. 1 -1.335385 -1.315232 -0.305902 -0.361348
  4. 2 -0.349443 -2.032110 0.075995 -0.966725
  5. 3 -1.631192 -1.051390 -1.767981 -0.366663
  6. 4 -0.786178 -0.335846 -0.797992 -0.931216
  7. 0 1 2 3
  8. 0 0.624016 2.695175 1.211426 0.386151
  9. 1 1.335385 1.315232 0.305902 0.361348
  10. 2 0.349443 2.032110 0.075995 0.966725
  11. 3 1.631192 1.051390 1.767981 0.366663
  12. 4 0.786178 0.335846 0.797992 0.931216
  1. # 使用apply应用行或列数据
  2. f = lambda x : x.max()
  3. print (df.apply(f))
  1. 0 0.624016
  2. 1 -0.335846
  3. 2 0.075995
  4. 3 -0.361348
  5. dtype: float64
  1. # 指定轴方向
  2. print (df.apply(f, axis=1))
  1. 0 0.624016
  2. 1 -0.305902
  3. 2 0.075995
  4. 3 -0.366663
  5. 4 -0.335846
  6. dtype: float64
  1. # 使用applymap应用到每个数据
  2. f2 = lambda x : '%.2f' % x
  3. print (df.applymap(f2))
  1. 0 1 2 3
  2. 0 0.62 -2.70 -1.21 -0.39
  3. 1 -1.34 -1.32 -0.31 -0.36
  4. 2 -0.35 -2.03 0.08 -0.97
  5. 3 -1.63 -1.05 -1.77 -0.37
  6. 4 -0.79 -0.34 -0.80 -0.93

排序

  1. s4 = pd.Series(range(10, 15), index = np.random.randint(5, size=5))
  2. print (s4)
  1. 4 10
  2. 1 11
  3. 4 12
  4. 1 13
  5. 1 14
  6. dtype: int64
  1. 索引排序
  1. s4.sort_index()
  1. 1 11
  2. 1 13
  3. 1 14
  4. 4 10
  5. 4 12
  6. dtype: int64
  1. df4 = pd.DataFrame(np.random.randn(3, 4),
  2. index=np.random.randint(3, size=3),
  3. columns=np.random.randint(4, size=4))
  1. df4
2 1 3 1
0 0.007031 1.261990 -1.647929 0.176549
1 -2.510698 -0.207659 0.628221 0.441352
0 -0.367051 1.536606 0.167158 -0.236129
  1. #df4.sort_index(ascending=False)
  2. df4.sort_index(axis=1)
1 1 2 3
0 1.261990 0.176549 0.007031 -1.647929
1 -0.207659 0.441352 -2.510698 0.628221
0 1.536606 -0.236129 -0.367051 0.167158
  1. 按值排序
  1. #df.sortvalues(by='a', ascending=False) === 通过a的值
  2. # 作用是对选定的一列数值('a')数据从上往下从小到大进行排序(如果传值没成功===设置本体覆盖,传值覆盖)
  3. df4.sort_values(by=1)
  1. ---------------------------------------------------------------------------
  2. ValueError Traceback (most recent call last)
  3. <ipython-input-22-36ffa8ddd07d> in <module>
  4. 2 #df.sortvalues(by='a', ascending=False) === 通过a的值
  5. 3 # 作用是对选定的一列数值('a')数据从上往下从小到大进行排序(如果传值没成功===设置本体覆盖,传值覆盖)
  6. ----> 4 df4.sort_values(by=1)
  7. ~\Anaconda3\lib\site-packages\pandas\core\frame.py in sort_values(self, by, axis, ascending, inplace, kind, na_position)
  8. 4991
  9. 4992 by = by[0]
  10. -> 4993 k = self._get_label_or_level_values(by, axis=axis)
  11. 4994
  12. 4995 if isinstance(ascending, (tuple, list)):
  13. ~\Anaconda3\lib\site-packages\pandas\core\generic.py in _get_label_or_level_values(self, key, axis)
  14. 1795 key=key,
  15. 1796 label_axis_name=label_axis_name,
  16. -> 1797 multi_message=multi_message,
  17. 1798 )
  18. 1799 )
  19. ValueError: The column label '1' is not unique.