1. 导入与预览数据

  1. import numpy as np
  2. import pandas as pd
  3. import seaborn as sns
  4. plants = pd.read_csv('planets.csv')
  5. print(plants.head())
  6. method number orbital_period mass distance year
  7. 0 Radial Velocity 1 269.300 7.10 77.40 2006
  8. 1 Radial Velocity 1 874.774 2.21 56.95 2008
  9. 2 Radial Velocity 1 763.000 2.60 19.84 2011
  10. 3 Radial Velocity 1 326.030 19.40 110.62 2007
  11. 4 Radial Velocity 1 516.220 10.50 119.47 2009

2. 找出存在缺失值列

  1. print(plants.isnull().any())
  2. method False
  3. number False
  4. orbital_period True
  5. mass True
  6. distance True
  7. year False
  8. dtype: bool

3. 找出多列都为空行数据

  1. print("找出['orbital_period']、['mass']、['distance']三列都为空的行数据:")
  2. plants.loc[plants['orbital_period'].isnull()&plants['mass'].isnull()&plants['distance'].isnull()]
  3. 找出['orbital_period']、['mass']、['distance']三列都为空的行数据:

image.png

4. 去除缺失值所在行数据

  1. print('只保留所有列数据都不为NaN的行数据')
  2. plants.dropna()

image.png

5. 综合统计函数

  1. plants.describe()

image.png

6. 获取所有行星公转周期的中位数

  1. print('不同方法下所有行星公转周期(orbital_period)的中位数')
  2. print(plants.groupby('method'))
  3. print(plants.groupby('method')['orbital_period'])
  4. # groupby对象直到运行累计函数才开始计算
  5. print(plants.groupby('method')['orbital_period'].median())
  6. 不同方法下所有行星公转周期(orbital_period)的中位数
  7. <pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002115A3BBA60>
  8. <pandas.core.groupby.generic.SeriesGroupBy object at 0x000002115A3BBEB0>
  9. method
  10. Astrometry 631.180000
  11. Eclipse Timing Variations 4343.500000
  12. Imaging 27500.000000
  13. Microlensing 3300.000000
  14. Orbital Brightness Modulation 0.342887
  15. Pulsar Timing 66.541900
  16. Pulsation Timing Variations 1170.000000
  17. Radial Velocity 360.200000
  18. Transit 5.714932
  19. Transit Timing Variations 57.011000
  20. Name: orbital_period, dtype: float64

7. 分组使用综合统计函数

  1. print('对每一组数据描述性统计')
  2. plants.groupby('method')['year'].describe()
  3. # 通过count可以看出,大部分行星是通过Radial Velocity和Transit方法发现的,并且后种方法近期比较常用
  4. 对每一组数据描述性统计

image.png

8. 对所有列进行指定累计操作

  1. aa = plants.groupby('method').aggregate([min, max, np.median])
  2. print(aa)
  3. number orbital_period \
  4. min max median min max
  5. method
  6. Astrometry 1 1 1 246.360000 1016.000000
  7. Eclipse Timing Variations 1 2 2 1916.250000 10220.000000
  8. Imaging 1 4 1 4639.150000 730000.000000
  9. Microlensing 1 2 1 1825.000000 5100.000000
  10. Orbital Brightness Modulation 1 2 2 0.240104 1.544929
  11. Pulsar Timing 1 3 3 0.090706 36525.000000
  12. Pulsation Timing Variations 1 1 1 1170.000000 1170.000000
  13. Radial Velocity 1 6 1 0.736540 17337.500000
  14. Transit 1 7 1 0.355000 331.600590
  15. Transit Timing Variations 2 3 2 22.339500 160.000000
  16. mass distance \
  17. median min max median min
  18. method
  19. Astrometry 631.180000 NaN NaN NaN 14.98
  20. Eclipse Timing Variations 4343.500000 4.2000 6.05 5.125 130.72
  21. Imaging 27500.000000 NaN NaN NaN 7.69
  22. Microlensing 3300.000000 NaN NaN NaN 1760.00
  23. Orbital Brightness Modulation 0.342887 NaN NaN NaN 1180.00
  24. Pulsar Timing 66.541900 NaN NaN NaN 1200.00
  25. Pulsation Timing Variations 1170.000000 NaN NaN NaN NaN
  26. Radial Velocity 360.200000 0.0036 25.00 1.260 1.35
  27. Transit 5.714932 1.4700 1.47 1.470 38.00
  28. Transit Timing Variations 57.011000 NaN NaN NaN 339.00
  29. year
  30. max median min max median
  31. method
  32. Astrometry 20.77 17.875 2010 2013 2011.5
  33. Eclipse Timing Variations 500.00 315.360 2008 2012 2010.0
  34. Imaging 165.00 40.395 2004 2013 2009.0
  35. Microlensing 7720.00 3840.000 2004 2013 2010.0
  36. Orbital Brightness Modulation 1180.00 1180.000 2011 2013 2011.0
  37. Pulsar Timing 1200.00 1200.000 1992 2011 1994.0
  38. Pulsation Timing Variations NaN NaN 2007 2007 2007.0
  39. Radial Velocity 354.00 40.445 1989 2014 2009.0
  40. Transit 8500.00 341.000 2002 2014 2012.0
  41. Transit Timing Variations 2119.00 855.000 2011 2014 2012.5

9. 对指定列进行指定操作

  1. print('各方法每年发现行星最大的是多少;各方法在哪年最早有发现行星')
  2. bb = plants.groupby('method').aggregate({'number': max, 'year':min})
  3. print(bb)
  4. 各方法每年发现行星最大的是多少;各方法在哪年最早有发现行星
  5. number year
  6. method
  7. Astrometry 1 2010
  8. Eclipse Timing Variations 2 2008
  9. Imaging 4 2004
  10. Microlensing 2 2004
  11. Orbital Brightness Modulation 2 2011
  12. Pulsar Timing 3 1992
  13. Pulsation Timing Variations 1 2007
  14. Radial Velocity 6 1989
  15. Transit 7 2002
  16. Transit Timing Variations 3 2011
  1. bb = plants.groupby('method').aggregate({'number': min, 'year':min})
  2. print('各方法每年发现行星最少的是多少;各方法在哪年最早有发现行星')
  3. print(bb)
  4. 各方法每年发现行星最少的是多少;各方法在哪年最早有发现行星
  5. number year
  6. method
  7. Astrometry 1 2010
  8. Eclipse Timing Variations 1 2008
  9. Imaging 1 2004
  10. Microlensing 1 2004
  11. Orbital Brightness Modulation 1 2011
  12. Pulsar Timing 1 1992
  13. Pulsation Timing Variations 1 2007
  14. Radial Velocity 1 1989
  15. Transit 1 2002
  16. Transit Timing Variations 2 2011
  1. bb = plants.groupby('method').aggregate({'number': min, 'year':max})
  2. print('各方法每年发现行星最少的是多少;各方法在哪年最近有发现行星')
  3. print(bb)
  4. 各方法每年发现行星最少的是多少;各方法在哪年最近有发现行星
  5. number year
  6. method
  7. Astrometry 1 2013
  8. Eclipse Timing Variations 1 2012
  9. Imaging 1 2013
  10. Microlensing 1 2013
  11. Orbital Brightness Modulation 1 2013
  12. Pulsar Timing 1 2011
  13. Pulsation Timing Variations 1 2007
  14. Radial Velocity 1 2014
  15. Transit 1 2014
  16. Transit Timing Variations 2 2014
  1. bb = plants.groupby('method').aggregate({'number': max, 'year':max})
  2. print('各方法每年发现行星最大的是多少;各方法在哪年最近有发现行星')
  3. print(bb)
  4. 各方法每年发现行星最大的是多少;各方法在哪年最近有发现行星
  5. number year
  6. method
  7. Astrometry 1 2013
  8. Eclipse Timing Variations 2 2012
  9. Imaging 4 2013
  10. Microlensing 2 2013
  11. Orbital Brightness Modulation 2 2013
  12. Pulsar Timing 3 2011
  13. Pulsation Timing Variations 1 2007
  14. Radial Velocity 6 2014
  15. Transit 7 2014
  16. Transit Timing Variations 3 2014
  1. print('各个方法发现行星年份的中位数:')
  2. cc = plants.groupby('method')['year'].median()
  3. print(cc)
  4. 各个方法发现行星年份的中位数:
  5. method
  6. Astrometry 2011.5
  7. Eclipse Timing Variations 2010.0
  8. Imaging 2009.0
  9. Microlensing 2010.0
  10. Orbital Brightness Modulation 2011.0
  11. Pulsar Timing 1994.0
  12. Pulsation Timing Variations 2007.0
  13. Radial Velocity 2009.0
  14. Transit 2012.0
  15. Transit Timing Variations 2012.5
  16. Name: year, dtype: float64

10. 不同方法和年份发现的行星数量

  1. dd = plants.pivot_table('number', index='method', columns='year', aggfunc='sum')
  2. print(dd)
  3. 不同方法和年份发现的行星数量
  4. year 1989 1992 1994 1995 1996 1997 1998 1999 \
  5. method
  6. Astrometry NaN NaN NaN NaN NaN NaN NaN NaN
  7. Eclipse Timing Variations NaN NaN NaN NaN NaN NaN NaN NaN
  8. Imaging NaN NaN NaN NaN NaN NaN NaN NaN
  9. Microlensing NaN NaN NaN NaN NaN NaN NaN NaN
  10. Orbital Brightness Modulation NaN NaN NaN NaN NaN NaN NaN NaN
  11. Pulsar Timing NaN 6.0 3.0 NaN NaN NaN NaN NaN
  12. Pulsation Timing Variations NaN NaN NaN NaN NaN NaN NaN NaN
  13. Radial Velocity 1.0 NaN NaN 1.0 15.0 1.0 11.0 24.0
  14. Transit NaN NaN NaN NaN NaN NaN NaN NaN
  15. Transit Timing Variations NaN NaN NaN NaN NaN NaN NaN NaN
  16. year 2000 2001 ... 2005 2006 2007 2008 2009 \
  17. method ...
  18. Astrometry NaN NaN ... NaN NaN NaN NaN NaN
  19. Eclipse Timing Variations NaN NaN ... NaN NaN NaN 4.0 1.0
  20. Imaging NaN NaN ... 1.0 4.0 1.0 17.0 3.0
  21. Microlensing NaN NaN ... 2.0 1.0 NaN 6.0 2.0
  22. Orbital Brightness Modulation NaN NaN ... NaN NaN NaN NaN NaN
  23. Pulsar Timing NaN NaN ... NaN NaN NaN NaN NaN
  24. Pulsation Timing Variations NaN NaN ... NaN NaN 1.0 NaN NaN
  25. Radial Velocity 27.0 15.0 ... 61.0 33.0 47.0 76.0 105.0
  26. Transit NaN NaN ... NaN 5.0 16.0 17.0 20.0
  27. Transit Timing Variations NaN NaN ... NaN NaN NaN NaN NaN
  28. year 2010 2011 2012 2013 2014
  29. method
  30. Astrometry 1.0 NaN NaN 1.0 NaN
  31. Eclipse Timing Variations 4.0 5.0 1.0 NaN NaN
  32. Imaging 9.0 3.0 2.0 7.0 NaN
  33. Microlensing 2.0 1.0 8.0 4.0 NaN
  34. Orbital Brightness Modulation NaN 4.0 NaN 1.0 NaN
  35. Pulsar Timing NaN 1.0 NaN NaN NaN
  36. Pulsation Timing Variations NaN NaN NaN NaN NaN
  37. Radial Velocity 92.0 176.0 70.0 65.0 21.0
  38. Transit 85.0 162.0 175.0 197.0 93.0
  39. Transit Timing Variations NaN 2.0 2.0 2.0 3.0
  40. [10 rows x 23 columns]