1. 导入与预览数据
import numpy as npimport pandas as pdimport seaborn as snsplants = pd.read_csv('planets.csv')print(plants.head()) method number orbital_period mass distance year0 Radial Velocity 1 269.300 7.10 77.40 20061 Radial Velocity 1 874.774 2.21 56.95 20082 Radial Velocity 1 763.000 2.60 19.84 20113 Radial Velocity 1 326.030 19.40 110.62 20074 Radial Velocity 1 516.220 10.50 119.47 2009
2. 找出存在缺失值列
print(plants.isnull().any()) method Falsenumber Falseorbital_period Truemass Truedistance Trueyear Falsedtype: bool
3. 找出多列都为空行数据
print("找出['orbital_period']、['mass']、['distance']三列都为空的行数据:")plants.loc[plants['orbital_period'].isnull()&plants['mass'].isnull()&plants['distance'].isnull()]找出['orbital_period']、['mass']、['distance']三列都为空的行数据:
4. 去除缺失值所在行数据
print('只保留所有列数据都不为NaN的行数据')plants.dropna()
5. 综合统计函数
plants.describe()
6. 获取所有行星公转周期的中位数
print('不同方法下所有行星公转周期(orbital_period)的中位数')print(plants.groupby('method'))print(plants.groupby('method')['orbital_period'])# groupby对象直到运行累计函数才开始计算print(plants.groupby('method')['orbital_period'].median())不同方法下所有行星公转周期(orbital_period)的中位数<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002115A3BBA60><pandas.core.groupby.generic.SeriesGroupBy object at 0x000002115A3BBEB0>methodAstrometry 631.180000Eclipse Timing Variations 4343.500000Imaging 27500.000000Microlensing 3300.000000Orbital Brightness Modulation 0.342887Pulsar Timing 66.541900Pulsation Timing Variations 1170.000000Radial Velocity 360.200000Transit 5.714932Transit Timing Variations 57.011000Name: orbital_period, dtype: float64
7. 分组使用综合统计函数
print('对每一组数据描述性统计')plants.groupby('method')['year'].describe()# 通过count可以看出,大部分行星是通过Radial Velocity和Transit方法发现的,并且后种方法近期比较常用对每一组数据描述性统计
8. 对所有列进行指定累计操作
aa = plants.groupby('method').aggregate([min, max, np.median])print(aa) number orbital_period \ min max median min max method Astrometry 1 1 1 246.360000 1016.000000 Eclipse Timing Variations 1 2 2 1916.250000 10220.000000 Imaging 1 4 1 4639.150000 730000.000000 Microlensing 1 2 1 1825.000000 5100.000000 Orbital Brightness Modulation 1 2 2 0.240104 1.544929 Pulsar Timing 1 3 3 0.090706 36525.000000 Pulsation Timing Variations 1 1 1 1170.000000 1170.000000 Radial Velocity 1 6 1 0.736540 17337.500000 Transit 1 7 1 0.355000 331.600590 Transit Timing Variations 2 3 2 22.339500 160.000000 mass distance \ median min max median min method Astrometry 631.180000 NaN NaN NaN 14.98 Eclipse Timing Variations 4343.500000 4.2000 6.05 5.125 130.72 Imaging 27500.000000 NaN NaN NaN 7.69 Microlensing 3300.000000 NaN NaN NaN 1760.00 Orbital Brightness Modulation 0.342887 NaN NaN NaN 1180.00 Pulsar Timing 66.541900 NaN NaN NaN 1200.00 Pulsation Timing Variations 1170.000000 NaN NaN NaN NaN Radial Velocity 360.200000 0.0036 25.00 1.260 1.35 Transit 5.714932 1.4700 1.47 1.470 38.00 Transit Timing Variations 57.011000 NaN NaN NaN 339.00 year max median min max median method Astrometry 20.77 17.875 2010 2013 2011.5 Eclipse Timing Variations 500.00 315.360 2008 2012 2010.0 Imaging 165.00 40.395 2004 2013 2009.0 Microlensing 7720.00 3840.000 2004 2013 2010.0 Orbital Brightness Modulation 1180.00 1180.000 2011 2013 2011.0 Pulsar Timing 1200.00 1200.000 1992 2011 1994.0 Pulsation Timing Variations NaN NaN 2007 2007 2007.0 Radial Velocity 354.00 40.445 1989 2014 2009.0 Transit 8500.00 341.000 2002 2014 2012.0 Transit Timing Variations 2119.00 855.000 2011 2014 2012.5
9. 对指定列进行指定操作
print('各方法每年发现行星最大的是多少;各方法在哪年最早有发现行星')bb = plants.groupby('method').aggregate({'number': max, 'year':min})print(bb)各方法每年发现行星最大的是多少;各方法在哪年最早有发现行星 number yearmethod Astrometry 1 2010Eclipse Timing Variations 2 2008Imaging 4 2004Microlensing 2 2004Orbital Brightness Modulation 2 2011Pulsar Timing 3 1992Pulsation Timing Variations 1 2007Radial Velocity 6 1989Transit 7 2002Transit Timing Variations 3 2011
bb = plants.groupby('method').aggregate({'number': min, 'year':min})print('各方法每年发现行星最少的是多少;各方法在哪年最早有发现行星')print(bb)各方法每年发现行星最少的是多少;各方法在哪年最早有发现行星 number yearmethod Astrometry 1 2010Eclipse Timing Variations 1 2008Imaging 1 2004Microlensing 1 2004Orbital Brightness Modulation 1 2011Pulsar Timing 1 1992Pulsation Timing Variations 1 2007Radial Velocity 1 1989Transit 1 2002Transit Timing Variations 2 2011
bb = plants.groupby('method').aggregate({'number': min, 'year':max})print('各方法每年发现行星最少的是多少;各方法在哪年最近有发现行星')print(bb)各方法每年发现行星最少的是多少;各方法在哪年最近有发现行星 number yearmethod Astrometry 1 2013Eclipse Timing Variations 1 2012Imaging 1 2013Microlensing 1 2013Orbital Brightness Modulation 1 2013Pulsar Timing 1 2011Pulsation Timing Variations 1 2007Radial Velocity 1 2014Transit 1 2014Transit Timing Variations 2 2014
bb = plants.groupby('method').aggregate({'number': max, 'year':max})print('各方法每年发现行星最大的是多少;各方法在哪年最近有发现行星')print(bb)各方法每年发现行星最大的是多少;各方法在哪年最近有发现行星 number yearmethod Astrometry 1 2013Eclipse Timing Variations 2 2012Imaging 4 2013Microlensing 2 2013Orbital Brightness Modulation 2 2013Pulsar Timing 3 2011Pulsation Timing Variations 1 2007Radial Velocity 6 2014Transit 7 2014Transit Timing Variations 3 2014
print('各个方法发现行星年份的中位数:')cc = plants.groupby('method')['year'].median()print(cc)各个方法发现行星年份的中位数:methodAstrometry 2011.5Eclipse Timing Variations 2010.0Imaging 2009.0Microlensing 2010.0Orbital Brightness Modulation 2011.0Pulsar Timing 1994.0Pulsation Timing Variations 2007.0Radial Velocity 2009.0Transit 2012.0Transit Timing Variations 2012.5Name: year, dtype: float64
10. 不同方法和年份发现的行星数量
dd = plants.pivot_table('number', index='method', columns='year', aggfunc='sum')print(dd)不同方法和年份发现的行星数量year 1989 1992 1994 1995 1996 1997 1998 1999 \method Astrometry NaN NaN NaN NaN NaN NaN NaN NaN Eclipse Timing Variations NaN NaN NaN NaN NaN NaN NaN NaN Imaging NaN NaN NaN NaN NaN NaN NaN NaN Microlensing NaN NaN NaN NaN NaN NaN NaN NaN Orbital Brightness Modulation NaN NaN NaN NaN NaN NaN NaN NaN Pulsar Timing NaN 6.0 3.0 NaN NaN NaN NaN NaN Pulsation Timing Variations NaN NaN NaN NaN NaN NaN NaN NaN Radial Velocity 1.0 NaN NaN 1.0 15.0 1.0 11.0 24.0 Transit NaN NaN NaN NaN NaN NaN NaN NaN Transit Timing Variations NaN NaN NaN NaN NaN NaN NaN NaN year 2000 2001 ... 2005 2006 2007 2008 2009 \method ... Astrometry NaN NaN ... NaN NaN NaN NaN NaN Eclipse Timing Variations NaN NaN ... NaN NaN NaN 4.0 1.0 Imaging NaN NaN ... 1.0 4.0 1.0 17.0 3.0 Microlensing NaN NaN ... 2.0 1.0 NaN 6.0 2.0 Orbital Brightness Modulation NaN NaN ... NaN NaN NaN NaN NaN Pulsar Timing NaN NaN ... NaN NaN NaN NaN NaN Pulsation Timing Variations NaN NaN ... NaN NaN 1.0 NaN NaN Radial Velocity 27.0 15.0 ... 61.0 33.0 47.0 76.0 105.0 Transit NaN NaN ... NaN 5.0 16.0 17.0 20.0 Transit Timing Variations NaN NaN ... NaN NaN NaN NaN NaN year 2010 2011 2012 2013 2014 method Astrometry 1.0 NaN NaN 1.0 NaN Eclipse Timing Variations 4.0 5.0 1.0 NaN NaN Imaging 9.0 3.0 2.0 7.0 NaN Microlensing 2.0 1.0 8.0 4.0 NaN Orbital Brightness Modulation NaN 4.0 NaN 1.0 NaN Pulsar Timing NaN 1.0 NaN NaN NaN Pulsation Timing Variations NaN NaN NaN NaN NaN Radial Velocity 92.0 176.0 70.0 65.0 21.0 Transit 85.0 162.0 175.0 197.0 93.0 Transit Timing Variations NaN 2.0 2.0 2.0 3.0 [10 rows x 23 columns]