1. 数据的导入与预览

  1. # 要求:计算美国各州的人口密度排名
  2. import numpy as np
  3. import pandas as pd
  4. pop = pd.read_csv('state-population.csv')
  5. # 当打开的文件没有第一行或者第一列的说明文字时,可以指定
  6. # areas = pd.read_csv('state-areas.csv', names=['state', 'area (sq, mi)'])
  7. areas = pd.read_csv('state-areas.csv')
  8. abbrevs = pd.read_csv('state-abbrevs.csv')
  9. print(pop.head())
  10. print(areas.head())
  11. print(abbrevs.head())
  12. state/region ages year population
  13. 0 AL under18 2012 1117489.0
  14. 1 AL total 2012 4817528.0
  15. 2 AL under18 2010 1130966.0
  16. 3 AL total 2010 4785570.0
  17. 4 AL under18 2011 1125763.0
  18. state area (sq. mi)
  19. 0 Alabama 52423
  20. 1 Alaska 656425
  21. 2 Arizona 114006
  22. 3 Arkansas 53182
  23. 4 California 163707
  24. state abbreviation
  25. 0 Alabama AL
  26. 1 Alaska AK
  27. 2 Arizona AZ
  28. 3 Arkansas AR
  29. 4 California CA

2. 连接pop与abbrevs表(丢弃重复信息)

  1. merged = pd.merge(pop, abbrevs, left_on='state/region', right_on='abbreviation', how='outer').drop('abbreviation', axis='columns')
  2. print(merged.head())
  3. state/region ages year population state
  4. 0 AL under18 2012 1117489.0 Alabama
  5. 1 AL total 2012 4817528.0 Alabama
  6. 2 AL under18 2010 1130966.0 Alabama
  7. 3 AL total 2010 4785570.0 Alabama
  8. 4 AL under18 2011 1125763.0 Alabama

3. 查找缺失值

  1. print(merged.isnull())
  2. print(merged.isnull().any()) # 有缺失值的那一列会显示True,没有显示False
  3. 查找哪些列有缺失值:
  4. state/region ages year population state
  5. 0 False False False False False
  6. 1 False False False False False
  7. 2 False False False False False
  8. 3 False False False False False
  9. 4 False False False False False
  10. ... ... ... ... ... ...
  11. 2539 False False False False True
  12. 2540 False False False False True
  13. 2541 False False False False True
  14. 2542 False False False False True
  15. 2543 False False False False True
  16. [2544 rows x 5 columns]
  17. state/region False
  18. ages False
  19. year False
  20. population True
  21. state True
  22. dtype: bool

4. 查看有缺失值的数据行

通过第3节已经发现了population列存在缺失值,可以通过输出population列有缺失值得所有行数据。

  1. print(merged[merged['population'].isnull()])
  2. state/region ages year population state
  3. 2448 PR under18 1990 NaN NaN
  4. 2449 PR total 1990 NaN NaN
  5. 2450 PR total 1991 NaN NaN
  6. 2451 PR under18 1991 NaN NaN
  7. 2452 PR total 1993 NaN NaN
  8. 2453 PR under18 1993 NaN NaN
  9. 2454 PR under18 1992 NaN NaN
  10. 2455 PR total 1992 NaN NaN
  11. 2456 PR under18 1994 NaN NaN
  12. 2457 PR total 1994 NaN NaN
  13. 2458 PR total 1995 NaN NaN
  14. 2459 PR under18 1995 NaN NaN
  15. 2460 PR under18 1996 NaN NaN
  16. 2461 PR total 1996 NaN NaN
  17. 2462 PR under18 1998 NaN NaN
  18. 2463 PR total 1998 NaN NaN
  19. 2464 PR total 1997 NaN NaN
  20. 2465 PR under18 1997 NaN NaN
  21. 2466 PR total 1999 NaN NaN
  22. 2467 PR under18 1999 NaN NaN

5. 查看哪些洲缺少全程

  1. print(merged.loc[merged['state'].isnull(), 'state/region'].unique())
  2. ['PR' 'USA']

6. 填充洲的缺失值

  1. merged.loc[merged['state/region'] == 'PR', 'state'] = 'Puerto Rico'
  2. merged.loc[merged['state/region'] == 'USA', 'state'] = 'United States of America'
  3. print(merged.isnull().any())
  4. state/region False
  5. ages False
  6. year False
  7. population True
  8. state False
  9. dtype: bool

7. 连接merged和areas表

  1. final = pd.merge(merged, areas, how='outer')
  2. print(final.head())
  3. state/region ages year population state area (sq. mi)
  4. 0 AL under18 2012 1117489.0 Alabama 52423.0
  5. 1 AL total 2012 4817528.0 Alabama 52423.0
  6. 2 AL under18 2010 1130966.0 Alabama 52423.0
  7. 3 AL total 2010 4785570.0 Alabama 52423.0
  8. 4 AL under18 2011 1125763.0 Alabama 52423.0

8. 查找有缺失值列

  1. print(final.isnull().any())
  2. state/region False
  3. ages False
  4. year False
  5. population True
  6. state False
  7. area (sq. mi) True
  8. dtype: bool

查看哪些洲缺少面积,因为输出了美国,不是一个洲,所以忽略

  1. print(final.loc[final['area (sq. mi)'].isnull(), 'state'].unique())
  2. ['United States of America']

9. 删除缺失值行

在原始的final上进行删除缺失值操作,使用dropna()函数时:
# inplace=True:不创建新的对象,直接对原始对象进行修改;
# inplace=False:对数据进行修改,创建并返回新的对象承载其修改结果,需要重新赋值才可以。

  1. print(final.dropna(inplace=True))

10. 统计2010年,ages=’total’的每行数据

  1. data2010 = final.query('year==2010 & ages=="total"') # query是过滤函数
  2. print(data2010.head())
  3. state/region ages year population state area (sq. mi)
  4. 3 AL total 2010 4785570.0 Alabama 52423.0
  5. 91 AK total 2010 713868.0 Alaska 656425.0
  6. 101 AZ total 2010 6408790.0 Arizona 114006.0
  7. 189 AR total 2010 2922280.0 Arkansas 53182.0
  8. 197 CA total 2010 37333601.0 California 163707.0

11. 把数据某列内容设置索引列

  1. print('\nset_index将列设置为索引列,inplace=True表示在原对象上进行操作')
  2. data2010.set_index('state', inplace=True)
  3. print(data2010.head())
  4. set_index将列设置为索引列,inplace=True表示在原对象上进行操作
  5. state/region ages year population area (sq. mi)
  6. state
  7. Alabama AL total 2010 4785570.0 52423.0
  8. Alaska AK total 2010 713868.0 656425.0
  9. Arizona AZ total 2010 6408790.0 114006.0
  10. Arkansas AR total 2010 2922280.0 53182.0
  11. California CA total 2010 37333601.0 163707.0

12. 排序

  1. density = data2010['population']/data2010['area (sq. mi)']
  2. density.sort_values(ascending=False, inplace=True) # ascending=False排序按照从大到小排,如过等于True表示从小到大
  3. print(density.head())
  4. state
  5. District of Columbia 8898.897059
  6. Puerto Rico 1058.665149
  7. New Jersey 1009.253268
  8. Rhode Island 681.339159
  9. Connecticut 645.600649
  10. dtype: float64

13. 获取最后五行数据

  1. print(density.tail())
  2. state
  3. South Dakota 10.583512
  4. North Dakota 9.537565
  5. Montana 6.736171
  6. Wyoming 5.768079
  7. Alaska 1.087509
  8. dtype: float64