1. 数据的导入与预览
# 要求:计算美国各州的人口密度排名
import numpy as np
import pandas as pd
pop = pd.read_csv('state-population.csv')
# 当打开的文件没有第一行或者第一列的说明文字时,可以指定
# areas = pd.read_csv('state-areas.csv', names=['state', 'area (sq, mi)'])
areas = pd.read_csv('state-areas.csv')
abbrevs = pd.read_csv('state-abbrevs.csv')
print(pop.head())
print(areas.head())
print(abbrevs.head())
state/region ages year population
0 AL under18 2012 1117489.0
1 AL total 2012 4817528.0
2 AL under18 2010 1130966.0
3 AL total 2010 4785570.0
4 AL under18 2011 1125763.0
state area (sq. mi)
0 Alabama 52423
1 Alaska 656425
2 Arizona 114006
3 Arkansas 53182
4 California 163707
state abbreviation
0 Alabama AL
1 Alaska AK
2 Arizona AZ
3 Arkansas AR
4 California CA
2. 连接pop与abbrevs表(丢弃重复信息)
merged = pd.merge(pop, abbrevs, left_on='state/region', right_on='abbreviation', how='outer').drop('abbreviation', axis='columns')
print(merged.head())
state/region ages year population state
0 AL under18 2012 1117489.0 Alabama
1 AL total 2012 4817528.0 Alabama
2 AL under18 2010 1130966.0 Alabama
3 AL total 2010 4785570.0 Alabama
4 AL under18 2011 1125763.0 Alabama
3. 查找缺失值
print(merged.isnull())
print(merged.isnull().any()) # 有缺失值的那一列会显示True,没有显示False
查找哪些列有缺失值:
state/region ages year population state
0 False False False False False
1 False False False False False
2 False False False False False
3 False False False False False
4 False False False False False
... ... ... ... ... ...
2539 False False False False True
2540 False False False False True
2541 False False False False True
2542 False False False False True
2543 False False False False True
[2544 rows x 5 columns]
state/region False
ages False
year False
population True
state True
dtype: bool
4. 查看有缺失值的数据行
通过第3节已经发现了population列存在缺失值,可以通过输出population列有缺失值得所有行数据。
print(merged[merged['population'].isnull()])
state/region ages year population state
2448 PR under18 1990 NaN NaN
2449 PR total 1990 NaN NaN
2450 PR total 1991 NaN NaN
2451 PR under18 1991 NaN NaN
2452 PR total 1993 NaN NaN
2453 PR under18 1993 NaN NaN
2454 PR under18 1992 NaN NaN
2455 PR total 1992 NaN NaN
2456 PR under18 1994 NaN NaN
2457 PR total 1994 NaN NaN
2458 PR total 1995 NaN NaN
2459 PR under18 1995 NaN NaN
2460 PR under18 1996 NaN NaN
2461 PR total 1996 NaN NaN
2462 PR under18 1998 NaN NaN
2463 PR total 1998 NaN NaN
2464 PR total 1997 NaN NaN
2465 PR under18 1997 NaN NaN
2466 PR total 1999 NaN NaN
2467 PR under18 1999 NaN NaN
5. 查看哪些洲缺少全程
print(merged.loc[merged['state'].isnull(), 'state/region'].unique())
['PR' 'USA']
6. 填充洲的缺失值
merged.loc[merged['state/region'] == 'PR', 'state'] = 'Puerto Rico'
merged.loc[merged['state/region'] == 'USA', 'state'] = 'United States of America'
print(merged.isnull().any())
state/region False
ages False
year False
population True
state False
dtype: bool
7. 连接merged和areas表
final = pd.merge(merged, areas, how='outer')
print(final.head())
state/region ages year population state area (sq. mi)
0 AL under18 2012 1117489.0 Alabama 52423.0
1 AL total 2012 4817528.0 Alabama 52423.0
2 AL under18 2010 1130966.0 Alabama 52423.0
3 AL total 2010 4785570.0 Alabama 52423.0
4 AL under18 2011 1125763.0 Alabama 52423.0
8. 查找有缺失值列
print(final.isnull().any())
state/region False
ages False
year False
population True
state False
area (sq. mi) True
dtype: bool
查看哪些洲缺少面积,因为输出了美国,不是一个洲,所以忽略
print(final.loc[final['area (sq. mi)'].isnull(), 'state'].unique())
['United States of America']
9. 删除缺失值行
在原始的final上进行删除缺失值操作,使用dropna()函数时:
# inplace=True:不创建新的对象,直接对原始对象进行修改;
# inplace=False:对数据进行修改,创建并返回新的对象承载其修改结果,需要重新赋值才可以。
print(final.dropna(inplace=True))
10. 统计2010年,ages=’total’的每行数据
data2010 = final.query('year==2010 & ages=="total"') # query是过滤函数
print(data2010.head())
state/region ages year population state area (sq. mi)
3 AL total 2010 4785570.0 Alabama 52423.0
91 AK total 2010 713868.0 Alaska 656425.0
101 AZ total 2010 6408790.0 Arizona 114006.0
189 AR total 2010 2922280.0 Arkansas 53182.0
197 CA total 2010 37333601.0 California 163707.0
11. 把数据某列内容设置索引列
print('\nset_index将列设置为索引列,inplace=True表示在原对象上进行操作')
data2010.set_index('state', inplace=True)
print(data2010.head())
set_index将列设置为索引列,inplace=True表示在原对象上进行操作
state/region ages year population area (sq. mi)
state
Alabama AL total 2010 4785570.0 52423.0
Alaska AK total 2010 713868.0 656425.0
Arizona AZ total 2010 6408790.0 114006.0
Arkansas AR total 2010 2922280.0 53182.0
California CA total 2010 37333601.0 163707.0
12. 排序
density = data2010['population']/data2010['area (sq. mi)']
density.sort_values(ascending=False, inplace=True) # ascending=False排序按照从大到小排,如过等于True表示从小到大
print(density.head())
state
District of Columbia 8898.897059
Puerto Rico 1058.665149
New Jersey 1009.253268
Rhode Island 681.339159
Connecticut 645.600649
dtype: float64
13. 获取最后五行数据
print(density.tail())
state
South Dakota 10.583512
North Dakota 9.537565
Montana 6.736171
Wyoming 5.768079
Alaska 1.087509
dtype: float64