1. 数据的导入与预览
# 要求:计算美国各州的人口密度排名import numpy as npimport pandas as pdpop = pd.read_csv('state-population.csv')# 当打开的文件没有第一行或者第一列的说明文字时,可以指定# areas = pd.read_csv('state-areas.csv', names=['state', 'area (sq, mi)'])areas = pd.read_csv('state-areas.csv')abbrevs = pd.read_csv('state-abbrevs.csv')print(pop.head())print(areas.head())print(abbrevs.head())state/region ages year population0 AL under18 2012 1117489.01 AL total 2012 4817528.02 AL under18 2010 1130966.03 AL total 2010 4785570.04 AL under18 2011 1125763.0state area (sq. mi)0 Alabama 524231 Alaska 6564252 Arizona 1140063 Arkansas 531824 California 163707state abbreviation0 Alabama AL1 Alaska AK2 Arizona AZ3 Arkansas AR4 California CA
2. 连接pop与abbrevs表(丢弃重复信息)
merged = pd.merge(pop, abbrevs, left_on='state/region', right_on='abbreviation', how='outer').drop('abbreviation', axis='columns')print(merged.head())state/region ages year population state0 AL under18 2012 1117489.0 Alabama1 AL total 2012 4817528.0 Alabama2 AL under18 2010 1130966.0 Alabama3 AL total 2010 4785570.0 Alabama4 AL under18 2011 1125763.0 Alabama
3. 查找缺失值
print(merged.isnull())print(merged.isnull().any()) # 有缺失值的那一列会显示True,没有显示False查找哪些列有缺失值:state/region ages year population state0 False False False False False1 False False False False False2 False False False False False3 False False False False False4 False False False False False... ... ... ... ... ...2539 False False False False True2540 False False False False True2541 False False False False True2542 False False False False True2543 False False False False True[2544 rows x 5 columns]state/region Falseages Falseyear Falsepopulation Truestate Truedtype: bool
4. 查看有缺失值的数据行
通过第3节已经发现了population列存在缺失值,可以通过输出population列有缺失值得所有行数据。
print(merged[merged['population'].isnull()])state/region ages year population state2448 PR under18 1990 NaN NaN2449 PR total 1990 NaN NaN2450 PR total 1991 NaN NaN2451 PR under18 1991 NaN NaN2452 PR total 1993 NaN NaN2453 PR under18 1993 NaN NaN2454 PR under18 1992 NaN NaN2455 PR total 1992 NaN NaN2456 PR under18 1994 NaN NaN2457 PR total 1994 NaN NaN2458 PR total 1995 NaN NaN2459 PR under18 1995 NaN NaN2460 PR under18 1996 NaN NaN2461 PR total 1996 NaN NaN2462 PR under18 1998 NaN NaN2463 PR total 1998 NaN NaN2464 PR total 1997 NaN NaN2465 PR under18 1997 NaN NaN2466 PR total 1999 NaN NaN2467 PR under18 1999 NaN NaN
5. 查看哪些洲缺少全程
print(merged.loc[merged['state'].isnull(), 'state/region'].unique())['PR' 'USA']
6. 填充洲的缺失值
merged.loc[merged['state/region'] == 'PR', 'state'] = 'Puerto Rico'merged.loc[merged['state/region'] == 'USA', 'state'] = 'United States of America'print(merged.isnull().any())state/region Falseages Falseyear Falsepopulation Truestate Falsedtype: bool
7. 连接merged和areas表
final = pd.merge(merged, areas, how='outer')print(final.head())state/region ages year population state area (sq. mi)0 AL under18 2012 1117489.0 Alabama 52423.01 AL total 2012 4817528.0 Alabama 52423.02 AL under18 2010 1130966.0 Alabama 52423.03 AL total 2010 4785570.0 Alabama 52423.04 AL under18 2011 1125763.0 Alabama 52423.0
8. 查找有缺失值列
print(final.isnull().any())state/region Falseages Falseyear Falsepopulation Truestate Falsearea (sq. mi) Truedtype: bool
查看哪些洲缺少面积,因为输出了美国,不是一个洲,所以忽略
print(final.loc[final['area (sq. mi)'].isnull(), 'state'].unique())['United States of America']
9. 删除缺失值行
在原始的final上进行删除缺失值操作,使用dropna()函数时:
# inplace=True:不创建新的对象,直接对原始对象进行修改;
# inplace=False:对数据进行修改,创建并返回新的对象承载其修改结果,需要重新赋值才可以。
print(final.dropna(inplace=True))
10. 统计2010年,ages=’total’的每行数据
data2010 = final.query('year==2010 & ages=="total"') # query是过滤函数print(data2010.head())state/region ages year population state area (sq. mi)3 AL total 2010 4785570.0 Alabama 52423.091 AK total 2010 713868.0 Alaska 656425.0101 AZ total 2010 6408790.0 Arizona 114006.0189 AR total 2010 2922280.0 Arkansas 53182.0197 CA total 2010 37333601.0 California 163707.0
11. 把数据某列内容设置索引列
print('\nset_index将列设置为索引列,inplace=True表示在原对象上进行操作')data2010.set_index('state', inplace=True)print(data2010.head())set_index将列设置为索引列,inplace=True表示在原对象上进行操作state/region ages year population area (sq. mi)stateAlabama AL total 2010 4785570.0 52423.0Alaska AK total 2010 713868.0 656425.0Arizona AZ total 2010 6408790.0 114006.0Arkansas AR total 2010 2922280.0 53182.0California CA total 2010 37333601.0 163707.0
12. 排序
density = data2010['population']/data2010['area (sq. mi)']density.sort_values(ascending=False, inplace=True) # ascending=False排序按照从大到小排,如过等于True表示从小到大print(density.head())stateDistrict of Columbia 8898.897059Puerto Rico 1058.665149New Jersey 1009.253268Rhode Island 681.339159Connecticut 645.600649dtype: float64
13. 获取最后五行数据
print(density.tail())stateSouth Dakota 10.583512North Dakota 9.537565Montana 6.736171Wyoming 5.768079Alaska 1.087509dtype: float64
