import pandas as pd
def csv_to_ls(file):
"""接收文件名为参数,读取文件中的数据,
根据逗号切分数据为列表加到列表中"""
with open(file, 'r', encoding='utf-8') as wine:
wine_list = [line.strip().split(',') for line in wine]
# print(wine_list)
return wine_list
def country(wine_list):
"""将列表中序号为1的列的数据提取出来,以列表形式返回"""
country_list = [x[1] for x in wine_list[1:]]
return country_list
def points(wine_list):
"""将列表中序号为1的列的数据提取出来,以列表形式返回"""
point_list = [x[4] for x in wine_list[1:]]
print(point_list)
return point_list
def avg_point(point_list):
"""将列表中的数据映射为整型,返回平均值"""
point_list_int = map(int, point_list)
return sum(point_list_int) / len(point_list_int)
def price(wine_list):
price_list = [x[5] for x in wine_list[1:]]
return price_list
if __name__ == '__main__':
filename = './wine point.csv'
wine = csv_to_ls(filename)
country_ls = country(wine)
point_ls = points(wine)
price_ls = price(wine)
print(avg_point(point_ls))
使用python方法将文件中的数据读入到列表后,统计时,可能会发现缺失数据会对处理方法造成很大的影响。列如,试图将值为整数或浮点数的列映射为数值类型时,会触发异常:
ValueError: invalid literal for int() with base 10: ''
这是因为空字符串无法用float()或int()函数转为浮点类型或整型。
用pd.read_csv()可以简单的将文件中的数据读入到dataframe中,再用df.values.tolist()可以方便的将数据转为列表。
def csv_to_df(file):
wine_ls = pd.read_csv(file).values.tolist()
print(wine_ls)
return wine_ls
查看列表,会发现里面存在一些值为nan的项,这些便是缺失数据,pandas读取时自动用nan进行了填充:
[[0, 'Italy', 'Aromas ', 'Bianco', 87.0, nan, 'Sicily', 'Etna', nan, 'Kerin', '@kerinokeefe', 'Nicosia 2013', 'White', 'Nicosia'],
[1, 'Portugal', 'This.', 'Avidagos', 87.0, 15.0, 'Douro', nan, nan, 'Roger', '@vossroger', 'Quinta 2011', 'Portuguese', 'Quinta'],
[2, 'US', 'Tart', nan, nan, 14.0, 'Oregon', 'Willamette', 'Willamette', 'Paul', '@paulgwine', 'Rainstorm 2013', 'Pinot', 'Rainstorm'],
[3, 'US', 'Pineapple', 'Reserve', 87.0, nan, 'Michigan', 'Lake', nan, 'Alexander', nan, 'Julian 2013', 'Riesling', 'St. Julian'],
[4, nan, 'Much', 'Vintner', 87.0, 65.0, 'Oregon', 'Willamette', 'Willamette', 'Paul', '@paulgwine', 'Sweet 2012', 'Pinot', 'Sweet'],
[5, 'Spain', 'Blackberry', 'Ars', 87.0, 15.0, 'Northern', 'Navarra', nan, 'Michael', '@wineschach', 'Tandem 2011', 'Tempranillo', 'Tandem'],
[6, 'Italy', 'Here', 'Belsito', 87.0, 16.0, 'Sicily', 'Vittoria', nan, 'Kerin', '@kerinokeefe', 'Terre 2013', 'Frappato', 'Terre'],
[7, 'France', 'This', nan, 87.0, 24.0, 'Alsace', 'Alsace', nan, 'Roger', '@vossroger', 'Trimbach 2012', 'Gewürztraminer', 'Trimbach'],
[8, 'Germany', 'Savory', 'Shine', 87.0, 12.0, 'Rheinhessen', nan, nan, 'Anna', nan, 'Heinz 2013', 'Gewürztraminer', 'Heinz'],
[9, 'France', 'This', 'Les Natures', 87.0, 27.0, 'Alsace', 'Alsace', nan, 'Roger', '@vossroger', 'Jean 2012', 'Pinot', 'Jean-Baptiste'],
[10, 'US', 'Soft', 'Mountain', 87.0, 19.0, 'California', 'Napa', 'Napa', 'Virginie', '@vboone', 'Kirkland 2011', 'Cabernet', 'Kirkland']]
nan是Not a Number的意思,其数据类型并不是字符串,而是”float”,所以不能用是否为空字符串或是否为字符串”nan”来判定该数据是否为空值。
如果用Python的方法,可以借助于math库里的方法
if math.isnan(x) == False可用于判定x的值是不是nan
当然,pandas中提供了大量用于缺失值的处理方法:
def csv_to_df(file):
pd.set_option('display.max_columns', None) # 显示所有列
pd.set_option('display.max_rows', None) # 显示所有行
pd.set_option('display.width', None) # 显示宽度是无限
wine_df = pd.read_csv(file)
print(wine_df.isnull()) # 检测缺失值,返回布尔值,.notnull(),True表示数据缺失
order country description designation points price province region_1 region_2 taster_name taster_twitter_handle title variety winery
0 False False False False False True False False True False False False False False
1 False False False False False False False True True False False False False False
2 False False False True True False False False False False False False False False
3 False False False False False True False False True False True False False False
4 False True False False False False False False False False False False False False
5 False False False False False False False False True False False False False False
6 False False False False False False False False True False False False False False
7 False False False True False False False False True False False False False False
8 False False False False False False False True True False True False False False
9 False False False False False False False False True False False False False False
10 False False False False False False False False False False False False False False
def csv_to_df(file):
pd.set_option('display.max_columns', None) # 显示所有列
pd.set_option('display.max_rows', None) # 显示所有行
pd.set_option('display.width', None) # 显示宽度是无限
wine_df = pd.read_csv(file)
print(wine_df.price.isnull()) # 检测价格列的缺失值,True表示数据缺失
0 True
1 False
2 False
3 True
4 False
5 False
6 False
7 False
8 False
9 False
10 False
Name: price, dtype: bool
def csv_to_df(file):
pd.set_option('display.max_columns', None) # 显示所有列
pd.set_option('display.max_rows', None) # 显示所有行
pd.set_option('display.width', None) # 显示宽度是无限
wine_df = pd.read_csv(file)
print(wine_df.loc[wine_df['price'].notnull(),:]) # 返回价格没有缺失的数据
order country description designation points price province region_1 region_2 taster_name taster_twitter_handle title variety winery
1 1 Portugal This. Avidagos 87.0 15.0 Douro NaN NaN Roger @vossroger Quinta 2011 Portuguese Quinta
2 2 US Tart NaN NaN 14.0 Oregon Willamette Willamette Paul @paulgwine Rainstorm 2013 Pinot Rainstorm
4 4 NaN Much Vintner 87.0 65.0 Oregon Willamette Willamette Paul @paulgwine Sweet 2012 Pinot Sweet
5 5 Spain Blackberry Ars 87.0 15.0 Northern Navarra NaN Michael @wineschach Tandem 2011 Tempranillo Tandem
6 6 Italy Here Belsito 87.0 16.0 Sicily Vittoria NaN Kerin @kerinokeefe Terre 2013 Frappato Terre
7 7 France This NaN 87.0 24.0 Alsace Alsace NaN Roger @vossroger Trimbach 2012 Gewürztraminer Trimbach
8 8 Germany Savory Shine 87.0 12.0 Rheinhessen NaN NaN Anna NaN Heinz 2013 Gewürztraminer Heinz
9 9 France This Les Natures 87.0 27.0 Alsace Alsace NaN Roger @vossroger Jean 2012 Pinot Jean-Baptiste
10 10 US Soft Mountain 87.0 19.0 California Napa Napa Virginie @vboone Kirkland 2011 Cabernet Kirkland
def csv_to_df(file):
pd.set_option('display.max_columns', None) # 显示所有列
pd.set_option('display.max_rows', None) # 显示所有行
pd.set_option('display.width', None) # 显示宽度是无限
wine_df = pd.read_csv(file)
print(wine_df.isnull()) # 检测缺失值,返回布尔值,.notnull()
print(wine_df.loc[wine_df['points'].notnull(), :]) # 返回评分没有缺失的数据
order country description designation points price province region_1 region_2 taster_name taster_twitter_handle title variety winery
0 0 Italy Aromas Bianco 87.0 NaN Sicily Etna NaN Kerin @kerinokeefe Nicosia 2013 White Nicosia
1 1 Portugal This. Avidagos 87.0 15.0 Douro NaN NaN Roger @vossroger Quinta 2011 Portuguese Quinta
3 3 US Pineapple Reserve 87.0 NaN Michigan Lake NaN Alexander NaN Julian 2013 Riesling St. Julian
4 4 NaN Much Vintner 87.0 65.0 Oregon Willamette Willamette Paul @paulgwine Sweet 2012 Pinot Sweet
5 5 Spain Blackberry Ars 87.0 15.0 Northern Navarra NaN Michael @wineschach Tandem 2011 Tempranillo Tandem
6 6 Italy Here Belsito 87.0 16.0 Sicily Vittoria NaN Kerin @kerinokeefe Terre 2013 Frappato Terre
7 7 France This NaN 87.0 24.0 Alsace Alsace NaN Roger @vossroger Trimbach 2012 Gewürztraminer Trimbach
8 8 Germany Savory Shine 87.0 12.0 Rheinhessen NaN NaN Anna NaN Heinz 2013 Gewürztraminer Heinz
9 9 France This Les Natures 87.0 27.0 Alsace Alsace NaN Roger @vossroger Jean 2012 Pinot Jean-Baptiste
10 10 US Soft Mountain 87.0 19.0 California Napa Napa Virginie @vboone Kirkland 2011 Cabernet Kirkland
def csv_to_df(file):
pd.set_option('display.max_columns', None) # 显示所有列
pd.set_option('display.max_rows', None) # 显示所有行
pd.set_option('display.width', None) # 显示宽度是无限
wine_df = pd.read_csv(file)
print(wine_df.country[4]) # nan
print(type(wine_df.country[4])) # <class 'float'>
对缺失值进行替换
def csv_to_df(file):
pd.set_option('display.max_columns', None) # 显示所有列
pd.set_option('display.max_rows', None) # 显示所有行
pd.set_option('display.width', None) # 显示宽度是无限
wine_df = pd.read_csv(file)
wine_df.fillna('', inplace=True) # 对nan值进行替换,用空字符串进行填充,inplace : 如果为True则修改当前df,否则返回新的df
print(wine_df)
order country description designation points price province region_1 region_2 taster_name taster_twitter_handle title variety winery
0 0 Italy Aromas Bianco 87 Sicily Etna Kerin @kerinokeefe Nicosia 2013 White Nicosia
1 1 Portugal This. Avidagos 87 15 Douro Roger @vossroger Quinta 2011 Portuguese Quinta
2 2 US Tart 14 Oregon Willamette Willamette Paul @paulgwine Rainstorm 2013 Pinot Rainstorm
3 3 US Pineapple Reserve 87 Michigan Lake Alexander Julian 2013 Riesling St. Julian
4 4 Much Vintner 87 65 Oregon Willamette Willamette Paul @paulgwine Sweet 2012 Pinot Sweet
5 5 Spain Blackberry Ars 87 15 Northern Navarra Michael @wineschach Tandem 2011 Tempranillo Tandem
6 6 Italy Here Belsito 87 16 Sicily Vittoria Kerin @kerinokeefe Terre 2013 Frappato Terre
7 7 France This 87 24 Alsace Alsace Roger @vossroger Trimbach 2012 Gewürztraminer Trimbach
8 8 Germany Savory Shine 87 12 Rheinhessen Anna Heinz 2013 Gewürztraminer Heinz
9 9 France This Les Natures 87 27 Alsace Alsace Roger @vossroger Jean 2012 Pinot Jean-Baptiste
10 10 US Soft Mountain 87 19 California Napa Napa Virginie @vboone Kirkland 2011 Cabernet Kirkland