pandas教程 - 07 缺失数据处理 - 《Python程序设计数字教程》

import pandas as pd
def csv_to_ls(file):
    """接收文件名为参数，读取文件中的数据，
    根据逗号切分数据为列表加到列表中"""
    with open(file, 'r', encoding='utf-8') as wine:
        wine_list = [line.strip().split(',') for line in wine]
    # print(wine_list)
    return wine_list
def country(wine_list):
    """将列表中序号为1的列的数据提取出来，以列表形式返回"""
    country_list = [x[1] for x in wine_list[1:]]
    return country_list
def points(wine_list):
    """将列表中序号为1的列的数据提取出来，以列表形式返回"""
    point_list = [x[4] for x in wine_list[1:]]
    print(point_list)
    return point_list
def avg_point(point_list):
    """将列表中的数据映射为整型，返回平均值"""
    point_list_int = map(int, point_list)
    return sum(point_list_int) / len(point_list_int)
def price(wine_list):
    price_list = [x[5] for x in wine_list[1:]]
    return price_list
if __name__ == '__main__':
    filename = './wine point.csv'
    wine = csv_to_ls(filename)
    country_ls = country(wine)
    point_ls = points(wine)
    price_ls = price(wine)
    print(avg_point(point_ls))

使用python方法将文件中的数据读入到列表后，统计时，可能会发现缺失数据会对处理方法造成很大的影响。列如，试图将值为整数或浮点数的列映射为数值类型时，会触发异常：

ValueError: invalid literal for int() with base 10: ''

这是因为空字符串无法用float()或int()函数转为浮点类型或整型。
用pd.read_csv()可以简单的将文件中的数据读入到dataframe中，再用df.values.tolist()可以方便的将数据转为列表。

def csv_to_df(file):
    wine_ls = pd.read_csv(file).values.tolist()
    print(wine_ls)
    return wine_ls

查看列表，会发现里面存在一些值为nan的项，这些便是缺失数据，pandas读取时自动用nan进行了填充：

[[0, 'Italy', 'Aromas ', 'Bianco', 87.0, nan, 'Sicily', 'Etna', nan, 'Kerin', '@kerinokeefe', 'Nicosia 2013', 'White', 'Nicosia'], 
 [1, 'Portugal', 'This.', 'Avidagos', 87.0, 15.0, 'Douro', nan, nan, 'Roger', '@vossroger', 'Quinta 2011', 'Portuguese', 'Quinta'], 
 [2, 'US', 'Tart', nan, nan, 14.0, 'Oregon', 'Willamette', 'Willamette', 'Paul', '@paulgwine', 'Rainstorm 2013', 'Pinot', 'Rainstorm'], 
 [3, 'US', 'Pineapple', 'Reserve', 87.0, nan, 'Michigan', 'Lake', nan, 'Alexander', nan, 'Julian 2013', 'Riesling', 'St. Julian'], 
 [4, nan, 'Much', 'Vintner', 87.0, 65.0, 'Oregon', 'Willamette', 'Willamette', 'Paul', '@paulgwine', 'Sweet 2012', 'Pinot', 'Sweet'], 
 [5, 'Spain', 'Blackberry', 'Ars', 87.0, 15.0, 'Northern', 'Navarra', nan, 'Michael', '@wineschach', 'Tandem 2011', 'Tempranillo', 'Tandem'], 
 [6, 'Italy', 'Here', 'Belsito', 87.0, 16.0, 'Sicily', 'Vittoria', nan, 'Kerin', '@kerinokeefe', 'Terre 2013', 'Frappato', 'Terre'], 
 [7, 'France', 'This', nan, 87.0, 24.0, 'Alsace', 'Alsace', nan, 'Roger', '@vossroger', 'Trimbach 2012', 'Gewürztraminer', 'Trimbach'], 
 [8, 'Germany', 'Savory', 'Shine', 87.0, 12.0, 'Rheinhessen', nan, nan, 'Anna', nan, 'Heinz 2013', 'Gewürztraminer', 'Heinz'], 
 [9, 'France', 'This', 'Les Natures', 87.0, 27.0, 'Alsace', 'Alsace', nan, 'Roger', '@vossroger', 'Jean 2012', 'Pinot', 'Jean-Baptiste'], 
 [10, 'US', 'Soft', 'Mountain', 87.0, 19.0, 'California', 'Napa', 'Napa', 'Virginie', '@vboone', 'Kirkland 2011', 'Cabernet', 'Kirkland']]

nan是Not a Number的意思，其数据类型并不是字符串，而是”float”,所以不能用是否为空字符串或是否为字符串”nan”来判定该数据是否为空值。
如果用Python的方法，可以借助于math库里的方法

if math.isnan(x) == False可用于判定x的值是不是nan

当然，pandas中提供了大量用于缺失值的处理方法：

def csv_to_df(file):
    pd.set_option('display.max_columns', None)  # 显示所有列
    pd.set_option('display.max_rows', None)  # 显示所有行
    pd.set_option('display.width', None)  # 显示宽度是无限
    wine_df = pd.read_csv(file)
    print(wine_df.isnull())        # 检测缺失值，返回布尔值,.notnull()，True表示数据缺失

    order  country  description  designation  points  price  province  region_1  region_2  taster_name  taster_twitter_handle  title  variety  winery
0   False    False        False        False   False   True     False     False      True        False                  False  False    False   False
1   False    False        False        False   False  False     False      True      True        False                  False  False    False   False
2   False    False        False         True    True  False     False     False     False        False                  False  False    False   False
3   False    False        False        False   False   True     False     False      True        False                   True  False    False   False
4   False     True        False        False   False  False     False     False     False        False                  False  False    False   False
5   False    False        False        False   False  False     False     False      True        False                  False  False    False   False
6   False    False        False        False   False  False     False     False      True        False                  False  False    False   False
7   False    False        False         True   False  False     False     False      True        False                  False  False    False   False
8   False    False        False        False   False  False     False      True      True        False                   True  False    False   False
9   False    False        False        False   False  False     False     False      True        False                  False  False    False   False
10  False    False        False        False   False  False     False     False     False        False                  False  False    False   False

def csv_to_df(file):
    pd.set_option('display.max_columns', None)  # 显示所有列
    pd.set_option('display.max_rows', None)  # 显示所有行
    pd.set_option('display.width', None)  # 显示宽度是无限
    wine_df = pd.read_csv(file)
    print(wine_df.price.isnull())  # 检测价格列的缺失值，True表示数据缺失

0      True
1     False
2     False
3      True
4     False
5     False
6     False
7     False
8     False
9     False
10    False
Name: price, dtype: bool

def csv_to_df(file):
    pd.set_option('display.max_columns', None)  # 显示所有列
    pd.set_option('display.max_rows', None)  # 显示所有行
    pd.set_option('display.width', None)  # 显示宽度是无限
    wine_df = pd.read_csv(file)
    print(wine_df.loc[wine_df['price'].notnull(),:])    # 返回价格没有缺失的数据

    order   country description  designation  points  price     province    region_1    region_2 taster_name taster_twitter_handle           title         variety         winery
1       1  Portugal       This.     Avidagos    87.0   15.0        Douro         NaN         NaN       Roger            @vossroger     Quinta 2011      Portuguese         Quinta
2       2        US        Tart          NaN     NaN   14.0       Oregon  Willamette  Willamette        Paul            @paulgwine  Rainstorm 2013           Pinot      Rainstorm
4       4       NaN        Much      Vintner    87.0   65.0       Oregon  Willamette  Willamette        Paul            @paulgwine      Sweet 2012           Pinot          Sweet
5       5     Spain  Blackberry          Ars    87.0   15.0     Northern     Navarra         NaN     Michael           @wineschach     Tandem 2011     Tempranillo         Tandem
6       6     Italy        Here      Belsito    87.0   16.0       Sicily    Vittoria         NaN       Kerin          @kerinokeefe      Terre 2013        Frappato          Terre
7       7    France        This          NaN    87.0   24.0       Alsace      Alsace         NaN       Roger            @vossroger   Trimbach 2012  Gewürztraminer       Trimbach
8       8   Germany      Savory        Shine    87.0   12.0  Rheinhessen         NaN         NaN        Anna                   NaN      Heinz 2013  Gewürztraminer          Heinz
9       9    France        This  Les Natures    87.0   27.0       Alsace      Alsace         NaN       Roger            @vossroger       Jean 2012           Pinot  Jean-Baptiste
10     10        US        Soft     Mountain    87.0   19.0   California        Napa        Napa    Virginie               @vboone   Kirkland 2011        Cabernet       Kirkland

def csv_to_df(file):
    pd.set_option('display.max_columns', None)  # 显示所有列
    pd.set_option('display.max_rows', None)  # 显示所有行
    pd.set_option('display.width', None)  # 显示宽度是无限
    wine_df = pd.read_csv(file)
    print(wine_df.isnull())        # 检测缺失值，返回布尔值,.notnull()
    print(wine_df.loc[wine_df['points'].notnull(), :])  # 返回评分没有缺失的数据

    order   country description  designation  points  price     province    region_1    region_2 taster_name taster_twitter_handle          title         variety         winery
0       0     Italy     Aromas        Bianco    87.0    NaN       Sicily        Etna         NaN       Kerin          @kerinokeefe   Nicosia 2013           White        Nicosia
1       1  Portugal       This.     Avidagos    87.0   15.0        Douro         NaN         NaN       Roger            @vossroger    Quinta 2011      Portuguese         Quinta
3       3        US   Pineapple      Reserve    87.0    NaN     Michigan        Lake         NaN   Alexander                   NaN    Julian 2013        Riesling     St. Julian
4       4       NaN        Much      Vintner    87.0   65.0       Oregon  Willamette  Willamette        Paul            @paulgwine     Sweet 2012           Pinot          Sweet
5       5     Spain  Blackberry          Ars    87.0   15.0     Northern     Navarra         NaN     Michael           @wineschach    Tandem 2011     Tempranillo         Tandem
6       6     Italy        Here      Belsito    87.0   16.0       Sicily    Vittoria         NaN       Kerin          @kerinokeefe     Terre 2013        Frappato          Terre
7       7    France        This          NaN    87.0   24.0       Alsace      Alsace         NaN       Roger            @vossroger  Trimbach 2012  Gewürztraminer       Trimbach
8       8   Germany      Savory        Shine    87.0   12.0  Rheinhessen         NaN         NaN        Anna                   NaN     Heinz 2013  Gewürztraminer          Heinz
9       9    France        This  Les Natures    87.0   27.0       Alsace      Alsace         NaN       Roger            @vossroger      Jean 2012           Pinot  Jean-Baptiste
10     10        US        Soft     Mountain    87.0   19.0   California        Napa        Napa    Virginie               @vboone  Kirkland 2011        Cabernet       Kirkland

def csv_to_df(file):
    pd.set_option('display.max_columns', None)  # 显示所有列
    pd.set_option('display.max_rows', None)     # 显示所有行
    pd.set_option('display.width', None)        # 显示宽度是无限
    wine_df = pd.read_csv(file)
    print(wine_df.country[4])        # nan
    print(type(wine_df.country[4]))  # <class 'float'>

对缺失值进行替换

def csv_to_df(file):
    pd.set_option('display.max_columns', None)  # 显示所有列
    pd.set_option('display.max_rows', None)  # 显示所有行
    pd.set_option('display.width', None)  # 显示宽度是无限
    wine_df = pd.read_csv(file)
    wine_df.fillna('', inplace=True)   # 对nan值进行替换，用空字符串进行填充，inplace : 如果为True则修改当前df，否则返回新的df
    print(wine_df)

    order   country description  designation points price     province    region_1    region_2 taster_name taster_twitter_handle           title         variety         winery
0       0     Italy     Aromas        Bianco     87             Sicily        Etna                   Kerin          @kerinokeefe    Nicosia 2013           White        Nicosia
1       1  Portugal       This.     Avidagos     87    15        Douro                               Roger            @vossroger     Quinta 2011      Portuguese         Quinta
2       2        US        Tart                        14       Oregon  Willamette  Willamette        Paul            @paulgwine  Rainstorm 2013           Pinot      Rainstorm
3       3        US   Pineapple      Reserve     87           Michigan        Lake               Alexander                           Julian 2013        Riesling     St. Julian
4       4                  Much      Vintner     87    65       Oregon  Willamette  Willamette        Paul            @paulgwine      Sweet 2012           Pinot          Sweet
5       5     Spain  Blackberry          Ars     87    15     Northern     Navarra                 Michael           @wineschach     Tandem 2011     Tempranillo         Tandem
6       6     Italy        Here      Belsito     87    16       Sicily    Vittoria                   Kerin          @kerinokeefe      Terre 2013        Frappato          Terre
7       7    France        This                  87    24       Alsace      Alsace                   Roger            @vossroger   Trimbach 2012  Gewürztraminer       Trimbach
8       8   Germany      Savory        Shine     87    12  Rheinhessen                                Anna                            Heinz 2013  Gewürztraminer          Heinz
9       9    France        This  Les Natures     87    27       Alsace      Alsace                   Roger            @vossroger       Jean 2012           Pinot  Jean-Baptiste
10     10        US        Soft     Mountain     87    19   California        Napa        Napa    Virginie               @vboone   Kirkland 2011        Cabernet       Kirkland