1. %%html
  2. <style>
  3. @import url('https://fonts.googleapis.com/css?family=Ewert|Roboto&effect=3d|ice|');
  4. body {background-color: gainsboro;}
  5. a {color: #37c9e1; font-family: 'Roboto';}
  6. h1 {color: #37c9e1; font-family: 'Orbitron'; text-shadow: 4px 4px 4px #aaa;}
  7. h2, h3 {color: slategray; font-family: 'Orbitron'; text-shadow: 4px 4px 4px #aaa;}
  8. h4 {color: #818286; font-family: 'Roboto';}
  9. span {font-family:'Roboto'; color:black; text-shadow: 5px 5px 5px #aaa;}
  10. div.output_area pre{font-family:'Roboto'; font-size:110%; color:lightblue;}
  11. </style>

🌑 Setup and Read Dataset

  1. import pandas as pd
  1. dataPath = '../input/zoo-animal-classification/'
  2. animalClass = pd.read_csv(dataPath+'class.csv')
  3. zoo = pd.read_csv(dataPath+'zoo.csv')

🌒 Data Structure

Every single row and every single column of a `DataFrame` is a `Series`.

  1. # 1D(base) --> Series
  2. row = zoo.iloc[0]
  3. col = zoo.eggs
  4. print(type(row))
  5. print(type(col))
  6. # 2D(base) --> DataFrame
  7. print(type(zoo))
  1. <class 'pandas.core.series.Series'>
  2. <class 'pandas.core.series.Series'>
  3. <class 'pandas.core.frame.DataFrame'>

🌒 Print

  1. # zoo.head() # 默认打印前五行
  2. zoo.head(4)
  3. # del zoo['hair'] # 删除 zoo 中 hair 这一列
  4. # zoo.head()
animal_name hair feathers eggs milk airborne aquatic predator toothed backbone breathes venomous fins legs tail domestic catsize class_type
0 aardvark 1 0 0 1 0 0 1 1 1 1 0 0 4 0 0 1 1
1 antelope 1 0 0 1 0 0 0 1 1 1 0 0 4 1 0 1 1
2 bass 0 0 1 0 0 1 1 1 1 0 0 1 0 1 0 0 4
3 bear 1 0 0 1 0 0 1 1 1 1 0 0 4 0 0 1 1
  1. animalClass.head()
Class_Number Number_Of_Animal_Species_In_Class Class_Type Animal_Names
0 1 41 Mammal aardvark, antelope, bear, boar, buffalo, calf,…
1 2 20 Bird chicken, crow, dove, duck, flamingo, gull, haw…
2 3 5 Reptile pitviper, seasnake, slowworm, tortoise, tuatara
3 4 13 Fish bass, carp, catfish, chub, dogfish, haddock, h…
4 5 4 Amphibian frog, frog, newt, toad

🌓 Rows Indexing

  1. # row index location (positional indexing) --> iloc
  2. row = zoo.iloc[0]
  3. # label based indexing --> loc
  4. # row = zoo.loc['bass'] # 相当于用自己命名的 label 替换自带的 index
  5. print(row)
  1. animal_name aardvark
  2. hair 1
  3. feathers 0
  4. eggs 0
  5. milk 1
  6. airborne 0
  7. aquatic 0
  8. predator 1
  9. toothed 1
  10. backbone 1
  11. breathes 1
  12. venomous 0
  13. fins 0
  14. legs 4
  15. tail 0
  16. domestic 0
  17. catsize 1
  18. class_type 1
  19. Name: 0, dtype: object

🌔 Column Indexing

  1. # according ['column name']
  2. col = zoo['eggs']
  3. # according '.'
  4. col = zoo.eggs
  5. print(col)
  1. 0 0
  2. 1 0
  3. 2 1
  4. 3 0
  5. 4 0
  6. ..
  7. 96 0
  8. 97 1
  9. 98 0
  10. 99 1
  11. 100 1
  12. Name: eggs, Length: 101, dtype: int64

🌕 Describe the Data

  1. # describe each column
  2. zoo.describe()
hair feathers eggs milk airborne aquatic predator toothed backbone breathes venomous fins legs tail domestic catsize class_type
count 101.000000 101.000000 101.000000 101.000000 101.000000 101.000000 101.000000 101.000000 101.000000 101.000000 101.000000 101.000000 101.000000 101.000000 101.000000 101.000000 101.000000
mean 0.425743 0.198020 0.584158 0.405941 0.237624 0.356436 0.554455 0.603960 0.821782 0.792079 0.079208 0.168317 2.841584 0.742574 0.128713 0.435644 2.831683
std 0.496921 0.400495 0.495325 0.493522 0.427750 0.481335 0.499505 0.491512 0.384605 0.407844 0.271410 0.376013 2.033385 0.439397 0.336552 0.498314 2.102709
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.000000 0.000000 0.000000 2.000000 0.000000 0.000000 0.000000 1.000000
50% 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 4.000000 1.000000 0.000000 0.000000 2.000000
75% 1.000000 0.000000 1.000000 1.000000 0.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 4.000000 1.000000 0.000000 1.000000 4.000000
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 8.000000 1.000000 1.000000 1.000000 7.000000
  1. # describe one column
  2. zoo.eggs.describe()
  1. count 101.000000
  2. mean 0.584158
  3. std 0.495325
  4. min 0.000000
  5. 25% 0.000000
  6. 50% 1.000000
  7. 75% 1.000000
  8. max 1.000000
  9. Name: eggs, dtype: float64
  1. zoo.mean() # .min() .max() .std() .mode() .corr() .mean()
  2. zoo.corr() # 计算不同列(Series)两两之间的相关系数(返回 DataFrame)
  3. zoo.corrwith(zoo.eggs) # 分别计算 zoo(DataFrame)中所有列与 eggs(Series)的相关系数
  4. zoo.eggs.corr(zoo.milk) # 计算相关系数(one Series with the other Series)
  1. -0.9388478737749797

🌖 Filter the Data

  • filter.any() 相当于对 filter 中所有元素进行或 (or) 运算:只要存在一个 True 就返回 True,否则返回 False
  • filter.all() 相当于对 filter 中所有元素进行与 (and) 运算:只要存在一个 False 就返回 False,否则返回 True
  1. # choose some rows that satisfy the filter(zoo.eggs==0)
  2. myFilter = zoo.eggs==0 # <1 or >1
  3. print("myFilter:\n",myFilter)
  4. f_all = myFilter.any() # find the True --> return Ture
  5. f_any = myFilter.all() # find the False --> return False
  6. print("myFilter.any() : %s \nmyFilter.all() : %s"%(f_all,f_any))
  1. myFilter:
  2. 0 True
  3. 1 True
  4. 2 False
  5. 3 True
  6. 4 True
  7. ...
  8. 96 True
  9. 97 False
  10. 98 True
  11. 99 False
  12. 100 False
  13. Name: eggs, Length: 101, dtype: bool
  14. myFilter.any() : True
  15. myFilter.all() : False

🌗 Clean the Data

  1. zoo.shape # not zoo.shape()
  1. (101, 18)
  1. zoo.head(4)
animal_name hair feathers eggs milk airborne aquatic predator toothed backbone breathes venomous fins legs tail domestic catsize class_type
0 aardvark 1 0 0 1 0 0 1 1 1 1 0 0 4 0 0 1 1
1 antelope 1 0 0 1 0 0 0 1 1 1 0 0 4 1 0 1 1
2 bass 0 0 1 0 0 1 1 1 1 0 0 1 0 1 0 0 4
3 bear 1 0 0 1 0 0 1 1 1 1 0 0 4 0 0 1 1
  1. zoo.isnull().head(4)
animal_name hair feathers eggs milk airborne aquatic predator toothed backbone breathes venomous fins legs tail domestic catsize class_type
0 False False False False False False False False False False False False False False False False False False
1 False False False False False False False False False False False False False False False False False False
2 False False False False False False False False False False False False False False False False False False
3 False False False False False False False False False False False False False False False False False False
  1. zoo.isnull().any().any() # try to find whether zoo has null values(none)
  1. False
  1. zoo.eggs[0] = None # create the null value
  2. zoo.head(4)
animal_name hair feathers eggs milk airborne aquatic predator toothed backbone breathes venomous fins legs tail domestic catsize class_type
0 aardvark 1 0 NaN 1 0 0 1 1 1 1 0 0 4 0 0 1 1
1 antelope 1 0 0.0 1 0 0 0 1 1 1 0 0 4 1 0 1 1
2 bass 0 0 1.0 0 0 1 1 1 1 0 0 1 0 1 0 0 4
3 bear 1 0 0.0 1 0 0 1 1 1 1 0 0 4 0 0 1 1
  1. zoo = zoo.dropna() # delete the rows containing null values
  2. zoo.head(4)
animal_name hair feathers eggs milk airborne aquatic predator toothed backbone breathes venomous fins legs tail domestic catsize class_type
1 antelope 1 0 0.0 1 0 0 0 1 1 1 0 0 4 1 0 1 1
2 bass 0 0 1.0 0 0 1 1 1 1 0 0 1 0 1 0 0 4
3 bear 1 0 0.0 1 0 0 1 1 1 1 0 0 4 0 0 1 1
4 boar 1 0 0.0 1 0 0 1 1 1 1 0 0 4 1 0 1 1

🌑 Others

append

dfa.append(dfb) 用于将 DataFrame b 连接到 DataFrame a 后面,需要注意的是,这里和 lista.append(listb) 是有区别的,即 pandas 并不在原来的 DataFrame a 上直接操作,而是返回添加好的 DataFrame result 。正确的使用方法应该是 dfa = dfa.append(dfb) 。另外,DataFrame.append 自 pandas 1.4 起已被弃用,并在 pandas 2.0 中移除;在新版本中应改用 dfa = pd.concat([dfa, dfb])