%%html
<style>
/* Notebook theme. The heading rules below use 'Orbitron', so it has to be
   part of the Google Fonts import; otherwise headings fall back to a default font. */
@import url('https://fonts.googleapis.com/css?family=Ewert|Orbitron|Roboto&effect=3d|ice|');
body {background-color: gainsboro;}
a {color: #37c9e1; font-family: 'Roboto';}
h1 {color: #37c9e1; font-family: 'Orbitron'; text-shadow: 4px 4px 4px #aaa;}
h2, h3 {color: slategray; font-family: 'Orbitron'; text-shadow: 4px 4px 4px #aaa;}
h4 {color: #818286; font-family: 'Roboto';}
span {font-family:'Roboto'; color:black; text-shadow: 5px 5px 5px #aaa;}
/* Styling for the text output area of code cells */
div.output_area pre{font-family:'Roboto'; font-size:110%; color:lightblue;}
</style>
🌑 Setup and Read Dataset
import pandas as pd
# Kaggle input directory that holds the zoo-animal-classification CSV files.
dataPath = '../input/zoo-animal-classification/'

# class.csv is loaded into animalClass, zoo.csv into zoo.
animalClass, zoo = (
    pd.read_csv(dataPath + fileName) for fileName in ('class.csv', 'zoo.csv')
)
🌒 Data Structure
Each row and each column of a DataFrame is a Series.
# 1-D building block --> Series: a single row and a single column are both Series
row = zoo.iloc[0]
col = zoo.eggs

# 2-D container --> DataFrame (printed last)
for obj in (row, col, zoo):
    print(type(obj))
<class 'pandas.core.series.Series'><class 'pandas.core.series.Series'><class 'pandas.core.frame.DataFrame'>
🌒 Print
# zoo.head() without an argument shows the first five rows by default.
# del zoo['hair'] would delete the 'hair' column from zoo.
zoo.head(n=4)
|
animal_name |
hair |
feathers |
eggs |
milk |
airborne |
aquatic |
predator |
toothed |
backbone |
breathes |
venomous |
fins |
legs |
tail |
domestic |
catsize |
class_type |
| 0 |
aardvark |
1 |
0 |
0 |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
4 |
0 |
0 |
1 |
1 |
| 1 |
antelope |
1 |
0 |
0 |
1 |
0 |
0 |
0 |
1 |
1 |
1 |
0 |
0 |
4 |
1 |
0 |
1 |
1 |
| 2 |
bass |
0 |
0 |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
1 |
0 |
1 |
0 |
0 |
4 |
| 3 |
bear |
1 |
0 |
0 |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
4 |
0 |
0 |
1 |
1 |
# Preview the first five rows (the default) of the class lookup table.
animalClass.head(n=5)
|
Class_Number |
Number_Of_Animal_Species_In_Class |
Class_Type |
Animal_Names |
| 0 |
1 |
41 |
Mammal |
aardvark, antelope, bear, boar, buffalo, calf,… |
| 1 |
2 |
20 |
Bird |
chicken, crow, dove, duck, flamingo, gull, haw… |
| 2 |
3 |
5 |
Reptile |
pitviper, seasnake, slowworm, tortoise, tuatara |
| 3 |
4 |
13 |
Fish |
bass, carp, catfish, chub, dogfish, haddock, h… |
| 4 |
5 |
4 |
Amphibian |
frog, frog, newt, toad |
🌓 Rows Indexing
# Positional (integer-location) row indexing --> iloc
first_position = 0
row = zoo.iloc[first_position]

# Label-based row indexing --> loc
# row = zoo.loc['bass']  # uses your own labels in place of the default integer index
# NOTE: with the default integer index shown in the output, 'bass' is not a
# row label; the index would first have to be set to the animal names.
print(row)
animal_name aardvarkhair 1feathers 0eggs 0milk 1airborne 0aquatic 0predator 1toothed 1backbone 1breathes 1venomous 0fins 0legs 4tail 0domestic 0catsize 1class_type 1Name: 0, dtype: object
🌔 Column Indexing
# Bracket access: select the column by its name
col = zoo['eggs']

# Attribute access: the same column via dot notation
col = zoo.eggs

print(col)
0 01 02 13 04 0 ..96 097 198 099 1100 1Name: eggs, Length: 101, dtype: int64
🌕 Describe the Data
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column
summary = zoo.describe()
summary
|
hair |
feathers |
eggs |
milk |
airborne |
aquatic |
predator |
toothed |
backbone |
breathes |
venomous |
fins |
legs |
tail |
domestic |
catsize |
class_type |
| count |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
| mean |
0.425743 |
0.198020 |
0.584158 |
0.405941 |
0.237624 |
0.356436 |
0.554455 |
0.603960 |
0.821782 |
0.792079 |
0.079208 |
0.168317 |
2.841584 |
0.742574 |
0.128713 |
0.435644 |
2.831683 |
| std |
0.496921 |
0.400495 |
0.495325 |
0.493522 |
0.427750 |
0.481335 |
0.499505 |
0.491512 |
0.384605 |
0.407844 |
0.271410 |
0.376013 |
2.033385 |
0.439397 |
0.336552 |
0.498314 |
2.102709 |
| min |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
1.000000 |
| 25% |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
1.000000 |
1.000000 |
0.000000 |
0.000000 |
2.000000 |
0.000000 |
0.000000 |
0.000000 |
1.000000 |
| 50% |
0.000000 |
0.000000 |
1.000000 |
0.000000 |
0.000000 |
0.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
0.000000 |
0.000000 |
4.000000 |
1.000000 |
0.000000 |
0.000000 |
2.000000 |
| 75% |
1.000000 |
0.000000 |
1.000000 |
1.000000 |
0.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
0.000000 |
0.000000 |
4.000000 |
1.000000 |
0.000000 |
1.000000 |
4.000000 |
| max |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
8.000000 |
1.000000 |
1.000000 |
1.000000 |
7.000000 |
# Summary statistics for a single column
zoo['eggs'].describe()
count 101.000000mean 0.584158std 0.495325min 0.00000025% 0.00000050% 1.00000075% 1.000000max 1.000000Name: eggs, dtype: float64
# Column-wise statistics are only defined for numeric data. zoo also holds the
# text column animal_name, and recent pandas versions raise on it instead of
# silently skipping it, so restrict the frame to its numeric columns first.
numeric_zoo = zoo.select_dtypes(include='number')

numeric_zoo.mean()  # likewise: .min() .max() .std() .mode() .corr()
numeric_zoo.corr()  # pairwise correlation between columns (Series); the result is a DataFrame
numeric_zoo.corrwith(zoo.eggs)  # correlation of every column of the DataFrame with the eggs Series
zoo.eggs.corr(zoo.milk)  # correlation coefficient between one Series and another Series
-0.9388478737749797
🌖 Filter the Data
filter.any() 相当于对 filter 中的所有元素进行或 (or) 运算:只要存在一个 True 就返回 True。 filter.all() 相当于对 filter 中的所有元素进行与 (and) 运算:只要存在一个 False 就返回 False。
# Boolean mask marking the rows that satisfy the condition zoo.eggs == 0
# (other comparisons such as < 1 or > 1 work the same way).
myFilter = zoo.eggs.eq(0)
print("myFilter:\n",myFilter)

# NOTE: the variable names are swapped relative to what they hold:
# f_all stores the .any() result and f_any stores the .all() result.
f_all = myFilter.any()  # True as soon as one element is True
f_any = myFilter.all()  # False as soon as one element is False
print("myFilter.any() : %s \nmyFilter.all() : %s"%(f_all,f_any))
myFilter: 0 True1 True2 False3 True4 True ... 96 True97 False98 True99 False100 FalseName: eggs, Length: 101, dtype: boolmyFilter.any() : True myFilter.all() : False
🌗 Clean the Data
zoo.shape # (rows, columns) tuple; shape is an attribute, so do not call it as zoo.shape()
(101, 18)
# Show the first four rows before any cleaning is applied.
zoo.head(n=4)
|
animal_name |
hair |
feathers |
eggs |
milk |
airborne |
aquatic |
predator |
toothed |
backbone |
breathes |
venomous |
fins |
legs |
tail |
domestic |
catsize |
class_type |
| 0 |
aardvark |
1 |
0 |
0 |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
4 |
0 |
0 |
1 |
1 |
| 1 |
antelope |
1 |
0 |
0 |
1 |
0 |
0 |
0 |
1 |
1 |
1 |
0 |
0 |
4 |
1 |
0 |
1 |
1 |
| 2 |
bass |
0 |
0 |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
1 |
0 |
1 |
0 |
0 |
4 |
| 3 |
bear |
1 |
0 |
0 |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
4 |
0 |
0 |
1 |
1 |
# Element-wise missing-value mask (True where a cell is null); show its first four rows.
null_mask = zoo.isnull()
null_mask.head(n=4)
|
animal_name |
hair |
feathers |
eggs |
milk |
airborne |
aquatic |
predator |
toothed |
backbone |
breathes |
venomous |
fins |
legs |
tail |
domestic |
catsize |
class_type |
| 0 |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
| 1 |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
| 2 |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
| 3 |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
# Does zoo contain any null value at all?
# The first .any() collapses each column to one flag; the second collapses
# those per-column flags to a single boolean.
column_has_null = zoo.isnull().any()
column_has_null.any()
False
# Create a null value in row 0 of the eggs column.
# An integer column cannot hold a missing value, so cast to float explicitly
# instead of relying on implicit upcasting during assignment.
zoo['eggs'] = zoo['eggs'].astype('float64')

# Assign through .loc on the DataFrame itself: the chained form
# zoo.eggs[0] = ... may write to a temporary object and leave zoo unchanged.
zoo.loc[0, 'eggs'] = float('nan')
zoo.head(4)
|
animal_name |
hair |
feathers |
eggs |
milk |
airborne |
aquatic |
predator |
toothed |
backbone |
breathes |
venomous |
fins |
legs |
tail |
domestic |
catsize |
class_type |
| 0 |
aardvark |
1 |
0 |
NaN |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
4 |
0 |
0 |
1 |
1 |
| 1 |
antelope |
1 |
0 |
0.0 |
1 |
0 |
0 |
0 |
1 |
1 |
1 |
0 |
0 |
4 |
1 |
0 |
1 |
1 |
| 2 |
bass |
0 |
0 |
1.0 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
1 |
0 |
1 |
0 |
0 |
4 |
| 3 |
bear |
1 |
0 |
0.0 |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
4 |
0 |
0 |
1 |
1 |
# Drop every row that contains at least one null value.
# The surviving rows keep their original index labels (no re-numbering).
zoo = zoo.dropna(axis=0, how='any')
zoo.head(n=4)
|
animal_name |
hair |
feathers |
eggs |
milk |
airborne |
aquatic |
predator |
toothed |
backbone |
breathes |
venomous |
fins |
legs |
tail |
domestic |
catsize |
class_type |
| 1 |
antelope |
1 |
0 |
0.0 |
1 |
0 |
0 |
0 |
1 |
1 |
1 |
0 |
0 |
4 |
1 |
0 |
1 |
1 |
| 2 |
bass |
0 |
0 |
1.0 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
1 |
0 |
1 |
0 |
0 |
4 |
| 3 |
bear |
1 |
0 |
0.0 |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
4 |
0 |
0 |
1 |
1 |
| 4 |
boar |
1 |
0 |
0.0 |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
4 |
1 |
0 |
1 |
1 |
🌑 Others
append
dfa.append(dfb) 用于将 DataFrame b 连接到 DataFrame a 后面,需要注意的是,这里和 lista.append(listb) 是有区别的,即 pandas 并不在原来的 DataFrame a 上直接操作,而是返回添加好的 DataFrame result 。正确的使用方法应该是 dfa = dfa.append(dfb) 。注意:DataFrame.append 已在 pandas 1.4 中弃用,并在 pandas 2.0 中移除;在新版本中应使用 dfa = pd.concat([dfa, dfb]) 。