%%html
<style>
/* Notebook-wide theme. Every font-family named in the rules below must be
   listed in this import, otherwise the browser falls back to a default font.
   'Orbitron' (used by h1-h3) was previously missing from the list. */
@import url('https://fonts.googleapis.com/css?family=Ewert|Orbitron|Roboto&effect=3d|ice|');
/* Page background */
body {background-color: gainsboro;}
/* Links */
a {color: #37c9e1; font-family: 'Roboto';}
/* Headings: Orbitron with a soft drop shadow */
h1 {color: #37c9e1; font-family: 'Orbitron'; text-shadow: 4px 4px 4px #aaa;}
h2, h3 {color: slategray; font-family: 'Orbitron'; text-shadow: 4px 4px 4px #aaa;}
h4 {color: #818286; font-family: 'Roboto';}
/* Inline text */
span {font-family:'Roboto'; color:black; text-shadow: 5px 5px 5px #aaa;}
/* Text output of code cells */
div.output_area pre{font-family:'Roboto'; font-size:110%; color:lightblue;}
</style>
🌑 Setup and Read Dataset
import pandas as pd

# Directory holding both CSV files; the trailing slash lets file names be
# appended directly.
dataPath = '../input/zoo-animal-classification/'

# Read the class lookup table and the animal table, in that order.
animalClass, zoo = (
    pd.read_csv(f'{dataPath}{fileName}')
    for fileName in ('class.csv', 'zoo.csv')
)
🌒 Data Structure
Each row and each column of a ‘DataFrame’ is a “Series”.
# 1-D building block --> Series: one row and one column taken from the table
row = zoo.iloc[0]
col = zoo.eggs
# 2-D container --> DataFrame: the table itself.
# Print the three types one per line, in the order row, column, table.
print(*map(type, (row, col, zoo)), sep='\n')
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>
🌒 Print
# Preview the first four rows of the animal table.
# zoo.head() # by default shows the first five rows
zoo.head(4)
# del zoo['hair'] # would delete the 'hair' column from zoo
# zoo.head()
|
animal_name |
hair |
feathers |
eggs |
milk |
airborne |
aquatic |
predator |
toothed |
backbone |
breathes |
venomous |
fins |
legs |
tail |
domestic |
catsize |
class_type |
0 |
aardvark |
1 |
0 |
0 |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
4 |
0 |
0 |
1 |
1 |
1 |
antelope |
1 |
0 |
0 |
1 |
0 |
0 |
0 |
1 |
1 |
1 |
0 |
0 |
4 |
1 |
0 |
1 |
1 |
2 |
bass |
0 |
0 |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
1 |
0 |
1 |
0 |
0 |
4 |
3 |
bear |
1 |
0 |
0 |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
4 |
0 |
0 |
1 |
1 |
# Preview the class lookup table (head() without an argument shows five rows).
animalClass.head()
|
Class_Number |
Number_Of_Animal_Species_In_Class |
Class_Type |
Animal_Names |
0 |
1 |
41 |
Mammal |
aardvark, antelope, bear, boar, buffalo, calf,… |
1 |
2 |
20 |
Bird |
chicken, crow, dove, duck, flamingo, gull, haw… |
2 |
3 |
5 |
Reptile |
pitviper, seasnake, slowworm, tortoise, tuatara |
3 |
4 |
13 |
Fish |
bass, carp, catfish, chub, dogfish, haddock, h… |
4 |
5 |
4 |
Amphibian |
frog, frog, newt, toad |
🌓 Rows Indexing
# Positional row indexing --> iloc: take the row at position 0.
row = zoo.iloc[0]
# Label-based row indexing --> loc: look a row up by its index label.
# row = zoo.loc['bass'] # uses a label you named yourself in place of the built-in index
# NOTE: this lookup presumably needs the index to hold animal names first
# (e.g. via set_index('animal_name')); the index here is the default 0..100.
print(row)
animal_name aardvark
hair 1
feathers 0
eggs 0
milk 1
airborne 0
aquatic 0
predator 1
toothed 1
backbone 1
breathes 1
venomous 0
fins 0
legs 4
tail 0
domestic 0
catsize 1
class_type 1
Name: 0, dtype: object
🌔 Column Indexing
# Select a column by name with bracket syntax: ['column name']
col = zoo['eggs']
# Select the same column with attribute ('.') syntax
col = zoo.eggs
# Either way the result is a Series holding one value per row.
print(col)
0 0
1 0
2 1
3 0
4 0
..
96 0
97 1
98 0
99 1
100 1
Name: eggs, Length: 101, dtype: int64
🌕 Describe the Data
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric
# column; the text column animal_name is left out of the result.
zoo.describe()
|
hair |
feathers |
eggs |
milk |
airborne |
aquatic |
predator |
toothed |
backbone |
breathes |
venomous |
fins |
legs |
tail |
domestic |
catsize |
class_type |
count |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
101.000000 |
mean |
0.425743 |
0.198020 |
0.584158 |
0.405941 |
0.237624 |
0.356436 |
0.554455 |
0.603960 |
0.821782 |
0.792079 |
0.079208 |
0.168317 |
2.841584 |
0.742574 |
0.128713 |
0.435644 |
2.831683 |
std |
0.496921 |
0.400495 |
0.495325 |
0.493522 |
0.427750 |
0.481335 |
0.499505 |
0.491512 |
0.384605 |
0.407844 |
0.271410 |
0.376013 |
2.033385 |
0.439397 |
0.336552 |
0.498314 |
2.102709 |
min |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
1.000000 |
25% |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
1.000000 |
1.000000 |
0.000000 |
0.000000 |
2.000000 |
0.000000 |
0.000000 |
0.000000 |
1.000000 |
50% |
0.000000 |
0.000000 |
1.000000 |
0.000000 |
0.000000 |
0.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
0.000000 |
0.000000 |
4.000000 |
1.000000 |
0.000000 |
0.000000 |
2.000000 |
75% |
1.000000 |
0.000000 |
1.000000 |
1.000000 |
0.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
0.000000 |
0.000000 |
4.000000 |
1.000000 |
0.000000 |
1.000000 |
4.000000 |
max |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
8.000000 |
1.000000 |
1.000000 |
1.000000 |
7.000000 |
# The same summary statistics, restricted to the single column 'eggs'.
zoo.eggs.describe()
count 101.000000
mean 0.584158
std 0.495325
min 0.000000
25% 0.000000
50% 1.000000
75% 1.000000
max 1.000000
Name: eggs, dtype: float64
# Mean and correlation are only defined for numeric data, but zoo also has the
# text column `animal_name`. Older pandas silently dropped such columns;
# pandas 2.0+ raises instead. Selecting the numeric columns explicitly gives
# the same result on every version.
zooNumeric = zoo.select_dtypes(include='number')
zooNumeric.mean() # likewise: .min() .max() .std() .mode() .corr()
zooNumeric.corr() # pairwise correlation between all columns (Series) -> DataFrame
zooNumeric.corrwith(zoo.eggs) # correlation of every column of zoo (DataFrame) with eggs (Series)
zoo.eggs.corr(zoo.milk) # correlation coefficient of one Series with another Series
-0.9388478737749797
🌖 Filter the Data
filter.any()
相当于对 filter
进行或 (or) 运算,寻找 True
filter.all()
相当于对 filter
进行与 (and) 运算,寻找 False
# Build a boolean mask marking the rows that satisfy the filter (zoo.eggs == 0).
myFilter = zoo.eggs==0 # other comparisons such as <1 or >1 work the same way
print("myFilter:\n",myFilter)
# Each variable is named after the reduction it holds (the names used to be swapped).
f_any = myFilter.any() # logical OR over the mask: True if at least one entry is True
f_all = myFilter.all() # logical AND over the mask: False as soon as one entry is False
print("myFilter.any() : %s \nmyFilter.all() : %s"%(f_any,f_all))
myFilter:
0 True
1 True
2 False
3 True
4 True
...
96 True
97 False
98 True
99 False
100 False
Name: eggs, Length: 101, dtype: bool
myFilter.any() : True
myFilter.all() : False
🌗 Clean the Data
# (rows, columns) of the table. shape is an attribute, not a method.
zoo.shape # not zoo.shape()
(101, 18)
# Show the first four rows again as a reference before any values are changed.
zoo.head(4)
|
animal_name |
hair |
feathers |
eggs |
milk |
airborne |
aquatic |
predator |
toothed |
backbone |
breathes |
venomous |
fins |
legs |
tail |
domestic |
catsize |
class_type |
0 |
aardvark |
1 |
0 |
0 |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
4 |
0 |
0 |
1 |
1 |
1 |
antelope |
1 |
0 |
0 |
1 |
0 |
0 |
0 |
1 |
1 |
1 |
0 |
0 |
4 |
1 |
0 |
1 |
1 |
2 |
bass |
0 |
0 |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
1 |
0 |
1 |
0 |
0 |
4 |
3 |
bear |
1 |
0 |
0 |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
4 |
0 |
0 |
1 |
1 |
# isnull() yields a same-shaped table of booleans (True where a value is
# missing); show its first four rows.
zoo.isnull().head(4)
|
animal_name |
hair |
feathers |
eggs |
milk |
airborne |
aquatic |
predator |
toothed |
backbone |
breathes |
venomous |
fins |
legs |
tail |
domestic |
catsize |
class_type |
0 |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
1 |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
2 |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
3 |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
False |
# Reduce the boolean table twice: the first any() collapses each column, the
# second collapses the per-column results into a single True/False.
zoo.isnull().any().any() # checks whether zoo contains any null value at all
False
# Plant a missing value in row 0 so that there is something to clean.
# An integer column cannot hold NaN, so make the upcast to float explicit.
zoo['eggs'] = zoo['eggs'].astype('float64')
# Use a single .loc assignment on the frame. The chained form `zoo.eggs[0] = ...`
# writes through an intermediate Series, which triggers SettingWithCopyWarning
# and does not modify `zoo` at all under copy-on-write.
zoo.loc[0, 'eggs'] = None # create the null value (stored as NaN)
zoo.head(4)
|
animal_name |
hair |
feathers |
eggs |
milk |
airborne |
aquatic |
predator |
toothed |
backbone |
breathes |
venomous |
fins |
legs |
tail |
domestic |
catsize |
class_type |
0 |
aardvark |
1 |
0 |
NaN |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
4 |
0 |
0 |
1 |
1 |
1 |
antelope |
1 |
0 |
0.0 |
1 |
0 |
0 |
0 |
1 |
1 |
1 |
0 |
0 |
4 |
1 |
0 |
1 |
1 |
2 |
bass |
0 |
0 |
1.0 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
1 |
0 |
1 |
0 |
0 |
4 |
3 |
bear |
1 |
0 |
0.0 |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
4 |
0 |
0 |
1 |
1 |
# dropna() returns a new frame, so the result is assigned back to zoo.
zoo = zoo.dropna() # delete the rows containing null values
# The remaining rows keep their original index labels (the preview starts at 1).
zoo.head(4)
|
animal_name |
hair |
feathers |
eggs |
milk |
airborne |
aquatic |
predator |
toothed |
backbone |
breathes |
venomous |
fins |
legs |
tail |
domestic |
catsize |
class_type |
1 |
antelope |
1 |
0 |
0.0 |
1 |
0 |
0 |
0 |
1 |
1 |
1 |
0 |
0 |
4 |
1 |
0 |
1 |
1 |
2 |
bass |
0 |
0 |
1.0 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
1 |
0 |
1 |
0 |
0 |
4 |
3 |
bear |
1 |
0 |
0.0 |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
4 |
0 |
0 |
1 |
1 |
4 |
boar |
1 |
0 |
0.0 |
1 |
0 |
0 |
1 |
1 |
1 |
1 |
0 |
0 |
4 |
1 |
0 |
1 |
1 |
🌑 Others
append
dfa.append(dfb)
用于将 DataFrame b
连接到 DataFrame a
后面,需要注意的是,这里和 lista.append(listb)
是有区别的,即 pandas 并不在原来的 DataFrame a
上直接操作,而是返回添加好的 DataFrame result
。正确的使用方法应该是 dfa = dfa.append(dfb)
。注意:DataFrame.append 自 pandas 1.4 起已被弃用,并在 pandas 2.0 中移除;在新版本中请改用 dfa = pd.concat([dfa, dfb])。