手册 - Pandas | 手册 - 《CS》

🌑 Setup and Read Dataset
🌒 Data Structure
🌒 Print
🌓 Rows Indexing
🌔 Column Indexing
🌕 Describe the Data
🌖 Filter the Data
🌗 Clean the Data
🌑 Others
- append

%%html
<style>
/* Load every web font referenced by the rules below ('Orbitron' is used by h1-h3, 'Roboto' by the rest). */
@import url('https://fonts.googleapis.com/css?family=Ewert|Orbitron|Roboto&effect=3d|ice|');
body {background-color: gainsboro;} 
a {color: #37c9e1; font-family: 'Roboto';} 
h1 {color: #37c9e1; font-family: 'Orbitron'; text-shadow: 4px 4px 4px #aaa;} 
h2, h3 {color: slategray; font-family: 'Orbitron'; text-shadow: 4px 4px 4px #aaa;}
h4 {color: #818286; font-family: 'Roboto';}
span {font-family:'Roboto'; color:black; text-shadow: 5px 5px 5px #aaa;}  
div.output_area pre{font-family:'Roboto'; font-size:110%; color:lightblue;}      
</style>

🌑 Setup and Read Dataset

import pandas as pd

# Relative path of the directory that holds the zoo-animal-classification CSV files.
dataPath = '../input/zoo-animal-classification/'
# Load both tables as DataFrames: the class lookup table and the per-animal feature table.
animalClass = pd.read_csv(dataPath+'class.csv')
zoo = pd.read_csv(dataPath+'zoo.csv')

🌒 Data Structure

Every single row and every single column of a ‘DataFrame’ is a “Series”.

# 1-D building block --> Series
row = zoo.iloc[0]  # first row, selected by position
col = zoo.eggs  # the 'eggs' column, selected by attribute access
print(type(row))
print(type(col))
# 2-D building block --> DataFrame
print(type(zoo))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>

🌒 Print

# zoo.head()  # prints the first five rows by default
zoo.head(4)
# del zoo['hair']  # would delete the 'hair' column from zoo
# zoo.head()

	animal_name	hair	eggs	milk	aquatic	predator	toothed	backbone	breathes	fins	legs	tail	catsize	class_type
0	aardvark	1	0	1	0	1	1	1	1	0	4	0	1	1
1	antelope	1	0	1	0	0	1	1	1	0	4	1	1	1
2	bass	0	1	0	1	1	1	1	0	1	0	1	0	4
3	bear	1	0	1	0	1	1	1	1	0	4	0	1	1

animalClass.head()

	Class_Number	Number_Of_Animal_Species_In_Class	Class_Type	Animal_Names
0	1	41	Mammal	aardvark, antelope, bear, boar, buffalo, calf,…
1	2	20	Bird	chicken, crow, dove, duck, flamingo, gull, haw…
2	3	5	Reptile	pitviper, seasnake, slowworm, tortoise, tuatara
3	4	13	Fish	bass, carp, catfish, chub, dogfish, haddock, h…
4	5	4	Amphibian	frog, frog, newt, toad

🌓 Rows Indexing

# Positional (integer-location) row indexing --> iloc
row = zoo.iloc[0]
# Label-based row indexing --> loc
# row = zoo.loc['bass']  # uses your own labels in place of the built-in integer index
#                        # NOTE(review): presumably requires the index to be set to the animal names first — confirm
print(row)

animal_name    aardvark
hair                  1
feathers              0
eggs                  0
milk                  1
airborne              0
aquatic               0
predator              1
toothed               1
backbone              1
breathes              1
venomous              0
fins                  0
legs                  4
tail                  0
domestic              0
catsize               1
class_type            1
Name: 0, dtype: object

🌔 Column Indexing

# Select a column with bracket notation: ['column name']
col = zoo['eggs']
# Select the same column with attribute ('.') access
col = zoo.eggs
print(col)

0      0
1      0
2      1
3      0
4      0
      ..
96     0
97     1
98     0
99     1
100    1
Name: eggs, Length: 101, dtype: int64

🌕 Describe the Data

# describe each column
zoo.describe()

	hair	feathers	eggs	milk	airborne	aquatic	predator	toothed	backbone	breathes	venomous	fins	legs	tail	domestic	catsize	class_type
count	101.000000	101.000000	101.000000	101.000000	101.000000	101.000000	101.000000	101.000000	101.000000	101.000000	101.000000	101.000000	101.000000	101.000000	101.000000	101.000000	101.000000
mean	0.425743	0.198020	0.584158	0.405941	0.237624	0.356436	0.554455	0.603960	0.821782	0.792079	0.079208	0.168317	2.841584	0.742574	0.128713	0.435644	2.831683
std	0.496921	0.400495	0.495325	0.493522	0.427750	0.481335	0.499505	0.491512	0.384605	0.407844	0.271410	0.376013	2.033385	0.439397	0.336552	0.498314	2.102709
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000
25%	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	1.000000	0.000000	0.000000	2.000000	0.000000	0.000000	0.000000	1.000000
50%	0.000000	0.000000	1.000000	0.000000	0.000000	0.000000	1.000000	1.000000	1.000000	1.000000	0.000000	0.000000	4.000000	1.000000	0.000000	0.000000	2.000000
75%	1.000000	0.000000	1.000000	1.000000	0.000000	1.000000	1.000000	1.000000	1.000000	1.000000	0.000000	0.000000	4.000000	1.000000	0.000000	1.000000	4.000000
max	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	8.000000	1.000000	1.000000	1.000000	7.000000

# describe one column
zoo.eggs.describe()

count    101.000000
mean       0.584158
std        0.495325
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
Name: eggs, dtype: float64

# animal_name is a text column, so restrict the frame to its numeric columns
# before aggregating: recent pandas versions raise an error instead of silently
# skipping non-numeric columns in mean() / corr().
numeric_zoo = zoo.select_dtypes(include='number')
numeric_zoo.mean()  # the same pattern works for .min() .max() .std() .mode()
numeric_zoo.corr()  # pairwise correlation between all columns (Series) --> DataFrame
numeric_zoo.corrwith(zoo.eggs)  # correlation of every column of the DataFrame with eggs (Series)
zoo.eggs.corr(zoo.milk)  # correlation coefficient of one Series with another Series

-0.9388478737749797

🌖 Filter the Data

filter.any() 相当于对 filter 的所有元素进行或 (or) 运算：只要找到一个 True 就返回 True
filter.all() 相当于对 filter 的所有元素进行与 (and) 运算：只要找到一个 False 就返回 False

# Build a boolean mask that is True for every row satisfying the condition
# (zoo.eggs == 0); other comparisons such as < 1 or > 1 work the same way.
# zoo[myFilter] would then keep only the matching rows.
myFilter = zoo.eggs==0
print("myFilter:\n",myFilter)
# any(): looks for a True  --> returns True as soon as one element is True
has_any = myFilter.any()
# all(): looks for a False --> returns False as soon as one element is False
has_all = myFilter.all()
print("myFilter.any() : %s  \nmyFilter.all() : %s"%(has_any,has_all))

myFilter:
 0       True
1       True
2      False
3       True
4       True
       ...  
96      True
97     False
98      True
99     False
100    False
Name: eggs, Length: 101, dtype: bool
myFilter.any() : True  
myFilter.all() : False

🌗 Clean the Data

zoo.shape  # shape is an attribute, not a method: write zoo.shape, not zoo.shape()

(101, 18)

zoo.head(4)

	animal_name	hair	eggs	milk	aquatic	predator	toothed	backbone	breathes	fins	legs	tail	catsize	class_type
0	aardvark	1	0	1	0	1	1	1	1	0	4	0	1	1
1	antelope	1	0	1	0	0	1	1	1	0	4	1	1	1
2	bass	0	1	0	1	1	1	1	0	1	0	1	0	4
3	bear	1	0	1	0	1	1	1	1	0	4	0	1	1

zoo.isnull().head(4)

	animal_name	hair	feathers	eggs	milk	airborne	aquatic	predator	toothed	backbone	breathes	venomous	fins	legs	tail	domestic	catsize	class_type
0	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False
1	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False
2	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False
3	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False	False

zoo.isnull().any().any()  # True if zoo contains at least one null value (it has none at this point)

False

# Create a null value on purpose.
# eggs is an integer column and NaN is a float, so cast the column first.
zoo['eggs'] = zoo['eggs'].astype('float64')
# Assign through .loc in a single step: the chained form zoo.eggs[0] = ...
# may write to a temporary copy (SettingWithCopyWarning) and has no effect
# when copy-on-write is enabled.
zoo.loc[0, 'eggs'] = float('nan')
zoo.head(4)

	animal_name	hair	eggs	milk	aquatic	predator	toothed	backbone	breathes	fins	legs	tail	catsize	class_type
0	aardvark	1	NaN	1	0	1	1	1	1	0	4	0	1	1
1	antelope	1	0.0	1	0	0	1	1	1	0	4	1	1	1
2	bass	0	1.0	0	1	1	1	1	0	1	0	1	0	4
3	bear	1	0.0	1	0	1	1	1	1	0	4	0	1	1

zoo = zoo.dropna() # delete the rows containing null values
zoo.head(4)

	animal_name	hair	eggs	milk	aquatic	predator	toothed	backbone	breathes	fins	legs	tail	catsize	class_type
1	antelope	1	0.0	1	0	0	1	1	1	0	4	1	1	1
2	bass	0	1.0	0	1	1	1	1	0	1	0	1	0	4
3	bear	1	0.0	1	0	1	1	1	1	0	4	0	1	1
4	boar	1	0.0	1	0	1	1	1	1	0	4	1	1	1

🌑 Others

append

dfa.append(dfb) 用于将 DataFrame b 连接到 DataFrame a 后面。需要注意的是，这里和 lista.append(listb) 是有区别的：pandas 并不在原来的 DataFrame a 上直接操作，而是返回添加好的 DataFrame result，因此正确的使用方法应该是 dfa = dfa.append(dfb) 。另外，DataFrame.append 已在 pandas 1.4 中弃用并在 pandas 2.0 中移除，新版本请改用 dfa = pd.concat([dfa, dfb]) 。