Sklearn - 随机森林 - 《动手深度学习》

One-hot 编码

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

# Load the processed Cleveland heart-disease file from the UCI repository.
# header=None: the first line is read as data, so pandas assigns integer
# column labels.
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data',
    header=None,
)
df.head()

# Replace the integer labels with descriptive column names.
df.columns = [
    'age', 'sex', 'cp', 'restbp', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'hd',
]
df.head()

# Summary statistics for the frame.
df.describe()

# List every distinct value that appears in the 'ca' column.
df['ca'].unique()

# Count the rows in which 'ca' or 'thal' holds the '?' placeholder.
len(df.loc[df['ca'].eq('?') | df['thal'].eq('?')])
# 6  (result noted by the original author)
# Total number of rows, for comparison with the count above.
len(df)

# Keep only the rows where neither 'ca' nor 'thal' is '?'.
df_np_missing = df.loc[df['ca'].ne('?') & df['thal'].ne('?')]

# Check which 'ca' values remain after the filter.
df_np_missing['ca'].unique()

# Feature frame: every column except 'hd', taken as an independent copy.
X = df_np_missing.drop('hd', axis=1).copy()
X.head()

One-hot 编码

# Distinct values of 'cp' before encoding.
X['cp'].unique()
# Preview of one-hot encoding 'cp' alone (similar to converting the column
# to a factor); the result is only displayed, not stored.
pd.get_dummies(X, columns=['cp']).head()

# One-hot encode the four listed columns and keep the result.
X_encoded = pd.get_dummies(
    X,
    columns=['cp', 'restecg', 'slope', 'thal'],
)
X_encoded.head()

# Target vector: the 'hd' column that was dropped from X.  It was never
# assigned in the original script, so the first use of `y` raised NameError.
# Copy it so the in-place relabelling below does not write into df_np_missing.
y = df_np_missing['hd'].copy()
y.unique()

# Collapse every value greater than 0 to 1, leaving a 0/1 target.
y_not_zero_index = y > 0
y[y_not_zero_index] = 1
y.unique()

y_not_zero_index.head()

# Split features and target into train and test parts; the fixed
# random_state makes the split reproducible.
# NOTE: `x_test` is lower-case, unlike `X_train`; the name is kept so any
# later code that refers to it continues to work.
X_train, x_test, y_train, y_test = train_test_split(X_encoded, y, random_state=42)

[1] StatQuest 视频索引：https://statquest.org/video-index/