1. import pandas as pd
  2. import numpy as np
  3. import matplotlib.pyplot as plt
  4. from sklearn.tree import DecisionTreeClassifier
  5. from sklearn.tree import plot_tree
  6. from sklearn.model_selection import train_test_split
  7. from sklearn.model_selection import cross_val_score
  8. from sklearn.metrics import confusion_matrix
  9. from sklearn.metrics import plot_confusion_matrix
  # Download the processed Cleveland heart-disease file from the UCI repository.
  # header=None: the first line is read as data, and pandas assigns integer
  # column labels.
  data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
  df = pd.read_csv(data_url, header=None)
  df.head()  # preview the first rows

image.png

  # Replace the integer column labels with readable names (14 columns).
  column_names = [
      'age', 'sex', 'cp', 'restbp', 'chol', 'fbs', 'restecg',
      'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'hd',
  ]
  df.columns = column_names
  df.head()  # preview again, now with named columns

image.png

  # Summary statistics of the data frame.
  summary_stats = df.describe()
  summary_stats

image.png

  # List every distinct value that occurs in the 'ca' column.
  ca_values = df['ca'].unique()
  ca_values

image.png

  # Mask of rows where 'ca' or 'thal' holds the '?' placeholder.
  has_missing = (df['ca'] == '?') | (df['thal'] == '?')
  len(df.loc[has_missing])  # number of rows with a '?' (the original notes 6)
  len(df)  # total number of rows
  # Keep only the rows that have no '?' in either column.
  df_np_missing = df.loc[~has_missing]

image.png

image.png

  # Check the distinct 'ca' values that remain after the '?' rows were dropped.
  remaining_ca_values = df_np_missing['ca'].unique()
  remaining_ca_values

image.png

  # Feature table: every column except 'hd'. The explicit copy keeps X
  # independent of df_np_missing.
  X = df_np_missing.drop(columns='hd').copy()
  X.head()

image.png

One-hot 编码

  # Distinct categories found in 'cp'.
  X['cp'].unique()
  # Preview one-hot encoding of 'cp' alone (similar to converting it to a
  # factor). The result is not assigned back, so X itself is unchanged.
  cp_encoded_preview = pd.get_dummies(X, columns=['cp'])
  cp_encoded_preview.head()

image.png

  # One-hot encode the listed columns; all other columns pass through as-is.
  categorical_columns = ['cp', 'restecg', 'slope', 'thal']
  X_encoded = pd.get_dummies(X, columns=categorical_columns)
  X_encoded.head()

image.png

  # Target vector: the 'hd' column that was dropped from the features.
  # The original listing used `y` without ever defining it (NameError).
  # Copy it so the in-place edit below does not write into df_np_missing.
  y = df_np_missing['hd'].copy()
  y.unique()  # distinct values before binarising
  # Collapse every value greater than 0 to 1, leaving a 0/1 target.
  y_not_zero_index = y > 0
  y[y_not_zero_index] = 1
  y.unique()  # distinct values after binarising
  y_not_zero_index.head()  # peek at the boolean mask that was applied
  # Split features and target into train and test parts; random_state is
  # fixed so the split is reproducible.
  # NOTE(review): `x_test` is lower-case unlike `X_train`; the name is kept
  # because later code may refer to it.
  split_parts = train_test_split(X_encoded, y, random_state=42)
  X_train, x_test, y_train, y_test = split_parts

[1] https://statquest.org/video-index/