NOTE - Categorical Variables


Load the data and split off a validation set

  import pandas as pd
  from sklearn.model_selection import train_test_split

  # Read the data
  X = pd.read_csv('../input/train.csv', index_col='Id')
  X_test = pd.read_csv('../input/test.csv', index_col='Id')

  # Remove rows with missing target, separate target from predictors
  X.dropna(axis=0, subset=['SalePrice'], inplace=True)
  y = X.SalePrice
  X.drop(['SalePrice'], axis=1, inplace=True)

  # To keep things simple, we'll drop columns with missing values
  cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
  X.drop(cols_with_missing, axis=1, inplace=True)
  X_test.drop(cols_with_missing, axis=1, inplace=True)

  # Break off validation set from training data
  X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                        train_size=0.8,
                                                        test_size=0.2,
                                                        random_state=0)
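The cells below reference object_cols, which this note never defines. A minimal sketch, assuming the categorical columns are the ones pandas stores with the object dtype:

  # Assumption: categorical columns are those with the pandas "object" dtype
  object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]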

Function for comparing the accuracy of different approaches

  from sklearn.ensemble import RandomForestRegressor
  from sklearn.metrics import mean_absolute_error

  # Function for comparing different approaches: returns the validation MAE
  def score_dataset(X_train, X_valid, y_train, y_valid):
      model = RandomForestRegressor(n_estimators=100, random_state=0)
      model.fit(X_train, y_train)
      preds = model.predict(X_valid)
      return round(mean_absolute_error(y_valid, preds), 6)
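For a baseline to compare against, the categorical columns can simply be dropped and the remaining numeric data scored. A usage sketch; the drop_X_* names are illustrative, not from the original note:

  # Baseline: drop categorical columns entirely and score the numeric data
  drop_X_train = X_train.select_dtypes(exclude=['object'])
  drop_X_valid = X_valid.select_dtypes(exclude=['object'])
  print("MAE (drop categorical columns):",
        score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))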

Investigating cardinality

  # Get number of unique entries in each column with categorical data
  object_nunique = list(map(lambda col: X_train[col].nunique(), object_cols))
  d = dict(zip(object_cols, object_nunique))

  # Print number of unique entries by column, in ascending order
  sorted(d.items(), key=lambda x: x[1])
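As a rough sanity check (illustrative, not part of the original note; the all_oh_cols and low_card_oh_cols names are made up here), summing the unique-value counts shows how many one-hot columns would be created in total, and how many remain if columns with 10 or more unique values are skipped, which motivates the threshold used below:

  # Columns created by one-hot encoding every categorical column,
  # versus only the low-cardinality ones (fewer than 10 unique values)
  all_oh_cols = sum(X_train[col].nunique() for col in object_cols)
  low_card_oh_cols = sum(X_train[col].nunique() for col in object_cols
                         if X_train[col].nunique() < 10)
  print(all_oh_cols, low_card_oh_cols)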

Set up the One-Hot encoding

  # Columns that will be one-hot encoded
  low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]

  # Columns that will be dropped from the dataset
  high_cardinality_cols = list(set(object_cols) - set(low_cardinality_cols))

  print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)
  print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)

  from sklearn.preprocessing import OneHotEncoder

  # Apply one-hot encoding to the low-cardinality categorical columns
  OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
  OH_X_train = pd.DataFrame(OH_encoder.fit_transform(X_train[low_cardinality_cols]))
  OH_X_valid = pd.DataFrame(OH_encoder.transform(X_valid[low_cardinality_cols]))

  # One-hot encoding removed the index; put it back
  OH_X_train.index = X_train.index
  OH_X_valid.index = X_valid.index

  # Remove all categorical columns (they will be replaced by the one-hot columns)
  num_X_train = X_train.drop(object_cols, axis=1)
  num_X_valid = X_valid.drop(object_cols, axis=1)

  # Add the one-hot encoded columns to the numerical features
  OH_X_train = pd.concat([num_X_train, OH_X_train], axis=1)
  OH_X_valid = pd.concat([num_X_valid, OH_X_valid], axis=1)
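To evaluate this approach, pass the encoded frames to score_dataset defined above. A usage sketch, not part of the original note; the string cast of the column names is an assumption for newer scikit-learn versions, which require all feature names to be strings:

  # Assumption: newer scikit-learn versions require string column names
  OH_X_train.columns = OH_X_train.columns.astype(str)
  OH_X_valid.columns = OH_X_valid.columns.astype(str)

  print("MAE (one-hot encoding):",
        score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))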