NOTE - Categorical Variables
Load the data and split the training data into training and validation sets
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the training and test data, indexed by the 'Id' column.
X = pd.read_csv('../input/train.csv', index_col='Id')
X_test = pd.read_csv('../input/test.csv', index_col='Id')

# Rows without a target cannot be used for training; drop them, then
# move the target column out of the predictors.
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.pop('SalePrice')

# To keep things simple, every column that has a missing value in the
# training data is removed from both the training and the test predictors.
cols_with_missing = X.columns[X.isnull().any()].tolist()
X = X.drop(columns=cols_with_missing)
X_test = X_test.drop(columns=cols_with_missing)

# Hold out 20% of the training rows as a validation set (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)
Function for comparing different approaches by mean absolute error
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


def score_dataset(X_train, X_valid, y_train, y_valid):
    """Score one preprocessing approach with a fixed random forest.

    Fits a ``RandomForestRegressor`` (100 trees, ``random_state=0``) on the
    training data and evaluates it on the validation data.

    Args:
        X_train: Training predictors.
        X_valid: Validation predictors.
        y_train: Training target values.
        y_valid: Validation target values.

    Returns:
        The mean absolute error on the validation set, rounded to six
        decimal places.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)
    error = mean_absolute_error(y_valid, forest.predict(X_valid))
    return round(error, 6)
Investigating cardinality
# Categorical columns: the ones pandas loaded with the generic "object" dtype.
# NOTE(review): object_cols was used below without being defined anywhere in
# these notes; this definition assumes the categorical columns are exactly the
# object-dtype ones -- confirm against the original notebook.
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Get number of unique entries in each column with categorical data
object_nunique = [X_train[col].nunique() for col in object_cols]
d = dict(zip(object_cols, object_nunique))

# Show number of unique entries by column, in ascending order
# (bare expression: displayed only when run as the last line of a notebook cell)
sorted(d.items(), key=lambda x: x[1])
Set up the One-Hot encoding
# Columns that will be one-hot encoded: categorical columns with fewer than
# 10 distinct values in the training data.
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]

# Columns that will be dropped from the dataset: the remaining categorical
# columns. Built by filtering object_cols (rather than a set difference) so the
# order is deterministic and follows the original column order.
high_cardinality_cols = [col for col in object_cols if col not in low_cardinality_cols]

print('Categorical columns that will be one-hot encoded:', low_cardinality_cols)
print('\nCategorical columns that will be dropped from the dataset:', high_cardinality_cols)
from sklearn.preprocessing import OneHotEncoder

# Build a dense one-hot encoder that ignores categories unseen during fit.
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was later removed, so try the new
# spelling first and fall back for older installations.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training data only, then apply the same encoding to validation.
# The encoder returns plain arrays, so the original row index is put back to
# keep the rows aligned for the concat below.
OH_X_train = pd.DataFrame(
    OH_encoder.fit_transform(X_train[low_cardinality_cols]),
    index=X_train.index,
)
OH_X_valid = pd.DataFrame(
    OH_encoder.transform(X_valid[low_cardinality_cols]),
    index=X_valid.index,
)

# Remove all categorical columns (low- and high-cardinality); the
# low-cardinality ones are replaced by their one-hot encoding.
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Numerical features followed by the one-hot encoded features.
OH_X_train = pd.concat([num_X_train, OH_X_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_X_valid], axis=1)

# The one-hot columns are labelled with integers while the numerical columns
# have string names; recent scikit-learn versions refuse to fit on frames with
# mixed-type column names, so make every column name a string.
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

