import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
train_names = ["date",
               "price",
               "bedroom_num",
               "bathroom_num",
               "house_area",
               "park_space",
               "floor_num",
               "house_score",
               "covered_area",
               "basement_area",
               "yearbuilt",
               "yearremodadd",
               "lat",
               "long"]
df_train = pd.read_csv(r"D:\Python\kc_train.csv", names=train_names)
df_train.columns

    Index(['date', 'price', 'bedroom_num', 'bathroom_num', 'house_area',
           'park_space', 'floor_num', 'house_score', 'covered_area',
           'basement_area', 'yearbuilt', 'yearremodadd', 'lat', 'long'],
          dtype='object')

df_train['price'].describe()

    count 1.000000e+04
    mean 5.428749e+05
    std 3.729258e+05
    min 7.500000e+04
    25% 3.225000e+05
    50% 4.507000e+05
    75% 6.450000e+05
    max 6.885000e+06
    Name: price, dtype: float64

# Compute skewness and kurtosis
print("Skewness: %f" % df_train['price'].skew())
print("Kurtosis: %f" % df_train['price'].kurt())

    Skewness: 3.898737
    Kurtosis: 29.356202
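A skewness near 4 and kurtosis near 29 indicate a strongly right-skewed, heavy-tailed target; the log transform applied later in this walkthrough pulls that tail in. A minimal sketch (not in the original notebook) that previews the effect by comparing the raw and log1p-transformed skewness on a temporary series:

# Compare skewness of the raw price with its log1p transform (temporary series only;
# the in-place transform of df_train happens later in this walkthrough)
log_price = np.log1p(df_train['price'])
print("Raw skewness:   %f" % df_train['price'].skew())
print("log1p skewness: %f" % log_price.skew())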

# Living area in square feet
var = 'house_area'
data = pd.concat([df_train['price'], df_train[var]], axis=1)
data.plot.scatter(x=var, y='price', xlim=(0,7000), ylim=(0,4000000));

[Figure: scatter plot of price vs. house_area]

# Basement area in square feet
var = 'basement_area'
data = pd.concat([df_train['price'], df_train[var]], axis=1)
data.plot.scatter(x=var, y='price', xlim=(0,4000), ylim=(0,4000000));

[Figure: scatter plot of price vs. basement_area]

# Original construction year
var = 'yearbuilt'
data = pd.concat([df_train['price'], df_train[var]], axis=1)
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(x=var, y="price", data=data)
fig.axis(ymin=0, ymax=4000000);
plt.xticks(rotation=90);

[Figure: boxplot of price by yearbuilt]

# bedroom_num
var = 'bedroom_num'
data = pd.concat([df_train['price'], df_train[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="price", data=data)
fig.axis(ymin=0, ymax=4000000);

[Figure: boxplot of price by bedroom_num]
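A quick numeric complement to the boxplot above (a small sketch, not part of the original notebook) is the median price per bedroom count:

# Median price for each bedroom count, matching what the boxplot shows visually
print(df_train.groupby('bedroom_num')['price'].median())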

# Select the 10 features most strongly correlated with price and inspect their correlations.
k = 10
corrmat = df_train.corr()
cols = corrmat.nlargest(k, 'price')['price'].index
cm = np.corrcoef(df_train[cols].values.T)
sns.set(font_scale=1.25)
plt.figure(figsize=(20, 10))
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
                 yticklabels=cols.values, xticklabels=cols.values, cmap='YlGnBu')
plt.show()

[Figure: correlation heatmap of the 10 features most correlated with price]

# Select the 13 features most strongly correlated with price and inspect their correlations.
k = 13
corrmat = df_train.corr()
cols = corrmat.nlargest(k, 'price')['price'].index
cm = np.corrcoef(df_train[cols].values.T)
sns.set(font_scale=1.25)
plt.figure(figsize=(20, 10))
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
                 yticklabels=cols.values, xticklabels=cols.values, cmap='YlGnBu')
plt.show()

[Figure: correlation heatmap of the 13 features most correlated with price]

# Print the features ranked by correlation with price
print(corrmat["price"].sort_values(ascending=False))

    price 1.000000
    house_score 0.705110
    house_area 0.694800
    covered_area 0.601667
    bathroom_num 0.556525
    lat 0.451337
    bedroom_num 0.352195
    basement_area 0.318296
    floor_num 0.299244
    yearremodadd 0.113862
    park_space 0.103211
    yearbuilt 0.082579
    long 0.055840
    date 0.002005
    Name: price, dtype: float64
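The ranking shows that date, long, and yearbuilt carry almost no linear signal for price. One possible follow-up, sketched below and not part of the original analysis, is to keep only features whose absolute correlation with price exceeds a chosen cutoff (the 0.1 threshold here is an arbitrary illustrative choice):

# Keep only features whose |correlation| with price exceeds a chosen threshold
# (the 0.1 cutoff is a hypothetical choice for illustration)
threshold = 0.1
corr_with_price = corrmat["price"].abs()
selected = corr_with_price[corr_with_price > threshold].index.tolist()
print(selected)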

# Scatterplot matrix
sns.set()
cols = ['price', 'house_area', 'house_score', 'covered_area', 'bathroom_num', 'basement_area', 'bedroom_num']
sns.pairplot(df_train[cols], height=2.5)
plt.show();

[Figure: pairplot of price and the selected features]

# Distribution before the transform
sns.distplot(df_train['price'], fit=norm);
(mu, sigma) = norm.fit(df_train['price'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Distribution plot
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('price distribution')
# Q-Q plot
fig = plt.figure()
res = stats.probplot(df_train['price'], plot=plt)
plt.show()

    mu = 542874.93 and sigma = 372907.12
[Figure: price distribution with fitted normal curve]
[Figure: Q-Q plot of price]

# Distribution after the transform:
# Log transform: log(1 + x)
df_train["price"] = np.log1p(df_train["price"])
# Check the new distribution
sns.distplot(df_train["price"], fit=norm);
# Fitted parameters
(mu, sigma) = norm.fit(df_train["price"])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Distribution plot
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('price distribution')
# Q-Q plot
fig = plt.figure()
res = stats.probplot(df_train["price"], plot=plt)
plt.show()

    mu = 13.05 and sigma = 0.53
[Figure: log1p(price) distribution with fitted normal curve]
[Figure: Q-Q plot of log1p(price)]
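Because price is now modelled on the log1p scale, any prediction has to be mapped back with np.expm1 before it can be read as a dollar amount. A minimal sketch; y_pred_log is a hypothetical array standing in for log-scale predictions from a fitted model:

# Map log1p-scale predictions back to the original price scale
# (y_pred_log is a hypothetical example array, not output of the models below)
y_pred_log = np.array([13.0, 12.5, 13.8])
y_pred_dollars = np.expm1(y_pred_log)
print(y_pred_dollars)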

# Feature scaling
data = df_train.astype('float')
x = data.drop('price', axis=1)
y = data['price']
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
newX = scaler.fit_transform(x)
newX = pd.DataFrame(newX, columns=x.columns)
newX.head()
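If this pipeline is later applied to unseen data, the scaler fitted above should be reused with transform rather than re-fitted, so new rows are scaled with the training minima and maxima. A minimal sketch, assuming a hypothetical kc_test.csv with the same columns as the training file:

# Reuse the already-fitted MinMaxScaler on unseen rows instead of re-fitting it
# (kc_test.csv and its column layout are an assumption for illustration)
df_new = pd.read_csv(r"D:\Python\kc_test.csv", names=train_names)
x_new = df_new.astype('float').drop('price', axis=1)
x_new_scaled = pd.DataFrame(scaler.transform(x_new), columns=x.columns)
x_new_scaled.head()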
# Split the data into a training set and a test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(newX, y, test_size=0.2, random_state=21)
# Build the models
from sklearn import metrics

def RF(X_train, X_test, y_train, y_test):  # Random forest
    from sklearn.ensemble import RandomForestRegressor
    model = RandomForestRegressor(n_estimators=200, max_features=None)
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    mse = metrics.mean_squared_error(y_test, predicted)
    return (mse/10000)

def LR(X_train, X_test, y_train, y_test):  # Linear regression
    from sklearn.linear_model import LinearRegression
    LR = LinearRegression()
    LR.fit(X_train, y_train)
    predicted = LR.predict(X_test)
    mse = metrics.mean_squared_error(y_test, predicted)
    return (mse/10000)
print('RF mse: ', RF(X_train, X_test, y_train, y_test))
print('LR mse: ', LR(X_train, X_test, y_train, y_test))

    RF mse: 3.5241062705249253e-06
    LR mse: 7.1541094041807234e-06
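A single 80/20 split gives a somewhat noisy estimate of each model's error. A minimal sketch of a more robust comparison using 5-fold cross-validation (the cv=5 and random_state choices are assumptions, not part of the original notebook); the resulting scores are on the log1p price scale, like the MSE values above:

# 5-fold cross-validated MSE for the random forest on the scaled features
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=200, max_features=None, random_state=21)
scores = -cross_val_score(rf, newX, y, cv=5, scoring='neg_mean_squared_error')
print('RF 5-fold CV MSE: %.6f (+/- %.6f)' % (scores.mean(), scores.std()))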