# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestRegressor
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Walk the Kaggle input directory tree and print the full path of every file
# found. The loop bodies must be indented for this to be valid Python.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# Load the training data and peek at the first rows.
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
train.head()

# Use a hand-picked subset of columns as features; SalePrice is the target.
predict_data = [
    'LotArea',
    'OverallQual',
    'YearBuilt',
    'YearRemodAdd',
    'GrLivArea',
    'TotRmsAbvGrd',
]
train_X = train[predict_data]
train_y = train['SalePrice']

# Fit a random forest regressor with the library's default settings.
model = RandomForestRegressor()
model.fit(X=train_X, y=train_y)

# Load the test data and predict using the same feature columns.
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
test_X = test[predict_data]
predict_price = model.predict(test_X)
print(predict_price)

# Assemble the predictions into an Id/SalePrice table and write it out
# (without the index column) as the submission file.
my_submission = pd.DataFrame({'Id': test['Id'], 'SalePrice': predict_price})
my_submission.to_csv('submission.csv', index=False)
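# After submitting, check the score.
# The score is not very good (0 is the best possible score).
# It is still worth consulting an expert's code.
# The expert's code: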
#invite people for the Kaggle party
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Load the training set.
# NOTE(review): this relative path differs from the /kaggle/input/... path used
# earlier in the file; confirm which location applies in your environment.
df_train = pd.read_csv('./input/train.csv')
# Inspect the available column names.
df_train.columns

# Outlier check on SalePrice: standardize the column and look at both tails.
# Per the original author's note, the low-range values are all similar, while
# the high-range values are spread far apart; the two largest (around 7)
# deserve particular attention.
#
# StandardScaler expects a 2-D input, so the column is reshaped to (n, 1).
# The values are taken as a NumPy array first: indexing a pandas Series with
# [:, np.newaxis] is not supported by recent pandas releases.
saleprice_scaled = StandardScaler().fit_transform(
    df_train['SalePrice'].to_numpy()[:, np.newaxis]
)
# Sort once by the single column and reuse the result for both tails.
saleprice_sorted = saleprice_scaled[saleprice_scaled[:, 0].argsort()]
low_range = saleprice_sorted[:10]
high_range = saleprice_sorted[-10:]
print('outer range (low) of the distribution:')
print(low_range)
print('\nouter range (high) of the distribution:')
print(high_range)
# Scatter plot of GrLivArea against SalePrice.
# Original author's observations: two points have a very large GrLivArea but
# a low SalePrice; they should be treated as outliers and removed. The two
# points at the top look like outliers too, but they follow the overall trend
# and should be kept.
var = 'GrLivArea'
price_and_feature = [df_train['SalePrice'], df_train[var]]
data = pd.concat(price_and_feature, axis=1)
# The trailing semicolon keeps a notebook from echoing the returned Axes.
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000));