# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Load scikit-learn's random forest classifier
from sklearn.ensemble import RandomForestClassifier
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('./input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# Preview the first few rows of train.csv
train_data = pd.read_csv("./input/train.csv")
train_data.head()
# Preview the first few rows of test.csv
test_data = pd.read_csv("./input/test.csv")
test_data.head()
# Get the passengers' survival outcome as the target
y = train_data['Survived']
# Get the other passenger attributes used as features
features = ['Pclass', 'Sex', 'SibSp', 'Parch']
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])
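# A small defensive step (an illustrative sketch, not part of the original notebook):
# pd.get_dummies applied to train and test separately can produce different columns
# if a category value appears in only one of the two frames. Reindexing the test
# matrix to the training columns keeps them aligned; fill_value=0 covers any dummy
# column that is missing on the test side.
X_test = X_test.reindex(columns=X.columns, fill_value=0)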
# Create the random forest model
# n_estimators: number of decision trees in the model
# max_depth: maximum depth of each decision tree
# random_state: seed used by the random number generator
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
# Train the model
model.fit(X, y)
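# Optional sanity check (a sketch, not in the original notebook): this first part of
# the notebook never measures accuracy, so a quick 5-fold cross-validation gives a
# rough estimate of how the model might generalise. cross_val_score is imported here
# because it is not imported earlier in this part.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print("CV accuracy: %.3f +/- %.3f" % (cv_scores.mean(), cv_scores.std()))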
# Predict survival for the test passengers
predictions = model.predict(X_test)
# Save the predictions as a CSV file
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
print(output)
output.to_csv("./output/submission.csv", index=False)
print("Data saved!")
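# Quick verification (a sketch, assuming the path above is writable): read the
# submission back and confirm it has one row per test passenger and exactly the
# PassengerId and Survived columns that Kaggle expects.
check = pd.read_csv("./output/submission.csv")
print(check.shape)
print(check.columns.tolist())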
#######################################################################
# Second approach: clean the data more thoroughly and compare several
# classifiers before producing a submission.
#######################################################################
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
train_path = "./input/train.csv"
test_path = "./input/test.csv"
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
# Some info about the data
train.info()
print('--------')
print('Percentage of NA per property sorted')
print('--------')
p = (train.isna().sum()/len(train)*100).sort_values(ascending=False)
print(p)
print('--------')
print('Unique values for duplications and other useful info')
print('--------')
u = train.nunique().sort_values()
print(u)
# Check the Embarked values so they can be replaced manually below
train['Embarked'].value_counts()
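# A quick visual check (an illustrative sketch; the seaborn import above is not
# otherwise used in this notebook): survival counts split by passenger class.
# matplotlib is assumed to be available, as it usually is alongside seaborn.
import matplotlib.pyplot as plt
sns.countplot(data=train, x='Pclass', hue='Survived')
plt.show()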
#######################################################################
# Handle the missing information in the data.
# Parameters:
#     data: a DataFrame with some missing values
#
# Returns:
#     data: the DataFrame after cleaning
#######################################################################
def cleanData(data):
    # Some of these fields are missing too many values to fill in, so drop them
    data.drop(['Cabin', 'Name', 'Ticket'], axis=1, inplace=True)
    # Fill missing ages with the median age of the same Pclass/Sex group
    data['Age'] = data.groupby(['Pclass', 'Sex'])['Age'].transform(lambda x: x.fillna(x.median()))
    # Fill the missing fare in the test set the same way
    data['Fare'] = data.groupby(['Pclass', 'Sex'])['Fare'].transform(lambda x: x.fillna(x.median()))
    # Embarked is rarely missing, but the field is not very informative, so drop those rows
    data.dropna(axis=0, subset=['Embarked'], inplace=True)
    # Encode the categorical columns as integers
    # Sex
    data['Sex'].replace({'male': 0, 'female': 1}, inplace=True)
    # Embarked
    data['Embarked'].replace({'S': 0, 'C': 1, 'Q': 2}, inplace=True)
    return data
clean_train = cleanData(train)
clean_test = cleanData(test)
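# Sanity check (a sketch, not in the original notebook): after cleaning there should
# be no missing values left in the columns used for modelling.
print("Remaining NAs in train:", clean_train.isna().sum().sum())
print("Remaining NAs in test:", clean_test.isna().sum().sum())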
# Set X and y (cleanData works in place, so clean_train is the same frame as train)
y = clean_train['Survived']
X = pd.get_dummies(clean_train.drop('Survived', axis=1))
# Split off a validation set from the training data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
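# The split above does not stratify on y. As a small sketch of why that can matter,
# compare the survival rate in the two pieces; passing stratify=y to train_test_split
# would keep the two rates closer together.
print("Survival rate in train split:", round(y_train.mean(), 3))
print("Survival rate in validation split:", round(y_val.mean(), 3))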
#######################################################################
# Train a model and return its accuracy on the validation set.
# Parameters:
#     model: the selected model
#
# Returns:
#     accuracy_score(y_val, prediction): the model's accuracy
#######################################################################
def fitAndPredict(model):
    model.fit(X_train, y_train)
    prediction = model.predict(X_val)
    return accuracy_score(y_val, prediction)
# Logistic regression
model1 = LogisticRegression(solver='liblinear', random_state=42)
# Gradient boosted decision trees
model2 = GradientBoostingClassifier()
# Random forest
model3 = RandomForestClassifier()
# Stochastic gradient descent classifier
model4 = SGDClassifier()
# C-support vector classifier
model5 = SVC()
models = [model1, model2, model3, model4, model5]
i = 0
# Train each of the selected models and print its validation accuracy
for model in models:
    i += 1
    print("Model ", i, ":", model)
    print("ACC: ", fitAndPredict(model))
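# A steadier comparison (a sketch using the cross_val_score import above, which the
# original notebook never calls): a single 80/20 split can be noisy, so score each
# model with 5-fold cross-validation on the full training matrix as well.
for i, model in enumerate(models, start=1):
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print("Model", i, "CV ACC: %.3f +/- %.3f" % (scores.mean(), scores.std()))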
# Gradient boosted decision trees were one of the better-performing models, so set a few hyperparameters and train again
model = GradientBoostingClassifier(min_samples_split=20, min_samples_leaf=60, max_depth=3, max_features=7)
print("ACC: ", fitAndPredict(model))
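# The hyperparameters above were chosen by hand; a small grid search (a sketch, not
# part of the original notebook, and the grid below is illustrative rather than tuned)
# is one way to pick them more systematically.
from sklearn.model_selection import GridSearchCV
param_grid = {
    'max_depth': [3, 4, 5],
    'min_samples_leaf': [20, 40, 60],
}
search = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=5, scoring='accuracy')
search.fit(X_train, y_train)
print("Best params:", search.best_params_, "CV ACC: %.3f" % search.best_score_)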
# Use the trained model to predict on the cleaned test data
predict = model.predict(pd.get_dummies(clean_test))
output = pd.DataFrame({'PassengerId': clean_test.PassengerId, 'Survived': predict})
print(output)
output.to_csv('my_submission.csv', index=False)
print("Submission saved")