26 Apr 2021 18572字 62分
CC BY 4.0 （除特别声明或转载文章外）
如果这篇博客帮助到你，可以请我喝一杯咖啡~

#导入相关包
import warnings 
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
#设置sns样式
sns.set(style='white',context='notebook',palette='muted')
import matplotlib.pyplot as plt
#导入数据
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

#理解数据
print('实验数据大小:',train.shape)
print('预测数据大小:',test.shape)

实验数据大小: (891, 12)
预测数据大小: (418, 11)

# Stack the training and prediction sets so that missing / abnormal values
# are inspected (and later repaired) on the complete data.
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# and works on every pandas version.
full = pd.concat([train, test], ignore_index=True)
full.describe()

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	1309.000000	891.000000	1309.000000	1046.000000	1309.000000	1309.000000	1308.000000
mean	655.000000	0.383838	2.294882	29.881138	0.498854	0.385027	33.295479
std	378.020061	0.486592	0.837836	14.413493	1.041658	0.865560	51.758668
min	1.000000	0.000000	1.000000	0.170000	0.000000	0.000000	0.000000
25%	328.000000	0.000000	2.000000	21.000000	0.000000	0.000000	7.895800
50%	655.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	982.000000	1.000000	3.000000	39.000000	1.000000	0.000000	31.275000
max	1309.000000	1.000000	3.000000	80.000000	8.000000	9.000000	512.329200

full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB

# Age Fare Cabin Embarked 均有缺失

查看特征与标签间的关系

3.2.1 Embarked与Survived：法国登船的乘客生存率较高

sns.barplot(data=train,x='Embarked',y='Survived')

<matplotlib.axes._subplots.AxesSubplot at 0x194d2300250>

png

# Survival rate of the passengers of each embarkation port.
# The mean of the 0/1 Survived label is the survival rate. Rows without a
# label (Survived is NaN) are skipped by mean(), and, unlike
# value_counts(normalize=True)[1], it cannot raise KeyError when a group
# contains no survivor.
for port in ('S', 'C', 'Q'):
    survivalRate = full.loc[full['Embarked'] == port, 'Survived'].mean()
    print('Embarked为"%s"的乘客，其生存率为%.2f' % (port, survivalRate))

Embarked为"S"的乘客，其生存率为0.34
Embarked为"C"的乘客，其生存率为0.55
Embarked为"Q"的乘客，其生存率为0.39

# The higher survival rate of passengers who embarked in France may be related
# to a larger share of first-class passengers, so plot the passenger count per
# cabin class for each embarkation port.
# factorplot / size are the deprecated spellings of catplot / height.
sns.catplot(x='Pclass', col='Embarked', data=train, kind='count', height=3)

<seaborn.axisgrid.FacetGrid at 0x194d239b190>

png

3.2.2 Parch与Survived：当乘客同行的父母及子女数量适中时，生存率较高

sns.barplot(data=train,x='Parch',y='Survived')

<matplotlib.axes._subplots.AxesSubplot at 0x194d24b6bb0>

png

3.2.3 SibSp与Survived：当乘客同行的同辈数量适中时生存率较高

sns.barplot(data=train,x='SibSp',y='Survived')

<matplotlib.axes._subplots.AxesSubplot at 0x194d24e6c10>

png

3.2.4 Pclass与Survived：乘客客舱等级越高，生存率越高

sns.barplot(data=train,x='Pclass',y='Survived')

<matplotlib.axes._subplots.AxesSubplot at 0x194d259bdc0>

png

3.2.5 Sex与Survived：女性的生存率远高于男性

sns.barplot(data=train,x='Sex',y='Survived')

<matplotlib.axes._subplots.AxesSubplot at 0x194d260d100>

png

3.2.6 Age与Survived：当乘客年龄段在0-10岁期间时生存率会较高

#创建坐标轴
ageFacet=sns.FacetGrid(train,hue='Survived',aspect=3)
#作图，选择图形类型
ageFacet.map(sns.kdeplot,'Age',shade=True)
#其他信息：坐标轴范围、标签等
ageFacet.set(xlim=(0,train['Age'].max()))
ageFacet.add_legend()

<seaborn.axisgrid.FacetGrid at 0x194d26558b0>

png

3.2.7 Fare与Survived：当票价低于18左右时乘客生存率较低，票价越高生存率一般越高

ageFacet=sns.FacetGrid(train,hue='Survived',aspect=3)
ageFacet.map(sns.kdeplot,'Fare',shade=True)
ageFacet.set(xlim=(0,150))
ageFacet.add_legend()

<seaborn.axisgrid.FacetGrid at 0x194d26cb610>

png

查看票价的分布特征

farePlot=sns.distplot(full['Fare'][full['Fare'].notnull()],label='skewness:%.2f'%(full['Fare'].skew()))
farePlot.legend(loc='best')

<matplotlib.legend.Legend at 0x194d2720730>

png

fare的分布呈右偏（正偏）的形态，其偏度skewness=4.37较大，说明数据偏移平均值较多，因此我们需要对数据进行对数化处理，防止数据权重分布不均匀。

# Log-transform Fare to reduce its skew.
# Non-positive fares map to 0 because log is undefined there. A missing fare
# must stay NaN: NaN > 0 is False, so sending every "else" case to 0 would
# silently turn the missing value into a fare of 0 and leave the later
# fillna() step with nothing to fill.
full['Fare'] = full['Fare'].map(
    lambda x: np.log(x) if x > 0 else (x if pd.isnull(x) else 0)
)

4.数据预处理

4.1.1Cabin缺失值填充

full['Cabin']=full['Cabin'].fillna('U')
full['Cabin'].head()

0       U
1     C85
2       U
3    C123
4       U
Name: Cabin, dtype: object

4.1.2Embarked缺失值填充

#对Embarked缺失值进行处理，查看缺失值情况
full[full['Embarked'].isnull()]

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
61	62	1.0	1	Icard, Miss. Amelie	female	38.0	0	0	113572	4.382027	B28	NaN
829	830	1.0	1	Stone, Mrs. George Nelson (Martha Evelyn)	female	62.0	0	0	113572	4.382027	B28	NaN

full['Embarked'].value_counts()
full['Embarked']=full['Embarked'].fillna('S')

4.1.3Fare缺失值填充

full[full['Fare'].isnull()]

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked

full['Fare']=full['Fare'].fillna(full[(full['Pclass']==3)&(full['Embarked']=='S')&(full['Cabin']=='U')]['Fare'].mean())

4.2特征工程

4.2.1 Name中的头衔信息-Title

full['Title']=full['Name'].map(lambda x:x.split(',')[1].split('.')[0].strip())
#查看title数据分布
full['Title'].value_counts()

Mr              757
Miss            260
Mrs             197
Master           61
Rev               8
Dr                8
Col               4
Mlle              2
Major             2
Ms                2
the Countess      1
Dona              1
Jonkheer          1
Lady              1
Capt              1
Don               1
Sir               1
Mme               1
Name: Title, dtype: int64

# Map every raw title extracted from Name onto one of six groups:
# Mr, Miss, Mrs, Master, Royalty, Officer.
TitleDict = {
    'Mr': 'Mr',
    'Mlle': 'Miss',
    'Miss': 'Miss',
    'Master': 'Master',
    'Jonkheer': 'Master',
    'Mme': 'Mrs',
    'Ms': 'Mrs',
    'Mrs': 'Mrs',
    'Don': 'Royalty',
    'Sir': 'Royalty',
    'the Countess': 'Royalty',
    'Dona': 'Royalty',
    'Lady': 'Royalty',
    'Capt': 'Officer',
    'Col': 'Officer',
    'Major': 'Officer',
    'Dr': 'Officer',
    'Rev': 'Officer',
}

full['Title']=full['Title'].map(TitleDict)
full['Title'].value_counts()

Mr         757
Miss       262
Mrs        200
Master      62
Officer     23
Royalty      5
Name: Title, dtype: int64

sns.barplot(data=full,x='Title',y='Survived')

<matplotlib.axes._subplots.AxesSubplot at 0x194d2400d60>

png

4.2.2 FamilyNum及FamilySize信息

full['familyNum']=full['Parch']+full['SibSp']+1
#查看familyNum与Survived
sns.barplot(data=full,x='familyNum',y='Survived')

<matplotlib.axes._subplots.AxesSubplot at 0x194d27af580>

png

def familysize(familyNum):
    """Bucket a family head-count into a size category.

    Args:
        familyNum: number of family members aboard, the passenger included
            (computed as Parch + SibSp + 1).

    Returns:
        0 when the passenger travels alone (familyNum == 1),
        1 for 2 to 4 members,
        2 for every other value.
    """
    if familyNum == 1:
        return 0
    # Chained comparison instead of bitwise `&` between two scalar booleans.
    if 2 <= familyNum <= 4:
        return 1
    return 2

full['familySize']=full['familyNum'].map(familysize)
full['familySize'].value_counts()

0    790
1    437
2     82
Name: familySize, dtype: int64

sns.barplot(data=full,x='familySize',y='Survived')

<matplotlib.axes._subplots.AxesSubplot at 0x194d28d4be0>

png

4.2.3 Cabin客舱类型信息-Deck

full['Deck']=full['Cabin'].map(lambda x:x[0])
#查看不同Deck类型乘客的生存率
sns.barplot(data=full,x='Deck',y='Survived')

<matplotlib.axes._subplots.AxesSubplot at 0x194d2929610>

png

4.2.4 共票号乘客数量TickCot及TickGroup

TickCountDict={}
TickCountDict=full['Ticket'].value_counts()
TickCountDict.head()

CA. 2343        11
1601             8
CA 2144          8
3101295          7
S.O.C. 14879     7
Name: Ticket, dtype: int64

#将同票号乘客数量数据并入数据集中
full['TickCot']=full['Ticket'].map(TickCountDict)
full['TickCot'].head()

0    1
1    2
2    1
3    2
4    1
Name: TickCot, dtype: int64

#查看TickCot与Survived之间关系
sns.barplot(data=full,x='TickCot',y='Survived')

<matplotlib.axes._subplots.AxesSubplot at 0x194d2998c70>

png

#按照TickCot大小，将TickGroup分为三类。
def TickCountGroup(num):
    """Bucket the number of passengers sharing one ticket into three groups.

    Args:
        num: how many passengers hold the same ticket number.

    Returns:
        0 when 2 to 4 passengers share the ticket,
        1 when the ticket is held by a single passenger or by 5 to 8,
        2 for every other value.
    """
    # Boolean `or` and chained comparisons instead of bitwise `&` / `|`
    # between scalar booleans.
    if 2 <= num <= 4:
        return 0
    if num == 1 or 5 <= num <= 8:
        return 1
    return 2
#得到各位乘客TickGroup的类别
full['TickGroup']=full['TickCot'].map(TickCountGroup)
#查看TickGroup与Survived之间关系
sns.barplot(data=full,x='TickGroup',y='Survived')

<matplotlib.axes._subplots.AxesSubplot at 0x194d2a16520>

png

4.2.5Age缺失值填充-构建随机森林模型预测缺失的数据

full[full['Age'].isnull()].head()

	PassengerId	Survived	Pclass	Name	Sex	Age	Ticket	Fare	Cabin	Embarked	Title	familyNum	Deck	TickCot	TickGroup
5	6	0.0	3	Moran, Mr. James	male	NaN	330877	2.135148	U	Q	Mr	1	U	1	1
17	18	1.0	2	Williams, Mr. Charles Eugene	male	NaN	244373	2.564949	U	S	Mr	1	U	1	1
19	20	1.0	3	Masselmani, Mrs. Fatima	female	NaN	2649	1.977547	U	C	Mrs	1	U	1	1
26	27	0.0	3	Emir, Mr. Farred Chehab	male	NaN	2631	1.977547	U	C	Mr	1	U	1	1
28	29	1.0	3	O'Dwyer, Miss. Ellen "Nellie"	female	NaN	330959	2.064226	U	Q	Miss	1	U	1	1

# Columns used as inputs (plus the Age target) for the age-imputation model
ageFeatureCols = ['Age', 'Parch', 'Pclass', 'SibSp', 'Title', 'familyNum', 'TickCot']
# One-hot encode the selected frame (Title is the string column among them)
AgePre = pd.get_dummies(full[ageFeatureCols])
# Separate indicator columns for the integer-coded Parch, SibSp and Pclass
ParAge, SibAge, PclAge = (
    pd.get_dummies(AgePre[col], prefix=col)
    for col in ('Parch', 'SibSp', 'Pclass')
)
# Correlation of every column with Age, computed before the indicator
# columns are appended
AgeCorrDf = AgePre.corr()
AgeCorrDf['Age'].sort_values()

# Append the indicator columns to the feature frame
AgePre = pd.concat([AgePre, ParAge, SibAge, PclAge], axis=1)
AgePre.head()

	Age	Pclass	SibSp	familyNum	TickCot	Title_Miss	Title_Mr	Title_Mrs	...	SibSp_0	SibSp_1	Pclass_1	Pclass_3
0	22.0	3	1	2	1	0	1	0	...	0	1	0	1
1	38.0	1	1	2	2	0	0	1	...	0	1	1	0
2	26.0	3	0	1	1	1	0	0	...	1	0	0	1
3	35.0	1	1	2	2	0	0	1	...	0	1	1	0
4	35.0	3	0	1	1	0	1	0	...	1	0	0	1

5 rows × 30 columns

#拆分实验集和预测集
AgeKnown=AgePre[AgePre['Age'].notnull()]
AgeUnKnown=AgePre[AgePre['Age'].isnull()]

#生成实验数据的特征和标签
AgeKnown_X=AgeKnown.drop(['Age'],axis=1)
AgeKnown_y=AgeKnown['Age']
#生成预测数据的特征
AgeUnKnown_X=AgeUnKnown.drop(['Age'],axis=1)

#利用随机森林构建模型
from sklearn.ensemble import RandomForestRegressor
rfr=RandomForestRegressor(random_state=None,n_estimators=500,n_jobs=-1)
rfr.fit(AgeKnown_X,AgeKnown_y)

RandomForestRegressor(n_estimators=500, n_jobs=-1)

3、利用模型进行预测并填入原数据集中

rfr.score(AgeKnown_X,AgeKnown_y)

0.5875085143761927

#预测年龄
AgeUnKnown_y=rfr.predict(AgeUnKnown_X)
#填充预测数据
full.loc[full['Age'].isnull(),['Age']]=AgeUnKnown_y
full.info()  #此时已无缺失值

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1309 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1309 non-null   float64
 10  Cabin        1309 non-null   object 
 11  Embarked     1309 non-null   object 
 12  Title        1309 non-null   object 
 13  familyNum    1309 non-null   int64  
 14  familySize   1309 non-null   int64  
 15  Deck         1309 non-null   object 
 16  TickCot      1309 non-null   int64  
 17  TickGroup    1309 non-null   int64  
dtypes: float64(3), int64(8), object(7)
memory usage: 184.2+ KB

4.3同组识别

#提取乘客的姓氏及相应的乘客数
full['Surname']=full['Name'].map(lambda x:x.split(',')[0].strip())
SurNameDict={}
SurNameDict=full['Surname'].value_counts()
full['SurnameNum']=full['Surname'].map(SurNameDict)

#将数据分为两组
MaleDf=full[(full['Sex']=='male')&(full['Age']>12)&(full['familyNum']>=2)]
FemChildDf=full[((full['Sex']=='female')|(full['Age']<=12))&(full['familyNum']>=2)]

#分析男性同组效应
MSurNamDf=MaleDf['Survived'].groupby(MaleDf['Surname']).mean()
MSurNamDf.head()
MSurNamDf.value_counts()

0.0    89
1.0    19
0.5     3
Name: Survived, dtype: int64

MSurNamDict={}
MSurNamDict=MSurNamDf[MSurNamDf.values==1].index
MSurNamDict

Index(['Beane', 'Beckwith', 'Bishop', 'Cardeza', 'Chambers', 'Dick',
       'Duff Gordon', 'Frauenthal', 'Frolicher-Stehli', 'Goldenberg',
       'Greenfield', 'Harder', 'Hoyt', 'Kimball', 'Lindqvist', 'McCoy',
       'Nakid', 'Persson', 'Taylor'],
      dtype='object', name='Surname')

FCSurNamDf=FemChildDf['Survived'].groupby(FemChildDf['Surname']).mean()
FCSurNamDf.head()
FCSurNamDf.value_counts()

1.000000    115
0.000000     27
0.750000      2
0.333333      1
0.142857      1
Name: Survived, dtype: int64

FCSurNamDict={}
FCSurNamDict=FCSurNamDf[FCSurNamDf.values==0].index
FCSurNamDict

Index(['Ahlin', 'Arnold-Franchi', 'Barbara', 'Boulos', 'Bourke', 'Caram',
       'Danbom', 'Ford', 'Goodwin', 'Ilmakangas', 'Johnston', 'Jussila',
       'Lahtinen', 'Lefebre', 'Lobb', 'Palsson', 'Panula', 'Rice', 'Robins',
       'Rosblom', 'Sage', 'Skoog', 'Strom', 'Turpin', 'Van Impe',
       'Vander Planke', 'Zabour'],
      dtype='object', name='Surname')

# Rows of the prediction set (no Survived label)
isPredictRow = full['Survived'].isnull()

# Males of the prediction set whose surname is in MSurNamDict:
# 1. age set to 5; 2. sex set to female.
maleFixMask = isPredictRow & full['Surname'].isin(MSurNamDict) & (full['Sex'] == 'male')
full.loc[maleFixMask, 'Age'] = 5
full.loc[maleFixMask, 'Sex'] = 'female'

# Females and children (Age <= 12) of the prediction set whose surname is in
# FCSurNamDict: 1. sex set to male; 2. age set to 60.
# The mask is built after the correction above has been applied.
isFemaleOrChild = (full['Sex'] == 'female') | (full['Age'] <= 12)
femChildFixMask = isPredictRow & full['Surname'].isin(FCSurNamDict) & isFemaleOrChild
full.loc[femChildFixMask, 'Age'] = 60
full.loc[femChildFixMask, 'Sex'] = 'male'

4.4筛选子集

# Manual selection: drop the columns that are not used as features
fullSel = full.drop(['Cabin', 'Name', 'Ticket', 'PassengerId', 'Surname', 'SurnameNum'], axis=1)
# Correlation of every numeric feature with the label.
# fullSel still holds string columns (Sex, Embarked, Title, Deck). Older
# pandas dropped them silently inside corr(), pandas >= 2.0 raises instead,
# so the numeric columns are selected explicitly.
corrDf = fullSel.select_dtypes(include='number').corr()
corrDf['Survived'].sort_values(ascending=True)

Pclass       -0.338481
TickGroup    -0.319278
Age          -0.059792
SibSp        -0.035322
familyNum     0.016639
TickCot       0.064962
Parch         0.081629
familySize    0.108631
Fare          0.331805
Survived      1.000000
Name: Survived, dtype: float64

# Heat map of the correlations between Survived and the other features.
# The string columns in the selection (Embarked, Sex, Title, Deck) cannot be
# correlated; they are filtered out explicitly because corr() raises on them
# in pandas >= 2.0 instead of dropping them silently.
plt.figure(figsize=(8, 8))
sns.heatmap(
    fullSel[['Survived', 'Age', 'Embarked', 'Fare', 'Parch', 'Pclass',
             'Sex', 'SibSp', 'Title', 'familyNum', 'familySize', 'Deck',
             'TickCot', 'TickGroup']].select_dtypes(include='number').corr(),
    cmap='BrBG', annot=True, linewidths=.5,
)
plt.xticks(rotation=45)

(array([0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5]),
 <a list of 10 Text major ticklabel objects>)

png

fullSel=fullSel.drop(['familyNum','SibSp','TickCot','Parch'],axis=1)
#one-hot编码
fullSel=pd.get_dummies(fullSel)
PclassDf=pd.get_dummies(full['Pclass'],prefix='Pclass')
TickGroupDf=pd.get_dummies(full['TickGroup'],prefix='TickGroup')
familySizeDf=pd.get_dummies(full['familySize'],prefix='familySize')

fullSel=pd.concat([fullSel,PclassDf,TickGroupDf,familySizeDf],axis=1)

5.构建模型

本文先比较了SVC/Decision Tree/Gradient Boosting/Neural network/KNN/Logistic Regression等多种机器学习算法的结果，并对表现较好的Gradient Boosting及Logistic Regression做进一步的对比，最终选择Gradient Boosting对乘客生存率进行预测。

5.1模型选择

主要考虑使用以下常用的机器学习算法进行比较：

SVC
Decision Tree
Extra Trees
Gradient Boosting
Random Forest
KNN
Logistic Regression
Linear Discriminant Analysis

#拆分实验数据与预测数据
experData=fullSel[fullSel['Survived'].notnull()]
preData=fullSel[fullSel['Survived'].isnull()]

experData_X=experData.drop('Survived',axis=1)
experData_y=experData['Survived']
preData_X=preData.drop('Survived',axis=1)

#导入机器学习算法库
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold

#设置kfold，交叉采样法拆分数据集
kfold=StratifiedKFold(n_splits=10)

# Candidate models, all with default hyper-parameters. The order matters:
# results are labelled by position when they are summarised.
classifiers = [
    SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    ExtraTreesClassifier(),
    GradientBoostingClassifier(),
    KNeighborsClassifier(),
    LogisticRegression(),
    LinearDiscriminantAnalysis(),
]

5.1.2比较各种算法结果，进一步选择模型

# Cross-validated accuracy of every candidate model (one score array per model)
cv_results = [
    cross_val_score(model, experData_X, experData_y,
                    scoring='accuracy', cv=kfold, n_jobs=-1)
    for model in classifiers
]

# Mean and standard deviation of the fold scores of each model
cv_means = [foldScores.mean() for foldScores in cv_results]
cv_std = [foldScores.std() for foldScores in cv_results]

# Summary table; the labels follow the order of `classifiers`
algorithmNames = ['SVC', 'DecisionTreeCla', 'RandomForestCla', 'ExtraTreesCla',
                  'GradientBoostingCla', 'KNN', 'LR', 'LinearDiscrimiAna']
cvResDf = pd.DataFrame({'cv_mean': cv_means,
                        'cv_std': cv_std,
                        'algorithm': algorithmNames})

cvResDf

	cv_mean	cv_std	algorithm
0	0.764419	0.050512	SVC
1	0.778976	0.058657	DecisionTreeCla
2	0.815980	0.043057	RandomForestCla
3	0.799139	0.045985	ExtraTreesCla
4	0.837291	0.041373	GradientBoostingCla
5	0.794657	0.038325	KNN
6	0.827191	0.031660	LR
7	0.823833	0.029460	LinearDiscrimiAna

# Bar chart of the mean CV accuracy per algorithm, best first, with the
# standard deviation of the fold scores as error bar.
# The error bars must come from the SAME sorted frame as the bars: passing
# the unsorted cv_std list would attach each deviation to the wrong algorithm.
cvResSorted = cvResDf.sort_values(by='cv_mean', ascending=False)
cvResFacet = sns.FacetGrid(cvResSorted, sharex=False, sharey=False, aspect=2)
cvResFacet.map(sns.barplot, 'cv_mean', 'algorithm',
               xerr=cvResSorted['cv_std'].values, palette='muted')
cvResFacet.set(xlim=(0.7, 0.9))
cvResFacet.add_legend()

<seaborn.axisgrid.FacetGrid at 0x194d5419880>

png

5.1.3模型调优

综合以上模型表现，考虑选择GradientBoostingCla、LR两种模型进一步对比。

分别建立GradientBoostingClassifier以及LogisticRegression模型，并进行模型调优。

# GradientBoostingClassifier: grid search over 72 parameter combinations.
# 'loss' is left at its default (the binomial deviance / log-loss): the
# literal 'deviance' is rejected by recent scikit-learn releases, while the
# default selects the same loss on every version.
GBC = GradientBoostingClassifier()
gb_param_grid = {'n_estimators': [100, 200, 300],
                 'learning_rate': [0.1, 0.05, 0.01],
                 'max_depth': [4, 8],
                 'min_samples_leaf': [100, 150],
                 'max_features': [0.3, 0.1]
                 }
modelgsGBC = GridSearchCV(GBC, param_grid=gb_param_grid, cv=kfold,
                          scoring="accuracy", n_jobs=-1, verbose=1)
modelgsGBC.fit(experData_X, experData_y)

# LogisticRegression: grid search over C and the penalty type.
# The default 'lbfgs' solver does not support the 'l1' penalty, so every
# 'l1' candidate would fail to fit (and, with warnings silenced, go unnoticed).
# 'liblinear' supports both 'l1' and 'l2'.
modelLR = LogisticRegression(solver='liblinear')
LR_param_grid = {'C': [1, 2, 3],
                 'penalty': ['l1', 'l2']}
modelgsLR = GridSearchCV(modelLR, param_grid=LR_param_grid, cv=kfold,
                         scoring="accuracy", n_jobs=-1, verbose=1)
modelgsLR.fit(experData_X, experData_y)

Fitting 10 folds for each of 72 candidates, totalling 720 fits

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 522 tasks      | elapsed:   27.9s
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed:   38.1s finished

Fitting 10 folds for each of 6 candidates, totalling 60 fits

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.6s finished

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
             estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': [1, 2, 3], 'penalty': ['l1', 'l2']},
             scoring='accuracy', verbose=1)

5.2 查看模型得分

1、查看模型准确度

#modelgsGBC模型
print('modelgsGBC模型得分为：%.3f'%modelgsGBC.best_score_)
#modelgsLR模型
print('modelgsLR模型得分为：%.3f'%modelgsLR.best_score_)

modelgsGBC模型得分为：0.842
modelgsLR模型得分为：0.828

# ROC curve of the tuned GradientBoostingClassifier.
# NOTE: this is evaluated on the data the model was fitted on, so the curve
# is optimistic.
# Hard 0/1 predictions; kept because the confusion matrix uses them.
modelgsGBCtestpre_y = modelgsGBC.predict(experData_X).astype(int)
# A ROC curve needs continuous scores: with hard 0/1 predictions roc_curve
# only has a single operating point and the "curve" degenerates to two
# segments. Use the predicted probability of the positive class instead.
modelgsGBCtestpre_proba = modelgsGBC.predict_proba(experData_X)[:, 1]

from sklearn.metrics import roc_curve, auc
# False-positive rate and true-positive rate at every threshold
fpr, tpr, threshold = roc_curve(experData_y, modelgsGBCtestpre_proba)
roc_auc = auc(fpr, tpr)  # area under the curve

lw = 2
# A single figure: an extra bare plt.figure() call would only leave an
# empty figure behind.
plt.figure(figsize=(10, 10))
# FPR on the x axis, TPR on the y axis
plt.plot(fpr, tpr, color='r',
         lw=lw, label='ROC curve (area = %0.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Titanic GradientBoostingClassifier Model')
plt.legend(loc="lower right")
plt.show()

<Figure size 432x288 with 0 Axes>

png

# ROC curve of the tuned LogisticRegression.
# NOTE: this is evaluated on the data the model was fitted on, so the curve
# is optimistic.
# Hard 0/1 predictions; kept because the confusion matrix uses them.
testpre_y = modelgsLR.predict(experData_X).astype(int)
# A ROC curve needs continuous scores: with hard 0/1 predictions roc_curve
# only has a single operating point and the "curve" degenerates to two
# segments. Use the predicted probability of the positive class instead.
testpre_proba = modelgsLR.predict_proba(experData_X)[:, 1]

from sklearn.metrics import roc_curve, auc
# False-positive rate and true-positive rate at every threshold
fpr, tpr, threshold = roc_curve(experData_y, testpre_proba)
roc_auc = auc(fpr, tpr)  # area under the curve

lw = 2
# A single figure: an extra bare plt.figure() call would only leave an
# empty figure behind.
plt.figure(figsize=(10, 10))
# FPR on the x axis, TPR on the y axis
plt.plot(fpr, tpr, color='r',
         lw=lw, label='ROC curve (area = %0.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Titanic LogisticRegression Model')
plt.legend(loc="lower right")
plt.show()

<Figure size 432x288 with 0 Axes>

png

from sklearn.metrics import confusion_matrix
# Confusion matrices of both tuned models on the labelled data
# (rows: true class, columns: predicted class).
print('GradientBoostingClassifier模型混淆矩阵为\n',confusion_matrix(experData_y.astype(int).astype(str),modelgsGBCtestpre_y.astype(str)))
# The second model is a LogisticRegression; the label previously said "LinearRegression".
print('LogisticRegression模型混淆矩阵为\n',confusion_matrix(experData_y.astype(int).astype(str),testpre_y.astype(str)))

GradientBoostingClassifier模型混淆矩阵为
 [[503  46]
 [ 76 266]]
LinearRegression模型混淆矩阵为
 [[477  72]
 [ 78 264]]

# Predict the unlabelled passengers with the tuned gradient-boosting model
# and cast the predictions to int
GBCpreData_y = modelgsGBC.predict(preData_X).astype(int)
# Result table: PassengerId of the rows without a Survived label, next to
# the predicted label
GBCpreResultDf = pd.DataFrame({
    'PassengerId': full.loc[full['Survived'].isnull(), 'PassengerId'],
    'Survived': GBCpreData_y,
})
GBCpreResultDf
# Write the predictions to a csv file (without the index column)
GBCpreResultDf.to_csv('TitanicGBSmodle.csv', index=False)

Kaggle Titanic生存预测