

[Tabular] Titanic: Machine Learning from Disaster

In this post we work through the Titanic problem: the most basic exercise for handling tabular data, and one of the first problems you encounter when learning machine learning and deep learning in general.

* This walkthrough is based on the corresponding Kaggle Competition.

 


Step 0. Import Packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

%matplotlib inline

from collections import Counter

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve

sns.set(style='white', context='notebook', palette='deep')

import warnings
warnings.filterwarnings("ignore")

Step 1. Data Load & Check

train_df = pd.read_csv("./Data/train.csv")
test_df = pd.read_csv("./Data/test.csv")

test_id = test_df["PassengerId"]
def detect_outliers(df, n, features):
    outlier_indices = []
    
    for col in features:
        # NB: np.percentile propagates NaN, so a column with missing values (e.g. Age)
        # gets NaN quartiles here and therefore contributes no outliers.
        Q1 = np.percentile(df[col], 25)
        Q3 = np.percentile(df[col], 75)
        IQR = Q3 - Q1
        
        outlier_step = 1.5 * IQR
        
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)].index
        outlier_indices.extend(outlier_list_col)
        
    outlier_indices = Counter(outlier_indices)
    
    multiple_outliers = list(k for k, v in outlier_indices.items() if v > n)
    
    return multiple_outliers
# Detect rows that are outliers in more than n numerical features (Tukey rule: outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]).

Outliers_to_drop = detect_outliers(train_df, 2, ["Age", "SibSp", "Parch", "Fare"])
train_df.loc[Outliers_to_drop]
train_df = train_df.drop(Outliers_to_drop, axis = 0).reset_index(drop = True)
train_len = len(train_df)
df = pd.concat([train_df, test_df], axis = 0).reset_index(drop = True)
df = df.fillna(np.nan)

df.isnull().sum()
# The 418 null values in Survived correspond to the test set, which has no labels, so they can be ignored.
train_df.info()

train_df.isnull().sum()
train_df.describe()

Step 2. Feature Analysis

g = sns.heatmap(train_df[["Survived", "SibSp", "Parch",
                         "Age", "Fare"]].corr(), annot = True, fmt = ".2f", cmap = "coolwarm")
g = sns.factorplot(x = "SibSp", y = "Survived", data = train_df , kind = "bar",
                  size = 6, palette = "muted")
g.despine(left = True)
g = g.set_ylabels("Survival Probability")
g = sns.factorplot(x = "Parch", y = "Survived", data = train_df, kind = "bar",
                  size = 6, palette = "muted")
g.despine(left = True)
g = g.set_ylabels("Survival Probability")
# Continuous numerical feature
g = sns.FacetGrid(train_df, col = "Survived")
g = g.map(sns.distplot, "Age")
g = sns.kdeplot(train_df["Age"][(train_df["Survived"] == 0) & (train_df["Age"].notnull())], color = "Red", shade = True)
g = sns.kdeplot(train_df["Age"][(train_df["Survived"] == 1) & (train_df["Age"].notnull())], ax = g, color = "Blue", shade = True)

g.set_xlabel("Age")
g.set_ylabel("Frequency")

g = g.legend(["Not Survived", "Survived"])
df["Fare"].isnull().sum()
plt.hist(df["Fare"], bins = 100)
df["Fare"] = df["Fare"].fillna(df["Fare"].median())
g = sns.distplot(df["Fare"], color = "m", label = "Skewness: %.2f"%(df["Fare"].skew()))
g = g.legend(loc = "best")
df["Fare"] = df["Fare"].map(lambda i: np.log(i) if i > 0 else 0)
g = sns.distplot(df["Fare"], color = "b", label = "Skewness: %.2f"%(df["Fare"].skew()))
g = g.legend(loc = "best")
g = sns.barplot(x = "Sex", y = "Survived", data = train_df)
g = g.set_ylabel("Survival Probability")
train_df[["Sex", "Survived"]].groupby("Sex", as_index = False).mean()
g = sns.factorplot(x = "Pclass", y = "Survived", data = train_df, kind = "bar", size = 6, palette = "muted")
g.despine(left = True)
g = g.set_ylabels("Survival Probability")
g = sns.factorplot(x = "Pclass", y = "Survived", hue = "Sex", data = train_df,
                  size = 6, kind = "bar", palette = "muted")
g.despine(left = True)
g = g.set_ylabels("Survival Probability")
df["Embarked"].isnull().sum()
df["Embarked"] = df["Embarked"].fillna("S")
g = sns.factorplot(x = "Embarked", y = "Survived", data = train_df, size = 6, kind = "bar", palette = "muted")
g.despine(left = True)
g = g.set_ylabels("Surival Probability")
g = sns.factorplot("Pclass", col = "Embarked", data = train_df, size = 6, kind = "count", palette = "muted")
g.despine(left = True)
g = g.set_ylabels("Count")
g = sns.factorplot(x = "Sex", y = "Age", data = df, kind = "box")
g = sns.factorplot(x = "Sex", y = "Age", hue = "Pclass", data = df, kind = "box")
g = sns.factorplot(x = "Parch", y = "Age", data = df, kind = "box")
g = sns.factorplot(x = "SibSp", y = "Age", data = df, kind = "box")
df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
g = sns.heatmap(df[["Age", "Sex", "SibSp", "Parch", "Pclass"]].corr(), cmap = "BrBG", annot = True)
index_NaN_age = list(df["Age"][df["Age"].isnull()].index)

for i in index_NaN_age:
    age_med = df["Age"].median()
    age_pred = df["Age"][((df["SibSp"] == df.iloc[i]["SibSp"]) & (df["Parch"] == df.iloc[i]["Parch"]) & (df["Pclass"] == df.iloc[i]["Pclass"]))].median()
    
    if not np.isnan(age_pred):
        df.loc[i, "Age"] = age_pred   # use .loc to avoid chained-assignment issues
    else:
        df.loc[i, "Age"] = age_med
g = sns.factorplot(x = "Survived", y = "Age", data = train_df, kind = "box")
g = sns.factorplot(x = "Survived", y = "Age", data = train_df, kind = "violin")

Step 3. Feature Engineering

display(df["Name"].head())
df_title = [i.split(",")[1].split(".")[0].strip() for i in df["Name"]]
df["Title"] = pd.Series(df_title)
display(df["Title"].head())

print(df["Title"].unique())
g = sns.countplot(x = "Title", data = df)
g = plt.setp(g.get_xticklabels(), rotation = 45)
df["Title"] = df["Title"].replace(["Lady", "the Countess", "Countess", "Capt", "Col", "Don",
                                  "Dr", "Major", "Rev", "Sir", "Jonkheer", "Dona"], "Rare")
df["Title"] = df["Title"].map({"Master": 0, "Miss": 1, "Ms": 1, "Mme": 1, "Mlle": 1, "Mrs": 1, "Mr": 2, "Rare": 3})
df["Title"] = df["Title"].astype(int)
g = sns.countplot(df["Title"])
g = g.set_xticklabels(["Master", "Miss/Ms/Mme/Mlle/Mrs", "Mr", "Rare"])
g = sns.factorplot(x = "Title", y = "Survived", data = df, kind = "bar")
g = g.set_xticklabels(["Master", "Miss-Mrs", "Mr", "Rare"])
g = g.set_ylabels("Survival Probability")
df.drop(labels = ["Name"], axis = 1, inplace = True)
df["Family Size"] = df["SibSp"] + df["Parch"] + 1
g = sns.factorplot(x = "Family Size", y = "Survived", data = df)
g = g.set_ylabels("Survival Probability")
df["Single"] = df["Family Size"].map(lambda s: 1 if s == 1 else 0)
df["SmallF"] = df["Family Size"].map(lambda s: 1 if s == 2 else 0)
df["MedF"] = df["Family Size"].map(lambda s: 1 if 3 <= s <= 4 else 0)
df["LargeF"] = df["Family Size"].map(lambda s: 1 if s>= 5 else 0)
g = sns.factorplot(x = "Single", y = "Survived", data = df, kind = "bar")
g = g.set_ylabels("Survival Probability")

g = sns.factorplot(x = "SmallF", y = "Survived", data = df, kind = "bar")
g = g.set_ylabels("Survival Probability")

g = sns.factorplot(x = "MedF", y = "Survived", data = df, kind = "bar")
g = g.set_ylabels("Survival Probability")

g = sns.factorplot(x = "LargeF", y = "Survived", data = df, kind = "bar")
g = g.set_ylabels("Survival Probability")
df = pd.get_dummies(df, columns = ["Title"])
df = pd.get_dummies(df, columns = ["Embarked"], prefix = "Em")
display(df["Cabin"].head())

print(df["Cabin"].describe())
print(df["Cabin"].isnull().sum())
df["Cabin"][df["Cabin"].notnull()].head()
df["Cabin"] = pd.Series([i[0] if not pd.isnull(i) else "X" for i in df["Cabin"]])
g = sns.countplot(df["Cabin"], order = ["A", "B", "C", "D", "E", "F", "G", "T", "X"])
g = sns.factorplot(y = "Survived", x = "Cabin", data = df, kind = "bar",
                  order = ["A", "B", "C", "D", "E", "F", "G", "T", "X"])
g = g.set_ylabels("Survival Probability")
df = pd.get_dummies(df, columns = ["Cabin"], prefix = "Cabin")
df[["Ticket"]].head()
Ticket = []

for i in list(df.Ticket):
    if not i.isdigit():
        Ticket.append(i.replace(".", "").replace("/", "").strip().split(" ")[0])
    else:
        Ticket.append("X")
df["Ticket"] = Ticket

display(df["Ticket"].head())
df = pd.get_dummies(df, columns = ["Ticket"], prefix = "T")
df["Pclass"] = df["Pclass"].astype("category")
df = pd.get_dummies(df, columns = ["Pclass"], prefix = "Pc")
df.drop(labels = ["PassengerId"], axis = 1, inplace = True)
display(df.head())
print(df.columns)

Step 4. Modeling

train_df = df[:train_len]
test_df = df[train_len:]

test_df.drop(labels = ["Survived"], axis = 1, inplace = True)
train_df["Survived"] = train_df["Survived"].astype(int)

y_train = train_df["Survived"]
X_train = train_df.drop(labels = ["Survived"], axis = 1)
X_test = test_df
kfold = StratifiedKFold(n_splits = 10)
random_state = 2
classifiers = []

classifiers.append(SVC(random_state = random_state))
classifiers.append(DecisionTreeClassifier(random_state = random_state))
classifiers.append(AdaBoostClassifier(DecisionTreeClassifier(random_state = random_state),
                                      random_state = random_state, learning_rate = 0.1))
classifiers.append(RandomForestClassifier(random_state = random_state))
classifiers.append(ExtraTreesClassifier(random_state = random_state))
classifiers.append(GradientBoostingClassifier(random_state = random_state))
classifiers.append(MLPClassifier(random_state = random_state))
classifiers.append(KNeighborsClassifier())
classifiers.append(LogisticRegression(random_state = random_state))
classifiers.append(LinearDiscriminantAnalysis())

cv_results = [] # cv stands for cross-validation
for classifier in classifiers:
    cv_results.append(cross_val_score(classifier, X_train, y_train, scoring = "accuracy", cv = kfold))
    
cv_means = []
cv_std = []
for cv_result in cv_results:
    cv_means.append(cv_result.mean())
    cv_std.append(cv_result.std())
    
cv_res = pd.DataFrame({"CrossValMeans": cv_means, "CrossValErrors": cv_std,
                       "Algorithm": ["SVC", "DecisionTree", "AdaBoost", "RandomForest", "ExtraTrees", "GradientBoosting", 
                                     "MultipleLayerPerceptron", "KNeighbors", "LogisticRegression", "LinearDiscriminantAnalysis"]})
plt.figure(figsize = (16, 10))

sns.barplot("CrossValMeans", "Algorithm", data = cv_res,
           palette = "Set3", orient = "h", **{"xerr": cv_std})
plt.xlabel("Mean Accuracy")
plt.title("Cross validation scores")
DTC = DecisionTreeClassifier()
adaDTC = AdaBoostClassifier(DTC, random_state = 7)

ada_param_grid = {"base_estimator__criterion": ["gini", "entropy"],
                 "base_estimator__splitter": ["best", "random"],
                 "algorithm": ["SAMME", "SAMME.R"], "n_estimators": [1, 2],
                 "learning_rate": [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3, 1.5]}

gsadaDTC = GridSearchCV(adaDTC, param_grid = ada_param_grid, cv = kfold,
                       scoring = "accuracy", verbose = 1)

gsadaDTC.fit(X_train, y_train)

ada_best = gsadaDTC.best_estimator_
ExtC = ExtraTreesClassifier()

ex_param_grid = {"max_depth": [None], "max_features": [1, 3, 10],
                "min_samples_split": [2, 3, 10], "min_samples_leaf": [1, 3, 10],
                "bootstrap": [False], "n_estimators": [100, 300], "criterion": ["gini"]}

gsExtC = GridSearchCV(ExtC, param_grid = ex_param_grid, cv = kfold, scoring = "accuracy", verbose = 1)

gsExtC.fit(X_train, y_train)

ExtC_best = gsExtC.best_estimator_
RFC = RandomForestClassifier()

rf_param_grid = {"max_depth": [None], "max_features": [1, 3, 10],
                "min_samples_split": [2, 3, 10], "min_samples_leaf": [1, 3, 10],
                "bootstrap": [False], "n_estimators": [100, 300], "criterion": ["gini"]}

gsRFC = GridSearchCV(RFC, param_grid = rf_param_grid, cv = kfold,
                    scoring = "accuracy", verbose = 1)

gsRFC.fit(X_train, y_train)

RFC_best = gsRFC.best_estimator_
GBC = GradientBoostingClassifier()

gb_param_grid = {"loss": ["deviance"], "n_estimators": [100, 200, 300],
                "learning_rate": [0.1, 0.05, 0.01], "max_depth": [4, 8],
                "min_samples_leaf": [100, 150], "max_features": [0.3, 0.1]}

gsGBC = GridSearchCV(GBC, param_grid = gb_param_grid, cv = kfold,
                    scoring = "accuracy", verbose = 1)

gsGBC.fit(X_train, y_train)

GBC_best = gsGBC.best_estimator_
SVMC = SVC(probability = True)

svc_param_grid = {"kernel": ["rbf"], "gamma": [0.001, 0.01, 0.1, 1],
                 "C": [1, 10, 50, 100, 200, 300, 1000]}

gsSVMC = GridSearchCV(SVMC, param_grid = svc_param_grid, cv = kfold,
                     scoring = "accuracy", verbose = 1)

gsSVMC.fit(X_train, y_train)

SVMC_best = gsSVMC.best_estimator_
def plot_learning_curve(estimator, title, X, y, ylim = None, cv = None, train_sizes = np.linspace(.1, 1.0, 5)):
    plt.figure(figsize = (10, 6))
    
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv = cv, train_sizes = train_sizes)
    train_scores_mean = np.mean(train_scores, axis = 1)
    train_scores_std = np.std(train_scores, axis = 1)
    test_scores_mean = np.mean(test_scores, axis = 1)
    test_scores_std = np.std(test_scores, axis = 1)
    
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                    train_scores_mean + train_scores_std, alpha = 0.1, color = "r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                    test_scores_mean + test_scores_std, alpha = 0.1, color = "g")
    plt.plot(train_sizes, train_scores_mean, "o-", color = "r", label = 'Training score')
    plt.plot(train_sizes, test_scores_mean, "o-", color = "g", label = "Cross Validation score")
    
    plt.legend(loc = "best")
    
    return plt
g = plot_learning_curve(gsRFC.best_estimator_, "RF learning curves", X_train, y_train, cv = kfold)
g = plot_learning_curve(gsExtC.best_estimator_, "ExtraTrees learning curves", X_train, y_train, cv = kfold)
g = plot_learning_curve(gsSVMC.best_estimator_,"SVC learning curves",X_train, y_train, cv=kfold)
g = plot_learning_curve(gsadaDTC.best_estimator_,"AdaBoost learning curves",X_train, y_train, cv=kfold)
g = plot_learning_curve(gsGBC.best_estimator_,"GradientBoosting learning curves",X_train, y_train, cv=kfold)
nrows = ncols = 2

fig, axes = plt.subplots(nrows = nrows , ncols = ncols, sharex = "all", figsize = (15, 15))

names_classifiers = [("AgaBoosting", ada_best), ("ExtraTrees", ExtC_best), ("RandomForest", RFC_best), ("GradientBoosting", GBC_best)]

nclassifier = 0
for row in range(nrows):
    for col in range(ncols):
        name = names_classifiers[nclassifier][0]
        classifier = names_classifiers[nclassifier][1]
        indices = np.argsort(classifier.feature_importances_)[::-1]
        
        g = sns.barplot(x = classifier.feature_importances_[indices],
                       y = X_train.columns[indices], orient = "h", ax = axes[row][col])
        g.set_xlabel("Relative importance", fontsize = 12)
        g.set_ylabel("Features", fontsize = 12)
        g.tick_params(labelsize = 9)
        g.set_title(name + " feature importance")
        
        nclassifier += 1
test_Survived_RFC = pd.Series(RFC_best.predict(X_test), name = "RFC")
test_Survived_ExtC = pd.Series(ExtC_best.predict(X_test), name = "ExtC")
test_Survived_SVMC = pd.Series(SVMC_best.predict(X_test), name = "SVC")
test_Survived_AdaC = pd.Series(ada_best.predict(X_test), name = "Ada")
test_Survived_GBC = pd.Series(GBC_best.predict(X_test), name = "GBC")
ensemble_results = pd.concat([test_Survived_RFC, test_Survived_ExtC,
                             test_Survived_AdaC, test_Survived_GBC, test_Survived_SVMC], axis = 1)

plt.figure(figsize = (10, 6))

sns.heatmap(ensemble_results.corr(), annot = True)

plt.show()
votingC = VotingClassifier(estimators = [("rfc", RFC_best), ("extc", ExtC_best),
                                        ("svc", SVMC_best), ("adac", ada_best), ("gbc", GBC_best)],
                          voting = "soft") # hard voting 과 soft voting 의 차이 알기

votingC = votingC.fit(X_train, y_train)
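For reference: hard voting takes a majority vote over the predicted class labels, while soft voting averages the predicted class probabilities, which is why the SVC above was created with probability = True. A minimal sketch of the hard-voting variant, reusing the same tuned estimators, would look like this:

# Hard voting: each classifier casts one vote for a label and the majority label wins,
# so predict_proba is not needed from the base estimators.
votingC_hard = VotingClassifier(estimators = [("rfc", RFC_best), ("extc", ExtC_best),
                                              ("svc", SVMC_best), ("adac", ada_best), ("gbc", GBC_best)],
                                voting = "hard")
votingC_hard = votingC_hard.fit(X_train, y_train)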
test_Survived = pd.Series(votingC.predict(X_test), name = "Survived")

results = pd.concat([test_id, test_Survived], axis = 1)
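Finally, the predictions can be written out as a CSV for submission to Kaggle (the file name below is just an example):

# Save the PassengerId / Survived pairs as the submission file.
results.to_csv("titanic_voting_submission.csv", index = False)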

 
