import warnings 

import pandas as pd
import numpy as npy
import matplotlib.pyplot as mpl
import seaborn as sn
from xgboost import XGBClassifier

print('All standard libraries available, setup successful.')

All standard libraries available, setup successful.

df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

df_train.info()
df_train.shape
display(df_train.head())
df_train.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

def preprocess(df):
    df = df.copy()

    def normalize_name(x):
            return " ".join([v.strip(",()[].\"'/") for v in x.split(" ")])
    def name_title(x):
        found_keyword = []
        keywords = {"Mr.", "Mrs.", "Miss.", "Rev.", "Master"}
        for keyword in keywords:
            if keyword in x:
                found_keyword.append(keyword)
                return found_keyword
        else: 
            return "NaN"
        
        
    def ticket_number(x):
        if x == "LINE":
            return "NaN"
        return x.split(" ")[-1]
    def ticket_item(x):
        items = x.split(" ")
        if len(items) == 1:
            return "NONE"
        return "_".join(items[0:-1])

    df["Title"] = df["Name"].apply(name_title)
    df["Name"] = df["Name"].apply(normalize_name)
    df["Ticket_number"] = df["Ticket"].apply(ticket_number)
    df["Ticket_item"] = df["Ticket"].apply(ticket_item)
    return df.drop("Ticket", axis=1)

preprocessed_train_df = preprocess(df_train)
preprocessed_serving_df = preprocess(df_test)
print("preprocessing completed") #Debugging phrase

preprocessing completed

preprocessed_train_df.info()
display(preprocessed_train_df.head())
display(preprocessed_train_df.tail())
preprocessed_serving_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PassengerId    891 non-null    int64  
 1   Survived       891 non-null    int64  
 2   Pclass         891 non-null    int64  
 3   Name           891 non-null    object 
 4   Sex            891 non-null    object 
 5   Age            714 non-null    float64
 6   SibSp          891 non-null    int64  
 7   Parch          891 non-null    int64  
 8   Fare           891 non-null    float64
 9   Cabin          204 non-null    object 
 10  Embarked       889 non-null    object 
 11  Title          891 non-null    object 
 12  Ticket_number  891 non-null    object 
 13  Ticket_item    891 non-null    object 
dtypes: float64(2), int64(5), object(7)
memory usage: 97.6+ KB

print('Correlation heatmap of numerical variables:')

#Only looking at the numeric or binary values to find outliers and correlations that are of interest for data cleaning and refinement.

#Select for columns with numeric values, those with string values will be omitted. 
num_df = preprocessed_train_df.drop("PassengerId", axis = 1).select_dtypes(include=[npy.number])

sn.color_palette("colorblind")

if num_df.shape[1] >= 4:
    mpl.figure(figsize=(10,8)) 
    sn.heatmap(num_df.corr(), annot=True, fmt='.2f', cmap='rocket')
    mpl.title('correlation heatmap of numeric features')
    mpl.tight_layout()
    mpl.show()
else:
    print('Not enough numeric features for correlation analysis')

Correlation heatmap of numerical variables:

display(preprocessed_train_df["Age"].describe())
sn.set_palette("colorblind")
sn.catplot(data = preprocessed_train_df, x="Age", kind="box")
sn.displot(data = preprocessed_train_df, x="Age")

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

<seaborn.axisgrid.FacetGrid at 0x21ad3301e50>

#Tokenize Names for model compatible format. 
from sklearn.preprocessing import LabelEncoder 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
warnings.filterwarnings('ignore')

cols = ["Name","Title","Pclass","Sex","Cabin","Embarked", "Ticket_item"]
encoder = LabelEncoder()
for col in cols:
    preprocessed_train_df[col] = encoder.fit_transform(preprocessed_train_df[col].astype(str))
    preprocessed_serving_df[col] = encoder.fit_transform(preprocessed_serving_df[col].astype(str))
    
y_train = preprocessed_train_df["Survived"]
X_train = preprocessed_train_df.drop(["Survived", "PassengerId"], axis=1)

X_test = preprocessed_serving_df.drop('PassengerId', axis = 1)

#debug output for Data troubleshooting. Not for Production Use  
#ts_output = preprocessed_train_df
#ts_output.to_csv('debug.csv', index=False)

#Check for Numpy arrays
X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
y = y_train.values if isinstance(X_train, pd.Series) else y_train
X_test_npy = X_test.values if isinstance(X_test, pd.DataFrame) else X_test

#Configure Stratified K Fold function for 5-fold cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) #Answer to life
test_preds = npy.zeros(len(X_test_npy))
val_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"Training fold {fold + 1}...")

    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    model = XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.2, subsample=0.8, colsample_bytree=0.8, tree_method='gpu_hist', 
        predictor='gpu_predictor', random_state=42, use_label_encoder=False, eval_metric='logloss')

    model.fit(X_tr, y_tr)

    val_pred = model.predict(X_val)
    val_acc = accuracy_score(y_val, val_pred)
    val_scores.append(val_acc)

    test_preds += model.predict(X_test_npy)

final_preds = (test_preds >= 3).astype(int)

output = pd.DataFrame({'PassengerId': df_test.PassengerId, "Survived": final_preds})
output.to_csv('submission.csv', index=False)

print(f"Your submission was successfully saved! CV Scores: {val_scores}")
print(f"Average CV Accuracy: {npy.mean(val_scores):.4f}")

Training fold 1...
Training fold 2...
Training fold 3...
Training fold 4...
Training fold 5...
Your submission was successfully saved! CV Scores: [0.8268156424581006, 0.8539325842696629, 0.8033707865168539, 0.848314606741573, 0.8314606741573034]
Average CV Accuracy: 0.8328

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Ticket	Fare	Cabin	Embarked
886	887	0	2	Montvila, Rev. Juozas	male	27.0	0	0	211536	13.00	NaN	S
887	888	1	1	Graham, Miss. Margaret Edith	female	19.0	0	0	112053	30.00	B42	S
888	889	0	3	Johnston, Miss. Catherine Helen "Carrie"	female	NaN	1	2	W./C. 6607	23.45	NaN	S
889	890	1	1	Behr, Mr. Karl Howell	male	26.0	0	0	111369	30.00	C148	C
890	891	0	3	Dooley, Mr. Patrick	male	32.0	0	0	370376	7.75	NaN	Q

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Fare	Cabin	Embarked	Title	Ticket_number	Ticket_item
0	1	0	3	Braund Mr Owen Harris	male	22.0	1	7.2500	NaN	S	[Mr.]	21171	A/5
1	2	1	1	Cumings Mrs John Bradley Florence Briggs Thayer	female	38.0	1	71.2833	C85	C	[Mrs.]	17599	PC
2	3	1	3	Heikkinen Miss Laina	female	26.0	0	7.9250	NaN	S	[Miss.]	3101282	STON/O2.
3	4	1	1	Futrelle Mrs Jacques Heath Lily May Peel	female	35.0	1	53.1000	C123	S	[Mrs.]	113803	NONE
4	5	0	3	Allen Mr William Henry	male	35.0	0	8.0500	NaN	S	[Mr.]	373450	NONE

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Parch	Fare	Cabin	Embarked	Title	Ticket_number	Ticket_item
886	887	0	2	Montvila Rev Juozas	male	27.0	0	0	13.00	NaN	S	[Rev.]	211536	NONE
887	888	1	1	Graham Miss Margaret Edith	female	19.0	0	0	30.00	B42	S	[Miss.]	112053	NONE
888	889	0	3	Johnston Miss Catherine Helen Carrie	female	NaN	1	2	23.45	NaN	S	[Miss.]	6607	W./C.
889	890	1	1	Behr Mr Karl Howell	male	26.0	0	0	30.00	C148	C	[Mr.]	111369	NONE
890	891	0	3	Dooley Mr Patrick	male	32.0	0	0	7.75	NaN	Q	[Mr.]	370376	NONE

	PassengerId	Pclass	Name	Sex	Age	SibSp	Parch	Fare	Cabin	Embarked	Title	Ticket_number	Ticket_item
0	892	3	Kelly Mr James	male	34.5	0	0	7.8292	NaN	Q	[Mr.]	330911	NONE
1	893	3	Wilkes Mrs James Ellen Needs	female	47.0	1	0	7.0000	NaN	S	[Mrs.]	363272	NONE
2	894	2	Myles Mr Thomas Francis	male	62.0	0	0	9.6875	NaN	Q	[Mr.]	240276	NONE
3	895	3	Wirz Mr Albert	male	27.0	0	0	8.6625	NaN	S	[Mr.]	315154	NONE
4	896	3	Hirvonen Mrs Alexander Helga E Lindqvist	female	22.0	1	1	12.2875	NaN	S	[Mrs.]	3101298	NONE

Titanic Fatality Prediction Model¶

Introduction¶

Purpose:¶

Goal Model:¶

Data Exploration:¶

Summary of the data we are going to be working with to train the model.¶

Preprocessing Data Frame:¶

Exploratory modeling¶

Survived vs Fare¶

Fare vs PClass¶

PClass vs Survived¶

Age Distribution Investigation¶

Data Model¶

Titanic - Machine Learning from Disaster Competition¶

Results:¶