import warnings
import pandas as pd
import numpy as npy
import matplotlib.pyplot as mpl
import seaborn as sn
from xgboost import XGBClassifier
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 - enables the experimental IterativeImputer API
from sklearn.impute import IterativeImputer
print('All standard libraries available, setup successful.')
All standard libraries available, setup successful.
Introduction:¶
Version 2.0 - This is the second version of this notebook. Imputation has been added to fill in the missing data points and help the model fit. Note that scikit-learn's multivariate imputation class is still experimental, so the imputations are not considered highly reliable and need further testing.
Purpose:¶
In this notebook, we will build a model that predicts the Personality trait of an individual from several behavioral features.
Goal Model:¶
Our goal model will use gradient boosting via XGBoost, alongside utilities from the scikit-learn library. Gradient boosting builds an ensemble of shallow trees sequentially, with each new tree fitted to the residual errors of the trees before it, reducing bias while maximizing the accuracy of the predictions.
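As a minimal sketch of this sequential fitting on synthetic data (X_demo, y_demo, and demo_model are illustrative names, not part of this notebook's pipeline):
#Each of the 50 trees is fit in sequence to the errors of the ensemble built so far
rng = npy.random.default_rng(0)
X_demo = rng.normal(size=(200, 3))
y_demo = (X_demo[:, 0] + 0.5 * X_demo[:, 1] > 0).astype(int)
demo_model = XGBClassifier(n_estimators=50, max_depth=3, learning_rate=0.1)
demo_model.fit(X_demo, y_demo)
print(demo_model.score(X_demo, y_demo))  #Mean accuracy on the synthetic training data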
#Import data as pandas dataframes
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
Data Exploration:¶
df_train.info()
df_train.shape
display(df_train.head())
df_train.tail()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   id                         18524 non-null  int64
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object
dtypes: float64(5), int64(1), object(3)
memory usage: 1.3+ MB
index | id | Time_spent_Alone | Stage_fear | Social_event_attendance | Going_outside | Drained_after_socializing | Friends_circle_size | Post_frequency | Personality |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.0 | No | 6.0 | 4.0 | No | 15.0 | 5.0 | Extrovert |
1 | 1 | 1.0 | No | 7.0 | 3.0 | No | 10.0 | 8.0 | Extrovert |
2 | 2 | 6.0 | Yes | 1.0 | 0.0 | NaN | 3.0 | 0.0 | Introvert |
3 | 3 | 3.0 | No | 7.0 | 3.0 | No | 11.0 | 5.0 | Extrovert |
4 | 4 | 1.0 | No | 4.0 | 4.0 | No | 13.0 | NaN | Extrovert |
index | id | Time_spent_Alone | Stage_fear | Social_event_attendance | Going_outside | Drained_after_socializing | Friends_circle_size | Post_frequency | Personality |
---|---|---|---|---|---|---|---|---|---|
18519 | 18519 | 3.0 | No | 7.0 | 3.0 | No | 9.0 | 7.0 | Extrovert |
18520 | 18520 | 1.0 | NaN | 6.0 | 7.0 | No | 6.0 | 5.0 | Extrovert |
18521 | 18521 | 7.0 | Yes | 1.0 | 1.0 | Yes | 1.0 | NaN | Introvert |
18522 | 18522 | NaN | Yes | 1.0 | 0.0 | Yes | 5.0 | 2.0 | Introvert |
18523 | 18523 | 1.0 | No | 8.0 | 6.0 | No | 4.0 | 7.0 | Extrovert |
Since the object-typed features are binary choices, we are going to convert Stage_fear and Drained_after_socializing into binary numeric values.
Data Pre-processing:¶
#Ensuring that the missing values are numpy not-a-number, since scikit-learn's imputer only recognizes npy.nan as missing
imputer = IterativeImputer(missing_values=npy.nan, max_iter=10, tol=1e-3, n_nearest_features=None,
                           initial_strategy='mean', imputation_order='ascending', random_state=42)
def preprocess(df):
    df = df.copy()
    def yesorno(x):
        if x == "Yes":
            return 1.0
        elif x == "No":
            return 0.0
        else:
            return npy.nan  #Leave missing entries as NaN for the imputer
    df["Stage_fear"] = df["Stage_fear"].apply(yesorno).astype(npy.float64)
    df["Drained_after_socializing"] = df["Drained_after_socializing"].apply(yesorno).astype(npy.float64)
    return df
#Numerizing Personality as a binary value so it can appear in the correlation charts. 0 = Introvert, 1 = Extrovert
#Since the test set does not contain a Personality column, this function is separated from the main preprocess() function.
def personalitybin(df):
    df = df.copy()
    def binary(x):
        if x == "Introvert":
            return 0.0
        else:
            return 1.0
    df["Personality"] = df["Personality"].apply(binary).astype(npy.float64)
    return df
preprocessed_train_df = preprocess(personalitybin(df_train)).astype(npy.float64)
preprocessed_test_df = preprocess(df_test).astype(npy.float64)
print("preprocessing completed") #Debugging phrase
preprocessing completed
#Fit the imputer and transform the training frame; each feature is iteratively regressed on the remaining features. Output configured as a pandas DataFrame.
imputer.set_output(transform='pandas')
#Converting the imputed values to a uniform integer dtype (truncating fractions) for model consistency
preprocessed_train_df = imputer.fit_transform(preprocessed_train_df).astype(npy.int64)
#preprocessed_train_df.to_csv('imputedebug.csv', index=False) - For Debugging purposes
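Note that only the training frame is imputed here; the test frame still contains NaNs, which XGBoost handles natively by learning a default split direction for missing values. If consistent treatment were desired, a separate imputer could be fitted on the test features; a hypothetical sketch (feature_cols and test_imputer are illustrative names, and this is not run in this notebook):
#A fresh imputer is needed because the one above was fitted with the Personality column present
feature_cols = [c for c in preprocessed_test_df.columns if c != "id"]
test_imputer = IterativeImputer(missing_values=npy.nan, max_iter=10, random_state=42)
test_imputer.set_output(transform='pandas')
preprocessed_test_df[feature_cols] = test_imputer.fit_transform(preprocessed_test_df[feature_cols])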
Preprocessing Summary:¶
We have converted all the features to NumPy float64 values for data uniformity, reducing the complexity that has to be accounted for. We then use scikit-learn's multivariate IterativeImputer to impute missing data from the aggregation of the other variables: most features are missing about 5% of their values, so the imputer fills these gaps and gives the model a complete training set.
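As a toy illustration of how the multivariate imputer infers a missing value from the other columns (the frame toy and its values are illustrative only):
#Column b is 2*a on the observed rows, so the imputer regresses the missing b from a and lands near 6
toy = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": [2.0, 4.0, npy.nan, 8.0]})
toy_imputer = IterativeImputer(random_state=0)
print(toy_imputer.fit_transform(toy))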
preprocessed_train_df.info()
print("preprocessed training data head:")
display(preprocessed_train_df.head())
print("preprocessed training data tail:")
display(preprocessed_train_df.tail())
print("preprocessed test data head:")
preprocessed_test_df.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   id                         18524 non-null  int64
 1   Time_spent_Alone           18524 non-null  int64
 2   Stage_fear                 18524 non-null  int64
 3   Social_event_attendance    18524 non-null  int64
 4   Going_outside              18524 non-null  int64
 5   Drained_after_socializing  18524 non-null  int64
 6   Friends_circle_size        18524 non-null  int64
 7   Post_frequency             18524 non-null  int64
 8   Personality                18524 non-null  int64
dtypes: int64(9)
memory usage: 1.3 MB
preprocessed training data head:
index | id | Time_spent_Alone | Stage_fear | Social_event_attendance | Going_outside | Drained_after_socializing | Friends_circle_size | Post_frequency | Personality |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 6 | 4 | 0 | 15 | 5 | 1 |
1 | 1 | 1 | 0 | 7 | 3 | 0 | 10 | 8 | 1 |
2 | 2 | 6 | 1 | 1 | 0 | 1 | 3 | 0 | 0 |
3 | 3 | 3 | 0 | 7 | 3 | 0 | 11 | 5 | 1 |
4 | 4 | 1 | 0 | 4 | 4 | 0 | 13 | 6 | 1 |
preprocessed training data tail:
index | id | Time_spent_Alone | Stage_fear | Social_event_attendance | Going_outside | Drained_after_socializing | Friends_circle_size | Post_frequency | Personality |
---|---|---|---|---|---|---|---|---|---|
18519 | 18519 | 3 | 0 | 7 | 3 | 0 | 9 | 7 | 1 |
18520 | 18520 | 1 | 0 | 6 | 7 | 0 | 6 | 5 | 1 |
18521 | 18521 | 7 | 1 | 1 | 1 | 1 | 1 | 1 | 0 |
18522 | 18522 | 7 | 1 | 1 | 0 | 1 | 5 | 2 | 0 |
18523 | 18523 | 1 | 0 | 8 | 6 | 0 | 4 | 7 | 1 |
preprocessed test data head:
index | id | Time_spent_Alone | Stage_fear | Social_event_attendance | Going_outside | Drained_after_socializing | Friends_circle_size | Post_frequency |
---|---|---|---|---|---|---|---|---|
0 | 18524.0 | 3.0 | 0.0 | 7.0 | 4.0 | 0.0 | 6.0 | NaN |
1 | 18525.0 | NaN | 1.0 | 0.0 | 0.0 | 1.0 | 5.0 | 1.0 |
2 | 18526.0 | 3.0 | 0.0 | 5.0 | 6.0 | 0.0 | 15.0 | 9.0 |
3 | 18527.0 | 3.0 | 0.0 | 4.0 | 4.0 | 0.0 | 5.0 | 6.0 |
4 | 18528.0 | 9.0 | 1.0 | 1.0 | 2.0 | 1.0 | 1.0 | 1.0 |
Data Dimensionality Exploration:¶
Correlation heatmap of Variables Summary:¶
The Personality variable is binarized: introverts are represented with a value of 0, extroverts with a value of 1.
Based on the generated heatmap, there is a strong correlation between Personality and each of Stage_fear, Drained_after_socializing, and Time_spent_Alone. Personality has an overall negative correlation with these variables (higher values point toward introversion), so the model needs to be fine-tuned when determining whether an individual is an extrovert. The three variables also correlate highly with one another. We will take note of them as strong predictors of whether someone is an Introvert (Personality = 0).
#Ensure that all selected variables are numeric, and drop the id index column.
num_df = preprocessed_train_df.drop("id", axis=1).select_dtypes(include=[npy.number])
sn.set_palette("colorblind")  #set_palette applies the palette; color_palette alone only returns one
if num_df.shape[1] >= 4:
mpl.figure(figsize=(10,8))
sn.heatmap(num_df.corr(), annot=True, fmt='.2f', cmap='rocket')
mpl.title('Correlation heatmap of Variables:')
mpl.tight_layout()
mpl.show()
else:
print('Not enough numeric features for correlation analysis')
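As a numeric complement to the heatmap, the correlations with Personality can also be ranked directly (reusing num_df from the cell above):
#Correlation of every numeric feature with the binarized Personality target, most negative first
print(num_df.corr()["Personality"].sort_values())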
Distribution plots of Personality for non-binary features:¶
Based on the per-Personality distributions of Time_spent_Alone, Social_event_attendance, Going_outside, Friends_circle_size, and Post_frequency, the ranges vary greatly, with some sample distributions spanning the entire range. It might be best to trim the features with the most null values while keeping the features whose distributions vary most with respect to Personality.
With these limits set, we will keep the three strong predictors identified in the heatmap (Time_spent_Alone, Stage_fear, and Drained_after_socializing) and drop the remaining behavioral features; the null counts behind this choice can be verified in the snippet below.
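The null counts motivating this choice can be checked directly on the raw training frame:
#Features with the most missing values appear first
print(df_train.isna().sum().sort_values(ascending=False))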
#Box plots of each non-binary feature, split by Personality
df_dist = df_train.drop(["id", "Stage_fear", "Drained_after_socializing", "Personality"], axis=1)
for column in df_dist.columns:
    sn.catplot(data=df_train, x=column, y="Personality", hue="Personality", kind="box", palette="colorblind")
Model Building¶
We will be utilizing the XGBClassifier from the xgboost library: a gradient-boosted decision tree ensemble. Unlike a Random Forest Classifier, which averages many independently grown trees, the boosted ensemble grows its trees sequentially, each one correcting the errors of the ensemble built so far.
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
warnings.filterwarnings('ignore')
y_train = preprocessed_train_df["Personality"]
#Keep the three strong predictors (Time_spent_Alone, Stage_fear, Drained_after_socializing); drop id and the weakly correlated features
X_train = preprocessed_train_df.drop(["id", "Personality", "Going_outside", "Friends_circle_size", "Post_frequency", "Social_event_attendance"], axis=1)
X_test = preprocessed_test_df.drop(["id", "Going_outside", "Friends_circle_size", "Post_frequency", "Social_event_attendance"], axis=1)
Model with Selected Features -¶
#Check for Numpy arrays
X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
y = y_train.values if isinstance(y_train, pd.Series) else y_train
X_test_npy = X_test.values if isinstance(X_test, pd.DataFrame) else X_test
#Configure Stratified K Fold function for 5-fold cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) #Answer to life
test_preds = npy.zeros(len(X_test_npy))
val_scores = []
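#Illustrative sanity check (not part of the original pipeline): stratification keeps each
#validation fold's extrovert fraction close to the overall rate in y
for check_fold, (_, check_val_idx) in enumerate(kf.split(X, y)):
    print(f"fold {check_fold + 1} extrovert fraction: {y[check_val_idx].mean():.3f}")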
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"Training fold {fold + 1}...")
    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]
    model = XGBClassifier(n_estimators=300, max_depth=5, learning_rate=0.01, subsample=0.9,
                          colsample_bytree=0.75, colsample_bynode=0.75, colsample_bylevel=0.75,
                          tree_method='gpu_hist', predictor='gpu_predictor',  #XGBoost 1.x GPU settings; on XGBoost >= 2.0 use tree_method='hist' with device='cuda'
                          random_state=42, use_label_encoder=False, eval_metric='logloss')
    model.fit(X_tr, y_tr)
    val_pred = model.predict(X_val)
    val_acc = accuracy_score(y_val, val_pred)
    val_scores.append(val_acc)
    test_preds += model.predict(X_test_npy)  #Accumulate this fold's 0/1 votes per test row
final_preds = (test_preds >= 3).astype(int)  #Majority vote across the 5 folds
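For illustrative vote totals, the majority-vote thresholding works as follows (the array values are hypothetical):
print((npy.array([5, 2, 3]) >= 3).astype(int))  #Vote totals [5, 2, 3] -> labels [1 0 1]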
#Convert the binary predictions back to labels: 0 = Introvert, 1 = Extrovert
def debin(df):
df = df.copy()
def binary(x):
if x==0:
return "Introvert"
else:
return "Extrovert"
df["Personality"]=df["Personality"].apply(binary).astype(object)
return df
print(f"CV Scores: {val_scores}")
print(f"Average CV Accuracy: {npy.mean(val_scores):.4f}")
Training fold 1...
Training fold 2...
Training fold 3...
Training fold 4...
Training fold 5...
CV Scores: [0.968421052631579, 0.9657219973009447, 0.9654520917678813, 0.9692307692307692, 0.9700323974082073]
Average CV Accuracy: 0.9678
#Submission output
output = pd.DataFrame({'id': df_test.id, "Personality": final_preds})
output = debin(output)
output.to_csv('submission.csv', index=False)