In [2]:
import warnings 

import pandas as pd
import numpy as npy
import matplotlib.pyplot as mpl
import seaborn as sn
from xgboost import XGBClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

print('All standard libraries available, setup successful.')
All standard libraries available, setup successful.

Introduction:¶

Version 2.0 - This is the second version of this notebook. Imputation has been added to fill in the missing data points and help the model fit. Note that scikit-learn's multivariate imputation class (IterativeImputer) is still experimental; its imputations are not considered highly reliable and need further testing.

Purpose:¶

In this notebook, we will build a model that predicts the personality trait of an individual based on several features.

Goal Model:¶

Our goal model will use gradient boosting via the XGBoost library, in combination with scikit-learn utilities. Gradient boosting fits each new model to the errors of the ensemble built so far, reducing the bias carried over from earlier models while maximizing prediction accuracy.
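
As a minimal sketch of this idea (on synthetic data, not this competition's dataset, with illustrative parameters), an XGBClassifier can be fit and scored in a few lines:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

#Synthetic binary-classification data; illustrative only.
X_demo, y_demo = make_classification(n_samples=1000, n_features=5, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=42)

#Each of the 100 trees is fit to the errors of the ensemble built before it.
demo_model = XGBClassifier(n_estimators=100, learning_rate=0.1, eval_metric='logloss')
demo_model.fit(X_tr, y_tr)
print(f"Hold-out accuracy: {demo_model.score(X_te, y_te):.3f}")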

In [3]:
#Import data as pandas dataframes
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

Data Exploration:¶

In [4]:
df_train.info()
display(df_train.head())
df_train.tail()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         18524 non-null  int64  
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object 
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object 
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 1.3+ MB
id Time_spent_Alone Stage_fear Social_event_attendance Going_outside Drained_after_socializing Friends_circle_size Post_frequency Personality
0 0 0.0 No 6.0 4.0 No 15.0 5.0 Extrovert
1 1 1.0 No 7.0 3.0 No 10.0 8.0 Extrovert
2 2 6.0 Yes 1.0 0.0 NaN 3.0 0.0 Introvert
3 3 3.0 No 7.0 3.0 No 11.0 5.0 Extrovert
4 4 1.0 No 4.0 4.0 No 13.0 NaN Extrovert
Out[4]:
id Time_spent_Alone Stage_fear Social_event_attendance Going_outside Drained_after_socializing Friends_circle_size Post_frequency Personality
18519 18519 3.0 No 7.0 3.0 No 9.0 7.0 Extrovert
18520 18520 1.0 NaN 6.0 7.0 No 6.0 5.0 Extrovert
18521 18521 7.0 Yes 1.0 1.0 Yes 1.0 NaN Introvert
18522 18522 NaN Yes 1.0 0.0 Yes 5.0 2.0 Introvert
18523 18523 1.0 No 8.0 6.0 No 4.0 7.0 Extrovert

Considering that the object-typed features are binary choices, we are going to convert Stage_fear and Drained_after_socializing into binary values.

Data Pre-processing:¶

In [5]:
#Ensure missing values are encoded as numpy.nan; scikit-learn imputers only recognize numpy.nan as missing.
imputer = IterativeImputer(missing_values=npy.nan, max_iter=10, tol=1e-3,
                           n_nearest_features=None, initial_strategy='mean',
                           imputation_order='ascending', random_state=42)

In [6]:
def preprocess(df):
    df = df.copy()

    #Map the binary Yes/No categories to 1/0; anything else (including NaN) stays missing.
    def yesorno(x):
        if x == "Yes":
            return 1.0
        elif x == "No":
            return 0.0
        else:
            return npy.nan

    df["Stage_fear"] = df["Stage_fear"].apply(yesorno).astype(npy.float64)
    df["Drained_after_socializing"] = df["Drained_after_socializing"].apply(yesorno).astype(npy.float64)

    return df

#Binarizing Personality so it can be used in the correlation charts below. 0 = Introvert, 1 = Extrovert
#Since the test set does not contain a Personality column, this function is separated from preprocess().
def personalitybin(df):
    df = df.copy()

    def binary(x):
        if x == "Introvert":
            return 0.0
        else:
            return 1.0

    df["Personality"] = df["Personality"].apply(binary).astype(npy.float64)
    return df

preprocessed_train_df = preprocess(personalitybin(df_train)).astype(npy.float64)
preprocessed_test_df = preprocess(df_test).astype(npy.float64)

print("preprocessing completed") #Debugging phrase 
preprocessing completed
In [7]:
#Configure the imputer to return a pandas DataFrame rather than a NumPy array.
imputer.set_output(transform='pandas')

#Impute the training set, then round to a uniform integer dtype for model consistency.
preprocessed_train_df = imputer.fit_transform(preprocessed_train_df).astype(npy.int64)

#preprocessed_train_df.to_csv('imputedebug.csv', index=False) - For debugging purposes

Preprocessing Summary:¶

We have converted all the features to NumPy float64 values for data uniformity, reducing the complexity the model has to account for. We will use scikit-learn's multivariate IterativeImputer to impute missing data from the aggregation of the other variables. Most of the features are missing about 5% of their values, so the imputer helps fill in these gaps for the model to train on.
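
As a quick sanity check on that figure, the per-feature missing percentage can be computed directly from the raw training frame (a small sketch reusing df_train from above):

#Percentage of missing values per feature in the raw training data.
missing_pct = (df_train.isna().mean() * 100).round(2)
print(missing_pct.sort_values(ascending=False))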

In [8]:
preprocessed_train_df.info()
print("preprocessed training data head:")
display(preprocessed_train_df.head())
print("preprocessed training data tail:")
display(preprocessed_train_df.tail())
print("preprocessed test data head:")
preprocessed_test_df.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   id                         18524 non-null  int64
 1   Time_spent_Alone           18524 non-null  int64
 2   Stage_fear                 18524 non-null  int64
 3   Social_event_attendance    18524 non-null  int64
 4   Going_outside              18524 non-null  int64
 5   Drained_after_socializing  18524 non-null  int64
 6   Friends_circle_size        18524 non-null  int64
 7   Post_frequency             18524 non-null  int64
 8   Personality                18524 non-null  int64
dtypes: int64(9)
memory usage: 1.3 MB
preprocessed training data head:
id Time_spent_Alone Stage_fear Social_event_attendance Going_outside Drained_after_socializing Friends_circle_size Post_frequency Personality
0 0 0 0 6 4 0 15 5 1
1 1 1 0 7 3 0 10 8 1
2 2 6 1 1 0 1 3 0 0
3 3 3 0 7 3 0 11 5 1
4 4 1 0 4 4 0 13 6 1
preprocessed training data tail:
id Time_spent_Alone Stage_fear Social_event_attendance Going_outside Drained_after_socializing Friends_circle_size Post_frequency Personality
18519 18519 3 0 7 3 0 9 7 1
18520 18520 1 0 6 7 0 6 5 1
18521 18521 7 1 1 1 1 1 1 0
18522 18522 7 1 1 0 1 5 2 0
18523 18523 1 0 8 6 0 4 7 1
preprocessed test data head:
Out[8]:
id Time_spent_Alone Stage_fear Social_event_attendance Going_outside Drained_after_socializing Friends_circle_size Post_frequency
0 18524.0 3.0 0.0 7.0 4.0 0.0 6.0 NaN
1 18525.0 NaN 1.0 0.0 0.0 1.0 5.0 1.0
2 18526.0 3.0 0.0 5.0 6.0 0.0 15.0 9.0
3 18527.0 3.0 0.0 4.0 4.0 0.0 5.0 6.0
4 18528.0 9.0 1.0 1.0 2.0 1.0 1.0 1.0

Data Dimensionality Exploration:¶

Correlation heatmap of Variables Summary:¶

The Personality variable is binarized: introverts are represented as x = 0 and extroverts as x = 1.

Based on the generated heatmap, there is a strong correlation between Personality and Stage_fear, Personality and Drained_after_socializing, and Personality and Time_spent_Alone. Personality has an overall negative correlation with these variables (high values favor introverts), so the model needs careful tuning when determining whether an individual is an extrovert. The three variables also correlate strongly with each other. We will note them as strong predictors of whether someone is an introvert (x = 0).
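
For a numeric companion to this reading, the same correlations can be listed directly (a small sketch using the imputed training frame from above):

#Correlation of each feature with Personality (0 = Introvert, 1 = Extrovert),
#sorted so the strongest negative values come first.
personality_corr = preprocessed_train_df.drop("id", axis=1).corr()["Personality"].drop("Personality")
print(personality_corr.sort_values())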

In [9]:
#Ensure that all variables selected are numeric, dropping the index column.
num_df = preprocessed_train_df.drop("id", axis=1).select_dtypes(include=[npy.number])

sn.set_palette("colorblind")

if num_df.shape[1] >= 4:
    mpl.figure(figsize=(10, 8))
    sn.heatmap(num_df.corr(), annot=True, fmt='.2f', cmap='rocket')
    mpl.title('Correlation heatmap of Variables:')
    mpl.tight_layout()
    mpl.show()
else:
    print('Not enough numeric features for correlation analysis')
[Figure: correlation heatmap of the numeric variables]

Distribution plots of Personality for non-binary features:¶

Based on the per-Personality distributions of Time_spent_Alone, Social_event_attendance, Going_outside, Friends_circle_size, and Post_frequency, the ranges vary greatly, with some sample distributions spanning the entire range. It might be best to trim the features with the most null values while keeping the features whose distributions differ most between the two personalities.

With these limits set, the model below keeps the three features most strongly correlated with Personality (Time_spent_Alone, Stage_fear, Drained_after_socializing) and drops Going_outside, Social_event_attendance, Friends_circle_size, and Post_frequency; a quick null-count check follows.
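
As a quick check on which features carry the most nulls, the non-null counts can be listed directly (a small sketch on the raw training frame):

#Non-null count per column in the raw training data, smallest (most missing) first.
print(df_train.count().sort_values())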

In [10]:
df_dist = df_train.drop(["id", "Stage_fear", "Drained_after_socializing", "Personality"], axis=1)

sn.set_palette("colorblind")

#One box plot per non-binary feature, split by Personality.
for column in df_dist.columns:
    sn.catplot(data=df_train, x=column, y="Personality", hue="Personality", kind="box")
[Figures: box plots of Personality against Time_spent_Alone, Social_event_attendance, Going_outside, Friends_circle_size, and Post_frequency]

Model Building¶

We will be utilizing XGBClassifier from the xgboost library, a gradient-boosted decision tree classifier that exposes a scikit-learn compatible API.

In [11]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
warnings.filterwarnings('ignore')

y_train = preprocessed_train_df["Personality"]
X_train = preprocessed_train_df.drop(["id", "Personality","Going_outside", "Friends_circle_size", "Post_frequency", "Social_event_attendance"], axis=1)

X_test = preprocessed_test_df.drop(["id","Going_outside", "Friends_circle_size", "Post_frequency", "Social_event_attendance"], axis = 1)

Model with Selected Features -¶

In [12]:
#Convert to NumPy arrays if the inputs are still pandas objects
X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
y = y_train.values if isinstance(y_train, pd.Series) else y_train
X_test_npy = X_test.values if isinstance(X_test, pd.DataFrame) else X_test

#Configure Stratified K Fold function for 5-fold cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) #Answer to life
test_preds = npy.zeros(len(X_test_npy))
val_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"Training fold {fold + 1}...")

    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]

    model = XGBClassifier(n_estimators=300, max_depth=5, learning_rate=0.01, subsample=0.9, 
            colsample_bytree=0.75, colsample_bynode=0.75, colsample_bylevel=0.75, 
            tree_method='gpu_hist', predictor='gpu_predictor', random_state=42, use_label_encoder=False, eval_metric='logloss')

    model.fit(X_tr, y_tr)

    val_pred = model.predict(X_val)
    val_acc = accuracy_score(y_val, val_pred)
    val_scores.append(val_acc)

    test_preds += model.predict(X_test_npy)

#Majority vote across the 5 folds: predict Extrovert (1) when at least 3 folds agree.
final_preds = (test_preds >= 3).astype(int)

#Convert the binary predictions back to labels: 0 = Introvert, 1 = Extrovert

def debin(df):
    df = df.copy()
    
    def binary(x):
        if x==0:
            return "Introvert"
        else:
            return "Extrovert"
    df["Personality"]=df["Personality"].apply(binary).astype(object)
    return df

print(f"CV Scores: {val_scores}")
print(f"Average CV Accuracy: {npy.mean(val_scores):.4f}")
Training fold 1...
Training fold 2...
Training fold 3...
Training fold 4...
Training fold 5...
CV Scores: [0.968421052631579, 0.9657219973009447, 0.9654520917678813, 0.9692307692307692, 0.9700323974082073]
Average CV Accuracy: 0.9678
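
The hard-vote threshold above (test_preds >= 3) takes the majority over the five folds. A common alternative, sketched below as an assumption rather than this notebook's original method, averages the folds' predicted probabilities and thresholds at 0.5:

#Sketch: average the positive-class probability across the same five folds,
#reusing X, y, X_test_npy and kf from above. XGBoost handles the NaNs still
#present in the (un-imputed) test set natively.
proba_sum = npy.zeros(len(X_test_npy))
for train_idx, _ in kf.split(X, y):
    fold_model = XGBClassifier(n_estimators=300, max_depth=5, learning_rate=0.01,
                               random_state=42, eval_metric='logloss')
    fold_model.fit(X[train_idx], y[train_idx])
    proba_sum += fold_model.predict_proba(X_test_npy)[:, 1]

alt_preds = (proba_sum / kf.get_n_splits() >= 0.5).astype(int)
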
In [13]:
#Submission output

output = pd.DataFrame({'id': df_test.id, "Personality": final_preds})
output = debin(output)
output.to_csv('submission.csv', index=False)