import warnings
import pandas as pd
import numpy as npy
import matplotlib.pyplot as mpl
import seaborn as sn
from xgboost import XGBClassifier
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 - enables the experimental IterativeImputer API
from sklearn.impute import IterativeImputer
print('All standard libraries available, setup successful.')
All standard libraries available, setup successful.
Introduction:¶
Version 2.0 - This is the second version of this notebook. Imputation has been added to fill in the missing data points and help the model fit. Note that scikit-learn's multivariate imputation class is still experimental, so the imputations are not considered highly reliable and need further testing.
Purpose:¶
In this notebook, we will build a model that predicts the Personality trait of an individual from several behavioral features.
Goal Model:¶
Our goal model will use gradient boosting via XGBoost, alongside utilities from the scikit-learn library. Gradient boosting builds an ensemble of shallow trees sequentially, with each new tree fitted to the residual errors of the trees before it, reducing bias while maximizing the accuracy of the predictions.
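As a minimal sketch of this sequential fitting on synthetic data (X_demo, y_demo, and demo_model are illustrative names, not part of this notebook's pipeline):
#Each of the 50 trees is fit in sequence to the errors of the ensemble built so far
rng = npy.random.default_rng(0)
X_demo = rng.normal(size=(200, 3))
y_demo = (X_demo[:, 0] + 0.5 * X_demo[:, 1] > 0).astype(int)
demo_model = XGBClassifier(n_estimators=50, max_depth=3, learning_rate=0.1)
demo_model.fit(X_demo, y_demo)
print(demo_model.score(X_demo, y_demo))  #Mean accuracy on the synthetic training data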
#Import data as pandas dataframes
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
Data Exploration:¶
df_train.info()
df_train.shape
display(df_train.head())
df_train.tail()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   id                         18524 non-null  int64
 1   Time_spent_Alone           17334 non-null  float64
 2   Stage_fear                 16631 non-null  object
 3   Social_event_attendance    17344 non-null  float64
 4   Going_outside              17058 non-null  float64
 5   Drained_after_socializing  17375 non-null  object
 6   Friends_circle_size        17470 non-null  float64
 7   Post_frequency             17260 non-null  float64
 8   Personality                18524 non-null  object
dtypes: float64(5), int64(1), object(3)
memory usage: 1.3+ MB
index | id | Time_spent_Alone | Stage_fear | Social_event_attendance | Going_outside | Drained_after_socializing | Friends_circle_size | Post_frequency | Personality |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0.0 | No | 6.0 | 4.0 | No | 15.0 | 5.0 | Extrovert |
1 | 1 | 1.0 | No | 7.0 | 3.0 | No | 10.0 | 8.0 | Extrovert |
2 | 2 | 6.0 | Yes | 1.0 | 0.0 | NaN | 3.0 | 0.0 | Introvert |
3 | 3 | 3.0 | No | 7.0 | 3.0 | No | 11.0 | 5.0 | Extrovert |
4 | 4 | 1.0 | No | 4.0 | 4.0 | No | 13.0 | NaN | Extrovert |
index | id | Time_spent_Alone | Stage_fear | Social_event_attendance | Going_outside | Drained_after_socializing | Friends_circle_size | Post_frequency | Personality |
---|---|---|---|---|---|---|---|---|---|
18519 | 18519 | 3.0 | No | 7.0 | 3.0 | No | 9.0 | 7.0 | Extrovert |
18520 | 18520 | 1.0 | NaN | 6.0 | 7.0 | No | 6.0 | 5.0 | Extrovert |
18521 | 18521 | 7.0 | Yes | 1.0 | 1.0 | Yes | 1.0 | NaN | Introvert |
18522 | 18522 | NaN | Yes | 1.0 | 0.0 | Yes | 5.0 | 2.0 | Introvert |
18523 | 18523 | 1.0 | No | 8.0 | 6.0 | No | 4.0 | 7.0 | Extrovert |
Since the object-typed features are binary choices, we are going to convert Stage_fear and Drained_after_socializing into binary numeric values.
Data Pre-processing:¶
#Ensuring that the missing values are numpy not-a-number, since scikit-learn's imputer only recognizes npy.nan as missing
imputer = IterativeImputer(missing_values=npy.nan, max_iter=10, tol=1e-3, n_nearest_features=None,
                           initial_strategy='mean', imputation_order='ascending', random_state=42)
def preprocess(df):
    df = df.copy()
    def yesorno(x):
        if x == "Yes":
            return 1.0
        elif x == "No":
            return 0.0
        else:
            return npy.nan  #Leave missing entries as NaN for the imputer
    df["Stage_fear"] = df["Stage_fear"].apply(yesorno).astype(npy.float64)
    df["Drained_after_socializing"] = df["Drained_after_socializing"].apply(yesorno).astype(npy.float64)
    return df
#Numerizing Personality as a binary value so it can appear in the correlation charts. 0 = Introvert, 1 = Extrovert
#Since the test set does not contain a Personality column, this function is separated from the main preprocess() function.
def personalitybin(df):
    df = df.copy()
    def binary(x):
        if x == "Introvert":
            return 0.0
        else:
            return 1.0
    df["Personality"] = df["Personality"].apply(binary).astype(npy.float64)
    return df
preprocessed_train_df = preprocess(personalitybin(df_train)).astype(npy.float64)
preprocessed_test_df = preprocess(df_test).astype(npy.float64)
print("preprocessing completed") #Debugging phrase
preprocessing completed
#Fit the imputer and transform the training frame; each feature is iteratively regressed on the remaining features. Output configured as a pandas DataFrame.
imputer.set_output(transform='pandas')
#Converting the imputed values to a uniform integer dtype (truncating fractions) for model consistency
preprocessed_train_df = imputer.fit_transform(preprocessed_train_df).astype(npy.int64)
#preprocessed_train_df.to_csv('imputedebug.csv', index=False) - For Debugging purposes
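Note that only the training frame is imputed here; the test frame still contains NaNs, which XGBoost handles natively by learning a default split direction for missing values. If consistent treatment were desired, a separate imputer could be fitted on the test features; a hypothetical sketch (feature_cols and test_imputer are illustrative names, and this is not run in this notebook):
#A fresh imputer is needed because the one above was fitted with the Personality column present
feature_cols = [c for c in preprocessed_test_df.columns if c != "id"]
test_imputer = IterativeImputer(missing_values=npy.nan, max_iter=10, random_state=42)
test_imputer.set_output(transform='pandas')
preprocessed_test_df[feature_cols] = test_imputer.fit_transform(preprocessed_test_df[feature_cols])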
Preprocessing Summary:¶
We have converted all the features to NumPy float64 values for data uniformity, reducing the complexity that has to be accounted for. We then use scikit-learn's multivariate IterativeImputer to impute missing data from the aggregation of the other variables: most features are missing about 5% of their values, so the imputer fills these gaps and gives the model a complete training set.
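As a toy illustration of how the multivariate imputer infers a missing value from the other columns (the frame toy and its values are illustrative only):
#Column b is 2*a on the observed rows, so the imputer regresses the missing b from a and lands near 6
toy = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": [2.0, 4.0, npy.nan, 8.0]})
toy_imputer = IterativeImputer(random_state=0)
print(toy_imputer.fit_transform(toy))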
preprocessed_train_df.info()
print("preprocessed training data head:")
display(preprocessed_train_df.head())
print("preprocessed training data tail:")
display(preprocessed_train_df.tail())
print("preprocessed test data head:")
preprocessed_test_df.head()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18524 entries, 0 to 18523
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   id                         18524 non-null  int64
 1   Time_spent_Alone           18524 non-null  int64
 2   Stage_fear                 18524 non-null  int64
 3   Social_event_attendance    18524 non-null  int64
 4   Going_outside              18524 non-null  int64
 5   Drained_after_socializing  18524 non-null  int64
 6   Friends_circle_size        18524 non-null  int64
 7   Post_frequency             18524 non-null  int64
 8   Personality                18524 non-null  int64
dtypes: int64(9)
memory usage: 1.3 MB
preprocessed training data head:
index | id | Time_spent_Alone | Stage_fear | Social_event_attendance | Going_outside | Drained_after_socializing | Friends_circle_size | Post_frequency | Personality |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 6 | 4 | 0 | 15 | 5 | 1 |
1 | 1 | 1 | 0 | 7 | 3 | 0 | 10 | 8 | 1 |
2 | 2 | 6 | 1 | 1 | 0 | 1 | 3 | 0 | 0 |
3 | 3 | 3 | 0 | 7 | 3 | 0 | 11 | 5 | 1 |
4 | 4 | 1 | 0 | 4 | 4 | 0 | 13 | 6 | 1 |
preprocessed training data tail:
index | id | Time_spent_Alone | Stage_fear | Social_event_attendance | Going_outside | Drained_after_socializing | Friends_circle_size | Post_frequency | Personality |
---|---|---|---|---|---|---|---|---|---|
18519 | 18519 | 3 | 0 | 7 | 3 | 0 | 9 | 7 | 1 |
18520 | 18520 | 1 | 0 | 6 | 7 | 0 | 6 | 5 | 1 |
18521 | 18521 | 7 | 1 | 1 | 1 | 1 | 1 | 1 | 0 |
18522 | 18522 | 7 | 1 | 1 | 0 | 1 | 5 | 2 | 0 |
18523 | 18523 | 1 | 0 | 8 | 6 | 0 | 4 | 7 | 1 |
preprocessed test data head:
index | id | Time_spent_Alone | Stage_fear | Social_event_attendance | Going_outside | Drained_after_socializing | Friends_circle_size | Post_frequency |
---|---|---|---|---|---|---|---|---|
0 | 18524.0 | 3.0 | 0.0 | 7.0 | 4.0 | 0.0 | 6.0 | NaN |
1 | 18525.0 | NaN | 1.0 | 0.0 | 0.0 | 1.0 | 5.0 | 1.0 |
2 | 18526.0 | 3.0 | 0.0 | 5.0 | 6.0 | 0.0 | 15.0 | 9.0 |
3 | 18527.0 | 3.0 | 0.0 | 4.0 | 4.0 | 0.0 | 5.0 | 6.0 |
4 | 18528.0 | 9.0 | 1.0 | 1.0 | 2.0 | 1.0 | 1.0 | 1.0 |
Data Dimensionality Exploration:¶
Correlation heatmap of Variables Summary:¶
The Personality variable is binarized: introverts are represented with a value of 0, extroverts with a value of 1.
Based on the generated heatmap, there is a strong correlation between Personality and each of Stage_fear, Drained_after_socializing, and Time_spent_Alone. Personality has an overall negative correlation with these variables (higher values point toward introversion), so the model needs to be fine-tuned when determining whether an individual is an extrovert. The three variables also correlate highly with one another. We will take note of them as strong predictors of whether someone is an Introvert (Personality = 0).
#Ensure that all selected variables are numeric, and drop the id index column.
num_df = preprocessed_train_df.drop("id", axis=1).select_dtypes(include=[npy.number])
sn.set_palette("colorblind")  #set_palette applies the palette; color_palette alone only returns one
if num_df.shape[1] >= 4:
mpl.figure(figsize=(10,8))
sn.heatmap(num_df.corr(), annot=True, fmt='.2f', cmap='rocket')
mpl.title('Correlation heatmap of Variables:')
mpl.tight_layout()
mpl.show()
else:
print('Not enough numeric features for correlation analysis')
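As a numeric complement to the heatmap, the correlations with Personality can also be ranked directly (reusing num_df from the cell above):
#Correlation of every numeric feature with the binarized Personality target, most negative first
print(num_df.corr()["Personality"].sort_values())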
Distribution plots of Personality for non-binary features:¶
Based on the per-Personality distributions of Time_spent_Alone, Social_event_attendance, Going_outside, Friends_circle_size, and Post_frequency, the ranges vary greatly, with some sample distributions spanning the entire range. It might be best to trim the features with the most null values while keeping the features whose distributions vary most with respect to Personality.
With these limits set, we will keep the three strong predictors identified in the heatmap (Time_spent_Alone, Stage_fear, and Drained_after_socializing) and drop the remaining behavioral features; the null counts behind this choice can be verified in the snippet below.
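The null counts motivating this choice can be checked directly on the raw training frame:
#Features with the most missing values appear first
print(df_train.isna().sum().sort_values(ascending=False))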
#Box plots of each non-binary feature, split by Personality
df_dist = df_train.drop(["id", "Stage_fear", "Drained_after_socializing", "Personality"], axis=1)
for column in df_dist.columns:
    sn.catplot(data=df_train, x=column, y="Personality", hue="Personality", kind="box", palette="colorblind")
Model Building¶
We will be utilizing the XGBClassifier from the xgboost library: a gradient-boosted decision tree ensemble. Unlike a Random Forest Classifier, which averages many independently grown trees, the boosted ensemble grows its trees sequentially, each one correcting the errors of the ensemble built so far.
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
warnings.filterwarnings('ignore')
y_train = preprocessed_train_df["Personality"]
#Keep the three strong predictors (Time_spent_Alone, Stage_fear, Drained_after_socializing); drop id and the weakly correlated features
X_train = preprocessed_train_df.drop(["id", "Personality", "Going_outside", "Friends_circle_size", "Post_frequency", "Social_event_attendance"], axis=1)
X_test = preprocessed_test_df.drop(["id", "Going_outside", "Friends_circle_size", "Post_frequency", "Social_event_attendance"], axis=1)
Model with Selected Features -¶
#Check for Numpy arrays
X = X_train.values if isinstance(X_train, pd.DataFrame) else X_train
y = y_train.values if isinstance(y_train, pd.Series) else y_train
X_test_npy = X_test.values if isinstance(X_test, pd.DataFrame) else X_test
#Configure Stratified K Fold function for 5-fold cross-validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) #Answer to life
test_preds = npy.zeros(len(X_test_npy))
val_scores = []
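#Illustrative sanity check (not part of the original pipeline): stratification keeps each
#validation fold's extrovert fraction close to the overall rate in y
for check_fold, (_, check_val_idx) in enumerate(kf.split(X, y)):
    print(f"fold {check_fold + 1} extrovert fraction: {y[check_val_idx].mean():.3f}")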
for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    print(f"Training fold {fold + 1}...")
    X_tr, X_val = X[train_idx], X[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]
    model = XGBClassifier(n_estimators=300, max_depth=5, learning_rate=0.01, subsample=0.9,
                          colsample_bytree=0.75, colsample_bynode=0.75, colsample_bylevel=0.75,
                          tree_method='gpu_hist', predictor='gpu_predictor',  #XGBoost 1.x GPU settings; on XGBoost >= 2.0 use tree_method='hist' with device='cuda'
                          random_state=42, use_label_encoder=False, eval_metric='logloss')
    model.fit(X_tr, y_tr)
    val_pred = model.predict(X_val)
    val_acc = accuracy_score(y_val, val_pred)
    val_scores.append(val_acc)
    test_preds += model.predict(X_test_npy)  #Accumulate this fold's 0/1 votes per test row
final_preds = (test_preds >= 3).astype(int)  #Majority vote across the 5 folds
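For illustrative vote totals, the majority-vote thresholding works as follows (the array values are hypothetical):
print((npy.array([5, 2, 3]) >= 3).astype(int))  #Vote totals [5, 2, 3] -> labels [1 0 1]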
#Convert the binary predictions back to labels: 0 = Introvert, 1 = Extrovert
def debin(df):
df = df.copy()
def binary(x):
if x==0:
return "Introvert"
else:
return "Extrovert"
df["Personality"]=df["Personality"].apply(binary).astype(object)
return df
print(f"CV Scores: {val_scores}")
print(f"Average CV Accuracy: {npy.mean(val_scores):.4f}")
Training fold 1...
Training fold 2...
Training fold 3...
Training fold 4...
Training fold 5...
CV Scores: [0.968421052631579, 0.9657219973009447, 0.9654520917678813, 0.9692307692307692, 0.9700323974082073]
Average CV Accuracy: 0.9678
#Submission output
output = pd.DataFrame({'id': df_test.id, "Personality": final_preds})
output = debin(output)
output.to_csv('submission.csv', index=False)