#Preface: Data science machine learning project using Mobile Phone Pricing data acquired from Kaggle to predict the price class of a
#mobile phone based on its hardware specifications.
import warnings
import scipy as sp
import pandas as pd
import numpy as npy
import matplotlib.pyplot as mpl
import seaborn as sn
from sklearn.inspection import permutation_importance as perm
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import label_binarize
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
print('All libraries available, setup successful.')
All libraries available, setup successful.
#Define df as the working data set, read from the Kaggle data set folder.
df = pd.read_csv("Mobile Phone Pricing Dataset/Mobile Phone Pricing.csv", encoding='ascii', delimiter=',')
Research Goal¶
The goal of this project is to build, from the Mobile Phone Pricing data provided, a predictive model that can also act as a prescriptive model. The predictive model will forecast the price_range class of newer phone models from the same hardware specifications the model uses to determine price. The same model can double as a prescriptive tool to judge whether a given phone is fairly priced for its hardware specifications.
df.head() #Head of the data, first five rows
battery_power | blue | clock_speed | dual_sim | fc | four_g | int_memory | m_dep | mobile_wt | n_cores | ... | px_height | px_width | ram | sc_h | sc_w | talk_time | three_g | touch_screen | wifi | price_range | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 842 | 0 | 2.2 | 0 | 1 | 0 | 7 | 0.6 | 188 | 2 | ... | 20 | 756 | 2549 | 9 | 7 | 19 | 0 | 0 | 1 | 1 |
1 | 1021 | 1 | 0.5 | 1 | 0 | 1 | 53 | 0.7 | 136 | 3 | ... | 905 | 1988 | 2631 | 17 | 3 | 7 | 1 | 1 | 0 | 2 |
2 | 563 | 1 | 0.5 | 1 | 2 | 1 | 41 | 0.9 | 145 | 5 | ... | 1263 | 1716 | 2603 | 11 | 2 | 9 | 1 | 1 | 0 | 2 |
3 | 615 | 1 | 2.5 | 0 | 0 | 0 | 10 | 0.8 | 131 | 6 | ... | 1216 | 1786 | 2769 | 16 | 8 | 11 | 1 | 0 | 0 | 2 |
4 | 1821 | 1 | 1.2 | 0 | 13 | 1 | 44 | 0.6 | 141 | 2 | ... | 1208 | 1212 | 1411 | 8 | 2 | 15 | 1 | 1 | 0 | 1 |
5 rows × 21 columns
df.tail() #Tail of the data, last five rows
battery_power | blue | clock_speed | dual_sim | fc | four_g | int_memory | m_dep | mobile_wt | n_cores | ... | px_height | px_width | ram | sc_h | sc_w | talk_time | three_g | touch_screen | wifi | price_range | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1995 | 794 | 1 | 0.5 | 1 | 0 | 1 | 2 | 0.8 | 106 | 6 | ... | 1222 | 1890 | 668 | 13 | 4 | 19 | 1 | 1 | 0 | 0 |
1996 | 1965 | 1 | 2.6 | 1 | 0 | 0 | 39 | 0.2 | 187 | 4 | ... | 915 | 1965 | 2032 | 11 | 10 | 16 | 1 | 1 | 1 | 2 |
1997 | 1911 | 0 | 0.9 | 1 | 1 | 1 | 36 | 0.7 | 108 | 8 | ... | 868 | 1632 | 3057 | 9 | 1 | 5 | 1 | 1 | 0 | 3 |
1998 | 1512 | 0 | 0.9 | 0 | 4 | 1 | 46 | 0.1 | 145 | 5 | ... | 336 | 670 | 869 | 18 | 10 | 19 | 1 | 1 | 1 | 0 |
1999 | 510 | 1 | 2.0 | 1 | 5 | 1 | 45 | 0.9 | 168 | 6 | ... | 483 | 754 | 3919 | 19 | 4 | 2 | 1 | 1 | 1 | 3 |
5 rows × 21 columns
df.info() #Summary of the data frame: column dtypes and non-null counts
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   battery_power  2000 non-null   int64
 1   blue           2000 non-null   int64
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64
 4   fc             2000 non-null   int64
 5   four_g         2000 non-null   int64
 6   int_memory     2000 non-null   int64
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64
 9   n_cores        2000 non-null   int64
 10  pc             2000 non-null   int64
 11  px_height      2000 non-null   int64
 12  px_width       2000 non-null   int64
 13  ram            2000 non-null   int64
 14  sc_h           2000 non-null   int64
 15  sc_w           2000 non-null   int64
 16  talk_time      2000 non-null   int64
 17  three_g        2000 non-null   int64
 18  touch_screen   2000 non-null   int64
 19  wifi           2000 non-null   int64
 20  price_range    2000 non-null   int64
dtypes: float64(2), int64(19)
memory usage: 328.3 KB
df.shape #Shape of the data, 2000 entries, 21 variables
(2000, 21)
Metadata of the data set, provided by the Kaggle user¶
Description of columns:
- battery_power: Battery capacity in mAh.
- blue: Has Bluetooth or not.
- clock_speed: Processor speed.
- dual_sim: Has dual SIM support or not.
- fc: Front camera megapixels.
- four_g: Has 4G or not.
- int_memory: Internal memory in GB.
- m_dep: Mobile depth in cm.
- mobile_wt: Weight in g.
- n_cores: Processor core count.
- pc: Primary camera megapixels.
- px_height: Pixel resolution height.
- px_width: Pixel resolution width.
- ram: RAM in MB.
- sc_h: Mobile screen height in cm.
- sc_w: Mobile screen width in cm.
- talk_time: Hours a single battery charge will last.
- three_g: Has 3G or not.
- touch_screen: Has a touch screen or not.
- wifi: Has WiFi or not.
- price_range: This is the target:
  - 0 = low cost
  - 1 = medium cost
  - 2 = high cost
  - 3 = very high cost
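Since price_range is the target, a quick check of its class balance is a useful first exploration step. The cell below is a small addition (not part of the original notebook); with 2000 rows and four classes, a perfectly balanced data set would have 500 rows per class.
#Check the class balance of the target variable.
print(df['price_range'].value_counts().sort_index())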
print('Start of Data Exploration Phase:')
Start of Data Exploration Phase:
print('Correlation heatmap of variables:')
#Only looking at numeric or binary values to find outliers and correlations of interest for data cleaning and refinement.
#Select columns with numeric values; columns with string values are omitted.
num_df = df.select_dtypes(include=[npy.number])
sn.set_palette("colorblind") #set_palette applies the palette globally; color_palette alone has no effect
if num_df.shape[1] >= 4:
    mpl.figure(figsize=(12,10))
    sn.heatmap(num_df.corr(), annot=True, fmt='.2f', cmap='rocket')
    mpl.title('Correlation Heatmap of Numeric Features')
    mpl.tight_layout()
    mpl.show()
else:
    print('Not enough numeric features for correlation analysis')
Correlation heatmap of variables:
The correlation heatmap is deployed to detect correlations between pairs of features, and between each feature and price_range. We deliberately bias this analysis toward the output variable of interest: since the target is price_range, we keep only the features whose correlation with it exceeds 0.10 and factor out the rest. Based on the Correlation Heatmap of Numeric Features, the features correlating most strongly with price_range are 'ram', 'px_height', 'px_width', and 'battery_power'. In addition to their relatively high correlation with price_range, 'px_width' and 'px_height' are fairly strongly correlated with each other, with a correlation coefficient of 0.51. For this study we will explore the relationship between these features and how they affect the resulting price_range.
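As a programmatic counterpart to reading the heatmap, the short cell below (an added sketch, not in the original notebook) applies the same 0.10 threshold to the correlations with price_range.
#Sketch: list features whose absolute correlation with price_range exceeds 0.10.
corr_with_target = num_df.corr()['price_range'].drop('price_range')
print(corr_with_target[corr_with_target.abs() > 0.10].sort_values(ascending=False))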
print("Sample Distribution of selected features:")
#Select the columns of interest and keep convenience handles for individual features.
selected = df[["price_range","battery_power","ram","px_height","px_width"]]
price = df["price_range"]
bat = df["battery_power"]
ram = df["ram"]
height = df["px_height"]
width = df["px_width"]
display(selected.describe())
sn.catplot(data = df, x="price_range", kind="box")
sn.catplot(data = df, x="battery_power", kind="box")
sn.catplot(data = df, x="ram", kind="box")
sn.catplot(data = df, x="px_height", kind="box")
sn.catplot(data = df, x="px_width", kind="box")
Sample Distribution of selected features:
price_range | battery_power | ram | px_height | px_width | |
---|---|---|---|---|---|
count | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 |
mean | 1.500000 | 1238.518500 | 2124.213000 | 645.108000 | 1251.515500 |
std | 1.118314 | 439.418206 | 1084.732044 | 443.780811 | 432.199447 |
min | 0.000000 | 501.000000 | 256.000000 | 0.000000 | 500.000000 |
25% | 0.750000 | 851.750000 | 1207.500000 | 282.750000 | 874.750000 |
50% | 1.500000 | 1226.000000 | 2146.500000 | 564.000000 | 1247.000000 |
75% | 2.250000 | 1615.250000 | 3064.500000 | 947.250000 | 1633.000000 |
max | 3.000000 | 1998.000000 | 3998.000000 | 1960.000000 | 1998.000000 |
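The box plots above are the visual check for outliers; the cell below is an added sketch (not part of the original notebook) that counts outliers numerically for the selected features, using the standard 1.5 x IQR whisker rule that box plots draw.
#Sketch: count values outside 1.5*IQR per selected feature (the box plot whisker rule).
feat = selected.drop(columns='price_range')
q1, q3 = feat.quantile(0.25), feat.quantile(0.75)
iqr = q3 - q1
print(((feat < q1 - 1.5*iqr) | (feat > q3 + 1.5*iqr)).sum())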
Pair plot with selected features: From the pair plot array produced below, there is only a subtle visual correlation between price_range and the ram feature; the other features show no noticeable visual correlation with price_range. Regardless, we include all of them in the pair plot to explore their weight in the model that will predict a mobile phone's price class.
print("Box Plots distributions of Features in respect to Price Range categories")
sn.catplot(data = df, x="price_range", y="battery_power", kind="box")
sn.catplot(data = df, x="price_range", y="ram", kind="box")
sn.catplot(data = df, x="price_range", y="px_height", kind="box")
sn.catplot(data = df, x="price_range", y="px_width", kind="box")
Box plot distributions of features with respect to price_range categories
#Dimensionality reduced to 4 features based on the exploration data aggregated.
reduce = df[['price_range', 'ram', 'px_width', 'px_height', 'battery_power']]
pairplot = sn.PairGrid(reduce)
pairplot.map_diag(sn.histplot)
pairplot.map_offdiag(sn.scatterplot) #map_offdiag keeps the scatter plots off the diagonal histograms
#Analyze the predictive power of each feature before constructing the ML model.
sn.lmplot(y="ram", x="price_range", data=df)
sn.lmplot(y="px_width", x="price_range", data=df)
sn.lmplot(y="px_height", x="price_range", data=df)
sn.lmplot(y="battery_power", x="price_range", data=df)
4 Feature Model Build¶
We will utilize the ram, px_width, px_height, and battery_power features to build the predictive/prescriptive model for mobile phone prices. The model will predict which of the 4 price_range classes a phone falls into, based on the 4 features explored and selected above. 80% of the data will be used to train the model and 20% will be used to test its accuracy.
#Further support the feature selection with a permutation feature importance plot.
x = df.drop('price_range', axis=1)
y = df['price_range']
model = rfc()
model.fit(x,y)
result = perm(model, x, y, n_repeats=20, random_state=42)
perm_importance=pd.Series(result.importances_mean, index=x.columns).sort_values(ascending=False)
perm_importance.plot(kind="bar", figsize=(10, 5))
mpl.title("Permutation Feature Importance")
mpl.show()
#Preparing the training and testing sets for the prescriptive model
x=reduce.drop('price_range', axis=1)
y=df['price_range']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print("Shape of Training Set:")
display(x_train.shape)
print("Shape of Testing Set:")
display(x_test.shape)
Shape of Training Set:
(1600, 4)
Shape of Testing Set:
(400, 4)
#Building the (deliberately biased) model with the 4 chosen features
modelx4 = rfc()
scores = cross_val_score(modelx4, x_train, y_train, cv=3)
print("Cross-Validation Summary:")
print("Cross-Validation Scores:", scores)
print("Average Cross-Validation Score:", scores.mean())
print("Range of Cross-Validation Scores:", (scores.max()-scores.min()))
modelx4.fit(x_train, y_train)
test_accuracy = modelx4.score(x_test, y_test)
y_pred = modelx4.predict(x_test)
print("Accuracy based on test set:", test_accuracy)
Cross-Validation Summary:
Cross-Validation Scores: [0.90636704 0.90619137 0.87804878]
Average Cross-Validation Score: 0.8968690637641035
Range of Cross-Validation Scores: 0.028318260710696985
Accuracy based on test set: 0.93
The model's mean cross-validation score is high, which indicates the model performs well, and the range of the cross-validation scores is small, so overall the cross-validation results support the 4-feature model's performance. The accuracy on the held-out test set further supports this, with an accuracy score above 90%.
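GridSearchCV was imported at the top of the notebook but never used; the cell below is an added sketch of how it could tune the random forest's hyperparameters. The parameter grid is an illustrative assumption, not part of the original analysis.
#Sketch: hyperparameter tuning with the GridSearchCV imported earlier; the grid values are illustrative.
param_grid = {'n_estimators': [100, 300], 'max_depth': [None, 10, 20]}
grid = GridSearchCV(rfc(random_state=42), param_grid, cv=3)
grid.fit(x_train, y_train)
print("Best parameters:", grid.best_params_)
print("Best cross-validation score:", grid.best_score_)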
print("Testing for strength and accuracy of model -")
#Confusion matrix of the Random Forest Classifier model with 4 features
cmx = confusion_matrix(y_test, y_pred)
mpl.figure(figsize=(10,8))
sn.heatmap(cmx, annot=True, fmt='d', cmap='rocket')
mpl.title('Confusion Matrix')
mpl.xlabel('Predicted')
mpl.ylabel('Actual')
mpl.show()
#Receiver Operating Characteristic (ROC) curves for the multi-class price_range model.
#Class meanings of price ranges: 0 - Low Cost, 1 - Middle Cost, 2 - High Cost, 3 - Premium Cost
def price_indicator(i): #Map each price_range code to its label, based on the metadata provided by the Kaggle user
    if i==0:
        return "Low Cost"
    if i==1:
        return "Middle Cost"
    if i==2:
        return "High Cost"
    if i==3:
        return "Premium Cost"
prices_c = sorted(y.unique())
y_test_bin = label_binarize(y_test, classes=prices_c)
n_classes = y_test_bin.shape[1]
y_score = modelx4.predict_proba(x_test)
mpl.figure(figsize=(12,10))
for i in range(n_classes):
    fpr, tpr, _ = roc_curve(y_test_bin[:,i], y_score[:,i])
    roc_auc = auc(fpr, tpr)
    mpl.plot(fpr, tpr, label=f'Class {price_indicator(prices_c[i])} (AUC = {roc_auc:.2f})')
mpl.plot([0,1], [0,1], 'k--')
mpl.xlabel('False Positive Rate')
mpl.ylabel('True Positive Rate')
mpl.title('ROC Curves for Price Range Classes')
mpl.legend(loc='lower right')
mpl.show()
Testing for strength and accuracy of model -
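As a closing check, the cell below is an added sketch (not part of the original notebook). It uses the classification_report imported at the top to show per-class precision, recall, and F1, and then demonstrates the prescriptive use described in the research goal by classifying one hypothetical phone; the specification values are illustrative.
#Sketch: per-class metrics using the classification_report imported earlier.
print(classification_report(y_test, y_pred, target_names=[price_indicator(c) for c in prices_c]))
#Sketch of prescriptive use: predict the price class of a hypothetical phone from its 4 features (illustrative values).
new_phone = pd.DataFrame([[3000, 1200, 700, 1500]], columns=['ram', 'px_width', 'px_height', 'battery_power'])
print("Predicted class:", price_indicator(modelx4.predict(new_phone)[0]))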