#Preface: Data science machine learning project using Mobile Phone Pricing data acquired from Kaggle to predict the price class of a
#mobile phone based on its hardware specifications.
import warnings
import scipy as sp
import pandas as pd
import numpy as npy
import matplotlib.pyplot as mpl
import seaborn as sn
from sklearn.inspection import permutation_importance as perm
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import label_binarize
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
print('All libraries available, setup successful.')
All libraries available, setup successful.
#Define df as the working data set, read from the Kaggle data set folder.
df = pd.read_csv("Mobile Phone Pricing Dataset/Mobile Phone Pricing.csv", encoding='ascii', delimiter=',')
Research Goal¶
The goal of this project is to build, from the Mobile Phone Pricing data provided, a predictive model that can also act as a prescriptive model. The predictive model will forecast the price_range class of newer phone models from the same hardware specifications the model uses to determine price. The same model can double as a prescriptive tool to judge whether a given phone is fairly priced for its hardware specifications.
df.head() #Head of the data, first five rows
battery_power | blue | clock_speed | dual_sim | fc | four_g | int_memory | m_dep | mobile_wt | n_cores | ... | px_height | px_width | ram | sc_h | sc_w | talk_time | three_g | touch_screen | wifi | price_range | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 842 | 0 | 2.2 | 0 | 1 | 0 | 7 | 0.6 | 188 | 2 | ... | 20 | 756 | 2549 | 9 | 7 | 19 | 0 | 0 | 1 | 1 |
1 | 1021 | 1 | 0.5 | 1 | 0 | 1 | 53 | 0.7 | 136 | 3 | ... | 905 | 1988 | 2631 | 17 | 3 | 7 | 1 | 1 | 0 | 2 |
2 | 563 | 1 | 0.5 | 1 | 2 | 1 | 41 | 0.9 | 145 | 5 | ... | 1263 | 1716 | 2603 | 11 | 2 | 9 | 1 | 1 | 0 | 2 |
3 | 615 | 1 | 2.5 | 0 | 0 | 0 | 10 | 0.8 | 131 | 6 | ... | 1216 | 1786 | 2769 | 16 | 8 | 11 | 1 | 0 | 0 | 2 |
4 | 1821 | 1 | 1.2 | 0 | 13 | 1 | 44 | 0.6 | 141 | 2 | ... | 1208 | 1212 | 1411 | 8 | 2 | 15 | 1 | 1 | 0 | 1 |
5 rows × 21 columns
df.tail() #Tail of the data, last five rows
battery_power | blue | clock_speed | dual_sim | fc | four_g | int_memory | m_dep | mobile_wt | n_cores | ... | px_height | px_width | ram | sc_h | sc_w | talk_time | three_g | touch_screen | wifi | price_range | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1995 | 794 | 1 | 0.5 | 1 | 0 | 1 | 2 | 0.8 | 106 | 6 | ... | 1222 | 1890 | 668 | 13 | 4 | 19 | 1 | 1 | 0 | 0 |
1996 | 1965 | 1 | 2.6 | 1 | 0 | 0 | 39 | 0.2 | 187 | 4 | ... | 915 | 1965 | 2032 | 11 | 10 | 16 | 1 | 1 | 1 | 2 |
1997 | 1911 | 0 | 0.9 | 1 | 1 | 1 | 36 | 0.7 | 108 | 8 | ... | 868 | 1632 | 3057 | 9 | 1 | 5 | 1 | 1 | 0 | 3 |
1998 | 1512 | 0 | 0.9 | 0 | 4 | 1 | 46 | 0.1 | 145 | 5 | ... | 336 | 670 | 869 | 18 | 10 | 19 | 1 | 1 | 1 | 0 |
1999 | 510 | 1 | 2.0 | 1 | 5 | 1 | 45 | 0.9 | 168 | 6 | ... | 483 | 754 | 3919 | 19 | 4 | 2 | 1 | 1 | 1 | 3 |
5 rows × 21 columns
df.info() #Summary of the data frame: column dtypes and non-null counts
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   battery_power  2000 non-null   int64
 1   blue           2000 non-null   int64
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64
 4   fc             2000 non-null   int64
 5   four_g         2000 non-null   int64
 6   int_memory     2000 non-null   int64
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64
 9   n_cores        2000 non-null   int64
 10  pc             2000 non-null   int64
 11  px_height      2000 non-null   int64
 12  px_width       2000 non-null   int64
 13  ram            2000 non-null   int64
 14  sc_h           2000 non-null   int64
 15  sc_w           2000 non-null   int64
 16  talk_time      2000 non-null   int64
 17  three_g        2000 non-null   int64
 18  touch_screen   2000 non-null   int64
 19  wifi           2000 non-null   int64
 20  price_range    2000 non-null   int64
dtypes: float64(2), int64(19)
memory usage: 328.3 KB
df.shape #Shape of the data, 2000 entries, 21 variables
(2000, 21)
Metadata of the data set, provided by the Kaggle user¶
Description of columns:
- battery_power: Battery capacity in mAh.
- blue: Has Bluetooth or not.
- clock_speed: Processor speed.
- dual_sim: Has dual SIM support or not.
- fc: Front camera megapixels.
- four_g: Has 4G or not.
- int_memory: Internal memory in GB.
- m_dep: Mobile depth in cm.
- mobile_wt: Weight in g.
- n_cores: Processor core count.
- pc: Primary camera megapixels.
- px_height: Pixel resolution height.
- px_width: Pixel resolution width.
- ram: RAM in MB.
- sc_h: Mobile screen height in cm.
- sc_w: Mobile screen width in cm.
- talk_time: Hours a single battery charge will last.
- three_g: Has 3G or not.
- touch_screen: Has a touch screen or not.
- wifi: Has WiFi or not.
- price_range: This is the target:
  - 0 = low cost
  - 1 = medium cost
  - 2 = high cost
  - 3 = very high cost
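Since price_range is the target, a quick check of its class balance is a useful first exploration step. The cell below is a small addition (not part of the original notebook); with 2000 rows and four classes, a perfectly balanced data set would have 500 rows per class.
#Check the class balance of the target variable.
print(df['price_range'].value_counts().sort_index())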
print('Start of Data Exploration Phase:')
Start of Data Exploration Phase:
print('Correlation heatmap of variables:')
#Only looking at numeric or binary values to find outliers and correlations of interest for data cleaning and refinement.
#Select columns with numeric values; columns with string values are omitted.
num_df = df.select_dtypes(include=[npy.number])
sn.set_palette("colorblind") #set_palette applies the palette globally; color_palette alone has no effect
if num_df.shape[1] >= 4:
    mpl.figure(figsize=(12,10))
    sn.heatmap(num_df.corr(), annot=True, fmt='.2f', cmap='rocket')
    mpl.title('Correlation Heatmap of Numeric Features')
    mpl.tight_layout()
    mpl.show()
else:
    print('Not enough numeric features for correlation analysis')
Correlation heatmap of variables:
The correlation heatmap is deployed to detect correlations between pairs of features, and between each feature and price_range. We deliberately bias this analysis toward the output variable of interest: since the target is price_range, we keep only the features whose correlation with it exceeds 0.10 and factor out the rest. Based on the Correlation Heatmap of Numeric Features, the features correlating most strongly with price_range are 'ram', 'px_height', 'px_width', and 'battery_power'. In addition to their relatively high correlation with price_range, 'px_width' and 'px_height' are fairly strongly correlated with each other, with a correlation coefficient of 0.51. For this study we will explore the relationship between these features and how they affect the resulting price_range.
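As a programmatic counterpart to reading the heatmap, the short cell below (an added sketch, not in the original notebook) applies the same 0.10 threshold to the correlations with price_range.
#Sketch: list features whose absolute correlation with price_range exceeds 0.10.
corr_with_target = num_df.corr()['price_range'].drop('price_range')
print(corr_with_target[corr_with_target.abs() > 0.10].sort_values(ascending=False))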
print("Sample Distribution of selected features:")
#Select the columns of interest and keep convenience handles for individual features.
selected = df[["price_range","battery_power","ram","px_height","px_width"]]
price = df["price_range"]
bat = df["battery_power"]
ram = df["ram"]
height = df["px_height"]
width = df["px_width"]
display(selected.describe())
sn.catplot(data = df, x="price_range", kind="box")
sn.catplot(data = df, x="battery_power", kind="box")
sn.catplot(data = df, x="ram", kind="box")
sn.catplot(data = df, x="px_height", kind="box")
sn.catplot(data = df, x="px_width", kind="box")
Sample Distribution of selected features:
price_range | battery_power | ram | px_height | px_width | |
---|---|---|---|---|---|
count | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 | 2000.000000 |
mean | 1.500000 | 1238.518500 | 2124.213000 | 645.108000 | 1251.515500 |
std | 1.118314 | 439.418206 | 1084.732044 | 443.780811 | 432.199447 |
min | 0.000000 | 501.000000 | 256.000000 | 0.000000 | 500.000000 |
25% | 0.750000 | 851.750000 | 1207.500000 | 282.750000 | 874.750000 |
50% | 1.500000 | 1226.000000 | 2146.500000 | 564.000000 | 1247.000000 |
75% | 2.250000 | 1615.250000 | 3064.500000 | 947.250000 | 1633.000000 |
max | 3.000000 | 1998.000000 | 3998.000000 | 1960.000000 | 1998.000000 |
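The box plots above are the visual check for outliers; the cell below is an added sketch (not part of the original notebook) that counts outliers numerically for the selected features, using the standard 1.5 x IQR whisker rule that box plots draw.
#Sketch: count values outside 1.5*IQR per selected feature (the box plot whisker rule).
feat = selected.drop(columns='price_range')
q1, q3 = feat.quantile(0.25), feat.quantile(0.75)
iqr = q3 - q1
print(((feat < q1 - 1.5*iqr) | (feat > q3 + 1.5*iqr)).sum())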
Pair plot with selected features: From the pair plot array produced below, there is only a subtle visual correlation between price_range and the ram feature; the other features show no noticeable visual correlation with price_range. Regardless, we include all of them in the pair plot to explore their weight in the model that will predict a mobile phone's price class.
print("Box Plots distributions of Features in respect to Price Range categories")
sn.catplot(data = df, x="price_range", y="battery_power", kind="box")
sn.catplot(data = df, x="price_range", y="ram", kind="box")
sn.catplot(data = df, x="price_range", y="px_height", kind="box")
sn.catplot(data = df, x="price_range", y="px_width", kind="box")
Box plot distributions of features with respect to price_range categories
#Dimensionality reduced to 4 features based on the exploration data aggregated.
reduce = df[['price_range', 'ram', 'px_width', 'px_height', 'battery_power']]
pairplot = sn.PairGrid(reduce)
pairplot.map_diag(sn.histplot)
pairplot.map_offdiag(sn.scatterplot) #map_offdiag keeps the scatter plots off the diagonal histograms
#Analyze the predictive power of each feature before constructing the ML model.
sn.lmplot(y="ram", x="price_range", data=df)
sn.lmplot(y="px_width", x="price_range", data=df)
sn.lmplot(y="px_height", x="price_range", data=df)
sn.lmplot(y="battery_power", x="price_range", data=df)
4 Feature Model Build¶
We will utilize the ram, px_width, px_height, and battery_power features to build the predictive/prescriptive model for mobile phone prices. The model will predict which of the 4 price_range classes a phone falls into, based on the 4 features explored and selected above. 80% of the data will be used to train the model and 20% will be used to test its accuracy.
#Further support the feature selection with a permutation feature importance plot.
x = df.drop('price_range', axis=1)
y = df['price_range']
model = rfc()
model.fit(x,y)
result = perm(model, x, y, n_repeats=20, random_state=42)
perm_importance=pd.Series(result.importances_mean, index=x.columns).sort_values(ascending=False)
perm_importance.plot(kind="bar", figsize=(10, 5))
mpl.title("Permutation Feature Importance")
mpl.show()
#Preparing the training and testing sets for the prescriptive model
x=reduce.drop('price_range', axis=1)
y=df['price_range']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print("Shape of Training Set:")
display(x_train.shape)
print("Shape of Testing Set:")
display(x_test.shape)
Shape of Training Set:
(1600, 4)
Shape of Testing Set:
(400, 4)
#Building the (deliberately biased) model with the 4 chosen features
modelx4 = rfc()
scores = cross_val_score(modelx4, x_train, y_train, cv=3)
print("Cross-Validation Summary:")
print("Cross-Validation Scores:", scores)
print("Average Cross-Validation Score:", scores.mean())
print("Range of Cross-Validation Scores:", (scores.max()-scores.min()))
modelx4.fit(x_train, y_train)
test_accuracy = modelx4.score(x_test, y_test)
y_pred = modelx4.predict(x_test)
print("Accuracy based on test set:", test_accuracy)
Cross-Validation Summary:
Cross-Validation Scores: [0.90636704 0.90619137 0.87804878]
Average Cross-Validation Score: 0.8968690637641035
Range of Cross-Validation Scores: 0.028318260710696985
Accuracy based on test set: 0.93
The model's mean cross-validation score is high, which indicates the model performs well, and the range of the cross-validation scores is small, so overall the cross-validation results support the 4-feature model's performance. The accuracy on the held-out test set further supports this, with an accuracy score above 90%.
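GridSearchCV was imported at the top of the notebook but never used; the cell below is an added sketch of how it could tune the random forest's hyperparameters. The parameter grid is an illustrative assumption, not part of the original analysis.
#Sketch: hyperparameter tuning with the GridSearchCV imported earlier; the grid values are illustrative.
param_grid = {'n_estimators': [100, 300], 'max_depth': [None, 10, 20]}
grid = GridSearchCV(rfc(random_state=42), param_grid, cv=3)
grid.fit(x_train, y_train)
print("Best parameters:", grid.best_params_)
print("Best cross-validation score:", grid.best_score_)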
print("Testing for strength and accuracy of model -")
#Confusion matrix of the Random Forest Classifier model with 4 features
cmx = confusion_matrix(y_test, y_pred)
mpl.figure(figsize=(10,8))
sn.heatmap(cmx, annot=True, fmt='d', cmap='rocket')
mpl.title('Confusion Matrix')
mpl.xlabel('Predicted')
mpl.ylabel('Actual')
mpl.show()
#Receiver Operating Characteristic (ROC) curves for the multi-class price_range model.
#Class meanings of price ranges: 0 - Low Cost, 1 - Middle Cost, 2 - High Cost, 3 - Premium Cost
def price_indicator(i): #Map each price_range code to its label, based on the metadata provided by the Kaggle user
    if i==0:
        return "Low Cost"
    if i==1:
        return "Middle Cost"
    if i==2:
        return "High Cost"
    if i==3:
        return "Premium Cost"
prices_c = sorted(y.unique())
y_test_bin = label_binarize(y_test, classes=prices_c)
n_classes = y_test_bin.shape[1]
y_score = modelx4.predict_proba(x_test)
mpl.figure(figsize=(12,10))
for i in range(n_classes):
    fpr, tpr, _ = roc_curve(y_test_bin[:,i], y_score[:,i])
    roc_auc = auc(fpr, tpr)
    mpl.plot(fpr, tpr, label=f'Class {price_indicator(prices_c[i])} (AUC = {roc_auc:.2f})')
mpl.plot([0,1], [0,1], 'k--')
mpl.xlabel('False Positive Rate')
mpl.ylabel('True Positive Rate')
mpl.title('ROC Curves for Price Range Classes')
mpl.legend(loc='lower right')
mpl.show()
Testing for strength and accuracy of model -
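As a closing check, the cell below is an added sketch (not part of the original notebook). It uses the classification_report imported at the top to show per-class precision, recall, and F1, and then demonstrates the prescriptive use described in the research goal by classifying one hypothetical phone; the specification values are illustrative.
#Sketch: per-class metrics using the classification_report imported earlier.
print(classification_report(y_test, y_pred, target_names=[price_indicator(c) for c in prices_c]))
#Sketch of prescriptive use: predict the price class of a hypothetical phone from its 4 features (illustrative values).
new_phone = pd.DataFrame([[3000, 1200, 700, 1500]], columns=['ram', 'px_width', 'px_height', 'battery_power'])
print("Predicted class:", price_indicator(modelx4.predict(new_phone)[0]))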