## Data Analysis Phase
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
## Display all the columns of the dataframe
pd.set_option('display.max_columns', None)
data = pd.read_csv("WineQT.csv")
data
 | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | Id
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 | 0 |
1 | 7.8 | 0.880 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.99680 | 3.20 | 0.68 | 9.8 | 5 | 1 |
2 | 7.8 | 0.760 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.99700 | 3.26 | 0.65 | 9.8 | 5 | 2 |
3 | 11.2 | 0.280 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.99800 | 3.16 | 0.58 | 9.8 | 6 | 3 |
4 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 | 4 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1138 | 6.3 | 0.510 | 0.13 | 2.3 | 0.076 | 29.0 | 40.0 | 0.99574 | 3.42 | 0.75 | 11.0 | 6 | 1592 |
1139 | 6.8 | 0.620 | 0.08 | 1.9 | 0.068 | 28.0 | 38.0 | 0.99651 | 3.42 | 0.82 | 9.5 | 6 | 1593 |
1140 | 6.2 | 0.600 | 0.08 | 2.0 | 0.090 | 32.0 | 44.0 | 0.99490 | 3.45 | 0.58 | 10.5 | 5 | 1594 |
1141 | 5.9 | 0.550 | 0.10 | 2.2 | 0.062 | 39.0 | 51.0 | 0.99512 | 3.52 | 0.76 | 11.2 | 6 | 1595 |
1142 | 5.9 | 0.645 | 0.12 | 2.0 | 0.075 | 32.0 | 44.0 | 0.99547 | 3.57 | 0.71 | 10.2 | 5 | 1597 |
1143 rows × 13 columns
data.drop('Id', axis =1, inplace=True)
data
 | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 |
1 | 7.8 | 0.880 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.99680 | 3.20 | 0.68 | 9.8 | 5 |
2 | 7.8 | 0.760 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.99700 | 3.26 | 0.65 | 9.8 | 5 |
3 | 11.2 | 0.280 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.99800 | 3.16 | 0.58 | 9.8 | 6 |
4 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1138 | 6.3 | 0.510 | 0.13 | 2.3 | 0.076 | 29.0 | 40.0 | 0.99574 | 3.42 | 0.75 | 11.0 | 6 |
1139 | 6.8 | 0.620 | 0.08 | 1.9 | 0.068 | 28.0 | 38.0 | 0.99651 | 3.42 | 0.82 | 9.5 | 6 |
1140 | 6.2 | 0.600 | 0.08 | 2.0 | 0.090 | 32.0 | 44.0 | 0.99490 | 3.45 | 0.58 | 10.5 | 5 |
1141 | 5.9 | 0.550 | 0.10 | 2.2 | 0.062 | 39.0 | 51.0 | 0.99512 | 3.52 | 0.76 | 11.2 | 6 |
1142 | 5.9 | 0.645 | 0.12 | 2.0 | 0.075 | 32.0 | 44.0 | 0.99547 | 3.57 | 0.71 | 10.2 | 5 |
1143 rows × 12 columns
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   fixed acidity         1143 non-null   float64
 1   volatile acidity      1143 non-null   float64
 2   citric acid           1143 non-null   float64
 3   residual sugar        1143 non-null   float64
 4   chlorides             1143 non-null   float64
 5   free sulfur dioxide   1143 non-null   float64
 6   total sulfur dioxide  1143 non-null   float64
 7   density               1143 non-null   float64
 8   pH                    1143 non-null   float64
 9   sulphates             1143 non-null   float64
 10  alcohol               1143 non-null   float64
 11  quality               1143 non-null   int64
dtypes: float64(11), int64(1)
memory usage: 107.3 KB
# Check for null values in each column
data.isnull().sum()
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64
data.describe()
 | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 |
mean | 8.311111 | 0.531339 | 0.268364 | 2.532152 | 0.086933 | 15.615486 | 45.914698 | 0.996730 | 3.311015 | 0.657708 | 10.442111 | 5.657043 |
std | 1.747595 | 0.179633 | 0.196686 | 1.355917 | 0.047267 | 10.250486 | 32.782130 | 0.001925 | 0.156664 | 0.170399 | 1.082196 | 0.805824 |
min | 4.600000 | 0.120000 | 0.000000 | 0.900000 | 0.012000 | 1.000000 | 6.000000 | 0.990070 | 2.740000 | 0.330000 | 8.400000 | 3.000000 |
25% | 7.100000 | 0.392500 | 0.090000 | 1.900000 | 0.070000 | 7.000000 | 21.000000 | 0.995570 | 3.205000 | 0.550000 | 9.500000 | 5.000000 |
50% | 7.900000 | 0.520000 | 0.250000 | 2.200000 | 0.079000 | 13.000000 | 37.000000 | 0.996680 | 3.310000 | 0.620000 | 10.200000 | 6.000000 |
75% | 9.100000 | 0.640000 | 0.420000 | 2.600000 | 0.090000 | 21.000000 | 61.000000 | 0.997845 | 3.400000 | 0.730000 | 11.100000 | 6.000000 |
max | 15.900000 | 1.580000 | 1.000000 | 15.500000 | 0.611000 | 68.000000 | 289.000000 | 1.003690 | 4.010000 | 2.000000 | 14.900000 | 8.000000 |
# Value counts
class_counts = data['quality'].value_counts()
print(class_counts)
quality
5    483
6    462
7    143
4     33
8     16
3      6
Name: count, dtype: int64
# Bar Plotting
sns.countplot(x=data['quality'])
plt.xlabel('Classes')
plt.ylabel('Count')
plt.title('Initial Class Distribution')
plt.show()
# Low range columns
plt.figure(figsize=(10,5))
sns.boxplot(data = data[['pH', 'sulphates', 'chlorides', 'density', 'volatile acidity', 'citric acid']])
plt.show()
# Medium range columns
plt.figure(figsize=(10,5))
sns.boxplot(data = data[['fixed acidity', 'residual sugar', 'alcohol']])
plt.show()
# High range columns
plt.figure(figsize=(10,5))
sns.boxplot(data = data[['free sulfur dioxide', 'total sulfur dioxide']])
plt.show()
As the box plots show, every group of columns contains outliers. These can be removed either with the z-score method or with the IQR (Inter-Quartile Range) lower/upper bound method; here we use IQR (a z-score sketch is included after the IQR pass below for comparison).
Before touching the original data, we first apply the outlier-removal function to a copy of the dataset: if removing outliers would eliminate an entire quality class, we will not remove them.
# Function that computes the IQR lower and upper bounds for a column
data2 = data.copy()
def out_rem(col, df):
    quantile1, quantile3 = np.percentile(df[col], [25, 75])
    iqr = quantile3 - quantile1
    lower_bound_val = float(quantile1 - (1.5 * iqr))
    upper_bound_val = float(quantile3 + (1.5 * iqr))
    return lower_bound_val, upper_bound_val
# Removing outliers
print("Initial shape", data2.shape)
for i in data2.columns:
lb, ub = out_rem(i,data2)
data2 = data2[(data2[i] >= lb) & (data2[i] <= ub)]
print("Final shape", data2.shape)
Initial shape (1143, 12)
Final shape (787, 12)
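For comparison, the z-score approach mentioned above keeps only rows whose standardized distance from the column mean is below a threshold (3 is a common choice and an assumption here); a minimal sketch, not used further in this notebook:
# Z-score alternative (sketch only, threshold of 3 assumed): keep rows within 3 standard deviations in every column
data_z = data.copy()
for col in data_z.columns:
    z = (data_z[col] - data_z[col].mean()) / data_z[col].std()
    data_z = data_z[z.abs() <= 3]
print("Shape after z-score filtering", data_z.shape)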
# Distribution of class after outlier removal
class_counts = data2['quality'].value_counts()
print(class_counts)
quality
5    345
6    336
7     88
4     18
Name: count, dtype: int64
# Distribution plot for each feature
for i in data.columns:
    sns.displot(data[i], kde=True, height=2, aspect=1.5)
plt.show()
Classes 3 and 8 vanish completely after outlier removal, so it is not advisable to remove outliers in this case; we continue with the original data.
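A quick way to confirm which quality classes the IQR filter dropped (given the counts above, this should report classes 3 and 8):
# Quality classes present in the original data but missing after IQR filtering
dropped_classes = sorted(set(data['quality'].unique()) - set(data2['quality'].unique()))
print("Classes removed by outlier filtering:", dropped_classes)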
# Calculating the correlation matrix
correlation_matrix = data.corr()
# Plotting the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix Heatmap')
plt.show()
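To read the heatmap numerically, the features can also be ranked by the strength of their correlation with quality; a small optional check:
# Features ordered by absolute correlation with the target
print(correlation_matrix['quality'].drop('quality').abs().sort_values(ascending=False))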
# Setting up the features and target
X = data.drop('quality', axis = 1)
y = data['quality']
X
 | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 |
1 | 7.8 | 0.880 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.99680 | 3.20 | 0.68 | 9.8 |
2 | 7.8 | 0.760 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.99700 | 3.26 | 0.65 | 9.8 |
3 | 11.2 | 0.280 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.99800 | 3.16 | 0.58 | 9.8 |
4 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1138 | 6.3 | 0.510 | 0.13 | 2.3 | 0.076 | 29.0 | 40.0 | 0.99574 | 3.42 | 0.75 | 11.0 |
1139 | 6.8 | 0.620 | 0.08 | 1.9 | 0.068 | 28.0 | 38.0 | 0.99651 | 3.42 | 0.82 | 9.5 |
1140 | 6.2 | 0.600 | 0.08 | 2.0 | 0.090 | 32.0 | 44.0 | 0.99490 | 3.45 | 0.58 | 10.5 |
1141 | 5.9 | 0.550 | 0.10 | 2.2 | 0.062 | 39.0 | 51.0 | 0.99512 | 3.52 | 0.76 | 11.2 |
1142 | 5.9 | 0.645 | 0.12 | 2.0 | 0.075 | 32.0 | 44.0 | 0.99547 | 3.57 | 0.71 | 10.2 |
1143 rows × 11 columns
from sklearn.preprocessing import MinMaxScaler
# Create an instance of MinMaxScaler
scaler = MinMaxScaler()
# Select the columns to be scaled
columns_to_scale = X.columns
# Fit the scaler to the selected columns
scaler.fit(X[columns_to_scale])
# Transform the selected columns
X[columns_to_scale] = scaler.transform(X[columns_to_scale])
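MinMaxScaler rescales each column to the [0, 1] range as (x - min) / (max - min). A quick optional sanity check that the scaling worked (this should print 0.0 and 1.0):
# After MinMaxScaler every feature column should span [0, 1]
print(X.min().min(), X.max().max())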
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=52)
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler()
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)
X_train_res
 | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.539823 | 0.212329 | 0.39 | 0.089041 | 0.156928 | 0.104478 | 0.091873 | 0.626285 | 0.118110 | 0.101796 | 0.184615 |
1 | 0.477876 | 0.301370 | 0.24 | 0.089041 | 0.111853 | 0.268657 | 0.183746 | 0.662996 | 0.346457 | 0.137725 | 0.261538 |
2 | 0.141593 | 0.397260 | 0.15 | 0.287671 | 0.106845 | 0.179104 | 0.074205 | 0.451542 | 0.629921 | 0.161677 | 0.538462 |
3 | 0.734513 | 0.157534 | 0.49 | 0.335616 | 0.090150 | 0.059701 | 0.102473 | 0.831865 | 0.362205 | 0.197605 | 0.553846 |
4 | 0.203540 | 0.253425 | 0.19 | 0.054795 | 0.111853 | 0.179104 | 0.070671 | 0.396476 | 0.503937 | 0.185629 | 0.215385 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2299 | 0.035398 | 0.205479 | 0.24 | 0.075342 | 0.080134 | 0.268657 | 0.155477 | 0.119677 | 0.771654 | 0.245509 | 0.861538 |
2300 | 0.539823 | 0.157534 | 0.53 | 0.116438 | 0.096828 | 0.059701 | 0.035336 | 0.523495 | 0.322835 | 0.191617 | 0.400000 |
2301 | 0.539823 | 0.157534 | 0.53 | 0.116438 | 0.096828 | 0.059701 | 0.035336 | 0.523495 | 0.322835 | 0.191617 | 0.400000 |
2302 | 0.539823 | 0.157534 | 0.53 | 0.116438 | 0.096828 | 0.059701 | 0.035336 | 0.523495 | 0.322835 | 0.191617 | 0.400000 |
2303 | 0.292035 | 0.157534 | 0.46 | 0.184932 | 0.110184 | 0.208955 | 0.109541 | 0.530837 | 0.480315 | 0.317365 | 0.676923 |
2304 rows × 11 columns
# Checking for distribution of classes
# Value counts
class_counts = pd.Series(y_train_res).value_counts()
print(class_counts)
quality
5    384
6    384
7    384
4    384
8    384
3    384
Name: count, dtype: int64
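All six classes now match the size of the largest class in the training split. For reference, the pre-resampling counts can be printed the same way (optional check):
# Class counts in the training split before oversampling, for comparison
print(y_train.value_counts())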
# Bar Plotting
sns.countplot(x=y_train_res)
plt.xlabel('Classes')
plt.ylabel('Count')
plt.title('Class Distribution after resampling')
plt.show()
# Helper that fits the given model on the (resampled) training data
def process_data(model, X_train_res, y_train_res):
    model.fit(X_train_res, y_train_res)
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
def reports(model, X, y):
    # Predict on the given split and print a classification report, confusion matrix, and heatmap
    y_pred = model.predict(X)
    cm = confusion_matrix(y, y_pred)
    classes = np.unique(y)
    print("Classification report\n\n", classification_report(y, y_pred))
    print("\n\nConfusion matrix\n\n", cm)
    print("\n\nHeat map of the confusion matrix")
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', xticklabels=classes, yticklabels=classes)
    plt.title('Confusion Matrix Heatmap')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()
# Value counts for test data
class_counts = y_test.value_counts()
print(class_counts)
quality
5    99
6    89
7    31
4     5
8     3
3     2
Name: count, dtype: int64
mlpc = MLPClassifier(random_state = 50)
process_data(mlpc, X_train_res, y_train_res)
reports(mlpc, X_train_res, y_train_res)
ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Classification report

               precision    recall  f1-score   support

           3       0.95      1.00      0.98       384
           4       0.69      0.71      0.70       384
           5       0.59      0.58      0.58       384
           6       0.45      0.36      0.40       384
           7       0.64      0.59      0.61       384
           8       0.76      0.94      0.84       384

    accuracy                           0.69      2304
   macro avg       0.68      0.69      0.69      2304
weighted avg       0.68      0.69      0.69      2304


Confusion matrix

[[384   0   0   0   0   0]
 [  0 271  61  41  11   0]
 [ 11  67 221  59  22   4]
 [  5  44  91 138  72  34]
 [  3  11   0  70 226  74]
 [  0   0   0   0  23 361]]


Heat map of the confusion matrix
reports(mlpc, X_test, y_test)
UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Classification report

               precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00         5
           5       0.66      0.84      0.74        99
           6       0.60      0.60      0.60        89
           7       0.69      0.35      0.47        31
           8       0.00      0.00      0.00         3

    accuracy                           0.64       229
   macro avg       0.33      0.30      0.30       229
weighted avg       0.61      0.64      0.62       229


Confusion matrix

[[ 0  0  2  0  0  0]
 [ 0  0  4  1  0  0]
 [ 0  0 83 16  0  0]
 [ 0  0 32 53  4  0]
 [ 0  0  4 16 11  0]
 [ 0  0  0  2  1  0]]


Heat map of the confusion matrix
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty="l2", C=1)
ovr = OneVsRestClassifier(lr)
process_data(ovr, X_train_res, y_train_res)
reports(ovr, X_train_res, y_train_res)
Classification report

               precision    recall  f1-score   support

           3       0.73      1.00      0.84       384
           4       0.46      0.44      0.45       384
           5       0.48      0.52      0.50       384
           6       0.42      0.15      0.22       384
           7       0.60      0.39      0.48       384
           8       0.63      1.00      0.78       384

    accuracy                           0.58      2304
   macro avg       0.55      0.58      0.54      2304
weighted avg       0.55      0.58      0.54      2304


Confusion matrix

[[384   0   0   0   0   0]
 [ 75 170 104  24  11   0]
 [ 49  77 198  25  21  14]
 [ 14  92  93  58  67  60]
 [  4  30  20  31 151 148]
 [  0   0   0   0   0 384]]


Heat map of the confusion matrix
reports(ovr, X_test, y_test)
Classification report

               precision    recall  f1-score   support

           3       0.08      0.50      0.13         2
           4       0.04      0.40      0.07         5
           5       0.80      0.62      0.70        99
           6       0.64      0.16      0.25        89
           7       0.29      0.35      0.32        31
           8       0.07      0.67      0.13         3

    accuracy                           0.40       229
   macro avg       0.32      0.45      0.27       229
weighted avg       0.64      0.40      0.45       229


Confusion matrix

[[ 1  1  0  0  0  0]
 [ 1  2  0  0  1  1]
 [ 7 20 61  6  3  2]
 [ 3 25 15 14 22 10]
 [ 1  4  0  2 11 13]
 [ 0  0  0  0  1  2]]


Heat map of the confusion matrix
Source: vaibhavmalik05/Wine-dataset