## Data Analysis Phase
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
## Display all the columns of the dataframe
pd.set_option('display.max_columns', None)
data = pd.read_csv("WineQT.csv")
data
 | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | Id
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 | 0 |
1 | 7.8 | 0.880 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.99680 | 3.20 | 0.68 | 9.8 | 5 | 1 |
2 | 7.8 | 0.760 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.99700 | 3.26 | 0.65 | 9.8 | 5 | 2 |
3 | 11.2 | 0.280 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.99800 | 3.16 | 0.58 | 9.8 | 6 | 3 |
4 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 | 4 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1138 | 6.3 | 0.510 | 0.13 | 2.3 | 0.076 | 29.0 | 40.0 | 0.99574 | 3.42 | 0.75 | 11.0 | 6 | 1592 |
1139 | 6.8 | 0.620 | 0.08 | 1.9 | 0.068 | 28.0 | 38.0 | 0.99651 | 3.42 | 0.82 | 9.5 | 6 | 1593 |
1140 | 6.2 | 0.600 | 0.08 | 2.0 | 0.090 | 32.0 | 44.0 | 0.99490 | 3.45 | 0.58 | 10.5 | 5 | 1594 |
1141 | 5.9 | 0.550 | 0.10 | 2.2 | 0.062 | 39.0 | 51.0 | 0.99512 | 3.52 | 0.76 | 11.2 | 6 | 1595 |
1142 | 5.9 | 0.645 | 0.12 | 2.0 | 0.075 | 32.0 | 44.0 | 0.99547 | 3.57 | 0.71 | 10.2 | 5 | 1597 |
1143 rows × 13 columns
data.drop('Id', axis =1, inplace=True)
data
 | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 |
1 | 7.8 | 0.880 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.99680 | 3.20 | 0.68 | 9.8 | 5 |
2 | 7.8 | 0.760 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.99700 | 3.26 | 0.65 | 9.8 | 5 |
3 | 11.2 | 0.280 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.99800 | 3.16 | 0.58 | 9.8 | 6 |
4 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 | 5 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1138 | 6.3 | 0.510 | 0.13 | 2.3 | 0.076 | 29.0 | 40.0 | 0.99574 | 3.42 | 0.75 | 11.0 | 6 |
1139 | 6.8 | 0.620 | 0.08 | 1.9 | 0.068 | 28.0 | 38.0 | 0.99651 | 3.42 | 0.82 | 9.5 | 6 |
1140 | 6.2 | 0.600 | 0.08 | 2.0 | 0.090 | 32.0 | 44.0 | 0.99490 | 3.45 | 0.58 | 10.5 | 5 |
1141 | 5.9 | 0.550 | 0.10 | 2.2 | 0.062 | 39.0 | 51.0 | 0.99512 | 3.52 | 0.76 | 11.2 | 6 |
1142 | 5.9 | 0.645 | 0.12 | 2.0 | 0.075 | 32.0 | 44.0 | 0.99547 | 3.57 | 0.71 | 10.2 | 5 |
1143 rows × 12 columns
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   fixed acidity         1143 non-null   float64
 1   volatile acidity      1143 non-null   float64
 2   citric acid           1143 non-null   float64
 3   residual sugar        1143 non-null   float64
 4   chlorides             1143 non-null   float64
 5   free sulfur dioxide   1143 non-null   float64
 6   total sulfur dioxide  1143 non-null   float64
 7   density               1143 non-null   float64
 8   pH                    1143 non-null   float64
 9   sulphates             1143 non-null   float64
 10  alcohol               1143 non-null   float64
 11  quality               1143 non-null   int64
dtypes: float64(11), int64(1)
memory usage: 107.3 KB
# Check for null values in each column
data.isnull().sum()
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64
data.describe()
 | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 | 1143.000000 |
mean | 8.311111 | 0.531339 | 0.268364 | 2.532152 | 0.086933 | 15.615486 | 45.914698 | 0.996730 | 3.311015 | 0.657708 | 10.442111 | 5.657043 |
std | 1.747595 | 0.179633 | 0.196686 | 1.355917 | 0.047267 | 10.250486 | 32.782130 | 0.001925 | 0.156664 | 0.170399 | 1.082196 | 0.805824 |
min | 4.600000 | 0.120000 | 0.000000 | 0.900000 | 0.012000 | 1.000000 | 6.000000 | 0.990070 | 2.740000 | 0.330000 | 8.400000 | 3.000000 |
25% | 7.100000 | 0.392500 | 0.090000 | 1.900000 | 0.070000 | 7.000000 | 21.000000 | 0.995570 | 3.205000 | 0.550000 | 9.500000 | 5.000000 |
50% | 7.900000 | 0.520000 | 0.250000 | 2.200000 | 0.079000 | 13.000000 | 37.000000 | 0.996680 | 3.310000 | 0.620000 | 10.200000 | 6.000000 |
75% | 9.100000 | 0.640000 | 0.420000 | 2.600000 | 0.090000 | 21.000000 | 61.000000 | 0.997845 | 3.400000 | 0.730000 | 11.100000 | 6.000000 |
max | 15.900000 | 1.580000 | 1.000000 | 15.500000 | 0.611000 | 68.000000 | 289.000000 | 1.003690 | 4.010000 | 2.000000 | 14.900000 | 8.000000 |
# Value counts
class_counts = data['quality'].value_counts()
print(class_counts)
quality
5    483
6    462
7    143
4     33
8     16
3      6
Name: count, dtype: int64
# Bar Plotting
sns.countplot(x=data['quality'])
plt.xlabel('Classes')
plt.ylabel('Count')
plt.title('Initial Class Distribution')
plt.show()
# Low range columns
plt.figure(figsize=(10,5))
sns.boxplot(data = data[['pH', 'sulphates', 'chlorides', 'density', 'volatile acidity', 'citric acid']])
plt.show()
# Medium range columns
plt.figure(figsize=(10,5))
sns.boxplot(data = data[['fixed acidity', 'residual sugar', 'alcohol']])
plt.show()
# High range columns
plt.figure(figsize=(10,5))
sns.boxplot(data = data[['free sulfur dioxide', 'total sulfur dioxide']])
plt.show()
As the box plots show, every group of columns contains outliers. These can be removed either with the z-score method or with the IQR (Inter-Quartile Range) lower/upper bound method; here we use IQR (a z-score sketch is included after the IQR pass below for comparison).
Before touching the original data, we first apply the outlier-removal function to a copy of the dataset: if removing outliers would eliminate an entire quality class, we will not remove them.
# Function that computes the IQR lower and upper bounds for a column
data2 = data.copy()
def out_rem(col, df):
    quantile1, quantile3 = np.percentile(df[col], [25, 75])
    iqr = quantile3 - quantile1
    lower_bound_val = float(quantile1 - (1.5 * iqr))
    upper_bound_val = float(quantile3 + (1.5 * iqr))
    return lower_bound_val, upper_bound_val
# Removing outliers
print("Initial shape", data2.shape)
for i in data2.columns:
lb, ub = out_rem(i,data2)
data2 = data2[(data2[i] >= lb) & (data2[i] <= ub)]
print("Final shape", data2.shape)
Initial shape (1143, 12)
Final shape (787, 12)
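For comparison, the z-score approach mentioned above keeps only rows whose standardized distance from the column mean is below a threshold (3 is a common choice and an assumption here); a minimal sketch, not used further in this notebook:
# Z-score alternative (sketch only, threshold of 3 assumed): keep rows within 3 standard deviations in every column
data_z = data.copy()
for col in data_z.columns:
    z = (data_z[col] - data_z[col].mean()) / data_z[col].std()
    data_z = data_z[z.abs() <= 3]
print("Shape after z-score filtering", data_z.shape)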
# Distribution of class after outlier removal
class_counts = data2['quality'].value_counts()
print(class_counts)
quality
5    345
6    336
7     88
4     18
Name: count, dtype: int64
# Distribution plot for each feature
for i in data.columns:
    sns.displot(data[i], kde=True, height=2, aspect=1.5)
plt.show()
Classes 3 and 8 vanish completely after outlier removal, so it is not advisable to remove outliers in this case; we continue with the original data.
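A quick way to confirm which quality classes the IQR filter dropped (given the counts above, this should report classes 3 and 8):
# Quality classes present in the original data but missing after IQR filtering
dropped_classes = sorted(set(data['quality'].unique()) - set(data2['quality'].unique()))
print("Classes removed by outlier filtering:", dropped_classes)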
# Calculating the correlation matrix
correlation_matrix = data.corr()
# Plotting the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix Heatmap')
plt.show()
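To read the heatmap numerically, the features can also be ranked by the strength of their correlation with quality; a small optional check:
# Features ordered by absolute correlation with the target
print(correlation_matrix['quality'].drop('quality').abs().sort_values(ascending=False))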
# Setting up the features and target
X = data.drop('quality', axis = 1)
y = data['quality']
X
 | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 |
1 | 7.8 | 0.880 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.99680 | 3.20 | 0.68 | 9.8 |
2 | 7.8 | 0.760 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.99700 | 3.26 | 0.65 | 9.8 |
3 | 11.2 | 0.280 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.99800 | 3.16 | 0.58 | 9.8 |
4 | 7.4 | 0.700 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.99780 | 3.51 | 0.56 | 9.4 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1138 | 6.3 | 0.510 | 0.13 | 2.3 | 0.076 | 29.0 | 40.0 | 0.99574 | 3.42 | 0.75 | 11.0 |
1139 | 6.8 | 0.620 | 0.08 | 1.9 | 0.068 | 28.0 | 38.0 | 0.99651 | 3.42 | 0.82 | 9.5 |
1140 | 6.2 | 0.600 | 0.08 | 2.0 | 0.090 | 32.0 | 44.0 | 0.99490 | 3.45 | 0.58 | 10.5 |
1141 | 5.9 | 0.550 | 0.10 | 2.2 | 0.062 | 39.0 | 51.0 | 0.99512 | 3.52 | 0.76 | 11.2 |
1142 | 5.9 | 0.645 | 0.12 | 2.0 | 0.075 | 32.0 | 44.0 | 0.99547 | 3.57 | 0.71 | 10.2 |
1143 rows × 11 columns
from sklearn.preprocessing import MinMaxScaler
# Create an instance of MinMaxScaler
scaler = MinMaxScaler()
# Select the columns to be scaled
columns_to_scale = X.columns
# Fit the scaler to the selected columns
scaler.fit(X[columns_to_scale])
# Transform the selected columns
X[columns_to_scale] = scaler.transform(X[columns_to_scale])
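MinMaxScaler rescales each column to the [0, 1] range as (x - min) / (max - min). A quick optional sanity check that the scaling worked (this should print 0.0 and 1.0):
# After MinMaxScaler every feature column should span [0, 1]
print(X.min().min(), X.max().max())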
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=52)
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler()
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)
X_train_res
 | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.539823 | 0.212329 | 0.39 | 0.089041 | 0.156928 | 0.104478 | 0.091873 | 0.626285 | 0.118110 | 0.101796 | 0.184615 |
1 | 0.477876 | 0.301370 | 0.24 | 0.089041 | 0.111853 | 0.268657 | 0.183746 | 0.662996 | 0.346457 | 0.137725 | 0.261538 |
2 | 0.141593 | 0.397260 | 0.15 | 0.287671 | 0.106845 | 0.179104 | 0.074205 | 0.451542 | 0.629921 | 0.161677 | 0.538462 |
3 | 0.734513 | 0.157534 | 0.49 | 0.335616 | 0.090150 | 0.059701 | 0.102473 | 0.831865 | 0.362205 | 0.197605 | 0.553846 |
4 | 0.203540 | 0.253425 | 0.19 | 0.054795 | 0.111853 | 0.179104 | 0.070671 | 0.396476 | 0.503937 | 0.185629 | 0.215385 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2299 | 0.035398 | 0.205479 | 0.24 | 0.075342 | 0.080134 | 0.268657 | 0.155477 | 0.119677 | 0.771654 | 0.245509 | 0.861538 |
2300 | 0.539823 | 0.157534 | 0.53 | 0.116438 | 0.096828 | 0.059701 | 0.035336 | 0.523495 | 0.322835 | 0.191617 | 0.400000 |
2301 | 0.539823 | 0.157534 | 0.53 | 0.116438 | 0.096828 | 0.059701 | 0.035336 | 0.523495 | 0.322835 | 0.191617 | 0.400000 |
2302 | 0.539823 | 0.157534 | 0.53 | 0.116438 | 0.096828 | 0.059701 | 0.035336 | 0.523495 | 0.322835 | 0.191617 | 0.400000 |
2303 | 0.292035 | 0.157534 | 0.46 | 0.184932 | 0.110184 | 0.208955 | 0.109541 | 0.530837 | 0.480315 | 0.317365 | 0.676923 |
2304 rows × 11 columns
# Checking for distribution of classes
# Value counts
class_counts = pd.Series(y_train_res).value_counts()
print(class_counts)
quality
5    384
6    384
7    384
4    384
8    384
3    384
Name: count, dtype: int64
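All six classes now match the size of the largest class in the training split. For reference, the pre-resampling counts can be printed the same way (optional check):
# Class counts in the training split before oversampling, for comparison
print(y_train.value_counts())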
# Bar Plotting
sns.countplot(x=y_train_res)
plt.xlabel('Classes')
plt.ylabel('Count')
plt.title('Class Distribution after resampling')
plt.show()
# Helper that fits the given model on the (resampled) training data
def process_data(model, X_train_res, y_train_res):
    model.fit(X_train_res, y_train_res)
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
def reports(model, X, y):
    # Predict on the given split and print a classification report, confusion matrix, and heatmap
    y_pred = model.predict(X)
    cm = confusion_matrix(y, y_pred)
    classes = np.unique(y)
    print("Classification report\n\n", classification_report(y, y_pred))
    print("\n\nConfusion matrix\n\n", cm)
    print("\n\nHeat map of the confusion matrix")
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', xticklabels=classes, yticklabels=classes)
    plt.title('Confusion Matrix Heatmap')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()
# Value counts for test data
class_counts = y_test.value_counts()
print(class_counts)
quality
5    99
6    89
7    31
4     5
8     3
3     2
Name: count, dtype: int64
mlpc = MLPClassifier(random_state = 50)
process_data(mlpc, X_train_res, y_train_res)
reports(mlpc, X_train_res, y_train_res)
ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Classification report

               precision    recall  f1-score   support

           3       0.95      1.00      0.98       384
           4       0.69      0.71      0.70       384
           5       0.59      0.58      0.58       384
           6       0.45      0.36      0.40       384
           7       0.64      0.59      0.61       384
           8       0.76      0.94      0.84       384

    accuracy                           0.69      2304
   macro avg       0.68      0.69      0.69      2304
weighted avg       0.68      0.69      0.69      2304


Confusion matrix

[[384   0   0   0   0   0]
 [  0 271  61  41  11   0]
 [ 11  67 221  59  22   4]
 [  5  44  91 138  72  34]
 [  3  11   0  70 226  74]
 [  0   0   0   0  23 361]]


Heat map of the confusion matrix
reports(mlpc, X_test, y_test)
UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Classification report

               precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00         5
           5       0.66      0.84      0.74        99
           6       0.60      0.60      0.60        89
           7       0.69      0.35      0.47        31
           8       0.00      0.00      0.00         3

    accuracy                           0.64       229
   macro avg       0.33      0.30      0.30       229
weighted avg       0.61      0.64      0.62       229


Confusion matrix

[[ 0  0  2  0  0  0]
 [ 0  0  4  1  0  0]
 [ 0  0 83 16  0  0]
 [ 0  0 32 53  4  0]
 [ 0  0  4 16 11  0]
 [ 0  0  0  2  1  0]]


Heat map of the confusion matrix
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty="l2", C=1)
ovr = OneVsRestClassifier(lr)
process_data(ovr, X_train_res, y_train_res)
reports(ovr, X_train_res, y_train_res)
Classification report

               precision    recall  f1-score   support

           3       0.73      1.00      0.84       384
           4       0.46      0.44      0.45       384
           5       0.48      0.52      0.50       384
           6       0.42      0.15      0.22       384
           7       0.60      0.39      0.48       384
           8       0.63      1.00      0.78       384

    accuracy                           0.58      2304
   macro avg       0.55      0.58      0.54      2304
weighted avg       0.55      0.58      0.54      2304


Confusion matrix

[[384   0   0   0   0   0]
 [ 75 170 104  24  11   0]
 [ 49  77 198  25  21  14]
 [ 14  92  93  58  67  60]
 [  4  30  20  31 151 148]
 [  0   0   0   0   0 384]]


Heat map of the confusion matrix
reports(ovr, X_test, y_test)
Classification report

               precision    recall  f1-score   support

           3       0.08      0.50      0.13         2
           4       0.04      0.40      0.07         5
           5       0.80      0.62      0.70        99
           6       0.64      0.16      0.25        89
           7       0.29      0.35      0.32        31
           8       0.07      0.67      0.13         3

    accuracy                           0.40       229
   macro avg       0.32      0.45      0.27       229
weighted avg       0.64      0.40      0.45       229


Confusion matrix

[[ 1  1  0  0  0  0]
 [ 1  2  0  0  1  1]
 [ 7 20 61  6  3  2]
 [ 3 25 15 14 22 10]
 [ 1  4  0  2 11 13]
 [ 0  0  0  0  1  2]]


Heat map of the confusion matrix
Source: vaibhavmalik05/Wine-dataset