import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.corpus import wordnet
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from wordcloud import WordCloud
from textblob import TextBlob
import spacy
from collections import Counter
import warnings
warnings.filterwarnings("ignore")
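One setup note before loading the data: `nltk.word_tokenize`, the stop-word list, the WordNet lookups, and the lemmatizer all depend on NLTK corpora that must be downloaded once per environment. A minimal sketch:
# One-time downloads of the NLTK data used in this notebook.
nltk.download('punkt')      # tokenizer models for nltk.word_tokenize
nltk.download('stopwords')  # stop-word lists
nltk.download('wordnet')    # WordNet corpus for synsets and lemmatization
nltk.download('omw-1.4')    # Open Multilingual WordNet; some NLTK versions need it for WordNet access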
df = pd.read_csv("Train.csv")
df_test = pd.read_csv("Test.csv")
df_valid = pd.read_csv("Valid.csv")
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   text    40000 non-null  object
 1   label   40000 non-null  int64
dtypes: int64(1), object(1)
memory usage: 625.1+ KB
df.describe()
|       | label        |
|-------|--------------|
| count | 40000.000000 |
| mean  | 0.499525     |
| std   | 0.500006     |
| min   | 0.000000     |
| 25%   | 0.000000     |
| 50%   | 0.000000     |
| 75%   | 1.000000     |
| max   | 1.000000     |
df.isnull().sum()
text     0
label    0
dtype: int64
def label_dist(df):
    # Count and plot how many reviews fall into each class
    class_counts = pd.Series(df['label']).value_counts()
    print(class_counts)
    sns.countplot(x=df['label'])
    plt.xlabel('Labels')
    plt.ylabel('Count')
    plt.title('Initial Label Distribution')
    plt.show()
label_dist(df)
label
0    20019
1    19981
Name: count, dtype: int64
stop_words = set(stopwords.words('english'))
def word_cloud(df):
    word_corp = []
    for i in df.index:
        cleaned_text = re.sub(r'[^a-zA-Z\s]', '', df['text'][i])
        # Keep only tokens that WordNet recognises, i.e. dictionary words
        words = [word.lower() for word in nltk.word_tokenize(cleaned_text) if wordnet.synsets(word)]
        filt_words = [word for word in words if word not in stop_words]
        word_corp.extend(filt_words)
    word_frequencies = Counter(word_corp)
    # Select the top 100 words
    top_100_words = dict(word_frequencies.most_common(100))
    # Generate the word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(top_100_words)
    # Plot the word cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
word_cloud(df)
def scatter_plot(df):
    sentiment_score = [TextBlob(df['text'][i]).sentiment.polarity for i in df.index]
    text_len = [len(df['text'][i]) for i in df.index]
    df['sentiment_score'] = sentiment_score
    df['text_len'] = text_len
    sns.set(style="darkgrid")
    # Create the scatter plot
    sns.scatterplot(data=df, x='text_len', y='sentiment_score')
    # Set labels and title
    plt.xlabel('Text Length')
    plt.ylabel('Sentiment Score')
    plt.title('Relationship between Text Length and Sentiment Score')
    # Display the plot
    plt.show()
scatter_plot(df)
def sent_analysis_plot(df):
    # Map sentiment scores to sentiments (positive, negative, neutral) based on a threshold
    df['sentiment'] = pd.cut(df['sentiment_score'], bins=[-float('inf'), -0.1, 0.1, float('inf')], labels=['negative', 'neutral', 'positive'])
    # Plot the sentiment analysis using a countplot
    sns.countplot(x='sentiment', data=df)
    plt.title('Sentiment Analysis')
    plt.xlabel('Sentiment')
    plt.ylabel('Count')
    plt.show()
sent_analysis_plot(df)
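To make the binning concrete: `pd.cut` with edges at -0.1 and 0.1 assigns polarity of -0.1 or below to negative, scores in (-0.1, 0.1] to neutral, and anything above 0.1 to positive. A quick illustration on made-up scores:
# Hypothetical polarity scores, just to show how the thresholds bin them.
scores = pd.Series([-0.8, -0.05, 0.0, 0.3])
print(pd.cut(scores,
             bins=[-float('inf'), -0.1, 0.1, float('inf')],
             labels=['negative', 'neutral', 'positive']))
# Expected labels: negative, neutral, neutral, positive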
Defining a custom tokenizer for TF-IDF, which includes text cleaning, word tokenization, and lemmatization.
Here we use WordNet to keep only those tokens that have at least one synset, i.e. words with an actual dictionary meaning.
def custom_tokenizer(text):
    # Strip everything except letters, digits, and whitespace
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Keep only lower-cased tokens that WordNet recognises
    words = [word.lower() for word in nltk.word_tokenize(cleaned_text) if wordnet.synsets(word)]
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return lemmatized_words
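A quick sanity check on a made-up review shows what the tokenizer returns; the exact tokens depend on WordNet coverage:
# Illustrative call on hypothetical text, not part of the dataset.
print(custom_tokenizer("Two thrilling movies, absolutely wonderful performances!!"))
# Expected (roughly): ['two', 'thrilling', 'movie', 'absolutely', 'wonderful', 'performance']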
Initialize and fit the TF-IDF vectorizer, with stop words removed
tfidf = TfidfVectorizer(tokenizer=custom_tokenizer, stop_words=list(stop_words))
tfidf_matrix = tfidf.fit_transform(df['text'])
Save TFIDF vectorizer for future use
filename = 'fitted_tfidf_vectorizer.pkl'
with open(filename, 'wb') as file:
    pickle.dump(tfidf, file)
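One caveat: pickle stores only a reference to `custom_tokenizer`, not its code, so the function must be defined (or importable) in any session that reloads this vectorizer. A minimal sketch of reloading:
# Reloading in a later session; custom_tokenizer must already be defined there,
# or unpickling will fail with an AttributeError.
with open('fitted_tfidf_vectorizer.pkl', 'rb') as file:
    loaded_tfidf = pickle.load(file)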
feature_names = tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=feature_names)
tfidf_df
|       | 0   | 1       | 10  | 100 | 1000 | 10000 | 100000 | 1000000 | 1000000000 | 1000th | ... | zoological | zoologist | zoology | zoom | zoomed | zooming | zu  | zulu | zuni | zurich |
|-------|-----|---------|-----|-----|------|-------|--------|---------|------------|--------|-----|------------|-----------|---------|------|--------|---------|-----|------|------|--------|
| 0     | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0  | 0.0   | 0.0    | 0.0     | 0.0        | 0.0    | ... | 0.0        | 0.0       | 0.0     | 0.0  | 0.0    | 0.0     | 0.0 | 0.0  | 0.0  | 0.0    |
| 1     | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0  | 0.0   | 0.0    | 0.0     | 0.0        | 0.0    | ... | 0.0        | 0.0       | 0.0     | 0.0  | 0.0    | 0.0     | 0.0 | 0.0  | 0.0  | 0.0    |
| 2     | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0  | 0.0   | 0.0    | 0.0     | 0.0        | 0.0    | ... | 0.0        | 0.0       | 0.0     | 0.0  | 0.0    | 0.0     | 0.0 | 0.0  | 0.0  | 0.0    |
| 3     | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0  | 0.0   | 0.0    | 0.0     | 0.0        | 0.0    | ... | 0.0        | 0.0       | 0.0     | 0.0  | 0.0    | 0.0     | 0.0 | 0.0  | 0.0  | 0.0    |
| 4     | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0  | 0.0   | 0.0    | 0.0     | 0.0        | 0.0    | ... | 0.0        | 0.0       | 0.0     | 0.0  | 0.0    | 0.0     | 0.0 | 0.0  | 0.0  | 0.0    |
| ...   | ... | ...     | ... | ... | ...  | ...   | ...    | ...     | ...        | ...    | ... | ...        | ...       | ...     | ...  | ...    | ...     | ... | ...  | ...  | ...    |
| 39995 | 0.0 | 0.03188 | 0.0 | 0.0 | 0.0  | 0.0   | 0.0    | 0.0     | 0.0        | 0.0    | ... | 0.0        | 0.0       | 0.0     | 0.0  | 0.0    | 0.0     | 0.0 | 0.0  | 0.0  | 0.0    |
| 39996 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0  | 0.0   | 0.0    | 0.0     | 0.0        | 0.0    | ... | 0.0        | 0.0       | 0.0     | 0.0  | 0.0    | 0.0     | 0.0 | 0.0  | 0.0  | 0.0    |
| 39997 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0  | 0.0   | 0.0    | 0.0     | 0.0        | 0.0    | ... | 0.0        | 0.0       | 0.0     | 0.0  | 0.0    | 0.0     | 0.0 | 0.0  | 0.0  | 0.0    |
| 39998 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0  | 0.0   | 0.0    | 0.0     | 0.0        | 0.0    | ... | 0.0        | 0.0       | 0.0     | 0.0  | 0.0    | 0.0     | 0.0 | 0.0  | 0.0  | 0.0    |
| 39999 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0  | 0.0   | 0.0    | 0.0     | 0.0        | 0.0    | ... | 0.0        | 0.0       | 0.0     | 0.0  | 0.0    | 0.0     | 0.0 | 0.0  | 0.0  | 0.0    |

40000 rows × 39709 columns
Defining the dependent/independent variables and splitting into train and test sets
X = tfidf_matrix
y = df['label']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
SVC_classifier = SVC() # Use any classifier of your choice
SVC_classifier.fit(X_train, y_train)
SVC()
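A note on the model choice: `SVC` defaults to an RBF kernel, whose training time grows roughly quadratically with the number of samples, so 40,000 documents is near its practical limit. For high-dimensional sparse TF-IDF features, `LinearSVC` is a common, much faster alternative; a hedged sketch of the swap (not the model evaluated below):
# Optional alternative: a linear SVM, typically far faster to train on
# sparse TF-IDF features; all results below are for the RBF SVC above.
from sklearn.svm import LinearSVC

linear_clf = LinearSVC()
linear_clf.fit(X_train, y_train)
print("LinearSVC test accuracy:", linear_clf.score(X_test, y_test))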
Saving trained SVC model for future use
filename = "classification_model_SVC.pkl"
with open(filename, "wb") as file:
    pickle.dump(SVC_classifier, file)
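For completeness, a minimal inference sketch that reuses both saved artifacts on a hypothetical new review (file names match those pickled above; `custom_tokenizer` must be in scope when the vectorizer is unpickled):
# End-to-end inference with the saved vectorizer and model.
with open('fitted_tfidf_vectorizer.pkl', 'rb') as file:
    loaded_tfidf = pickle.load(file)
with open('classification_model_SVC.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

sample_review = ["An absolute masterpiece with stunning performances."]  # made-up example
features = loaded_tfidf.transform(sample_review)
print(loaded_model.predict(features))  # predicted label, 0 or 1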
Train Dataset
y_pred_train = SVC_classifier.predict(X_train)
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_train, y_pred_train)
report = classification_report(y_train, y_pred_train)
print("Accuracy for Train dataset:", accuracy)
print("Classification Report (Train datset):")
print(report)
Accuracy for Train dataset: 0.98846875
Classification Report (Train dataset):
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     16053
           1       0.99      0.99      0.99     15947

    accuracy                           0.99     32000
   macro avg       0.99      0.99      0.99     32000
weighted avg       0.99      0.99      0.99     32000
Test Dataset
y_pred = SVC_classifier.predict(X_test)
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(report)
Accuracy: 0.89375
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.89      0.89      3966
           1       0.89      0.90      0.90      4034

    accuracy                           0.89      8000
   macro avg       0.89      0.89      0.89      8000
weighted avg       0.89      0.89      0.89      8000
Validation Dataset
X_valid_data = df_valid['text']
y_valid_data = df_valid['label']
# Transform the validation text using the already-fitted TF-IDF vectorizer
valid_tfidf_matrix = tfidf.transform(X_valid_data)
valid_predictions = SVC_classifier.predict(valid_tfidf_matrix)
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_valid_data, valid_predictions)
report = classification_report(y_valid_data, valid_predictions)
print("Accuracy:", accuracy)
print("Classification Report:")
print(report)
Accuracy: 0.891
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      2486
           1       0.88      0.91      0.89      2514

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000
# Pipeline summary: tokenization -> lemmatization -> TF-IDF -> model