import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.corpus import wordnet
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from wordcloud import WordCloud
from textblob import TextBlob
import spacy
from collections import Counter
import warnings
warnings.filterwarnings("ignore")
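One setup note before loading the data: `nltk.word_tokenize`, the stop-word list, the WordNet lookups, and the lemmatizer all depend on NLTK corpora that must be downloaded once per environment. A minimal sketch:
# One-time downloads of the NLTK data used in this notebook.
nltk.download('punkt')      # tokenizer models for nltk.word_tokenize
nltk.download('stopwords')  # stop-word lists
nltk.download('wordnet')    # WordNet corpus for synsets and lemmatization
nltk.download('omw-1.4')    # Open Multilingual WordNet; some NLTK versions need it for WordNet access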
df = pd.read_csv("Train.csv")
df_test = pd.read_csv("Test.csv")
df_valid = pd.read_csv("Valid.csv")
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   text    40000 non-null  object
 1   label   40000 non-null  int64
dtypes: int64(1), object(1)
memory usage: 625.1+ KB
df.describe()
|       | label        |
|-------|--------------|
| count | 40000.000000 |
| mean  | 0.499525     |
| std   | 0.500006     |
| min   | 0.000000     |
| 25%   | 0.000000     |
| 50%   | 0.000000     |
| 75%   | 1.000000     |
| max   | 1.000000     |
df.isnull().sum()
text     0
label    0
dtype: int64
def label_dist(df):
    # Count and plot how many reviews fall into each class
    class_counts = pd.Series(df['label']).value_counts()
    print(class_counts)
    sns.countplot(x=df['label'])
    plt.xlabel('Labels')
    plt.ylabel('Count')
    plt.title('Initial Label Distribution')
    plt.show()
label_dist(df)
label
0    20019
1    19981
Name: count, dtype: int64
stop_words = set(stopwords.words('english'))
def word_cloud(df):
    word_corp = []
    for i in df.index:
        cleaned_text = re.sub(r'[^a-zA-Z\s]', '', df['text'][i])
        # Keep only tokens that WordNet recognises, i.e. dictionary words
        words = [word.lower() for word in nltk.word_tokenize(cleaned_text) if wordnet.synsets(word)]
        filt_words = [word for word in words if word not in stop_words]
        word_corp.extend(filt_words)
    word_frequencies = Counter(word_corp)
    # Select the top 100 words
    top_100_words = dict(word_frequencies.most_common(100))
    # Generate the word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(top_100_words)
    # Plot the word cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
word_cloud(df)
def scatter_plot(df):
    sentiment_score = [TextBlob(df['text'][i]).sentiment.polarity for i in df.index]
    text_len = [len(df['text'][i]) for i in df.index]
    df['sentiment_score'] = sentiment_score
    df['text_len'] = text_len
    sns.set(style="darkgrid")
    # Create the scatter plot
    sns.scatterplot(data=df, x='text_len', y='sentiment_score')
    # Set labels and title
    plt.xlabel('Text Length')
    plt.ylabel('Sentiment Score')
    plt.title('Relationship between Text Length and Sentiment Score')
    # Display the plot
    plt.show()
scatter_plot(df)
def sent_analysis_plot(df):
    # Map sentiment scores to sentiments (positive, negative, neutral) based on a threshold
    df['sentiment'] = pd.cut(df['sentiment_score'], bins=[-float('inf'), -0.1, 0.1, float('inf')], labels=['negative', 'neutral', 'positive'])
    # Plot the sentiment analysis using a countplot
    sns.countplot(x='sentiment', data=df)
    plt.title('Sentiment Analysis')
    plt.xlabel('Sentiment')
    plt.ylabel('Count')
    plt.show()
sent_analysis_plot(df)
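To make the binning concrete: `pd.cut` with edges at -0.1 and 0.1 assigns polarity of -0.1 or below to negative, scores in (-0.1, 0.1] to neutral, and anything above 0.1 to positive. A quick illustration on made-up scores:
# Hypothetical polarity scores, just to show how the thresholds bin them.
scores = pd.Series([-0.8, -0.05, 0.0, 0.3])
print(pd.cut(scores,
             bins=[-float('inf'), -0.1, 0.1, float('inf')],
             labels=['negative', 'neutral', 'positive']))
# Expected labels: negative, neutral, neutral, positive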
Defining a custom tokenizer for TF-IDF, which includes text cleaning, word tokenization, and lemmatization.
Here we use WordNet to keep only those tokens that have at least one synset, i.e. words with an actual dictionary meaning.
def custom_tokenizer(text):
    # Strip everything except letters, digits, and whitespace
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Keep only lower-cased tokens that WordNet recognises
    words = [word.lower() for word in nltk.word_tokenize(cleaned_text) if wordnet.synsets(word)]
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return lemmatized_words
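A quick sanity check on a made-up review shows what the tokenizer returns; the exact tokens depend on WordNet coverage:
# Illustrative call on hypothetical text, not part of the dataset.
print(custom_tokenizer("Two thrilling movies, absolutely wonderful performances!!"))
# Expected (roughly): ['two', 'thrilling', 'movie', 'absolutely', 'wonderful', 'performance']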
Initialize and fit the TF-IDF vectorizer, with stop words removed
tfidf = TfidfVectorizer(tokenizer=custom_tokenizer, stop_words=list(stop_words))
tfidf_matrix = tfidf.fit_transform(df['text'])
Save TFIDF vectorizer for future use
filename = 'fitted_tfidf_vectorizer.pkl'
with open(filename, 'wb') as file:
    pickle.dump(tfidf, file)
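One caveat: pickle stores only a reference to `custom_tokenizer`, not its code, so the function must be defined (or importable) in any session that reloads this vectorizer. A minimal sketch of reloading:
# Reloading in a later session; custom_tokenizer must already be defined there,
# or unpickling will fail with an AttributeError.
with open('fitted_tfidf_vectorizer.pkl', 'rb') as file:
    loaded_tfidf = pickle.load(file)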
feature_names = tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=feature_names)
tfidf_df
|       | 0   | 1       | 10  | 100 | 1000 | 10000 | 100000 | 1000000 | 1000000000 | 1000th | ... | zoological | zoologist | zoology | zoom | zoomed | zooming | zu  | zulu | zuni | zurich |
|-------|-----|---------|-----|-----|------|-------|--------|---------|------------|--------|-----|------------|-----------|---------|------|--------|---------|-----|------|------|--------|
| 0     | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0  | 0.0   | 0.0    | 0.0     | 0.0        | 0.0    | ... | 0.0        | 0.0       | 0.0     | 0.0  | 0.0    | 0.0     | 0.0 | 0.0  | 0.0  | 0.0    |
| 1     | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0  | 0.0   | 0.0    | 0.0     | 0.0        | 0.0    | ... | 0.0        | 0.0       | 0.0     | 0.0  | 0.0    | 0.0     | 0.0 | 0.0  | 0.0  | 0.0    |
| 2     | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0  | 0.0   | 0.0    | 0.0     | 0.0        | 0.0    | ... | 0.0        | 0.0       | 0.0     | 0.0  | 0.0    | 0.0     | 0.0 | 0.0  | 0.0  | 0.0    |
| 3     | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0  | 0.0   | 0.0    | 0.0     | 0.0        | 0.0    | ... | 0.0        | 0.0       | 0.0     | 0.0  | 0.0    | 0.0     | 0.0 | 0.0  | 0.0  | 0.0    |
| 4     | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0  | 0.0   | 0.0    | 0.0     | 0.0        | 0.0    | ... | 0.0        | 0.0       | 0.0     | 0.0  | 0.0    | 0.0     | 0.0 | 0.0  | 0.0  | 0.0    |
| ...   | ... | ...     | ... | ... | ...  | ...   | ...    | ...     | ...        | ...    | ... | ...        | ...       | ...     | ...  | ...    | ...     | ... | ...  | ...  | ...    |
| 39995 | 0.0 | 0.03188 | 0.0 | 0.0 | 0.0  | 0.0   | 0.0    | 0.0     | 0.0        | 0.0    | ... | 0.0        | 0.0       | 0.0     | 0.0  | 0.0    | 0.0     | 0.0 | 0.0  | 0.0  | 0.0    |
| 39996 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0  | 0.0   | 0.0    | 0.0     | 0.0        | 0.0    | ... | 0.0        | 0.0       | 0.0     | 0.0  | 0.0    | 0.0     | 0.0 | 0.0  | 0.0  | 0.0    |
| 39997 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0  | 0.0   | 0.0    | 0.0     | 0.0        | 0.0    | ... | 0.0        | 0.0       | 0.0     | 0.0  | 0.0    | 0.0     | 0.0 | 0.0  | 0.0  | 0.0    |
| 39998 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0  | 0.0   | 0.0    | 0.0     | 0.0        | 0.0    | ... | 0.0        | 0.0       | 0.0     | 0.0  | 0.0    | 0.0     | 0.0 | 0.0  | 0.0  | 0.0    |
| 39999 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.0  | 0.0   | 0.0    | 0.0     | 0.0        | 0.0    | ... | 0.0        | 0.0       | 0.0     | 0.0  | 0.0    | 0.0     | 0.0 | 0.0  | 0.0  | 0.0    |

40000 rows × 39709 columns
Defining the dependent/independent variables and splitting into train and test sets
X = tfidf_matrix
y = df['label']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
SVC_classifier = SVC() # Use any classifier of your choice
SVC_classifier.fit(X_train, y_train)
SVC()
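A note on the model choice: `SVC` defaults to an RBF kernel, whose training time grows roughly quadratically with the number of samples, so 40,000 documents is near its practical limit. For high-dimensional sparse TF-IDF features, `LinearSVC` is a common, much faster alternative; a hedged sketch of the swap (not the model evaluated below):
# Optional alternative: a linear SVM, typically far faster to train on
# sparse TF-IDF features; all results below are for the RBF SVC above.
from sklearn.svm import LinearSVC

linear_clf = LinearSVC()
linear_clf.fit(X_train, y_train)
print("LinearSVC test accuracy:", linear_clf.score(X_test, y_test))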
Saving trained SVC model for future use
filename = "classification_model_SVC.pkl"
with open(filename, "wb") as file:
    pickle.dump(SVC_classifier, file)
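For completeness, a minimal inference sketch that reuses both saved artifacts on a hypothetical new review (file names match those pickled above; `custom_tokenizer` must be in scope when the vectorizer is unpickled):
# End-to-end inference with the saved vectorizer and model.
with open('fitted_tfidf_vectorizer.pkl', 'rb') as file:
    loaded_tfidf = pickle.load(file)
with open('classification_model_SVC.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

sample_review = ["An absolute masterpiece with stunning performances."]  # made-up example
features = loaded_tfidf.transform(sample_review)
print(loaded_model.predict(features))  # predicted label, 0 or 1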
Train Dataset
y_pred_train = SVC_classifier.predict(X_train)
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_train, y_pred_train)
report = classification_report(y_train, y_pred_train)
print("Accuracy for Train dataset:", accuracy)
print("Classification Report (Train datset):")
print(report)
Accuracy for Train dataset: 0.98846875
Classification Report (Train dataset):
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     16053
           1       0.99      0.99      0.99     15947

    accuracy                           0.99     32000
   macro avg       0.99      0.99      0.99     32000
weighted avg       0.99      0.99      0.99     32000
Test Dataset
y_pred = SVC_classifier.predict(X_test)
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(report)
Accuracy: 0.89375
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.89      0.89      3966
           1       0.89      0.90      0.90      4034

    accuracy                           0.89      8000
   macro avg       0.89      0.89      0.89      8000
weighted avg       0.89      0.89      0.89      8000
Validation Dataset
X_valid_data = df_valid['text']
y_valid_data = df_valid['label']
# Transform the validation text using the already-fitted TF-IDF vectorizer
valid_tfidf_matrix = tfidf.transform(X_valid_data)
valid_predictions = SVC_classifier.predict(valid_tfidf_matrix)
from sklearn.metrics import accuracy_score, classification_report
accuracy = accuracy_score(y_valid_data, valid_predictions)
report = classification_report(y_valid_data, valid_predictions)
print("Accuracy:", accuracy)
print("Classification Report:")
print(report)
Accuracy: 0.891
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      2486
           1       0.88      0.91      0.89      2514

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000
# Pipeline summary: tokenization -> lemmatization -> TF-IDF -> model