import numpy as np
import nltk
import string
import random
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")
nltk.download("punkt")
nltk.download("stopwords")
nltk.download('wordnet')
# Open the .txt file using a context manager
with open("Corpus.txt", "r") as file:
Corpus = file.read()
# print(content)
The string.punctuation attribute contains all ASCII punctuation characters. The str.maketrans() method creates a translation table that maps each punctuation character to None, and the translate() method applies that table to the text, removing the punctuation.
Tokenization is done outside the process function below so that word_tokens and sent_tokens can later be extended with the user's input.
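As a minimal, self-contained sketch of the translation-table approach (the sample string here is hypothetical):

import string

# Keep commas and exclamation marks; remove every other punctuation character
keep = ",!"
to_remove = ''.join(ch for ch in string.punctuation if ch not in keep)
table = str.maketrans('', '', to_remove)
print("Hello, world! (a 'test')".translate(table))  # Hello, world! a test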
# Define the punctuation marks to retain
punctuation_to_retain = ",!"
# Build the string of punctuation characters to remove (everything except the retained ones)
punctuation_to_remove = ''.join(ch for ch in string.punctuation if ch not in punctuation_to_retain)
# Create a translation table that maps each of those characters to None
translator = str.maketrans('', '', punctuation_to_remove)
# Remove punctuation marks except the ones to retain
text_wo_punc = Corpus.translate(translator)
word_tokens = nltk.word_tokenize(text_wo_punc)
sent_tokens = nltk.sent_tokenize(Corpus)
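To illustrate what the two token lists look like, a quick check on a hypothetical sample (the real lists come from Corpus.txt):

sample = "Data science is interdisciplinary. It uses statistics, and more!"
print(nltk.word_tokenize(sample))
# ['Data', 'science', 'is', 'interdisciplinary', '.', 'It', 'uses', 'statistics', ',', 'and', 'more', '!']
print(nltk.sent_tokenize(sample))
# ['Data science is interdisciplinary.', 'It uses statistics, and more!']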
# Function to remove the stop words and perform lemmatization
def process(text):
    # Lowercasing, stop-word removal, tokenizing; the input is tokenized here so this
    # function can also serve as the tokenizer for TfidfVectorizer below
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    corp = [word.lower() for word in tokens if word.lower() not in stop_words]
    # Lemmatize the text
    lemmatizer = WordNetLemmatizer()
    lem_corp = [lemmatizer.lemmatize(word) for word in corp]
    return lem_corp
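For example, on a hypothetical sentence:

print(process("The cats were running over the bridges"))
# ['cat', 'running', 'bridge'] -- stop words dropped, plurals lemmatized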
# Function to greet our user with Hi, Hello
Input_greet = ["hi", "hello", "sup", "what's up", "greetings", "hey", "hi,", "hello,", "sup,", "what's up,", "greetings!", "hey,"]
output_greet = ['Hi', "Hey there!", "I am glad you are talking to me", "Hey, I am here to help"]
def greet(sentence):
    # Return a random canned reply if any word in the sentence is a greeting, else None
    for word in sentence.split():
        if word.lower() in Input_greet:
            return random.choice(output_greet)
    return None
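A quick check with hypothetical inputs:

print(greet("Hello, bot"))     # a random entry from output_greet, e.g. 'Hey there!'
print(greet("weather today"))  # None -- no greeting word found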
# Function to generate the response: run TF-IDF over the sentences and return the closest match
from sklearn.metrics.pairwise import cosine_similarity
n_not_und = 0  # Counts the number of attempts where our bot does not know the answer
def response(user_response):
    global n_not_und
    robo1_response = ''
    TfidfVec = TfidfVectorizer(tokenizer=process, stop_words='english')
    tfidf = TfidfVec.fit_transform(sent_tokens)
    vals = cosine_similarity(tfidf[-1], tfidf)  # Similarity of the last entry (the user's input) w.r.t. all others
    # vals.argsort()[0] gives the sentence indices sorted by similarity, ascending; the highest
    # ([-1]) is the user's input itself, so [-2] is the most similar corpus sentence
    idx = vals.argsort()[0][-2]
    req_tfidf = np.sort(vals[0])[-2]  # The second-highest similarity value itself
    if req_tfidf == 0:
        robo1_response = robo1_response + "I am sorry! I don't understand you"
        n_not_und = n_not_und + 1
        return robo1_response
    else:
        robo1_response = robo1_response + sent_tokens[idx]
        return robo1_response
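To see why [-2] picks the best match, here is a small standalone illustration with toy sentences (not from the corpus):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
docs = ["data science uses statistics",
        "machine learning is a field",
        "statistics and data"]  # pretend the last entry is the user's input
tfidf = TfidfVectorizer().fit_transform(docs)
vals = cosine_similarity(tfidf[-1], tfidf)  # shape (1, 3)
order = vals.argsort()[0]  # indices from least to most similar: [1 0 2]
print(order[-1])  # 2 -- the input itself (similarity 1.0)
print(order[-2])  # 0 -- the corpus sentence most similar to the input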
# Main loop to interact with the user
Status = True
print("Hello, I am Bot Vaibhav Malik. How can I help you? If you want to quit, just type Bye or Thank you.")
while Status:
    user_response = input().strip().lower()
    if user_response in ['bye', 'thanks', 'thank you']:
        Status = False
        print("Vaibhav: Thank you for contacting me. Please contact again.")
    else:
        if user_response in Input_greet:
            print("Vaibhav: ", greet(user_response))
        else:
            sent_tokens.append(user_response)
            word_tokens = word_tokens + nltk.word_tokenize(user_response)
            print("Vaibhav: ", response(user_response))
    if n_not_und >= 3:
        print("Vaibhav: Sorry, I am leaving. I am unable to understand your query.")
        Status = False
Hello, I am Bot Vaibhav Malik. How can I help you? If you want to quit, just type Bye or Thank you.
Vaibhav:  Hi
Vaibhav:  Both fields play vital roles in leveraging the power of data to understand patterns, make informed decisions, and solve complex problems across various domains.
Vaibhav:  [6][26] Modern usage In 2012, technologists Thomas H. Davenport and DJ Patil declared "Data Scientist: The Sexiest Job of the 21st Century",[27] a catchphrase that was picked up even by major-city newspapers like the New York Times[28] and the Boston Globe.
Vaibhav:  In contrast, data science deals with quantitative and qualitative data (e.g., from images, text, sensors, transactions, customer information, etc.)
Vaibhav:  Thank you for contacting me. Please contact again.
Source: vaibhavmalik05/Basic-Chatbot