import numpy as np
import nltk
import string
import random
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings("ignore")
nltk.download("punkt")
nltk.download("stopwords")
nltk.download('wordnet')
# Open the .txt file using a context manager
with open("Corpus.txt", "r") as file:
Corpus = file.read()
# print(content)
The string.punctuation attribute contains all ASCII punctuation characters. The str.maketrans() method creates a translation table that maps each punctuation character to None, and the translate() method applies that table to the text, removing the punctuation.
Tokenization is done outside the process function below so that word_tokens and sent_tokens can later be extended with the user's input.
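As a minimal, self-contained sketch of the translation-table approach (the sample string here is hypothetical):

import string

# Keep commas and exclamation marks; remove every other punctuation character
keep = ",!"
to_remove = ''.join(ch for ch in string.punctuation if ch not in keep)
table = str.maketrans('', '', to_remove)
print("Hello, world! (a 'test')".translate(table))  # Hello, world! a test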
# Define the punctuation marks to retain
punctuation_to_retain = ",!"
# Build the string of punctuation characters to remove (everything except the retained ones)
punctuation_to_remove = ''.join(ch for ch in string.punctuation if ch not in punctuation_to_retain)
# Create a translation table that maps each of those characters to None
translator = str.maketrans('', '', punctuation_to_remove)
# Remove punctuation marks except the ones to retain
text_wo_punc = Corpus.translate(translator)
word_tokens = nltk.word_tokenize(text_wo_punc)
sent_tokens = nltk.sent_tokenize(Corpus)
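To illustrate what the two token lists look like, a quick check on a hypothetical sample (the real lists come from Corpus.txt):

sample = "Data science is interdisciplinary. It uses statistics, and more!"
print(nltk.word_tokenize(sample))
# ['Data', 'science', 'is', 'interdisciplinary', '.', 'It', 'uses', 'statistics', ',', 'and', 'more', '!']
print(nltk.sent_tokenize(sample))
# ['Data science is interdisciplinary.', 'It uses statistics, and more!']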
# Function to remove the stop words and perform lemmatization
def process(text):
    # Lowercasing, stop-word removal, tokenizing; the input is tokenized here so this
    # function can also serve as the tokenizer for TfidfVectorizer below
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    corp = [word.lower() for word in tokens if word.lower() not in stop_words]
    # Lemmatize the text
    lemmatizer = WordNetLemmatizer()
    lem_corp = [lemmatizer.lemmatize(word) for word in corp]
    return lem_corp
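For example, on a hypothetical sentence:

print(process("The cats were running over the bridges"))
# ['cat', 'running', 'bridge'] -- stop words dropped, plurals lemmatized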
# Function to greet our user with Hi, Hello
Input_greet = ["hi", "hello", "sup", "what's up", "greetings", "hey", "hi,", "hello,", "sup,", "what's up,", "greetings!", "hey,"]
output_greet = ['Hi', "Hey there!", "I am glad you are talking to me", "Hey, I am here to help"]
def greet(sentence):
    # Return a random canned reply if any word in the sentence is a greeting, else None
    for word in sentence.split():
        if word.lower() in Input_greet:
            return random.choice(output_greet)
    return None
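A quick check with hypothetical inputs:

print(greet("Hello, bot"))     # a random entry from output_greet, e.g. 'Hey there!'
print(greet("weather today"))  # None -- no greeting word found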
# Function to generate the response: run TF-IDF over the sentences and return the closest match
from sklearn.metrics.pairwise import cosine_similarity
n_not_und = 0  # Counts the number of attempts where our bot does not know the answer
def response(user_response):
    global n_not_und
    robo1_response = ''
    TfidfVec = TfidfVectorizer(tokenizer=process, stop_words='english')
    tfidf = TfidfVec.fit_transform(sent_tokens)
    vals = cosine_similarity(tfidf[-1], tfidf)  # Similarity of the last entry (the user's input) w.r.t. all others
    # vals.argsort()[0] gives the sentence indices sorted by similarity, ascending; the highest
    # ([-1]) is the user's input itself, so [-2] is the most similar corpus sentence
    idx = vals.argsort()[0][-2]
    req_tfidf = np.sort(vals[0])[-2]  # The second-highest similarity value itself
    if req_tfidf == 0:
        robo1_response = robo1_response + "I am sorry! I don't understand you"
        n_not_und = n_not_und + 1
        return robo1_response
    else:
        robo1_response = robo1_response + sent_tokens[idx]
        return robo1_response
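To see why [-2] picks the best match, here is a small standalone illustration with toy sentences (not from the corpus):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
docs = ["data science uses statistics",
        "machine learning is a field",
        "statistics and data"]  # pretend the last entry is the user's input
tfidf = TfidfVectorizer().fit_transform(docs)
vals = cosine_similarity(tfidf[-1], tfidf)  # shape (1, 3)
order = vals.argsort()[0]  # indices from least to most similar: [1 0 2]
print(order[-1])  # 2 -- the input itself (similarity 1.0)
print(order[-2])  # 0 -- the corpus sentence most similar to the input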
# Main loop to interact with the user
Status = True
print("Hello, I am Bot Vaibhav Malik. How can I help you? If you want to quit, just type Bye or Thank you.")
while Status:
    user_response = input().strip().lower()
    if user_response in ['bye', 'thanks', 'thank you']:
        Status = False
        print("Vaibhav: Thank you for contacting me. Please contact again.")
    else:
        if user_response in Input_greet:
            print("Vaibhav: ", greet(user_response))
        else:
            sent_tokens.append(user_response)
            word_tokens = word_tokens + nltk.word_tokenize(user_response)
            print("Vaibhav: ", response(user_response))
    if n_not_und >= 3:
        print("Vaibhav: Sorry, I am leaving. I am unable to understand your query.")
        Status = False
Hello, I am Bot Vaibhav Malik. How can I help you? If you want to quit, just type Bye or Thank you.
Vaibhav:  Hi
Vaibhav:  Both fields play vital roles in leveraging the power of data to understand patterns, make informed decisions, and solve complex problems across various domains.
Vaibhav:  [6][26] Modern usage In 2012, technologists Thomas H. Davenport and DJ Patil declared "Data Scientist: The Sexiest Job of the 21st Century",[27] a catchphrase that was picked up even by major-city newspapers like the New York Times[28] and the Boston Globe.
Vaibhav:  In contrast, data science deals with quantitative and qualitative data (e.g., from images, text, sensors, transactions, customer information, etc.)
Vaibhav:  Thank you for contacting me. Please contact again.
Source: vaibhavmalik05/Basic-Chatbot