arfat-xyz committed on
Commit
047944d
1 Parent(s): 5b1594b

Create app.py

Files changed (1)
  1. app.py +397 -0
app.py ADDED
@@ -0,0 +1,397 @@
+ # example 1
+ from textwrap3 import wrap
+ import os
+ import torch
+ import random
+ import numpy as np
+ import string
+ import traceback
+ import nltk
+ nltk.download('punkt')
+ nltk.download('brown')
+ nltk.download('wordnet')
+ nltk.download('stopwords')
+ nltk.download('omw-1.4')
+ from nltk.corpus import wordnet as wn
+ from nltk.corpus import stopwords
+ from nltk.tokenize import sent_tokenize
+ import pke
+ from flashtext import KeywordProcessor
+ from similarity.normalized_levenshtein import NormalizedLevenshtein
+ from collections import OrderedDict
+ from sklearn.metrics.pairwise import cosine_similarity
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
+ from sentence_transformers import SentenceTransformer
+ from sense2vec import Sense2Vec
+ import gradio as gr
+
+ normalized_levenshtein = NormalizedLevenshtein()
+
+ # models are loaded once at startup and moved to GPU when available
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # question-generation model: T5 fine-tuned on SQuAD
+ question_model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_squad_v1')
+ question_tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_squad_v1')
+ question_model = question_model.to(device)
+
+ # download and unpack the sense2vec vectors (used later for distractor generation)
+ # if they are not already on disk
+ if not os.path.isdir('s2v_old'):
+     os.system('wget https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz')
+     os.system('tar -xvf s2v_reddit_2015_md.tar.gz')
+ s2v = Sense2Vec().from_disk('s2v_old')
+
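+ # NOTE: judging from the imports above, this script assumes roughly these packages are installed
+ # (versions are not pinned in this commit): torch, transformers, sentencepiece, textwrap3, nltk,
+ # pke, flashtext, sense2vec, sentence-transformers, scikit-learn, gradio, and the package that
+ # provides similarity.normalized_levenshtein (e.g. strsim).
+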
+ text = """Elon Musk has shown again he can influence the digital currency market with just his tweets. After saying that his electric vehicle-making company
+ Tesla will not accept payments in Bitcoin because of environmental concerns, he tweeted that he was working with developers of Dogecoin to improve
+ system transaction efficiency. Following the two distinct statements from him, the world's largest cryptocurrency hit a two-month low, while Dogecoin
+ rallied by about 20 percent. The SpaceX CEO has in recent months often tweeted in support of Dogecoin, but rarely for Bitcoin. In a recent tweet,
+ Musk put out a statement from Tesla that it was “concerned” about the rapidly increasing use of fossil fuels for Bitcoin (price in India) mining and
+ transaction, and hence was suspending vehicle purchases using the cryptocurrency. A day later he again tweeted saying, “To be clear, I strongly
+ believe in crypto, but it can't drive a massive increase in fossil fuel use, especially coal”. It triggered a downward spiral for Bitcoin value but
+ the cryptocurrency has stabilised since. A number of Twitter users welcomed Musk's statement. One of them said it's time people started realising
+ that Dogecoin “is here to stay” and another referred to Musk's previous assertion that crypto could become the world's future currency."""
+
+ for wrp in wrap(text, 150):
+     print(wrp)
+ print("\n")
+
+
+ # summarization with T5
+ summary_model = T5ForConditionalGeneration.from_pretrained('t5-base')
+ summary_tokenizer = T5Tokenizer.from_pretrained('t5-base')
+ summary_model = summary_model.to(device)
+
+
+ def set_seed(seed: int):
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)
+
+
+ set_seed(42)
+
+
+ def postprocesstext(content):
+     # capitalize each sentence of the generated summary
+     final = ""
+     for sent in sent_tokenize(content):
+         sent = sent.capitalize()
+         final = final + " " + sent
+     return final
+
+
+ def summarizer(text, model, tokenizer):
+     text = text.strip().replace("\n", " ")
+     text = "summarize: " + text
+     max_len = 512
+     encoding = tokenizer.encode_plus(text, max_length=max_len, pad_to_max_length=False, truncation=True, return_tensors="pt").to(device)
+
+     input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
+
+     outs = model.generate(input_ids=input_ids,
+                           attention_mask=attention_mask,
+                           early_stopping=True,
+                           num_beams=3,
+                           num_return_sequences=1,
+                           no_repeat_ngram_size=2,
+                           min_length=75,
+                           max_length=300)
+
+     dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
+     summary = dec[0]
+     summary = postprocesstext(summary)
+     summary = summary.strip()
+
+     return summary
+
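+ # NOTE: a short gloss of the generate() arguments used above: the "summarize:" prefix is the T5
+ # summarization task prompt, num_beams=3 runs beam search with 3 beams, no_repeat_ngram_size=2
+ # blocks any bigram from repeating in the output, min_length/max_length bound the summary to
+ # roughly 75-300 tokens, and early_stopping ends a beam once it emits the end-of-sequence token.
+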
+ summarized_text = summarizer(text, summary_model, summary_tokenizer)
+
+ print("\nOriginal Text >>")
+ for wrp in wrap(text, 150):
+     print(wrp)
+ print("\n")
+ print("Summarized Text >>")
+ for wrp in wrap(summarized_text, 150):
+     print(wrp)
+ print("\n")
+
+
+ # answer span extraction
+
+ def get_nouns_multipartite(content):
+     out = []
+     try:
+         extractor = pke.unsupervised.MultipartiteRank()
+         extractor.load_document(input=content, language='en')
+         # keep only proper nouns and nouns as keyphrase candidates; candidates should not
+         # contain punctuation marks or stopwords
+         pos = {'PROPN', 'NOUN'}
+         stoplist = list(string.punctuation)
+         stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
+         stoplist += stopwords.words('english')
+         # extractor.candidate_selection(pos=pos, stoplist=stoplist)
+         extractor.candidate_selection(pos=pos)
+         # build the Multipartite graph and rank candidates using a random walk;
+         # alpha controls the weight adjustment mechanism, see TopicRank for the
+         # threshold/method parameters
+         extractor.candidate_weighting(alpha=1.1,
+                                       threshold=0.75,
+                                       method='average')
+         keyphrases = extractor.get_n_best(n=15)
+
+         for val in keyphrases:
+             out.append(val[0])
+     except Exception:
+         out = []
+         traceback.print_exc()
+
+     return out
+
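+ # NOTE: MultipartiteRank builds a graph over the candidate phrases (restricted here to
+ # PROPN/NOUN sequences), weights edges by topical relatedness, and ranks candidates with a
+ # random walk; get_n_best(n=15) returns (phrase, score) pairs, which is why only val[0] is kept.
+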
+ def get_keywords(originaltext, summarytext):
+     keywords = get_nouns_multipartite(originaltext)
+     print("keywords unsummarized: ", keywords)
+     keyword_processor = KeywordProcessor()
+     for keyword in keywords:
+         keyword_processor.add_keyword(keyword)
+
+     keywords_found = keyword_processor.extract_keywords(summarytext)
+     keywords_found = list(set(keywords_found))
+     print("keywords_found in summarized: ", keywords_found)
+
+     important_keywords = []
+     for keyword in keywords:
+         if keyword in keywords_found:
+             important_keywords.append(keyword)
+
+     return important_keywords[:10]
+
+
+ imp_keywords = get_keywords(text, summarized_text)
+ print(imp_keywords)
+
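+ # NOTE: get_keywords() keeps only the document-level keyphrases that also appear in the summary:
+ # flashtext's KeywordProcessor does fast phrase matching against the summary text, so a keyword
+ # survives only if the summarizer preserved it; at most the first ten survivors are returned.
+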
+ def get_question(context, answer, model, tokenizer):
+     text = "context: {} answer: {}".format(context, answer)
+     encoding = tokenizer.encode_plus(text, max_length=384, pad_to_max_length=False, truncation=True, return_tensors="pt").to(device)
+     input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
+
+     outs = model.generate(input_ids=input_ids,
+                           attention_mask=attention_mask,
+                           early_stopping=True,
+                           num_beams=5,
+                           num_return_sequences=1,
+                           no_repeat_ngram_size=2,
+                           max_length=72)
+
+     dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
+
+     Question = dec[0].replace("question:", "")
+     Question = Question.strip()
+     return Question
+
+
+ for wrp in wrap(summarized_text, 150):
+     print(wrp)
+ print("\n")
+
+ for answer in imp_keywords:
+     ques = get_question(summarized_text, answer, question_model, question_tokenizer)
+     print(ques)
+     print(answer.capitalize())
+     print("\n")
+
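+ # NOTE: the ramsrigouthamg/t5_squad_v1 checkpoint expects prompts of the form
+ # "context: <passage> answer: <answer span>" and generates a question whose answer is that span,
+ # which is exactly how get_question() formats its input above.
+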
+ # filter keywords: sentence-transformers model used to embed the question sentence and the
+ # candidate distractors (an alternative checkpoint would be paraphrase-distilroberta-base-v1)
+ sentence_transformer_model = SentenceTransformer('msmarco-distilbert-base-v3')
+
+
+ def filter_same_sense_words(original, wordlist):
+     filtered_words = []
+     base_sense = original.split('|')[1]
+     print(base_sense)
+     for eachword in wordlist:
+         if eachword[0].split('|')[1] == base_sense:
+             filtered_words.append(eachword[0].split('|')[0].replace("_", " ").title().strip())
+     return filtered_words
+
+
+ def get_highest_similarity_score(wordlist, wrd):
+     score = []
+     for each in wordlist:
+         score.append(normalized_levenshtein.similarity(each.lower(), wrd.lower()))
+     return max(score)
+
+
+ def sense2vec_get_words(word, s2v, topn, question):
+     output = []
+     print("word ", word)
+     try:
+         sense = s2v.get_best_sense(word, senses=["NOUN", "PERSON", "PRODUCT", "LOC", "ORG", "EVENT", "NORP", "WORK OF ART", "FAC", "GPE", "NUM", "FACILITY"])
+         most_similar = s2v.most_similar(sense, n=topn)
+         output = filter_same_sense_words(sense, most_similar)
+         print("Similar ", output)
+     except Exception:
+         output = []
+
+     threshold = 0.6
+     final = [word]
+     checklist = question.split()
+     for x in output:
+         if get_highest_similarity_score(final, x) < threshold and x not in final and x not in checklist:
+             final.append(x)
+
+     return final[1:]
+
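+ # NOTE: sense2vec keys look like "bitcoin|NOUN"; filter_same_sense_words() keeps only neighbours
+ # whose sense tag (the part after "|") matches the query word's sense, and sense2vec_get_words()
+ # then drops candidates that are too close to ones already kept (normalized Levenshtein
+ # similarity >= 0.6) or that already occur as words of the question text.
+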
+ def mmr(doc_embedding, word_embeddings, words, top_n, lambda_param):
+
+     # Extract similarity within words, and between words and the document
+     word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
+     word_similarity = cosine_similarity(word_embeddings)
+
+     # Initialize candidates and already choose the best keyword/keyphrase
+     keywords_idx = [np.argmax(word_doc_similarity)]
+     candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]
+
+     for _ in range(top_n - 1):
+         # Extract similarities within candidates and
+         # between candidates and selected keywords/phrases
+         candidate_similarities = word_doc_similarity[candidates_idx, :]
+         target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)
+
+         # Calculate MMR
+         mmr = (lambda_param) * candidate_similarities - (1 - lambda_param) * target_similarities.reshape(-1, 1)
+         mmr_idx = candidates_idx[np.argmax(mmr)]
+
+         # Update keywords & candidates
+         keywords_idx.append(mmr_idx)
+         candidates_idx.remove(mmr_idx)
+
+     return [words[idx] for idx in keywords_idx]
+
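+ # NOTE: the selection rule above is Maximal Marginal Relevance:
+ #     MMR(c) = lambda_param * sim(c, doc) - (1 - lambda_param) * max_{k in selected} sim(c, k)
+ # so a higher lambda_param favours relevance to the document/answer, while a lower value favours
+ # diversity among the chosen distractors.
+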
+ def get_distractors_wordnet(word):
+     distractors = []
+     try:
+         syn = wn.synsets(word, 'n')[0]
+
+         word = word.lower()
+         orig_word = word
+         if len(word.split()) > 0:
+             word = word.replace(" ", "_")
+         hypernym = syn.hypernyms()
+         if len(hypernym) == 0:
+             return distractors
+         for item in hypernym[0].hyponyms():
+             name = item.lemmas()[0].name()
+             if name == orig_word:
+                 continue
+             name = name.replace("_", " ")
+             name = " ".join(w.capitalize() for w in name.split())
+             if name is not None and name not in distractors:
+                 distractors.append(name)
+     except Exception:
+         print("Wordnet distractors not found")
+     return distractors
+
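+ # NOTE: get_distractors_wordnet() walks one level up to the word's first noun synset's hypernym
+ # and returns the hypernym's other hyponyms as distractors, much like "wolf" and "fox" are
+ # siblings of "dog" under "canine" (an illustrative example, assuming those WordNet entries).
+ # It returns an empty list when no synset or hypernym is found.
+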
+ def get_distractors(word, origsentence, sense2vecmodel, sentencemodel, top_n, lambdaval):
+     distractors = sense2vec_get_words(word, sense2vecmodel, top_n, origsentence)
+     print("distractors ", distractors)
+     if len(distractors) == 0:
+         return distractors
+     distractors_new = [word.capitalize()]
+     distractors_new.extend(distractors)
+
+     embedding_sentence = origsentence + " " + word.capitalize()
+     keyword_embedding = sentencemodel.encode([embedding_sentence])
+     distractor_embeddings = sentencemodel.encode(distractors_new)
+
+     max_keywords = min(len(distractors_new), 5)
+     filtered_keywords = mmr(keyword_embedding, distractor_embeddings, distractors_new, max_keywords, lambdaval)
+     final = [word.capitalize()]
+     for wrd in filtered_keywords:
+         if wrd.lower() != word.lower():
+             final.append(wrd.capitalize())
+     final = final[1:]
+     return final
+
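+ # NOTE: get_distractors() first pulls sense2vec neighbours of the answer, embeds the question
+ # sentence plus the answer with the sentence-transformers model, reranks the candidates with
+ # mmr(), and finally drops the answer itself from the reranked list (the UI later shows at most
+ # four distractors per question).
+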
+ sent = "What cryptocurrency did Musk rarely tweet about?"
+ keyword = "Bitcoin"
+
+ # other question/answer pairs to try:
+ # sent = "What did Musk say he was working with to improve system transaction efficiency?"
+ # keyword = "Dogecoin"
+
+ # sent = "What company did Musk say would not accept bitcoin payments?"
+ # keyword = "Tesla"
+
+ # sent = "What has Musk often tweeted in support of?"
+ # keyword = "Cryptocurrency"
+
+ print(get_distractors(keyword, sent, s2v, sentence_transformer_model, 40, 0.2))
+
+
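+ # NOTE: gr.inputs.Textbox, gr.outputs.HTML and gr.inputs.Radio below come from the legacy Gradio
+ # input/output namespaces (Gradio 2.x / early 3.x); on current Gradio releases the equivalents
+ # would be gr.Textbox, gr.HTML and gr.Radio passed directly to gr.Interface.
+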
+ context = gr.inputs.Textbox(lines=10, placeholder="Enter paragraph/content here...")
+ output = gr.outputs.HTML(label="Question and Answers")
+ radiobutton = gr.inputs.Radio(["Wordnet", "Sense2Vec"])
+
+
+ def generate_question(context, radiobutton):
+     summary_text = summarizer(context, summary_model, summary_tokenizer)
+     for wrp in wrap(summary_text, 100):
+         print(wrp)
+     # answer keywords extracted from the full context and filtered against the summary
+     noun_phrases = get_keywords(context, summary_text)
+     print("\n\nNoun phrases", noun_phrases)
+     output = ""
+     for answer in noun_phrases:
+         ques = get_question(summary_text, answer, question_model, question_tokenizer)
+         if radiobutton == "Wordnet":
+             distractors = get_distractors_wordnet(answer)
+         else:
+             distractors = get_distractors(answer.capitalize(), ques, s2v, sentence_transformer_model, 40, 0.2)
+         output = output + "<b style='color:blue;'>" + ques + "</b>"
+         output = output + "<br>"
+         output = output + "<b style='color:green;'>" + "Ans: " + answer.capitalize() + "</b>" + "<br>"
+         if len(distractors) > 0:
+             for distractor in distractors[:4]:
+                 output = output + "<b style='color:brown;'>" + distractor + "</b>" + "<br>"
+         output = output + "<br>"
+
+     summary = "Summary: " + summary_text
+     for answer in noun_phrases:
+         summary = summary.replace(answer, "<b>" + answer + "</b>" + "<br>")
+         summary = summary.replace(answer.capitalize(), "<b>" + answer.capitalize() + "</b>")
+     output = output + "<p>" + summary + "</p>"
+     output = output + "<br>"
+     return output
+
+
+ iface = gr.Interface(
+     fn=generate_question,
+     inputs=[context, radiobutton],
+     outputs=output)
+ iface.launch(debug=True)
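+ # NOTE: running "python app.py" downloads the models and sense2vec vectors on first start and
+ # then serves the Gradio demo (by default on port 7860); debug=True keeps the server in the
+ # foreground and prints errors to the console.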