# example 1: automatic MCQ generation
# Pipeline: summarize a passage with T5, extract keywords, generate questions with a
# SQuAD-finetuned T5, and build wrong-answer options (distractors) with WordNet or
# sense2vec. A Gradio UI ties it together at the end.

import patoolib
import wget
from textwrap3 import wrap

import torch
import random
import numpy as np

import nltk
nltk.download('punkt')
nltk.download('brown')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

import string
import pke
import traceback
from flashtext import KeywordProcessor
from similarity.normalized_levenshtein import NormalizedLevenshtein
normalized_levenshtein = NormalizedLevenshtein()

from collections import OrderedDict
from sklearn.metrics.pairwise import cosine_similarity

import gradio as gr
from transformers import T5ForConditionalGeneration, T5Tokenizer
from sentence_transformers import SentenceTransformer
from sense2vec import Sense2Vec

# Summarization model (t5-base) and question-generation model
# (ramsrigouthamg/t5_squad_v1, fine-tuned on SQuAD).
summary_model = T5ForConditionalGeneration.from_pretrained('t5-base')
summary_tokenizer = T5Tokenizer.from_pretrained('t5-base')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
summary_model = summary_model.to(device)

question_model = T5ForConditionalGeneration.from_pretrained('ramsrigouthamg/t5_squad_v1')
question_tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_squad_v1')
question_model = question_model.to(device)

# Download and unpack the sense2vec vectors used later to filter keywords and
# generate distractors.
wget.download('https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz')
# tar -xvf s2v_reddit_2015_md.tar.gz
patoolib.extract_archive("s2v_reddit_2015_md.tar.gz", outdir=".")
s2v = Sense2Vec().from_disk('s2v_old')

# Sample passage to generate questions from.
text = """Elon Musk has shown again he can influence the digital currency market with just his tweets. After saying that his electric vehicle-making company Tesla will not accept payments in Bitcoin because of environmental concerns, he tweeted that he was working with developers of Dogecoin to improve system transaction efficiency. Following the two distinct statements from him, the world's largest cryptocurrency hit a two-month low, while Dogecoin rallied by about 20 percent. The SpaceX CEO has in recent months often tweeted in support of Dogecoin, but rarely for Bitcoin. In a recent tweet, Musk put out a statement from Tesla that it was “concerned” about the rapidly increasing use of fossil fuels for Bitcoin (price in India) mining and transaction, and hence was suspending vehicle purchases using the cryptocurrency. A day later he again tweeted saying, “To be clear, I strongly believe in crypto, but it can't drive a massive increase in fossil fuel use, especially coal”. It triggered a downward spiral for Bitcoin value but the cryptocurrency has stabilised since. A number of Twitter users welcomed Musk's statement.
One of them said it's time people started realising that Dogecoin “is here to stay” and another referred to Musk's previous assertion that crypto could become the world's future currency."""

for wrp in wrap(text, 150):
    print(wrp)
print("\n")

# summarization with T5
def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

def postprocesstext(content):
    # Capitalize the first letter of every sentence in the generated summary.
    final = ""
    for sent in sent_tokenize(content):
        sent = sent.capitalize()
        final = final + " " + sent
    return final

def summarizer(text, model, tokenizer):
    text = text.strip().replace("\n", " ")
    text = "summarize: " + text
    # print (text)
    max_len = 512
    encoding = tokenizer.encode_plus(text, max_length=max_len, pad_to_max_length=False,
                                     truncation=True, return_tensors="pt").to(device)
    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
    outs = model.generate(input_ids=input_ids,
                          attention_mask=attention_mask,
                          early_stopping=True,
                          num_beams=3,
                          num_return_sequences=1,
                          no_repeat_ngram_size=2,
                          min_length=75,
                          max_length=300)
    dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
    summary = dec[0]
    summary = postprocesstext(summary)
    summary = summary.strip()
    return summary

summarized_text = summarizer(text, summary_model, summary_tokenizer)

print("\noriginal Text >>")
for wrp in wrap(text, 150):
    print(wrp)
print("\n")
print("Summarized Text >>")
for wrp in wrap(summarized_text, 150):
    print(wrp)
print("\n")

# answer span extraction
def get_nouns_multipartite(content):
    out = []
    try:
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=content, language='en')
        # candidates must not contain punctuation marks or stopwords.
        pos = {'PROPN', 'NOUN'}
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += stopwords.words('english')
        # extractor.candidate_selection(pos=pos, stoplist=stoplist)
        extractor.candidate_selection(pos=pos)
        # build the Multipartite graph and rank candidates using random walk;
        # alpha controls the weight adjustment mechanism, see TopicRank for
        # threshold/method parameters.
        extractor.candidate_weighting(alpha=1.1, threshold=0.75, method='average')
        keyphrases = extractor.get_n_best(n=15)
        for val in keyphrases:
            out.append(val[0])
    except:
        out = []
        traceback.print_exc()
    return out

def get_keywords(originaltext, summarytext):
    # Keep only the keywords from the full text that also appear in the summary.
    keywords = get_nouns_multipartite(originaltext)
    print("keywords unsummarized: ", keywords)
    keyword_processor = KeywordProcessor()
    for keyword in keywords:
        keyword_processor.add_keyword(keyword)
    keywords_found = keyword_processor.extract_keywords(summarytext)
    keywords_found = list(set(keywords_found))
    print("keywords_found in summarized: ", keywords_found)
    important_keywords = []
    for keyword in keywords:
        if keyword in keywords_found:
            important_keywords.append(keyword)
    return important_keywords[:10]

imp_keywords = get_keywords(text, summarized_text)
print(imp_keywords)

def get_question(context, answer, model, tokenizer):
    text = "context: {} answer: {}".format(context, answer)
    encoding = tokenizer.encode_plus(text, max_length=384, pad_to_max_length=False,
                                     truncation=True, return_tensors="pt").to(device)
    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
    outs = model.generate(input_ids=input_ids,
                          attention_mask=attention_mask,
                          early_stopping=True,
                          num_beams=5,
                          num_return_sequences=1,
                          no_repeat_ngram_size=2,
                          max_length=72)
    dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
    Question = dec[0].replace("question:", "")
    Question = Question.strip()
    return Question

for wrp in wrap(summarized_text, 150):
    print(wrp)
print("\n")

for answer in imp_keywords:
    ques = get_question(summarized_text, answer, question_model, question_tokenizer)
    print(ques)
    print(answer.capitalize())
    print("\n")

# filter keywords
# paraphrase-distilroberta-base-v1
sentence_transformer_model = SentenceTransformer('msmarco-distilbert-base-v3')

def filter_same_sense_words(original, wordlist):
    # Keep only candidates whose sense2vec sense tag matches the original word's sense.
    filtered_words = []
    base_sense = original.split('|')[1]
    print(base_sense)
    for eachword in wordlist:
        if eachword[0].split('|')[1] == base_sense:
            filtered_words.append(eachword[0].split('|')[0].replace("_", " ").title().strip())
    return filtered_words

def get_highest_similarity_score(wordlist, wrd):
    score = []
    for each in wordlist:
        score.append(normalized_levenshtein.similarity(each.lower(), wrd.lower()))
    return max(score)

def sense2vec_get_words(word, s2v, topn, question):
    output = []
    print("word ", word)
    try:
        sense = s2v.get_best_sense(word, senses=["NOUN", "PERSON", "PRODUCT", "LOC", "ORG",
                                                 "EVENT", "NORP", "WORK OF ART", "FAC",
                                                 "GPE", "NUM", "FACILITY"])
        most_similar = s2v.most_similar(sense, n=topn)
        # print (most_similar)
        output = filter_same_sense_words(sense, most_similar)
        print("Similar ", output)
    except:
        output = []

    # Keep candidates that are not near-duplicates of what was already kept and
    # that do not already appear in the question.
    threshold = 0.6
    final = [word]
    checklist = question.split()
    for x in output:
        if get_highest_similarity_score(final, x) < threshold and x not in final and x not in checklist:
            final.append(x)
    return final[1:]

def get_distractors_wordnet(word):
    # Distractors from WordNet: siblings of the word under its first noun hypernym.
    distractors = []
    try:
        syn = wn.synsets(word, 'n')[0]
        word = word.lower()
        orig_word = word
        if len(word.split()) > 0:
            word = word.replace(" ", "_")
        hypernym = syn.hypernyms()
        if len(hypernym) == 0:
            return distractors
        for item in hypernym[0].hyponyms():
            name = item.lemmas()[0].name()
            # print ("name ",name, " word",orig_word)
            if name == orig_word:
                continue
            name = name.replace("_", " ")
            name = " ".join(w.capitalize() for w in name.split())
            if name is not None and name not in distractors:
                distractors.append(name)
    except:
        print("Wordnet distractors not found")
    return distractors
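# The get_distractors() function below calls mmr(), which is not defined anywhere in
# this listing. The sketch below is a minimal Maximal Marginal Relevance (MMR)
# implementation written to match how it is called here:
# mmr(doc_embedding, word_embeddings, words, top_n, lambda_param) -> selected words.
# Treat it as an assumed helper, not the original author's exact code.
def mmr(doc_embedding, word_embeddings, words, top_n, lambda_param):
    # Relevance of each candidate to the document, and similarity between candidates.
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Start from the candidate most relevant to the document.
    keywords_idx = [int(np.argmax(word_doc_similarity))]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    for _ in range(min(top_n, len(words)) - 1):
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # Trade off relevance against redundancy with the candidates already picked.
        mmr_scores = (lambda_param * candidate_similarities
                      - (1 - lambda_param) * target_similarities.reshape(-1, 1))
        mmr_idx = candidates_idx[int(np.argmax(mmr_scores))]

        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]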
def get_distractors(word, origsentence, sense2vecmodel, sentencemodel, top_n, lambdaval):
    # Candidate distractors from sense2vec, re-ranked with MMR so the kept options
    # stay relevant to the sentence without being redundant with each other.
    distractors = sense2vec_get_words(word, sense2vecmodel, top_n, origsentence)
    print("distractors ", distractors)
    if len(distractors) == 0:
        return distractors
    distractors_new = [word.capitalize()]
    distractors_new.extend(distractors)
    # print ("distractors_new .. ",distractors_new)

    embedding_sentence = origsentence + " " + word.capitalize()
    # embedding_sentence = word
    keyword_embedding = sentencemodel.encode([embedding_sentence])
    distractor_embeddings = sentencemodel.encode(distractors_new)

    # filtered_keywords = mmr(keyword_embedding, distractor_embeddings,distractors,4,0.7)
    max_keywords = min(len(distractors_new), 5)
    filtered_keywords = mmr(keyword_embedding, distractor_embeddings, distractors_new, max_keywords, lambdaval)
    # filtered_keywords = filtered_keywords[1:]
    final = [word.capitalize()]
    for wrd in filtered_keywords:
        if wrd.lower() != word.lower():
            final.append(wrd.capitalize())
    final = final[1:]
    return final

sent = "What cryptocurrency did Musk rarely tweet about?"
keyword = "Bitcoin"
# sent = "What did Musk say he was working with to improve system transaction efficiency?"
# keyword = "Dogecoin"
# sent = "What company did Musk say would not accept bitcoin payments?"
# keyword = "Tesla"
# sent = "What has Musk often tweeted in support of?"
# keyword = "Cryptocurrency"
print(get_distractors(keyword, sent, s2v, sentence_transformer_model, 40, 0.2))

# Gradio UI (older gr.inputs/gr.outputs interface style, as in the original listing).
context = gr.inputs.Textbox(lines=10, placeholder="Enter paragraph/content here...")
output = gr.outputs.HTML(label="Question and Answers")
radiobutton = gr.inputs.Radio(["Wordnet", "Sense2Vec"])

def generate_question(context, radiobutton):
    summary_text = summarizer(context, summary_model, summary_tokenizer)
    for wrp in wrap(summary_text, 100):
        print(wrp)
    # np = getnounphrases(summary_text,sentence_transformer_model,3)
    np = get_keywords(context, summary_text)
    print("\n\nNoun phrases", np)
    output = ""
    for answer in np:
        ques = get_question(summary_text, answer, question_model, question_tokenizer)
        if radiobutton == "Wordnet":
            distractors = get_distractors_wordnet(answer)
        else:
            distractors = get_distractors(answer.capitalize(), ques, s2v, sentence_transformer_model, 40, 0.2)
        # output = output + ques + "\n" + "Ans: "+answer.capitalize() + "\n\n"
        # Build the HTML shown in the Gradio output (plain <b>/<br> markup; the
        # original styling tags were stripped from this listing).
        output = output + "<b>" + ques + "</b>"
        output = output + "<br>"
        output = output + "<b>" + "Ans: " + answer.capitalize() + "</b>" + "<br>"
        if len(distractors) > 0:
            for distractor in distractors[:4]:
                output = output + "<b>" + distractor + "</b>" + "<br>"
        output = output + "<br>"

    # Append the summary with every answer keyword highlighted.
    summary = "Summary: " + summary_text
    for answer in np:
        summary = summary.replace(answer, "<b>" + answer + "</b>")
        summary = summary.replace(answer.capitalize(), "<b>" + answer.capitalize() + "</b>")
    output = output + "<p>" + summary + "</p>"
    output = output + "<br>"
    return output

iface = gr.Interface(
    fn=generate_question,
    inputs=[context, radiobutton],
    outputs=output)
iface.launch(debug=True)