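# app.py: Gradio demo that extracts keyphrases from an online health article.
# It selects the most important sentences with an SBERT-based TextRank plus
# conclusion-indicator cues, then extracts and ranks scispaCy entities as keyphrases.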
import nltkmodule  # local helper module that downloads the required NLTK data
import requests
from newspaper import fulltext
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import scispacy
import en_core_sci_lg
import gradio as gr

stop_words = stopwords.words('english')
nlp = en_core_sci_lg.load()  # scispaCy biomedical model, loaded once and reused
all_stopwords = nlp.Defaults.stop_words
def remove_stopwords(sen):
    # Currently unused helper: drops English stopwords from a tokenised sentence.
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new
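# Main pipeline: (1) fetch and sentence-split the article, (2) score sentences with
# indicator cues + SBERT TextRank, (3) keep the top ~10% (at least 5) sentences,
# (4) extract scispaCy entities from them, and (5) rank those entities against the
# selected sentences with a second SBERT model to produce the keyphrases.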
def keyphrase_generator(article_link, model_1, model_2, max_num_keywords):
    element = []
    final_textrank_list = []
    document = []
    text_doc = []
    score_list = []
    sum_list = []
    model_1 = SentenceTransformer(model_1)
    model_2 = SentenceTransformer(model_2)
    url = article_link
    html = requests.get(url).text
    article = fulltext(html)
    corpus = sent_tokenize(article)
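    # Step 1: give each sentence a binary score for conclusion-style cue phrases.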
    indicator_list = ['concluded', 'concludes', 'in a study', 'concluding', 'conclude', 'in sum',
                      'in a recent study', 'therefore', 'thus', 'so', 'hence', 'as a result',
                      'accordingly', 'consequently', 'in short', 'proves that', 'shows that',
                      'suggests that', 'demonstrates that', 'found that', 'observed that',
                      'indicated that', 'suggested that', 'demonstrated that']
    for sent in corpus:
        # Scoring the list positionally keeps the scores aligned with `corpus` even
        # when sentences repeat (a dict keyed by sentence text collapses duplicates).
        has_indicator = any(sent.find(cue) != -1 for cue in indicator_list)
        score_list.append(1 if has_indicator else 0)
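    # Step 2: embed the cleaned sentences and run TextRank (PageRank over the
    # cosine-similarity graph) with the first SBERT model.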
    clean_sentences_new = pd.Series(corpus).str.replace("[^a-zA-Z]", " ", regex=True).tolist()
    corpus_embeddings = model_1.encode(clean_sentences_new)
    sim_mat = np.zeros([len(clean_sentences_new), len(clean_sentences_new)])
    for i in range(len(clean_sentences_new)):
        for j in range(len(clean_sentences_new)):
            if i != j:
                # reshape(1, -1) works for any embedding dimension, not just 768
                sim_mat[i][j] = cosine_similarity(corpus_embeddings[i].reshape(1, -1),
                                                  corpus_embeddings[j].reshape(1, -1))[0, 0]
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
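    # Step 3: combine the indicator and PageRank scores, then keep the top 10% of
    # sentences (a minimum of 5) as the summary "document".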
    sentences = ((scores[i], s) for i, s in enumerate(corpus))
    for elem in sentences:
        element.append(elem[0])
    for sc, lst in zip(score_list, element):  # add the two scores per sentence
        sum_list.append(sc + lst)
    x = sorted(((sum_list[i], s) for i, s in enumerate(corpus)), reverse=True)
    for elem in x:
        final_textrank_list.append(elem[1])
    a = int((10 * len(final_textrank_list)) / 100.0)
    total = max(a, 5)
    total = min(total, len(final_textrank_list))  # guard against very short articles
    for i in range(total):
        document.append(final_textrank_list[i])
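    # Step 4: run scispaCy NER over the selected sentences to get candidate phrases.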
doc=" ".join(document)
for i in document:
doc_1=nlp(i)
text_doc.append([X.text for X in doc_1.ents])
entity_list = [item for sublist in text_doc for item in sublist]
entity_list = [word for word in entity_list if not word in all_stopwords]
entity_list=list(dict.fromkeys(entity_list))
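    # Step 5: rank candidates by cosine similarity to the selected-sentence document
    # with the second SBERT model; return the top-n as newline-separated keyphrases.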
    doc_embedding = model_2.encode([doc])
    candidates = entity_list
    candidate_embeddings = model_2.encode(candidates)
    distances = cosine_similarity(doc_embedding, candidate_embeddings)
    top_n = max_num_keywords
    keyword_list = [candidates[index] for index in distances.argsort()[0][-top_n:]]
    keywords = '\n'.join(keyword_list)
    return keywords
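# Gradio UI: article URL, two model dropdowns (one for TextRank, one for keyphrase
# ranking), and a slider for the maximum number of keyphrases.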
igen = gr.Interface(keyphrase_generator,
                    inputs=[gr.inputs.Textbox(lines=1, placeholder="Provide an online health article web link here",
                                              default="", label="Article web link"),
                            gr.inputs.Dropdown(choices=['sentence-transformers/all-mpnet-base-v2',
                                                        'sentence-transformers/all-mpnet-base-v1',
                                                        'sentence-transformers/all-distilroberta-v1',
                                                        'pritamdeka/S-Bluebert-snli-multinli-stsb',
                                                        'pritamdeka/S-Biomed-Roberta-snli-multinli-stsb',
                                                        'sentence-transformers/stsb-mpnet-base-v2',
                                                        'sentence-transformers/stsb-roberta-base-v2',
                                                        'sentence-transformers/stsb-distilroberta-base-v2',
                                                        'sentence-transformers/nli-roberta-base-v2',
                                                        'sentence-transformers/nli-mpnet-base-v2',
                                                        'sentence-transformers/nli-distilroberta-base-v2'],
                                               type="value",
                                               default='pritamdeka/S-Biomed-Roberta-snli-multinli-stsb',
                                               label="Select any model for TextRank from the list below"),
                            gr.inputs.Dropdown(choices=['sentence-transformers/paraphrase-mpnet-base-v2',
                                                        'sentence-transformers/paraphrase-distilroberta-base-v1',
                                                        'sentence-transformers/paraphrase-xlm-r-multilingual-v1',
                                                        'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
                                                        'sentence-transformers/paraphrase-albert-small-v2',
                                                        'sentence-transformers/paraphrase-albert-base-v2',
                                                        'sentence-transformers/paraphrase-MiniLM-L12-v2',
                                                        'sentence-transformers/paraphrase-MiniLM-L6-v2',
                                                        'sentence-transformers/all-MiniLM-L12-v2',
                                                        'sentence-transformers/all-distilroberta-v1',
                                                        'sentence-transformers/paraphrase-TinyBERT-L6-v2',
                                                        'sentence-transformers/paraphrase-MiniLM-L3-v2',
                                                        'sentence-transformers/all-MiniLM-L6-v2'],
                                               type="value",
                                               default='sentence-transformers/paraphrase-distilroberta-base-v1',
                                               label="Select any model for keyphrases from the list below"),
                            gr.inputs.Slider(minimum=5, maximum=30, step=1, default=10, label="Max Keywords")],
                    outputs=gr.outputs.Textbox(type="auto", label="Output"), theme="peach",
                    title="Health Article Keyphrase Generator",
                    description="Generates the keyphrases from an online health article that best describe the article.",
                    article="The work is based on a part of the paper provided <a href='https://dl.acm.org/doi/10.1145/3487664.3487701'>here</a>."
                            "\t It uses the TextRank algorithm with <a href='https://www.sbert.net/'>SBERT</a> to first find the top sentences and then extracts the keyphrases"
                            "\t from those sentences using <a href='https://allenai.github.io/scispacy/'>scispaCy</a> and SBERT."
                            "\t The list of SBERT models provided can be found in the <a href='https://www.sbert.net/docs/pretrained_models.html'>SBERT pre-trained models hub</a>."
                            "\t The default model names are provided and can be changed to any of the listed models."
                            "\t The number of output keyphrases can be changed; the default is 10, the minimum 5, and the maximum 30.")
igen.launch(share=True)
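
# A minimal usage sketch, kept commented out so the Space still just launches the UI.
# The URL below is a hypothetical placeholder, not a tested input:
#
# keyphrases = keyphrase_generator(
#     "https://example.com/some-health-article",                 # hypothetical URL
#     "pritamdeka/S-Biomed-Roberta-snli-multinli-stsb",          # TextRank model
#     "sentence-transformers/paraphrase-distilroberta-base-v1",  # keyphrase model
#     10)                                                        # max keyphrases
# print(keyphrases)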