|
import nltk |
|
import re |
|
import nltkmodule |
|
|
|
from nltk.tokenize import word_tokenize |
|
from sentence_transformers import SentenceTransformer |
|
import pandas as pd |
|
import numpy as np |
|
from pandas import ExcelWriter |
|
from torch.utils.data import DataLoader |
|
import math |
|
from sentence_transformers import models, losses |
|
from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer |
|
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator |
|
from sentence_transformers.readers import * |
|
import logging |
|
import glob |
|
from datetime import datetime |
|
import sys |
|
from nltk.corpus import stopwords |
|
stop_words = stopwords.words('english') |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
import scipy.spatial |
|
import networkx as nx |
|
from nltk.tokenize import sent_tokenize |
|
import scispacy |
|
import spacy |
|
import en_core_sci_lg |
|
from spacy import displacy |
|
from scispacy.abbreviation import AbbreviationDetector |
|
from scispacy.umls_linking import UmlsEntityLinker |
|
from transformers import AutoTokenizer, AutoModel |
|
import statistics |
|
import string |
|
from nltk.stem.wordnet import WordNetLemmatizer |
|
import gradio as gr |
|
|
|
# Load the ``en_core_sci_lg`` pipeline a single time.  ``nlp`` is the pipeline
# used for entity extraction; ``sp`` is kept as a second name for the very
# same object so that existing references to either name keep working without
# loading the model twice.
nlp = en_core_sci_lg.load()
sp = nlp

# Stop words taken from the pipeline's language defaults; used to filter
# extracted entities.
all_stopwords = sp.Defaults.stop_words
|
|
|
|
|
def remove_stopwords(sen):
    """Join the items of ``sen`` that are not stop words into one string.

    Args:
        sen: Iterable of tokens; each item is checked against the
            module-level ``stop_words`` collection.

    Returns:
        The retained items, in their original order, separated by single
        spaces.
    """
    kept_tokens = []
    for token in sen:
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)
|
|
|
def keyphrase_generator(article, model_1, model_2, max_num_keywords):
    """Extract keyphrases from an article via TextRank plus entity ranking.

    The article is split into sentences, the sentences are embedded with
    ``model_1`` and ranked with PageRank over their pairwise cosine-similarity
    graph.  The top-ranked sentences (one tenth of the article, with a floor
    of five, capped at the number of sentences available) are joined into a
    summary document.  Entities that the module-level ``nlp`` pipeline finds
    in those sentences are embedded with ``model_2`` and compared against the
    embedding of the summary document.

    Args:
        article: Full text of the article.
        model_1: Model name or path given to ``SentenceTransformer`` for
            ranking sentences.
        model_2: Model name or path given to ``SentenceTransformer`` for
            scoring candidate keyphrases.
        max_num_keywords: Maximum number of keyphrases to return.  Converted
            with ``int()`` in case the caller passes a float.

    Returns:
        The selected keyphrases joined by newlines, ordered from least to
        most similar to the summary document.  An empty string is returned
        when the article is blank, no candidate entities are found, or
        ``max_num_keywords`` is not positive.
    """
    top_n = int(max_num_keywords)
    # Guard clauses: nothing to rank, or nothing requested.  (Without the
    # ``top_n`` check a value of 0 would slice ``[-0:]`` and return everything.)
    if not article or not article.strip() or top_n <= 0:
        return ""

    corpus = sent_tokenize(article)
    if not corpus:
        return ""

    sentence_model = SentenceTransformer(model_1)
    keyphrase_model = SentenceTransformer(model_2)

    # Replace every non-letter with a space before embedding.  ``re.sub`` is
    # used so the pattern is always treated as a regular expression,
    # independent of the installed pandas version's ``str.replace`` default.
    clean_sentences = [re.sub(r"[^a-zA-Z]", " ", sentence) for sentence in corpus]
    corpus_embeddings = sentence_model.encode(clean_sentences)

    # Full pairwise similarity in one call; works for any embedding width.
    # The diagonal is zeroed so a sentence does not vote for itself.
    sim_mat = cosine_similarity(corpus_embeddings)
    np.fill_diagonal(sim_mat, 0.0)

    scores = nx.pagerank(nx.from_numpy_array(sim_mat))
    ranked = sorted(
        ((scores[i], sentence) for i, sentence in enumerate(corpus)),
        reverse=True,
    )
    ranked_sentences = [sentence for _, sentence in ranked]

    # Keep 10% of the sentences but at least five; slicing (rather than
    # indexing) tolerates articles with fewer than five sentences.
    total = max(5, len(ranked_sentences) // 10)
    document = ranked_sentences[:total]
    doc = " ".join(document)

    # Collect entity texts from the selected sentences, drop stop words and
    # de-duplicate while preserving first-seen order.
    entities = []
    for sentence in document:
        entities.extend(ent.text for ent in nlp(sentence).ents)
    candidates = list(
        dict.fromkeys(ent for ent in entities if ent not in all_stopwords)
    )
    if not candidates:
        return ""

    doc_embedding = keyphrase_model.encode([doc])
    candidate_embeddings = keyphrase_model.encode(candidates)
    similarities = cosine_similarity(doc_embedding, candidate_embeddings)

    # ``argsort`` is ascending, so the last ``top_n`` indices are the most
    # similar candidates (least similar of them first).
    best_indices = similarities.argsort()[0][-top_n:]
    return "\n".join(candidates[index] for index in best_indices)
|
|
|
|
|
# Build the Gradio interface around ``keyphrase_generator`` (article text,
# two SBERT model names and a keyword-count slider as inputs; plain text as
# output) and launch it with ``share=True``.
# NOTE(review): the sentence "The work is part of the paper ." has no paper
# title in the original text; fill it in once the title is known.
gr.Interface(
    keyphrase_generator,
    inputs=[
        gr.inputs.Textbox(
            lines=10,
            placeholder="Copy article text here",
            default="",
            label="article text",
        ),
        gr.inputs.Textbox(
            lines=1,
            placeholder="SBERT model",
            default="all-mpnet-base-v2",
            label="Model for TextRank (e.g. all-mpnet-base-v2)",
        ),
        gr.inputs.Textbox(
            lines=1,
            placeholder="SBERT model",
            default="all-distilroberta-v1",
            label="Model for keyphrases (e.g. all-distilroberta-v1)",
        ),
        gr.inputs.Slider(minimum=5, maximum=30, step=1, default=10, label="Max Keywords"),
    ],
    outputs="text",
    theme=None,
    title="Scientific Article Keyphrase Generator",
    article=(
        "Generates the keyphrases from an article which best describes the article."
        "\t The work is part of the paper ."
        "\t It uses the TextRank algorithm to first find the top sentences and then extracts the keyphrases from those sentences."
        "\t The list of SBERT models required in the textboxes can be found in https://www.sbert.net/docs/pretrained_models.html."
        "\t The default model names are provided which can be changed from the list of pretrained models. "
        "\t The value of output keyphrases can be changed. The default value is 10, minimum is 5 and a maximum value of 30."
    ),
).launch(share=True)