import logging
import string
import subprocess

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor
import regex as re

# pke is not on PyPI, so install it from GitHub (plus the spaCy English
# model it needs) if the import fails.
try:
    import pke
    logging.info("pke already installed")
except ImportError:
    logging.info("installing pke")
    subprocess.run(['pip3', 'install', 'git+https://github.com/boudinfl/pke.git'])
    # 'en' is a deprecated spaCy shortcut; the full model name works on
    # both spaCy 2.x and 3.x.
    subprocess.run(['python3', '-m', 'spacy', 'download', 'en_core_web_sm'])
    import pke


def tokenize_sentence(text):
    """Split text into sentences, dropping very short ones."""
    sentences = sent_tokenize(text)
    sentences = [s.strip() for s in sentences if len(s) > 20]
    return sentences


def get_noun_adj_verb(text):
    """Extract up to five keyphrases (nouns, verbs, adjectives) from text
    using pke's MultipartiteRank."""
    output = []
    try:
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=text, language='en', normalization=None)
        # Keyphrase candidate selection: restrict candidates to these POS
        # tags (other options include 'ADP', 'ADV', 'AUX', 'DET', 'NUM',
        # 'PART', 'PROPN', 'PUNCT').
        extractor.candidate_selection(pos={'NOUN', 'VERB', 'ADJ'})
        # Candidate weighting: cluster candidates into topics and rank them.
        extractor.candidate_weighting(threshold=0.9, method='average', alpha=1.1)
        # Extract the top-n keyphrases.
        keyphrases = extractor.get_n_best(n=5)
        for val in keyphrases:
            output.append(val[0])
    except Exception as e:
        print("found exception", e)
    return list(set(output))
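

# A minimal usage sketch, assuming the functions above are run as a script;
# the sample passage below is illustrative and not from the original source.
if __name__ == '__main__':
    sample_text = (
        "The mitochondrion is the powerhouse of the cell. "
        "It generates most of the cell's supply of adenosine triphosphate, "
        "which is used as a source of chemical energy."
    )
    # Sentences longer than 20 characters, stripped of surrounding whitespace.
    print(tokenize_sentence(sample_text))
    # Up to five keyphrases built from nouns, verbs, and adjectives.
    print(get_noun_adj_verb(sample_text))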