File size: 2,714 Bytes
4a6b1d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51fb126
 
 
 
4a6b1d2
 
 
 
 
 
51fb126
4a6b1d2
 
 
51fb126
4a6b1d2
51fb126
 
 
4a6b1d2
51fb126
 
 
 
 
4a6b1d2
51fb126
 
 
 
 
 
 
 
4a6b1d2
 
 
 
 
 
 
 
51fb126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import nltk
# Fetch the NLTK data packages this module uses. These calls execute at import
# time, before anything else in the module runs.
# 'stopwords' is read by stopwords.words('english') when the stoplist is built.
nltk.download('stopwords')
# NOTE(review): wordnet is imported below but not used in the visible code --
# confirm whether this download is still needed.
nltk.download('wordnet')
# Presumably the sentence model behind sent_tokenize; newer NLTK releases may
# require 'punkt_tab' instead -- TODO confirm against the installed version.
nltk.download('punkt')
from nltk.corpus import stopwords,wordnet
from nltk.tokenize import sent_tokenize
import string
import subprocess
import logging

# Import pke; when it cannot be imported, install it from GitHub together with
# the spaCy English model and retry the import once.
try:
    import pke
    # A successful import is not an error, so report it at INFO level.
    logging.info("importing pke info")
except ImportError:
    # Only a missing or unimportable package should trigger the installer; a
    # bare ``except:`` would also swallow KeyboardInterrupt and SystemExit.
    logging.warning("installing pke info")
    subprocess.run(['pip3', 'install', 'git+https://github.com/boudinfl/pke.git'])
    # NOTE(review): spaCy 3+ rejects the 'en' shortcut; confirm whether this
    # should request 'en_core_web_sm' for the spaCy version pke pulls in.
    subprocess.run(['python3', '-m', 'spacy', 'download', 'en'])
    # If installation failed, this import raises and aborts module loading.
    import pke

# Tokens handed to pke as the stoplist, in this order: every punctuation
# character, pke's English stopwords, the bracket placeholder tokens
# ('-lrb-', '-rrb-', ...), and NLTK's English stopwords. Duplicates between
# the two stopword sources are kept as-is.
stoplist = [
    *string.punctuation,
    *pke.lang.stopwords.get('en'),
    '-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-',
    *stopwords.words('english'),
]

def tokenize_sentence(text):
    """Split *text* into sentences and keep those longer than 20 characters.

    The length check is applied to each sentence exactly as ``sent_tokenize``
    returns it (before trimming); the sentences that pass are returned with
    leading and trailing whitespace removed.
    """
    kept = []
    for sentence in sent_tokenize(text):
        if len(sentence) > 20:
            kept.append(sentence.strip())
    return kept

def get_multipartiterank_topics(text):
    """Extract up to five keyphrases from *text* with pke's MultipartiteRank.

    Candidates are limited to nouns, verbs and adjectives, and the
    module-level ``stoplist`` is passed to ``load_document``.

    Args:
        text: Document text handed to ``load_document`` as ``input``.

    Returns:
        list: Distinct keyphrase strings, in the order ``get_n_best``
        returned them. Empty when extraction fails; failures are logged
        and never propagated to the caller.
    """
    output = []
    try:
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=text, language='en', normalization=None, stoplist=stoplist)
        # POS tags that can be selected here:
        # 'ADJ' 'ADP' 'ADV' 'AUX' 'DET' 'NOUN' 'NUM' 'PART' 'PROPN' 'PUNCT' 'VERB'
        extractor.candidate_selection(pos={'NOUN', 'VERB', 'ADJ'})
        extractor.candidate_weighting(threshold=0.7, method='average', alpha=1.1)
        keyphrases = extractor.get_n_best(n=5)

        # Each entry is indexable; element 0 is the keyphrase text.
        output.extend(entry[0] for entry in keyphrases)
    except Exception:
        # Best-effort: keep the caller alive, but record the full traceback.
        logging.exception("found exception")
    # dict.fromkeys removes duplicates while keeping first-seen order, unlike
    # set(), whose iteration order for strings can differ between runs.
    return list(dict.fromkeys(output))

def get_topicrank_topics(text):
    """Extract up to five keyphrases from *text* with pke's TopicRank.

    Candidates are limited to nouns and adjectives, and the module-level
    ``stoplist`` is passed to ``load_document``.

    Args:
        text: Document text handed to ``load_document`` as ``input``.

    Returns:
        list: Distinct keyphrase strings, in the order ``get_n_best``
        returned them. Empty when extraction fails; failures are logged
        and never propagated to the caller.
    """
    output = []
    try:
        extractor = pke.unsupervised.TopicRank()
        extractor.load_document(input=text, language='en', normalization=None, stoplist=stoplist)
        # POS tags that can be selected here:
        # 'ADJ' 'ADP' 'ADV' 'AUX' 'DET' 'NOUN' 'NUM' 'PART' 'PROPN' 'PUNCT' 'VERB'
        extractor.candidate_selection(pos={'NOUN', 'ADJ'})
        extractor.candidate_weighting(threshold=0.7, method='average')
        keyphrases = extractor.get_n_best(n=5)

        # Each entry is indexable; element 0 is the keyphrase text.
        output.extend(entry[0] for entry in keyphrases)
    except Exception:
        # Best-effort: keep the caller alive, but record the full traceback.
        logging.exception("found exception")
    # dict.fromkeys removes duplicates while keeping first-seen order, unlike
    # set(), whose iteration order for strings can differ between runs.
    return list(dict.fromkeys(output))

def get_yake_topics(text):
    """Extract up to five keyphrases from *text* with pke's YAKE model.

    YAKE is a statistical model; the original author noted that it performs
    very poorly here. Candidates are selected with ``n=3``, weighted with
    ``window=2``, and the best five are requested with ``threshold=0.9``.
    The module-level ``stoplist`` is passed to ``load_document``.

    Args:
        text: Document text handed to ``load_document`` as ``input``.

    Returns:
        list: Distinct keyphrase strings, in the order ``get_n_best``
        returned them. Empty when extraction fails; failures are logged
        and never propagated to the caller.
    """
    output = []
    try:
        extractor = pke.unsupervised.YAKE()
        extractor.load_document(input=text, language='en', normalization=None, stoplist=stoplist)
        extractor.candidate_selection(n=3)
        extractor.candidate_weighting(window=2)
        keyphrases = extractor.get_n_best(n=5, threshold=0.9)

        # Each entry is indexable; element 0 is the keyphrase text.
        output.extend(entry[0] for entry in keyphrases)
    except Exception:
        # Best-effort: keep the caller alive, but record the full traceback.
        logging.exception("found exception")
    # dict.fromkeys removes duplicates while keeping first-seen order, unlike
    # set(), whose iteration order for strings can differ between runs.
    return list(dict.fromkeys(output))