File size: 4,599 Bytes
1d43b95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from collections import defaultdict

from pke.data_structures import Candidate
from pke.readers import RawTextReader, SpacyDocReader, PreprocessedReader

from nltk import RegexpParser
from nltk.stem.snowball import SnowballStemmer
import pke
from pke.lang import stopwords, langcodes

from string import punctuation
import os
import logging
import spacy

# Load the spaCy pipeline "en_core_web_sm" once, at import time. The resulting
# object is the default `spacy_model` argument of LoadFileNew.load_document.
nlp = spacy.load("en_core_web_sm")

from pke.base import LoadFile
class LoadFileNew(LoadFile):
    """`LoadFile` variant whose `load_document` returns the parsed sentences.

    The method defaults `spacy_model` to the module-level `nlp` pipeline and
    hands back `self.sentences` so callers can inspect the result directly.
    """

    def load_document(self, input, language=None, stoplist=None,
                      normalization='stemming', spacy_model=nlp):
        """Read a document and populate sentences, stoplist and stems.

        Args:
            input: a spaCy ``Doc``, a raw text ``str``, or a list of lists
                (pre-processed sentences handed to ``PreprocessedReader``).
            language: language code of the document; ``None`` means ``'en'``.
            stoplist: custom stoplist. ``None`` selects the stoplist
                registered for ``language``; an explicitly passed empty
                list is kept as-is (i.e. no stopwords).
            normalization: ``'stemming'`` stems and lowercases every word;
                any other value only lowercases the words.
            spacy_model: spaCy pipeline forwarded to ``RawTextReader.read``
                when ``input`` is a string.

        Returns:
            The list stored in ``self.sentences`` (each sentence has its
            ``stems`` attribute filled), or ``None`` when ``input`` is of an
            unsupported type. In the latter case the object has already been
            reset by ``self.__init__()``.
        """
        # Reset object for new document
        self.__init__()

        # set the language of the document (English when unspecified)
        self.language = 'en' if language is None else language

        # word normalization (filling self.sentences[].stems)
        self.normalization = normalization

        # Initialize the stoplist. Compare against None so that an explicit
        # empty stoplist is honoured instead of being replaced by the default.
        if stoplist is not None:
            self.stoplist = stoplist
        else:
            self.stoplist = stopwords.get(self.language)
            if self.stoplist is None:
                # Never leave the stoplist as None: fall back to "no stopwords".
                logging.warning(
                    'No stoplist available in pke for \'{}\' language.'.format(
                        self.language))
                self.stoplist = []

        # Pick the reader matching the input type. These are normal code
        # paths, so they are traced at debug level rather than error level.
        if isinstance(input, spacy.tokens.doc.Doc):
            logging.debug('input is a spacy doc object instance')
            parser = SpacyDocReader()
            sents = parser.read(spacy_doc=input)
        elif isinstance(input, str):
            logging.debug('input is a string')
            parser = RawTextReader(language=self.language)
            sents = parser.read(text=input, spacy_model=spacy_model)
        elif isinstance(input, list) and all(isinstance(item, list) for item in input):
            logging.debug('input is processed text')
            parser = PreprocessedReader()
            sents = parser.read(list_of_sentence_tuples=input)
        else:
            # Kept as log-and-return-None for backward compatibility.
            # TODO raise TypeError instead?
            logging.error(
                'Cannot process input. It is neither a spacy doc, a string '
                'or a list of lists: {}'.format(type(input)))
            return

        # populate the sentences
        self.sentences = sents

        # TODO: this code could go into Reader.normalize ? Hum, not sure
        if self.normalization == 'stemming':
            # fall back to porter if english language (or unavailable languages) is used
            try:
                langcode = langcodes.get(self.language)
                if langcode == "english":
                    langcode = 'porter'
                stemmer = SnowballStemmer(langcode)
            except ValueError:
                logging.error('No stemmer available for \'{}\' language -> fall back to porter.'.format(self.language))
                stemmer = SnowballStemmer("porter")

            # populate Sentence.stems with lowercased stems
            for sentence in self.sentences:
                sentence.stems = [stemmer.stem(w).lower() for w in sentence.words]

        else:
            # no stemming requested: stems are just the lowercased words
            for sentence in self.sentences:
                sentence.stems = [w.lower() for w in sentence.words]

        return self.sentences

#
# Demo: run the loader on a sample news paragraph at import time.
# The sample is written as adjacent string literals (one per sentence) that
# concatenate to the exact original text; several sentences intentionally
# have no space after the preceding full stop.
text = (
    "On May 4, the Red Planet was rocked by a roughly magnitude 5 temblor, the largest Marsquake detected to date, NASA’s Jet Propulsion Laboratory in Pasadena, Calif., reports. "
    "The shaking lasted for more than six hours and released more than 10 times the energy of the previous record-holding quake."
    "The U.S. space agency’s InSight lander, which has been studying Mars’ deep interior since touching down on the planet in 2018 (SN: 11/26/18), recorded the event. "
    "The quake probably originated near the Cerberus Fossae region, which is more than 1,000 kilometers from the lander."
    "Cerberus Fossae is known for its fractured surface and frequent rockfalls. "
    "It makes sense that the ground would be shifting there, says geophysicist Philippe Lognonné, principal investigator of the Seismic Experiment for Interior Structure, InSight’s seismometer. "
    "“It’s an ancient volcanic bulge."
    "Just like earthquakes reveal information about our planet’s interior structure, Marsquakes can be used to probe what lies beneath Mars’ surface (SN: 7/22/21). "
    "And a lot can be learned from studying this whopper of a quake, says Lognonné, of the Institut de Physique du Globe de Paris. "
    "“The signal is so good, we’ll be able to work on the details."
)
test = LoadFileNew()
# `out` holds the sentences returned by the loader for the sample text.
out = test.load_document(text)