|
from .TermCandidate import TermCandidate |
|
|
|
|
|
class TermCandidateExtractor: |
|
def __init__(self, doc): |
|
self.doc = doc |
|
|
|
def __iter__(self): |
|
for sent in self.doc.sents: |
|
for candidate in self._get_candidates_in_sent(sent, self.doc): |
|
yield candidate |
|
|
|
def _get_candidates_in_sent(self, sent, doc): |
|
root = list(filter(lambda token: token.dep_ == "ROOT", sent))[0] |
|
|
|
excluded_children = [] |
|
candidates = [] |
|
|
|
def get_candidates(node, doc): |
|
|
|
if (node.pos_ in ["PROPN", "NOUN"]) and node.pos_ not in ["PRON"]: |
|
term_candidates = TermCandidate(doc[node.i:node.i + 1]) |
|
|
|
for child in node.children: |
|
|
|
start_index = min(node.i, child.i) |
|
end_index = max(node.i, child.i) |
|
|
|
if child.dep_ == "compound" or child.dep_ == "amod": |
|
subtree_tokens = list(child.subtree) |
|
if all([c.dep_ == "compound" for c in subtree_tokens]): |
|
start_index = min([c.i for c in subtree_tokens]) |
|
term_candidates.append(doc[start_index:end_index + 1]) |
|
|
|
if not child.dep_ == "amod": |
|
term_candidates.append(doc[start_index:start_index + 1]) |
|
excluded_children.append(child) |
|
|
|
if child.dep_ == "prep" and child.text == "of": |
|
end_index = max([c.i for c in child.subtree]) |
|
term_candidates.append(doc[start_index:end_index + 1]) |
|
|
|
candidates.append(term_candidates) |
|
|
|
for child in node.children: |
|
if child in excluded_children: |
|
continue |
|
get_candidates(child, doc) |
|
|
|
get_candidates(root, doc) |
|
|
|
return candidates |
|
|