|
from inputs.fields.field import Field |
|
import logging |
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
class RawTokenField(Field):
    """Field that preserves the raw text of tokens.

    No vocabulary is involved: the tokens found under ``source_key`` in each
    sentence are copied verbatim into the instance under ``namespace``.
    """

    def __init__(self, namespace, source_key):
        """Set the namespace of the field and the dataset source key.

        Arguments:
            namespace {str} -- namespace of the field; used as the key under
                which raw tokens are stored in the instance
            source_key {str} -- key used to look up the tokens in each sentence
        """

        super().__init__()
        # Both values are coerced to ``str`` so they can be used as plain keys.
        self.namespace = str(namespace)
        self.source_key = str(source_key)

    def count_vocab_items(self, counter, sentences):
        """Do nothing: `RawTokenField` doesn't update the counter.

        Arguments:
            counter {dict} -- counter (left untouched)
            sentences {list} -- text content after preprocessing (unused)
        """

    def index(self, instance, vocab, sentences):
        """Copy the raw tokens of every sentence into the instance.

        The vocabulary is not used; the raw text of the sentences (tokens)
        is preserved as-is.

        Arguments:
            instance {dict} -- container of the indexed data; the entry
                ``instance[namespace]`` must be available for appending
                (e.g. an existing list, or a mapping that creates one on
                access such as ``defaultdict(list)``)
            vocab {Vocabulary} -- vocabulary (unused)
            sentences {list} -- text content after preprocessing; each
                sentence is indexed with ``source_key``
        """

        for sentence in sentences:
            # ``list(...)`` makes a shallow copy so the stored tokens are not
            # aliased to the sentence's own token container.
            instance[self.namespace].append(list(sentence[self.source_key]))

        # Lazy %-style arguments: the message is only rendered if INFO is enabled.
        logger.info(
            "Index sentences %s to construct instance namespace %s successfully.",
            self.source_key, self.namespace)
|
|