from collections import Counter
from nltk.tokenize import RegexpTokenizer
from source.config import Config


class Vocab:
    """
    Offers word2index and index2word functionality after counting words in input sentences.
    Allows choosing the size of the vocabulary by taking the most common words.
    Explicitly reserves four indices: <pad>, <sos>, <eos> and <unk>.
    """

    def __init__(self, sentence_splitter=None):
        """
        Args:
            sentence_splitter: tokenizing function; if None, a regex-based word
                tokenizer that also keeps <...> special tokens is used
        """
        self.config = Config()
        self.counter = Counter()
        self.word2index = dict()
        self.index2word = dict()
        self.size = 0

        # predefined tokens
        self.PADDING_INDEX = 0
        self.SOS = 1
        self.EOS = 2
        self.UNKNOWN_WORD_INDEX = 3
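        # indices 0-3 above are the four reserved slots for the <pad>, <sos>, <eos>
        # and <unk> tokens mentioned in the class docstring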

        if sentence_splitter is None:
            # matches runs of word characters, optionally wrapped in < > so that
            # special tokens such as <sos> survive as single tokens
            word_regex = r'(?:\w+|<\w+>)'
            # tokenize the string into words
            sentence_splitter = RegexpTokenizer(word_regex).tokenize
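            # e.g. the default tokenizer turns "Hello <sos> world!" into
            # ['Hello', '<sos>', 'world']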
        self.splitter = sentence_splitter

    def add_sentence(self, sentence: str):
        """
        Update word counts from sentence after tokenizing it into words.
        """
        self.counter.update(self.splitter(sentence))
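        # e.g. add_sentence("the cat") followed by add_sentence("the dog")
        # leaves self.counter == Counter({'the': 2, 'cat': 1, 'dog': 1})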

    def word_to_index(self, word: str) -> int:
        """ Map a word to its index using the word2index dictionary of the vocabulary.

        Args:
            word (str): word to be mapped

        Returns:
            int: index of the word, or UNKNOWN_WORD_INDEX if the word is not in the vocabulary
        """
        try:
            return self.word2index[word]
        except KeyError:
            return self.UNKNOWN_WORD_INDEX

    def index_to_word(self, index: int) -> str:
        """ Map an index to its word using the index2word dictionary of the vocabulary.

        Args:
            index (int): index to be mapped

        Returns:
            str: word matched to the index, or the word stored at UNKNOWN_WORD_INDEX
                if the index is not in the vocabulary
        """
        try:
            return self.index2word[index]
        except KeyError:
            return self.index2word[self.UNKNOWN_WORD_INDEX]

    def load_vocab(self, filepath: str):
        """ Load the word2index and index2word dictionaries from a text file.

        Args:
            filepath (str): path of the text file where the vocabulary is saved (e.g. 'word2index.txt')

        Note: each line in the file is assumed to be of the form 'word SPACE index',
        preceded by a single header line.
        """
        self.word2index = dict()
        self.index2word = dict()
        try:
            with open(filepath) as file:
                # skip the header line that the file format is documented to contain
                next(file, None)
                for line in file:
                    parts = line.strip().split(' ')
                    word, index = parts[0], parts[1]
                    self.word2index[word] = int(index)
                    self.index2word[int(index)] = word
        except Exception as e:
            print(f"Error loading vocabulary from file {filepath}: {e}")