Spaces:

nssharmaofficial
/

ImageCaption

Sleeping

App Files Files Community

nssharmaofficial committed on Mar 30

Commit

2bdc1ae

•

1 Parent(s): 92562f2

Fix vocab filepath

Browse files

Files changed (2) hide show

source/config.py +0 -1
source/vocab.py +2 -51

source/config.py CHANGED Viewed

@@ -20,4 +20,3 @@ class Config(object):
         self.ENCODER_WEIGHT_FILE = 'source/weights/encoder-32B-512H-1L-e5.pt'
         self.DECODER_WEIGHT_FILE = 'source/weights/decoder-32B-512H-1L-e5.pt'
-        self.ROOT = os.path.join(os.path.expanduser('~'), 'Huggingface', 'ImageCaption')


20	self.ENCODER_WEIGHT_FILE = 'source/weights/encoder-32B-512H-1L-e5.pt'
21	self.DECODER_WEIGHT_FILE = 'source/weights/decoder-32B-512H-1L-e5.pt'
22

source/vocab.py CHANGED Viewed

@@ -41,53 +41,6 @@ class Vocab:
         """
         self.counter.update(self.splitter(sentence))
-    def build_vocab(self, vocab_size: int, file_name: str):
-        """ Build vocabulary dictionaries word2index and index2word from a text file at config.ROOT path
-        Args:
-            vocab_size (int): size of vocabulary (including 4 predefined tokens: <pad>, <sos>, <eos>, <unk>)
-            file_name (str): name of the text file from which the vocabulary will be built.
-                Note: the lines in file are assumed to be in form: 'word SPACE index' and
-                it assumes a header line (for example: 'captions.txt')
-        """
-        filepath = os.path.join(self.config.ROOT, file_name)
-        try:
-            with open(filepath, 'r', encoding='utf-8') as file:
-                for i, line in enumerate(file):
-                    # ignore header line
-                    if i == 0:
-                        continue
-                    caption = line.strip().lower().split(",", 1)[1]  # id=0, caption=1
-                    self.add_sentence(caption)
-        except Exception as e:
-            print(f"Error processing file {filepath}: {e}")
-            return
-        # adding predefined tokens in the vocabulary
-        self._add_predefined_tokens()
-        words = self.counter.most_common(vocab_size - 4)
-        # (index + 4) because first 4 tokens are the predefined ones
-        for index, (word, _) in enumerate(words, start=4):
-            self.word2index[word] = index
-            self.index2word[index] = word
-        self.size = len(self.word2index)
-        # adding predefined tokens in the vocabulary
-        self.index2word[self.PADDING_INDEX] = '<pad>'
-        self.word2index['<pad>'] = self.PADDING_INDEX
-        self.index2word[self.SOS] = '<sos>'
-        self.word2index['<sos>'] = self.SOS
-        self.index2word[self.EOS] = '<eos>'
-        self.word2index['<eos>'] = self.EOS
-        self.index2word[self.UNKNOWN_WORD_INDEX] = '<unk>'
-        self.word2index['<unk>'] = self.UNKNOWN_WORD_INDEX
     def word_to_index(self, word: str) -> int:
         """ Map word to index from word2index dictionary in vocabulary
@@ -116,16 +69,14 @@ class Vocab:
         except KeyError:
             return self.index2word[self.UNKNOWN_WORD_INDEX]
-    def load_vocab(self, file_name: str):
-        """ Load the word2index and index2word dictionaries from a text file at config.ROOT path
         Args:
             file_name (str): name of the text file where the vocabulary is saved (i.e 'word2index.txt')
                 Note: the lines in file are assumed to be in form: 'word SPACE index' and it assumes a header line
         """
-        filepath = os.path.join(self.config.ROOT, file_name)
         self.word2index = dict()
         self.index2word = dict()

         """
         self.counter.update(self.splitter(sentence))
     def word_to_index(self, word: str) -> int:
         """ Map word to index from word2index dictionary in vocabulary
         except KeyError:
             return self.index2word[self.UNKNOWN_WORD_INDEX]
+    def load_vocab(self, filepath: str):
+        """ Load the word2index and index2word dictionaries from a text file.
         Args:
             file_name (str): name of the text file where the vocabulary is saved (i.e 'word2index.txt')
                 Note: the lines in file are assumed to be in form: 'word SPACE index' and it assumes a header line
         """
         self.word2index = dict()
         self.index2word = dict()