prepare pretrain datasets
scripts/prepare_pretrain_datasets.py
CHANGED
@@ -27,9 +27,9 @@ for i, (block_size, subchunk_size) in enumerate([(4097, 4000)]):
         chunk_size=chunk_size,  # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
         num_workers=32,
         reorder_files=False,
-
-
-        item_loader=TokensLoader(block_size=block_size),
+        ## This is important to inform LitData that we are encoding a contiguous 1D array (tokens).
+        ## LitData skips storing metadata for each sample, e.g. all the tokens are concatenated to form one large tensor.
+        # item_loader=TokensLoader(block_size=block_size),
     )

     #
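For context, here is a minimal sketch of the pattern the new comments describe: passing a TokensLoader item loader to LitData's optimize() so the output is treated as one contiguous 1D token stream with no per-sample metadata. The tokenize_fn, input glob, output directory, and chunk_size value below are hypothetical placeholders, not taken from this commit.

# Minimal sketch (assumptions, not the actual script): optimize() writes tokenized
# chunks, and TokensLoader tells LitData the data is one contiguous 1D token stream,
# so per-sample metadata is skipped and reads return fixed-size token blocks.
from pathlib import Path

import torch
from litdata import optimize
from litdata.streaming import TokensLoader


def tokenize_fn(filepath: str):
    # Hypothetical tokenizer: read documents from `filepath` and yield 1D token tensors.
    # The placeholder below stands in for real tokenized text.
    yield torch.randint(0, 50_000, (2048,), dtype=torch.int64)


if __name__ == "__main__":
    block_size = 4097               # matches the (4097, 4000) pair iterated in the script
    chunk_size = block_size * 2048  # number of tokens per chunk (illustrative value)
    optimize(
        fn=tokenize_fn,
        inputs=[str(p) for p in Path("data/raw").rglob("*.jsonl")],  # hypothetical input files
        output_dir="data/pretrain/train",                            # hypothetical output dir
        chunk_size=chunk_size,
        num_workers=32,
        reorder_files=False,
        item_loader=TokensLoader(block_size=block_size),
    )

The usual read-side counterpart is StreamingDataset(input_dir=..., item_loader=TokensLoader(block_size=block_size)), which slices that contiguous stream into block_size-token samples at load time.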