mtasic85 committed
Commit fe25a83 · 1 Parent(s): 9f6dadd

prepare pretrain datasets

scripts/prepare_pretrain_datasets.py CHANGED
@@ -27,9 +27,9 @@ for i, (block_size, subchunk_size) in enumerate([(4097, 4000)]):
     chunk_size=chunk_size,  # Number of tokens to store per chunk; this is roughly 64MB of tokens per chunk.
     num_workers=32,
     reorder_files=False,
-    # This is important to inform LitData that we are encoding a contiguous 1D array (tokens).
-    # LitData skips storing metadata for each sample, e.g. all the tokens are concatenated to form one large tensor.
-    item_loader=TokensLoader(block_size=block_size),
+    ## This is important to inform LitData that we are encoding a contiguous 1D array (tokens).
+    ## LitData skips storing metadata for each sample, e.g. all the tokens are concatenated to form one large tensor.
+    # item_loader=TokensLoader(block_size=block_size),
 )

 #
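For context, a minimal sketch of the pipeline this hunk sits in, assuming LitData's public `optimize`/`StreamingDataset` API; `tokenize_fn`, the input file, and the output directory are hypothetical stand-ins, not names from this repo. `TokensLoader` tells LitData that the samples form one contiguous 1D token stream, so it can skip per-sample metadata on write and slice fixed `block_size` windows on read; with the `item_loader` argument commented out, as in this commit, LitData falls back to storing each sample with its own metadata.

```python
import numpy as np
from litdata import optimize
from litdata.streaming import StreamingDataset, TokensLoader

def tokenize_fn(path: str):
    # Hypothetical stand-in for a real tokenizer: yield one 1D array
    # of token ids per line so LitData can concatenate them.
    with open(path) as f:
        for line in f:
            yield np.asarray([ord(c) % 65_536 for c in line], dtype=np.uint16)

if __name__ == "__main__":
    block_size, subchunk_size = 4097, 4000

    # Write side: pack all tokens into one contiguous stream on disk.
    optimize(
        fn=tokenize_fn,
        inputs=["corpus.txt"],                  # hypothetical input file
        output_dir="output/pretrain",           # hypothetical output dir
        chunk_size=block_size * subchunk_size,  # tokens per chunk, as in the script
        num_workers=32,
        reorder_files=False,
        item_loader=TokensLoader(),             # the argument this commit comments out
    )

    # Read side: slice the stream into fixed block_size-token samples.
    dataset = StreamingDataset(
        input_dir="output/pretrain",
        item_loader=TokensLoader(block_size=block_size),
    )
    print(len(dataset), dataset[0].shape)
```

The `block_size` of 4097 here is presumably sequence length + 1, the usual layout for next-token prediction, where each sample is later split into 4096 input tokens and 4096 shifted target tokens.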