prepare pretrain datasets
scripts/prepare_pretrain_datasets.py
CHANGED
@@ -27,9 +27,9 @@ for i, (block_size, subchunk_size) in enumerate([(4097, 4000)]):
         chunk_size=chunk_size,  # Number of tokens to store by chunks. This is roughly 64MB of tokens per chunk.
         num_workers=32,
         reorder_files=False,
-
-
-        item_loader=TokensLoader(block_size=block_size),
+        ## This is important to inform LitData that we are encoding a contiguous 1D array (tokens).
+        ## LitData skips storing metadata for each sample, e.g. all the tokens are concatenated to form one large tensor.
+        # item_loader=TokensLoader(block_size=block_size),
     )

     #
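For context, here is a minimal sketch of the pattern the new comments describe: passing a TokensLoader item loader to LitData's optimize() so the output is treated as one contiguous 1D token stream with no per-sample metadata. The tokenize_fn, input glob, output directory, and chunk_size value below are hypothetical placeholders, not taken from this commit.

# Minimal sketch (assumptions, not the actual script): optimize() writes tokenized
# chunks, and TokensLoader tells LitData the data is one contiguous 1D token stream,
# so per-sample metadata is skipped and reads return fixed-size token blocks.
from pathlib import Path

import torch
from litdata import optimize
from litdata.streaming import TokensLoader


def tokenize_fn(filepath: str):
    # Hypothetical tokenizer: read documents from `filepath` and yield 1D token tensors.
    # The placeholder below stands in for real tokenized text.
    yield torch.randint(0, 50_000, (2048,), dtype=torch.int64)


if __name__ == "__main__":
    block_size = 4097               # matches the (4097, 4000) pair iterated in the script
    chunk_size = block_size * 2048  # number of tokens per chunk (illustrative value)
    optimize(
        fn=tokenize_fn,
        inputs=[str(p) for p in Path("data/raw").rglob("*.jsonl")],  # hypothetical input files
        output_dir="data/pretrain/train",                            # hypothetical output dir
        chunk_size=chunk_size,
        num_workers=32,
        reorder_files=False,
        item_loader=TokensLoader(block_size=block_size),
    )

The usual read-side counterpart is StreamingDataset(input_dir=..., item_loader=TokensLoader(block_size=block_size)), which slices that contiguous stream into block_size-token samples at load time.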