devve1 committed on
Commit
6f68bae
1 Parent(s): 5c0c14b

Update cluster_semantic_chunker.py

Browse files
Files changed (1) hide show
  1. cluster_semantic_chunker.py +1 -2
cluster_semantic_chunker.py CHANGED
@@ -11,8 +11,7 @@ from infinity_emb import AsyncEmbeddingEngine
11
 
12
  def bert_token_count(tokenizer, text: str)-> int:
13
  """Returns the number of tokens in a text string."""
14
- print(f'NUM TOKENS : {len(tokenizer(text, add_special_tokens=True))}')
15
- return len(tokenizer(text, add_special_tokens=True))
16
 
17
  class ClusterSemanticChunker(BaseChunker):
18
  def __init__(self, embed_model: AsyncEmbeddingEngine =None, max_chunk_size=400, min_chunk_size=50, length_function=bert_token_count):
 
11
 
12
  def bert_token_count(tokenizer, text: str)-> int:
13
  """Returns the number of tokens in a text string."""
14
+ return len(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text, add_special_tokens=True)))
 
15
 
16
  class ClusterSemanticChunker(BaseChunker):
17
  def __init__(self, embed_model: AsyncEmbeddingEngine =None, max_chunk_size=400, min_chunk_size=50, length_function=bert_token_count):