Update cluster_semantic_chunker.py
Browse files
cluster_semantic_chunker.py
CHANGED
@@ -11,8 +11,7 @@ from infinity_emb import AsyncEmbeddingEngine
|
|
11 |
|
12 |
# Pre-change version (the lines marked "-" in this hunk): takes len() of the
# object returned by calling the tokenizer itself, not of a token list.
# NOTE(review): with Hugging Face-style tokenizers that return value is
# presumably a mapping-like encoding, so this would count its fields rather
# than tokens — confirm against the tokenizer actually passed in.
def bert_token_count(tokenizer, text: str)-> int:
|
13 |
"""Returns the number of tokens in a text string."""
|
14 |
-
|
15 |
-
return len(tokenizer(text, add_special_tokens=True))
|
16 |
|
17 |
class ClusterSemanticChunker(BaseChunker):
|
18 |
def __init__(self, embed_model: AsyncEmbeddingEngine =None, max_chunk_size=400, min_chunk_size=50, length_function=bert_token_count):
|
|
|
11 |
|
12 |
def bert_token_count(tokenizer, text: str) -> int:
    """Return the number of tokens ``tokenizer`` produces for ``text``.

    Args:
        tokenizer: Object exposing ``tokenize(text, add_special_tokens=...)``
            that returns a sequence of tokens.
        text: The string to measure.

    Returns:
        The length of the token sequence returned by ``tokenizer.tokenize``
        when called with ``add_special_tokens=True``.
    """
    # Count the token list directly: converting tokens to ids is a one-to-one
    # mapping, so it cannot change the length and is skipped here.
    # NOTE(review): whether ``add_special_tokens`` is honoured by ``tokenize``
    # depends on the tokenizer implementation — confirm for the one in use.
    return len(tokenizer.tokenize(text, add_special_tokens=True))
|
|
|
15 |
|
16 |
class ClusterSemanticChunker(BaseChunker):
|
17 |
def __init__(self, embed_model: AsyncEmbeddingEngine =None, max_chunk_size=400, min_chunk_size=50, length_function=bert_token_count):
|