Update cluster_semantic_chunker.py
cluster_semantic_chunker.py CHANGED
```diff
@@ -7,12 +7,16 @@ from recursive_token_chunker import RecursiveTokenChunker
 
 from chunking_evaluation.utils import get_openai_embedding_function, openai_token_count
 
+def bert_token_count(string: str)-> int:
+    """Returns the number of tokens in a text string."""
+    return len(tokenizer(string, add_special_tokens=True).input_ids[0])
+
 class ClusterSemanticChunker(BaseChunker):
-    def __init__(self, embedding_function=None, max_chunk_size=400, min_chunk_size=50, length_function=openai_token_count):
+    def __init__(self, embedding_function=None, max_chunk_size=400, min_chunk_size=50, length_function=bert_token_count):
         self.splitter = RecursiveTokenChunker(
             chunk_size=min_chunk_size,
             chunk_overlap=0,
-            length_function=openai_token_count,
+            length_function=length_function,
             separators = ["\n\n", "\n", ".", "?", "!", " ", ""]
         )
```
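For context, the new default `length_function` relies on a module-level `tokenizer` that sits outside this hunk. Below is a minimal, self-contained sketch of how such a counter could be wired up with the Hugging Face `transformers` library; the `bert-base-uncased` checkpoint is an assumption, not part of the commit. Note that when a fast tokenizer is called on a single string, `input_ids` is already a flat list of token ids, so the length is taken over `input_ids` directly rather than `input_ids[0]` as in the committed version.

```python
# Hypothetical standalone version of the token counter added in this commit.
# Assumes a Hugging Face BERT tokenizer; the diff itself does not show how
# `tokenizer` is constructed or which checkpoint is used.
from transformers import AutoTokenizer

# Assumed checkpoint for illustration only.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def bert_token_count(string: str) -> int:
    """Return the number of BERT tokens in a text string, including special tokens."""
    # For a single input string, input_ids is a flat list of ids.
    return len(tokenizer(string, add_special_tokens=True).input_ids)

if __name__ == "__main__":
    # e.g. 6 tokens: [CLS] hello , world ! [SEP]
    print(bert_token_count("Hello, world!"))
```

With a counter like this, passing it as `length_function` makes `RecursiveTokenChunker` measure chunk sizes in BERT tokens instead of OpenAI tokens, which matters when the downstream embedding model uses a BERT-style vocabulary.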