devve1 committed on
Commit
c446ecb
1 Parent(s): 7e7d581

Update cluster_semantic_chunker.py

Browse files
Files changed (1) hide show
  1. cluster_semantic_chunker.py +6 -2
cluster_semantic_chunker.py CHANGED
@@ -7,12 +7,16 @@ from recursive_token_chunker import RecursiveTokenChunker
7
 
8
  from chunking_evaluation.utils import get_openai_embedding_function, openai_token_count
9
 
 
 
 
 
10
  class ClusterSemanticChunker(BaseChunker):
11
- def __init__(self, embedding_function=None, max_chunk_size=400, min_chunk_size=50, length_function=openai_token_count):
12
  self.splitter = RecursiveTokenChunker(
13
  chunk_size=min_chunk_size,
14
  chunk_overlap=0,
15
- length_function=openai_token_count,
16
  separators = ["\n\n", "\n", ".", "?", "!", " ", ""]
17
  )
18
 
 
7
 
8
  from chunking_evaluation.utils import get_openai_embedding_function, openai_token_count
9
 
10
+ def bert_token_count(string: str)-> int:
11
+ """Returns the number of tokens in a text string."""
12
+ return len(tokenizer(string, add_special_tokens=True).input_ids[0])
13
+
14
  class ClusterSemanticChunker(BaseChunker):
15
+ def __init__(self, embedding_function=None, max_chunk_size=400, min_chunk_size=50, length_function=bert_token_count):
16
  self.splitter = RecursiveTokenChunker(
17
  chunk_size=min_chunk_size,
18
  chunk_overlap=0,
19
+ length_function=length_function,
20
  separators = ["\n\n", "\n", ".", "?", "!", " ", ""]
21
  )
22