Update cluster_semantic_chunker.py
cluster_semantic_chunker.py CHANGED
```diff
@@ -7,12 +7,16 @@ from recursive_token_chunker import RecursiveTokenChunker
 
 from chunking_evaluation.utils import get_openai_embedding_function, openai_token_count
 
+def bert_token_count(string: str)-> int:
+    """Returns the number of tokens in a text string."""
+    return len(tokenizer(string, add_special_tokens=True).input_ids[0])
+
 class ClusterSemanticChunker(BaseChunker):
-    def __init__(self, embedding_function=None, max_chunk_size=400, min_chunk_size=50, length_function=openai_token_count):
+    def __init__(self, embedding_function=None, max_chunk_size=400, min_chunk_size=50, length_function=bert_token_count):
         self.splitter = RecursiveTokenChunker(
             chunk_size=min_chunk_size,
             chunk_overlap=0,
-            length_function=openai_token_count,
+            length_function=length_function,
             separators = ["\n\n", "\n", ".", "?", "!", " ", ""]
         )
```
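For context, the new default `length_function` relies on a module-level `tokenizer` that sits outside this hunk. Below is a minimal, self-contained sketch of how such a counter could be wired up with the Hugging Face `transformers` library; the `bert-base-uncased` checkpoint is an assumption, not part of the commit. Note that when a fast tokenizer is called on a single string, `input_ids` is already a flat list of token ids, so the length is taken over `input_ids` directly rather than `input_ids[0]` as in the committed version.

```python
# Hypothetical standalone version of the token counter added in this commit.
# Assumes a Hugging Face BERT tokenizer; the diff itself does not show how
# `tokenizer` is constructed or which checkpoint is used.
from transformers import AutoTokenizer

# Assumed checkpoint for illustration only.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def bert_token_count(string: str) -> int:
    """Return the number of BERT tokens in a text string, including special tokens."""
    # For a single input string, input_ids is a flat list of ids.
    return len(tokenizer(string, add_special_tokens=True).input_ids)

if __name__ == "__main__":
    # e.g. 6 tokens: [CLS] hello , world ! [SEP]
    print(bert_token_count("Hello, world!"))
```

With a counter like this, passing it as `length_function` makes `RecursiveTokenChunker` measure chunk sizes in BERT tokens instead of OpenAI tokens, which matters when the downstream embedding model uses a BERT-style vocabulary.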