Spaces:
Sleeping
Sleeping
danicafisher
committed on
Commit
•
008dbaf
1
Parent(s):
768b51c
Update app.py
Browse files
app.py
CHANGED
@@ -35,12 +35,22 @@ text_splitter = RecursiveCharacterTextSplitter(
|
|
35 |
)
|
36 |
rag_documents = text_splitter.split_documents(documents)
|
37 |
|
38 |
-
# Alternative chunking: Tokens (more accurate for OpenAI models)
|
39 |
-
token_text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
)
|
42 |
-
|
43 |
-
# TO DO ^^ test
|
44 |
|
45 |
embedding = OpenAIEmbeddings(model="text-embedding-3-small")
|
46 |
|
|
|
35 |
)
|
36 |
rag_documents = text_splitter.split_documents(documents)
|
37 |
|
38 |
+
# # Alternative chunking: Tokens (more accurate for OpenAI models)
|
39 |
+
# token_text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
|
40 |
+
# encoding="cl100k_base", chunk_size=100, chunk_overlap=0
|
41 |
+
# )
|
42 |
+
# token_rag_documents = token_text_splitter.split_documents(documents)
|
43 |
+
# # TO DO ^^ test
|
44 |
+
|
45 |
+
# Split the documents by character
|
46 |
+
text_splitter = CharacterTextSplitter(
|
47 |
+
separator="\n\n",
|
48 |
+
chunk_size=1000,
|
49 |
+
chunk_overlap=200,
|
50 |
+
length_function=len,
|
51 |
+
is_separator_regex=False,
|
52 |
)
|
53 |
+
character_rag_documents = text_splitter.split_documents(documents)
|
|
|
54 |
|
55 |
embedding = OpenAIEmbeddings(model="text-embedding-3-small")
|
56 |
|