Spaces:
Sleeping
Sleeping
danicafisher
committed on
Commit
•
3139a4f
1
Parent(s):
008dbaf
Update app.py
Browse files
app.py
CHANGED
@@ -26,31 +26,26 @@ for filename in os.listdir(directory):
|
|
26 |
docs = loader.load()
|
27 |
documents.extend(docs)
|
28 |
|
29 |
-
# Split the documents
|
30 |
-
text_splitter = RecursiveCharacterTextSplitter(
|
31 |
-
chunk_size=500,
|
32 |
-
chunk_overlap=40,
|
33 |
-
length_function=len,
|
34 |
-
is_separator_regex=False
|
35 |
-
)
|
36 |
-
rag_documents = text_splitter.split_documents(documents)
|
37 |
-
|
38 |
-
# # Alternative chunking: Tokens (more accurate for OpenAI models)
|
39 |
-
# token_text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
|
40 |
-
# encoding="cl100k_base", chunk_size=100, chunk_overlap=0
|
41 |
-
# )
|
42 |
-
# token_rag_documents = token_text_splitter.split_documents(documents)
|
43 |
-
# # TO DO ^^ test
|
44 |
-
|
45 |
# Split the documents by character
|
46 |
-
|
47 |
separator="\n\n",
|
48 |
chunk_size=1000,
|
49 |
chunk_overlap=200,
|
50 |
length_function=len,
|
51 |
is_separator_regex=False,
|
52 |
)
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
embedding = OpenAIEmbeddings(model="text-embedding-3-small")
|
56 |
|
|
|
26 |
docs = loader.load()
|
27 |
documents.extend(docs)
|
28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
# Split the documents by character
|
30 |
+
character_text_splitter = CharacterTextSplitter(
|
31 |
separator="\n\n",
|
32 |
chunk_size=1000,
|
33 |
chunk_overlap=200,
|
34 |
length_function=len,
|
35 |
is_separator_regex=False,
|
36 |
)
|
37 |
+
rag_documents = character_text_splitter.split_documents(documents)
|
38 |
+
|
39 |
+
# Split the documents recursively
|
40 |
+
recursive_text_splitter = RecursiveCharacterTextSplitter(
|
41 |
+
chunk_size=500,
|
42 |
+
chunk_overlap=40,
|
43 |
+
length_function=len,
|
44 |
+
is_separator_regex=False
|
45 |
+
)
|
46 |
+
# rag_documents = recursive_text_splitter.split_documents(documents)
|
47 |
+
|
48 |
+
|
49 |
|
50 |
embedding = OpenAIEmbeddings(model="text-embedding-3-small")
|
51 |
|