danicafisher committed on
Commit
3139a4f
1 Parent(s): 008dbaf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -18
app.py CHANGED
@@ -26,31 +26,26 @@ for filename in os.listdir(directory):
26
  docs = loader.load()
27
  documents.extend(docs)
28
 
29
- # Split the documents
30
- text_splitter = RecursiveCharacterTextSplitter(
31
- chunk_size=500,
32
- chunk_overlap=40,
33
- length_function=len,
34
- is_separator_regex=False
35
- )
36
- rag_documents = text_splitter.split_documents(documents)
37
-
38
- # # Alternative chunking: Tokens (more accurate for OpenAI models)
39
- # token_text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
40
- # encoding="cl100k_base", chunk_size=100, chunk_overlap=0
41
- # )
42
- # token_rag_documents = token_text_splitter.split_documents(documents)
43
- # # TO DO ^^ test
44
-
45
  # Split the documents by character
46
- text_splitter = CharacterTextSplitter(
47
  separator="\n\n",
48
  chunk_size=1000,
49
  chunk_overlap=200,
50
  length_function=len,
51
  is_separator_regex=False,
52
  )
53
- character_rag_documents = text_splitter.split_documents(documents)
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  embedding = OpenAIEmbeddings(model="text-embedding-3-small")
56
 
 
26
  docs = loader.load()
27
  documents.extend(docs)
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  # Split the documents by character
30
+ character_text_splitter = CharacterTextSplitter(
31
  separator="\n\n",
32
  chunk_size=1000,
33
  chunk_overlap=200,
34
  length_function=len,
35
  is_separator_regex=False,
36
  )
37
+ rag_documents = character_text_splitter.split_documents(documents)
38
+
39
+ # Split the documents recursively
40
+ recursive_text_splitter = RecursiveCharacterTextSplitter(
41
+ chunk_size=500,
42
+ chunk_overlap=40,
43
+ length_function=len,
44
+ is_separator_regex=False
45
+ )
46
+ # rag_documents = recursive_text_splitter.split_documents(documents)
47
+
48
+
49
 
50
  embedding = OpenAIEmbeddings(model="text-embedding-3-small")
51