danicafisher committed on
Commit
008dbaf
1 Parent(s): 768b51c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -5
app.py CHANGED
@@ -35,12 +35,22 @@ text_splitter = RecursiveCharacterTextSplitter(
35
  )
36
  rag_documents = text_splitter.split_documents(documents)
37
 
38
- # Alternative chunking: Tokens (more accurate for OpenAI models)
39
- token_text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
40
- encoding="cl100k_base", chunk_size=100, chunk_overlap=0
 
 
 
 
 
 
 
 
 
 
 
41
  )
42
- token_rag_documents = token_text_splitter.split_documents(documents)
43
- # TO DO ^^ test
44
 
45
  embedding = OpenAIEmbeddings(model="text-embedding-3-small")
46
 
 
35
  )
36
  rag_documents = text_splitter.split_documents(documents)
37
 
38
+ # # Alternative chunking: Tokens (more accurate for OpenAI models)
39
+ # token_text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
40
+ # encoding="cl100k_base", chunk_size=100, chunk_overlap=0
41
+ # )
42
+ # token_rag_documents = token_text_splitter.split_documents(documents)
43
+ # # TO DO ^^ test
44
+
45
+ # Split the documents by character
46
+ text_splitter = CharacterTextSplitter(
47
+ separator="\n\n",
48
+ chunk_size=1000,
49
+ chunk_overlap=200,
50
+ length_function=len,
51
+ is_separator_regex=False,
52
  )
53
+ character_rag_documents = text_splitter.split_documents(documents)
 
54
 
55
  embedding = OpenAIEmbeddings(model="text-embedding-3-small")
56