Spaces:

GameScribes
/

Multipurpose-AI-Agent-Development

Sleeping

devve1 committed 24 days ago

Commit

89d259a

•

1 Parent(s): 40ce4f6

Update utils.py

Files changed (1) hide show

utils.py CHANGED Viewed

@@ -64,28 +64,29 @@ def rigorous_document_search(document: str, target: str):
     return reference, start_index, end_index
-def get_chunks_and_metadata(self, splitter):
-        """Split every corpus in ``self.corpus_list`` and locate each chunk in its corpus.
-
-        For each corpus id the file is read from ``self.corpora_id_paths[corpus_id]``
-        when that mapping is set, otherwise the id itself is used as the path.
-        The text is split with ``splitter.split_text`` and every chunk is located
-        in the full corpus text via ``rigorous_document_search``.
-
-        Returns:
-            A ``(documents, metadatas)`` pair of parallel lists; each metadata
-            entry holds ``start_index``, ``end_index`` and ``corpus_id``.
-
-        Raises:
-            Exception: if locating a chunk fails for any reason (the bare
-                ``except`` below converts every error into a generic Exception).
-        """
-        # Warning: metadata will be incorrect if a chunk is repeated, since we use .find() to find the start index. This isn't practically an issue for chunks over 1000 characters.
-        documents = []
-        metadatas = []
-        for corpus_id in self.corpus_list:
-            corpus_path = corpus_id
-            if self.corpora_id_paths is not None:
-                corpus_path = self.corpora_id_paths[corpus_id]
-            with open(corpus_path, 'r') as file:
-                corpus = file.read()
-            current_documents = splitter.split_text(corpus)
-            current_metadatas = []
-            for document in current_documents:
-                try:
-                    # Always searches the whole corpus, so a repeated chunk resolves to the same position.
-                    _, start_index, end_index = rigorous_document_search(corpus, document)
-                except:
-                    print(f"Error in finding {document} in {corpus_id}")
-                    raise Exception(f"Error in finding {document} in {corpus_id}")
-                # start_index, end_index = find_target_in_document(corpus, document)
-                current_metadatas.append({"start_index": start_index, "end_index": end_index, "corpus_id": corpus_id})
-            documents.extend(current_documents)
-            metadatas.extend(current_metadatas)
-        return documents, metadatas

     return reference, start_index, end_index
+def get_chunks_and_metadata(self, document, splitter, corpus_id=None):
+    """Split ``document`` into chunks and record where each chunk sits in it.
+
+    Each chunk is searched for only in the part of ``document`` that follows
+    the previously matched chunk, so a repeated chunk resolves to its next
+    occurrence instead of always mapping to the first one.
+
+    Args:
+        document: Full text to split.
+        splitter: Object exposing ``split_text(text)`` that returns the chunks.
+        corpus_id: Optional identifier stored in every metadata entry and
+            used in error messages.
+
+    Returns:
+        A ``(chunks, metadatas)`` pair of parallel lists; each metadata entry
+        holds ``start_index``, ``end_index`` (offsets into ``document``) and
+        ``corpus_id``.
+
+    Raises:
+        Exception: if a chunk cannot be located in the remaining text, or
+            whatever ``rigorous_document_search`` itself raises.
+    """
+    chunks = splitter.split_text(document)
+    metadatas = []
+    search_start_position = 0
+    # The loop variable must not be named ``document``: shadowing the parameter
+    # would make every chunk be searched inside itself.
+    for chunk in chunks:
+        try:
+            match_data = rigorous_document_search(document[search_start_position:], chunk)
+        except Exception as e:
+            print(f"Error in finding {chunk} in {corpus_id}: {e}")
+            raise
+        if match_data is None:
+            raise Exception(f"Error in finding {chunk} in {corpus_id}")
+        _, relative_start_index, relative_end_index = match_data
+        # Offsets are relative to the slice; shift them back to document coordinates.
+        start_index = search_start_position + relative_start_index
+        end_index = search_start_position + relative_end_index
+        # NOTE(review): resuming at end_index assumes consecutive chunks do not
+        # overlap; a splitter configured with overlap would need to resume earlier.
+        search_start_position = end_index
+        metadatas.append({"start_index": start_index, "end_index": end_index, "corpus_id": corpus_id})
+    return chunks, metadatas