Spaces:

GameScribes
/

Multipurpose-AI-Agent-Development

Sleeping

devve1 committed 25 days ago

Commit

21c2659

•

1 Parent(s): 86d6332

Update utils.py

Files changed (1) hide show

utils.py CHANGED Viewed

@@ -62,4 +62,30 @@ def rigorous_document_search(document: str, target: str):
     start_index = document.find(reference)
     end_index = start_index + len(reference)
-    return reference, start_index, end_index

     start_index = document.find(reference)
     end_index = start_index + len(reference)
+    return reference, start_index, end_index
+def get_chunks_and_metadata(self, splitter):
+        """Split every corpus into chunks and record where each chunk sits in its corpus.
+
+        Args:
+            self: Object providing ``corpus_list`` (iterated for corpus ids) and
+                ``corpora_id_paths`` (``None``, or an object indexed by corpus id
+                to get the file path). When ``corpora_id_paths`` is ``None`` the
+                corpus id itself is opened as the path.
+            splitter: Object whose ``split_text(text)`` returns the chunks of
+                ``text``.
+
+        Returns:
+            A ``(documents, metadatas)`` tuple. ``documents`` holds the chunks of
+            all corpora in order; ``metadatas`` holds one dict per chunk with the
+            keys ``start_index``, ``end_index`` and ``corpus_id``.
+
+        Raises:
+            Exception: If locating a chunk in its corpus fails; the underlying
+                error is attached as ``__cause__``.
+        """
+        # Warning: metadata will be incorrect if a chunk is repeated since we use .find() to find the start index. This isn't practically an issue for chunks over 1000 characters.
+        documents = []
+        metadatas = []
+        for corpus_id in self.corpus_list:
+            corpus_path = corpus_id
+            if self.corpora_id_paths is not None:
+                corpus_path = self.corpora_id_paths[corpus_id]
+            with open(corpus_path, 'r') as file:
+                corpus = file.read()
+            current_documents = splitter.split_text(corpus)
+            for document in current_documents:
+                try:
+                    _, start_index, end_index = rigorous_document_search(corpus, document)
+                except Exception as err:
+                    # Catch Exception (not a bare except) so KeyboardInterrupt and
+                    # SystemExit propagate instead of being re-labelled as a search error.
+                    print(f"Error in finding {document} in {corpus_id}")
+                    raise Exception(f"Error in finding {document} in {corpus_id}") from err
+                metadatas.append({"start_index": start_index, "end_index": end_index, "corpus_id": corpus_id})
+            documents.extend(current_documents)
+        return documents, metadatas