devve1 committed on
Commit
21c2659
1 Parent(s): 86d6332

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +27 -1
utils.py CHANGED
@@ -62,4 +62,30 @@ def rigorous_document_search(document: str, target: str):
62
  start_index = document.find(reference)
63
  end_index = start_index + len(reference)
64
 
65
- return reference, start_index, end_index
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  start_index = document.find(reference)
63
  end_index = start_index + len(reference)
64
 
65
+ return reference, start_index, end_index
66
+
67
def get_chunks_and_metadata(self, splitter):
    """Split every corpus into chunks and record where each chunk sits.

    Each corpus listed in ``self.corpus_list`` is read from disk, split with
    ``splitter.split_text``, and every resulting chunk is located in the
    corpus text via ``rigorous_document_search``.

    Warning: the metadata will be incorrect if a chunk is repeated within a
    corpus, because the start index is found with ``.find()``. This isn't
    practically an issue for chunks over 1000 characters.

    Args:
        splitter: Object exposing ``split_text(text)``; its result is
            iterated and appended to the returned documents.

    Returns:
        A ``(documents, metadatas)`` tuple of parallel lists. Each metadata
        entry is a dict with the keys ``"start_index"``, ``"end_index"`` and
        ``"corpus_id"``.

    Raises:
        Exception: If a chunk cannot be located in its corpus. The
            underlying error is attached as ``__cause__``.
    """
    documents = []
    metadatas = []
    for corpus_id in self.corpus_list:
        # The corpus id is used as the file path unless an explicit
        # id -> path mapping has been configured.
        corpus_path = corpus_id
        if self.corpora_id_paths is not None:
            corpus_path = self.corpora_id_paths[corpus_id]

        with open(corpus_path, 'r') as file:
            corpus = file.read()

        current_documents = splitter.split_text(corpus)
        current_metadatas = []
        for document in current_documents:
            try:
                _, start_index, end_index = rigorous_document_search(corpus, document)
            except Exception as err:
                # Catch Exception rather than using a bare ``except`` so that
                # KeyboardInterrupt/SystemExit propagate untouched, and chain
                # the cause so the original traceback is preserved.
                message = f"Error in finding {document} in {corpus_id}"
                print(message)
                raise Exception(message) from err
            current_metadatas.append({"start_index": start_index, "end_index": end_index, "corpus_id": corpus_id})
        documents.extend(current_documents)
        metadatas.extend(current_metadatas)
    return documents, metadatas