devve1 committed on
Commit
89d259a
1 Parent(s): 40ce4f6

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +24 -23
utils.py CHANGED
@@ -64,28 +64,29 @@ def rigorous_document_search(document: str, target: str):
64
 
65
  return reference, start_index, end_index
66
 
67
- def get_chunks_and_metadata(self, splitter):
68
- # Warning: metadata will be incorrect if a chunk is repeated since we use .find() to find the start index. This isn't practically an issue for chunks over 1000 characters.
69
- documents = []
70
- metadatas = []
71
- for corpus_id in self.corpus_list:
72
- corpus_path = corpus_id
73
- if self.corpora_id_paths is not None:
74
- corpus_path = self.corpora_id_paths[corpus_id]
75
 
76
- with open(corpus_path, 'r') as file:
77
- corpus = file.read()
 
 
 
78
 
79
- current_documents = splitter.split_text(corpus)
80
- current_metadatas = []
81
- for document in current_documents:
82
- try:
83
- _, start_index, end_index = rigorous_document_search(corpus, document)
84
- except:
85
- print(f"Error in finding {document} in {corpus_id}")
86
- raise Exception(f"Error in finding {document} in {corpus_id}")
87
- # start_index, end_index = find_target_in_document(corpus, document)
88
- current_metadatas.append({"start_index": start_index, "end_index": end_index, "corpus_id": corpus_id})
89
- documents.extend(current_documents)
90
- metadatas.extend(current_metadatas)
91
- return documents, metadatas
 
 
 
 
64
 
65
  return reference, start_index, end_index
66
 
67
def get_chunks_and_metadata(self, document, splitter, corpus_id=None):
    """Split *document* into chunks and locate each chunk inside it.

    Args:
        document: The full source text to split.
        splitter: Object exposing ``split_text(text)`` that returns the
            chunks in document order.
        corpus_id: Optional identifier stored in every metadata entry and
            used in error messages. Defaults to ``None``.

    Returns:
        A ``(chunks, metadatas)`` tuple. ``metadatas[i]`` is a dict with
        ``start_index``, ``end_index`` and ``corpus_id`` for ``chunks[i]``;
        the indices are offsets into the full *document*.

    Raises:
        ValueError: If a chunk cannot be located in the remaining text.
        Exception: Anything raised by ``rigorous_document_search`` is
            logged and re-raised unchanged.
    """
    chunks = splitter.split_text(document)
    metadatas = []

    # Each search starts where the previous chunk ended, so a repeated
    # passage is matched at its next occurrence rather than its first.
    # NOTE(review): if the splitter emits overlapping chunks, the next chunk
    # begins before the previous end_index and will not be found from here;
    # confirm the splitter is configured without overlap.
    search_start_position = 0

    for chunk in chunks:
        # Keep the try body to the one call that can fail on its own.
        try:
            match_data = rigorous_document_search(
                document[search_start_position:], chunk
            )
        except Exception as e:
            print(f"Error in finding {chunk} in {corpus_id}: {e}")
            raise

        if match_data is None:
            message = f"Error in finding {chunk} in {corpus_id}"
            print(message)
            raise ValueError(message)

        # The search ran on a slice, so the returned indices are relative to
        # the slice start and must be shifted back to document offsets.
        _, relative_start_index, relative_end_index = match_data
        start_index = search_start_position + relative_start_index
        end_index = search_start_position + relative_end_index

        metadatas.append(
            {
                "start_index": start_index,
                "end_index": end_index,
                "corpus_id": corpus_id,
            }
        )

        # Update the search start position for the next chunk.
        search_start_position = end_index

    return chunks, metadatas