import re

from thefuzz import fuzz
from thefuzz import process


def find_query_despite_whitespace(document, query):
    # Normalize spaces and newlines in the query
    normalized_query = re.sub(r'\s+', ' ', query).strip()

    # Create a regex pattern from the normalized query that tolerates any
    # run of whitespace characters between words
    pattern = r'\s*'.join(re.escape(word) for word in normalized_query.split())

    # Compile the regex to ignore case and search for it in the document
    regex = re.compile(pattern, re.IGNORECASE)
    match = regex.search(document)

    if match:
        return document[match.start(): match.end()], match.start(), match.end()
    else:
        return None


def rigorous_document_search(document: str, target: str):
    """
    Perform a rigorous search for a target string within a document, handling
    whitespace variations, minor grammatical changes, and other small text alterations.

    The function first checks for an exact match of the target in the document.
    If no exact match is found, it performs a raw search that accounts for variations
    in whitespace. If the raw search also fails, it splits the document into sentences
    and uses fuzzy matching to find the sentence that best matches the target.

    Args:
        document (str): The document in which to search for the target.
        target (str): The string to search for within the document.

    Returns:
        tuple: The best match found in the document, its start index, and its end index.
        If no match is found, returns None.
    """
    if target.endswith('.'):
        target = target[:-1]

    if target in document:
        start_index = document.find(target)
        end_index = start_index + len(target)
        return target, start_index, end_index
    else:
        raw_search = find_query_despite_whitespace(document, target)
        if raw_search is not None:
            return raw_search

    # Split the text into sentences
    sentences = re.split(r'[.!?]\s*|\n', document)

    # Find the sentence that matches the target best
    best_match = process.extractOne(target, sentences, scorer=fuzz.token_sort_ratio)

    # Reject weak fuzzy matches (or an empty sentence list) rather than
    # return a wrong span
    if best_match is None or best_match[1] < 98:
        return None

    reference = best_match[0]
    start_index = document.find(reference)
    end_index = start_index + len(reference)

    return reference, start_index, end_index


def get_chunks_and_metadata(document, splitter, corpus_id):
    """
    Split a document into chunks and record each chunk's start/end offsets
    in the original document, along with the corpus it came from.
    """
    chunks = splitter.split_text(document)
    metadatas = []
    search_start_position = 0
    for chunk in chunks:
        try:
            # Search only the unscanned tail of the document so repeated
            # chunks resolve to the correct occurrence
            match_data = rigorous_document_search(document[search_start_position:], chunk)
            if match_data is not None:
                _, relative_start_index, relative_end_index = match_data
                start_index = search_start_position + relative_start_index
                end_index = search_start_position + relative_end_index
                # Update the search start position for the next chunk
                search_start_position = end_index
                metadatas.append({"start_index": start_index, "end_index": end_index, "corpus_id": corpus_id})
            else:
                raise Exception(f"Error in finding {chunk} in {corpus_id}")
        except Exception as e:
            print(f"Error in finding {chunk} in {corpus_id}: {e}")
            raise
    return chunks, metadatas
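
# A minimal usage sketch of the pipeline above, assuming a hypothetical
# splitter: `FixedSizeSplitter` is illustrative only (any object exposing
# `split_text` would do) and does not appear in the code above.
if __name__ == "__main__":
    class FixedSizeSplitter:
        # Hypothetical splitter that cuts a document into fixed-size pieces
        def __init__(self, chunk_size=40):
            self.chunk_size = chunk_size

        def split_text(self, text):
            return [text[i:i + self.chunk_size]
                    for i in range(0, len(text), self.chunk_size)]

    sample = (
        "Chunking splits a document into pieces. "
        "Each piece keeps its offsets in the source. "
        "Fuzzy matching recovers them when the text drifts."
    )
    chunks, metadatas = get_chunks_and_metadata(sample, FixedSizeSplitter(), "sample_corpus")
    for meta in metadatas:
        # Each metadata entry points back into the source document
        print(meta, repr(sample[meta["start_index"]:meta["end_index"]]))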