Spaces:
Starting
on
T4
Starting
on
T4
import re | |
from thefuzz import fuzz | |
from thefuzz import process | |
def find_query_despite_whitespace(document: str, query: str):
    """
    Locate ``query`` in ``document`` while ignoring case and whitespace differences.

    The query is broken into whitespace-separated words. Each word is matched
    literally (regex metacharacters are escaped), and any run of whitespace,
    including none at all, is accepted between consecutive words. A query
    whose spacing or line breaks differ from the document can therefore still
    be found.

    Args:
        document (str): The text to search in.
        query (str): The text to look for.

    Returns:
        tuple: ``(matched_text, start_index, end_index)`` for the first match,
        where ``matched_text`` is the slice of ``document`` that matched and
        the indices are offsets into ``document``.
        Returns None if the query has no non-whitespace characters or if
        nothing in the document matches.
    """
    # split() with no argument splits on runs of whitespace and discards
    # leading/trailing whitespace, so no separate normalisation pass is needed.
    words = query.split()
    if not words:
        # An empty pattern would "match" the empty string at index 0 and be
        # reported as a hit; treat a blank query as not found instead.
        return None

    # Allow any amount of whitespace (even zero) between consecutive words.
    pattern = r'\s*'.join(re.escape(word) for word in words)
    match = re.search(pattern, document, re.IGNORECASE)
    if match is None:
        return None
    return match.group(0), match.start(), match.end()
def rigorous_document_search(document: str, target: str, *, min_score: int = 98):
    """
    Search for ``target`` in ``document`` using progressively looser matching.

    A single trailing period is removed from ``target`` first. The search then
    proceeds in three stages, returning as soon as one succeeds:

    1. Exact substring match.
    2. Case-insensitive match that tolerates whitespace differences
       (delegated to ``find_query_despite_whitespace``).
    3. Fuzzy match: the document is split into sentences on ``.``, ``!``,
       ``?`` and newlines, and the sentence scoring highest under
       ``fuzz.token_sort_ratio`` is accepted if its score reaches
       ``min_score``.

    Args:
        document (str): The document in which to search for the target.
        target (str): The string to search for within the document.
        min_score (int): Minimum fuzzy score for stage 3 to accept a
            sentence. Defaults to 98.

    Returns:
        tuple: ``(matched_text, start_index, end_index)`` where the indices are
        offsets into ``document``. For an exact match ``matched_text`` is the
        (period-stripped) target; for the fuzzy stage it is the best-matching
        sentence. Returns None if no stage finds a match.
    """
    if target.endswith('.'):
        target = target[:-1]

    # Stage 1: exact match. One find() call both tests for presence and
    # yields the offset.
    start_index = document.find(target)
    if start_index != -1:
        return target, start_index, start_index + len(target)

    # Stage 2: tolerate differences in whitespace and letter case.
    raw_search = find_query_despite_whitespace(document, target)
    if raw_search is not None:
        return raw_search

    # Stage 3: split the text into sentences and pick the closest one.
    sentences = re.split(r'[.!?]\s*|\n', document)
    best_match = process.extractOne(target, sentences, scorer=fuzz.token_sort_ratio)
    # Guard against extractOne yielding no result before indexing into it.
    if best_match is None or best_match[1] < min_score:
        return None

    # Each sentence is a piece of the document produced by re.split, so it
    # can be located again to recover its offsets.
    reference = best_match[0]
    start_index = document.find(reference)
    return reference, start_index, start_index + len(reference)
def get_chunks_and_metadata(self, document, splitter):
    """
    Split ``document`` into chunks and record where each chunk sits in it.

    Chunks are located in order: each search starts where the previous
    chunk's match ended, so a chunk is only found at or after the end of the
    one before it.

    Args:
        self: Not used by this function.
        document (str): The full text to split.
        splitter: Object providing ``split_text(document)``, which returns the
            chunks to locate.

    Returns:
        tuple: ``(chunks, metadatas)`` where ``chunks`` is the value returned by
        ``splitter.split_text`` and ``metadatas`` is a list with one dict per
        chunk holding ``start_index``, ``end_index`` (offsets into
        ``document``) and ``corpus_id``.

    Raises:
        Exception: If a chunk cannot be located in the remaining text. The
            error is printed before being re-raised.
    """
    # NOTE(review): `corpus_id` is not a parameter or local of this function;
    # it must be resolvable from an enclosing/module scope - confirm it is
    # defined there, otherwise every reference below raises NameError.
    chunks = splitter.split_text(document)
    metadatas = []
    search_start_position = 0
    # The loop variable must not reuse the name `document`: doing so would
    # replace the source text with the chunk and make each chunk be searched
    # for inside a slice of itself.
    for chunk in chunks:
        try:
            match_data = rigorous_document_search(document[search_start_position:], chunk)
            if match_data is None:
                raise Exception(f"Error in finding {chunk} in {corpus_id}")
            _, relative_start_index, relative_end_index = match_data
            # Offsets come back relative to the slice; shift them so they
            # index into the full document.
            start_index = search_start_position + relative_start_index
            end_index = search_start_position + relative_end_index
            # Update the search start position for the next chunk
            search_start_position = end_index
            metadatas.append({"start_index": start_index, "end_index": end_index, "corpus_id": corpus_id})
        except Exception as e:
            print(f"Error in finding {chunk} in {corpus_id}: {e}")
            raise
    return chunks, metadatas