import re
from thefuzz import fuzz
from thefuzz import process
def find_query_despite_whitespace(document, query):
    """
    Search for `query` in `document` while tolerating arbitrary runs of
    whitespace (spaces, tabs, newlines) between words.

    Returns (matched_text, start_index, end_index), or None if no match.
    """
    # Normalize spaces and newlines in the query
    normalized_query = re.sub(r'\s+', ' ', query).strip()

    # Build a regex pattern from the normalized query that matches any
    # whitespace between words
    pattern = r'\s*'.join(re.escape(word) for word in normalized_query.split())

    # Compile the regex to ignore case and search for it in the document
    regex = re.compile(pattern, re.IGNORECASE)
    match = regex.search(document)

    if match:
        return document[match.start(): match.end()], match.start(), match.end()
    else:
        return None
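
# Illustrative example (not part of the original file): because the pattern
# joins words with `\s*`, a single-spaced query matches text that wraps
# across lines:
#
#   find_query_despite_whitespace("foo bar\n  baz", "foo bar baz")
#   # -> ("foo bar\n  baz", 0, 13)
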
def rigorous_document_search(document: str, target: str):
    """
    This function performs a rigorous search of a target string within a document.
    It handles issues related to whitespace, changes in grammar, and other minor text alterations.

    The function first checks for an exact match of the target in the document.
    If no exact match is found, it performs a raw search that accounts for variations in whitespace.
    If the raw search also fails, it splits the document into sentences and uses fuzzy matching
    to find the sentence that best matches the target.

    Args:
        document (str): The document in which to search for the target.
        target (str): The string to search for within the document.

    Returns:
        tuple: A tuple containing the best match found in the document, its start index, and its end index.
        If no match is found, returns None.
    """
    if target.endswith('.'):
        target = target[:-1]

    if target in document:
        start_index = document.find(target)
        end_index = start_index + len(target)
        return target, start_index, end_index
    else:
        raw_search = find_query_despite_whitespace(document, target)
        if raw_search is not None:
            return raw_search

    # Split the text into sentences
    sentences = re.split(r'[.!?]\s*|\n', document)

    # Find the sentence that matches the target best; extractOne returns None
    # when there are no sentences to compare against
    best_match = process.extractOne(target, sentences, scorer=fuzz.token_sort_ratio)

    if best_match is None or best_match[1] < 98:
        return None

    reference = best_match[0]
    start_index = document.find(reference)
    end_index = start_index + len(reference)
    return reference, start_index, end_index
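
# Illustrative example (not part of the original file): the trailing period
# is stripped before matching, so a lightly reformatted target still resolves
# via the exact-match fast path:
#
#   rigorous_document_search("The quick fox. It ran.", "The quick fox.")
#   # -> ("The quick fox", 0, 13)
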
def get_chunks_and_metadata(document, splitter, corpus_id):
    # Note: the original signature took an unused `self` (likely copied from a
    # class method) and referenced an undefined `corpus_id`; here `self` is
    # dropped and `corpus_id` is taken as a parameter so the function is
    # self-contained.
    documents = splitter.split_text(document)
    metadatas = []
    search_start_position = 0
    for chunk in documents:
        try:
            # Locate each chunk in the remainder of the original document,
            # starting after the previous match so that repeated chunks map
            # to distinct positions.
            match_data = rigorous_document_search(document[search_start_position:], chunk)
            if match_data is not None:
                _, relative_start_index, relative_end_index = match_data
                start_index = search_start_position + relative_start_index
                end_index = search_start_position + relative_end_index
                # Update the search start position for the next chunk
                search_start_position = end_index
                metadatas.append({"start_index": start_index, "end_index": end_index, "corpus_id": corpus_id})
            else:
                raise Exception(f"Error in finding {chunk} in {corpus_id}")
        except Exception as e:
            print(f"Error in finding {chunk} in {corpus_id}: {e}")
            raise
    return documents, metadatas
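
if __name__ == "__main__":
    # Minimal smoke test (illustrative only; assumes `thefuzz` is installed).
    sample = "Chunking splits a document.\nEach chunk keeps its offsets."
    print(rigorous_document_search(sample, "Chunking splits a document."))
    print(find_query_despite_whitespace(sample, "Each chunk keeps   its offsets."))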