|
import re |
|
from thefuzz import fuzz |
|
from thefuzz import process |
|
|
|
def find_query_despite_whitespace(document, query):
    """Find ``query`` in ``document`` ignoring whitespace and case differences.

    The query is broken into whitespace-separated words. Those words must
    appear in the document in the same order, separated by any amount of
    whitespace (including none). Matching is case-insensitive.

    Args:
        document (str): Text to search in.
        query (str): Text to look for.

    Returns:
        tuple or None: ``(matched_text, start, end)`` for the first match,
        where ``matched_text == document[start:end]``. ``None`` when the
        query contains no words or does not occur in the document.
    """
    # split() with no arguments collapses whitespace runs and trims the ends,
    # so no separate normalisation pass is required.
    words = query.split()
    if not words:
        # An empty pattern would "match" the empty string at index 0, which
        # is a false positive rather than a real hit.
        return None

    # Every word is escaped so regex metacharacters in the query are literal.
    pattern = r'\s*'.join(re.escape(word) for word in words)
    match = re.search(pattern, document, re.IGNORECASE)
    if match is None:
        return None
    return match.group(0), match.start(), match.end()
|
|
|
def rigorous_document_search(document: str, target: str):
    """Search for ``target`` in ``document``, tolerating minor text differences.

    Three strategies are tried in order and the first hit is returned:

    1. Exact substring match.
    2. Whitespace- and case-insensitive match via
       ``find_query_despite_whitespace``.
    3. Fuzzy match: the document is split on sentence punctuation and
       newlines, and the piece that best matches ``target`` under
       ``fuzz.token_sort_ratio`` is accepted only if its score is at least 98.

    A single trailing period is removed from ``target`` before searching.

    Args:
        document (str): The document in which to search for the target.
        target (str): The string to search for within the document.

    Returns:
        tuple or None: ``(match, start_index, end_index)`` where ``match`` is
        the text that was found and the indices locate it in ``document``.
        ``None`` if none of the strategies finds a match.
    """
    if target.endswith('.'):
        target = target[:-1]

    # Strategy 1: exact match. A single find() avoids scanning the document
    # twice (once for `in`, once for the index).
    start_index = document.find(target)
    if start_index != -1:
        return target, start_index, start_index + len(target)

    # Strategy 2: same words in the same order, whitespace/case ignored.
    raw_search = find_query_despite_whitespace(document, target)
    if raw_search is not None:
        return raw_search

    # Strategy 3: fuzzy match against sentence-like pieces of the document.
    sentences = re.split(r'[.!?]\s*|\n', document)
    best_match = process.extractOne(target, sentences, scorer=fuzz.token_sort_ratio)

    # extractOne is documented to return None when no candidate qualifies;
    # guard before indexing into the result.
    if best_match is None or best_match[1] < 98:
        return None

    # The winning piece came out of re.split on `document`, so it is looked
    # up again to recover its position.
    reference = best_match[0]
    start_index = document.find(reference)
    end_index = start_index + len(reference)
    return reference, start_index, end_index
|
|
|
def get_chunks_and_metadata(self, document, splitter):
    """Split ``document`` into chunks and record each chunk's character span.

    Chunks are located in the order the splitter returns them. Each search
    starts where the previous chunk ended, so offsets are first computed
    relative to the remaining text and then shifted back to absolute
    positions in ``document``.

    Args:
        document (str): The full text to split.
        splitter: Object exposing ``split_text(text)`` that returns the
            chunks as a sequence of strings.

    Returns:
        tuple: ``(chunks, metadatas)`` where ``metadatas[i]`` is a dict with
        ``"start_index"``, ``"end_index"`` and ``"corpus_id"`` for
        ``chunks[i]``.

    Raises:
        Exception: If a chunk cannot be located in the remaining text. The
            error is printed before being re-raised.
    """
    # NOTE(review): `corpus_id` is not defined in this function; it has to
    # come from an enclosing or module scope. Confirm it exists, otherwise
    # the lines below raise NameError.
    chunks = splitter.split_text(document)
    metadatas = []

    # NOTE(review): moving the cursor to the end of each match assumes
    # consecutive chunks do not overlap. Confirm against the splitter in use.
    search_start_position = 0

    # The loop variable must not be called `document`: shadowing the
    # parameter would make every chunk be searched inside itself.
    for chunk in chunks:
        try:
            match_data = rigorous_document_search(document[search_start_position:], chunk)
            if match_data is None:
                raise Exception(f"Error in finding {chunk} in {corpus_id}")

            _, relative_start_index, relative_end_index = match_data
            # Convert offsets within the remaining text to absolute offsets.
            start_index = search_start_position + relative_start_index
            end_index = search_start_position + relative_end_index

            search_start_position = end_index

            metadatas.append({"start_index": start_index, "end_index": end_index, "corpus_id": corpus_id})
        except Exception as e:
            print(f"Error in finding {chunk} in {corpus_id}: {e}")
            raise

    return chunks, metadatas