Create utils.py
Browse files
utils.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from thefuzz import fuzz
|
2 |
+
from thefuzz import process
|
3 |
+
|
4 |
+
def rigorous_document_search(document: str, target: str):
    """
    Locate ``target`` inside ``document``, tolerating minor textual differences.

    Three strategies are tried in order; the first one that succeeds wins:

    1. Exact substring match, after dropping one trailing period from the
       target.
    2. ``find_query_despite_whitespace(document, target)``; any non-None
       result is returned unchanged.
       NOTE(review): that helper is not defined in the visible part of this
       module -- confirm it is in scope and returns the same tuple shape.
    3. Fuzzy matching: the document is split on ``.``, ``!``, ``?`` and
       newlines, and the piece with the highest ``fuzz.token_sort_ratio``
       score is accepted only if that score is at least 98.

    Args:
        document (str): The document in which to search for the target.
        target (str): The string to search for within the document.

    Returns:
        tuple: ``(match, start_index, end_index)``. For the exact and fuzzy
        strategies ``document[start_index:end_index] == match``.
        Returns None if no match is found, or if the target is empty or
        whitespace-only once the trailing period has been removed.
    """
    # Imported locally because the module's top-level imports do not include
    # ``re``; without it the sentence split below raises NameError.
    import re

    # Drop a single trailing period before matching.
    if target.endswith('.'):
        target = target[:-1]

    # An empty query is a substring of every document and would be reported
    # as a zero-length hit at offset 0; treat it as "no match" instead.
    if not target.strip():
        return None

    # 1. Exact match: one scan yields both the membership test and the offset.
    start_index = document.find(target)
    if start_index != -1:
        return target, start_index, start_index + len(target)

    # 2. Whitespace-tolerant search, delegated to a helper defined outside
    #    this function; its result is passed through as-is.
    raw_search = find_query_despite_whitespace(document, target)
    if raw_search is not None:
        return raw_search

    # 3. Fuzzy match against individual sentences. The split leaves empty
    #    pieces (e.g. after a final period); they are discarded because an
    #    empty string can never be a meaningful match.
    sentences = [
        sentence
        for sentence in re.split(r'[.!?]\s*|\n', document)
        if sentence
    ]
    if not sentences:
        return None

    best_match = process.extractOne(
        target, sentences, scorer=fuzz.token_sort_ratio
    )

    # extractOne may return None; otherwise index 1 holds the score.
    if best_match is None or best_match[1] < 98:
        return None

    # The winning sentence is a verbatim slice of the document, so find()
    # recovers its offset (the first occurrence if the text repeats).
    reference = best_match[0]
    start_index = document.find(reference)
    return reference, start_index, start_index + len(reference)
|