devve1 committed on
Commit
99f6b00
1 Parent(s): 12c55e6

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +47 -0
utils.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from thefuzz import fuzz
2
+ from thefuzz import process
3
+
4
def rigorous_document_search(document: str, target: str):
    """
    Locate ``target`` inside ``document``, tolerating minor text differences.

    The search escalates through three strategies and stops at the first hit:

    1. Exact substring match (after dropping one trailing period from
       ``target``).
    2. A whitespace-tolerant search delegated to
       ``find_query_despite_whitespace``.
    3. Fuzzy matching: the document is split into sentence-like pieces and the
       piece scoring highest under ``fuzz.token_sort_ratio`` is accepted only
       if its score is at least 98.

    Args:
        document (str): The document in which to search for the target.
        target (str): The string to search for within the document.

    Returns:
        tuple | None: ``(matched_text, start_index, end_index)`` where the
        indices refer to positions in ``document`` and ``end_index`` is
        exclusive. For an exact match ``matched_text`` is ``target`` without
        its trailing period. The whitespace-tolerant result is returned
        unchanged from the helper (presumably the same 3-tuple shape --
        confirm against the helper). ``None`` when no strategy succeeds.
    """
    # Imported locally because the module header only imports thefuzz; without
    # this the sentence split below would raise NameError.
    import re

    # Drop a single trailing period so a target that ends a sentence can still
    # match text where the sentence continues or is punctuated differently.
    if target.endswith('.'):
        target = target[:-1]

    # Strategy 1: exact substring. A single find() gives both the membership
    # answer and the position.
    start_index = document.find(target)
    if start_index != -1:
        return target, start_index, start_index + len(target)

    # Strategy 2: whitespace-tolerant search.
    # NOTE(review): find_query_despite_whitespace is neither defined nor
    # imported in the visible part of this file -- confirm it is in scope,
    # otherwise this line raises NameError whenever the exact match fails.
    raw_search = find_query_despite_whitespace(document, target)
    if raw_search is not None:
        return raw_search

    # Strategy 3: fuzzy match against sentence-like pieces. The split consumes
    # the terminator (., ! or ?) plus trailing whitespace, or a newline, so
    # each piece is still a contiguous substring of the document.
    sentences = re.split(r'[.!?]\s*|\n', document)

    # extractOne yields a (choice, score) pair for the best-scoring piece.
    best_match = process.extractOne(target, sentences, scorer=fuzz.token_sort_ratio)

    # Reject a missing result or anything below a near-identical score.
    if best_match is None or best_match[1] < 98:
        return None

    reference = best_match[0]

    start_index = document.find(reference)
    end_index = start_index + len(reference)

    return reference, start_index, end_index