Update utils.py
Browse files
utils.py
CHANGED
@@ -1,6 +1,24 @@
|
|
|
|
1 |
from thefuzz import fuzz
|
2 |
from thefuzz import process
|
3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
def rigorous_document_search(document: str, target: str):
|
5 |
"""
|
6 |
This function performs a rigorous search of a target string within a document.
|
|
|
1 |
+
import re
|
2 |
from thefuzz import fuzz
|
3 |
from thefuzz import process
|
4 |
|
5 |
+
def find_query_despite_whitespace(document, query):
    """Locate ``query`` in ``document``, ignoring whitespace and case differences.

    The query is split into whitespace-separated words, and the words are
    matched in order with any amount of whitespace (including none) allowed
    between consecutive words. Matching is case-insensitive, and every word
    is treated literally (regex metacharacters are escaped).

    Args:
        document: Text to search in.
        query: Text to look for; its internal whitespace and newlines do not
            have to match the document's.

    Returns:
        A ``(matched_text, start, end)`` tuple for the first match, where
        ``matched_text == document[start:end]``, or ``None`` when the query
        contains no words or does not occur in the document.
    """
    # str.split() with no separator already collapses runs of whitespace and
    # drops leading/trailing whitespace, so no separate normalization pass
    # is needed.
    words = query.split()

    # An empty pattern would "match" the empty string at offset 0 and be
    # reported as a hit; a query with no words can never be found.
    if not words:
        return None

    # Allow any amount of whitespace (or none) between consecutive words.
    pattern = r'\s*'.join(re.escape(word) for word in words)
    match = re.search(pattern, document, re.IGNORECASE)

    if match is None:
        return None
    return match.group(0), match.start(), match.end()
|
21 |
+
|
22 |
def rigorous_document_search(document: str, target: str):
|
23 |
"""
|
24 |
This function performs a rigorous search of a target string within a document.
|