devve1 committed on
Commit
86d6332
1 Parent(s): 99f6b00

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +18 -0
utils.py CHANGED
@@ -1,6 +1,24 @@
 
1
  from thefuzz import fuzz
2
  from thefuzz import process
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  def rigorous_document_search(document: str, target: str):
5
  """
6
  This function performs a rigorous search of a target string within a document.
 
1
+ import re
2
  from thefuzz import fuzz
3
  from thefuzz import process
4
 
5
def find_query_despite_whitespace(document, query):
    """Find the first occurrence of ``query`` in ``document``, tolerating whitespace.

    The query is broken into whitespace-separated words, and the document is
    searched case-insensitively for those words in order, separated by any
    amount of whitespace (including none at all). Regex metacharacters in
    the query are escaped, so the words are matched literally.

    Args:
        document: The text to search in.
        query: The text to look for; its internal spacing and newlines are
            ignored.

    Returns:
        A ``(matched_text, start, end)`` tuple such that
        ``document[start:end] == matched_text``, or ``None`` when the query
        is not found or contains no non-whitespace characters.
    """
    # str.split() with no separator collapses every whitespace run and drops
    # leading/trailing whitespace, which is all the normalization needed.
    words = query.split()
    if not words:
        # An empty pattern matches at offset 0 of any document, which would
        # be reported as a (bogus) hit for a blank query.
        return None

    # Allow zero or more whitespace characters between consecutive words.
    pattern = r"\s*".join(re.escape(word) for word in words)
    match = re.search(pattern, document, re.IGNORECASE)
    if match is None:
        return None
    return match.group(0), match.start(), match.end()
21
+
22
  def rigorous_document_search(document: str, target: str):
23
  """
24
  This function performs a rigorous search of a target string within a document.