Spaces:

GameScribes
/

Multipurpose-AI-Agent-Development

Sleeping

App Files Files Community

devve1 committed 25 days ago

Commit

99f6b00

•

1 Parent(s): 12c55e6

Create utils.py

Browse files

Files changed (1) hide show

utils.py +47 -0

utils.py ADDED Viewed

	@@ -0,0 +1,47 @@

+from thefuzz import fuzz
+from thefuzz import process
+def rigorous_document_search(document: str, target: str):
+    """
+    This function performs a rigorous search of a target string within a document.
+    It handles issues related to whitespace, changes in grammar, and other minor text alterations.
+    The function first checks for an exact match of the target in the document.
+    If no exact match is found, it performs a raw search that accounts for variations in whitespace.
+    If the raw search also fails, it splits the document into sentences and uses fuzzy matching
+    to find the sentence that best matches the target.
+    Args:
+        document (str): The document in which to search for the target.
+        target (str): The string to search for within the document.
+    Returns:
+        tuple: A tuple containing the best match found in the document, its start index, and its end index.
+        If no match is found, returns None.
+    """
+    if target.endswith('.'):
+        target = target[:-1]
+    if target in document:
+        start_index = document.find(target)
+        end_index = start_index + len(target)
+        return target, start_index, end_index
+    else:
+        raw_search = find_query_despite_whitespace(document, target)
+        if raw_search is not None:
+            return raw_search
+    # Split the text into sentences
+    sentences = re.split(r'[.!?]\s*|\n', document)
+    # Find the sentence that matches the query best
+    best_match = process.extractOne(target, sentences, scorer=fuzz.token_sort_ratio)
+    if best_match[1] < 98:
+        return None
+    reference = best_match[0]
+    start_index = document.find(reference)
+    end_index = start_index + len(reference)
+    return reference, start_index, end_index