Spaces:

GameScribes
/

Multipurpose-AI-Agent-Development

Starting on T4

App Files Files Community

Multipurpose-AI-Agent-Development / utils.py

devve1

Update utils.py

89d259a verified 25 days ago

raw

history blame

No virus

3.55 kB

	import re
	from thefuzz import fuzz
	from thefuzz import process

	def find_query_despite_whitespace(document, query):
	    """Find `query` in `document`, tolerating case and whitespace differences.

	    The query is broken into whitespace-separated words, and those words are
	    matched in order with any amount of whitespace (including none) allowed
	    between consecutive words. Matching is case-insensitive.

	    Args:
	        document: Text to search.
	        query: Text to look for.

	    Returns:
	        A ``(matched_text, start, end)`` tuple for the first match, where
	        ``matched_text`` equals ``document[start:end]``, or ``None`` when the
	        words do not occur in the document.
	    """
	    # Collapse every run of whitespace so the query splits into clean words.
	    collapsed = re.sub(r'\s+', ' ', query).strip()

	    # Escape each word so punctuation in the query is matched literally, then
	    # let arbitrary whitespace sit between neighbouring words.
	    escaped_words = [re.escape(token) for token in collapsed.split()]
	    matcher = re.compile(r'\s*'.join(escaped_words), re.IGNORECASE)

	    hit = matcher.search(document)
	    if hit is None:
	        return None

	    begin, finish = hit.span()
	    return document[begin:finish], begin, finish

	def rigorous_document_search(document: str, target: str):
	    """
	    Locate `target` inside `document`, tolerating minor textual differences.

	    Three strategies are tried in order, stopping at the first that succeeds:

	    1. An exact, case-sensitive substring search.
	    2. A case-insensitive search that ignores whitespace differences between
	       words (delegated to ``find_query_despite_whitespace``).
	    3. A fuzzy comparison of the target against each sentence of the document,
	       accepted only when the token-sort ratio is at least 98.

	    A single trailing period on `target` is dropped before searching.

	    Args:
	        document (str): The document in which to search for the target.
	        target (str): The string to search for within the document.

	    Returns:
	        tuple: ``(matched_text, start_index, end_index)`` describing the match
	        found in the document, or ``None`` if no strategy finds one.
	    """
	    if target.endswith('.'):
	        target = target[:-1]

	    # A single scan serves as both the membership test and the offset lookup.
	    start_index = document.find(target)
	    if start_index != -1:
	        return target, start_index, start_index + len(target)

	    raw_search = find_query_despite_whitespace(document, target)
	    if raw_search is not None:
	        return raw_search

	    # Split on sentence-ending punctuation (with any trailing whitespace) OR on
	    # a newline. The `|` must stay unescaped: written as `\|` it would demand a
	    # literal pipe character and the document would never be split.
	    sentences = re.split(r'[.!?]\s*|\n', document)

	    # Find the sentence that matches the query best
	    best_match = process.extractOne(target, sentences, scorer=fuzz.token_sort_ratio)

	    # Defensive: treat a missing result the same as a score below the threshold.
	    if best_match is None or best_match[1] < 98:
	        return None

	    reference = best_match[0]

	    start_index = document.find(reference)
	    end_index = start_index + len(reference)

	    return reference, start_index, end_index

	def get_chunks_and_metadata(self, document, splitter, corpus_id=None):
	    """Split `document` into chunks and record where each chunk sits in it.

	    Chunks are located one after another: the search for each chunk begins
	    where the previous chunk ended.

	    Args:
	        self: Not used by this function; kept so existing call sites still work.
	        document: The full text to split.
	        splitter: Object exposing ``split_text(text)`` that returns the chunk
	            strings.
	        corpus_id: Identifier stored in every metadata entry and included in
	            error messages. Defaults to ``None``.

	    Returns:
	        A ``(chunks, metadatas)`` pair. ``metadatas`` holds one dict per chunk
	        with the keys ``"start_index"``, ``"end_index"`` and ``"corpus_id"``;
	        the indices are offsets into the full `document`.

	    Raises:
	        Exception: If a chunk cannot be located in the remaining part of the
	            document, or if the search itself fails.
	    """
	    chunks = splitter.split_text(document)
	    metadatas = []

	    search_start_position = 0

	    # The loop variable must not reuse the name `document`; doing so would make
	    # every chunk be searched inside itself instead of inside the full text.
	    for chunk in chunks:
	        try:
	            match_data = rigorous_document_search(document[search_start_position:], chunk)
	            if match_data is None:
	                raise Exception(f"Error in finding {chunk} in {corpus_id}")
	        except Exception as e:
	            print(f"Error in finding {chunk} in {corpus_id}: {e}")
	            # Bare raise keeps the original traceback intact.
	            raise

	        _, relative_start_index, relative_end_index = match_data

	        # Offsets came from a slice, so shift them back into full-document space.
	        start_index = search_start_position + relative_start_index
	        end_index = search_start_position + relative_end_index

	        # Update the search start position for the next chunk.
	        # NOTE(review): this assumes chunks do not overlap; a splitter that
	        # produces overlapping chunks would start the next search too late.
	        search_start_position = end_index

	        metadatas.append({"start_index": start_index, "end_index": end_index, "corpus_id": corpus_id})

	    return chunks, metadatas