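"""Heuristic text filters and a character-whitelist normalizer.

The filters drop texts that contain too few allowed characters, tokens,
sentences, or characters, or that lack instruction-like keywords; the
normalizer keeps only whitelisted characters and collapses whitespace.
"""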
from nltk.tokenize import wordpunct_tokenize as word_tokenize
from nltk.tokenize import sent_tokenize

import re
import six
import textwrap

# Allowed characters: digits, ASCII letters, commas, periods, slashes, and
# angle brackets. `_whitelist` is a full pattern, `_regex` is the bare class.
_whitelist = r"[0-9a-z\,\.\/\<\>]+"
_regex = r"0-9a-z\,\.\/\<\>"

def filter_by_lang_regex(text, ratio=0.7, regex=r"0-9a-z\,\.\/\<\>"):
    # Keep `text` only if more than `ratio` of its non-space characters fall
    # inside the allowed character class.
    candidate_text = re.sub(r"[^" + regex + "]+", " ", six.ensure_str(text), flags=re.IGNORECASE).replace(" ", "")
    text = text.replace(" ", "")
    if not text:
        # Empty or whitespace-only input would otherwise divide by zero.
        return False
    return (len(candidate_text) / len(text)) > ratio

def filter_by_num_tokens(text, gt=64):
    # Keep texts with more than `gt` word tokens.
    return len(word_tokenize(text)) > gt


def filter_by_num_sents(text, gt=2):
    # Keep texts with more than `gt` sentences.
    return len(sent_tokenize(text)) > gt


def filter_by_steps(text):
    # Keep texts that contain instruction-like keywords ("step" or "mix all").
    return re.search(r"(step|mix all)", text, re.IGNORECASE) is not None


def filter_by_length(text, gt=40):
    # Keep texts longer than `gt` characters.
    return len(text) > gt


def filter_by_item(item_list, gt=4):
    # Keep lists with more than `gt` items.
    return len(item_list) > gt

def chars_to_preserve(sentence, whitelist):
    # Extract only the whitelisted character runs and re-join them with spaces.
    try:
        tokenized = re.findall(whitelist, sentence, re.IGNORECASE)
        return " ".join(tokenized)
    except Exception as error:
        print(
            textwrap.dedent(
                f"""
                Bad character range {whitelist},
                {error}
                """
            )
        )
        raise

def normalizer(text, whitelist=r"[0-9a-z\,\.\/\<\>]+", do_lowercase=False):
    # Optionally lowercase, strip non-whitelisted characters, and collapse
    # any remaining runs of whitespace into single spaces.
    if do_lowercase:
        text = text.lower()

    text = chars_to_preserve(text, whitelist=whitelist)
    text = " ".join([word.strip() for word in text.split() if word.strip()])
    text = text.strip()

    return text
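

# Illustrative usage sketch, not part of the original module: chain the
# filters and the normalizer on a made-up sample string. The sample text and
# the gt=10 token threshold below are assumptions chosen for demonstration.
if __name__ == "__main__":
    import nltk

    # sent_tokenize needs NLTK sentence-tokenizer data ("punkt"); fetch it
    # quietly if it is missing.
    nltk.download("punkt", quiet=True)

    sample = (
        "Step 1: Mix all ingredients in a bowl, then bake for 30 minutes. "
        "Step 2: Let the cake cool. Step 3: Serve with tea."
    )

    keep = (
        filter_by_lang_regex(sample)
        and filter_by_num_tokens(sample, gt=10)
        and filter_by_num_sents(sample)
        and filter_by_steps(sample)
        and filter_by_length(sample)
    )
    print("keep:", keep)
    print("normalized:", normalizer(sample, do_lowercase=True))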