AceMath-1.5B-Instruct / evaluation /grader.py

Upload 3 files

94f14e9 verified 9 days ago

18.7 kB


	"""
	This script is adapted from Qwen2.5-Math
	https://github.com/QwenLM/Qwen2.5-Math/blob/main/evaluation/grader.py
	"""

	import re
	import regex
	import multiprocessing
	from math import isclose
	from typing import Union
	from collections import defaultdict

	from sympy import simplify, N
	from sympy.parsing.sympy_parser import parse_expr
	from sympy.parsing.latex import parse_latex


	def latex2sympy(sympy: str, variable_values={}):
	# record frac
	global frac_type
	if sympy.find(r'\frac') != -1:
	frac_type = r'\frac'
	if sympy.find(r'\dfrac') != -1:
	frac_type = r'\dfrac'
	if sympy.find(r'\tfrac') != -1:
	frac_type = r'\tfrac'
	sympy = sympy.replace(r'\dfrac', r'\frac')
	sympy = sympy.replace(r'\tfrac', r'\frac')
	# Translate Transpose
	sympy = sympy.replace(r'\mathrm{T}', 'T', -1)
	# Translate Derivative
	sympy = sympy.replace(r'\mathrm{d}', 'd', -1).replace(r'{\rm d}', 'd', -1)
	# Translate Matrix
	sympy = sympy.replace(r'\left[\begin{matrix}', r'\begin{bmatrix}', -1).replace(r'\end{matrix}\right]', r'\end{bmatrix}', -1)
	# Translate Permutation
	sympy = re.sub(r"$([a-zA-Z0-9+\-/\\ ]+?)$_{([a-zA-Z0-9+\-/\\ ]+?)}", r"\\frac{(\1)!}{((\1)-(\2))!}", sympy)
	# Remove \displaystyle
	sympy = sympy.replace(r'\displaystyle', ' ', -1)
	# Remove \quad
	sympy = sympy.replace(r'\quad', ' ', -1).replace(r'\qquad', ' ', -1).replace(r'~', ' ', -1).replace(r'\,', ' ', -1)
	# Remove $
	sympy = sympy.replace(r'$', ' ', -1)

	# variable values
	global VARIABLE_VALUES
	if len(variable_values) > 0:
	VARIABLE_VALUES = variable_values
	else:
	VARIABLE_VALUES = {}

	# setup listener
	matherror = MathErrorListener(sympy)

	# stream input
	stream = InputStream(sympy)
	lex = PSLexer(stream)
	lex.removeErrorListeners()
	lex.addErrorListener(matherror)

	tokens = CommonTokenStream(lex)
	parser = PSParser(tokens)

	# remove default console error listener
	parser.removeErrorListeners()
	parser.addErrorListener(matherror)

	# process the input
	return_data = None
	math = parser.math()

	# if a list
	if math.relation_list():
	return_data = []

	# go over list items
	relation_list = math.relation_list().relation_list_content()
	for list_item in relation_list.relation():
	expr = convert_relation(list_item)
	return_data.append(expr)

	# if not, do default
	else:
	relation = math.relation()
	return_data = convert_relation(relation)

	return return_data


	def math_answer_cleaning(answer, dataset_name):
	"""
	remove irrelevant strings and unify the answer format before checking whether the answers are equal
	"""
	def _is_completely_wrapped_by_text(input_string):
	pattern = r'^\\text{(.*)}$'
	match = re.match(pattern, input_string)
	if match:
	## input_string is completely wrapped by \text{}
	extracted_content = match.group(1)
	extracted_content = extracted_content.replace("(", "").replace(")", "").replace(",", "")
	return extracted_content
	else:
	return None

	## remove irrelevant \\text and space
	extracted_content = _is_completely_wrapped_by_text(answer)
	answer = extracted_content if extracted_content else answer

	## e.g., convert 5,\!460 into 5460; convert 14{,}916 into 14916 convert \$4 into 4
	answer = answer.replace(",\!", "").replace("{,}", "").replace("\$", "")
	## e.g., convert \dfrac{3}{2} into frac{3}{2}
	answer = answer.replace("dfrac{", "frac{").replace("tfrac{", "frac{")
	## e.g., convert 121^\circ into 121
	answer = answer.replace("^\circ", "")
	answer = answer.replace("^{\circ}", "")
	## remove \quad
	answer = answer.replace("\quad", "")
	## remove space
	answer = answer.replace(" ", "")
	## remove \n
	answer = answer.replace("\n", "").replace("\\n", "")
	## e.g., convert 3.54\times10^{10} into 3.54e10
	answer = re.sub(r'([+-]?\d*\.?\d+)[\\]times10\^{([+-]?\d+)}', r'\1e\2', answer)
	## e.g., convert 3.54\times10^10 into 3.54e10
	answer = re.sub(r'([+-]?\d*\.?\d+)[\\]times10\^([+-]?\d+)', r'\1e\2', answer)
	## e.g., convert 558\,\text{nm} into 558
	answer = re.sub(r'\\,\\text\{.*?\}', '', answer)
	## e.g., convert 558\text{nm} into 558
	answer = re.sub(r'\\text\{.*?\}', '', answer)
	## e.g., convert 2^{10} into 2^10
	answer = re.sub(r'(\d+)\^{(\d+)}', r'\1^\2', answer)
	## lowercase
	answer = answer.lower()

	if dataset_name == "collegemath":
	## convert 558\mathrm{ft} into 558
	answer = re.sub(r'\\mathrm\{.*?\}', '', answer)
	## clean noisy answer
	answer = re.sub(r'\$$[^)]*$', '', answer)
	if answer.endswith("-"):
	answer = answer[:-1]
	if answer.endswith("."):
	answer = answer[:-1]
	if answer.endswith("hours"):
	answer = answer[:-len("hours")]
	## extract final answer after '=' or ':'
	if "=" in answer:
	answer = answer.split("=", 1)[1]
	if ":" in answer:
	answer = answer.split(":", 1)[1]
	## \emptyset and \oslash both reprsent empty set in latex
	answer = answer.replace("\\emptyset", "\\oslash")
	if dataset_name == "gsm8k":
	# Example: 5,600 -> 5600
	answer = answer.replace(',', '')
	if dataset_name == "gaokao2023en":
	unit_strings = ['students', 'dollars', 'boxes', 'feet', 'kilometers', 'meters', 'degreesontheBreadusscale', '$', 'a.m.', 'am', 'minutes']
	for unit in unit_strings:
	answer = answer.replace(unit, "")

	return answer


	def extract_final_answer(output):
	pattern_re = re.compile(r"\\boxed\{((?:[^{}]\|\{(?:[^{}]\|\{[^{}]\})\})*)\}", re.DOTALL)
	all_matches = pattern_re.findall(output)

	if len(all_matches) >= 1:
	extracted_answer = all_matches[-1]
	else:
	extracted_answer = None

	return extracted_answer, all_matches


	def round_number(answer):
	def _is_float(string):
	try:
	float(string)
	return True
	except:
	return False

	if _is_float(answer) and float(answer) < 1:
	## to consider the case like 5.56e-10 (convert 5.56e-10 into 5.6e-10)
	## still return a string type
	return f"{float(answer):.2g}"

	return answer


	def choice_answer_clean(pred: str):
	pred = pred.strip("\n").rstrip(".").rstrip("/").strip(" ").lstrip(":")
	# Clean the answer based on the dataset
	tmp = re.findall(r"\b(A\|B\|C\|D\|E)\b", pred.upper())
	if tmp:
	pred = tmp
	else:
	pred = [pred.strip().strip(".")]
	pred = pred[-1]
	# Remove the period at the end, again!
	pred = pred.rstrip(".").rstrip("/")
	return pred


	def parse_digits(num):
	num = regex.sub(",", "", str(num))
	try:
	return float(num)
	except:
	if num.endswith("%"):
	num = num[:-1]
	if num.endswith("\\"):
	num = num[:-1]
	try:
	return float(num) / 100
	except:
	pass
	return None


	def is_digit(num):
	# paired with parse_digits
	return parse_digits(num) is not None


	def str_to_pmatrix(input_str):
	input_str = input_str.strip()
	matrix_str = re.findall(r"\{.,.\}", input_str)
	pmatrix_list = []

	for m in matrix_str:
	m = m.strip("{}")
	pmatrix = r"\begin{pmatrix}" + m.replace(",", "\\") + r"\end{pmatrix}"
	pmatrix_list.append(pmatrix)

	return ", ".join(pmatrix_list)


	def math_equal(
	prediction: Union[bool, float, str],
	reference: Union[float, str],
	include_percentage: bool = True,
	is_close: bool = True,
	timeout: bool = False,
	) -> bool:
	"""
	Exact match of math if and only if:
	1. numerical equal: both can convert to float and are equal
	2. symbolic equal: both can convert to sympy expression and are equal
	"""
	if prediction is None or reference is None:
	return False
	if str(prediction.strip().lower()) == str(reference.strip().lower()):
	return True
	if (
	reference in ["A", "B", "C", "D", "E"]
	and choice_answer_clean(prediction) == reference
	):
	return True

	# fraction equal
	if fraction_equal(prediction, reference):
	return True

	try: # numerical equal
	if round_number(prediction) == round_number(reference):
	return True
	if is_digit(prediction) and is_digit(reference):
	prediction = parse_digits(prediction)
	reference = parse_digits(reference)
	# number questions
	if include_percentage:
	gt_result = [reference / 100, reference, reference * 100]
	else:
	gt_result = [reference]
	for item in gt_result:
	try:
	if is_close:
	if numeric_equal(prediction, item):
	return True
	else:
	if item == prediction:
	return True
	except Exception:
	continue
	return False
	except:
	pass

	if not prediction and prediction not in [0, False]:
	return False

	# symbolic equal
	reference = str(reference).strip()
	prediction = str(prediction).strip()

	## pmatrix (amps)
	if "pmatrix" in prediction and not "pmatrix" in reference:
	reference = str_to_pmatrix(reference)

	## deal with [], (), {}
	pred_str, ref_str = prediction, reference
	if (
	prediction.startswith("[")
	and prediction.endswith("]")
	and not reference.startswith("(")
	) or (
	prediction.startswith("(")
	and prediction.endswith(")")
	and not reference.startswith("[")
	):
	pred_str = pred_str.strip("[]()")
	ref_str = ref_str.strip("[]()")
	for s in ["{", "}", "(", ")"]:
	ref_str = ref_str.replace(s, "")
	pred_str = pred_str.replace(s, "")
	if pred_str.lower() == ref_str.lower():
	return True

	## [a, b] vs. [c, d], return a==c and b==d
	if (
	regex.match(r"($\|\[).+($\|\])", prediction) is not None
	and regex.match(r"($\|\[).+($\|\])", reference) is not None
	):
	pred_parts = prediction[1:-1].split(",")
	ref_parts = reference[1:-1].split(",")
	if len(pred_parts) == len(ref_parts):
	if all(
	[
	math_equal(
	pred_parts[i], ref_parts[i], include_percentage, is_close
	)
	for i in range(len(pred_parts))
	]
	):
	return True
	if (
	(
	prediction.startswith("\\begin{pmatrix}")
	or prediction.startswith("\\begin{bmatrix}")
	)
	and (
	prediction.endswith("\\end{pmatrix}")
	or prediction.endswith("\\end{bmatrix}")
	)
	and (
	reference.startswith("\\begin{pmatrix}")
	or reference.startswith("\\begin{bmatrix}")
	)
	and (
	reference.endswith("\\end{pmatrix}") or reference.endswith("\\end{bmatrix}")
	)
	):
	pred_lines = [
	line.strip()
	for line in prediction[
	len("\\begin{pmatrix}") : -len("\\end{pmatrix}")
	].split("\\\\")
	if line.strip()
	]
	ref_lines = [
	line.strip()
	for line in reference[
	len("\\begin{pmatrix}") : -len("\\end{pmatrix}")
	].split("\\\\")
	if line.strip()
	]
	matched = True
	if len(pred_lines) == len(ref_lines):
	for pred_line, ref_line in zip(pred_lines, ref_lines):
	pred_parts = pred_line.split("&")
	ref_parts = ref_line.split("&")
	if len(pred_parts) == len(ref_parts):
	if not all(
	[
	math_equal(
	pred_parts[i],
	ref_parts[i],
	include_percentage,
	is_close,
	)
	for i in range(len(pred_parts))
	]
	):
	matched = False
	break
	else:
	matched = False
	if not matched:
	break
	else:
	matched = False
	if matched:
	return True

	if prediction.count("=") == 1 and reference.count("=") == 1:
	pred = prediction.split("=")
	pred = f"{pred[0].strip()} - ({pred[1].strip()})"
	ref = reference.split("=")
	ref = f"{ref[0].strip()} - ({ref[1].strip()})"
	if symbolic_equal(pred, ref) or symbolic_equal(f"-({pred})", ref):
	return True
	elif (
	prediction.count("=") == 1
	and len(prediction.split("=")[0].strip()) <= 2
	and "=" not in reference
	):
	if math_equal(
	prediction.split("=")[1], reference, include_percentage, is_close
	):
	return True
	elif (
	reference.count("=") == 1
	and len(reference.split("=")[0].strip()) <= 2
	and "=" not in prediction
	):
	if math_equal(
	prediction, reference.split("=")[1], include_percentage, is_close
	):
	return True

	# symbolic equal with sympy
	if timeout:
	if call_with_timeout(symbolic_equal_process, prediction, reference):
	return True
	else:
	if symbolic_equal(prediction, reference):
	return True

	return False


	def numeric_equal(prediction: float, reference: float):
	# Note that relative tolerance has significant impact
	# on the result of the synthesized GSM-Hard dataset
	# if reference.is_integer():
	# return isclose(reference, round(prediction), abs_tol=1e-4)
	# else:
	# prediction = round(prediction, len(str(reference).split(".")[-1]))
	return isclose(reference, prediction, rel_tol=1e-4)


	def fraction_equal(prediction, reference):
	def _calculate_numbers(input_string):
	try:
	result = eval(input_string)
	return result
	except:
	return None

	reference = re.sub(r'\\frac{(.?)}{(.?)}', r'(\1/\2)', reference)
	prediction = re.sub(r'\\frac{(.?)}{(.?)}', r'(\1/\2)', prediction)

	if reference == prediction:
	return True

	reference = _calculate_numbers(reference)
	prediction = _calculate_numbers(prediction)

	if reference and reference == prediction:
	return True

	return False

	def symbolic_equal(a, b):
	def _parse(s):
	for f in [parse_latex, parse_expr, latex2sympy]:
	try:
	return f(s.replace("\\\\", "\\"))
	except:
	try:
	return f(s)
	except:
	pass
	return s

	a = _parse(a)
	b = _parse(b)

	# direct equal
	try:
	if str(a) == str(b) or a == b:
	return True
	except:
	pass

	# simplify equal
	try:
	if a.equals(b) or simplify(a - b) == 0:
	return True
	except:
	pass

	# equation equal
	try:
	if (abs(a.lhs - a.rhs)).equals(abs(b.lhs - b.rhs)):
	return True
	except:
	pass

	try:
	if numeric_equal(float(N(a)), float(N(b))):
	return True
	except:
	pass

	# matrix
	try:
	# if a and b are matrix
	if a.shape == b.shape:
	_a = a.applyfunc(lambda x: round(x, 3))
	_b = b.applyfunc(lambda x: round(x, 3))
	if _a.equals(_b):
	return True
	except:
	pass

	return False


	def symbolic_equal_process(a, b, output_queue):
	result = symbolic_equal(a, b)
	output_queue.put(result)


	def math_equal_process(prediction, reference, output_queue):
	result = math_equal(prediction, reference, timeout=True)
	output_queue.put(result)


	def call_with_timeout(func, args, timeout=1, *kwargs):
	output_queue = multiprocessing.Queue()
	process_args = args + (output_queue,)
	process = multiprocessing.Process(target=func, args=process_args, kwargs=kwargs)
	process.start()
	process.join(timeout)

	if process.is_alive():
	process.terminate()
	process.join()
	return False

	return output_queue.get()


	def check_correctness_of_multiple_answer_cases(prediction, reference, all_matches):

	if prediction.replace(",", "").replace("$", "") == reference.replace(",", "").replace("$", ""):
	return True

	if not prediction.split("=")[-1] == reference.split("=")[-1].replace("$", ""):
	return False

	if "," in reference or "or" in reference or "and" in reference:
	## there are multiple answers
	if len(all_matches) <= 1:
	return False

	prediction1 = prediction.split("=")[-1]
	prediction2 = all_matches[-2].split("=")[-1]
	reference = reference.replace("$", "")
	if "or" in reference:
	gold_list = reference.split("or", 1)
	elif "and" in reference:
	gold_list = reference.split("and", 1)
	else:
	gold_list = reference.split(",", 1)

	reference1 = gold_list[-1].split("=")[-1]
	reference2 = gold_list[-2].split("=")[-1]

	if math_equal(prediction1, reference1) and math_equal(prediction2, reference2):
	return True
	elif math_equal(prediction2, reference1) and math_equal(prediction1, reference2):
	return True

	return False

	else:
	return True


	def is_equal(model_output, reference, dataset_name):

	extracted_model_answer, all_matches = extract_final_answer(model_output)
	if extracted_model_answer is None or reference is None:
	return False

	extracted_model_answer = math_answer_cleaning(extracted_model_answer, dataset_name)
	reference = math_answer_cleaning(reference, dataset_name)

	# if math_equal(prediction, reference, timeout=True):
	if call_with_timeout(math_equal_process, extracted_model_answer, reference):
	return True

	if dataset_name == "collegemath":
	return check_correctness_of_multiple_answer_cases(extracted_model_answer, reference, all_matches)

	return False