xdyu committed
Commit 55f9861 · verified · 1 Parent(s): 6fb3491

Upload run_program.py with huggingface_hub

Files changed (1): run_program.py (+144, -68)
run_program.py CHANGED
@@ -25,6 +25,10 @@ client = OpenAI(
     api_key='sk-proj-[REDACTED]'
 )
 
+# client = OpenAI(
+#     api_key='sk-svcacct-[REDACTED]'
+# )
+
 
 torch.random.manual_seed(0)
 
@@ -46,12 +50,12 @@ torch.random.manual_seed(0)
 # )
 # qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_model_name)
 
-llama_pipeline = pipeline(
-    "text-generation",
-    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
-    model_kwargs={"torch_dtype": torch.bfloat16},
-    device_map="auto",
-)
+# llama_pipeline = pipeline(
+#     "text-generation",
+#     model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+#     model_kwargs={"torch_dtype": torch.bfloat16},
+#     device_map="auto",
+# )
 
 
 
@@ -238,7 +242,7 @@ def update_question_with_new_parameters():
     json.dump(program_data, outfile, indent=4)
 
 
-def call_answer_question(question, model_name='gpt', cot=False):
+def call_answer_question(question, model_name='gpt', cot=False, temp=0.7):
     if cot:
         prompt_template = PROMPT_DICT['prompt_answer_question_few_shot_cot']
     else:
@@ -250,12 +254,13 @@ def call_answer_question(question, model_name='gpt', cot=False):
     if model_name == 'gpt':
         response = client.chat.completions.create(
             model="gpt-4o",
+            # model="gpt-4-turbo",
             messages=[
                 {"role": "system", "content": "You are a helpful assistant."},
                 {"role": "user", "content": prompt}
             ],
-            temperature=0,
-            max_tokens=300,
+            temperature=temp,
+            max_tokens=1024,
             top_p=1
         )
         return response.choices[0].message.content
@@ -267,7 +272,7 @@ def call_answer_question(question, model_name='gpt', cot=False):
             messages=[
                 {"role": "user", "content": prompt}
             ],
-            temperature=0,
+            temperature=temp,
             top_p=1
         )
         return message.content[0].text
@@ -292,29 +297,31 @@ def call_answer_question(question, model_name='gpt', cot=False):
     #
     # output = pipe(messages, **generation_args)
     # print(output[0]['generated_text'])
-    # if model_name == 'qwen':
-    #     messages = [
-    #         {"role": "system", "content": "You are a helpful assistant."},
-    #         {"role": "user", "content": prompt}
-    #     ]
-    #     text = qwen_tokenizer.apply_chat_template(
-    #         messages,
-    #         tokenize=False,
-    #         add_generation_prompt=True
-    #     )
-    #     model_inputs = qwen_tokenizer([text], return_tensors="pt").to(qwen_model.device)
-    #
-    #     generated_ids = qwen_model.generate(
-    #         **model_inputs,
-    #         max_new_tokens=300,
-    #         temperature=0.7
-    #     )
-    #     generated_ids = [
-    #         output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
-    #     ]
-    #
-    #     response = qwen_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    #     return response
+
+    if model_name == 'qwen':
+        messages = [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": prompt}
+        ]
+        text = qwen_tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        model_inputs = qwen_tokenizer([text], return_tensors="pt").to(qwen_model.device)
+
+        generated_ids = qwen_model.generate(
+            **model_inputs,
+            max_new_tokens=300,
+            temperature=temp
+        )
+        generated_ids = [
+            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+        ]
+
+        response = qwen_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        return response
+
     if model_name == 'llama':
         messages = [
             {"role": "system", "content": "You are a helpful assistant."},
@@ -324,30 +331,39 @@ def call_answer_question(question, model_name='gpt', cot=False):
         messages,
         max_new_tokens=300,
         # temperature=0.00001
-        temperature = 0.7
+        temperature = temp
     )
     # print(outputs[0]["generated_text"][-1])
     return outputs[0]["generated_text"][-1]['content']
 
 
-def answer_question(model_name='gpt', cot=False):
-    infile = open('data/math/test_dump_gsm8k_train_perturbed_with_new_questions.json', 'r')
+def answer_question(model_name='gpt', cot=False, temp=0.0):
+    # infile = open('data/math/test_dump_gsm8k_train_perturbed_with_new_questions.json', 'r')
+    infile = open('data/math/test_dump_math_train_4o_perturbed_with_new_questions.json', 'r')
     program_data = json.load(infile)
     print(len(program_data))
    for case in tqdm(program_data):
-        response = call_answer_question(case['question'], model_name=model_name, cot=cot)
+        response = call_answer_question(case['question'], model_name=model_name, cot=cot, temp=temp)
         case['prediction'] = response
         # print(case['prediction'])
         case['new_prediction'] = []
         for question in case['new_questions']:
-            response = call_answer_question(question, model_name=model_name, cot=cot)
+            response = call_answer_question(question, model_name=model_name, cot=cot, temp=temp)
             case['new_prediction'].append(response)
             # print(case)
            # break
         # print(case)
         # break
-    # outfile = open('data/math/test_dump_gsm8k_train_perturbed_with_new_questions_answer_llama8b.json', 'w')
-    outfile = open('data/math/gsm8k_cot_sc_llama3.1_8b/temp=0.7_iter=5.json', 'w')
+    # outfile = open('data/math/test_dump_gsm8k_train_perturbed_with_new_questions_answer_few_shot_cot_llama8b.json', 'w')
+    # outfile = open('data/math/gsm8k_cot_sc_qwen/temp=0.7_iter=5.json', 'w')
+    # outfile = open('data/math/gsm8k_cot_sc_llama3.1_8b/temp=0.7_iter=5.json', 'w')
+
+    # outfile = open('data/math/test_dump_math_train_4o_perturbed_with_new_questions_few_shot_cot_qwen.json', 'w')
+    # outfile = open('data/math/test_dump_math_train_4o_perturbed_with_new_questions_few_shot_cot_gpt4o.json', 'w')
+    outfile = open('data/math/math_cot_sc_gpt4o/temp=0.7_iter=2.json', 'w')
+    # outfile = open('data/math/math_cot_sc_qwen/temp=0.7_iter=5.json', 'w')
+    # outfile = open('data/math/math_cot_sc_llama3.1_8b/temp=0.7_iter=4.json', 'w')
+
     json.dump(program_data, outfile, indent=4)
 
 
@@ -356,10 +372,19 @@ def parse_answer(answer):
         if 'answer is' in answer:
             answer = answer.split('answer is')[-1].strip()
         else:
-            answer = answer.split(' ')[-1]
+            if '\\(' in answer and '\\)' in answer:
+                answer = answer.split('\\(')[-1].split('\\)')[0]
+            else:
+                # print("Before: ", answer)
+                answer = answer.split(' ')[-1]
+
         if len(answer) > 0 and answer[-1] == '.':
             answer = answer[0:-1]
+        print("##########Before: ", answer)
+        answer = answer.split('=')[-1]
         answer = re.sub("[^\d\.]", "", answer)
+        print("################After: ", answer)
+
         return answer
     else:
         answer_freq = {}
@@ -394,7 +419,7 @@ def collect_self_consistency_result(infile_path):
 
 
 
-def evaluator(infile_path):
+def evaluator(infile_path, normalize=False):
     infile = open(infile_path, 'r')
     data = json.load(infile)
     correct_case = 0
@@ -406,44 +431,95 @@ def evaluator(infile_path):
             continue
         total_case += 1
         prediction = parse_answer(case['prediction'])
-        if prediction == case['answer'] or case['answer'] in prediction:
+        parsed_gold = parse_answer(str(case['answer']))
+        case['answer'] = str(case['answer'])
+        if prediction == case['answer'] or case['answer'] in prediction or prediction == parsed_gold or parsed_gold in prediction:
             correct_case += 1
-        new_parameter_correct_case = 0
-        for idx, pred in enumerate(case['new_prediction']):
-            parsed_pred = parse_answer(pred)
-            if parsed_pred == case['new_answers'][idx] or case['new_answers'][idx] in parsed_pred:
-                new_parameter_correct_case += 1
-            else:
-                try:
-                    parsed_pred = round(float(parsed_pred))
-                    new_answer = round(float(case['new_answers'][idx]))
-                    if parsed_pred == new_answer:
-                        new_parameter_correct_case += 1
-                    else:
-                        print(parsed_pred, case['new_answers'][idx])
-                except:
-                    continue
-        total_parameter_correct_case = len(case['new_prediction'])
-        percentage = float(new_parameter_correct_case / total_parameter_correct_case)
-        total_percentage += percentage
-        if new_parameter_correct_case not in new_parameter_correct_counter:
-            new_parameter_correct_counter[new_parameter_correct_case] = 0
-        new_parameter_correct_counter[new_parameter_correct_case] += 1
+        else:
+            # print(prediction)
+            if normalize:
+                continue
+        new_parameter_correct_case = 0
+        for idx, pred in enumerate(case['new_prediction']):
+            parsed_pred = parse_answer(pred)
+            parsed_gold = parse_answer(case['new_answers'][idx])
+            if parsed_pred == case['new_answers'][idx] or case['new_answers'][idx] in parsed_pred or parsed_pred == parsed_gold or parsed_gold in parsed_pred:
+                new_parameter_correct_case += 1
+            else:
+                try:
+                    parsed_pred = round(float(parsed_pred))
+                    new_answer = round(float(case['new_answers'][idx]))
+                    if parsed_pred == new_answer:
+                        new_parameter_correct_case += 1
+                    # else:
+                    #     print(parsed_pred, case['new_answers'][idx])
+                except:
+                    continue
+        total_parameter_correct_case = len(case['new_prediction'])
+        percentage = float(new_parameter_correct_case / total_parameter_correct_case)
+        total_percentage += percentage
+        if new_parameter_correct_case not in new_parameter_correct_counter:
+            new_parameter_correct_counter[new_parameter_correct_case] = 0
+        new_parameter_correct_counter[new_parameter_correct_case] += 1
 
     # else:
     #     print(prediction, case['answer'])
     print(correct_case, total_case, correct_case/total_case)
-    print(total_percentage/correct_case)
+    if normalize:
+        print(total_percentage, total_percentage/correct_case)
+    else:
+        print(total_percentage, total_percentage/total_case)
     print(new_parameter_correct_counter)
+    print(new_parameter_correct_counter[5] / correct_case)
+
+
+def sample_questions(filepath):
+    infile = open(filepath, 'r')
+    data = json.load(infile)
+    filtered_data = []
+    for case in data:
+        if 'new_answers' not in case or len(case['new_answers']) != 5:
+            continue
+        filtered_data.append(case)
+    filtered_data = random.sample(filtered_data, 100)
+
+    # with open('data/sample_verification/gsk8k_sample.csv', 'w', newline='') as csvfile:
+    #     csvwriter = csv.writer(csvfile, delimiter=' ',
+    #                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
+    #     for case in filtered_data:
+    #         csvwriter.writerow([case['question'], case['answer'], case['parameters'], case['selected_programs'][0].replace('\n', '\\n'),
+    #                             case['new_parameters'], case['new_questions'], case['new_answers']])
+    out_data = []
+    for case in filtered_data:
+        new_case = {
+            'question': case['question'],
+            'answer': case['answer'],
+            'parameters': case['parameters'],
+            'programs': case['candidate_programs'][0],
+            'new_parameters': case['new_parameters'],
+            'new_questions': case['new_questions'],
+            'new_answers': case['new_answers']
+        }
+        out_data.append(new_case)
+    outfile1 = open('data/sample_verification/math_xiaodong_split.json', 'w')
+    outfile2 = open('data/sample_verification/math_ben_split.json', 'w')
+    outfile3 = open('data/sample_verification/math_hao_split.json', 'w')
+    json.dump(out_data[0:34], outfile1, indent=4)
+    json.dump(out_data[34:67], outfile2, indent=4)
+    json.dump(out_data[67:100], outfile3, indent=4)
+
+
 
 def main():
     # generate_new_parameter_value()
     # update_question_with_new_parameters()
-    answer_question(model_name='llama', cot=True)
-    # collect_self_consistency_result('data/math/gsm8k_cot_sc')
-    # evaluator('data/math/gsm8k_cot_sc_merged_result.json')
+    # answer_question(model_name='gpt', cot=True, temp=0.7)
+    # collect_self_consistency_result('data/math/math_cot_sc_gpt4turbo')
+    evaluator('data/math/math_cot_sc_gpt4o/temp=0.7_iter=1.json', normalize=True)
     # evaluator('data/math/test_dump_gsm8k_train_perturbed_with_new_questions_answer_few_shot_cot_qwen.json')
 
+    # sample_questions('data/math/test_dump_math_train_4o_perturbed_with_new_questions.json')
+
 
 if __name__ == "__main__":
     main()
 
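Note on the client setup in the first hunk: run_program.py passes the OpenAI key inline to the OpenAI(...) constructor. A minimal sketch of an equivalent setup that keeps the secret out of the source, assuming the same openai v1 client the script already uses:

import os
from openai import OpenAI

# The v1 client also falls back to the OPENAI_API_KEY environment
# variable when the api_key argument is omitted entirely.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])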
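The temp=0.7_iter=N dumps written by answer_question are read back by collect_self_consistency_result, and the answer_freq branch of parse_answer points to a majority vote over the sampled generations. A minimal sketch of that aggregation, assuming the iteration files hold the same cases in the same order; the paths are hypothetical examples of the naming above, and the 'answer is' split stands in for the full parse_answer:

import json
from collections import Counter

def majority_vote(answers):
    # Most frequent non-empty answer wins; empty string if nothing parsed.
    counts = Counter(a for a in answers if a)
    return counts.most_common(1)[0][0] if counts else ''

# One sampled prediction per case and file, so zipping the runs
# groups the k samples belonging to each case.
paths = [f'data/math/math_cot_sc_gpt4o/temp=0.7_iter={i}.json' for i in (1, 2)]
runs = [json.load(open(p)) for p in paths]
for cases in zip(*runs):
    samples = [c['prediction'].split('answer is')[-1].strip() for c in cases]
    print(majority_vote(samples))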