training script: https://gist.github.com/notlober/9bf4c3ab6ddeb12ec669ca495653708a

inference code:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

#####
max_new_toks = 2048
N_BEAMS = 5
#####

def do_instruct(prompt):
    return f"A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first does reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. User: {prompt} Assistant:"

def generate_output_once(prompt):
    message = [
        {"role": "user", "content": do_instruct(prompt)}
    ]
    text = tokenizer.apply_chat_template(
        message,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer(text, return_tensors="pt").input_ids.to(model.device)
    generated_ids = model.generate(
        model_inputs,
        max_new_tokens=max_new_toks,
        repetition_penalty=1.2,
        num_beam_groups=N_BEAMS,  # diverse beam search: N_BEAMS groups of one beam each
        num_beams=N_BEAMS,
        diversity_penalty=0.5,
        early_stopping=True,
        do_sample=False  # keep this False; if you see a warning about sampling flags, just ignore it
    )
    return tokenizer.decode(generated_ids[0, model_inputs.shape[1]:], skip_special_tokens=True)

def test_gen(prompt):
    answer_str = generate_output_once(prompt)
    print(f"Answer: {answer_str}")

#####
model = AutoModelForCausalLM.from_pretrained(
    "notbdq/gemma-grpo",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-14B-Instruct-1M")
#####

test_gen("...")  # put your prompt here
```
benchmarks: it's definitely looking better than qwen 14b 1m so far, but I have only tested 15 samples of the AIME validation set. It was doing better than qwen 2.5 1m from the first sample onward, but there are 75 more samples, so I'm sharing the script below in case someone wants to run the full benchmark:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

#####
max_new_toks = 2048
N_BEAMS = 5
#####

def do_instruct(prompt):
    return f"A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first does reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. User: {prompt} Assistant:"

def generate_output_once_grpo(model, prompt):
    # GRPO fine-tune: wrap the problem in the <think>/<answer> instruction and use diverse beam search.
    message = [
        {"role": "user", "content": do_instruct(prompt)}
    ]
    text = tokenizer.apply_chat_template(
        message,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer(text, return_tensors="pt").input_ids.to(model.device)
    generated_ids = model.generate(
        model_inputs,
        max_new_tokens=max_new_toks,
        repetition_penalty=1.2,
        num_beam_groups=N_BEAMS,
        num_beams=N_BEAMS,
        diversity_penalty=0.5,
        early_stopping=True,
        do_sample=False  # keep this False; if you see a warning about sampling flags, just ignore it
    )
    return tokenizer.decode(generated_ids[0, model_inputs.shape[1]:], skip_special_tokens=True)

def generate_output_once(model, prompt):
    # Baseline Qwen: plain prompt, default generation settings.
    message = [
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        message,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer(text, return_tensors="pt").input_ids.to(model.device)
    generated_ids = model.generate(
        model_inputs,
        max_new_tokens=max_new_toks
    )
    return tokenizer.decode(generated_ids[0, model_inputs.shape[1]:], skip_special_tokens=True)

def check_model_contain_output(model_output, ground_t_output):
    # Count it as solved if the ground-truth answer string appears anywhere in the output.
    return ground_t_output in model_output

def extract_answer(text):
    try:
        return text.split("<answer>")[1].split("</answer>")[0]
    except IndexError:
        return None

def do_eval(debug):
    total_iters = len(eval_dataset)
    wins_reasoning = 0
    wins_qwen = 0
    for l in range(len(eval_dataset)):
        row = eval_dataset[l]
        problem = row["problem"]
        ground_truth = row["answer"]
        response = generate_output_once_grpo(model, problem)
        response_qwen = generate_output_once(model_qwen, problem)
        reward = check_model_contain_output(response, ground_truth)
        reward_qwen = check_model_contain_output(response_qwen, ground_truth)
        if reward:
            wins_reasoning += 1
        if reward_qwen:
            wins_qwen += 1
        # running score: fraction of the full validation set solved so far
        print(f"reasoning model: {wins_reasoning / total_iters:.2%}")
        print(f"qwen model: {wins_qwen / total_iters:.2%}")
        if debug:
            print("qwen:", response_qwen)
            print("reasoning fine tuned:", response)

#####
model = AutoModelForCausalLM.from_pretrained(
    "notbdq/gemma-grpo",
    torch_dtype="auto",
    device_map="auto"
)
model_qwen = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-14B-Instruct-1M",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-14B-Instruct-1M")
eval_dataset = load_dataset("AI-MO/aimo-validation-aime", split="train")
#####

do_eval(debug=False)
```
technique: GRPO applied to Qwen2.5-14B-Instruct-1M with the NuminaMath CoT dataset (an illustrative training sketch is included below)
hardware: 8x MI300X for roughly 64 steps
current issues: 1. infinite generation when it hits a hard problem (a possible stopping-criteria workaround is sketched below) 2. growing sequence length during training
author: baki
contact: https://x.com/bakiv11771441
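
The actual training script is the gist linked at the top of this card. Purely for illustration, here is a minimal sketch of what GRPO fine-tuning on NuminaMath CoT could look like with trl's `GRPOTrainer`; the reward function, hyperparameters, and output directory are illustrative placeholders, not values taken from the gist.

```python
# Illustrative GRPO sketch only -- NOT the author's script (see the gist at the top of the card).
# Assumes a recent trl (>=0.14) with GRPOTrainer/GRPOConfig and the AI-MO/NuminaMath-CoT dataset.
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer

# Same instruction format as the inference code above (shortened here for readability).
PROMPT_TEMPLATE = (
    "A conversation between User and Assistant. ... The reasoning process and answer are enclosed "
    "within <think> </think> and <answer> </answer> tags, respectively. User: {problem} Assistant:"
)

def format_reward(completions, **kwargs):
    # Toy reward: 1.0 if the completion uses the <think>/<answer> format, else 0.0.
    # A real run would also reward answer correctness (e.g. by checking against the dataset solution).
    return [1.0 if "<think>" in c and "</answer>" in c else 0.0 for c in completions]

dataset = load_dataset("AI-MO/NuminaMath-CoT", split="train")
dataset = dataset.map(lambda row: {"prompt": PROMPT_TEMPLATE.format(problem=row["problem"])})

training_args = GRPOConfig(
    output_dir="qwen-grpo-sketch",    # hypothetical output directory
    per_device_train_batch_size=4,    # must be divisible by num_generations on a single process
    num_generations=4,                # completions sampled per prompt (the GRPO "group")
    max_completion_length=1024,
    max_steps=64,                     # the card mentions roughly 64 steps
    bf16=True,
)

trainer = GRPOTrainer(
    model="Qwen/Qwen2.5-14B-Instruct-1M",
    reward_funcs=format_reward,
    args=training_args,
    train_dataset=dataset,
)
trainer.train()
```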
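
On issue 1 (infinite generation), one possible workaround, not part of the original code or the gist, is to stop decoding as soon as the closing </answer> tag appears, using transformers' `StoppingCriteria`. The `StopOnClosedAnswer` class below is a hypothetical helper written for this sketch.

```python
# Possible workaround for issue 1 (infinite generation) -- a sketch, not part of the original code.
from transformers import StoppingCriteria, StoppingCriteriaList

class StopOnClosedAnswer(StoppingCriteria):  # hypothetical helper, not in the model repo
    """Stop generation once the first sequence has produced a closing </answer> tag."""
    def __init__(self, tokenizer, prompt_len):
        self.tokenizer = tokenizer
        self.prompt_len = prompt_len

    def __call__(self, input_ids, scores, **kwargs):
        # Note: with beam search this only inspects the first beam/sequence in the batch.
        generated = self.tokenizer.decode(input_ids[0, self.prompt_len:], skip_special_tokens=True)
        return "</answer>" in generated

# Usage: inside generate_output_once, after building model_inputs, pass
#   stopping_criteria=StoppingCriteriaList([StopOnClosedAnswer(tokenizer, model_inputs.shape[1])])
# as an extra argument to model.generate(...).
```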