training script: https://gist.github.com/notlober/9bf4c3ab6ddeb12ec669ca495653708a

inference code:

from transformers import AutoModelForCausalLM, AutoTokenizer

#####
max_new_toks = 2048
N_BEAMS = 5
#####

def do_instruct(prompt):
    return f"A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first does reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. User: {prompt} Assistant:"

# generate one completion using the <think>/<answer> instruction wrapper and diverse beam search
def generate_output_once(prompt):
    message = [
        {"role": "user", "content": do_instruct(prompt)}
    ]
    text = tokenizer.apply_chat_template(
        message,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer(text, return_tensors="pt").input_ids.to(model.device)

    generated_ids = model.generate(
        model_inputs,
        max_new_tokens=max_new_toks,
        repetition_penalty=1.2,
        num_beam_groups=N_BEAMS,  # diverse beam search: N_BEAMS groups of one beam each
        num_beams=N_BEAMS,
        diversity_penalty=0.5,    # encourages the beam groups to differ
        early_stopping=True,
        do_sample=False  # keep this False: diverse beam search is deterministic, so ignore any warning suggesting sampling
    )
    return tokenizer.decode(generated_ids[0, model_inputs.shape[1]:], skip_special_tokens=True)

def test_gen(prompt):
    answer_str = generate_output_once(prompt)
    print(f"Answer: {answer_str}")

#####
# GRPO fine-tuned checkpoint; the base model's tokenizer is loaded separately below
model = AutoModelForCausalLM.from_pretrained(
    "notbdq/gemma-grpo",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-14B-Instruct-1M")  # tokenizer of the base model
#####

test_gen("...") # put your prompt here

benchmarks: in my testing it is clearly ahead of Qwen2.5-14B-Instruct-1M, but I have only run 15 samples of the AIME validation set so far (it was ahead of Qwen from the first sample). There are 75 more samples, so I am sharing the eval script in case someone wants to run the full benchmark:

from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset

#####
max_new_toks = 2048
N_BEAMS = 5
#####

def do_instruct(prompt):
    return f"A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first does reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>. User: {prompt} Assistant:"

# generation for the GRPO model: wraps the prompt with the <think>/<answer> instruction and uses diverse beam search
def generate_output_once_grpo(model, prompt):
    message = [
        {"role": "user", "content": do_instruct(prompt)}
    ]
    text = tokenizer.apply_chat_template(
        message,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer(text, return_tensors="pt").input_ids.to(model.device)

    generated_ids = model.generate(
        model_inputs,
        max_new_tokens=max_new_toks,
        repetition_penalty=1.2,
        num_beam_groups=N_BEAMS,  # diverse beam search: N_BEAMS groups of one beam each
        num_beams=N_BEAMS,
        diversity_penalty=0.5,    # encourages the beam groups to differ
        early_stopping=True,
        do_sample=False  # keep this False: diverse beam search is deterministic, so ignore any warning suggesting sampling
    )
    return tokenizer.decode(generated_ids[0, model_inputs.shape[1]:], skip_special_tokens=True)

# generation for the baseline model: raw prompt, default decoding settings
def generate_output_once(model, prompt):
    message = [
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        message,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer(text, return_tensors="pt").input_ids.to(model.device)

    generated_ids = model.generate(
        model_inputs,
        max_new_tokens=max_new_toks
    )
    return tokenizer.decode(generated_ids[0, model_inputs.shape[1]:], skip_special_tokens=True)

def check_model_contain_output(model_output, ground_t_output):
    # lenient check: the ground-truth answer appears anywhere in the generation
    return ground_t_output in model_output

def extract_answer(text):
    # pull out the text between <answer> and </answer>, or None if the tags are missing
    try:
        return text.split("<answer>")[1].split("</answer>")[0]
    except IndexError:
        return None
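
# Optional stricter scoring (a sketch, not used by do_eval below): only the text
# inside the <answer> tags counts, instead of searching the whole generation.
# Swap it in for check_model_contain_output if you want a tighter metric.
def check_answer_strict(model_output, ground_t_output):
    extracted = extract_answer(model_output)
    return extracted is not None and ground_t_output in extracted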

# run both models over the AIME validation problems and print running accuracy
def do_eval(debug):
    total_iters = len(eval_dataset)
    wins_reasoning = 0
    wins_qwen = 0
    for l in range(total_iters):
        row = eval_dataset[l]
        problem = row["problem"]
        ground_truth = row["answer"]
        response = generate_output_once_grpo(model, problem)
        response_qwen = generate_output_once(model_qwen, problem)
        reward = check_model_contain_output(response, ground_truth)
        reward_qwen = check_model_contain_output(response_qwen, ground_truth)
        if reward: wins_reasoning += 1
        if reward_qwen: wins_qwen += 1
        print(f"reasoning model: %{wins_reasoning / total_iters}")
        print(f"qwen model: %{wins_qwen / total_iters}")
        if debug:
            print("qwen:", response_qwen)
            print("reasoning fine tuned:", response)

#####
model = AutoModelForCausalLM.from_pretrained(
    "notbdq/gemma-grpo",
    torch_dtype="auto",
    device_map="auto"
)
model_qwen = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-14B-Instruct-1M",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-14B-Instruct-1M")
eval_dataset = load_dataset("AI-MO/aimo-validation-aime", split="train")
#####

do_eval(debug=False)

technique: GRPO applied to Qwen2.5-14B-Instruct-1M with the NuminaMath CoT dataset
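
The gist linked at the top is the actual training script. Purely as an illustration (not the author's exact setup), a GRPO run like this can be wired up with TRL's GRPOTrainer; the reward function, dataset id (AI-MO/NuminaMath-CoT), and hyperparameters below are my assumptions for the sketch:

# Illustrative sketch only -- see the gist linked at the top for the real training script.
# Assumes trl with GRPOTrainer/GRPOConfig and the AI-MO/NuminaMath-CoT dataset;
# the reward function and hyperparameters are placeholders, not the author's values.
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer

R1_TEMPLATE = "A conversation between User and Assistant. ... User: {prompt} Assistant:"  # use the same wrapper text as do_instruct above

def format_reward(completions, **kwargs):
    # +1 if the completion contains both a <think> block and an <answer> block, else 0
    return [float("<think>" in c and "<answer>" in c) for c in completions]

dataset = load_dataset("AI-MO/NuminaMath-CoT", split="train")
dataset = dataset.map(lambda x: {"prompt": R1_TEMPLATE.format(prompt=x["problem"])})

trainer = GRPOTrainer(
    model="Qwen/Qwen2.5-14B-Instruct-1M",
    reward_funcs=format_reward,
    args=GRPOConfig(
        output_dir="qwen-grpo-out",
        max_completion_length=2048,  # matches max_new_toks used at inference
        num_generations=8,           # completions sampled per prompt for the group-relative baseline
        bf16=True,
    ),
    train_dataset=dataset,
)
trainer.train()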

hardware: 8x AMD MI300X, for roughly 64 training steps

current issues: 1. infinite generation when the model hits a hard problem (a possible workaround is sketched below); 2. the sequence length keeps growing during training
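
For issue 1, one possible mitigation (my sketch, not something from the author's setup) is to cut decoding off as soon as the closing </answer> tag appears, via the stop_strings argument of transformers' generate, which requires passing the tokenizer alongside it. This would be a drop-in replacement for the generate call in generate_output_once_grpo above:

# Sketch of a workaround for runaway generations: stop at the closing </answer> tag.
# stop_strings needs the tokenizer to be passed to generate() as well.
generated_ids = model.generate(
    model_inputs,
    max_new_tokens=max_new_toks,
    repetition_penalty=1.2,
    num_beam_groups=N_BEAMS,
    num_beams=N_BEAMS,
    diversity_penalty=0.5,
    early_stopping=True,
    do_sample=False,
    stop_strings=["</answer>"],
    tokenizer=tokenizer,
)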

author: baki

contact: https://x.com/bakiv11771441
