Uploaded model

  • Developed by: umarigan
  • License: apache-2.0
  • Finetuned from model: unsloth/meta-llama-3.1-8b-instruct-bnb-4bit

This Llama model was trained 2x faster with Unsloth and Hugging Face's TRL library.
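
For context, an Unsloth + TRL fine-tune of this base model typically follows the recipe sketched below. This is a minimal, illustrative sketch, not the exact script used for this model: the training dataset, LoRA rank/alpha, and hyperparameters are assumptions, and the SFTTrainer arguments follow the older TRL signature used in the Unsloth notebooks.

# Minimal Unsloth + TRL fine-tuning sketch (illustrative; dataset and hyperparameters are assumptions)
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset

# Load the 4-bit base model this finetune started from
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/meta-llama-3.1-8b-instruct-bnb-4bit",
    max_seq_length=2048,
    load_in_4bit=True,
)

# Attach LoRA adapters (rank and alpha here are placeholder values)
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)

# The actual training data is not stated in this card; a local JSONL file with a "text" column is a placeholder
train_dataset = load_dataset("json", data_files="train.jsonl", split="train")

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        num_train_epochs=1,
        output_dir="outputs",
    ),
)
trainer.train()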

Eval results: arc-tr = 57.68%; truthful_qa-tr: MC1 = 43.45%, MC2 = 22.15% (details below).

Use the following code to reproduce the results:


import re
import torch
from transformers import pipeline
from datasets import load_dataset

model_id = "umarigan/llama-3.2-8B-R1-Tr"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)


#ARC-TR
ds = load_dataset("mukayese/arc-tr", split="test")

def extract_answer(text):
    """Extract first occurring A-D label from generated text"""
    match = re.search(r'\b([A-D])\b', text, re.IGNORECASE)
    return match.group(1).upper() if match else None

total = 0
correct = 0

for example in ds:
    # Format the question and choices
    question = example["question"]
    choices = "\n".join([f"{label}) {text}" for label, text in 
                       zip(example["choices"]["label"], example["choices"]["text"])])
    
    # Create prompt with explicit instruction
    prompt = f"""Answer this multiple-choice question by providing ONLY the letter corresponding to the correct answer (A, B, C, or D). Do not include any explanation.

    Question: {question}
    Options:
    {choices}
    Answer:"""
        
    # Generate response
    messages = [{"role": "user", "content": prompt}]
    try:
        outputs = pipe(
            messages,
            max_new_tokens=5,  # Limit response length to get just the answer
            do_sample=False    # Disable sampling for more deterministic answers
        )
        response = outputs[0]["generated_text"][-1]['content']
        predicted = extract_answer(response)
        answer = example["answerKey"]
        
        # Update counters
        total += 1
        if predicted == answer:
            correct += 1
            
    except Exception as e:
        print(f"Error processing example: {e}")
        continue

# Print results
print(f"\nBenchmark Results:")
print(f"Total questions processed: {total}")
print(f"Correct answers: {correct}")
print(f"Accuracy: {correct/total:.2%}" if total > 0 else "No questions processed")
#output
#Benchmark Results:
#Total questions processed: 1172
#Correct answers: 676
#Accuracy: 57.68%


#TRUTHFUL-TR

ds2 = load_dataset("mukayese/truthful_qa-tr", split="validation")
def evaluate_mc(example, targets_key="mc1_targets"):
    """Evaluate a single multiple-choice example with variable choices"""
    question = example["question"]
    choices = example[targets_key]["choices"]
    labels = example[targets_key]["labels"]
    
    # Generate option labels dynamically (A, B, C, ..., G)
    option_labels = [chr(65 + i) for i in range(len(choices))]
    
    # Create prompt with explicit instruction
    options_text = "\n".join([f"{label}) {text}" for label, text in zip(option_labels, choices)])
    prompt = f"""Answer this multiple-choice question by selecting the most correct option. Provide only the letter corresponding to your choice ({', '.join(option_labels)}).

  Question: {question}
  Options:
  {options_text}
  Answer:"""
    
    # Generate response
    messages = [{"role": "user", "content": prompt}]
    try:
        outputs = pipe(
            messages,
            max_new_tokens=5,  # Limit response length to get just the answer
            do_sample=False    # Disable sampling for more deterministic answers
        )
        response = outputs[0]["generated_text"][-1]['content']
        
        # Extract predicted label
        predicted = extract_answer(response, option_labels)
        if predicted is None:
            return 0  # Count as incorrect if no valid answer
        
        # Get correct answer (index of the first choice labeled 1)
        correct_idx = labels.index(1)
        correct_label = option_labels[correct_idx]
        
        return int(predicted == correct_label)
    
    except Exception as e:
        print(f"Error processing example: {e}")
        return 0

def extract_answer(text, valid_labels):
    """Extract first occurring valid label from generated text"""
    # Create regex pattern that matches any of the valid labels
    pattern = r'\b(' + '|'.join(valid_labels) + r')\b'
    match = re.search(pattern, text, re.IGNORECASE)
    return match.group(1).upper() if match else None

# Evaluate on both mc1 and mc2 targets
mc1_scores = []
mc2_scores = []

for example in ds2:
    mc1_scores.append(evaluate_mc(example, "mc1_targets"))
    mc2_scores.append(evaluate_mc(example, "mc2_targets"))

# Calculate metrics
def calculate_metrics(scores):
    total = len(scores)
    correct = sum(scores)
    accuracy = correct / total if total > 0 else 0
    return total, correct, accuracy

mc1_total, mc1_correct, mc1_accuracy = calculate_metrics(mc1_scores)
mc2_total, mc2_correct, mc2_accuracy = calculate_metrics(mc2_scores)

# Print results
print("\nBenchmark Results:")
print(f"MC1 Targets:")
print(f"Total questions: {mc1_total}")
print(f"Correct answers: {mc1_correct}")
print(f"Accuracy: {mc1_accuracy:.2%}")
print(f"\nMC2 Targets:")
print(f"Total questions: {mc2_total}")
print(f"Correct answers: {mc2_correct}")
print(f"Accuracy: {mc2_accuracy:.2%}")

#output
#MC1 Targets:
#Total questions: 817
#Correct answers: 355
#Accuracy: 43.45%

#MC2 Targets:
#Total questions: 817
#Correct answers: 181
#Accuracy: 22.15%
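
Outside of the benchmark loops, the same pipeline can be used for quick chat-style inference. A minimal example (the Turkish prompt is just an illustration):

# Quick single-turn chat with the pipeline defined above
messages = [{"role": "user", "content": "Türkiye'nin başkenti neresidir?"}]
outputs = pipe(messages, max_new_tokens=128, do_sample=False)
print(outputs[0]["generated_text"][-1]["content"])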