"""Demonstrate per-model chat templates and conversion of a dataset to ChatML.

Loads three instruction-tuned tokenizers, renders the same conversation with
each one's built-in chat template, and defines a mapping function that turns
raw input/output records into ChatML-style message lists.
"""

from datasets import load_dataset
from transformers import AutoTokenizer

# Every tokenizer ships with its own chat template, so the identical message
# list below is rendered differently by each model family.
mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat")
smol_tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M-Instruct")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# tokenize=False returns the formatted prompt as a string rather than token ids,
# which makes the template differences directly visible.
mistral_chat = mistral_tokenizer.apply_chat_template(messages, tokenize=False)
qwen_chat = qwen_tokenizer.apply_chat_template(messages, tokenize=False)
smol_chat = smol_tokenizer.apply_chat_template(messages, tokenize=False)

dataset = load_dataset("HuggingFaceTB/smoltalk")


def convert_to_chatml(example):
    """Map a record with "input"/"output" keys to a ChatML-style message dict.

    Returns a dict with a single "messages" key holding a user turn followed
    by an assistant turn, suitable for `dataset.map(convert_to_chatml)`.
    """
    return {
        "messages": [
            {"role": "user", "content": example["input"]},
            {"role": "assistant", "content": example["output"]},
        ]
    }