import torch
import gradio as gr
from peft import AutoPeftModelForCausalLM
from huggingface_hub import InferenceClient
from transformers import AutoTokenizer

"""
For more information on `huggingface_hub` Inference API support, please check the docs:
https://huggingface.co./docs/huggingface_hub/v0.22.2/en/guides/inference
"""

model_name = "tz3r0n4r/SmolLM2-FT-Race-v0.002.76"
# client = InferenceClient("tz3r0n4r/SmolLM2-FT-Race-v0.002.76")

# Load the PEFT adapter together with its base model.
model = AutoPeftModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Some causal-LM tokenizers ship without a pad token; fall back to EOS so that
# padding and the `pad_token_id` passed to `generate` below are well defined.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    messages = [{"role": "system", "content": system_message}]
    # Replay earlier turns so the model sees the whole conversation,
    # not just the latest message.
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer(prompt_text, return_tensors="pt", padding=True).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,  # try 0.8-0.9 to reduce repetition
        top_p=top_p,
        num_beams=3,              # beam search (combined here with sampling)
        no_repeat_ngram_size=3,   # block verbatim 3-gram repetition
        repetition_penalty=1.2,   # penalize repeated content
        length_penalty=1.0,       # neutral length preference (beam search only)
        early_stopping=True,      # stop once all beams have finished
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens so the prompt is not echoed back.
    generated = tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True,
    )
    return generated


"""
For information on how to customize the ChatInterface, peruse the gradio docs:
https://www.gradio.app/docs/chatinterface
"""

SYSTEM_PROMPT = """
You are a skilled assistant specialized in reading comprehension and question generation.
For each passage, generate ONE meaningful multiple-choice question that tests deep understanding.

REQUIRED FORMAT:
Question: [Clear, specific question]
Options:
A) [Distinct option 1]
B) [Distinct option 2]
C) [Distinct option 3]
D) [Distinct option 4]
Answer: [A/B/C/D]

RULES:
1. ALWAYS include Question:, Options:, and Answer: markers
2. NEVER repeat options
3. Make all options similar in length
4. Base everything on passage content only
5. Complete all responses fully
"""

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value=SYSTEM_PROMPT, label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)

if __name__ == "__main__":
    demo.launch()
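
# --- Optional local smoke test (hypothetical sketch, not part of the Space) ---
# Sanity-checks `respond` from the command line without launching the Gradio UI.
# The passage below is a made-up placeholder; substitute any paragraph you want
# a question generated for. To use it, comment out `demo.launch()` above and
# uncomment this block.
#
# sample_passage = (
#     "Photosynthesis is the process by which plants convert sunlight, water, "
#     "and carbon dioxide into glucose and oxygen. It takes place primarily in "
#     "the chloroplasts of leaf cells."
# )
# print(respond(
#     sample_passage,
#     history=[],
#     system_message=SYSTEM_PROMPT,
#     max_tokens=256,
#     temperature=0.7,
#     top_p=0.95,
# ))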