import gradio as gr
from huggingface_hub import InferenceClient

# Initialize the client with the fine-tuned model
client = InferenceClient("Qwen/Qwen2.5-Coder-7B-Instruct")  # Update if using another model

# Model's context window: prompt tokens + generated tokens must fit within it.
MODEL_TOKEN_LIMIT = 16384


def validate_inputs(max_tokens, temperature, top_p, input_tokens):
    """Validate generation parameters against the model's limits.

    Raises:
        ValueError: if the token budget is exceeded or a sampling
            parameter is outside its allowed range.
    """
    if max_tokens + input_tokens > MODEL_TOKEN_LIMIT:
        raise ValueError(f"Max tokens + input tokens must not exceed {MODEL_TOKEN_LIMIT}. Adjust the max tokens.")
    if not (0.1 <= temperature <= 4.0):
        raise ValueError("Temperature must be between 0.1 and 4.0.")
    if not (0.1 <= top_p <= 1.0):
        raise ValueError("Top-p must be between 0.1 and 1.0.")


def count_tokens(messages):
    """Approximate the prompt's token count.

    Counts whitespace-delimited words across all message contents.
    NOTE: this underestimates real subword tokenization; it is only a
    cheap heuristic used to keep requests under the context limit.
    """
    return sum(len(m["content"].split()) for m in messages)


def respond(message, history, system_message, max_tokens, temperature, top_p):
    """Stream a chat completion for *message*, yielding the growing reply.

    Args:
        message: The latest user message.
        history: Prior turns as (user, assistant) pairs.
        system_message: System prompt prepended to the transcript.
        max_tokens: Requested generation budget (clamped to fit the model).
        temperature: Sampling temperature (0.1-4.0).
        top_p: Nucleus-sampling threshold (0.1-1.0).

    Yields:
        The accumulated assistant response after each streamed chunk.
    """
    # Build the chat transcript: system prompt, then alternating turns.
    messages = [{"role": "system", "content": system_message}]
    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    # Clamp max_tokens so prompt + generation fits the context window.
    # Keep at least 1 token of budget; if the prompt alone already exceeds
    # the limit, validate_inputs below raises a clear ValueError instead of
    # sending an invalid (zero/negative max_tokens) request.
    input_tokens = count_tokens(messages)
    max_allowed_tokens = MODEL_TOKEN_LIMIT - input_tokens
    max_tokens = max(1, min(max_tokens, max_allowed_tokens))

    validate_inputs(max_tokens, temperature, top_p, input_tokens)

    response = ""

    # Stream the completion. The loop variable is named `chunk` (not
    # `message`) so it does not shadow the user's message parameter.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        # Final/keep-alive chunks may carry None content; skip them so we
        # never do `response += None` (TypeError in the original).
        if token:
            response += token
            yield response


# Default system prompt shown (and editable) in the UI.
system_message = """
You are an advanced AI assistant specialized in coding tasks.
- You deliver precise, error-free code in multiple programming languages.
- Analyze queries for logical accuracy and provide optimized solutions.
- Ensure clarity, brevity, and adherence to programming standards.

Guidelines:
1. Prioritize accurate, functional code.
2. Provide explanations only when necessary for understanding.
3. Handle tasks ethically, respecting user intent and legal constraints.

Thank you for using this system. Please proceed with your query.
"""

# Gradio Interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value=system_message, label="System message", lines=10),
        gr.Slider(minimum=1, maximum=16384, value=1000, step=1, label="Max new tokens"),  # Default fixed
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
    demo.launch()