import os

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Disable reduced-precision matmul paths so generation runs at full precision.
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False
# torch.backends.cuda.preferred_blas_library = "cublas"
# torch.backends.cuda.preferred_linalg_library = "cusolver"
torch.set_float32_matmul_precision("highest")

# Set via os.environ rather than os.putenv: putenv does not update os.environ,
# so libraries that read os.environ would never see the value.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["SAFETENSORS_FAST_GPU"] = "1"

model_name = "Qwen/Qwen2.5-Coder-14B-Instruct"

# Load model and tokenizer once at import time (outside the handler for efficiency).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    # device_map="auto",
    trust_remote_code=True,  # required for Qwen models
).to("cuda")
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,  # required for Qwen models
    use_fast=True,
)


@spaces.GPU(required=True)
def generate_code(prompt: str) -> str:
    messages = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=1024,
            min_new_tokens=256,
            low_memory=False,
            do_sample=True,
            # guidance_scale=3.8,
        )
    # Strip the prompt tokens so only the newly generated text is decoded.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response


with gr.Blocks(title="Qwen 14b") as demo:
    with gr.Tab("Code Chat"):
        run_button = gr.Button("Run", scale=0)
        prompt = gr.Text(
            label="Prompt",
            show_label=False,
            max_lines=1,
            placeholder="Enter your prompt",
            container=False,
        )
        result = gr.Text(
            label="Result",
            show_label=False,
            max_lines=100,
            container=False,
        )
    gr.on(
        triggers=[run_button.click],
        # api_name="generate",
        fn=generate_code,
        inputs=[prompt],
        outputs=[result],
    )

demo.launch(share=False)