Spaces: Running on Zero
import os

# Env flags should be set before transformers/huggingface_hub are imported,
# since huggingface_hub reads HF_HUB_ENABLE_HF_TRANSFER at import time.
# (os.environ, not os.putenv, so the current process actually sees them.)
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["SAFETENSORS_FAST_GPU"] = "1"

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
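
# Disable TF32 and reduced-precision reduction paths so matmuls run at full
# precision; with cudnn.benchmark also off, this trades some speed for more
# consistent numerics across runs.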
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False
# torch.backends.cuda.preferred_blas_library = "cublas"
# torch.backends.cuda.preferred_linalg_library = "cusolver"
torch.set_float32_matmul_precision("highest")
model_name = "Qwen/Qwen2.5-Coder-14B-Instruct"

# Load the model and tokenizer once at startup so every request reuses them
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    # device_map="auto",
    trust_remote_code=True,
).to("cuda", torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)
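
# ZeroGPU Spaces attach a GPU only for the duration of calls to functions
# decorated with @spaces.GPU; without the decorator the imported `spaces`
# package goes unused and model.generate has no device to run on.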
@spaces.GPU
def generate_code(prompt: str) -> str:
    messages = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    # Render the chat into Qwen's prompt template, ending with the assistant turn
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
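
    # do_sample=True samples from the model's distribution instead of decoding
    # greedily; min_new_tokens forces at least 256 new tokens, which can pad
    # short answers with extra prose.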
    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=1024,
            min_new_tokens=256,
            low_memory=False,
            do_sample=True,
            # guidance_scale=3.8,
        )
    # Drop the prompt tokens so only the newly generated completion is decoded
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response
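
# Quick local sanity check (hypothetical prompt), e.g.:
#   print(generate_code("Write a Python function that reverses a string."))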

with gr.Blocks(title="Qwen 14b") as demo:
    with gr.Tab("Code Chat"):
        run_button = gr.Button("Run", scale=0)
        prompt = gr.Text(
            label="Prompt",
            show_label=False,
            max_lines=1,
            placeholder="Enter your prompt",
            container=False,
        )
        result = gr.Text(
            label="Result",
            show_label=False,
            max_lines=100,
            container=False,
        )
        # Wire the button click to the generation handler
        gr.on(
            triggers=[run_button.click],
            # api_name="generate",
            fn=generate_code,
            inputs=[prompt],
            outputs=[result],
        )

demo.launch(share=False)
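
# On Hugging Face Spaces the app is served by the platform itself, so
# share=False is appropriate; share=True tunnels are mainly for local runs.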