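# app.py: Gradio "Code Chat" demo for Qwen/Qwen2.5-Coder-14B-Instruct,
# written for a Hugging Face Space running on ZeroGPU (the `spaces`
# package below grants a GPU per call instead of holding one permanently).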
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gradio as gr
import os
# Force full-precision float32 matmuls: disable TF32 and reduced-precision
# bf16/fp16 accumulation so generation is not affected by fast-math shortcuts.
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False
# torch.backends.cuda.preferred_blas_library = "cublas"
# torch.backends.cuda.preferred_linalg_library = "cusolver"
torch.set_float32_matmul_precision("highest")

# Use os.environ rather than os.putenv: putenv does not update os.environ,
# so huggingface_hub would never see the flag.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # faster Hub downloads
os.environ["SAFETENSORS_FAST_GPU"] = "1"       # load safetensors straight to GPU
model_name = "Qwen/Qwen2.5-Coder-14B-Instruct"
# Load model and tokenizer once at import time, outside the request handler.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    # device_map="auto",
    trust_remote_code=True,  # kept for Qwen compatibility
).to('cuda', torch.bfloat16)  # cast to bf16 and move to GPU (overrides torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)
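# On ZeroGPU, @spaces.GPU marks the function that actually needs the GPU;
# a device is attached for the duration of each call and released afterwards.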
@spaces.GPU(required=True)
def generate_code(prompt):
    # Wrap the user prompt in Qwen's chat template.
    messages = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=1024,
            min_new_tokens=256,
            low_memory=False,
            do_sample=True,
            # guidance_scale=3.8,
        )
    # Strip the prompt tokens so only the newly generated text is decoded.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response
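# Sketch of a streaming variant (not wired into the UI; assumes the same
# model/tokenizer objects as above): transformers' TextIteratorStreamer
# yields text chunks as they are generated, which Gradio could surface
# incrementally instead of waiting for the full completion.
#
#   from threading import Thread
#   from transformers import TextIteratorStreamer
#   streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#   Thread(target=model.generate,
#          kwargs=dict(**model_inputs, max_new_tokens=1024, streamer=streamer)).start()
#   for chunk in streamer:
#       ...  # append chunk to the partial response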
with gr.Blocks(title="Qwen 14b") as demo:
    with gr.Tab("Code Chat"):
        run_button = gr.Button("Run", scale=0)
        prompt = gr.Text(
            label="Prompt",
            show_label=False,
            max_lines=1,
            placeholder="Enter your prompt",
            container=False,
        )
        result = gr.Text(
            label="Result",
            show_label=False,
            max_lines=100,
            container=False,
        )
    gr.on(
        triggers=[
            run_button.click,
        ],
        # api_name="generate",
        fn=generate_code,
        inputs=[
            prompt,
        ],
        outputs=[result],
    )

demo.launch(share=False)
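# Run locally with `python app.py`; Gradio serves on http://localhost:7860 by
# default. share=False keeps the demo off a public gradio.live link.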