import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import gradio as gr
import os

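# Numerics: disable TF32 and reduced-precision matmul reductions so matmuls run
# in full precision; the cuDNN autotune/determinism flags are left at their defaults.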
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
torch.backends.cudnn.allow_tf32 = False
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = False
#torch.backends.cuda.preferred_blas_library="cublas"
# torch.backends.cuda.preferred_linalg_library="cusolver"

torch.set_float32_matmul_precision("highest")
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"  # use os.environ (os.putenv does not update os.environ)
os.environ["SAFETENSORS_FAST_GPU"] = "1"

model_name = "Qwen/Qwen2.5-Coder-14B-Instruct"

# Load the model and tokenizer once at import time so they are reused across requests.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    # device_map="auto",
    trust_remote_code=True,  # allow repo-provided modeling code (needed by some Qwen checkpoints)
).to("cuda", torch.bfloat16)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)

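# spaces.GPU: on ZeroGPU Spaces this attaches a GPU for the duration of each call.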
@spaces.GPU(required=True)
def generate_code(prompt):
    messages = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=1024,
            min_new_tokens=256,
            low_memory=False,
            do_sample=True,
            # guidance_scale=3.8,
        )
    # Strip the prompt tokens so only the newly generated continuation is decoded.
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response

with gr.Blocks(title="Qwen 14b") as demo:
    with gr.Tab("Code Chat"):
        run_button = gr.Button("Run", scale=0)
        prompt = gr.Text(
            label="Prompt",
            show_label=False,
            max_lines=1,
            placeholder="Enter your prompt",
            container=False,
        )
        result = gr.Text(
            label="Result",
            show_label=False,
            max_lines=100,
            container=False,
        )
        gr.on(
            triggers=[
                run_button.click,
            ],
            # api_name="generate",
            fn=generate_code,
            inputs=[
                prompt,
            ],
            outputs=[result],
        )

demo.launch(share=False)