import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the instruction-tuned model
model = AutoModelForCausalLM.from_pretrained(
    "MediaTek-Research/Breeze-7B-Instruct-v1_0",
    device_map="auto",           # let Accelerate place weights across available devices
    torch_dtype=torch.bfloat16,
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("MediaTek-Research/Breeze-7B-Instruct-v1_0")

# System prompt recommended on the Breeze-7B model card
SYS_PROMPT = (
    "You are a helpful AI assistant built by MediaTek Research. "
    "The user you are helping speaks Traditional Chinese and comes from Taiwan."
)

def generate_response(user_input):
    # Build the prompt in the format documented on the model card:
    #   <s> SYS_PROMPT [INST] QUERY [/INST]
    # The tokenizer prepends <s> itself, so it is omitted here. (The original
    # code wrapped the output of tokenizer.apply_chat_template in a second
    # [INST] ... [/INST] pair, which duplicated the instruction tags.)
    full_prompt = f"{SYS_PROMPT} [INST] {user_input} [/INST]"

    # Tokenize and generate
    inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,                # pass input_ids and attention_mask
        max_new_tokens=128,
        do_sample=True,          # required for top_p/top_k/temperature to take effect
        top_p=0.95,
        top_k=50,
        repetition_penalty=1.1,
        temperature=0.7,
    )

    # Decode only the newly generated tokens, not the echoed prompt
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)

# Create the Gradio interface
iface = gr.Interface(fn=generate_response, inputs="text", outputs="text")

# Launch the Gradio interface with a public share link
iface.launch(share=True)
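
# --- Alternative prompt construction (a sketch, not part of the original script) ---
# Per the Breeze-7B model card, the tokenizer ships a chat template that renders
# the same "<s>SYS_PROMPT [INST] QUERY [/INST]" structure built manually above,
# so the f-string inside generate_response could instead be written as:
#
#   chat = [{"role": "user", "content": user_input}]
#   input_ids = tokenizer.apply_chat_template(chat, return_tensors="pt").to(model.device)
#   outputs = model.generate(input_ids, max_new_tokens=128, do_sample=True,
#                            top_p=0.95, top_k=50, repetition_penalty=1.1,
#                            temperature=0.7)
#
# Letting apply_chat_template tokenize directly (its default) avoids the double
# BOS token that appears when its tokenize=False string output (which already
# starts with "<s>") is passed back through tokenizer(), which prepends another.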