import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import os

# Read the Hugging Face access token from the environment
hf_token = os.getenv("HF_TOKEN")

# Load the model and tokenizer
# (note: this repo hosts quantization-specific checkpoints; if it does not load
# with transformers, meta-llama/Llama-3.2-1B-Instruct is the standard variant)
model_path = "meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8"
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    token=hf_token,
)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    token=hf_token,
    device_map="auto",
    # Enable 4-bit quantized loading; passing a BitsAndBytesConfig is the
    # current API (the bare load_in_4bit=True kwarg is deprecated)
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)


def generate_text(prompt, max_new_tokens=512, temperature=0.7, top_p=0.9):
    # Tokenize the prompt and move it to the model's device
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate a response; do_sample=True is required for temperature/top_p
    # to have any effect
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            # Llama tokenizers ship without a pad token, so fall back to EOS
            pad_token_id=(
                tokenizer.pad_token_id
                if tokenizer.pad_token_id is not None
                else tokenizer.eos_token_id
            ),
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens so the prompt is not echoed back
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return response


# Build the Gradio interface
iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(lines=5, label="Input prompt"),
        gr.Slider(minimum=64, maximum=1024, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.9, label="Top-p"),
    ],
    outputs=gr.Textbox(lines=5, label="Generated text"),
    title="Llama-3.2-1B-Instruct Demo",
    description="Enter a prompt and get an AI-generated response",
)

# Launch the app
iface.launch()
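
# ---------------------------------------------------------------------------
# Optional sketch (not wired into the demo above): Llama-3.2 *Instruct* models
# are trained on a chat format, so raw prompts tend to follow instructions
# poorly. A variant using tokenizer.apply_chat_template is sketched below;
# the function name generate_chat is hypothetical. To use it, move it above
# the gr.Interface(...) call and pass fn=generate_chat instead.
def generate_chat(prompt, max_new_tokens=512, temperature=0.7, top_p=0.9):
    # Wrap the raw prompt in the single-turn chat format the model expects
    messages = [{"role": "user", "content": prompt}]
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,  # append the assistant header so the model replies
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_new_tokens=int(max_new_tokens),
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the tokens generated after the templated prompt
    return tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)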