import gradio as gr  # type: ignore
import spaces  # type: ignore
import torch

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True
)
model.to("cuda")


def greet(name, sliderint):
    return "Hellonyaaaaa " + name + "!!" + str(sliderint)


# Custom Jinja chat template: wraps each message as <|role|>content and appends
# <|model|> when a generation prompt is requested.
chat_template = (
    "{% for message in messages %}"
    "{{'<|' + message['role'] + '|>' + message['content'] + '\n'}}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
    "{{ '<|model|>\n' }}"
    "{% endif %}"
)


# @spaces.GPU(duration=45)
def chatinterface_fn(message, history):
    # Rebuild the conversation as a list of role/content dicts for the chat template.
    prompt = []
    for human, assistant in history:
        prompt.append({"role": "user", "content": human})
        prompt.append({"role": "model", "content": assistant})
    prompt.append({"role": "user", "content": message})
    token_ids = tokenizer.apply_chat_template(
        prompt,
        tokenize=True,
        add_generation_prompt=True,
        chat_template=chat_template,
        return_tensors="pt",
    )
    print("token_ids:", token_ids)  # added for debugging
    output_ids = model.generate(
        token_ids.to(model.device),
        temperature=0.1,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        max_new_tokens=256,
    )
    text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print(text)
    return text


@spaces.GPU(duration=45)
def infer(message: str) -> str:
    # Note: the incoming message is ignored; a fixed prompt is used for this test.
    input_ids = tokenizer.encode(
        "hello, this is", add_special_tokens=False, return_tensors="pt"
    ).to(model.device)
    print(model.device)
    outputs = model.generate(input_ids)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return text


with gr.Blocks() as demo:
    name = gr.Textbox(label="name")
    output = gr.Interface(fn=greet, inputs=["text", "slider"], outputs="text")
    a = gr.ChatInterface(chatinterface_fn, title="microsoft/Phi-3-mini-4k-instruct")
    b = gr.Interface(fn=infer, inputs="text", outputs="text")

demo.launch()