import gradio as gr
import torch
from threading import Thread
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
    StoppingCriteria,
    StoppingCriteriaList,
)

tokenizer = AutoTokenizer.from_pretrained("haidlir/bloom-chatml-id")
model = AutoModelForCausalLM.from_pretrained("haidlir/bloom-chatml-id")


class StopOnTokens(StoppingCriteria):
    """Minimal stopping criterion: halt once the last generated token is the tokenizer's EOS token."""

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        return input_ids[0][-1] == tokenizer.eos_token_id


def predict(message, history):
    # Convert Gradio's (user, assistant) tuple history into ChatML-style messages.
    history_chatml_format = []
    for human, assistant in history:
        history_chatml_format.append({"role": "user", "content": human})
        history_chatml_format.append({"role": "assistant", "content": assistant})
    history_chatml_format.append({"role": "user", "content": message})

    # return_dict=True yields a mapping (input_ids, attention_mask) that can be
    # unpacked straight into generate()'s keyword arguments below.
    model_inputs = tokenizer.apply_chat_template(
        history_chatml_format,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    )

    # Stream tokens back as they are produced instead of waiting for the full reply.
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,
        top_p=0.95,
        top_k=1000,
        temperature=1.0,
        num_beams=1,
        stopping_criteria=StoppingCriteriaList([StopOnTokens()]),
    )

    # generate() blocks, so run it on a background thread and consume the streamer here.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    partial_message = ""
    for new_token in streamer:
        if new_token != "<":  # skip stray "<" fragments left over from partially decoded special tokens
            partial_message += new_token
            yield partial_message


gr.ChatInterface(predict).launch()