import os

import gradio as gr
from ctransformers import AutoModelForCausalLM

# Model location is configured through environment variables: HF_MODEL_REPO
# names the Hugging Face repo and HF_MODEL_BIN the weights file within it.
model_repo = os.getenv("HF_MODEL_REPO")
model_bin = os.getenv("HF_MODEL_BIN")
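
# Example launch, assuming this script is saved as app.py (repo and file names
# below are placeholders, not project defaults):
#   HF_MODEL_REPO=<user>/<repo> HF_MODEL_BIN=<weights-file> python app.py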

# Load the quantized weights for CPU inference. lib="avx2" selects the AVX2
# build of the ctransformers backend, and the fixed seed makes sampling
# reproducible across runs.
llm = AutoModelForCausalLM.from_pretrained(
    model_repo,
    model_file=model_bin,
    threads=2,
    seed=42,
    context_length=16384,
    lib="avx2",
)


def response(prompt):
    # reset=False carries the model state over from previous calls, so earlier
    # prompts in the process's lifetime remain part of the context.
    txt = llm(
        prompt,
        max_new_tokens=8192,
        temperature=0.8,
        top_p=0.5,
        repetition_penalty=1.1,
        reset=False,
        stop=["</s>", "<|im_end|>"],
    )
    return txt
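
# ctransformers can also stream tokens as they are generated; a sketch of an
# incremental variant (assumes the stream flag of the model's __call__, and is
# not wired into the demo below):
#   def response_stream(prompt):
#       text = ""
#       for token in llm(prompt, stream=True):
#           text += token
#           yield text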


if __name__ == "__main__":
    title = "Chat"
    demo_status = "Demo is running on CPU"

    # demo_status is surfaced as the interface description under the title.
    gr.Interface(
        response,
        inputs="text",
        outputs="text",
        title=title,
        description=demo_status,
    ).launch()
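
# A minimal client-side smoke test, run in a separate process while the app is
# up (assumes gradio's default local address and auto-named /predict endpoint;
# gradio_client is installed separately from gradio):
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860/")
#   print(client.predict("Hello!", api_name="/predict"))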