## Spaces: Running on Zero
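The Space's `app.py` is reproduced below. At startup it downloads the `Q8_0.gguf` quantization of `nazimali/mistral-7b-v0.3-instruct-arabic`, loads it lazily with `llama-cpp-python` inside the GPU-decorated handler (so CUDA initialization happens while ZeroGPU has a device attached), and streams tokens back through a `gr.ChatInterface`.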
```python
import os

import gradio as gr
import spaces
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

huggingface_token = os.getenv("HF_TOKEN")

# Alpaca-style Arabic instruction template. In English: "Below is an
# instruction that describes a task. Write a response that appropriately
# completes the request. ### Instruction: {} ### Response:"
infer_prompt = "فيما يلي تعليمات تصف مهمة. اكتب استجابة تكمل الطلب بشكل مناسب.\n\n### تعليمات:\n{}\n\n### إجابة:\n"

model_id = "nazimali/mistral-7b-v0.3-instruct-arabic"
file_name = "Q8_0.gguf"

llm = None

# Download the GGUF weights once at startup. The token is only required
# if the repository is gated or private.
hf_hub_download(
    repo_id=model_id,
    filename=file_name,
    local_dir="./models",
    token=huggingface_token,
)


@spaces.GPU
def respond(message, history):
    # Load the model lazily on the first request, so CUDA initialization
    # happens while ZeroGPU has a device attached to this call.
    global llm
    if llm is None:
        llm = Llama(
            model_path=f"./models/{file_name}",
            flash_attn=True,
            n_gpu_layers=-1,  # offload all layers to the GPU
            n_ctx=1024,
            verbose=True,
        )

    stream = llm.create_chat_completion(
        messages=[{"role": "user", "content": infer_prompt.format(message)}],
        max_tokens=512,
        repeat_penalty=1.2,
        stream=True,
        temperature=0.7,
        top_k=40,
        top_p=0.95,
    )

    # Accumulate streamed deltas and yield the running text so the chat
    # window updates token by token.
    outputs = ""
    for output in stream:
        outputs += output["choices"][0]["delta"].get("content", "")
        yield outputs


demo = gr.ChatInterface(
    respond,
    examples=["السلام عليكم كيف حالك؟", "hello"],
    title="Mistral 7B Arabic Fine-tuned",
)

if __name__ == "__main__":
    demo.launch()
```
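To run this as a Space, the repository also needs a `requirements.txt`. A minimal sketch, with the package list taken from the imports above (exact versions, and whether a CUDA-enabled `llama-cpp-python` wheel needs a custom install step, are deployment details not covered by the original snippet):

```
gradio
huggingface_hub
llama-cpp-python
spaces
```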
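Once deployed, the chat endpoint can also be called programmatically. A minimal sketch using `gradio_client`, assuming a placeholder Space ID `username/space-name` and the default `/chat` API name that `gr.ChatInterface` registers:

```python
from gradio_client import Client

# Placeholder Space ID; substitute the actual owner/name of the deployed Space.
client = Client("username/space-name")

# gr.ChatInterface exposes its handler under the "/chat" API name;
# predict() returns the final streamed message as a string.
result = client.predict("السلام عليكم كيف حالك؟", api_name="/chat")
print(result)
```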