import os

import gradio as gr
import spaces
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

huggingface_token = os.getenv("HF_TOKEN")

# Alpaca-style Arabic instruction template. English translation:
# "Below are instructions describing a task. Write a response that
# appropriately completes the request." / "### Instructions:" / "### Answer:"
infer_prompt = "فيما يلي تعليمات تصف مهمة. اكتب استجابة تكمل الطلب بشكل مناسب.\n\n### تعليمات:\n{}\n\n### إجابة:\n"

model_id = "nazimali/mistral-7b-v0.3-instruct-arabic"
file_name = "Q8_0.gguf"

# Only the GGUF file is fetched at startup; the model itself is loaded
# lazily inside the GPU-decorated handler below.
llm = None

hf_hub_download(
    repo_id=model_id,
    filename=file_name,
    local_dir="./models",
    token=huggingface_token,
)


@spaces.GPU
def respond(message, history):
    global llm
    if llm is None:
        llm = Llama(
            model_path=f"./models/{file_name}",
            flash_attn=True,
            n_gpu_layers=-1,  # offload all layers to the GPU
            n_ctx=2048,
            verbose=True,
        )

    stream = llm.create_chat_completion(
        messages=[{"role": "user", "content": infer_prompt.format(message)}],
        max_tokens=50,
        repeat_penalty=1.2,
        stream=True,
        temperature=0.7,
        top_k=40,
        top_p=0.95,
    )

    # Accumulate streamed deltas and yield the growing string so the
    # Gradio chat window updates incrementally. The first chunk carries
    # only a "role" delta, which .get("content", "") handles.
    outputs = ""
    for output in stream:
        print(output)  # debug: raw streaming chunks
        outputs += output["choices"][0]["delta"].get("content", "")
        yield outputs


demo = gr.ChatInterface(
    respond,
    examples=["السلام عليكم", "hello"],
    title="Mistral 7B Arabic Fine-tuned",
)

if __name__ == "__main__":
    demo.launch()
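
# A minimal smoke-test sketch (assumption: not part of the original Space).
# `respond` is a plain generator that ignores `history`, so it can be driven
# directly without the Gradio UI, e.g. before demo.launch(); the spaces.GPU
# decorator is designed to pass through when run outside ZeroGPU hardware:
#
#     for partial in respond("السلام عليكم", []):
#         print(partial)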