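"""Gradio demo: chat completions with SmolLM2-135M-Instruct (GGUF) via llama-cpp-python."""
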
import gradio as gr
from llama_cpp import Llama


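# Download the quantized GGUF checkpoint from the Hugging Face Hub
# (from_pretrained caches it locally) and load it with llama.cpp.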
llm = Llama.from_pretrained(
    repo_id="prithivMLmods/SmolLM2-135M-Instruct-GGUF",
    filename="SmolLM2-135M-Instruct.Q5_K_M.gguf",
    verbose=False,
    n_ctx=7000,  # total context window (prompt + generated tokens)
)

def generate(
    user_prompt: str,
    system_prompt: str = "You are a helpful assistant.",
    max_tokens: int = 8192 - 7000,  # i.e. 1192 tokens of output by default
    temperature: float = 0.2,
    top_p: float = 0.95,
    top_k: int = 40,
) -> str:
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    response = llm.create_chat_completion(
        messages,
        max_tokens=int(max_tokens),  # gr.Number passes floats; llama.cpp expects ints
        temperature=temperature,
        top_p=top_p,
        top_k=int(top_k),
    )
    # create_chat_completion returns an OpenAI-style dict; return only the
    # generated text so the Response textbox shows clean output.
    return response["choices"][0]["message"]["content"]

with gr.Blocks() as demo:
    gr.Markdown("""# RAG - generate

                Generate a response to a query using [SmolLM2-135M-Instruct (GGUF) and llama-cpp-python](https://huggingface.co./prithivMLmods/SmolLM2-135M-Instruct-GGUF?library=llama-cpp-python).

                Part of [ai-blueprint](https://github.com/huggingface/ai-blueprint) - a blueprint for AI development, focusing on applied examples of RAG, information extraction, analysis and fine-tuning in the age of LLMs and agents.""")

    with gr.Row():
        system_prompt = gr.Textbox(label="System prompt", lines=3, value="You are a helpful assistant.")
        user_prompt = gr.Textbox(label="Query", lines=3)

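    # Sampling controls; the values below are passed to generate() on every click.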
    with gr.Accordion("kwargs"):
        with gr.Row(variant="panel"):
            max_tokens = gr.Number(label="Max tokens", value=1100)
            temperature = gr.Number(label="Temperature", value=0.2)
            top_p = gr.Number(label="Top p", value=0.95)
            top_k = gr.Number(label="Top k", value=40)

    submit_btn = gr.Button("Submit")
    response_output = gr.Textbox(label="Response", lines=10)
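    # Placeholder for retrieved documents; this generation-only demo never
    # populates it (no retrieval step is wired up here).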
    documents_output = gr.Dataframe(
        label="Documents", headers=["chunk", "url", "distance", "rank"], wrap=True
    )

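    # Wire the button: the UI components map positionally onto generate()'s parameters.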
    submit_btn.click(
        fn=generate,
        inputs=[
            user_prompt,
            system_prompt,
            max_tokens,
            temperature,
            top_p,
            top_k,
        ],
        outputs=[response_output],
    )

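# Start the Gradio server; on Hugging Face Spaces this is the app entry point.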
demo.launch()