davidberenstein1957 (HF staff) committed
Commit ffa07db · verified · 1 Parent(s): e09486d
Files changed (1)
  1. app.py +53 -64
app.py CHANGED
```diff
@@ -1,76 +1,58 @@
 import gradio as gr
-import pandas as pd
+
 from gradio_client import Client
 from huggingface_hub import get_token, InferenceClient
-from sentence_transformers import CrossEncoder
-
-
-gradio_client = Client("https://smol-blueprint-vector-search-hub.hf.space/")
-reranker = CrossEncoder("sentence-transformers/all-MiniLM-L12-v2")
-inference_client = InferenceClient(api_key=get_token())
-
-
-def similarity_search(query: str, k: int = 5):
-    results = gradio_client.predict(api_name="/similarity_search", query=query, k=k)
-    return pd.DataFrame(data=results["data"], columns=results["headers"])
-
-
-def query_and_rerank_documents(query: str, k_retrieved: int = 10):
-    documents = similarity_search(query, k_retrieved)
-    documents = documents.drop_duplicates("chunk")
-    documents["rank"] = reranker.predict([[query, hit] for hit in documents["chunk"]])
-    reranked_documents = documents.sort_values(by="rank", ascending=False)
-    return reranked_documents
-
-
-def generate_response_api(query: str):
+from llama_cpp import Llama
+
+
+llm = Llama.from_pretrained(
+    repo_id="HuggingFaceTB/SmolLM2-360M-Instruct-GGUF",
+    filename="smollm2-360m-instruct-q8_0.gguf",
+    verbose=False,
+)
+
+
+def generate(
+    user_prompt: str,
+    system_prompt: str = "You are a helpful assistant.",
+    max_tokens: int = 4000,
+    temperature: float = 0.2,
+    top_p: float = 0.95,
+    top_k: int = 40,
+    presence_penalty: float = 0.0,
+    frequency_penalty: float = 0.0,
+):
     messages = [
-        {
-            "role": "system",
-            "content": "You will receive a query and context. Only return the answer based on the context without mentioning the context.",
-        },
-        {"role": "user", "content": query},
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt},
     ]
-    completion = inference_client.chat.completions.create(
-        model="HuggingFaceTB/SmolLM2-360M-Instruct", messages=messages, max_tokens=2000
-    )
-
-    return completion.choices[0].message
-
-
-def rag_pipeline(query: str, k_retrieved: int = 10, k_reranked: int = 5):
-    documents = query_and_rerank_documents(query, k_retrieved=k_retrieved)
-    query_with_context = (
-        f"Context: {documents['chunk'].to_list()[:k_reranked]}\n\nQuery: {query}"
+    return llm.create_chat_completion(
+        messages,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        presence_penalty=presence_penalty,
+        frequency_penalty=frequency_penalty,
     )
-    return generate_response_api(query_with_context).content, documents
-
 
 with gr.Blocks() as demo:
-    gr.Markdown("""# RAG Hub Datasets
+    gr.Markdown("""# RAG - generate
 
-Part of [smol blueprint](https://github.com/davidberenstein1957/smol-blueprint) - a smol blueprint for AI development, focusing on practical examples of RAG, information extraction, analysis and fine-tuning in the age of LLMs.""")
+Generate a response to a query using [HuggingFaceTB/SmolLM2-360M-Instruct and llama-cpp-python](https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct-GGUF?library=llama-cpp-python).
+
+Part of [ai-blueprint](https://github.com/davidberenstein1957/ai-blueprint) - a blueprint for AI development, focusing on applied examples of RAG, information extraction, analysis and fine-tuning in the age of LLMs and agents.""")
 
     with gr.Row():
-        query_input = gr.Textbox(
-            label="Query", placeholder="Enter your question here...", lines=3
-        )
+        system_prompt = gr.Textbox(label="System prompt", lines=3)
+        user_prompt = gr.Textbox(label="Query", lines=3)
 
-    with gr.Row():
-        with gr.Column():
-            retrieve_slider = gr.Slider(
-                minimum=1,
-                maximum=20,
-                value=10,
-                label="Number of documents to retrieve",
-            )
-        with gr.Column():
-            rerank_slider = gr.Slider(
-                minimum=1,
-                maximum=10,
-                value=5,
-                label="Number of documents to use after reranking",
-            )
+    with gr.Accordion("kwargs"):
+        with gr.Row(variant="panel"):
+            max_tokens = gr.Number(label="Max tokens", value=512)
+            temperature = gr.Number(label="Temperature", value=0.2)
+            top_p = gr.Number(label="Top p", value=0.95)
+            top_k = gr.Number(label="Top k", value=40)
 
     submit_btn = gr.Button("Submit")
     response_output = gr.Textbox(label="Response", lines=10)
@@ -79,9 +61,16 @@ with gr.Blocks() as demo:
     )
 
     submit_btn.click(
-        fn=rag_pipeline,
-        inputs=[query_input, retrieve_slider, rerank_slider],
-        outputs=[response_output, documents_output],
+        fn=generate,
+        inputs=[
+            user_prompt,
+            system_prompt,
+            max_tokens,
+            temperature,
+            top_p,
+            top_k,
+        ],
+        outputs=[response_output],
     )
 
     demo.launch()
```
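
A note on the new `generate`: it returns the full completion dict from llama-cpp-python, so the `Response` textbox will display the dict's repr rather than just the answer text. A minimal sketch of pulling out only the generated text, assuming llama-cpp-python's usual OpenAI-style return shape (illustrative, not part of the commit):

```python
from llama_cpp import Llama

# Same checkpoint the commit loads: a q8_0-quantized SmolLM2-360M-Instruct.
llm = Llama.from_pretrained(
    repo_id="HuggingFaceTB/SmolLM2-360M-Instruct-GGUF",
    filename="smollm2-360m-instruct-q8_0.gguf",
    verbose=False,
)

completion = llm.create_chat_completion(
    [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What does RAG stand for?"},
    ],
    max_tokens=256,
)

# create_chat_completion returns an OpenAI-style dict; the generated text
# lives under choices[0]["message"]["content"].
print(completion["choices"][0]["message"]["content"])
```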
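
The previous version consumed another Space through `gradio_client`, and the same pattern works for consuming this app once it is deployed. A sketch under stated assumptions: the Space URL below is hypothetical, and the endpoint name should be confirmed with `client.view_api()` because Gradio derives it from the click handler:

```python
from gradio_client import Client

# Hypothetical Space URL, for illustration only; substitute the real deployment.
client = Client("https://davidberenstein1957-rag-generate.hf.space/")

# Positional inputs mirror the order wired into submit_btn.click above.
result = client.predict(
    "What does RAG stand for?",      # user_prompt
    "You are a helpful assistant.",  # system_prompt
    512,                             # max_tokens
    0.2,                             # temperature
    0.95,                            # top_p
    40,                              # top_k
    api_name="/predict",             # assumed default; verify with client.view_api()
)
print(result)
```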
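
For readers comparing the two sides of the diff: the removed pipeline reranked retrieved chunks with a sentence-transformers `CrossEncoder` before generation. It loaded `sentence-transformers/all-MiniLM-L12-v2`, a bi-encoder embedding checkpoint, for which `CrossEncoder` would have to initialize an untrained scoring head. A standalone sketch of the same rerank-and-sort pattern, with a purpose-trained cross-encoder (`cross-encoder/ms-marco-MiniLM-L-6-v2`, my substitution, not from the commit):

```python
import pandas as pd
from sentence_transformers import CrossEncoder

# Substituted checkpoint: a reranker actually trained for (query, passage)
# scoring, unlike the bi-encoder the removed code loaded.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

query = "What does RAG stand for?"
documents = pd.DataFrame(
    {
        "chunk": [
            "RAG stands for retrieval-augmented generation.",
            "Gradio builds web UIs for Python functions.",
        ]
    }
)

# Score each (query, chunk) pair and sort best-first, as the removed
# query_and_rerank_documents did.
documents["rank"] = reranker.predict([[query, hit] for hit in documents["chunk"]])
print(documents.sort_values(by="rank", ascending=False))
```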