davidberenstein1957 (HF staff) committed
Commit ffa07db · verified · 1 Parent(s): e09486d
Files changed (1)
  1. app.py +53 -64
app.py CHANGED
```diff
@@ -1,76 +1,58 @@
 import gradio as gr
-import pandas as pd
+
 from gradio_client import Client
 from huggingface_hub import get_token, InferenceClient
-from sentence_transformers import CrossEncoder
-
-
-gradio_client = Client("https://smol-blueprint-vector-search-hub.hf.space/")
-reranker = CrossEncoder("sentence-transformers/all-MiniLM-L12-v2")
-inference_client = InferenceClient(api_key=get_token())
-
-
-def similarity_search(query: str, k: int = 5):
-    results = gradio_client.predict(api_name="/similarity_search", query=query, k=k)
-    return pd.DataFrame(data=results["data"], columns=results["headers"])
-
-
-def query_and_rerank_documents(query: str, k_retrieved: int = 10):
-    documents = similarity_search(query, k_retrieved)
-    documents = documents.drop_duplicates("chunk")
-    documents["rank"] = reranker.predict([[query, hit] for hit in documents["chunk"]])
-    reranked_documents = documents.sort_values(by="rank", ascending=False)
-    return reranked_documents
-
-
-def generate_response_api(query: str):
+from llama_cpp import Llama
+
+
+llm = Llama.from_pretrained(
+    repo_id="HuggingFaceTB/SmolLM2-360M-Instruct-GGUF",
+    filename="smollm2-360m-instruct-q8_0.gguf",
+    verbose=False,
+)
+
+
+def generate(
+    user_prompt: str,
+    system_prompt: str = "You are a helpful assistant.",
+    max_tokens: int = 4000,
+    temperature: float = 0.2,
+    top_p: float = 0.95,
+    top_k: int = 40,
+    presence_penalty: float = 0.0,
+    frequency_penalty: float = 0.0,
+):
     messages = [
-        {
-            "role": "system",
-            "content": "You will receive a query and context. Only return the answer based on the context without mentioning the context.",
-        },
-        {"role": "user", "content": query},
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt},
     ]
-    completion = inference_client.chat.completions.create(
-        model="HuggingFaceTB/SmolLM2-360M-Instruct", messages=messages, max_tokens=2000
-    )
-
-    return completion.choices[0].message
-
-
-def rag_pipeline(query: str, k_retrieved: int = 10, k_reranked: int = 5):
-    documents = query_and_rerank_documents(query, k_retrieved=k_retrieved)
-    query_with_context = (
-        f"Context: {documents['chunk'].to_list()[:k_reranked]}\n\nQuery: {query}"
+    return llm.create_chat_completion(
+        messages,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        presence_penalty=presence_penalty,
+        frequency_penalty=frequency_penalty,
     )
-    return generate_response_api(query_with_context).content, documents
-
 
 with gr.Blocks() as demo:
-    gr.Markdown("""# RAG Hub Datasets
+    gr.Markdown("""# RAG - generate
 
-Part of [smol blueprint](https://github.com/davidberenstein1957/smol-blueprint) - a smol blueprint for AI development, focusing on practical examples of RAG, information extraction, analysis and fine-tuning in the age of LLMs.""")
+Generate a response to a query using [HuggingFaceTB/SmolLM2-360M-Instruct and llama-cpp-python](https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct-GGUF?library=llama-cpp-python).
+
+Part of [ai-blueprint](https://github.com/davidberenstein1957/ai-blueprint) - a blueprint for AI development, focusing on applied examples of RAG, information extraction, analysis and fine-tuning in the age of LLMs and agents.""")
 
     with gr.Row():
-        query_input = gr.Textbox(
-            label="Query", placeholder="Enter your question here...", lines=3
-        )
+        system_prompt = gr.Textbox(label="System prompt", lines=3)
+        user_prompt = gr.Textbox(label="Query", lines=3)
 
-    with gr.Row():
-        with gr.Column():
-            retrieve_slider = gr.Slider(
-                minimum=1,
-                maximum=20,
-                value=10,
-                label="Number of documents to retrieve",
-            )
-        with gr.Column():
-            rerank_slider = gr.Slider(
-                minimum=1,
-                maximum=10,
-                value=5,
-                label="Number of documents to use after reranking",
-            )
+    with gr.Accordion("kwargs"):
+        with gr.Row(variant="panel"):
+            max_tokens = gr.Number(label="Max tokens", value=512)
+            temperature = gr.Number(label="Temperature", value=0.2)
+            top_p = gr.Number(label="Top p", value=0.95)
+            top_k = gr.Number(label="Top k", value=40)
 
     submit_btn = gr.Button("Submit")
     response_output = gr.Textbox(label="Response", lines=10)
@@ -79,9 +61,16 @@ with gr.Blocks() as demo:
     )
 
     submit_btn.click(
-        fn=rag_pipeline,
-        inputs=[query_input, retrieve_slider, rerank_slider],
-        outputs=[response_output, documents_output],
+        fn=generate,
+        inputs=[
+            user_prompt,
+            system_prompt,
+            max_tokens,
+            temperature,
+            top_p,
+            top_k,
+        ],
+        outputs=[response_output],
     )
 
     demo.launch()
```
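
A note on the new `generate`: it returns the full completion dict from llama-cpp-python, so the `Response` textbox will display the dict's repr rather than just the answer text. A minimal sketch of pulling out only the generated text, assuming llama-cpp-python's usual OpenAI-style return shape (illustrative, not part of the commit):

```python
from llama_cpp import Llama

# Same checkpoint the commit loads: a q8_0-quantized SmolLM2-360M-Instruct.
llm = Llama.from_pretrained(
    repo_id="HuggingFaceTB/SmolLM2-360M-Instruct-GGUF",
    filename="smollm2-360m-instruct-q8_0.gguf",
    verbose=False,
)

completion = llm.create_chat_completion(
    [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What does RAG stand for?"},
    ],
    max_tokens=256,
)

# create_chat_completion returns an OpenAI-style dict; the generated text
# lives under choices[0]["message"]["content"].
print(completion["choices"][0]["message"]["content"])
```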
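
The previous version consumed another Space through `gradio_client`, and the same pattern works for consuming this app once it is deployed. A sketch under stated assumptions: the Space URL below is hypothetical, and the endpoint name should be confirmed with `client.view_api()` because Gradio derives it from the click handler:

```python
from gradio_client import Client

# Hypothetical Space URL, for illustration only; substitute the real deployment.
client = Client("https://davidberenstein1957-rag-generate.hf.space/")

# Positional inputs mirror the order wired into submit_btn.click above.
result = client.predict(
    "What does RAG stand for?",      # user_prompt
    "You are a helpful assistant.",  # system_prompt
    512,                             # max_tokens
    0.2,                             # temperature
    0.95,                            # top_p
    40,                              # top_k
    api_name="/predict",             # assumed default; verify with client.view_api()
)
print(result)
```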
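
For readers comparing the two sides of the diff: the removed pipeline reranked retrieved chunks with a sentence-transformers `CrossEncoder` before generation. It loaded `sentence-transformers/all-MiniLM-L12-v2`, a bi-encoder embedding checkpoint, for which `CrossEncoder` would have to initialize an untrained scoring head. A standalone sketch of the same rerank-and-sort pattern, with a purpose-trained cross-encoder (`cross-encoder/ms-marco-MiniLM-L-6-v2`, my substitution, not from the commit):

```python
import pandas as pd
from sentence_transformers import CrossEncoder

# Substituted checkpoint: a reranker actually trained for (query, passage)
# scoring, unlike the bi-encoder the removed code loaded.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

query = "What does RAG stand for?"
documents = pd.DataFrame(
    {
        "chunk": [
            "RAG stands for retrieval-augmented generation.",
            "Gradio builds web UIs for Python functions.",
        ]
    }
)

# Score each (query, chunk) pair and sort best-first, as the removed
# query_and_rerank_documents did.
documents["rank"] = reranker.predict([[query, hit] for hit in documents["chunk"]])
print(documents.sort_values(by="rank", ascending=False))
```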