AlexanderKazakov committed
Commit 8b1c859 · Parent(s): 360f505

improve interface and cut documents to fit the context length

Files changed:
- gradio_app/app.py +69 -55
- gradio_app/backend/ChatGptInteractor.py +170 -0
- gradio_app/backend/query_llm.py +55 -19
- gradio_app/templates/context_html_template.j2 +95 -0
- gradio_app/templates/context_template.j2 +20 -0
- settings.py +7 -1
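
The core of the change to gradio_app/app.py below is a trimming loop: retrieved documents are dropped from the end of the ranked list until the rendered prompt, plus a 512-token reserve for the answer, fits the model's context length. A minimal standalone sketch of that idea (the function and parameter names here are illustrative, not part of the commit):

# Illustrative sketch only: drop retrieved documents until the prompt fits
# the model's context length, keeping a reserve for the generated answer.
def fit_documents_to_context(documents, render_prompt, count_tokens,
                             context_length=4096, reserved_for_answer=512):
    docs = list(documents)
    while docs:
        prompt = render_prompt(docs)
        if count_tokens(prompt) + reserved_for_answer < context_length:
            return docs, prompt
        docs.pop()  # discard the lowest-ranked document and try again
    raise ValueError("even a single document does not fit the context length")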
gradio_app/app.py
CHANGED
@@ -11,8 +11,9 @@ from time import perf_counter
 import gradio as gr
 from jinja2 import Environment, FileSystemLoader
 
-from backend.
-from backend.
+from gradio_app.backend.ChatGptInteractor import num_tokens_from_messages
+from gradio_app.backend.query_llm import generate_hf, generate_openai, construct_openai_messages
+from gradio_app.backend.semantic_search import table, embedder
 
 from settings import *
 
@@ -24,23 +25,29 @@ logger = logging.getLogger(__name__)
 env = Environment(loader=FileSystemLoader('gradio_app/templates'))
 
 # Load the templates directly from the environment
-
-
+context_template = env.get_template('context_template.j2')
+context_html_template = env.get_template('context_html_template.j2')
 
 # Examples
-examples = [
-
-
+examples = [
+    'What is BERT?',
+    'Tell me about BERT deep learning model',
+    'What is the capital of China?',
+    'Why is the sky blue?',
+    'Who won the mens world cup in 2014?',
+]
 
 
 def add_text(history, text):
     history = [] if history is None else history
-    history = history + [(text,
+    history = history + [(text, "")]
     return history, gr.Textbox(value="", interactive=False)
 
 
 def bot(history, api_kind):
-    top_k_rank =
+    top_k_rank = 5
+    thresh_dist = 1.2
+    history[-1][1] = ""
     query = history[-1][0]
 
     if not query:
@@ -53,71 +60,78 @@ def bot(history, api_kind):
 
     query_vec = embedder.encode(query)
     documents = table.search(query_vec, vector_column_name=VECTOR_COLUMN_NAME).limit(top_k_rank).to_list()
+    thresh_dist = max(thresh_dist, min(d['_distance'] for d in documents))
+    documents = [d for d in documents if d['_distance'] <= thresh_dist]
     documents = [doc[TEXT_COLUMN_NAME] for doc in documents]
 
     document_time = perf_counter() - document_start
     logger.info(f'Finished Retrieving documents in {round(document_time, 2)} seconds...')
 
-
-
-
-
-
-
-
-
-    elif api_kind is None:
-        gr.Warning("API name was not provided")
-        raise ValueError("API name was not provided")
+    while len(documents) != 0:
+        context = context_template.render(documents=documents)
+        context_html = context_html_template.render(documents=documents)
+        messages = construct_openai_messages(context, history)
+        num_tokens = num_tokens_from_messages(messages, OPENAI_LLM_NAME)
+        if num_tokens + 512 < context_lengths[OPENAI_LLM_NAME]:
+            break
+        documents.pop()
     else:
-        gr.
-        raise ValueError(f"API {api_kind} is not supported")
+        raise gr.Error('Model context length exceeded, reload the page')
 
-
-
-    history
-
+    for part in generate_openai(messages):
+        history[-1][1] += part
+        yield history, context_html
+    else:
+        print('Finished generation stream.')
 
 
 with gr.Blocks() as demo:
-    chatbot = gr.Chatbot(
-        [],
-        elem_id="chatbot",
-        avatar_images=('https://aui.atlassian.com/aui/8.8/docs/images/avatar-person.svg',
-                       'https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.svg'),
-        bubble_full_width=False,
-        show_copy_button=True,
-        show_share_button=True,
-    )
-
     with gr.Row():
-
-
-
-
-
-
-
-
-
+        with gr.Column():
+            chatbot = gr.Chatbot(
+                [],
+                elem_id="chatbot",
+                avatar_images=('https://aui.atlassian.com/aui/8.8/docs/images/avatar-person.svg',
+                               'https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.svg'),
+                bubble_full_width=False,
+                show_copy_button=True,
+                show_share_button=True,
+                height=600,
+            )
+
+            with gr.Row():
+                input_textbox = gr.Textbox(
+                    scale=3,
+                    show_label=False,
+                    placeholder="Enter text and press enter",
+                    container=False,
+                )
+                txt_btn = gr.Button(value="Submit text", scale=1)
+
+            api_kind = gr.Radio(choices=["HuggingFace", "OpenAI"], value="OpenAI", label='Backend')
+
+            # Examples
+            gr.Examples(examples, input_textbox)
+
+        with gr.Column():
+            context_html = gr.HTML()
 
-    prompt_html = gr.HTML()
     # Turn off interactivity while generating if you click
-    txt_msg = txt_btn.click(
-
+    txt_msg = txt_btn.click(
+        add_text, [chatbot, input_textbox], [chatbot, input_textbox], queue=False
+    ).then(
+        bot, [chatbot, api_kind], [chatbot, context_html]
+    )
 
     # Turn it back on
-    txt_msg.then(lambda: gr.Textbox(interactive=True), None, [
+    txt_msg.then(lambda: gr.Textbox(interactive=True), None, [input_textbox], queue=False)
 
     # Turn off interactivity while generating if you hit enter
-    txt_msg =
-        bot, [chatbot, api_kind], [chatbot,
+    txt_msg = input_textbox.submit(add_text, [chatbot, input_textbox], [chatbot, input_textbox], queue=False).then(
+        bot, [chatbot, api_kind], [chatbot, context_html])
 
     # Turn it back on
-    txt_msg.then(lambda: gr.Textbox(interactive=True), None, [
-
-    # Examples
-    gr.Examples(examples, txt)
+    txt_msg.then(lambda: gr.Textbox(interactive=True), None, [input_textbox], queue=False)
 
 demo.queue()
 demo.launch(debug=True)
gradio_app/backend/ChatGptInteractor.py
ADDED
@@ -0,0 +1,170 @@
+import time
+
+import tiktoken
+import openai
+
+
+with open('data/openaikey.txt') as f:
+    OPENAI_KEY = f.read().strip()
+openai.api_key = OPENAI_KEY
+
+
+def num_tokens_from_messages(messages, model):
+    """
+    Return the number of tokens used by a list of messages.
+    https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
+    """
+    try:
+        encoding = tiktoken.encoding_for_model(model)
+    except KeyError:
+        print("Warning: model not found. Using cl100k_base encoding.")
+        encoding = tiktoken.get_encoding("cl100k_base")
+    if model in {
+        "gpt-3.5-turbo-0613",
+        "gpt-3.5-turbo-16k-0613",
+        "gpt-4-0314",
+        "gpt-4-32k-0314",
+        "gpt-4-0613",
+        "gpt-4-32k-0613",
+    }:
+        tokens_per_message = 3
+        tokens_per_name = 1
+    elif model == "gpt-3.5-turbo-0301":
+        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
+        tokens_per_name = -1  # if there's a name, the role is omitted
+    elif "gpt-3.5-turbo" in model:
+        # print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
+        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
+    elif "gpt-4" in model:
+        # print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
+        return num_tokens_from_messages(messages, model="gpt-4-0613")
+    else:
+        raise NotImplementedError(
+            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
+        )
+    num_tokens = 0
+    for message in messages:
+        num_tokens += tokens_per_message
+        for key, value in message.items():
+            num_tokens += len(encoding.encode(value, disallowed_special=()))
+            if key == "name":
+                num_tokens += tokens_per_name
+    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
+    return num_tokens
+
+
+class ChatGptInteractor:
+    def __init__(self, model_name='gpt-3.5-turbo'):
+        self.model_name = model_name
+        self.tokenizer = tiktoken.encoding_for_model(self.model_name)
+
+    def chat_completion_simple(
+            self,
+            *,
+            user_text,
+            system_text=None,
+            max_tokens=None,
+            temperature=None,
+            stream=False,
+    ):
+        return self.chat_completion(
+            self._construct_messages_simple(user_text, system_text),
+            max_tokens=max_tokens,
+            temperature=temperature,
+            stream=stream,
+        )
+
+    def count_tokens_simple(self, *, user_text, system_text=None):
+        return self.count_tokens(self._construct_messages_simple(user_text, system_text))
+
+    @staticmethod
+    def _construct_messages_simple(user_text, system_text=None):
+        messages = []
+        if system_text is not None:
+            messages.append({
+                "role": "system",
+                "content": system_text
+            })
+        messages.append({
+            "role": "user",
+            "content": user_text
+        })
+        return messages
+
+    def chat_completion(
+            self,
+            messages,
+            max_tokens=None,
+            temperature=None,
+            stream=False,
+    ):
+        print(f'Sending request to {self.model_name} stream={stream} ...')
+        t1 = time.time()
+        completion = self._request(
+            model=self.model_name,
+            messages=messages,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            stream=stream,
+        )
+        if stream:
+            return completion
+        t2 = time.time()
+        usage = completion['usage']
+        print(
+            f'Received response: {usage["prompt_tokens"]} in + {usage["completion_tokens"]} out'
+            f' = {usage["total_tokens"]} total tokens. Time: {t2 - t1:3.1f} seconds'
+        )
+        return completion.choices[0].message['content']
+
+    @staticmethod
+    def get_stream_text(stream_part):
+        return stream_part['choices'][0]['delta'].get('content', '')
+
+    def count_tokens(self, messages):
+        return num_tokens_from_messages(messages, self.model_name)
+
+    def _request(self, *args, **kwargs):
+        for _ in range(5):
+            try:
+                completion = openai.ChatCompletion.create(
+                    *args, **kwargs,
+                    request_timeout=100.0,
+                )
+                return completion
+            except (openai.error.Timeout, openai.error.ServiceUnavailableError):
+                continue
+        raise RuntimeError('Failed to connect to OpenAI (timeout error)')
+
+
+if __name__ == '__main__':
+    cgi = ChatGptInteractor()
+
+    for txt in [
+        "Hello World!",
+        "Hello",
+        " World!",
+        "World!",
+        "World",
+        "!",
+        " ",
+        " ",
+        " ",
+        " ",
+        "\n",
+        "\n\t",
+    ]:
+        print(f'`{txt}` | {cgi.tokenizer.encode(txt)}')
+
+    st = 'You are a helpful assistant and an experienced programmer, ' \
+         'answering questions exactly in two rhymed sentences'
+    ut = 'Explain the principle of recursion in programming'
+    print('Count tokens:', cgi.count_tokens_simple(user_text=ut, system_text=st))
+
+    print(cgi.chat_completion_simple(user_text=ut, system_text=st))
+    print('---')
+
+    for part in cgi.chat_completion_simple(user_text=ut, system_text=st, stream=True):
+        print(cgi.get_stream_text(part), end='')
+    print('\n---')
+
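
The num_tokens_from_messages helper added above is what app.py uses to decide whether the current document set still fits the context window. A quick sketch of calling it directly (illustrative only; it requires the tiktoken package and an importable gradio_app package):

from gradio_app.backend.ChatGptInteractor import num_tokens_from_messages

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is BERT?"},
]
# Prints the per-message overhead plus content tokens, counted with the model's encoding
print(num_tokens_from_messages(messages, "gpt-3.5-turbo"))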
gradio_app/backend/query_llm.py
CHANGED
@@ -1,22 +1,20 @@
-import openai
 import gradio as gr
 
-from os import getenv
 from typing import Any, Dict, Generator, List
 
 from huggingface_hub import InferenceClient
 from transformers import AutoTokenizer
+from jinja2 import Environment, FileSystemLoader
 
 from settings import *
+from gradio_app.backend.ChatGptInteractor import *
 
 
-tokenizer = AutoTokenizer.from_pretrained(
+tokenizer = AutoTokenizer.from_pretrained(HF_LLM_NAME)
 
-
-HF_TOKEN = getenv("HUGGING_FACE_HUB_TOKEN")
+HF_TOKEN = None
 
-
-hf_client = InferenceClient(LLM_NAME, token=HF_TOKEN)
+hf_client = InferenceClient(HF_LLM_NAME, token=HF_TOKEN)
 
 
 def format_prompt(message: str, api_kind: str):
@@ -42,7 +40,7 @@ def format_prompt(message: str, api_kind: str):
 
 
 def generate_hf(prompt: str, history: str, temperature: float = 0.9, max_new_tokens: int = 512,
-
+                top_p: float = 0.6, repetition_penalty: float = 1.2) -> Generator[str, None, str]:
     """
     Generate a sequence of tokens based on a given prompt and history using Mistral client.
 
@@ -69,13 +67,13 @@ def generate_hf(prompt: str, history: str, temperature: float = 0.9, max_new_tokens: int = 512,
         'repetition_penalty': repetition_penalty,
         'do_sample': True,
         'seed': 42,
-
-
+    }
+
     formatted_prompt = format_prompt(prompt, "hf")
 
     try:
         stream = hf_client.text_generation(formatted_prompt, **generate_kwargs,
-
+                                           stream=True, details=True, return_full_text=False)
         output = ""
         for response in stream:
             output += response.token.text
@@ -96,8 +94,44 @@ def generate_hf(prompt: str, history: str, temperature: float = 0.9, max_new_tokens: int = 512,
     return "I do not know what happened, but I couldn't understand you."
 
 
-
-
+env = Environment(loader=FileSystemLoader('gradio_app/templates'))
+context_template = env.get_template('context_template.j2')
+start_system_message = context_template.render(documents=[])
+
+
+def construct_openai_messages(context, history):
+    messages = [
+        {
+            "role": "system",
+            "content": start_system_message,
+        },
+    ]
+    for q, a in history:
+        if len(a) == 0:  # the last message
+            messages.append({
+                "role": "system",
+                "content": context,
+            })
+        messages.append({
+            "role": "user",
+            "content": q,
+        })
+        if len(a) != 0:  # some of the previous LLM answers
+            messages.append({
+                "role": "assistant",
+                "content": a,
+            })
+    return messages
+
+
+def generate_openai(messages):
+    cgi = ChatGptInteractor(model_name=OPENAI_LLM_NAME)
+    for part in cgi.chat_completion(messages, max_tokens=512, temperature=0, stream=True):
+        yield cgi.get_stream_text(part)
+
+
+def _generate_openai(prompt: str, history: str, temperature: float = 0.9, max_new_tokens: int = 512,
+                     top_p: float = 0.6, repetition_penalty: float = 1.2) -> Generator[str, None, str]:
     """
     Generate a sequence of tokens based on a given prompt and history using Mistral client.
 
@@ -116,21 +150,23 @@ def generate_openai(prompt: str, history: str, temperature: float = 0.9, max_new_tokens: int = 512,
 
     temperature = max(float(temperature), 1e-2)  # Ensure temperature isn't too low
     top_p = float(top_p)
-
+
     generate_kwargs = {
         'temperature': temperature,
         'max_tokens': max_new_tokens,
        'top_p': top_p,
        'frequency_penalty': max(-2., min(repetition_penalty, 2.)),
-
+    }
 
     formatted_prompt = format_prompt(prompt, "openai")
 
     try:
-        stream = openai.ChatCompletion.create(
-
-
-
+        stream = openai.ChatCompletion.create(
+            model=OPENAI_LLM_NAME,
+            messages=formatted_prompt,
+            **generate_kwargs,
+            stream=True
+        )
        output = ""
        for chunk in stream:
            output += chunk.choices[0].delta.get("content", "")
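
construct_openai_messages, added above, keeps the instruction system prompt first, replays earlier question/answer pairs, and injects the rendered document context as a second system message right before the still-unanswered user turn. A rough usage sketch (the history contents here are invented for illustration, and generate_openai needs a valid key in data/openaikey.txt):

from gradio_app.backend.query_llm import construct_openai_messages, generate_openai

history = [
    ("What is BERT?", "BERT is a bidirectional transformer encoder ..."),  # earlier, answered turn
    ("How was it pretrained?", ""),                                        # current turn, no answer yet
]
messages = construct_openai_messages("(rendered context_template output)", history)
# Resulting order: system instructions, user Q1, assistant A1, system context, user Q2
for part in generate_openai(messages):
    print(part, end="")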
gradio_app/templates/context_html_template.j2
ADDED
@@ -0,0 +1,95 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Information Page</title>
+    <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@400;600&display=swap">
+    <link rel="stylesheet" href="https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;600&display=swap">
+    <style>
+        * {
+            font-family: "Source Sans Pro";
+        }
+
+        .instructions > * {
+            color: #111 !important;
+        }
+
+        details.doc-box * {
+            color: #111 !important;
+        }
+
+        .dark {
+            background: #111;
+            color: white;
+        }
+
+        .doc-box {
+            padding: 10px;
+            margin-top: 10px;
+            background-color: #baecc2;
+            border-radius: 6px;
+            color: #111 !important;
+            max-width: 700px;
+            box-shadow: rgba(0, 0, 0, 0.2) 0px 1px 2px 0px;
+        }
+
+        .doc-full {
+            margin: 10px 14px;
+            line-height: 1.6rem;
+        }
+
+        .instructions {
+            color: #111 !important;
+            background: #b7bdfd;
+            display: block;
+            border-radius: 6px;
+            padding: 6px 10px;
+            line-height: 1.6rem;
+            max-width: 700px;
+            box-shadow: rgba(0, 0, 0, 0.2) 0px 1px 2px 0px;
+        }
+
+        .query {
+            color: #111 !important;
+            background: #ffbcbc;
+            display: block;
+            border-radius: 6px;
+            padding: 6px 10px;
+            line-height: 1.6rem;
+            max-width: 700px;
+            box-shadow: rgba(0, 0, 0, 0.2) 0px 1px 2px 0px;
+        }
+    </style>
+</head>
+<body>
+<div class="prose svelte-1ybaih5" id="context_html">
+    <h2>Context:</h2>
+    {% for doc in documents %}
+    <details class="doc-box">
+        <summary>
+            <b>Doc {{ loop.index }}:</b> <span class="doc-short">{{ doc[:1000] }}...</span>
+        </summary>
+        <div class="doc-full">{{ doc }}</div>
+    </details>
+    {% endfor %}
+</div>
+
+<script>
+    document.addEventListener("DOMContentLoaded", function() {
+        const detailsElements = document.querySelectorAll('.doc-box');
+
+        detailsElements.forEach(detail => {
+            detail.addEventListener('toggle', function() {
+                const docShort = this.querySelector('.doc-short');
+                if (this.open) {
+                    docShort.style.display = 'none';
+                } else {
+                    docShort.style.display = 'inline';
+                }
+            });
+        });
+    });
+</script>
+</body>
+</html>
gradio_app/templates/context_template.j2
ADDED
@@ -0,0 +1,20 @@
+You are a helpful assistant.
+
+You answer questions based only on the provided information.
+
+If there is no relevant information in the context, just say "No relevant information".
+
+You must not make up an answer! Use only provided context!
+
+In each answer, you must provide a precise citation from the given context in double quotes.
+
+Citation is mandatory in the answer!
+
+Context:
+
+{% for doc in documents %}
+---
+
+{{ doc }}
+
+{% endfor %}
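
This plain-text template doubles as the system prompt: query_llm.py renders it with an empty document list to produce start_system_message, and app.py re-renders it with the retrieved documents to build the per-query context. A small rendering sketch (assuming it is run from the repository root so the template path resolves):

from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader('gradio_app/templates'))
context_template = env.get_template('context_template.j2')

# With no documents only the instruction text is produced; with documents,
# each one is appended under "Context:", separated by "---" lines.
print(context_template.render(documents=[]))
print(context_template.render(documents=["BERT is a deep bidirectional transformer ..."]))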
settings.py
CHANGED
@@ -5,4 +5,10 @@ LANCEDB_DIRECTORY = "data/lancedb"
 LANCEDB_TABLE_NAME = "table"
 VECTOR_COLUMN_NAME = "embedding"
 TEXT_COLUMN_NAME = "text"
-
+HF_LLM_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
+OPENAI_LLM_NAME = "gpt-3.5-turbo"
+
+context_lengths = {
+    "mistralai/Mistral-7B-Instruct-v0.1": 4096,
+    "gpt-3.5-turbo": 4096,
+}