Spaces:

sergey21000
/

gradio-llamacpp-chatbot

Sleeping

App Files Files Community

sergey21000 committed on Oct 12, 2024

Commit

ef59540

verified ·

1 Parent(s): dc668d5

Update app.py

Browse files

Files changed (1) hide show

app.py +175 -303

app.py CHANGED Viewed

@@ -1,64 +1,128 @@
-from typing import List, Optional
 import gradio as gr
-from langchain_core.vectorstores import VectorStore
-from config import (
-    LLM_MODEL_REPOS,
-    EMBED_MODEL_REPOS,
-    SUBTITLES_LANGUAGES,
-    GENERATE_KWARGS,
-)
-from utils import (
-    load_llm_model,
-    load_embed_model,
-    load_documents_and_create_db,
-    user_message_to_chatbot,
-    update_user_message_with_context,
-    get_llm_response,
-    get_gguf_model_names,
-    add_new_model_repo,
-    clear_llm_folder,
-    clear_embed_folder,
-    get_memory_usage,
-)
-# ============ INTERFACE COMPONENT INITIALIZATION FUNCS ============
-def get_rag_settings(rag_mode: bool, render: bool = True):
-    k = gr.Radio(
-        choices=[1, 2, 3, 4, 5, 'all'],
-        value=2,
-        label='Number of relevant documents for search',
-        visible=rag_mode,
-        render=render,
-        )
-    score_threshold = gr.Slider(
-        minimum=0,
-        maximum=1,
-        value=0.5,
-        step=0.05,
-        label='relevance_scores_threshold',
-        visible=rag_mode,
-        render=render,
-        )
-    return k, score_threshold
-def get_user_message_with_context(text: str, rag_mode: bool) -> gr.component:
-    num_lines = len(text.split('\n'))
-    max_lines = 10
-    num_lines = max_lines if num_lines > max_lines else num_lines
-    return gr.Textbox(
-        text,
-        visible=rag_mode,
-        interactive=False,
-        label='User Message With Context',
-        lines=num_lines,
         )
 def get_system_prompt_component(interactive: bool) -> gr.Textbox:
     value = '' if interactive else 'System prompt is not supported by this model'
@@ -66,28 +130,32 @@ def get_system_prompt_component(interactive: bool) -> gr.Textbox:
 def get_generate_args(do_sample: bool) -> List[gr.component]:
     generate_args = [
-        gr.Slider(minimum=0.1, maximum=3, value=GENERATE_KWARGS['temperature'], step=0.1, label='temperature', visible=do_sample),
-        gr.Slider(minimum=0.1, maximum=1, value=GENERATE_KWARGS['top_p'], step=0.01, label='top_p', visible=do_sample),
-        gr.Slider(minimum=1, maximum=50, value=GENERATE_KWARGS['top_k'], step=1, label='top_k', visible=do_sample),
-        gr.Slider(minimum=1, maximum=5, value=GENERATE_KWARGS['repeat_penalty'], step=0.1, label='repeat_penalty', visible=do_sample),
     ]
     return generate_args
-def get_rag_mode_component(db: Optional[VectorStore]) -> gr.Checkbox:
-    value = visible = db is not None
-    return gr.Checkbox(value=value, label='RAG Mode', scale=1, visible=visible)
-# ================ LOADING AND INITIALIZING MODELS ========================
-start_llm_model, start_support_system_role, load_log = load_llm_model(LLM_MODEL_REPOS[0], 'gemma-2-2b-it-Q8_0.gguf')
-start_embed_model, load_log = load_embed_model(EMBED_MODEL_REPOS[0])
-# ================== APPLICATION WEB INTERFACE ============================
 theme = gr.themes.Base(primary_hue='green', secondary_hue='yellow', neutral_hue='zinc').set(
     loader_color='rgb(0, 255, 0)',
@@ -97,32 +165,23 @@ theme = gr.themes.Base(primary_hue='green', secondary_hue='yellow', neutral_hue=
 )
 css = '''.gradio-container {width: 60% !important}'''
-with gr.Blocks(theme=theme, css=css) as interface:
-    # ==================== GRADIO STATES ===============================
-    documents = gr.State([])
-    db = gr.State(None)
-    user_message_with_context = gr.State('')
     support_system_role = gr.State(start_support_system_role)
-    llm_model_repos = gr.State(LLM_MODEL_REPOS)
-    embed_model_repos = gr.State(EMBED_MODEL_REPOS)
-    llm_model = gr.State(start_llm_model)
-    embed_model = gr.State(start_embed_model)
-    # ==================== BOT PAGE =================================
-    with gr.Tab(label='Chatbot'):
         with gr.Row():
             with gr.Column(scale=3):
                 chatbot = gr.Chatbot(
                     type='messages',  # new in gradio 5+
-                    show_copy_button=True,
-                    bubble_full_width=False,
                     height=480,
-                )
                 user_message = gr.Textbox(label='User')
                 with gr.Row():
@@ -130,14 +189,14 @@ with gr.Blocks(theme=theme, css=css) as interface:
                     stop_btn = gr.Button('Stop')
                     clear_btn = gr.Button('Clear')
-            # ------------- GENERATION PARAMETERS -------------------
             with gr.Column(scale=1, min_width=80):
                 with gr.Group():
-                    gr.Markdown('History size')
                     history_len = gr.Slider(
                         minimum=0,
-                        maximum=5,
                         value=0,
                         step=1,
                         info='Number of previous messages taken into account in history',
@@ -158,243 +217,56 @@ with gr.Blocks(theme=theme, css=css) as interface:
                             inputs=do_sample,
                             outputs=generate_args,
                             show_progress=False,
-                            )
-        rag_mode = get_rag_mode_component(db=db.value)
-        k, score_threshold = get_rag_settings(rag_mode=rag_mode.value, render=False)
-        rag_mode.change(
-            fn=get_rag_settings,
-            inputs=[rag_mode],
-            outputs=[k, score_threshold],
-            )
-        with gr.Row():
-            k.render()
-            score_threshold.render()
-        # ---------------- SYSTEM PROMPT AND USER MESSAGE -----------
-        with gr.Accordion('Prompt', open=True):
-            system_prompt = get_system_prompt_component(interactive=support_system_role.value)
-            user_message_with_context = get_user_message_with_context(text='', rag_mode=rag_mode.value)
-        # ---------------- SEND, CLEAR AND STOP BUTTONS ------------
         generate_event = gr.on(
             triggers=[user_message.submit, user_message_btn.click],
             fn=user_message_to_chatbot,
             inputs=[user_message, chatbot],
             outputs=[user_message, chatbot],
-            queue=False,
-        ).then(
-            fn=update_user_message_with_context,
-            inputs=[chatbot, rag_mode, db, k, score_threshold],
-            outputs=[user_message_with_context],
-        ).then(
-            fn=get_user_message_with_context,
-            inputs=[user_message_with_context, rag_mode],
-            outputs=[user_message_with_context],
         ).then(
-            fn=get_llm_response,
-            inputs=[chatbot, llm_model, user_message_with_context, rag_mode, system_prompt,
-                    support_system_role, history_len, do_sample, *generate_args],
             outputs=[chatbot],
         )
         stop_btn.click(
             fn=None,
             inputs=None,
             outputs=None,
             cancels=generate_event,
-            queue=False,
         )
         clear_btn.click(
-            fn=lambda: (None, ''),
             inputs=None,
-            outputs=[chatbot, user_message_with_context],
-            queue=False,
-            )
-    # ================= FILE DOWNLOAD PAGE =========================
-    with gr.Tab(label='Load documents'):
-        with gr.Row(variant='compact'):
-            upload_files = gr.File(file_count='multiple', label='Loading text files')
-            web_links = gr.Textbox(lines=6, label='Links to Web sites or YouTube')
-        with gr.Row(variant='compact'):
-            chunk_size = gr.Slider(50, 2000, value=500, step=50, label='Chunk size')
-            chunk_overlap = gr.Slider(0, 200, value=20, step=10, label='Chunk overlap')
-            subtitles_lang = gr.Radio(
-                SUBTITLES_LANGUAGES,
-                value=SUBTITLES_LANGUAGES[0],
-                label='YouTube subtitle language',
-                )
-        load_documents_btn = gr.Button(value='Upload documents and initialize database')
-        load_docs_log = gr.Textbox(label='Status of loading and splitting documents', interactive=False)
-        load_documents_btn.click(
-            fn=load_documents_and_create_db,
-            inputs=[upload_files, web_links, subtitles_lang, chunk_size, chunk_overlap, embed_model],
-            outputs=[documents, db, load_docs_log],
-        ).success(
-            fn=get_rag_mode_component,
-            inputs=[db],
-            outputs=[rag_mode],
-        )
-        gr.HTML("""<h3 style='text-align: center'>
-        <a href="https://github.com/sergey21000/chatbot-rag" target='_blank'>GitHub Repository</a></h3>
-        """)
-    # ================= VIEW PAGE FOR ALL DOCUMENTS =================
-    with gr.Tab(label='View documents'):
-        view_documents_btn = gr.Button(value='Show downloaded text chunks')
-        view_documents_textbox = gr.Textbox(
-            lines=1,
-            placeholder='To view chunks, load documents in the Load documents tab',
-            label='Uploaded chunks',
             )
-        sep = '=' * 20
-        view_documents_btn.click(
-            lambda documents: f'\n{sep}\n\n'.join([doc.page_content for doc in documents]),
-            inputs=[documents],
-            outputs=[view_documents_textbox],
-        )
-    # ============== GGUF MODELS DOWNLOAD PAGE =====================
-    with gr.Tab('Load LLM model'):
-        new_llm_model_repo = gr.Textbox(
             value='',
-            label='Add repository',
-            placeholder='Link to repository of HF models in GGUF format',
-            )
-        new_llm_model_repo_btn = gr.Button('Add repository')
-        curr_llm_model_repo = gr.Dropdown(
-            choices=LLM_MODEL_REPOS,
-            value=None,
-            label='HF Model Repository',
-            )
-        curr_llm_model_path = gr.Dropdown(
-            choices=[],
-            value=None,
-            label='GGUF model file',
             )
-        load_llm_model_btn = gr.Button('Loading and initializing model')
-        load_llm_model_log = gr.Textbox(
-            value=f'Model {LLM_MODEL_REPOS[0]} loaded at application startup',
             label='Model loading status',
-            lines=6,
             )
-        with gr.Group():
-            gr.Markdown('Free up disk space by deleting all models except the currently selected one')
-            clear_llm_folder_btn = gr.Button('Clear folder')
-        new_llm_model_repo_btn.click(
-            fn=add_new_model_repo,
-            inputs=[new_llm_model_repo, llm_model_repos],
-            outputs=[curr_llm_model_repo, load_llm_model_log],
         ).success(
-            fn=lambda: '',
-            inputs=None,
-            outputs=[new_llm_model_repo],
-        )
-        curr_llm_model_repo.change(
-            fn=get_gguf_model_names,
-            inputs=[curr_llm_model_repo],
-            outputs=[curr_llm_model_path],
-        )
-        load_llm_model_btn.click(
-            fn=load_llm_model,
-            inputs=[curr_llm_model_repo, curr_llm_model_path],
-            outputs=[llm_model, support_system_role, load_llm_model_log],
-        ).success(
-            fn=lambda log: log + get_memory_usage(),
-            inputs=[load_llm_model_log],
-            outputs=[load_llm_model_log],
-        ).then(
             fn=get_system_prompt_component,
             inputs=[support_system_role],
             outputs=[system_prompt],
         )
-        clear_llm_folder_btn.click(
-            fn=clear_llm_folder,
-            inputs=[curr_llm_model_path],
-            outputs=None,
-        ).success(
-            fn=lambda model_path: f'Models other than {model_path} removed',
-            inputs=[curr_llm_model_path],
-            outputs=None,
-        )
-    # ============== EMBEDDING MODELS DOWNLOAD PAGE =============
-    with gr.Tab('Load embed model'):
-        new_embed_model_repo = gr.Textbox(
-            value='',
-            label='Add repository',
-            placeholder='Link to HF model repository',
-            )
-        new_embed_model_repo_btn = gr.Button('Add repository')
-        curr_embed_model_repo = gr.Dropdown(
-            choices=EMBED_MODEL_REPOS,
-            value=None,
-            label='HF model repository',
-            )
-        load_embed_model_btn = gr.Button('Loading and initializing model')
-        load_embed_model_log = gr.Textbox(
-            value=f'Model {EMBED_MODEL_REPOS[0]} loaded at application startup',
-            label='Model loading status',
-            lines=7,
-            )
-        with gr.Group():
-            gr.Markdown('Free up disk space by deleting all models except the currently selected one')
-            clear_embed_folder_btn = gr.Button('Clear folder')
-        new_embed_model_repo_btn.click(
-            fn=add_new_model_repo,
-            inputs=[new_embed_model_repo, embed_model_repos],
-            outputs=[curr_embed_model_repo, load_embed_model_log],
-        ).success(
-            fn=lambda: '',
-            inputs=None,
-            outputs=new_embed_model_repo,
-        )
-        load_embed_model_btn.click(
-            fn=load_embed_model,
-            inputs=[curr_embed_model_repo],
-            outputs=[embed_model, load_embed_model_log],
-        ).success(
-            fn=lambda log: log + get_memory_usage(),
-            inputs=[load_embed_model_log],
-            outputs=[load_embed_model_log],
-        )
-        clear_embed_folder_btn.click(
-            fn=clear_embed_folder,
-            inputs=[curr_embed_model_repo],
-            outputs=None,
-        ).success(
-            fn=lambda model_repo: f'Models other than {model_repo} removed',
-            inputs=[curr_embed_model_repo],
-            outputs=None,
-        )
-interface.launch(server_name='0.0.0.0', server_port=7860)  # debug=True

+from pathlib import Path
+from shutil import rmtree
+from typing import Union, List, Dict, Tuple, Optional
+from tqdm import tqdm
+import requests
 import gradio as gr
+from llama_cpp import Llama
+# ================== ANNOTATIONS ========================
+CHAT_HISTORY = List[Optional[Dict[str, Optional[str]]]]
+MODEL_DICT = Dict[str, Llama]
+# ================== FUNCS =============================
+def download_file(file_url: str, file_path: Union[str, Path]) -> None:
+    """Stream ``file_url`` to ``file_path``, reporting progress to tqdm and Gradio.
+
+    The payload is written to a temporary ``<name>.part`` file that is renamed
+    to ``file_path`` only after the whole body has been received, so an
+    interrupted download never leaves a truncated file at the final path.
+
+    Args:
+        file_url: Direct download URL of the file.
+        file_path: Destination path; an existing file is replaced.
+
+    Raises:
+        Exception: If the server responds with a status other than 200.
+        requests.RequestException: On connection errors or timeouts.
+        OSError: If the destination cannot be written.
+    """
+    file_path = Path(file_path)
+    part_path = file_path.with_name(file_path.name + '.part')
+    # Without a timeout a stalled server would block the caller forever.
+    with requests.get(file_url, stream=True, timeout=(10, 60)) as response:
+        if response.status_code != 200:
+            raise Exception(f'Файл недоступен для скачивания по ссылке: {file_url}')
+        # Servers may omit Content-Length (e.g. chunked transfer); 0 means unknown.
+        total_size = int(response.headers.get('content-length', 0))
+        progress_gradio = gr.Progress()
+        completed_size = 0
+        try:
+            with open(part_path, 'wb') as file, tqdm(
+                desc='Loading GGUF file', total=total_size, unit='iB', unit_scale=True,
+            ) as progress_tqdm:
+                for data in response.iter_content(chunk_size=4096):
+                    size = file.write(data)
+                    progress_tqdm.update(size)
+                    completed_size += size
+                    desc = f'Loading GGUF file, {completed_size/1024**3:.3f}/{total_size/1024**3:.3f} GB'
+                    # Guard against division by zero when the size is unknown and
+                    # against a fraction above 1 when Content-Length is inaccurate.
+                    fraction = min(completed_size / total_size, 1.0) if total_size else 0.0
+                    progress_gradio(fraction, desc=desc)
+            part_path.replace(file_path)
+        finally:
+            # No-op after a successful rename; removes the partial file otherwise.
+            part_path.unlink(missing_ok=True)
+def _supports_system_role(model: Llama) -> bool:
+    """Return False only when the model's chat template rejects the system role."""
+    # A GGUF file without an embedded chat template is treated as supporting
+    # the system role instead of raising KeyError.
+    chat_template = model.metadata.get('tokenizer.chat_template', '')
+    return 'System role not supported' not in chat_template
+def download_gguf_and_init_model(gguf_url: str, model_dict: MODEL_DICT) -> Tuple[MODEL_DICT, bool, str]:
+    """Download a GGUF file (if not already present) and initialize a Llama model.
+
+    Every code path returns the same three values, because callers unpack the
+    result into three targets (model state, system-role flag, log text).
+
+    Args:
+        gguf_url: Direct link to a ``.gguf`` file.
+        model_dict: Current model holder; the active model, if any, is stored
+            under the ``'model'`` key.
+
+    Returns:
+        A tuple ``(model_dict, support_system_role, log)``. On failure the
+        incoming ``model_dict`` is returned unchanged and the flag describes
+        the model that is still loaded (``False`` when there is none).
+    """
+    log = ''
+    curr_model = model_dict.get('model')
+    # Flag to report when the current model is kept because loading failed.
+    curr_support_system_role = curr_model is not None and _supports_system_role(curr_model)
+    if not gguf_url.endswith('.gguf'):
+        log += 'The link must be a direct link to the GGUF file\n'
+        return model_dict, curr_support_system_role, log
+    gguf_filename = gguf_url.rsplit('/', 1)[-1]
+    model_path = MODELS_PATH / gguf_filename
+    progress = gr.Progress()
+    if not model_path.is_file():
+        progress(0.3, desc='Шаг 1/2: Loading GGUF model file')
+        try:
+            download_file(gguf_url, model_path)
+            log += f'Model file {gguf_filename} successfully loaded\n'
+        except Exception as ex:
+            log += f'Error loading model from link {gguf_url}, error code:\n{ex}\n'
+            if curr_model is None:
+                log += 'Model is missing from dictionary "model_dict"\n'
+                return model_dict, curr_support_system_role, log
+            curr_model_filename = Path(curr_model.model_path).name
+            log += f'Current initialized model: {curr_model_filename}\n'
+            return model_dict, curr_support_system_role, log
+    else:
+        log += f'Model file {gguf_filename} loaded, initializing model...\n'
+    progress(0.7, desc='Шаг 2/2: Model initialization')
+    model = Llama(model_path=str(model_path), n_gpu_layers=-1, verbose=True)
+    # A fresh dict is returned rather than mutating the caller's one.
+    model_dict = {'model': model}
+    support_system_role = _supports_system_role(model)
+    log += f'Model {gguf_filename} initialized\n'
+    return model_dict, support_system_role, log
+def user_message_to_chatbot(user_message: str, chatbot: CHAT_HISTORY) -> Tuple[str, CHAT_HISTORY]:
+    """Append a non-empty user message to the chat history.
+
+    Returns an empty string (used to clear the input box) together with the
+    same ``chatbot`` list, which is extended in place.
+    """
+    if not user_message:
+        # Nothing typed: leave the history untouched.
+        return '', chatbot
+    user_entry = {'role': 'user', 'metadata': {'title': None}, 'content': user_message}
+    chatbot.append(user_entry)
+    return '', chatbot
+def bot_response_to_chatbot(
+        chatbot: CHAT_HISTORY,
+        model_dict: MODEL_DICT,
+        system_prompt: str,
+        support_system_role: bool,
+        history_len: int,
+        do_sample: bool,
+        *generate_args,
+        ):
+    """Stream the model's reply into the chat history.
+
+    Generator: yields the ``chatbot`` list after every received token so the
+    UI can update incrementally. The list is mutated in place.
+
+    Args:
+        chatbot: Chat history; its last entry is the message to answer.
+        model_dict: Holder of the active model under the ``'model'`` key.
+        system_prompt: Text sent with the ``system`` role when supported.
+        support_system_role: Whether a system message may be sent.
+        history_len: Number of previous exchanges to include; the slice below
+            takes ``history_len * 2`` earlier messages.
+        do_sample: When False, sampling parameters are overridden below.
+        *generate_args: Values paired positionally with the keys of
+            ``GENERATE_KWARGS``.
+    """
+    model = model_dict.get('model')
+    if model is None:
+        gr.Info('Model not initialized')
+        yield chatbot
+        return
+    # Nothing to answer: empty history, or the last message is already a reply.
+    if len(chatbot) == 0 or chatbot[-1]['role'] == 'assistant':
+        yield chatbot
+        return
+    messages = []
+    if support_system_role and system_prompt:
+        messages.append({'role': 'system', 'metadata': {'title': None}, 'content': system_prompt})
+    if history_len != 0:
+        # Everything before the latest message, limited to the last history_len*2 entries.
+        messages.extend(chatbot[:-1][-(history_len*2):])
+    messages.append(chatbot[-1])
+    # Relies on generate_args arriving in the same order as GENERATE_KWARGS' keys.
+    gen_kwargs = dict(zip(GENERATE_KWARGS.keys(), generate_args))
+    gen_kwargs['top_k'] = int(gen_kwargs['top_k'])
+    if not do_sample:
+        # Sampling disabled: top_p, top_k and repeat_penalty are overridden;
+        # temperature is left as received.
+        gen_kwargs['top_p'] = 0.0
+        gen_kwargs['top_k'] = 1
+        gen_kwargs['repeat_penalty'] = 1.0
+    stream_response = model.create_chat_completion(
+        messages=messages,
+        stream=True,
+        **gen_kwargs,
         )
+    # Placeholder assistant message that is filled in token by token.
+    chatbot.append({'role': 'assistant', 'metadata': {'title': None}, 'content': ''})
+    for chunk in stream_response:
+        # Chunks whose delta has no 'content' key are skipped.
+        token = chunk['choices'][0]['delta'].get('content')
+        if token is not None:
+            chatbot[-1]['content'] += token
+            yield chatbot
 def get_system_prompt_component(interactive: bool) -> gr.Textbox:
     value = '' if interactive else 'System prompt is not supported by this model'
 def get_generate_args(do_sample: bool) -> List[gr.component]:
+    """Build the generation-parameter sliders, shown only when ``do_sample`` is true.
+
+    The sliders are created in the order temperature, top_p, top_k,
+    repeat_penalty, with initial values taken from ``GENERATE_KWARGS``.
+    """
+    visible = do_sample
     generate_args = [
+        gr.Slider(label='temperature', value=GENERATE_KWARGS['temperature'], minimum=0.1, maximum=3, step=0.1, visible=visible),
+        gr.Slider(label='top_p', value=GENERATE_KWARGS['top_p'], minimum=0.1, maximum=1, step=0.1, visible=visible),
+        gr.Slider(label='top_k', value=GENERATE_KWARGS['top_k'], minimum=1, maximum=50, step=5, visible=visible),
+        gr.Slider(label='repeat_penalty', value=GENERATE_KWARGS['repeat_penalty'], minimum=1, maximum=5, step=0.1, visible=visible),
     ]
     return generate_args
+# ================== VARIABLES =============================
+# Directory for downloaded GGUF files; created on import if it does not exist.
+MODELS_PATH = Path('models')
+MODELS_PATH.mkdir(exist_ok=True)
+DEFAULT_GGUF_URL = 'https://huggingface.co/bartowski/gemma-2-2b-it-GGUF/resolve/main/gemma-2-2b-it-Q8_0.gguf'
+# The default model is downloaded (if missing) and initialized at import time,
+# before the interface is built.
+start_model_dict, start_support_system_role, start_load_log = download_gguf_and_init_model(
+    gguf_url=DEFAULT_GGUF_URL, model_dict={},
+    )
+# Default generation parameters. The key order matters: bot_response_to_chatbot
+# zips these keys with the slider values produced by get_generate_args.
+GENERATE_KWARGS = dict(
+    temperature=0.2,
+    top_p=0.95,
+    top_k=40,
+    repeat_penalty=1.0,
+    )
 theme = gr.themes.Base(primary_hue='green', secondary_hue='yellow', neutral_hue='zinc').set(
     loader_color='rgb(0, 255, 0)',
 )
 css = '''.gradio-container {width: 60% !important}'''
+# ================== INTERFACE =============================
+with gr.Blocks(theme=theme, css=css) as interface:
+    model_dict = gr.State(start_model_dict)
     support_system_role = gr.State(start_support_system_role)
+    # ================= CHAT BOT PAGE ======================
+    with gr.Tab('Chatbot'):
         with gr.Row():
             with gr.Column(scale=3):
                 chatbot = gr.Chatbot(
                     type='messages',  # new in gradio 5+
+                    show_copy_button=True,
+                    bubble_full_width=False,
                     height=480,
+                    )
                 user_message = gr.Textbox(label='User')
                 with gr.Row():
                     stop_btn = gr.Button('Stop')
                     clear_btn = gr.Button('Clear')
+                system_prompt = get_system_prompt_component(interactive=support_system_role.value)
             with gr.Column(scale=1, min_width=80):
                 with gr.Group():
+                    gr.Markdown('Length of message history')
                     history_len = gr.Slider(
                         minimum=0,
+                        maximum=10,
                         value=0,
                         step=1,
                         info='Number of previous messages taken into account in history',
                             inputs=do_sample,
                             outputs=generate_args,
                             show_progress=False,
+                        )
         generate_event = gr.on(
             triggers=[user_message.submit, user_message_btn.click],
             fn=user_message_to_chatbot,
             inputs=[user_message, chatbot],
             outputs=[user_message, chatbot],
         ).then(
+            fn=bot_response_to_chatbot,
+            inputs=[chatbot, model_dict, system_prompt, support_system_role, history_len, do_sample, *generate_args],
             outputs=[chatbot],
         )
         stop_btn.click(
             fn=None,
             inputs=None,
             outputs=None,
             cancels=generate_event,
         )
         clear_btn.click(
+            fn=lambda: None,
             inputs=None,
+            outputs=[chatbot],
             )
+    # ================= LOAD MODELS PAGE ======================
+    with gr.Tab('Load model'):
+        gguf_url = gr.Textbox(
             value='',
+            label='Link to GGUF',
+            placeholder='URL link to the model in GGUF format',
             )
+        load_model_btn = gr.Button('Downloading GGUF and initializing the model')
+        load_log = gr.Textbox(
+            value=start_load_log,
             label='Model loading status',
+            lines=3,
             )
+        load_model_btn.click(
+            fn=download_gguf_and_init_model,
+            inputs=[gguf_url, model_dict],
+            outputs=[model_dict, support_system_role, load_log],
         ).success(
             fn=get_system_prompt_component,
             inputs=[support_system_role],
             outputs=[system_prompt],
         )
+        gr.HTML("""<h3 style='text-align: center'>
+        <a href="https://github.com/sergey21000/gradio-llamacpp-chatbot" target='_blank'>GitHub Repository</a></h3>
+        """)
+interface.launch(server_name='0.0.0.0', server_port=7860)