OpenSourceRonin committed
Commit 746ca46
Parent: f9e7dbf

build with model selection

Files changed (2):
  1. app.py +180 -54
  2. requirements.txt +3 -1
app.py CHANGED
@@ -1,39 +1,157 @@
 import spaces
-import gradio as gr
-from huggingface_hub import InferenceClient
-
-from vptq.app_utils import get_chat_loop_generator
-
-# Update model list with annotations
-model_list_with_annotations = {
-    # "VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-65536-woft": "Llama 3.1 70B @ 4bit",
-    # "VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-256-woft": "Llama 3.1 70B @ 3bit",
-    # "VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-65536-woft": "Llama 3.1 70B @ 2bit",
-    # "VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-65536-woft": "Qwen2.5 72B @ 4 bits",
-    # "VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-256-woft": "Qwen2.5 72B @ 3 bits",
-    # "VPTQ-community/Qwen2.5-72B-Instruct-v16-k65536-65536-woft": "Qwen2.5 72B @ 3 bits",
-    # "VPTQ-community/Qwen2.5-32B-Instruct-v8-k65536-65536-woft": "Qwen2.5 32B @ 4 bits",
-    "VPTQ-community/Qwen2.5-32B-Instruct-v8-k65536-256-woft": "Qwen2.5 32B @ 3 bits",
-    "VPTQ-community/Qwen2.5-32B-Instruct-v16-k65536-0-woft": "Qwen2.5 32B @ 2 bits"
-}
-
-# Create a list of choices with annotations for the dropdown
-model_list_with_annotations_display = [f"{key} ({value})" for key, value in model_list_with_annotations.items()]
-
-model_keys = list(model_list_with_annotations.keys())
-current_model_g = model_keys[0]
-chat_completion = get_chat_loop_generator(current_model_g)
-
-@spaces.GPU
-def update_title_and_chatmodel(model):
-    model = str(model)
-    global chat_completion
-    global current_model_g
-    if model != current_model_g:
-        current_model_g = model
-        chat_completion = get_chat_loop_generator(current_model_g)
-    return model
+import os
+import threading
+from collections import deque
+
+import plotly.graph_objs as go
+import pynvml
+
+import gradio as gr
+from huggingface_hub import snapshot_download
+
+from vptq.app_utils import get_chat_loop_generator
+
+models = [
+    {
+        "name": "VPTQ-community/Meta-Llama-3.1-8B-Instruct-v8-k65536-65536-woft",
+        "bits": "4 bits"
+    },
+    {
+        "name": "VPTQ-community/Meta-Llama-3.1-8B-Instruct-v8-k65536-256-woft",
+        "bits": "3 bits"
+    },
+]
+
+# Queues for storing historical data (the last 100 GPU utilization and memory usage values)
+gpu_util_history = deque(maxlen=100)
+mem_usage_history = deque(maxlen=100)
+
+
+def initialize_nvml():
+    """
+    Initialize NVML (NVIDIA Management Library).
+    """
+    pynvml.nvmlInit()
+
+
+def get_gpu_info():
+    """
+    Get GPU utilization and memory usage information.
+
+    Returns:
+        dict: A dictionary containing GPU utilization and memory usage information.
+    """
+    handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # Assuming a single-GPU setup
+    utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
+    memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
+
+    gpu_info = {
+        'gpu_util': utilization.gpu,
+        'mem_used': memory.used / 1024**2,  # Convert bytes to MiB
+        'mem_total': memory.total / 1024**2,  # Convert bytes to MiB
+        'mem_percent': (memory.used / memory.total) * 100
+    }
+    return gpu_info
+
+
+def update_charts(chart_height: int = 200) -> go.Figure:
+    """
+    Update the GPU utilization and memory usage charts.
+
+    Args:
+        chart_height (int, optional): Height of the chart in pixels. Defaults to 200.
+
+    Returns:
+        plotly.graph_objs.Figure: The updated figure containing the GPU and memory usage charts.
+    """
+    # Obtain GPU information
+    gpu_info = get_gpu_info()
+
+    # Record the latest GPU utilization and memory usage values
+    gpu_util = round(gpu_info.get('gpu_util', 0), 1)
+    mem_used = round(gpu_info.get('mem_used', 0) / 1024, 2)  # Convert MiB to GiB
+    gpu_util_history.append(gpu_util)
+    mem_usage_history.append(mem_used)
+
+    # Create the GPU utilization line chart
+    gpu_trace = go.Scatter(
+        y=list(gpu_util_history),
+        mode='lines+markers',
+        text=list(gpu_util_history),
+        line=dict(shape='spline', color='blue'),  # Smooth the line and set its color
+        yaxis='y1'  # Link to y-axis 1
+    )
+
+    # Create the memory usage line chart
+    mem_trace = go.Scatter(
+        y=list(mem_usage_history),
+        mode='lines+markers',
+        text=list(mem_usage_history),
+        line=dict(shape='spline', color='red'),  # Smooth the line and set its color
+        yaxis='y2'  # Link to y-axis 2
+    )
+
+    # Set the layout of the chart
+    layout = go.Layout(
+        xaxis=dict(title=None, showticklabels=False, ticks=''),
+        yaxis=dict(
+            title='GPU Utilization (%)',
+            range=[-5, 110],
+            titlefont=dict(color='blue'),
+            tickfont=dict(color='blue'),
+        ),
+        yaxis2=dict(
+            title='Memory Usage (GiB)',
+            range=[0, max(24, max(mem_usage_history) + 1)],
+            titlefont=dict(color='red'),
+            tickfont=dict(color='red'),
+            overlaying='y',
+            side='right'
+        ),
+        height=chart_height,  # Set the height of the chart
+        margin=dict(l=10, r=10, t=0, b=0),  # Set the chart margins
+        showlegend=False  # Disable the legend
+    )
+
+    fig = go.Figure(data=[gpu_trace, mem_trace], layout=layout)
+    return fig
+
+
+def initialize_history():
+    """
+    Initialize the GPU utilization and memory usage history.
+    """
+    for _ in range(100):
+        gpu_info = get_gpu_info()
+        gpu_util_history.append(round(gpu_info.get('gpu_util', 0), 1))
+        mem_usage_history.append(round(gpu_info.get('mem_used', 0) / 1024, 2))  # GiB, matching update_charts
+
+
+def enable_gpu_info():
+    pynvml.nvmlInit()
+
+
+def disable_gpu_info():
+    pynvml.nvmlShutdown()
+
+
+model_choices = [f"{model['name']} ({model['bits']})" for model in models]
+display_to_model = {f"{model['name']} ({model['bits']})": model['name'] for model in models}
+
+
+def download_model(model):
+    print(f"Downloading {model['name']}...")
+    snapshot_download(repo_id=model['name'])
+
+
+def download_models_in_background():
+    print('Downloading models for the first time...')
+    for model in models:
+        download_model(model)
+
+
+download_thread = threading.Thread(target=download_models_in_background)
+download_thread.start()
+
+loaded_models = {}

 @spaces.GPU
 def respond(
@@ -43,7 +161,17 @@ def respond(
     max_tokens,
     temperature,
     top_p,
+    selected_model_display_label,
 ):
+    model_name = display_to_model[selected_model_display_label]
+
+    # Check whether the model is already loaded
+    if model_name not in loaded_models:
+        # Load the model and store it in the cache
+        loaded_models[model_name] = get_chat_loop_generator(model_name)
+
+    chat_completion = loaded_models[model_name]
+
     messages = [{"role": "system", "content": system_message}]

     for val in history:
@@ -69,23 +197,21 @@ def respond(
         yield response


-css = """
-h1 {
-    text-align: center;
-    display: block;
-}
 """
-
-chatbot = gr.Chatbot(label="Gradio ChatInterface")
-with gr.Blocks() as demo:
-    with gr.Column(scale=1):
-        title_output = gr.Markdown("Please select a model to run")
-        chat_demo = gr.ChatInterface(
+For information on how to customize the ChatInterface, peruse the Gradio docs: https://www.gradio.app/docs/chatinterface
+"""
+# enable_gpu_info()
+with gr.Blocks(fill_height=True) as demo:
+    with gr.Row():
+
+        def update_chart():
+            return update_charts(chart_height=200)
+
+        gpu_chart = gr.Plot(update_chart, every=0.1)  # Update every 0.1 seconds
+
+        with gr.Column():
+            chat_interface = gr.ChatInterface(
                 respond,
-                additional_inputs_accordion=gr.Accordion(
-                    label="⚙️ Parameters", open=False, render=False
-                ),
-                fill_height=False,
                 additional_inputs=[
                     gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
                     gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
@@ -95,17 +221,17 @@ with gr.Blocks() as demo:
                     maximum=1.0,
                     value=0.95,
                     step=0.05,
-                    label="Top-p (nucleus sampling)"
+                    label="Top-p (nucleus sampling)",
                 ),
+                gr.Dropdown(
+                    choices=model_choices,
+                    value=model_choices[0],
+                    label="Select Model",
+                ),
             ],
         )
-    model_select = gr.Dropdown(
-        choices=model_list_with_annotations_display,
-        label="Models",
-        value=model_list_with_annotations_display[0],
-        info="Model & Estimated Quantized Bitwidth"
-    )
-    model_select.change(update_title_and_chatmodel, inputs=[model_select], outputs=title_output)

 if __name__ == "__main__":
-    demo.launch()
+    share = os.getenv("SHARE_LINK", None) in ["1", "true", "True"]
+    demo.launch(share=share)
+    # disable_gpu_info()
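Note on the model-selection change above: respond() now loads whichever model the dropdown names on first use and memoizes it in the loaded_models dict, so switching back to an already-used model skips the expensive reload. Below is a minimal sketch of that cache pattern in isolation; load_model here is a hypothetical stand-in for vptq.app_utils.get_chat_loop_generator:

loaded_models = {}

def load_model(model_name):
    # Hypothetical stand-in for get_chat_loop_generator; the real call
    # downloads and loads quantized weights, which is slow.
    print(f"loading {model_name} ...")
    return lambda messages, **kwargs: iter(())

def get_cached_generator(model_name):
    # The first request per model pays the load cost; later requests hit the cache.
    if model_name not in loaded_models:
        loaded_models[model_name] = load_model(model_name)
    return loaded_models[model_name]

gen_a = get_cached_generator("model-a")  # prints "loading model-a ..."
gen_b = get_cached_generator("model-a")  # cache hit: no reload
assert gen_a is gen_b

One caveat of this design: the cache is per process, so each worker that serves the Space loads its own copy of a selected model.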
requirements.txt CHANGED
@@ -1,2 +1,4 @@
 huggingface_hub>=0.22.2
-https://github.com/microsoft/VPTQ/releases/download/v0.0.1/vptq-0.0.1-cp310-cp310-manylinux1_x86_64.whl
+https://github.com/microsoft/VPTQ/releases/download/v0.0.2.post1/vptq-0.0.2.post1-cp310-cp310-manylinux1_x86_64.whl
+pynvml==11.5.3
+plotly==5.24.1
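The two new pins back the GPU dashboard: pynvml supplies utilization and memory readings, and plotly draws them on twin y-axes. Below is a minimal standalone sanity check of the same calls; this is a sketch, not part of the Space, and assumes one visible NVIDIA GPU:

import pynvml
import plotly.graph_objs as go

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # first GPU, as in get_gpu_info()
util = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu          # percent
mem_gib = pynvml.nvmlDeviceGetMemoryInfo(handle).used / 1024**3  # bytes -> GiB
pynvml.nvmlShutdown()

# Two traces on independent y-axes, mirroring update_charts() in app.py
fig = go.Figure(
    data=[
        go.Scatter(y=[util], mode='markers', yaxis='y1', name='GPU %'),
        go.Scatter(y=[mem_gib], mode='markers', yaxis='y2', name='Mem (GiB)'),
    ],
    layout=go.Layout(
        yaxis=dict(title='GPU Utilization (%)', range=[-5, 110]),
        yaxis2=dict(title='Memory Usage (GiB)', overlaying='y', side='right'),
    ),
)
fig.write_html("gpu_snapshot.html")  # open in a browser; fig.show() also works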