eduardo-alvarez committed
Commit 867b5a3 • Parent(s): 9773d33

enabling chat functionality
Changed files:
- app.py +40 -52
- info/about.py +52 -0
- info/deployment.py +29 -28
- info/train_a_model.py +1 -1
- status/leaderboard_status_030424.csv +4 -4
- status/leaderboard_status_030824.csv +8 -0
app.py
CHANGED
@@ -1,8 +1,7 @@
 import gradio as gr
 import pandas as pd
 import requests
-import …
-
+import os
 
 from info.train_a_model import (
     LLM_BENCHMARKS_TEXT)
@@ -16,10 +15,12 @@ from info.citation import(
     CITATION_TEXT)
 from info.validated_chat_models import(
     VALIDATED_CHAT_MODELS)
+from info.about import(
+    ABOUT)
 from src.processing import filter_benchmarks_table
 
-
-
+inference_endpoint_url = os.environ['inference_endpoint_url']
+inference_concurrency_limit = os.environ['inference_concurrency_limit']
 
 demo = gr.Blocks()
 
@@ -36,22 +37,8 @@ with demo:
     gr.Markdown("""A special shout-out to the 🤗 [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
                 team for generously sharing their code and best
                 practices, ensuring that AI Developers have a valuable and enjoyable tool at their disposal.""")
-
-
-    def get_public_ip():
-        try:
-            response = requests.get('https://api.ipify.org')
-            public_ip = response.text
-            return public_ip
-        except Exception as e:
-            return f"Error: {str(e)}"
-
-    public_ip = get_public_ip()
-
-    gr.Markdown(f"ip: {public_ip}")
 
     with gr.Accordion("Chat with Top Models on the Leaderboard Here 💬", open=False):
-        # import pdb
 
         chat_model_dropdown = gr.Dropdown(
             choices=VALIDATED_CHAT_MODELS,
@@ -64,34 +51,33 @@
         #chat_model_selection = chat_model_dropdown.value
         chat_model_selection = 'Intel/neural-chat-7b-v1-1'
 
-
-
-        #
-        #gr.ChatInterface(get_response, retry_btn = None, undo_btn=None, concurrency_limit=5).launch()
+        def call_api_and_stream_response(query, chat_model):
+            """
+            Call the API endpoint and yield characters as they are received.
+            This function simulates streaming by yielding characters one by one.
+            """
+            url = inference_endpoint_url
+            params = {"query": query,"selected_model":chat_model}
+            with requests.get(url, json=params, stream=True) as r:
+                for chunk in r.iter_content(chunk_size=1):
+                    if chunk:
+                        yield chunk.decode()
+
+        def get_response(query, history):
+            """
+            Wrapper function to call the streaming API and compile the response.
+            """
+            response = ''
+
+            global chat_model_selection
+
+            for char in call_api_and_stream_response(query, chat_model=chat_model_selection):
+                if char == '<':
+                    break
+                response += char
+                yield response
 
+        gr.ChatInterface(get_response, retry_btn = None, undo_btn=None, concurrency_limit=inference_concurrency_limit).launch()
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏆 LLM Leadeboard", elem_id="llm-benchmark-table", id=0):
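The chat hunk above wires two environment variables and a character-streaming HTTP endpoint into `gr.ChatInterface`. For anyone adapting the pattern outside this Space, here is a minimal, self-contained sketch of the same request loop. The localhost URL and the `stream_chat` helper are placeholders of mine, not part of the commit, and note one assumption: `os.environ` values are strings, so the concurrency limit is cast to `int` before use.

```python
import os
import requests

# Placeholder values; the real endpoint is injected via the Space's secrets.
os.environ.setdefault("inference_endpoint_url", "http://localhost:8000/chat")
os.environ.setdefault("inference_concurrency_limit", "5")

inference_endpoint_url = os.environ["inference_endpoint_url"]
# Environment variables are strings; cast before passing to gr.ChatInterface.
inference_concurrency_limit = int(os.environ["inference_concurrency_limit"])

def stream_chat(query: str, selected_model: str):
    """Yield the response one decoded character at a time, mirroring
    call_api_and_stream_response in the hunk above."""
    params = {"query": query, "selected_model": selected_model}
    with requests.get(inference_endpoint_url, json=params, stream=True) as r:
        r.raise_for_status()
        for chunk in r.iter_content(chunk_size=1):
            if chunk:
                yield chunk.decode()

if __name__ == "__main__":
    for char in stream_chat("Hello!", "Intel/neural-chat-7b-v1-1"):
        print(char, end="", flush=True)
```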
@@ -105,10 +91,10 @@
                label="Training Infrastructure*",
                elem_id="training_infra",
                value=["Intel Developer Cloud","AWS","Azure","Google Cloud Platform","Local"])
-            filter_affiliation = gr.CheckboxGroup(choices=["No Affiliation","Intel Innovator",…
+            filter_affiliation = gr.CheckboxGroup(choices=["No Affiliation","Intel Innovator","Student Ambassador","Intel Liftoff", "Intel Engineering", "Other"],
                label="Intel Program Affiliation",
                elem_id="program_affiliation",
-                value=["No Affiliation","Intel Innovator",…
+                value=["No Affiliation","Intel Innovator","Student Ambassador","Intel Liftoff", "Intel Engineering", "Other"])
 
            with gr.Column():
                filter_size = gr.CheckboxGroup(choices=[1,3,5,7,13,35,60,70,100],
@@ -124,7 +110,7 @@
                elem_id="model_types",
                value=["pretrained","fine-tuned","chat-models","merges/moerges"])
 
-            initial_df = pd.read_csv("./status/…
+            initial_df = pd.read_csv("./status/leaderboard_status_030824.csv")
 
            def update_df(hw_selected, platform_selected, affiliation_selected, size_selected, precision_selected, type_selected):
                filtered_df = filter_benchmarks_table(df=initial_df, hw_selected=hw_selected, platform_selected=platform_selected,
@@ -133,10 +119,10 @@
                return filtered_df
 
            initial_filtered_df = update_df(["Gaudi","Xeon","GPU Max","Arc GPU","Core Ultra"],
-                                            ["Intel Developer Cloud","AWS","Azure",…
-                                            ["No Affiliation","Intel Innovator",…
+                                            ["Intel Developer Cloud","AWS","Azure","Google Cloud Platform","Local"],
+                                            ["No Affiliation","Intel Innovator","Student Ambassador","Intel Liftoff", "Intel Engineering", "Other"],
                                            [1,3,5,7,13,35,60,70,100],
-                                            ["…
+                                            ["fp32","fp16","bf16","int8","fp8", "int4"],
                                            ["pretrained","fine-tuned","chat-models","merges/moerges"])
 
            gradio_df_display = gr.Dataframe(value=initial_filtered_df)
@@ -167,6 +153,8 @@
            gr.Markdown(DEPLOY_TEXT, elem_classes="markdown-text")
        with gr.TabItem("👩‍💻 Developer Programs", elem_id="hardward-program", id=3):
            gr.Markdown(PROGRAMS_TEXT, elem_classes="markdown-text")
+        with gr.TabItem("❓ About ", elem_id="about", id=5):
+            gr.Markdown(ABOUT, elem_classes="markdown-text")
        with gr.TabItem("🏎️ Submit", elem_id="submit", id=4):
            gr.Markdown(SUBMIT_TEXT, elem_classes="markdown-text")
            with gr.Row():
@@ -226,7 +214,7 @@
                    Local is the ideal choice for Core Ultra, ARC GPUs, and local data center infrastructure.""",
                )
                affiliation = gr.Dropdown(
-                    choices=["No Affiliation","Innovator","Student Ambassador","Intel Liftoff", "Intel …
+                    choices=["No Affiliation","Intel Innovator","Student Ambassador","Intel Liftoff", "Intel Engineering", "Other"],
                    label="Affiliation with Intel",
                    multiselect=False,
                    value="No Affiliation",
info/about.py
ADDED
@@ -0,0 +1,52 @@
+def get_public_ip():
+    try:
+        response = requests.get('https://api.ipify.org')
+        public_ip = response.text
+        return public_ip
+    except Exception as e:
+        return f"Error: {str(e)}"
+
+public_ip = get_public_ip()
+
+ABOUT = f"""
+# ❓ About
+
+At Powered-by-Intel LLM Leaderboard we conduct the same benchmarks as the Open LLM Leaderboard and plan to add
+domain-specific benchmarks in the future. We utilize the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank">
+Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of
+different evaluation tasks.
+
+Our current benchmarks include:
+
+- <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge (25-shot)</a> - a set of grade-school science questions.
+- <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag (10-shot)</a> - a test of commonsense inference, which is easy for humans (~95%) but challenging for state-of-the-art models.
+- <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU (5-shot)</a> - a test measuring a text model's multitask accuracy, covering 57 tasks in fields like elementary mathematics, US history, computer science, law, and more.
+- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA (0-shot)</a> - a test measuring a model's propensity to reproduce falsehoods commonly found online. Note: TruthfulQA is technically a 6-shot task in the Harness because each example is prepended with 6 Q/A pairs, even in the 0-shot setting.
+- <a href="https://arxiv.org/abs/1907.10641" target="_blank"> Winogrande (5-shot)</a> - an adversarial and difficult Winograd benchmark at scale, for commonsense reasoning.
+- <a href="https://arxiv.org/abs/2110.14168" target="_blank"> GSM8k (5-shot)</a> - diverse grade school math word problems measuring a model's ability to solve multi-step mathematical reasoning problems.
+For all these evaluations, a higher score is better. We've chosen these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings. In the future, we plan to add domain-specific benchmarks to further evaluate our models.
+
+We run an adapted version of the benchmark code specifically designed to run the EleutherAI Harness benchmarks on Gaudi processors.
+This adapted evaluation harness is built into the Hugging Face Optimum Habana Library. Review the documentation [here](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation).
+
+## Support and Community
+
+Join 5000+ developers on the [Intel DevHub Discord](https://discord.gg/yNYNxK2k) to get support with your submission
+and talk about everything from GenAI, HPC, to Quantum Computing.
+
+## "Chat with Top Models on the Leaderboard Here 💬" Functionality
+
+This is a fun on-leaderboard LLM chat functionality designed to provide a quick way to test the top LLMs on the leaderboard.
+As the leaderboard matures and users submit models, we will rotate the available models for chat. Who knows!? You might find
+your model featured here soon! ⭐
+
+### Chat Functionality Notice
+- All the models in this demo run on 4th Generation Intel® Xeon® (Sapphire Rapids) utilizing AMX operations and quantized inference optimizations.
+- Terms of use: By using the chat functionality, users are required to agree to the following terms: The service is a research preview intended for non-commercial
+use only. It can produce factually incorrect output, and should not be relied on to produce factually accurate information.
+The service only provides limited safety measures and may generate lewd, biased or otherwise offensive content. It must not be
+used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
+- License: The chat functionality is a research preview intended for non-commercial use only.
+
+space ip: {public_ip}
+"""
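The about page credits the Eleuther harness for all six benchmarks. For readers who want to reproduce a single leaderboard score locally, a minimal sketch using the harness's Python API might look like the following; it assumes a recent `lm-eval` release, and the task name and few-shot count are taken from the list above:

```python
# pip install lm-eval
import lm_eval

# Evaluate one leaderboard model on one benchmark from the list above.
results = lm_eval.simple_evaluate(
    model="hf",  # plain Hugging Face transformers backend
    model_args="pretrained=Intel/neural-chat-7b-v3-3",
    tasks=["hellaswag"],  # scored 10-shot on the leaderboard
    num_fewshot=10,
)
print(results["results"]["hellaswag"])
```

The leaderboard itself runs the Gaudi-adapted harness built into Optimum Habana, so numbers from this plain-CPU/GPU sketch may differ slightly from the posted scores.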
info/deployment.py
CHANGED
@@ -90,6 +90,34 @@ helps you choose the best option for your specific use case. Happy building!
 
 <hr>
 
+# Intel® Gaudi® Accelerators
+The Intel Gaudi 2 accelerator is Intel's most capable deep learning chip. You can learn about Gaudi 2 [here](https://habana.ai/products/gaudi2/).
+
+Intel Gaudi Software supports PyTorch and DeepSpeed for accelerating LLM training and inference.
+The Intel Gaudi Software graph compiler will optimize the execution of the operations accumulated in the graph
+(e.g. operator fusion, data layout management, parallelization, pipelining and memory management,
+and graph-level optimizations).
+
+Optimum Habana provides covenient functionality for various tasks. Below is a command line snippet to run inference on Gaudi with meta-llama/Llama-2-7b-hf.
+
+👍[Optimum Habana GitHub](https://github.com/huggingface/optimum-habana)
+
+The "run_generation.py" script below can be found [here on GitHub](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
+
+```bash
+python run_generation.py \
+--model_name_or_path meta-llama/Llama-2-7b-hf \
+--use_hpu_graphs \
+--use_kv_cache \
+--max_new_tokens 100 \
+--do_sample \
+--batch_size 2 \
+--prompt "Hello world" "How are you?"
+
+```
+
+<hr>
+
 # Intel® Max Series GPU
 The Intel® Data Center GPU Max Series is Intel's highest performing, highest density, general-purpose discrete GPU, which packs over 100 billion transistors into one package and contains up to 128 Xe Cores--Intel's foundational GPU compute building block. You can learn more about this GPU [here](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html).
 
@@ -237,34 +265,7 @@ pipe("In the spring, beautiful flowers bloom...")
 
 <hr>
 
-# Intel® …
-The Intel Gaudi 2 accelerator is Intel's most capable deep learning chip. You can learn about Gaudi 2 [here](https://habana.ai/products/gaudi2/).
-
-Intel Gaudi Software supports PyTorch and DeepSpeed for accelerating LLM training and inference.
-The Intel Gaudi Software graph compiler will optimize the execution of the operations accumulated in the graph
-(e.g. operator fusion, data layout management, parallelization, pipelining and memory management,
-and graph-level optimizations).
-
-Optimum Habana provides covenient functionality for various tasks. Below is a command line snippet to run inference on Gaudi with meta-llama/Llama-2-7b-hf.
-
-👍[Optimum Habana GitHub](https://github.com/huggingface/optimum-habana)
-
-The "run_generation.py" script below can be found [here on GitHub](https://github.com/huggingface/optimum-habana/tree/main/examples/text-generation)
-
-```bash
-python run_generation.py \
---model_name_or_path meta-llama/Llama-2-7b-hf \
---use_hpu_graphs \
---use_kv_cache \
---max_new_tokens 100 \
---do_sample \
---batch_size 2 \
---prompt "Hello world" "How are you?"
-
-```
-<hr>
-
-# Intel Arc GPUs
+# Intel® Arc GPUs
 You can learn more about Arc GPUs [here](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/arc.html).
 
 Code snippets coming soon!
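The Arc section above still says snippets are coming; the common interim pattern on Arc goes through the XPU build of Intel® Extension for PyTorch. A hedged sketch of that pattern, assuming `intel-extension-for-pytorch` with XPU support and a working Arc driver stack are installed (this is not from the commit, and the model id is just an example):

```python
import torch
import intel_extension_for_pytorch as ipex  # registers the 'xpu' device
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Intel/neural-chat-7b-v3-3"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

# Move to the Arc GPU and apply IPEX kernel optimizations.
model = model.to("xpu")
model = ipex.optimize(model, dtype=torch.float16)

inputs = tokenizer("In the spring, beautiful flowers bloom...",
                   return_tensors="pt").to("xpu")
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```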
info/train_a_model.py
CHANGED
@@ -8,7 +8,7 @@ Below, you can find documentation on how to access free and paid resources to tr
 ## Intel Developer Cloud - Quick Start
 The Intel Developer Cloud is one of the best places to access free and paid compute instances for model training. Intel offers Jupyter Notebook instances supported by
 224 Core 4th Generation Xeon Bare Metal nodes with 4x GPU Max Series 1100. To access these resources please follow the instructions below:
-1. Visit the [Intel Developer Cloud](https://…
+1. Visit the [Intel Developer Cloud](https://bit.ly/inteldevelopercloud) and sign up for the "Standard - Free" tier to get started.
 2. Navigate to the "Training" module under the "Software" section in the left panel.
 3. Under the GenAI Essentials section, select the LLM Fine-Tuning with QLoRA notebook and click "Launch".
 4. Follow the instructions in the notebook to train your model using Intel® Data Center GPU Max 1100.
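Step 3 points at the "LLM Fine-Tuning with QLoRA" notebook. That notebook isn't reproduced in this commit, but the core of any QLoRA-style run is a PEFT low-rank adapter trained over a frozen base model. A generic sketch with the `peft` library, illustrative only and not the notebook's actual code:

```python
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("Intel/neural-chat-7b-v3-1")

# Low-rank adapters on the attention projections; base weights stay frozen.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(base, lora_config)
model.print_trainable_parameters()  # typically well under 1% of the base model
```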
status/leaderboard_status_030424.csv
CHANGED
@@ -1,5 +1,5 @@
 Model,Average,Hardware,Model Type,Precision,Size,Infrastructure,ARC,HellaSwag,MMLU,TruthfulQA,Winogrande,GSM8K,Affiliation
-Intel/neural-chat-7b-v3-3,69.83,Gaudi,fine-tuned,fp16,7,Intel Developer Cloud,66.89,85.26,63.07,63.01,79.64,61.11,Intel
-Intel/neural-chat-7b-v3-2,68.29,Gaudi,fine-tuned,fp16,7,Intel Developer Cloud,67.49,83.92,63.55,59.68,79.65,55.12,Intel
-Intel/neural-chat-7b-v3-1,61.59,Gaudi,fine-tuned,fp16,7,Intel Developer Cloud,66.21,83.64,62.37,59.65,78.14,19.56,Intel
-Intel/neural-chat-7b-v3,58.46,Gaudi,fine-tuned,fp16,7,Intel Developer Cloud,67.15,83.29,62.26,58.77,78.06,1.21,Intel
+Intel/neural-chat-7b-v3-3,69.83,Gaudi,fine-tuned,fp16,7,Intel Developer Cloud,66.89,85.26,63.07,63.01,79.64,61.11,Intel Engineering
+Intel/neural-chat-7b-v3-2,68.29,Gaudi,fine-tuned,fp16,7,Intel Developer Cloud,67.49,83.92,63.55,59.68,79.65,55.12,Intel Engineering
+Intel/neural-chat-7b-v3-1,61.59,Gaudi,fine-tuned,fp16,7,Intel Developer Cloud,66.21,83.64,62.37,59.65,78.14,19.56,Intel Engineering
+Intel/neural-chat-7b-v3,58.46,Gaudi,fine-tuned,fp16,7,Intel Developer Cloud,67.15,83.29,62.26,58.77,78.06,1.21,Intel Engineering
status/leaderboard_status_030824.csv
ADDED
@@ -0,0 +1,8 @@
+Model,Average,Hardware,Model Type,Precision,Size,Infrastructure,ARC,HellaSwag,MMLU,TruthfulQA,Winogrande,GSM8K,Affiliation
+Intel/neural-chat-7b-v3-3,69.83,Gaudi,fine-tuned,fp16,7,Intel Developer Cloud,66.89,85.26,63.07,63.01,79.64,61.11,Intel Engineering
+Intel/neural-chat-7b-v3-2,68.29,Gaudi,fine-tuned,fp16,7,Intel Developer Cloud,67.49,83.92,63.55,59.68,79.65,55.12,Intel Engineering
+Intel/neural-chat-7b-v3-1,61.59,Gaudi,fine-tuned,fp16,7,Intel Developer Cloud,66.21,83.64,62.37,59.65,78.14,19.56,Intel Engineering
+Intel/neural-chat-7b-v3,58.46,Gaudi,fine-tuned,fp16,7,Intel Developer Cloud,67.15,83.29,62.26,58.77,78.06,1.21,Intel Engineering
+Intel/neural-chat-7b-v3-1,61.59,Gaudi,fine-tuned,int8,7,Intel Developer Cloud,65.7,83.54,62.12,59.48,78.61,20.09,Intel Engineering
+Intel/neural-chat-7b-v3-1,61.54,Gaudi,fine-tuned,bf16,7,Intel Developer Cloud,66.3,83.6,62.44,59.54,77.98,19.41,Intel Engineering
+Intel/neural-chat-7b-v3-1,59.9,Gaudi,fine-tuned,int4,7,Intel Developer Cloud,64.25,82.49,60.79,56.4,77.35,18.12,Intel Engineering