Spaces:

aialliance
/

safetyarena

Runtime error

App Files Files Community

Clémentine committed on Apr 11, 2024

Commit

24622c4

1 Parent(s): 55cc480

simplified the template

Browse files

Files changed (12) hide show

README.md +0 -1
app.py +5 -12
main_backend.py +0 -78
scripts/create_request_file.py +0 -105
scripts/fix_harness_import.py +0 -11
src/backend/manage_requests.py +0 -122
src/backend/run_eval_suite.py +0 -57
src/backend/sort_queue.py +0 -28
src/display/css_html_js.py +0 -6
src/display/utils.py +0 -3
src/envs.py +1 -3
src/leaderboard/read_evals.py +0 -1

README.md CHANGED Viewed

@@ -37,4 +37,3 @@ Request files are created automatically by this tool.
 If you encounter a problem on the space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
-If you want to run your own backend, you only need to change the logic in src/backend/run_eval_suite, which at the moment launches the Eleuther AI Harness.


37
38	If you encounter a problem on the space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
39

app.py CHANGED Viewed

@@ -26,19 +26,14 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import API, DEVICE, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
-subprocess.run(["python", "scripts/fix_harness_import.py"])
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
-def launch_backend():
-    _ = subprocess.run(["python", "main_backend.py"])
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
@@ -82,7 +77,7 @@ def update_table(
 def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
-    return df[(df[AutoEvalColumn.dummy.name].str.contains(query, case=False))]
 def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
@@ -92,7 +87,7 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     ]
     # We use COLS to maintain sorting
     filtered_df = df[
-        always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
     ]
     return filtered_df
@@ -157,7 +152,7 @@ with demo:
                             choices=[
                                 c.name
                                 for c in fields(AutoEvalColumn)
-                                if not c.hidden and not c.never_hidden and not c.dummy
                             ],
                             value=[
                                 c.name
@@ -200,7 +195,6 @@ with demo:
                 value=leaderboard_df[
                     [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
                     + shown_columns.value
-                    + [AutoEvalColumn.dummy.name]
                 ],
                 headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
                 datatype=TYPES,
@@ -309,7 +303,7 @@ with demo:
                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
                         label="Precision",
                         multiselect=False,
-                        value="float16" if DEVICE != "cpu" else "float32",
                         interactive=True,
                     )
                     weight_type = gr.Dropdown(
@@ -348,6 +342,5 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.add_job(launch_backend, "interval", seconds=100) # will only allow one job to be run at the same time
 scheduler.start()
 demo.queue(default_concurrency_limit=40).launch()

     WeightType,
     Precision
 )
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
 def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
+    return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
 def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     ]
     # We use COLS to maintain sorting
     filtered_df = df[
+        always_here_cols + [c for c in COLS if c in df.columns and c in columns]
     ]
     return filtered_df
                             choices=[
                                 c.name
                                 for c in fields(AutoEvalColumn)
+                                if not c.hidden and not c.never_hidden
                             ],
                             value=[
                                 c.name
                 value=leaderboard_df[
                     [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
                     + shown_columns.value
                 ],
                 headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
                 datatype=TYPES,
                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
                         label="Precision",
                         multiselect=False,
+                        value="float16",
                         interactive=True,
                     )
                     weight_type = gr.Dropdown(
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
 demo.queue(default_concurrency_limit=40).launch()

main_backend.py DELETED Viewed

@@ -1,78 +0,0 @@
-import logging
-import pprint
-from huggingface_hub import snapshot_download
-logging.getLogger("openai").setLevel(logging.WARNING)
-from src.backend.run_eval_suite import run_evaluation
-from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
-from src.backend.sort_queue import sort_models_by_priority
-from src.envs import QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, DEVICE, API, LIMIT, TOKEN
-from src.about import Tasks, NUM_FEWSHOT
-TASKS_HARNESS = [task.value.benchmark for task in Tasks]
-logging.basicConfig(level=logging.ERROR)
-pp = pprint.PrettyPrinter(width=80)
-PENDING_STATUS = "PENDING"
-RUNNING_STATUS = "RUNNING"
-FINISHED_STATUS = "FINISHED"
-FAILED_STATUS = "FAILED"
-snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60, token=TOKEN)
-def run_auto_eval():
-    current_pending_status = [PENDING_STATUS]
-    # pull the eval dataset from the hub and parse any eval requests
-    # check completed evals and set them to finished
-    check_completed_evals(
-        api=API,
-        checked_status=RUNNING_STATUS,
-        completed_status=FINISHED_STATUS,
-        failed_status=FAILED_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-        hf_repo_results=RESULTS_REPO,
-        local_dir_results=EVAL_RESULTS_PATH_BACKEND
-    )
-    # Get all eval request that are PENDING, if you want to run other evals, change this parameter
-    eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
-    # Sort the evals by priority (first submitted first run)
-    eval_requests = sort_models_by_priority(api=API, models=eval_requests)
-    print(f"Found {len(eval_requests)} {','.join(current_pending_status)} eval requests")
-    if len(eval_requests) == 0:
-        return
-    eval_request = eval_requests[0]
-    pp.pprint(eval_request)
-    set_eval_request(
-        api=API,
-        eval_request=eval_request,
-        set_to_status=RUNNING_STATUS,
-        hf_repo=QUEUE_REPO,
-        local_dir=EVAL_REQUESTS_PATH_BACKEND,
-    )
-    run_evaluation(
-        eval_request=eval_request,
-        task_names=TASKS_HARNESS,
-        num_fewshot=NUM_FEWSHOT,
-        local_dir=EVAL_RESULTS_PATH_BACKEND,
-        results_repo=RESULTS_REPO,
-        batch_size=1,
-        device=DEVICE,
-        no_cache=True,
-        limit=LIMIT
-        )
-if __name__ == "__main__":
-    run_auto_eval()

scripts/create_request_file.py DELETED Viewed

@@ -1,105 +0,0 @@
-import json
-import os
-import pprint
-import re
-from datetime import datetime, timezone
-import click
-from colorama import Fore
-from huggingface_hub import HfApi, snapshot_download
-from src.envs import TOKEN, EVAL_REQUESTS_PATH, QUEUE_REPO
-precisions = ("float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ", "float32")
-model_types = ("pretrained", "fine-tuned", "RL-tuned", "instruction-tuned")
-weight_types = ("Original", "Delta", "Adapter")
-def get_model_size(model_info, precision: str):
-    size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
-    try:
-        model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except (AttributeError, TypeError):
-        try:
-            size_match = re.search(size_pattern, model_info.modelId.lower())
-            model_size = size_match.group(0)
-            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
-        except AttributeError:
-            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
-    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
-    model_size = size_factor * model_size
-    return model_size
-def main():
-    api = HfApi()
-    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-    snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN)
-    model_name = click.prompt("Enter model name")
-    revision = click.prompt("Enter revision", default="main")
-    precision = click.prompt("Enter precision", default="float16", type=click.Choice(precisions))
-    model_type = click.prompt("Enter model type", type=click.Choice(model_types))
-    weight_type = click.prompt("Enter weight type", default="Original", type=click.Choice(weight_types))
-    base_model = click.prompt("Enter base model", default="")
-    status = click.prompt("Enter status", default="FINISHED")
-    try:
-        model_info = api.model_info(repo_id=model_name, revision=revision)
-    except Exception as e:
-        print(f"{Fore.RED}Could not find model info for {model_name} on the Hub\n{e}{Fore.RESET}")
-        return 1
-    model_size = get_model_size(model_info=model_info, precision=precision)
-    try:
-        license = model_info.cardData["license"]
-    except Exception:
-        license = "?"
-    eval_entry = {
-        "model": model_name,
-        "base_model": base_model,
-        "revision": revision,
-        "private": False,
-        "precision": precision,
-        "weight_type": weight_type,
-        "status": status,
-        "submitted_time": current_time,
-        "model_type": model_type,
-        "likes": model_info.likes,
-        "params": model_size,
-        "license": license,
-    }
-    user_name = ""
-    model_path = model_name
-    if "/" in model_name:
-        user_name = model_name.split("/")[0]
-        model_path = model_name.split("/")[1]
-    pprint.pprint(eval_entry)
-    if click.confirm("Do you want to continue? This request file will be pushed to the hub"):
-        click.echo("continuing...")
-        out_dir = f"{EVAL_REQUESTS_PATH}/{user_name}"
-        os.makedirs(out_dir, exist_ok=True)
-        out_path = f"{out_dir}/{model_path}_eval_request_{False}_{precision}_{weight_type}.json"
-        with open(out_path, "w") as f:
-            f.write(json.dumps(eval_entry))
-        api.upload_file(
-            path_or_fileobj=out_path,
-            path_in_repo=out_path.split(f"{EVAL_REQUESTS_PATH}/")[1],
-            repo_id=QUEUE_REPO,
-            repo_type="dataset",
-            commit_message=f"Add {model_name} to eval queue",
-        )
-    else:
-        click.echo("aborting...")
-if __name__ == "__main__":
-    main()

scripts/fix_harness_import.py DELETED Viewed

@@ -1,11 +0,0 @@
-"""This file should be used after pip install -r requirements.
-It creates a folder not ported during harness package creation (as they don't use a Manifest file atm and it ignore `.json` files).
-It will need to be updated if we want to use the harness' version of big bench to actually copy the json files.
-"""
-import os
-import lm_eval
-if __name__ == "__main__":
-    lm_eval_path = lm_eval.__path__[0]
-    os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)

src/backend/manage_requests.py DELETED Viewed

@@ -1,122 +0,0 @@
-import glob
-import json
-from dataclasses import dataclass
-from typing import Optional
-from huggingface_hub import HfApi, snapshot_download
-from src.envs import TOKEN
-@dataclass
-class EvalRequest:
-    model: str
-    private: bool
-    status: str
-    json_filepath: str
-    weight_type: str = "Original"
-    model_type: str = ""  # pretrained, finetuned, with RL
-    precision: str = ""  # float16, bfloat16
-    base_model: Optional[str] = None # for adapter models
-    revision: str = "main" # commit
-    submitted_time: Optional[str] = "2022-05-18T11:40:22.519222"  # random date just so that we can still order requests by date
-    model_type: Optional[str] = None
-    likes: Optional[int] = 0
-    params: Optional[int] = None
-    license: Optional[str] = ""
-    def get_model_args(self):
-        model_args = f"pretrained={self.model},revision={self.revision}"
-        if self.precision in ["float16", "bfloat16", "float32"]:
-            model_args += f",dtype={self.precision}"
-        # Quantized models need some added config, the install of bits and bytes, etc
-        #elif self.precision == "8bit":
-        #    model_args += ",load_in_8bit=True"
-        #elif self.precision == "4bit":
-        #    model_args += ",load_in_4bit=True"
-        #elif self.precision == "GPTQ":
-            # A GPTQ model does not need dtype to be specified,
-            # it will be inferred from the config
-            pass
-        else:
-            raise Exception(f"Unknown precision {self.precision}.")
-        return model_args
-def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str, hf_repo: str, local_dir: str):
-    """Updates a given eval request with its new status on the hub (running, completed, failed, ...)"""
-    json_filepath = eval_request.json_filepath
-    with open(json_filepath) as fp:
-        data = json.load(fp)
-    data["status"] = set_to_status
-    with open(json_filepath, "w") as f:
-        f.write(json.dumps(data))
-    api.upload_file(
-        path_or_fileobj=json_filepath,
-        path_in_repo=json_filepath.replace(local_dir, ""),
-        repo_id=hf_repo,
-        repo_type="dataset",
-    )
-def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[EvalRequest]:
-    """Get all pending evaluation requests and return a list in which private
-    models appearing first, followed by public models sorted by the number of
-    likes.
-    Returns:
-        `list[EvalRequest]`: a list of model info dicts.
-    """
-    snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60, token=TOKEN)
-    json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
-    eval_requests = []
-    for json_filepath in json_files:
-        with open(json_filepath) as fp:
-            data = json.load(fp)
-        if data["status"] in job_status:
-            data["json_filepath"] = json_filepath
-            eval_request = EvalRequest(**data)
-            eval_requests.append(eval_request)
-    return eval_requests
-def check_completed_evals(
-    api: HfApi,
-    hf_repo: str,
-    local_dir: str,
-    checked_status: str,
-    completed_status: str,
-    failed_status: str,
-    hf_repo_results: str,
-    local_dir_results: str,
-):
-    """Checks if the currently running evals are completed, if yes, update their status on the hub."""
-    snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60, token=TOKEN)
-    running_evals = get_eval_requests(checked_status, hf_repo=hf_repo, local_dir=local_dir)
-    for eval_request in running_evals:
-        model = eval_request.model
-        print("====================================")
-        print(f"Checking {model}")
-        output_path = model
-        output_file = f"{local_dir_results}/{output_path}/results*.json"
-        output_file_exists = len(glob.glob(output_file)) > 0
-        if output_file_exists:
-            print(
-                f"EXISTS output file exists for {model} setting it to {completed_status}"
-            )
-            set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
-        else:
-            print(
-                f"No result file found for {model} setting it to {failed_status}"
-            )
-            set_eval_request(api, eval_request, failed_status, hf_repo, local_dir)

src/backend/run_eval_suite.py DELETED Viewed

@@ -1,57 +0,0 @@
-import json
-import os
-import logging
-from datetime import datetime
-from lm_eval import tasks, evaluator, utils
-from src.envs import RESULTS_REPO, API
-from src.backend.manage_requests import EvalRequest
-logging.getLogger("openai").setLevel(logging.WARNING)
-def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, local_dir: str, results_repo: str, no_cache=True, limit=None):
-    if limit:
-        print(
-            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
-        )
-    task_names = utils.pattern_match(task_names, tasks.ALL_TASKS)
-    print(f"Selected Tasks: {task_names}")
-    results = evaluator.simple_evaluate(
-        model="hf-causal-experimental", # "hf-causal"
-        model_args=eval_request.get_model_args(),
-        tasks=task_names,
-        num_fewshot=num_fewshot,
-        batch_size=batch_size,
-        device=device,
-        no_cache=no_cache,
-        limit=limit,
-        write_out=True,
-        output_base_path="logs"
-    )
-    results["config"]["model_dtype"] = eval_request.precision
-    results["config"]["model_name"] = eval_request.model
-    results["config"]["model_sha"] = eval_request.revision
-    dumped = json.dumps(results, indent=2)
-    print(dumped)
-    output_path = os.path.join(local_dir, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
-    os.makedirs(os.path.dirname(output_path), exist_ok=True)
-    with open(output_path, "w") as f:
-        f.write(dumped)
-    print(evaluator.make_table(results))
-    API.upload_file(
-        path_or_fileobj=output_path,
-        path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
-        repo_id=results_repo,
-        repo_type="dataset",
-    )
-    return results

src/backend/sort_queue.py DELETED Viewed

@@ -1,28 +0,0 @@
-import re
-from dataclasses import dataclass
-from huggingface_hub import HfApi
-from src.backend.manage_requests import EvalRequest
-@dataclass
-class ModelMetadata:
-    likes: int = 0
-    size: int = 15
-def sort_models_by_priority(api: HfApi, models: list[EvalRequest]) -> list[EvalRequest]:
-    private_models = [model for model in models if model.private]
-    public_models = [model for model in models if not model.private]
-    return sort_by_submit_date(private_models) + sort_by_submit_date(public_models)
-def sort_by_submit_date(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
-    return sorted(eval_requests, key=lambda x: x.submitted_time, reverse=False)
-def sort_by_size(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
-    return sorted(eval_requests, key=lambda x: x.params, reverse=False)
-def sort_by_likes(eval_requests: list[EvalRequest]) -> list[EvalRequest]:
-    return sorted(eval_requests, key=lambda x: x.likes, reverse=False)

src/display/css_html_js.py CHANGED Viewed

@@ -38,12 +38,6 @@ custom_css = """
     padding: 0px;
 }
-/* Hides the final AutoEvalColumn */
-#llm-benchmark-tab-table table td:last-child,
-#llm-benchmark-tab-table table th:last-child {
-    display: none;
-}
 /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
 table td:first-child,
 table th:first-child {

     padding: 0px;
 }
 /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
 table td:first-child,
 table th:first-child {

src/display/utils.py CHANGED Viewed

@@ -19,7 +19,6 @@ class ColumnContent:
     displayed_by_default: bool
     hidden: bool = False
     never_hidden: bool = False
-    dummy: bool = False
 ## Leaderboard columns
 auto_eval_column_dict = []
@@ -40,8 +39,6 @@ auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B
 auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
 auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-# Dummy column for the search bar (hidden by the custom CSS)
-auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

     displayed_by_default: bool
     hidden: bool = False
     never_hidden: bool = False
 ## Leaderboard columns
 auto_eval_column_dict = []
 auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
 auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

src/envs.py CHANGED Viewed

@@ -6,9 +6,7 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("TOKEN") # A read/write token for your org
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request file
-DEVICE = "cpu" # "cuda:0" if you add compute
-LIMIT = 20 # !!!! Should be None for actual evaluations!!!
 # ----------------------------------
 REPO_ID = f"{OWNER}/leaderboard"

 # ----------------------------------
 TOKEN = os.environ.get("TOKEN") # A read/write token for your org
+OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 REPO_ID = f"{OWNER}/leaderboard"

src/leaderboard/read_evals.py CHANGED Viewed

@@ -116,7 +116,6 @@ class EvalResult:
             AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.dummy.name: self.full_model,
             AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
             AutoEvalColumn.license.name: self.license,

             AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
             AutoEvalColumn.license.name: self.license,