WadoodAbdul committed
Commit 215aa92
1 Parent(s): 4b57d62

added model submission functionality
app.py CHANGED
@@ -27,13 +27,14 @@ from src.display.utils import (
     AutoEvalColumn,
     ModelType,
     ModelArch,
+    PromptTemplateName,
     Precision,
     WeightType,
     fields,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
+from src.submission.submit import add_new_eval, PLACEHOLDER_DATASET_WISE_NORMALIZATION_CONFIG


 def restart_space():
@@ -155,7 +156,61 @@ def filter_models(

     return filtered_df

+def change_submit_request_form(model_architecture):
+    match model_architecture:
+        case "Encoder":
+            return (
+                gr.Textbox(label="Threshold for gliner models", visible=False),
+                gr.Radio(
+                    choices=["True", "False"],
+                    label="Load GLiNER Tokenizer",
+                    visible=False
+                ),
+                gr.Dropdown(
+                    choices=[prompt_template.value for prompt_template in PromptTemplateName],
+                    label="Prompt for generation",
+                    multiselect=False,
+                    # value="HTML Highlighted Spans",
+                    interactive=True,
+                    visible=False
+                )
+            )
+        case "Decoder":
+            return (
+                gr.Textbox(label="Threshold for gliner models", visible=False),
+                gr.Radio(
+                    choices=["True", "False"],
+                    label="Load GLiNER Tokenizer",
+                    visible=False
+                ),
+                gr.Dropdown(
+                    choices=[prompt_template.value for prompt_template in PromptTemplateName],
+                    label="Prompt for generation",
+                    multiselect=False,
+                    # value="HTML Highlighted Spans",
+                    interactive=True,
+                    visible=True
+                )
+            )
+        case "GLiNER Encoder":
+            return (
+                gr.Textbox(label="Threshold for gliner models", visible=True),
+                gr.Radio(
+                    choices=["True", "False"],
+                    label="Load GLiNER Tokenizer",
+                    visible=True
+                ),
+                gr.Dropdown(
+                    choices=[prompt_template.value for prompt_template in PromptTemplateName],
+                    label="Prompt for generation",
+                    multiselect=False,
+                    # value="HTML Highlighted Spans",
+                    interactive=True,
+                    visible=False
+                )
+            )
+
+
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -370,6 +425,108 @@ with demo:
         with gr.Row():
             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

+        with gr.Column():
+            with gr.Accordion(
+                f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                open=False,
+            ):
+                with gr.Row():
+                    finished_eval_table = gr.components.Dataframe(
+                        value=finished_eval_queue_df,
+                        headers=EVAL_COLS,
+                        datatype=EVAL_TYPES,
+                        row_count=5,
+                    )
+            with gr.Accordion(
+                f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                open=False,
+            ):
+                with gr.Row():
+                    running_eval_table = gr.components.Dataframe(
+                        value=running_eval_queue_df,
+                        headers=EVAL_COLS,
+                        datatype=EVAL_TYPES,
+                        row_count=5,
+                    )
+
+            with gr.Accordion(
+                f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                open=False,
+            ):
+                with gr.Row():
+                    pending_eval_table = gr.components.Dataframe(
+                        value=pending_eval_queue_df,
+                        headers=EVAL_COLS,
+                        datatype=EVAL_TYPES,
+                        row_count=5,
+                    )
+        with gr.Row():
+            gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+        with gr.Row():
+            with gr.Column():
+
+                model_name_textbox = gr.Textbox(label="Model name")
+
+                revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+
+                model_arch = gr.Radio(
+                    choices=[t.to_str(" : ") for t in ModelArch if t != ModelArch.Unknown],
+                    label="Model Architecture",
+                )
+
+                model_type = gr.Dropdown(
+                    choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                    label="Model type",
+                    multiselect=False,
+                    value=None,
+                    interactive=True,
+                )
+
+            with gr.Column():
+                label_normalization_map = gr.Textbox(lines=6, label="Label Normalization Map", placeholder=PLACEHOLDER_DATASET_WISE_NORMALIZATION_CONFIG)
+                gliner_threshold = gr.Textbox(label="Threshold for GLiNER models", visible=False)
+                gliner_tokenizer_bool = gr.Radio(
+                    choices=["True", "False"],
+                    label="Load GLiNER Tokenizer",
+                    visible=False
+                )
+                prompt_name = gr.Dropdown(
+                    choices=[prompt_template.value for prompt_template in PromptTemplateName],
+                    label="Prompt for generation",
+                    multiselect=False,
+                    value="HTML Highlighted Spans",
+                    interactive=True,
+                    visible=False
+                )  # should be a dropdown
+
+                # parsing_function - this is tied to the prompt & therefore does not need to be specified
+                # generation_parameters = gr.Textbox(label="Generation params in json format") just default for now
+
+        model_arch.change(fn=change_submit_request_form, inputs=model_arch, outputs=[
+            gliner_threshold,
+            gliner_tokenizer_bool,
+            prompt_name])
+
+        submit_button = gr.Button("Submit Eval")
+        submission_result = gr.Markdown()
+        submit_button.click(
+            add_new_eval,
+            [
+                model_name_textbox,
+                # base_model_name_textbox,
+                revision_name_textbox,
+                model_arch,
+                label_normalization_map,
+                gliner_threshold,
+                gliner_tokenizer_bool,
+                prompt_name,
+                # weight_type,
+                model_type,
+            ],
+            submission_result,
+        )
+

         with gr.Row():
             with gr.Accordion("📙 Citation", open=False):
src/about.py CHANGED
@@ -43,27 +43,32 @@ NUM_FEWSHOT = 0 # Change with your few shot


 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title"> NER Leaderboard</h1>"""
+TITLE = """""" #<h1 align="center" id="space-title"> NER Leaderboard</h1>"""
 LOGO = """<img src="file/assets/image.png" alt="Clinical X HF" width="500" height="333">"""
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Named Entity Recognition of clinical entities is crucial for advancing natural language processing (NLP) applications in healthcare as it is foundational for tasks such as information extraction, clinical decision support, and automated documentation.
-The datasets used for this evaluation encompass a wide range of medical entities, including diseases, symptoms, medications, procedures and anatomical terms. These datasets are sourced from openly available clinical data (including annotations) to ensure comprehensive coverage and reflect the complexity of real-world medical language. More details about the datasets included can be found in the "About" section.
-The evaluation metrics used in this leaderboard focus primarily on the F1-score, a widely recognized measure of a model's accuracy. More details about the evaluation metric can be found in the "About" section
+The main goal of the Named Clinical Entity Recognition Leaderboard is to evaluate and benchmark the performance of various language models in accurately identifying and classifying named clinical entities across diverse medical domains. This task is crucial for advancing natural language processing (NLP) applications in healthcare, as accurate entity recognition is foundational for tasks such as information extraction, clinical decision support, and automated documentation.
+
+The datasets used for this evaluation encompass a wide range of medical entities, including diseases, symptoms, medications, procedures and anatomical terms. These datasets are sourced from openly available clinical data (including annotations) to ensure comprehensive coverage and reflect the complexity of real-world medical language. More details about the datasets included can be found below ("About" section).
+
+The evaluation metrics used in this leaderboard focus primarily on the F1-score, a widely recognized measure of a model's accuracy. The different modes of evaluation are also described below.
+
+Disclaimer: It is important to note that the purpose of this evaluation is purely academic and exploratory. The models assessed here have not been approved for clinical use, and their results should not be interpreted as clinically validated. The leaderboard serves as a platform for researchers to compare models, understand their strengths and limitations, and drive further advancements in the field of clinical NLP.
 """

 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""

-#### Disclaimer & Advisory
-
-It is important to note that the purpose of this evaluation is purely academic and exploratory. The models assessed here have not been approved for clinical use, and their results should not be interpreted as clinically validated. The leaderboard serves as a platform for researchers to compare models, understand their strengths and limitations, and drive further advancements in the field of clinical NLP.
-
-## About
 The Named Clinical Entity Recognition Leaderboard is aimed at advancing the field of natural language processing in healthcare. It provides a standardized platform for evaluating and comparing the performance of various language models in recognizing named clinical entities, a critical task for applications such as clinical documentation, decision support, and information extraction. By fostering transparency and facilitating benchmarking, the leaderboard's goal is to drive innovation and improvement in NLP models. It also helps researchers identify the strengths and weaknesses of different approaches, ultimately contributing to the development of more accurate and reliable tools for clinical use. Despite its exploratory nature, the leaderboard aims to play a role in guiding research and ensuring that advancements are grounded in rigorous and comprehensive evaluations.
+## About

 ## How it works

+### Evaluation method and metrics
+When training a Named Entity Recognition (NER) system, the most common evaluation methods involve measuring precision, recall, and F1-score at the token level. While these metrics are useful for fine-tuning the NER system, evaluating the predicted named entities for downstream tasks requires metrics at the full named-entity level. We include both evaluation methods: token-based and span-based. We provide an example below to help illustrate the difference between the two methods.
+Example sentence: "The patient was diagnosed with a skin cancer disease."
+For simplicity, let's assume this example sentence contains 10 tokens, with a single two-token disease entity (as shown in the figure below).
+
 ### Datasets
 📈 We evaluate the models on 4 datasets, encompassing 6 entity types
 - [NCBI](https://huggingface.co/datasets/m42-health/clinical_ncbi)
@@ -81,7 +86,30 @@ To reproduce our results, follow the steps detailed [here](https://github.com/Wa
 """

 EVALUATION_QUEUE_TEXT = """
-Follow the steps detailed in the [medics_ner](https://github.com/WadoodAbdul/medics_ner/blob/master/docs/submit_to_leaderboard.md) repo to upload you model to the leaderoard.
+
+Currently, the benchmark supports the evaluation of encoder, decoder and GLiNER models hosted on the Hugging Face Hub.
+If your model needs a custom implementation, follow the steps outlined in the [medics_ner](https://github.com/WadoodAbdul/medics_ner/blob/master/docs/custom_model_implementation.md) repo or reach out to our team!
+
+
+### Fields Explanation
+
+#### Model Type:
+- Fine-Tuned: If the training data consisted of any split/variation of the datasets on the leaderboard.
+- Zero-Shot: If the model did not have any exposure to the datasets on the leaderboard while training.
+
+#### Model Architecture:
+- Encoder: The standard transformer encoder architecture with a token classification head on top.
+- Decoder: Transformer-based autoregressive token generation model.
+- GLiNER: Architecture outlined in the [GLiNER Paper](https://arxiv.org/abs/2311.08526)
+
+#### Label Normalization Map:
+Not all models have been tuned to output the NER label names used in the clinical datasets on this leaderboard; some models refer to the same entity type by a synonym.
+The normalization map can be used to ensure that the model's outputs are aligned with the labels expected in the datasets.
+
+Note: Multiple model labels can be mapped to a single entity type in the leaderboard dataset, e.g. 'synonym' and 'disease' to 'condition'.
+
+
+Upon successful submission of your request, your model's results will be updated on the leaderboard within 5 working days!
 """

 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
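Note: the new `LLM_BENCHMARKS_TEXT` contrasts token-based and span-based scoring around the "skin cancer" example sentence. The toy sketch below (not the leaderboard's actual scoring code) makes the difference concrete: with a gold two-token DISEASE span "skin cancer" and a hypothetical prediction covering only "cancer", token-level F1 gives partial credit while exact span-level F1 gives zero.

```python
# Toy illustration of token-based vs span-based F1 for the example sentence.
tokens = ["The", "patient", "was", "diagnosed", "with", "a", "skin", "cancer", "disease", "."]
gold = ["O", "O", "O", "O", "O", "O", "B-DISEASE", "I-DISEASE", "O", "O"]
pred = ["O", "O", "O", "O", "O", "O", "O", "B-DISEASE", "O", "O"]  # hypothetical model output

def f1(tp, fp, fn):
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    return 2 * precision * recall / (precision + recall) if precision + recall else 0.0

# Token-based: compare entity types token by token, ignoring the B-/I- prefix.
def tag_type(tag):
    return tag.split("-")[-1] if tag != "O" else "O"

g_types = [tag_type(t) for t in gold]
p_types = [tag_type(t) for t in pred]
tp = sum(g == p != "O" for g, p in zip(g_types, p_types))
fp = sum(p != "O" and g != p for g, p in zip(g_types, p_types))
fn = sum(g != "O" and g != p for g, p in zip(g_types, p_types))
print("token-level F1:", round(f1(tp, fp, fn), 3))  # 0.667 (partial credit)

# Span-based: an entity counts only if its boundaries and type match exactly.
def spans(tags):
    out, start = [], None
    for i, tag in enumerate(tags + ["O"]):
        if tag.startswith("B-"):
            if start is not None:
                out.append((start, i, tags[start][2:]))
            start = i
        elif not tag.startswith("I-") and start is not None:
            out.append((start, i, tags[start][2:]))
            start = None
    return set(out)

g_spans, p_spans = spans(gold), spans(pred)
tp = len(g_spans & p_spans)
print("span-level F1:", round(f1(tp, len(p_spans - g_spans), len(g_spans - p_spans)), 3))  # 0.0
```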
src/display/utils.py CHANGED
@@ -60,8 +60,9 @@ class EvalQueueColumn: # Queue column
     model = ColumnContent("model", "markdown", True)
     revision = ColumnContent("revision", "str", True)
     private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
+    architecture = ColumnContent("model_architecture", "bool", True)
+    # precision = ColumnContent("precision", "str", True)
+    # weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)


@@ -104,7 +105,7 @@ class ModelType(Enum):
 class ModelArch(Enum):
     Encoder = ModelDetails("Encoder")
     Decoder = ModelDetails("Decoder")
-    EncoderDecoder = ModelDetails("EncoderDecoder")
+    GLiNEREncoder = ModelDetails("GLiNER Encoder")
     Unknown = ModelDetails(name="Other", symbol="?")

     def to_str(self, separator=" "):
@@ -116,8 +117,8 @@ class ModelArch(Enum):
             return ModelArch.Encoder
         if "decoder" in type:
             return ModelArch.Decoder
-        if "encoder-decoder" in type:
-            return ModelArch.EncoderDecoder
+        if "GLiNEREncoder" in type:
+            return ModelArch.GLiNEREncoder
         # if "unknown" in type:
         #     return ModelArch.Unknown
         return ModelArch.Unknown
@@ -154,6 +155,14 @@ class Precision(Enum):
         return Precision.Unknown


+class PromptTemplateName(Enum):
+    UniversalNERTemplate = "universal_ner"
+    LLMHTMLHighlightedSpansTemplate = "llm_html_highlighted_spans"
+    LLamaNERTemplate = "llama_70B_ner_v0.3"
+    MixtralNERTemplate = "mixtral_ner_v0.3.jinja"
+
+
+
 # Column selection
 DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.clinical_type_col]
 Clinical_TYPES_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col]
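Note: the new `PromptTemplateName` enum backs both the submission form's dropdown choices in app.py and the prompt validation in submit.py. A small standalone sketch of that pattern (enum values copied from the diff; the `validate` helper is illustrative):

```python
# Standalone sketch: one Enum drives both the UI choices and the server-side check.
from enum import Enum

class PromptTemplateName(Enum):
    UniversalNERTemplate = "universal_ner"
    LLMHTMLHighlightedSpansTemplate = "llm_html_highlighted_spans"
    LLamaNERTemplate = "llama_70B_ner_v0.3"
    MixtralNERTemplate = "mixtral_ner_v0.3.jinja"

# Dropdown choices shown to the submitter.
choices = [template.value for template in PromptTemplateName]
print(choices)

# Validation on submit: reject anything that is not a known template value.
def validate(prompt_template_name: str) -> bool:
    return prompt_template_name in {template.value for template in PromptTemplateName}

assert validate("llm_html_highlighted_spans")
assert not validate("made_up_template")
```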
src/submission/check_validity.py CHANGED
@@ -59,14 +59,24 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
     return False, "was not found on hub!", None


-def get_model_size(model_info: ModelInfo, precision: str):
+def get_model_size(model_info: ModelInfo, precision: str = None):
     """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
+    size_pattern = re.compile(r"(\d+\.)?\d+(b|m)")
+
     try:
         model_size = round(model_info.safetensors["total"] / 1e9, 3)
     except (AttributeError, TypeError):
-        return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
-
-    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
+        try:
+            size_match = re.search(size_pattern, model_info.id.lower())
+            model_size = size_match.group(0)
+            model_size = round(float(model_size[:-1]) if model_size[-1] == "b" else float(model_size[:-1]) / 1e3, 3)
+        except AttributeError:
+            return 0  # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
+
+    if precision:
+        size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
+    else:
+        size_factor = 1
     model_size = size_factor * model_size
     return model_size

@@ -88,7 +98,7 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
             continue
         with open(os.path.join(root, file), "r") as f:
             info = json.load(f)
-            file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
+            file_names.append(f"{info['model']}_{info['revision']}")

             # Select organisation
             if info["model"].count("/") == 0 or "submitted_time" not in info:
src/submission/submit.py CHANGED
@@ -1,5 +1,6 @@
 import json
 import os
+import ast
 from datetime import datetime, timezone

 from src.display.formatting import styled_error, styled_message, styled_warning
@@ -10,18 +11,57 @@ from src.submission.check_validity import (
     get_model_size,
     is_model_on_hub,
 )
+from src.display.utils import PromptTemplateName

 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None

+PLACEHOLDER_DATASET_WISE_NORMALIZATION_CONFIG = """{
+    "NCBI" : {
+        "" : "condition"
+    },
+    "CHIA" : {
+        "" : "condition",
+        "" : "drug",
+        "" : "procedure",
+        "" : "measurement"
+    },
+    "BIORED" : {
+        "" : "condition",
+        "" : "drug",
+        "" : "gene",
+        "" : "gene variant"
+    },
+    "BC5CDR" : {
+        "" : "condition",
+        "" : "drug"
+    }
+}
+
+"""
+
 def add_new_eval(
     model: str,
-    base_model: str,
+    # base_model: str,
     revision: str,
-    precision: str,
-    weight_type: str,
+    # precision: str,
+    # weight_type: str,
+    model_arch: str,
+    label_normalization_map: str,
+    gliner_threshold: str,
+    gliner_tokenizer_bool: str,
+    prompt_template_name: str,
     model_type: str,
 ):
+    """
+    Saves the request if valid, else returns the error.
+    Validity is checked based on:
+    - the model's existence on the hub
+    - necessary info on the model's card
+    - the label normalization map being a valid python dict that contains the keys for all datasets
+    - the threshold for GLiNER being a valid float
+    """
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
     if not REQUESTED_MODELS:
@@ -33,26 +73,32 @@ def add_new_eval(
     user_name = model.split("/")[0]
     model_path = model.split("/")[1]

-    precision = precision.split(" ")[0]
+    # precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

     if model_type is None or model_type == "":
         return styled_error("Please select a model type.")
+
+    model_type = model_type.split(":")[-1].strip()

     # Does the model actually exist?
     if revision == "":
         revision = "main"

-    # Is the model on the hub?
-    if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not base_model_on_hub:
-            return styled_error(f'Base model "{base_model}" {error}')
+    # # Is the model on the hub?
+    # if weight_type in ["Delta", "Adapter"]:
+    #     base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
+    #     if not base_model_on_hub:
+    #         return styled_error(f'Base model "{base_model}" {error}')

-    if not weight_type == "Adapter":
+    if not model_arch == "GLiNER Encoder":
         model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
         if not model_on_hub:
             return styled_error(f'Model "{model}" {error}')
+    else:
+        if len(list(API.list_models(model_name=model))) != 1:
+            return styled_error(f'Model "{model}" does not exist on the hub!')
+

     # Is the model info correctly filled?
     try:
@@ -60,7 +106,7 @@ def add_new_eval(
     except Exception:
         return styled_error("Could not get your model information. Please fill it up properly.")

-    model_size = get_model_size(model_info=model_info, precision=precision)
+    model_size = get_model_size(model_info=model_info)

     # Were the model card and license filled?
     try:
@@ -72,15 +118,52 @@ def add_new_eval(
     if not modelcard_OK:
         return styled_error(error_msg)

+    # Verify the inference config now
+    try:
+        label_normalization_map = ast.literal_eval(label_normalization_map)
+    except Exception as e:
+        return styled_error("Please enter a valid json for the label normalization map")
+
+    inference_config = {
+        # "model_arch": model_arch,
+        "label_normalization_map": label_normalization_map,
+    }
+
+    match model_arch:
+        case "Encoder":
+            pass
+        case "Decoder":
+            if not prompt_template_name in [prompt_template.value for prompt_template in PromptTemplateName]:
+                return styled_error("Prompt template name is invalid")
+            inference_config = {
+                **inference_config,
+                "prompt_template_name": prompt_template_name,
+            }
+        case "GLiNER Encoder":
+            try:
+                gliner_threshold = float(gliner_threshold)
+                gliner_tokenizer_bool = ast.literal_eval(gliner_tokenizer_bool)
+                inference_config = {
+                    **inference_config,
+                    "gliner_threshold": gliner_threshold,
+                    "gliner_tokenizer_bool": gliner_tokenizer_bool,
+                }
+            except Exception as e:
+                return styled_error("Please enter a valid float for the threshold")
+        case _:
+            return styled_error("Model Architecture is invalid")
+
     # Seems good, creating the eval
     print("Adding new eval")

+
     eval_entry = {
         "model": model,
-        "base_model": base_model,
+        # "base_model": base_model,
         "revision": revision,
-        "precision": precision,
-        "weight_type": weight_type,
+        # "precision": precision,
+        # "weight_type": weight_type,
+        "model_architecture": model_arch,
         "status": "PENDING",
         "submitted_time": current_time,
         "model_type": model_type,
@@ -88,16 +171,18 @@ def add_new_eval(
         "params": model_size,
         "license": license,
         "private": False,
+        "inference_config": inference_config,
     }

     # Check for duplicate submission
-    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
-        return styled_warning("This model has been already submitted.")
+
+    if f"{model}_{revision}" in REQUESTED_MODELS:
+        return styled_warning("This model has already been submitted. Add the revision if the model has been updated.")

     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
+    out_path = f"{OUT_DIR}/{model_path}_{revision}_eval_request.json"

     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
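Note: `add_new_eval` parses the Label Normalization Map textbox with `ast.literal_eval` before folding it into `inference_config`. The sketch below shows what a filled-in submission might look like and the same parsing step in isolation; the model-side label names on the left are hypothetical, while the right-hand values follow the placeholder config above.

```python
# Sketch of how a submitted label normalization map is parsed (ast.literal_eval on the textbox text).
import ast

submitted_text = """{
    "NCBI": {"disease": "condition"},
    "CHIA": {"disease": "condition", "medication": "drug", "procedure": "procedure", "measurement": "measurement"},
    "BIORED": {"disease": "condition", "chemical": "drug", "gene": "gene", "variant": "gene variant"},
    "BC5CDR": {"disease": "condition", "chemical": "drug"},
}"""

try:
    label_normalization_map = ast.literal_eval(submitted_text)
except (ValueError, SyntaxError):
    raise SystemExit("Please enter a valid dict for the label normalization map")

# The parsed map then travels with the request inside inference_config.
inference_config = {"label_normalization_map": label_normalization_map}
print(inference_config["label_normalization_map"]["BC5CDR"])  # {'disease': 'condition', 'chemical': 'drug'}
```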