Spaces:

CDT-BMAI-GP
/

biomed_probing_leaderboard

Runtime error

App Files Files Community

chaeeunlee committed on Feb 8, 2024

Commit

5402045

1 Parent(s): 447928c

fixed local HF_HOME

Browse files

Files changed (47) hide show

.DS_Store +0 -0
app.py +3 -0
hub/.locks/datasets--chaeeunlee--test_requests/28df5f900b358436f0267334b3e3e9af33f917ba.lock +0 -0
hub/.locks/datasets--chaeeunlee--test_requests/32897cd3e640101ba184f8c4ccd896981de3804a.lock +0 -0
hub/.locks/datasets--chaeeunlee--test_results/28df5f900b358436f0267334b3e3e9af33f917ba.lock +0 -0
hub/.locks/datasets--chaeeunlee--test_results/32897cd3e640101ba184f8c4ccd896981de3804a.lock +0 -0
hub/.locks/models--EleutherAI--pythia-70m/0204ed10c186a4c7c68f55dff8f26087a45898d6.lock +0 -0
hub/.locks/models--EleutherAI--pythia-70m/d7a9196e329eaf06d6e2802fed376e7459834236.lock +0 -0
hub/.locks/models--EleutherAI--pythia-70m/df0253c0ab197de15c12fa7fbb7edcca9b6848a3.lock +0 -0
hub/.locks/models--EleutherAI--pythia-70m/f1860edb10f80bcaf7b023fce47c68a23b724c23.lock +0 -0
hub/.locks/models--EleutherAI--pythia-70m/f74dfbfab8f97770a87769c739fb080c21c8bacc.lock +0 -0
hub/datasets--chaeeunlee--test_requests/blobs/28df5f900b358436f0267334b3e3e9af33f917ba +55 -0
hub/datasets--chaeeunlee--test_requests/blobs/32897cd3e640101ba184f8c4ccd896981de3804a +3 -0
hub/datasets--chaeeunlee--test_requests/refs/main +1 -0
hub/datasets--chaeeunlee--test_results/blobs/28df5f900b358436f0267334b3e3e9af33f917ba +55 -0
hub/datasets--chaeeunlee--test_results/blobs/32897cd3e640101ba184f8c4ccd896981de3804a +3 -0
hub/datasets--chaeeunlee--test_results/refs/main +1 -0
hub/models--EleutherAI--pythia-70m/.no_exist/a39f36b100fe8a5377810d56c3f4789b9c53ac42/added_tokens.json +0 -0
hub/models--EleutherAI--pythia-70m/.no_exist/a39f36b100fe8a5377810d56c3f4789b9c53ac42/merges.txt +0 -0
hub/models--EleutherAI--pythia-70m/.no_exist/a39f36b100fe8a5377810d56c3f4789b9c53ac42/vocab.json +0 -0
hub/models--EleutherAI--pythia-70m/blobs/0204ed10c186a4c7c68f55dff8f26087a45898d6 +5 -0
hub/models--EleutherAI--pythia-70m/blobs/d7a9196e329eaf06d6e2802fed376e7459834236 +24 -0
hub/models--EleutherAI--pythia-70m/blobs/df0253c0ab197de15c12fa7fbb7edcca9b6848a3 +294 -0
hub/models--EleutherAI--pythia-70m/blobs/f1860edb10f80bcaf7b023fce47c68a23b724c23 +9 -0
hub/models--EleutherAI--pythia-70m/blobs/f74dfbfab8f97770a87769c739fb080c21c8bacc +0 -0
hub/models--EleutherAI--pythia-70m/refs/main +1 -0
hub/models--EleutherAI--pythia-70m/snapshots/a39f36b100fe8a5377810d56c3f4789b9c53ac42/README.md +1 -0
hub/models--EleutherAI--pythia-70m/snapshots/a39f36b100fe8a5377810d56c3f4789b9c53ac42/config.json +1 -0
hub/models--EleutherAI--pythia-70m/snapshots/a39f36b100fe8a5377810d56c3f4789b9c53ac42/special_tokens_map.json +1 -0
hub/models--EleutherAI--pythia-70m/snapshots/a39f36b100fe8a5377810d56c3f4789b9c53ac42/tokenizer.json +1 -0
hub/models--EleutherAI--pythia-70m/snapshots/a39f36b100fe8a5377810d56c3f4789b9c53ac42/tokenizer_config.json +1 -0
hub/version.txt +1 -0
manage_repos.ipynb +28 -20
src/.DS_Store +0 -0
src/backend/.DS_Store +0 -0
src/backend/envs.py +4 -2
src/backend/tasks/.DS_Store +0 -0
src/backend/tasks/medmcqa/.DS_Store +0 -0
src/backend/tasks/medmcqa/medmcqa.yaml +18 -0
src/backend/tasks/medmcqa/utils_medmcqa.py +19 -0
src/backend/tasks/medqa/.DS_Store +0 -0
src/backend/tasks/medqa/medqa.yaml +16 -0
src/backend/tasks/medqa/preprocess_medqa.py +8 -0
src/display/__pycache__/utils.cpython-310.pyc +0 -0
src/display/utils.py +5 -2
src/populate.py +2 -1
src/submission/__pycache__/submit.cpython-310.pyc +0 -0

.DS_Store CHANGED Viewed

Binary files a/.DS_Store and b/.DS_Store differ

app.py CHANGED Viewed

@@ -124,9 +124,12 @@ def filter_models(df: pd.DataFrame, type_query: list, size_query: list, precisio
 ui_snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
 ui_snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
 raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS) # k the problem is that the results are only saved in _bk dirs.
 leaderboard_df = original_df.copy()
 ################################################################################################################################

 ui_snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
 ui_snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
+print(f"COLS = {COLS}")
 raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS) # k the problem is that the results are only saved in _bk dirs.
 leaderboard_df = original_df.copy()
+print(f"leaderboard_df = {leaderboard_df}")
 ################################################################################################################################

hub/.locks/datasets--chaeeunlee--test_requests/28df5f900b358436f0267334b3e3e9af33f917ba.lock ADDED Viewed

File without changes

hub/.locks/datasets--chaeeunlee--test_requests/32897cd3e640101ba184f8c4ccd896981de3804a.lock ADDED Viewed

File without changes

hub/.locks/datasets--chaeeunlee--test_results/28df5f900b358436f0267334b3e3e9af33f917ba.lock ADDED Viewed

File without changes

hub/.locks/datasets--chaeeunlee--test_results/32897cd3e640101ba184f8c4ccd896981de3804a.lock ADDED Viewed

File without changes

hub/.locks/models--EleutherAI--pythia-70m/0204ed10c186a4c7c68f55dff8f26087a45898d6.lock ADDED Viewed

File without changes

hub/.locks/models--EleutherAI--pythia-70m/d7a9196e329eaf06d6e2802fed376e7459834236.lock ADDED Viewed

File without changes

hub/.locks/models--EleutherAI--pythia-70m/df0253c0ab197de15c12fa7fbb7edcca9b6848a3.lock ADDED Viewed

File without changes

hub/.locks/models--EleutherAI--pythia-70m/f1860edb10f80bcaf7b023fce47c68a23b724c23.lock ADDED Viewed

File without changes

hub/.locks/models--EleutherAI--pythia-70m/f74dfbfab8f97770a87769c739fb080c21c8bacc.lock ADDED Viewed

File without changes

hub/datasets--chaeeunlee--test_requests/blobs/28df5f900b358436f0267334b3e3e9af33f917ba ADDED Viewed

	@@ -0,0 +1,55 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.lz4 filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Audio files - uncompressed
+*.pcm filter=lfs diff=lfs merge=lfs -text
+*.sam filter=lfs diff=lfs merge=lfs -text
+*.raw filter=lfs diff=lfs merge=lfs -text
+# Audio files - compressed
+*.aac filter=lfs diff=lfs merge=lfs -text
+*.flac filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.ogg filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+# Image files - uncompressed
+*.bmp filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.tiff filter=lfs diff=lfs merge=lfs -text
+# Image files - compressed
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text

hub/datasets--chaeeunlee--test_requests/blobs/32897cd3e640101ba184f8c4ccd896981de3804a ADDED Viewed

	@@ -0,0 +1,3 @@

+---
+license: mit
+---

hub/datasets--chaeeunlee--test_requests/refs/main ADDED Viewed

	@@ -0,0 +1 @@


1	+ 7d6de7e1844e0f6535382e4509843534d6f043bb

hub/datasets--chaeeunlee--test_results/blobs/28df5f900b358436f0267334b3e3e9af33f917ba ADDED Viewed

	@@ -0,0 +1,55 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.lz4 filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+# Audio files - uncompressed
+*.pcm filter=lfs diff=lfs merge=lfs -text
+*.sam filter=lfs diff=lfs merge=lfs -text
+*.raw filter=lfs diff=lfs merge=lfs -text
+# Audio files - compressed
+*.aac filter=lfs diff=lfs merge=lfs -text
+*.flac filter=lfs diff=lfs merge=lfs -text
+*.mp3 filter=lfs diff=lfs merge=lfs -text
+*.ogg filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+# Image files - uncompressed
+*.bmp filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.tiff filter=lfs diff=lfs merge=lfs -text
+# Image files - compressed
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.webp filter=lfs diff=lfs merge=lfs -text

hub/datasets--chaeeunlee--test_results/blobs/32897cd3e640101ba184f8c4ccd896981de3804a ADDED Viewed

	@@ -0,0 +1,3 @@

+---
+license: mit
+---

hub/datasets--chaeeunlee--test_results/refs/main ADDED Viewed

	@@ -0,0 +1 @@


1	+ da979ab65e0398923653d6a75c5eddd5f7c7c098

hub/models--EleutherAI--pythia-70m/.no_exist/a39f36b100fe8a5377810d56c3f4789b9c53ac42/added_tokens.json ADDED Viewed

File without changes

hub/models--EleutherAI--pythia-70m/.no_exist/a39f36b100fe8a5377810d56c3f4789b9c53ac42/merges.txt ADDED Viewed

File without changes

hub/models--EleutherAI--pythia-70m/.no_exist/a39f36b100fe8a5377810d56c3f4789b9c53ac42/vocab.json ADDED Viewed

File without changes

hub/models--EleutherAI--pythia-70m/blobs/0204ed10c186a4c7c68f55dff8f26087a45898d6 ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "unk_token": "<|endoftext|>"
+}

hub/models--EleutherAI--pythia-70m/blobs/d7a9196e329eaf06d6e2802fed376e7459834236 ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "architectures": [
+    "GPTNeoXForCausalLM"
+  ],
+  "bos_token_id": 0,
+  "eos_token_id": 0,
+  "hidden_act": "gelu",
+  "hidden_size": 512,
+  "initializer_range": 0.02,
+  "intermediate_size": 2048,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 2048,
+  "model_type": "gpt_neox",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 6,
+  "rotary_emb_base": 10000,
+  "rotary_pct": 0.25,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.24.0",
+  "use_cache": true,
+  "use_parallel_residual": true,
+  "vocab_size": 50304
+}

hub/models--EleutherAI--pythia-70m/blobs/df0253c0ab197de15c12fa7fbb7edcca9b6848a3 ADDED Viewed

	@@ -0,0 +1,294 @@

+---
+language:
+- en
+tags:
+- pytorch
+- causal-lm
+- pythia
+license: apache-2.0
+datasets:
+- EleutherAI/pile
+library_name: gpt-neox
+---
+The *Pythia Scaling Suite* is a collection of models developed to facilitate
+interpretability research [(see paper)](https://arxiv.org/pdf/2304.01373.pdf).
+It contains two sets of eight models of sizes
+70M, 160M, 410M, 1B, 1.4B, 2.8B, 6.9B, and 12B. For each size, there are two
+models: one trained on the Pile, and one trained on the Pile after the dataset
+has been globally deduplicated. All 8 model sizes are trained on the exact
+same data, in the exact same order. We also provide 154 intermediate
+checkpoints per model, hosted on Hugging Face as branches.
+The Pythia model suite was deliberately designed to promote scientific
+research on large language models, especially interpretability research.
+Despite not centering downstream performance as a design goal, we find the
+models <a href="#evaluations">match or exceed</a> the performance of
+similar and same-sized models, such as those in the OPT and GPT-Neo suites.
+<details>
+  <summary style="font-weight:600">Details on previous early release and naming convention.</summary>
+Previously, we released an early version of the Pythia suite to the public.
+However, we decided to retrain the model suite to address a few hyperparameter
+discrepancies. This model card <a href="#changelog">lists the changes</a>;
+see appendix B in the Pythia paper for further discussion. We found no
+difference in benchmark performance between the two Pythia versions.
+The old models are
+[still available](https://huggingface.co/models?other=pythia_v0), but we
+suggest the retrained suite if you are just starting to use Pythia.<br>
+**This is the current release.**
+Please note that all models in the *Pythia* suite were renamed in January
+2023. For clarity, a <a href="#naming-convention-and-parameter-count">table
+comparing the old and new names</a> is provided in this model card, together
+with exact parameter counts.
+</details>
+<br>
+# Pythia-70M
+## Model Details
+- Developed by: [EleutherAI](http://eleuther.ai)
+- Model type: Transformer-based Language Model
+- Language: English
+- Learn more: [Pythia's GitHub repository](https://github.com/EleutherAI/pythia)
+ for training procedure, config files, and details on how to use.
+[See paper](https://arxiv.org/pdf/2304.01373.pdf) for more evals and implementation
+ details.
+- Library: [GPT-NeoX](https://github.com/EleutherAI/gpt-neox)
+- License: Apache 2.0
+- Contact: to ask questions about this model, join the [EleutherAI
+Discord](https://discord.gg/zBGx3azzUn), and post them in `#release-discussion`.
+ Please read the existing *Pythia* documentation before asking about it in the
+ EleutherAI Discord. For general correspondence:
+ [contact@eleuther.ai](mailto:contact@eleuther.ai).
+<figure>
+| Pythia model | Non-Embedding Params | Layers | Model Dim | Heads | Batch Size | Learning Rate         | Equivalent Models      |
+| -----------: | -------------------: | :----: | :-------: | :---: | :--------: | :-------------------: | :--------------------: |
+| 70M          | 18,915,328           | 6      | 512       | 8     | 2M         | 1.0 x 10<sup>-3</sup> | —                      |
+| 160M         | 85,056,000           | 12     | 768       | 12    | 2M         | 6.0 x 10<sup>-4</sup> | GPT-Neo 125M, OPT-125M |
+| 410M         | 302,311,424          | 24     | 1024      | 16    | 2M         | 3.0 x 10<sup>-4</sup> | OPT-350M               |
+| 1.0B         | 805,736,448          | 16     | 2048      | 8     | 2M         | 3.0 x 10<sup>-4</sup> | —                      |
+| 1.4B         | 1,208,602,624        | 24     | 2048      | 16    | 2M         | 2.0 x 10<sup>-4</sup> | GPT-Neo 1.3B, OPT-1.3B |
+| 2.8B         | 2,517,652,480        | 32     | 2560      | 32    | 2M         | 1.6 x 10<sup>-4</sup> | GPT-Neo 2.7B, OPT-2.7B |
+| 6.9B         | 6,444,163,072        | 32     | 4096      | 32    | 2M         | 1.2 x 10<sup>-4</sup> | OPT-6.7B               |
+| 12B          | 11,327,027,200       | 36     | 5120      | 40    | 2M         | 1.2 x 10<sup>-4</sup> | —                      |
+<figcaption>Engineering details for the <i>Pythia Suite</i>. Deduped and
+non-deduped models of a given size have the same hyperparameters. “Equivalent”
+models have <b>exactly</b> the same architecture, and the same number of
+non-embedding parameters.</figcaption>
+</figure>
+## Uses and Limitations
+### Intended Use
+The primary intended use of Pythia is research on the behavior, functionality,
+and limitations of large language models. This suite is intended to provide
+a controlled setting for performing scientific experiments. We also provide
+154 checkpoints per model: initial `step0`, 10 log-spaced checkpoints
+`step{1,2,4...512}`, and 143 evenly-spaced checkpoints from `step1000` to
+`step143000`. These checkpoints are hosted on Hugging Face as branches. Note
+that branch `143000` corresponds exactly to the model checkpoint on the `main`
+branch of each model.
+You may also further fine-tune and adapt Pythia-70M for deployment,
+as long as your use is in accordance with the Apache 2.0 license. Pythia
+models work with the Hugging Face [Transformers
+Library](https://huggingface.co/docs/transformers/index). If you decide to use
+pre-trained Pythia-70M as a basis for your fine-tuned model, please
+conduct your own risk and bias assessment.
+### Out-of-scope use
+The Pythia Suite is **not** intended for deployment. It is not in itself
+a product and cannot be used for human-facing interactions. For example,
+the model may generate harmful or offensive text. Please evaluate the risks
+associated with your particular use case.
+Pythia models are English-language only, and are not suitable for translation
+or generating text in other languages.
+Pythia-70M has not been fine-tuned for downstream contexts in which
+language models are commonly deployed, such as writing genre prose,
+or commercial chatbots. This means Pythia-70M will **not**
+respond to a given prompt the way a product like ChatGPT does. This is because,
+ unlike this model, ChatGPT was fine-tuned using methods such as Reinforcement
+Learning from Human Feedback (RLHF) to better “follow” human instructions.
+### Limitations and biases
+The core functionality of a large language model is to take a string of text
+and predict the next token. The token used by the model need not produce the
+most “accurate” text. Never rely on Pythia-70M to produce factually accurate
+output.
+This model was trained on [the Pile](https://pile.eleuther.ai/), a dataset
+known to contain profanity and texts that are lewd or otherwise offensive.
+See [Section 6 of the Pile paper](https://arxiv.org/abs/2101.00027) for a
+discussion of documented biases with regards to gender, religion, and race.
+Pythia-70M may produce socially unacceptable or undesirable text, *even if*
+the prompt itself does not include anything explicitly offensive.
+If you plan on using text generated through, for example, the Hosted Inference
+API, we recommend having a human curate the outputs of this language model
+before presenting it to other people. Please inform your audience that the
+text was generated by Pythia-70M.
+### Quickstart
+Pythia models can be loaded and used via the following code, demonstrated here
+for the third `pythia-70m-deduped` checkpoint:
+```python
+from transformers import GPTNeoXForCausalLM, AutoTokenizer
+model = GPTNeoXForCausalLM.from_pretrained(
+  "EleutherAI/pythia-70m-deduped",
+  revision="step3000",
+  cache_dir="./pythia-70m-deduped/step3000",
+)
+tokenizer = AutoTokenizer.from_pretrained(
+  "EleutherAI/pythia-70m-deduped",
+  revision="step3000",
+  cache_dir="./pythia-70m-deduped/step3000",
+)
+inputs = tokenizer("Hello, I am", return_tensors="pt")
+tokens = model.generate(**inputs)
+tokenizer.decode(tokens[0])
+```
+Revision/branch `step143000` corresponds exactly to the model checkpoint on
+the `main` branch of each model.<br>
+For more information on how to use all Pythia models, see [documentation on
+GitHub](https://github.com/EleutherAI/pythia).
+## Training
+### Training data
+[The Pile](https://pile.eleuther.ai/) is an 825GiB general-purpose dataset in
+English. It was created by EleutherAI specifically for training large language
+models. It contains texts from 22 diverse sources, roughly broken down into
+five categories: academic writing (e.g. arXiv), internet (e.g. CommonCrawl),
+prose (e.g. Project Gutenberg), dialogue (e.g. YouTube subtitles), and
+miscellaneous (e.g. GitHub, Enron Emails). See [the Pile
+paper](https://arxiv.org/abs/2101.00027) for a breakdown of all data sources,
+methodology, and a discussion of ethical implications. Consult [the
+datasheet](https://arxiv.org/abs/2201.07311) for more detailed documentation
+about the Pile and its component datasets. The Pile can be downloaded from
+the [official website](https://pile.eleuther.ai/), or from a [community
+mirror](https://the-eye.eu/public/AI/pile/).<br>
+The Pile was **not** deduplicated before being used to train Pythia-70M.
+### Training procedure
+All models were trained on the exact same data, in the exact same order. Each
+model saw 299,892,736,000 tokens during training, and 143 checkpoints for each
+model are saved every 2,097,152,000 tokens, spaced evenly throughout training,
+from `step1000` to `step143000` (which is the same as `main`). In addition, we
+also provide frequent early checkpoints: `step0` and `step{1,2,4...512}`.
+This corresponds to training for just under 1 epoch on the Pile for
+non-deduplicated models, and about 1.5 epochs on the deduplicated Pile.
+All *Pythia* models trained for 143000 steps at a batch size
+of 2M (2,097,152 tokens).<br>
+See [GitHub](https://github.com/EleutherAI/pythia) for more details on training
+ procedure, including [how to reproduce
+ it](https://github.com/EleutherAI/pythia/blob/main/README.md#reproducing-training).<br>
+Pythia uses the same tokenizer as
+[GPT-NeoX-20B](https://huggingface.co/EleutherAI/gpt-neox-20b).
+## Evaluations
+All 16 *Pythia* models were evaluated using the [LM Evaluation
+Harness](https://github.com/EleutherAI/lm-evaluation-harness). You can access
+the results by model and step at `results/json/*` in the [GitHub
+repository](https://github.com/EleutherAI/pythia/tree/main/results/json/).<br>
+Expand the sections below to see plots of evaluation results for all
+Pythia and Pythia-deduped models compared with OPT and BLOOM.
+<details>
+  <summary>LAMBADA – OpenAI</summary>
+  <img src="/EleutherAI/pythia-12b/resolve/main/eval_plots/lambada_openai_v1.png" style="width:auto"/>
+</details>
+<details>
+  <summary>Physical Interaction: Question Answering (PIQA)</summary>
+  <img src="/EleutherAI/pythia-12b/resolve/main/eval_plots/piqa_v1.png" style="width:auto"/>
+</details>
+<details>
+  <summary>WinoGrande</summary>
+  <img src="/EleutherAI/pythia-12b/resolve/main/eval_plots/winogrande_v1.png" style="width:auto"/>
+</details>
+<details>
+  <summary>AI2 Reasoning Challenge—Easy Set</summary>
+  <img src="/EleutherAI/pythia-12b/resolve/main/eval_plots/arc_easy_v1.png" style="width:auto"/>
+</details>
+<details>
+  <summary>SciQ</summary>
+  <img src="/EleutherAI/pythia-12b/resolve/main/eval_plots/sciq_v1.png" style="width:auto"/>
+</details>
+## Changelog
+This section compares differences between previously released
+[Pythia v0](https://huggingface.co/models?other=pythia_v0) and the current
+models. See Appendix B of the Pythia paper for further discussion of these
+changes and the motivation behind them. We found that retraining Pythia had no
+impact on benchmark performance.
+- All model sizes are now trained with uniform batch size of 2M tokens.
+Previously, the models of size 160M, 410M, and 1.4B parameters were trained
+with batch sizes of 4M tokens.
+- We added checkpoints at initialization (step 0) and steps {1,2,4,8,16,32,64,
+128,256,512} in addition to every 1000 training steps.
+- Flash Attention was used in the new retrained suite.
+- We remedied a minor inconsistency that existed in the original suite: all
+models of size 2.8B parameters or smaller had a learning rate (LR) schedule
+which decayed to a minimum LR of 10% of the starting LR, but the 6.9B and
+12B models all used an LR schedule which decayed to a minimum LR of 0. In
+the redone training runs, we rectified this inconsistency: all models now were
+trained with LR decaying to a minimum of 0.1× their maximum LR.
+### Naming convention and parameter count
+*Pythia* models were renamed in January 2023. It is possible that the old
+naming convention still persists in some documentation by accident. The
+current naming convention (70M, 160M, etc.) is based on total parameter count.
+<figure style="width:32em">
+| current Pythia suffix | old suffix | total params   | non-embedding params |
+| --------------------: | ---------: | -------------: | -------------------: |
+| 70M                   | 19M        | 70,426,624     | 18,915,328           |
+| 160M                  | 125M       | 162,322,944    | 85,056,000           |
+| 410M                  | 350M       | 405,334,016    | 302,311,424          |
+| 1B                    | 800M       | 1,011,781,632  | 805,736,448          |
+| 1.4B                  | 1.3B       | 1,414,647,808  | 1,208,602,624        |
+| 2.8B                  | 2.7B       | 2,775,208,960  | 2,517,652,480        |
+| 6.9B                  | 6.7B       | 6,857,302,016  | 6,444,163,072        |
+| 12B                   | 13B        | 11,846,072,320 | 11,327,027,200       |
+</figure>
+# [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
+Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_EleutherAI__pythia-70m)
+| Metric                | Value                     |
+|-----------------------|---------------------------|
+| Avg.                  | 25.28   |
+| ARC (25-shot)         | 21.59          |
+| HellaSwag (10-shot)   | 27.29    |
+| MMLU (5-shot)         | 25.9         |
+| TruthfulQA (0-shot)   | 47.06   |
+| Winogrande (5-shot)   | 51.46   |
+| GSM8K (5-shot)        | 0.3        |
+| DROP (3-shot)         | 3.33         |

hub/models--EleutherAI--pythia-70m/blobs/f1860edb10f80bcaf7b023fce47c68a23b724c23 ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "add_prefix_space": false,
+  "bos_token": "<|endoftext|>",
+  "eos_token": "<|endoftext|>",
+  "name_or_path": "EleutherAI/gpt-neox-20b",
+  "special_tokens_map_file": "/admin/home-hailey/.cache/huggingface/hub/models--EleutherAI--gpt-neox-20b/snapshots/4e49eadb5d14bd22f314ec3f45b69a87b88c7691/special_tokens_map.json",
+  "tokenizer_class": "GPTNeoXTokenizer",
+  "unk_token": "<|endoftext|>"
+}

hub/models--EleutherAI--pythia-70m/blobs/f74dfbfab8f97770a87769c739fb080c21c8bacc ADDED Viewed

The diff for this file is too large to render. See raw diff

hub/models--EleutherAI--pythia-70m/refs/main ADDED Viewed

	@@ -0,0 +1 @@


1	+ a39f36b100fe8a5377810d56c3f4789b9c53ac42

hub/models--EleutherAI--pythia-70m/snapshots/a39f36b100fe8a5377810d56c3f4789b9c53ac42/README.md ADDED Viewed

	@@ -0,0 +1 @@


1	+ ../../blobs/df0253c0ab197de15c12fa7fbb7edcca9b6848a3

hub/models--EleutherAI--pythia-70m/snapshots/a39f36b100fe8a5377810d56c3f4789b9c53ac42/config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ ../../blobs/d7a9196e329eaf06d6e2802fed376e7459834236

hub/models--EleutherAI--pythia-70m/snapshots/a39f36b100fe8a5377810d56c3f4789b9c53ac42/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ ../../blobs/0204ed10c186a4c7c68f55dff8f26087a45898d6

hub/models--EleutherAI--pythia-70m/snapshots/a39f36b100fe8a5377810d56c3f4789b9c53ac42/tokenizer.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ ../../blobs/f74dfbfab8f97770a87769c739fb080c21c8bacc

hub/models--EleutherAI--pythia-70m/snapshots/a39f36b100fe8a5377810d56c3f4789b9c53ac42/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ ../../blobs/f1860edb10f80bcaf7b023fce47c68a23b724c23

hub/version.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ 1

manage_repos.ipynb CHANGED Viewed

@@ -11,31 +11,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "CACHE_PATH = /Users/chaeeunlee/Documents/VSC_workspaces/test_leaderboard\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/chaeeunlee/anaconda3/envs/lb/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n"
-     ]
-    },
     {
      "data": {
       "text/plain": [
        "'\\n( path_in_repo: str\\nrepo_id: str\\ntoken: typing.Optional[str] = None\\nrepo_type: typing.Optional[str] = Nonerevision: typing.Optional[str] = Nonecommit_message: typing.Optional[str] = Nonecommit_description: typing.Optional[str] = Nonecreate_pr: typing.Optional[bool] = Noneparent_commit: typing.Optional[str] = None )\\n'"
       ]
      },
-     "execution_count": 1,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -69,14 +54,37 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "res = API.delete_folder(path_in_repo='EleutherAI/pythia-70m/', repo_id=RESULTS_REPO, repo_type='dataset')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
    "source": [
     "# res = API.delete_folder(path_in_repo='EleutherAI/', repo_id=QUEUE_REPO, repo_type='dataset')\n",
     "\n",

   },
   {
    "cell_type": "code",
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "'\\n( path_in_repo: str\\nrepo_id: str\\ntoken: typing.Optional[str] = None\\nrepo_type: typing.Optional[str] = Nonerevision: typing.Optional[str] = Nonecommit_message: typing.Optional[str] = Nonecommit_description: typing.Optional[str] = Nonecreate_pr: typing.Optional[bool] = Noneparent_commit: typing.Optional[str] = None )\\n'"
       ]
      },
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
    "metadata": {},
    "outputs": [],
    "source": [
+    "res = API.delete_folder(path_in_repo='EleutherAI/', repo_id=RESULTS_REPO, repo_type='dataset')"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 5,
    "metadata": {},
+   "outputs": [
+    {
+     "ename": "EntryNotFoundError",
+     "evalue": "404 Client Error. (Request ID: Root=1-65c41aaf-4be744ec4d8b25f96aac8d20;c17f9346-8c9c-44c6-af2a-d69065e58148)\n\nEntry Not Found for url: https://huggingface.co/api/datasets/chaeeunlee/test_requests/commit/main.\nA file with the name \"EleutherAI/pythia-70m_pubmedqa_eval_request_False_float32_Original.json\" does not exist",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mHTTPError\u001b[0m                                 Traceback (most recent call last)",
+      "File \u001b[0;32m~/anaconda3/envs/lb/lib/python3.10/site-packages/huggingface_hub/utils/_errors.py:286\u001b[0m, in \u001b[0;36mhf_raise_for_status\u001b[0;34m(response, endpoint_name)\u001b[0m\n\u001b[1;32m    285\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 286\u001b[0m     \u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mraise_for_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    287\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m HTTPError \u001b[38;5;28;01mas\u001b[39;00m e:\n",
+      "File \u001b[0;32m~/anaconda3/envs/lb/lib/python3.10/site-packages/requests/models.py:1021\u001b[0m, in \u001b[0;36mResponse.raise_for_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m   1020\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m http_error_msg:\n\u001b[0;32m-> 1021\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m HTTPError(http_error_msg, response\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m)\n",
+      "\u001b[0;31mHTTPError\u001b[0m: 404 Client Error: Not Found for url: https://huggingface.co/api/datasets/chaeeunlee/test_requests/commit/main",
+      "\nThe above exception was the direct cause of the following exception:\n",
+      "\u001b[0;31mEntryNotFoundError\u001b[0m                        Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[5], line 3\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# res = API.delete_folder(path_in_repo='EleutherAI/', repo_id=QUEUE_REPO, repo_type='dataset')\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m res \u001b[38;5;241m=\u001b[39m \u001b[43mAPI\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath_in_repo\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mEleutherAI/pythia-70m_pubmedqa_eval_request_False_float32_Original.json\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrepo_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mQUEUE_REPO\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mdataset\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/anaconda3/envs/lb/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:118\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.<locals>._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    115\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m check_use_auth_token:\n\u001b[1;32m    116\u001b[0m     kwargs \u001b[38;5;241m=\u001b[39m smoothly_deprecate_use_auth_token(fn_name\u001b[38;5;241m=\u001b[39mfn\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, has_token\u001b[38;5;241m=\u001b[39mhas_token, kwargs\u001b[38;5;241m=\u001b[39mkwargs)\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/anaconda3/envs/lb/lib/python3.10/site-packages/huggingface_hub/hf_api.py:4704\u001b[0m, in \u001b[0;36mHfApi.delete_file\u001b[0;34m(self, path_in_repo, repo_id, token, repo_type, revision, commit_message, commit_description, create_pr, parent_commit)\u001b[0m\n\u001b[1;32m   4698\u001b[0m commit_message \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m   4699\u001b[0m     commit_message \u001b[38;5;28;01mif\u001b[39;00m commit_message \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDelete \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath_in_repo\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m with huggingface_hub\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m   4700\u001b[0m )\n\u001b[1;32m   4702\u001b[0m operations \u001b[38;5;241m=\u001b[39m [CommitOperationDelete(path_in_repo\u001b[38;5;241m=\u001b[39mpath_in_repo)]\n\u001b[0;32m-> 4704\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcreate_commit\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   4705\u001b[0m \u001b[43m    \u001b[49m\u001b[43mrepo_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrepo_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   4706\u001b[0m \u001b[43m    \u001b[49m\u001b[43mrepo_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrepo_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   4707\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   4708\u001b[0m \u001b[43m    \u001b[49m\u001b[43moperations\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moperations\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   4709\u001b[0m \u001b[43m    
\u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   4710\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcommit_message\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcommit_message\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   4711\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcommit_description\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcommit_description\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   4712\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcreate_pr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcreate_pr\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   4713\u001b[0m \u001b[43m    \u001b[49m\u001b[43mparent_commit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mparent_commit\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   4714\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/anaconda3/envs/lb/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:118\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.<locals>._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    115\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m check_use_auth_token:\n\u001b[1;32m    116\u001b[0m     kwargs \u001b[38;5;241m=\u001b[39m smoothly_deprecate_use_auth_token(fn_name\u001b[38;5;241m=\u001b[39mfn\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, has_token\u001b[38;5;241m=\u001b[39mhas_token, kwargs\u001b[38;5;241m=\u001b[39mkwargs)\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/anaconda3/envs/lb/lib/python3.10/site-packages/huggingface_hub/hf_api.py:1208\u001b[0m, in \u001b[0;36mfuture_compatible.<locals>._inner\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m   1205\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrun_as_future(fn, \u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m   1207\u001b[0m \u001b[38;5;66;03m# Otherwise, call the function normally\u001b[39;00m\n\u001b[0;32m-> 1208\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/anaconda3/envs/lb/lib/python3.10/site-packages/huggingface_hub/hf_api.py:3600\u001b[0m, in \u001b[0;36mHfApi.create_commit\u001b[0;34m(self, repo_id, operations, commit_message, commit_description, token, repo_type, revision, create_pr, num_threads, parent_commit, run_as_future)\u001b[0m\n\u001b[1;32m   3598\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m   3599\u001b[0m     commit_resp \u001b[38;5;241m=\u001b[39m get_session()\u001b[38;5;241m.\u001b[39mpost(url\u001b[38;5;241m=\u001b[39mcommit_url, headers\u001b[38;5;241m=\u001b[39mheaders, data\u001b[38;5;241m=\u001b[39mdata, params\u001b[38;5;241m=\u001b[39mparams)\n\u001b[0;32m-> 3600\u001b[0m     \u001b[43mhf_raise_for_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcommit_resp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mendpoint_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcommit\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3601\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m RepositoryNotFoundError \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m   3602\u001b[0m     e\u001b[38;5;241m.\u001b[39mappend_to_message(_CREATE_COMMIT_NO_REPO_ERROR_MESSAGE)\n",
+      "File \u001b[0;32m~/anaconda3/envs/lb/lib/python3.10/site-packages/huggingface_hub/utils/_errors.py:296\u001b[0m, in \u001b[0;36mhf_raise_for_status\u001b[0;34m(response, endpoint_name)\u001b[0m\n\u001b[1;32m    294\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m error_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEntryNotFound\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m    295\u001b[0m     message \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mstatus_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m Client Error.\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEntry Not Found for url: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39murl\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m--> 296\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m EntryNotFoundError(message, response) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01me\u001b[39;00m\n\u001b[1;32m    298\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m error_code \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGatedRepo\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m    299\u001b[0m     message \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m    300\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39mstatus_code\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m Client Error.\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCannot access gated repo for url \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresponse\u001b[38;5;241m.\u001b[39murl\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    301\u001b[0m     )\n",
+      "\u001b[0;31mEntryNotFoundError\u001b[0m: 404 Client Error. (Request ID: Root=1-65c41aaf-4be744ec4d8b25f96aac8d20;c17f9346-8c9c-44c6-af2a-d69065e58148)\n\nEntry Not Found for url: https://huggingface.co/api/datasets/chaeeunlee/test_requests/commit/main.\nA file with the name \"EleutherAI/pythia-70m_pubmedqa_eval_request_False_float32_Original.json\" does not exist"
+     ]
+    }
+   ],
    "source": [
     "# res = API.delete_folder(path_in_repo='EleutherAI/', repo_id=QUEUE_REPO, repo_type='dataset')\n",
     "\n",

src/.DS_Store CHANGED Viewed

Binary files a/src/.DS_Store and b/src/.DS_Store differ

src/backend/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

src/backend/envs.py CHANGED Viewed

@@ -19,8 +19,10 @@ class Task:
 # how are these differentiated with Tasks in display/utils.py ?
 class Tasks(Enum):
     task0 = Task("pubmedqa", "acc", "PubMedQA", 0)  # 64, as in the ATLAS paper
-    task1 = Task("hellaswag", "acc_norm", "HellaSwag", 0)  # 64, as in the ATLAS paper
-    # task2 = Task("medqa")
 num_fewshots = {

 # how are these differentiated with Tasks in display/utils.py ?
 class Tasks(Enum):
     task0 = Task("pubmedqa", "acc", "PubMedQA", 0)  # 64, as in the ATLAS paper
+    # task1 = Task("hellaswag", "acc_norm", "HellaSwag", 0)  # 64, as in the ATLAS paper
+    task1 = Task("medqa", "acc_norm", "MedQA", 0) # NOTE(review): medqa.yaml declares `task: medqa_4options` - confirm whether this benchmark name should be "medqa_4options"
+    task2 = Task("medmcqa", "acc_norm", "MedMCQA", 0)
 num_fewshots = {

src/backend/tasks/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

src/backend/tasks/medmcqa/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

src/backend/tasks/medmcqa/medmcqa.yaml ADDED Viewed

	@@ -0,0 +1,18 @@

+task: medmcqa
+dataset_path: medmcqa
+output_type: multiple_choice
+training_split: train
+validation_split: validation
+test_split: validation  # NOTE(review): same split as validation_split - presumably intentional; confirm
+doc_to_text: !function utils_medmcqa.doc_to_text  # prompt builder defined in utils_medmcqa.py
+doc_to_target: cop  # dataset field used as the target - presumably the correct-option index; TODO confirm
+doc_to_choice: [ 'A','B','C','D' ]  # same letter labels that utils_medmcqa.doc_to_text prints before each option
+should_decontaminate: true
+doc_to_decontamination_query: "{{question}}"
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true

src/backend/tasks/medmcqa/utils_medmcqa.py ADDED Viewed

	@@ -0,0 +1,19 @@

+# Copied from Master
+def doc_to_text(doc) -> str:  # render one document as the prompt laid out in the docstring below
+    """
+    Question: <question>
+    Choices:
+    A. <choice1>
+    B. <choice2>
+    C. <choice3>
+    D. <choice4>
+    Answer:
+    """
+    choices = [doc["opa"], doc["opb"], doc["opc"], doc["opd"]]  # the four option texts, read from the opa..opd fields
+    option_choices = {'A': choices[0], 'B': choices[1], 'C': choices[2], 'D': choices[3]}  # letter label -> option text, in A..D order
+    prompt = "Question: " + doc["question"] + "\nChoices:\n"
+    for choice, option in option_choices.items():  # dict preserves insertion order, so options print as A, B, C, D
+        prompt += f"{choice.upper()}. {option}\n"  # keys are already upper-case, so .upper() does not change them
+    prompt += "Answer:"  # no trailing newline or space after "Answer:"
+    return prompt

src/backend/tasks/medqa/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

src/backend/tasks/medqa/medqa.yaml ADDED Viewed

	@@ -0,0 +1,16 @@

+task: medqa_4options  # NOTE(review): the Tasks enums in this commit register the benchmark as "medqa" (flagged "medqa_4options?") - confirm the names line up
+dataset_path: GBaker/MedQA-USMLE-4-options-hf
+output_type: multiple_choice
+training_split: train
+validation_split: validation
+test_split: test
+doc_to_text: !function preprocess_medqa.doc_to_text  # prompt builder defined in preprocess_medqa.py
+doc_to_target: !function preprocess_medqa.doc_to_target  # returns doc["label"] (see preprocess_medqa.py)
+doc_to_choice: [ 'A', 'B', 'C', 'D' ]  # same letter labels that preprocess_medqa.doc_to_text prints before each option
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true

src/backend/tasks/medqa/preprocess_medqa.py ADDED Viewed

	@@ -0,0 +1,8 @@

+def doc_to_text(doc) -> str:  # render one document as "Question: ...", four lettered options, then "Answer:"
+    option_choices = {'A': doc["ending0"], 'B': doc["ending1"], 'C': doc["ending2"], 'D': doc["ending3"]}  # letter label -> option text from ending0..ending3
+    answers = "".join((f"{k}. {v}\n") for k, v in option_choices.items())  # one newline-terminated "X. text" line per option, in A..D order
+    return f"Question: {doc['sent1']}\n{answers}Answer:"  # question text is read from doc["sent1"]; nothing follows "Answer:"
+def doc_to_target(doc) -> int:  # return the gold answer stored on the document
+    return doc["label"]  # passed through unchanged; presumably an index into the A-D choice list - TODO confirm against the dataset

src/display/__pycache__/utils.cpython-310.pyc CHANGED Viewed

Binary files a/src/display/__pycache__/utils.cpython-310.pyc and b/src/display/__pycache__/utils.cpython-310.pyc differ

src/display/utils.py CHANGED Viewed

@@ -17,7 +17,10 @@ class Task:
 class Tasks(Enum):
     # arc = Task("arc:challenge", "acc_norm", "ARC")
-    hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
     # mmlu = Task("hendrycksTest", "acc", "MMLU")
     # truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
     # winogrande = Task("winogrande", "acc", "Winogrande")
@@ -45,7 +48,7 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
 for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])

 class Tasks(Enum):
     # arc = Task("arc:challenge", "acc_norm", "ARC")
+    # hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
+    medqa = Task("medqa", "acc_norm", "MedQA") # medqa_4options?
+    medmcqa = Task("medmcqa", "acc_norm", "MedMCQA")
     # mmlu = Task("hendrycksTest", "acc", "MMLU")
     # truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
     # winogrande = Task("winogrande", "acc", "Winogrande")
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
 for task in Tasks:
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", False)]) # hidden was true by default
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])

src/populate.py CHANGED Viewed

@@ -15,13 +15,14 @@ and cleaning the data based on specific criteria. Let's break down the function
 '''
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> tuple[list[EvalResult], pd.DataFrame]:
     print(f"results_path = {results_path}")
     raw_data = get_raw_eval_results(results_path, requests_path)
-    print(f"@@@@@@@@@@@@@@@@@@@@@@@@raw_data = {raw_data}")
     all_data_json = [v.to_dict() for v in raw_data] # if v.is_complete()]
     # all_data_json.append(baseline_row)

 '''
+# TODO: if raw_data is empty, return an empty DataFrame that still has the expected columns, so the UI renders the correct headers.
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> tuple[list[EvalResult], pd.DataFrame]:
     print(f"results_path = {results_path}")
     raw_data = get_raw_eval_results(results_path, requests_path)
+    # print(f"@@raw_data = {raw_data}")
     all_data_json = [v.to_dict() for v in raw_data] # if v.is_complete()]
     # all_data_json.append(baseline_row)

src/submission/__pycache__/submit.cpython-310.pyc CHANGED Viewed

Binary files a/src/submission/__pycache__/submit.cpython-310.pyc and b/src/submission/__pycache__/submit.cpython-310.pyc differ