dh-mc committed
Commit bf13772 · 1 Parent(s): 921fa92

LogiQA2.0 dataset

Files changed (46)
  1. .gitattributes +4 -1
  2. datasets/LogiQA2.0/README.md +132 -0
  3. datasets/LogiQA2.0/logiqa/DATA/LOGIQA/datasource.txt +3 -0
  4. datasets/LogiQA2.0/logiqa/DATA/LOGIQA/dev.txt +3 -0
  5. datasets/LogiQA2.0/logiqa/DATA/LOGIQA/dev_fol.jsonl +3 -0
  6. datasets/LogiQA2.0/logiqa/DATA/LOGIQA/dev_zh.txt +3 -0
  7. datasets/LogiQA2.0/logiqa/DATA/LOGIQA/ood_test.jsonl +3 -0
  8. datasets/LogiQA2.0/logiqa/DATA/LOGIQA/readme.md +7 -0
  9. datasets/LogiQA2.0/logiqa/DATA/LOGIQA/statistics.py +21 -0
  10. datasets/LogiQA2.0/logiqa/DATA/LOGIQA/test.txt +3 -0
  11. datasets/LogiQA2.0/logiqa/DATA/LOGIQA/test_fol.jsonl +3 -0
  12. datasets/LogiQA2.0/logiqa/DATA/LOGIQA/test_zh.txt +3 -0
  13. datasets/LogiQA2.0/logiqa/DATA/LOGIQA/train.txt +3 -0
  14. datasets/LogiQA2.0/logiqa/DATA/LOGIQA/train_fol.zip +3 -0
  15. datasets/LogiQA2.0/logiqa/DATA/LOGIQA/train_zh.txt +3 -0
  16. datasets/LogiQA2.0/logiqa/DATA/LOGIQA/word_matching.py +26 -0
  17. datasets/LogiQA2.0/logiqa/logiqa.sh +21 -0
  18. datasets/LogiQA2.0/logiqa/modeling_bart.py +1416 -0
  19. datasets/LogiQA2.0/logiqa/multi-choice-prompt.py +56 -0
  20. datasets/LogiQA2.0/logiqa/run_mrc.py +552 -0
  21. datasets/LogiQA2.0/logiqa/utils_mrc.py +280 -0
  22. datasets/LogiQA2.0/logiqa2nli/DATA/QA2NLI/ dev_new.txt +3 -0
  23. datasets/LogiQA2.0/logiqa2nli/DATA/QA2NLI/dev.txt +3 -0
  24. datasets/LogiQA2.0/logiqa2nli/DATA/QA2NLI/readme.md +1 -0
  25. datasets/LogiQA2.0/logiqa2nli/DATA/QA2NLI/stat.py +25 -0
  26. datasets/LogiQA2.0/logiqa2nli/DATA/QA2NLI/test.txt +3 -0
  27. datasets/LogiQA2.0/logiqa2nli/DATA/QA2NLI/test_new.txt +3 -0
  28. datasets/LogiQA2.0/logiqa2nli/DATA/QA2NLI/train.txt +3 -0
  29. datasets/LogiQA2.0/logiqa2nli/DATA/QA2NLI/train_new.txt +3 -0
  30. datasets/LogiQA2.0/logiqa2nli/nli-prompt.py +51 -0
  31. datasets/LogiQA2.0/logiqa2nli/qa2nli.sh +20 -0
  32. datasets/LogiQA2.0/logiqa2nli/run_nli.py +549 -0
  33. datasets/LogiQA2.0/logiqa2nli/scripts/anli.sh +21 -0
  34. datasets/LogiQA2.0/logiqa2nli/scripts/cood.sh +4 -0
  35. datasets/LogiQA2.0/logiqa2nli/scripts/mnli.sh +21 -0
  36. datasets/LogiQA2.0/logiqa2nli/scripts/multirun.sh +8 -0
  37. datasets/LogiQA2.0/logiqa2nli/scripts/pnli.sh +21 -0
  38. datasets/LogiQA2.0/logiqa2nli/scripts/qa2nli.sh +21 -0
  39. datasets/LogiQA2.0/logiqa2nli/scripts/qnli.sh +21 -0
  40. datasets/LogiQA2.0/logiqa2nli/scripts/qood.sh +20 -0
  41. datasets/LogiQA2.0/logiqa2nli/scripts/rte.sh +20 -0
  42. datasets/LogiQA2.0/logiqa2nli/scripts/scitail.sh +22 -0
  43. datasets/LogiQA2.0/logiqa2nli/scripts/snli.sh +21 -0
  44. datasets/LogiQA2.0/logiqa2nli/scripts/wnli.sh +21 -0
  45. datasets/LogiQA2.0/logiqa2nli/utils_nli.py +1002 -0
  46. datasets/LogiQA2.0/requirements.yml +17 -0
.gitattributes CHANGED
@@ -33,6 +33,8 @@ unsloth/**/* filter=lfs diff=lfs merge=lfs -text
 *.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
+*.jsonl filter=lfs diff=lfs merge=lfs -text
+*.txt filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 datasets/mgtv/ filter=lfs diff=lfs merge=lfs -text
 datasets/mgtv/dev.csv filter=lfs diff=lfs merge=lfs -text
@@ -106,5 +108,6 @@ results/test_b-results_r6.csv filter=lfs diff=lfs merge=lfs -text
 mgtv_train_p1.json filter=lfs diff=lfs merge=lfs -text
 mgtv_train_p2.json filter=lfs diff=lfs merge=lfs -text
 datasets/mgtv/o1-mini.jsonl filter=lfs diff=lfs merge=lfs -text
-datasets/mgtv/Icon
 filter=lfs diff=lfs merge=lfs -text
+datasets/mgtv/Icon
+filter=lfs diff=lfs merge=lfs -text
 datasets/mgtv/gpt-4o-mini.jsonl filter=lfs diff=lfs merge=lfs -text
datasets/LogiQA2.0/README.md ADDED
@@ -0,0 +1,132 @@
1
+ # LogiQA2.0
2
+ LogiQA2.0 dataset - logical reasoning in MRC and NLI tasks
3
+
4
+ <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by-nc-sa/4.0/88x31.png" /></a><br />This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-nc-sa/4.0/">Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License</a>.
5
+
6
+ > This repository contains the datasets and baseline codes for our paper [LogiQA2.0 - An Improved Dataset for Logic Reasoning in Question Answering and Textual Inference](https://ieeexplore.ieee.org/abstract/document/10174688)
7
+
8
+ ## How to cite
9
+ ```
10
+ @ARTICLE{10174688,
11
+ author={Liu, Hanmeng and Liu, Jian and Cui, Leyang and Teng, Zhiyang and Duan, Nan and Zhou, Ming and Zhang, Yue},
12
+ journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
13
+ title={LogiQA 2.0—An Improved Dataset for Logical Reasoning in Natural Language Understanding},
14
+ year={2023},
15
+ volume={31},
16
+ number={},
17
+ pages={2947-2962},
18
+ doi={10.1109/TASLP.2023.3293046}}
19
+
20
+ ```
21
+ ## About
22
+ This is version 2 of the LogiQA dataset, first released as a multi-choice reading comprehension dataset in our previous paper [LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning](https://arxiv.org/abs/2007.08124).
23
+
24
+ The dataset is collected from the [Chinese Civil Service Entrance Examination](chinagwy.org) and is provided in both Chinese and English (by translation). You can download version 1 of the LogiQA dataset from [here](https://github.com/lgw863/logiqa-dataset).
25
+
26
+ To construct the LogiQA2.0 dataset, we:
27
+ * collect more newly released exam questions and practice questions. About 20 provinces in China hold the exam annually, and the exam materials are publicly available on the Internet after the exams; additional practice questions are provided by various sources.
28
+ * hire professional translators to re-translate the dataset from Chinese to English, and verify the labels and annotations with human experts. This work was conducted by [Speechocean](en.speechocean.com), a data annotation service provider, and accomplished with the help of Microsoft Research Asia.
29
+ * introduce a new NLI task. The NLI version of the dataset is converted from the MRC version, following previous work such as [Transforming Question Answering Datasets into Natural Language Inference Datasets](https://arxiv.org/abs/1809.02922); a conversion sketch is given after this list.
30
+
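+ A minimal sketch of the QA-to-NLI conversion idea (an illustration under our own assumptions, not the authors' exact pipeline; it uses the MRC fields shown in the Datasets section below): each (passage, question, option) triple becomes one premise/hypothesis pair, labeled "entailed" only for the annotated answer. The released QA2NLI files additionally split the premise into major and minor premises.
+
+ ```
+ import json
+
+ def mrc_to_nli(mrc_line: str):
+     """Turn one LogiQA2.0 MRC example (a json line) into NLI-style pairs."""
+     ex = json.loads(mrc_line)
+     pairs = []
+     for i, option in enumerate(ex["options"]):
+         pairs.append({
+             "premise": ex["text"],                       # the passage
+             "hypothesis": f'{ex["question"]} {option}',  # question + candidate answer
+             "label": "entailed" if i == ex["answer"] else "not entailed",
+         })
+     return pairs
+
+ with open("logiqa/DATA/LOGIQA/test.txt", encoding="utf-8") as f:
+     nli_pairs = [p for line in f if line.strip() for p in mrc_to_nli(line)]
+ print(nli_pairs[0])
+ ```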
31
+ ## Datasets
32
+ ### MRC
33
+ The MRC part of the LogiQA2.0 dataset can be found in the `/logiqa/DATA/LOGIQA` folder.
34
+
35
+ `train.txt`: train split of the dataset in json lines.
36
+
37
+ `dev.txt`: dev split of the dataset in json lines.
38
+
39
+ `test.txt`: test split of the dataset in json lines.
40
+
41
+ `train_zh.txt`: train split of the Chinese version of dataset in json lines.
42
+
43
+ `dev_zh.txt`: dev split of the Chinese version of dataset in json lines.
44
+
45
+ `test_zh.txt`: test split of the Chinese version of dataset in json lines.
46
+
47
+ `train_fol.zip` is the training data with AMR and FOL annotations. The file is too big so we compressed it.
48
+
49
+ `dev_fol.jsonl` is the dev data with AMR and FOL annotations.
50
+
51
+ `test_fol.jsonl` is the test data with AMR and FOL annotations.
52
+
53
+
54
+ An example:
55
+ ```
56
+ {"id": 10471, "answer": 0, "text": "The medieval Arabs had many manuscripts of the ancient Greek. When needed, they translate them into Arabic. Medieval Arab philosophers were very interested in Aristotle's Theory of Poetry, which was obviously not shared by Arab poets, because a poet interested in it must want to read Homer's poems. Aristotle himself often quotes Homer's poems. However, Homer's poems were not translated into Arabic until modern times.", "question": "Which of the following options, if true, strongly supports the above argument?", "options": ["Some medieval Arab translators have manuscripts of Homer poems in ancient Greek.", "Aristotle's Theory of Poetry is often quoted and commented by modern Arab poets.", "In Aristotle's Theory of Poetry, most of the content is related to drama, and medieval Arabs also wrote plays and performed them.", "A series of medieval Arab stories, such as Arab Night, are very similar to some parts of Homer's epic."], "type": {"Sufficient Conditional Reasoning": true, "Necessry Condtional Reasoning": true, "Conjunctive Reasoning": true}}
57
+ ```
58
+ An example of the Chinese dataset:
59
+ ```
60
+ {"id": 8018, "answer": 0, "text": "常春藤通常指美国东部的八所大学。常春藤一词一直以来是美国名校的代名词,这八所大学不仅历史悠久,治学严谨,而且教学质量极高。这些学校的毕业生大多成为社会精英,他们中的多数人年薪超过20万美元,有很多政界领袖来自常春藤,更有为数众多的科学家毕业于长春藤。", "question": "根据以上条件,下面那个选项一定为真:", "options": ["A.有些社会精英年薪超过20万美金", "B.有些政界领袖年薪不足20万美元", "C.有些科学家年薪超过20万美元", "D.有些政界领袖是社会精英"]}
61
+ ```
62
+
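+ The splits are plain json-lines files and can be inspected without special tooling. A small loading sketch (field names taken from the example above; the path assumes the repository layout described in this section):
+
+ ```
+ import json
+
+ def load_jsonl(path):
+     """Read a json-lines split into a list of dicts."""
+     with open(path, encoding="utf-8") as f:
+         return [json.loads(line) for line in f if line.strip()]
+
+ train = load_jsonl("logiqa/DATA/LOGIQA/train.txt")
+ print(len(train), "training examples")
+
+ ex = train[0]
+ print(ex["question"])
+ print(ex["options"][ex["answer"]])  # 'answer' is a 0-based index into 'options'
+ ```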
63
+ ### NLI
64
+ The NLI part of the LogiQA2.0 dataset can be found in the `/logiqa2nli/DATA/QA2NLI` folder.
65
+
66
+ `train.txt`: train split of the dataset in json lines
67
+
68
+ `dev.txt`: dev split of the dataset in json lines
69
+
70
+ `test.txt`: test split of the dataset in json lines
71
+
72
+ An example:
73
+ ```
74
+ {"label": "not entailed", "major_premise": ["Among the employees of a software company, there are three Cantonese, one Beijinger and three northerners"], "minor_premise": " Four are only responsible for software development and two are only responsible for product sales", "conclusion": "There may be at least 7 people and at most 12 people."}
75
+ ```
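+ To feed these examples to a sentence-pair classifier, one straightforward option (an illustration, not necessarily what `run_nli.py` does) is to join the premises into a single premise string and use the conclusion as the hypothesis:
+
+ ```
+ import json
+
+ def to_pair(example: dict):
+     """Flatten a QA2NLI example into (premise, hypothesis, label)."""
+     premise = " ".join(example["major_premise"]) + " " + example["minor_premise"].strip()
+     hypothesis = example["conclusion"]
+     return premise, hypothesis, example["label"]
+
+ with open("logiqa2nli/DATA/QA2NLI/dev.txt", encoding="utf-8") as f:
+     pairs = [to_pair(json.loads(line)) for line in f if line.strip()]
+ print(pairs[0])
+ ```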
76
+ ## Annotations
77
+ The translation and annotation work was outsourced to [Speechocean](en.speechocean.com); the project was funded by Microsoft Research Asia.
78
+ ### Translation
79
+
80
+ | Final Report | |
81
+ | --- | --- |
82
+ | Provider | Speechocean |
83
+ | Project Duration | 2021/10/20-2021/12/3 |
84
+ | Actual Working Hours | 667 |
85
+ | Cost | 45000 RMB |
86
+
87
+ Translation style/method:
88
+
89
+ 1. Maintain a unified style; the translated English questions need to inherit the logic of the original questions;
90
+
91
+ 2. Pronouns in the questions need to be translated uniquely and consistently, without ambiguity;
92
+
93
+ 3. The translated English should take the form of a proper question, that is, a question that is clear from the perspective of the respondent.
94
+
95
+ ### Label consistency
96
+ Label credibility is manually verified after translation to preserve the meaning of the original text. Three workers run a consistency test on each example; if two or more workers give an answer different from the original answer, the translation is redone to guarantee that the label is correct (the decision rule is sketched below).
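+ The rule can be stated compactly (a sketch of the check described above, with hypothetical worker answers):
+
+ ```
+ def needs_retranslation(worker_answers, original_answer):
+     """Redo the translation if 2 or more of the 3 checkers disagree with the original label."""
+     disagreements = sum(a != original_answer for a in worker_answers)
+     return disagreements >= 2
+
+ print(needs_retranslation([0, 2, 0], original_answer=0))  # False: only one checker disagrees
+ print(needs_retranslation([1, 2, 0], original_answer=0))  # True: two checkers disagree
+ ```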
97
+
98
+ ### Additional annotations
99
+ The reasoning types of each question are assigned by a total of 5 workers, each of whom is responsible for one reasoning type. We give the workers the descriptions of the reasoning types (which can be found in our paper). The reasoning types of a question are then the collection of the 5 workers' decisions, as illustrated below.
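+ The outcome of this process is the `type` field shown in the MRC example above. Conceptually it is just the collection of the per-worker judgements, e.g. (hypothetical decisions and type names for illustration; the authoritative list of reasoning types is given in the paper):
+
+ ```
+ # One worker per reasoning type judges whether that type is used in the question.
+ worker_votes = {
+     "Sufficient Conditional Reasoning": True,
+     "Necessary Conditional Reasoning": True,
+     "Conjunctive Reasoning": True,
+     "Disjunctive Reasoning": False,
+     "Categorical Reasoning": False,
+ }
+ # Keep the positive judgements, mirroring the "type" field of an MRC example.
+ type_field = {k: v for k, v in worker_votes.items() if v}
+ print(type_field)
+ ```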
100
+ ## Baseline Guidance
101
+ ### Requirements
102
+ * python 3.6+
103
+ * pytorch 1.0+
104
+ * transformers 2.4.1
105
+ * sklearn
106
+ * tqdm
107
+ * tensorboardX
108
+
109
+ We recommend using conda to manage virtual environments:
110
+
111
+ ```
112
+ conda env update --name logiqa --file requirements.yml
113
+ ```
114
+ ### Logiqa
115
+ The folder `logiqa` contains both the code and data to run baseline experiments of LogiQA2.0 MRC.
116
+
117
+ To fine-tune a model on the dataset, type the following command in the terminal of your :computer:
118
+ ```
119
+ bash logiqa.sh
120
+ ```
121
+ ### Logiqa2nli
122
+ The folder `logiqa2nli` contains both the code and data to run baseline experiments of LogiQA2.0 NLI.
123
+
124
+ To fine-tune a model on the dataset, type the following command in the terminal of your :computer:
125
+ ```
126
+ bash qa2nli.sh
127
+ ```
128
+ Note: `./scripts` contains the scripts for running other NLI benchmarks.
129
+
131
+ ## Acknowledgment
132
+ We appreciate the suggestions and critical questions from the reviewers.
datasets/LogiQA2.0/logiqa/DATA/LOGIQA/datasource.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c25927ef6b1229b4957b55b65e8bade028a26ba982f62ce0c7d0e9dcf447da29
3
+ size 315
datasets/LogiQA2.0/logiqa/DATA/LOGIQA/dev.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbefb563b7ddc02640ccdc314c1315d5727dba48539d0ecdd126fa351e511b09
3
+ size 1770764
datasets/LogiQA2.0/logiqa/DATA/LOGIQA/dev_fol.jsonl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9f31a32324e5147b4fe1e963476c012a2c997b46899214c7b5639c7c4ef3c2f
3
+ size 16840405
datasets/LogiQA2.0/logiqa/DATA/LOGIQA/dev_zh.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a72a23160c9e12e15ea8c13e57af5032a7c37157573ebdd7e7c8e0ad34aef780
3
+ size 1202597
datasets/LogiQA2.0/logiqa/DATA/LOGIQA/ood_test.jsonl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85aeb366be14f4180af021356cc07758e8a13669a4253753cd86f22f1f46dfff
3
+ size 668323
datasets/LogiQA2.0/logiqa/DATA/LOGIQA/readme.md ADDED
@@ -0,0 +1,7 @@
1
+ # LogiQA 2.0 dataset
2
+
3
+ `train_fol.zip` is the training data with AMR and FOL annotations. The file is too big so we compressed it.
4
+
5
+ `dev_fol.jsonl` is the dev data with AMR and FOL annotations.
6
+
7
+ `test_fol.jsonl` is the test data with AMR and FOL annotations.
datasets/LogiQA2.0/logiqa/DATA/LOGIQA/statistics.py ADDED
@@ -0,0 +1,21 @@
1
+ import json
2
+
3
+ with open('test.txt', 'r') as f:
4
+ file = f.readlines()
5
+ n = 1
6
+ l = 0
7
+ max = 0
8
+ for line in file:
9
+
10
+ line = json.loads(line)
11
+ text = line['options']
12
+ for option in text:
13
+ s = 0
14
+ l = l + len(option.split(" "))
15
+ s = s + len(option.split(" "))
16
+ n += 1
17
+ if s >= max:
18
+ max = s
19
+ result = l/n
20
+ print(result)
21
+ print(max)
datasets/LogiQA2.0/logiqa/DATA/LOGIQA/test.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71940b37ae0184b677c253a148d57ad4e75d6113447b1563c2ca82483e4e4f8d
3
+ size 1740565
datasets/LogiQA2.0/logiqa/DATA/LOGIQA/test_fol.jsonl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0e736cf1bf24560ae188c507e661b6e041d3661b1975dbce561c8464f31f486
3
+ size 16807118
datasets/LogiQA2.0/logiqa/DATA/LOGIQA/test_zh.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a8db83ccb3ebdc8d5b3886fd0ad9346c7e565722d2d592987b24dd57f251853
3
+ size 1182510
datasets/LogiQA2.0/logiqa/DATA/LOGIQA/train.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98eb412e8ed53b3d65da5ef75b00b7a0bbdea7970c05ad699291a2a0510922de
3
+ size 14045351
datasets/LogiQA2.0/logiqa/DATA/LOGIQA/train_fol.zip ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25ce1665338ee99d3aedc4835abf134ea03263861380bba6173c22fed13fcc24
3
+ size 23744378
datasets/LogiQA2.0/logiqa/DATA/LOGIQA/train_zh.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d87a15811cda64cb021d43cb9bc1d282424a8dfb8e35e6a7f6d6a0b36b38a54e
3
+ size 9581270
datasets/LogiQA2.0/logiqa/DATA/LOGIQA/word_matching.py ADDED
@@ -0,0 +1,26 @@
1
+ import json
2
+ with open('test.txt') as f:
3
+ file = f.readlines()
4
+ n = 0
5
+ l = 0
6
+ for line in file:
7
+ d = json.loads(line)
8
+ label = d['answer']
9
+ text = d['text']
10
+ options = d['options']
11
+ text_vocab = set(text.split(' '))
12
+ ratio = []
13
+ for option in options:
14
+ option_vocab = set(option.split(' '))
15
+ intersection = text_vocab.intersection(option_vocab)
16
+ ratio.append(len(intersection)/len(text_vocab))
17
+ value_prev = 0
18
+ for value in ratio:
19
+ if value >= value_prev:
20
+ value_prev = value
21
+ index = ratio.index(value_prev)
22
+ if index == label:
23
+ l += 1
24
+ n += 1
25
+ result = l/n
26
+ print(result)
datasets/LogiQA2.0/logiqa/logiqa.sh ADDED
@@ -0,0 +1,21 @@
1
+ export DATA_DIR=./DATA
2
+ export TASK_NAME=LOGIQA
3
+
4
+ CUDA_VISIBLE_DEVICES=0,1 python run_mrc.py \
5
+ --model_type bert \
6
+ --model_name_or_path bert-base-uncased \
7
+ --task_name $TASK_NAME \
8
+ --do_train \
9
+ --do_eval \
10
+ --do_lower_case \
11
+ --data_dir $DATA_DIR/$TASK_NAME \
12
+ --max_seq_length 256 \
13
+ --per_gpu_eval_batch_size=4 \
14
+ --per_gpu_train_batch_size=4 \
15
+ --gradient_accumulation_steps 2\
16
+ --learning_rate 1e-5 \
17
+ --num_train_epochs 10.0 \
18
+ --logging_steps 5000 \
19
+ --save_steps 5000 \
20
+ --output_dir ./tmp/$TASK_NAME/ \
21
+ --overwrite_output_dir \
datasets/LogiQA2.0/logiqa/modeling_bart.py ADDED
@@ -0,0 +1,1416 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch BART model, ported from the fairseq repo."""
16
+ import math
17
+ import random
18
+ import warnings
19
+ from typing import Dict, List, Optional, Tuple
20
+
21
+ import numpy as np
22
+ import torch
23
+ import torch.nn.functional as F
24
+ from torch import Tensor, nn
25
+ from torch.nn import CrossEntropyLoss
26
+
27
+ from transformers.activations import ACT2FN
28
+ from transformers.configuration_bart import BartConfig
29
+ from transformers.file_utils import (
30
+ add_code_sample_docstrings,
31
+ add_end_docstrings,
32
+ add_start_docstrings,
33
+ add_start_docstrings_to_callable,
34
+ replace_return_docstrings,
35
+ )
36
+ from transformers.modeling_outputs import (
37
+ BaseModelOutput,
38
+ BaseModelOutputWithPast,
39
+ Seq2SeqLMOutput,
40
+ Seq2SeqModelOutput,
41
+ Seq2SeqQuestionAnsweringModelOutput,
42
+ Seq2SeqSequenceClassifierOutput,
43
+ )
44
+ from transformers.modeling_utils import PreTrainedModel
45
+ from transformers.utils import logging
46
+
47
+
48
+ logger = logging.get_logger(__name__)
49
+
50
+ _CONFIG_FOR_DOC = "BartConfig"
51
+ _TOKENIZER_FOR_DOC = "BartTokenizer"
52
+
53
+
54
+ BART_PRETRAINED_MODEL_ARCHIVE_LIST = [
55
+ "facebook/bart-base",
56
+ "facebook/bart-large",
57
+ "facebook/bart-large-mnli",
58
+ "facebook/bart-large-cnn",
59
+ "facebook/bart-large-xsum",
60
+ "facebook/mbart-large-en-ro",
61
+ ]
62
+ # This list is incomplete. See all BART models at https://huggingface.co/models?filter=bart
63
+
64
+
65
+ BART_START_DOCSTRING = r"""
66
+
67
+ This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class. Use it as a regular PyTorch Module and
68
+ refer to the PyTorch documentation for all matters related to general usage and behavior.
69
+
70
+ Parameters:
71
+ config (:class:`~transformers.BartConfig`): Model configuration class with all the parameters of the model.
72
+ Initializing with a config file does not load the weights associated with the model, only the configuration.
73
+ Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
74
+
75
+ """
76
+ BART_GENERATION_EXAMPLE = r"""
77
+ Summarization example::
78
+
79
+ from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
80
+
81
+ # see ``examples/summarization/bart/run_eval.py`` for a longer example
82
+ model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
83
+ tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
84
+
85
+ ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
86
+ inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
87
+
88
+ # Generate Summary
89
+ summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
90
+ print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])
91
+
92
+ """
93
+
94
+ BART_INPUTS_DOCSTRING = r"""
95
+ Args:
96
+ input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
97
+ Indices of input sequence tokens in the vocabulary. Use BartTokenizer.encode to produce them.
98
+ Padding will be ignored by default should you provide it.
99
+ Indices can be obtained using :class:`transformers.BartTokenizer.encode(text)`.
100
+ attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
101
+ Mask to avoid performing attention on padding token indices in input_ids.
102
+ Mask values selected in ``[0, 1]``:
103
+ ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
104
+ encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`, defaults to :obj:`None`):
105
+ Tuple consists of (`last_hidden_state`, `optional`: `hidden_states`, `optional`: `attentions`)
106
+ `last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`) is a sequence of hidden-states at the output of the last layer of the encoder.
107
+ Used in the cross-attention of the decoder.
108
+ decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`, defaults to :obj:`None`):
109
+ Provide for translation and summarization training. By default, the model will create this tensor by shifting the input_ids right, following the paper.
110
+ decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`, defaults to :obj:`None`):
111
+ Default behavior: generate a tensor that ignores pad tokens in decoder_input_ids. Causal mask will also be used by default.
112
+ If you want to change padding behavior, you should read :func:`~transformers.modeling_bart._prepare_decoder_inputs` and modify.
113
+ See diagram 1 in the paper for more info on the default strategy
114
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
115
+ Contains pre-computed key and value hidden-states of the attention blocks.
116
+ Can be used to speed up decoding.
117
+ If ``past_key_values`` are used, the user can optionally input only the last
118
+ ``decoder_input_ids`` (those that don't have their past key value states given to this model) of shape
119
+ :obj:`(batch_size, 1)` instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
120
+ use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
121
+ If `use_cache` is True, ``past_key_values`` are returned and can be used to speed up decoding (see
122
+ ``past_key_values``).
123
+ output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
124
+ If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
125
+ output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`):
126
+ If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail.
127
+ return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`):
128
+ If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a
129
+ plain tuple.
130
+ """
131
+
132
+
133
+ def invert_mask(attention_mask):
134
+ """Turns 1->0, 0->1, False->True, True-> False"""
135
+ assert attention_mask.dim() == 2
136
+ return attention_mask.eq(0)
137
+
138
+
139
+ def _prepare_bart_decoder_inputs(
140
+ config, input_ids, decoder_input_ids=None, decoder_padding_mask=None, causal_mask_dtype=torch.float32
141
+ ):
142
+ """Prepare masks that ignore padding tokens in the decoder and a causal mask for the decoder if
143
+ none are provided. This mimics the default behavior in fairseq. To override it pass in masks.
144
+ Note: this is not called during generation
145
+ """
146
+ pad_token_id = config.pad_token_id
147
+ if decoder_input_ids is None:
148
+ decoder_input_ids = shift_tokens_right(input_ids, pad_token_id)
149
+ bsz, tgt_len = decoder_input_ids.size()
150
+ if decoder_padding_mask is None:
151
+ decoder_padding_mask = make_padding_mask(decoder_input_ids, pad_token_id)
152
+ else:
153
+ decoder_padding_mask = invert_mask(decoder_padding_mask)
154
+ if decoder_padding_mask is not None and decoder_padding_mask.shape[1] > 1:
155
+ # never mask leading token, even if it is pad
156
+ decoder_padding_mask[:, 0] = decoder_padding_mask[:, 1]
157
+ causal_mask = torch.triu(fill_with_neg_inf(torch.zeros(tgt_len, tgt_len)), 1).to(
158
+ dtype=causal_mask_dtype, device=decoder_input_ids.device
159
+ )
160
+ return decoder_input_ids, decoder_padding_mask, causal_mask
161
+
162
+
163
+ class PretrainedBartModel(PreTrainedModel):
164
+ config_class = BartConfig
165
+ base_model_prefix = "model"
166
+
167
+ def _init_weights(self, module):
168
+ std = self.config.init_std
169
+ if isinstance(module, nn.Linear):
170
+ module.weight.data.normal_(mean=0.0, std=std)
171
+ if module.bias is not None:
172
+ module.bias.data.zero_()
173
+ elif isinstance(module, SinusoidalPositionalEmbedding):
174
+ pass
175
+ elif isinstance(module, nn.Embedding):
176
+ module.weight.data.normal_(mean=0.0, std=std)
177
+ if module.padding_idx is not None:
178
+ module.weight.data[module.padding_idx].zero_()
179
+
180
+ @property
181
+ def dummy_inputs(self):
182
+ pad_token = self.config.pad_token_id
183
+ input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
184
+ dummy_inputs = {
185
+ "attention_mask": input_ids.ne(pad_token),
186
+ "input_ids": input_ids,
187
+ }
188
+ return dummy_inputs
189
+
190
+
191
+ def _make_linear_from_emb(emb):
192
+ vocab_size, emb_size = emb.weight.shape
193
+ lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
194
+ lin_layer.weight.data = emb.weight.data
195
+ return lin_layer
196
+
197
+
198
+ # Helper Functions, mostly for making masks
199
+ def _check_shapes(shape_1, shape2):
200
+ if shape_1 != shape2:
201
+ raise AssertionError("shape mismatch: {} != {}".format(shape_1, shape2))
202
+
203
+
204
+ def shift_tokens_right(input_ids, pad_token_id):
205
+ """Shift input ids one token to the right, and wrap the last non pad token (usually <eos>)."""
206
+ prev_output_tokens = input_ids.clone()
207
+ index_of_eos = (input_ids.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
208
+ prev_output_tokens[:, 0] = input_ids.gather(1, index_of_eos).squeeze()
209
+ prev_output_tokens[:, 1:] = input_ids[:, :-1]
210
+ return prev_output_tokens
211
+
212
+
213
+ def make_padding_mask(input_ids, padding_idx=1):
214
+ """True for pad tokens"""
215
+ padding_mask = input_ids.eq(padding_idx)
216
+ if not padding_mask.any():
217
+ padding_mask = None
218
+ return padding_mask
219
+
220
+
221
+ # Adapter
222
+ class Adapter(nn.Module):
223
+ def __init__(self, config):
224
+ super(Adapter, self).__init__()
225
+ self.down_project = nn.Linear(config.hidden_size, config.adapter_size)
226
+ self.activation = ACT2FN[config.adapter_act] \
227
+ if isinstance(config.adapter_act, str) else config.adapter_act
228
+ self.up_project = nn.Linear(config.adapter_size, config.hidden_size)
229
+ self.init_weights(config)
230
+
231
+ def forward(self, hidden_states):
232
+ down_projected = self.down_project(hidden_states)
233
+ activated = self.activation(down_projected)
234
+ up_projected = self.up_project(activated)
235
+ return hidden_states + up_projected
236
+
237
+ def init_weights(self, config):
238
+ # Slightly different from the TF version which uses truncated_normal for initialization
239
+ # cf https://github.com/pytorch/pytorch/pull/5617
240
+ self.down_project.weight.data.normal_(mean=0.0, std=config.adapter_initializer_range)
241
+ self.down_project.bias.data.zero_()
242
+ self.up_project.weight.data.normal_(mean=0.0, std=config.adapter_initializer_range)
243
+ self.up_project.bias.data.zero_()
244
+
245
+ # Helper Modules
246
+
247
+
248
+ class EncoderLayer(nn.Module):
249
+ def __init__(self, config: BartConfig):
250
+ super().__init__()
251
+ self.embed_dim = config.d_model
252
+ self.self_attn = Attention(self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout)
253
+ self.normalize_before = config.normalize_before
254
+ self.self_attn_layer_norm = LayerNorm(self.embed_dim)
255
+ self.dropout = config.dropout
256
+ self.activation_fn = ACT2FN[config.activation_function]
257
+ self.activation_dropout = config.activation_dropout
258
+ self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
259
+ self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
260
+ self.final_layer_norm = LayerNorm(self.embed_dim)
261
+
262
+ config.adapter_size = 256
263
+ config.adapter_act = "gelu"
264
+ config.adapter_initializer_range=0.0002
265
+ self.adapter = Adapter(config)
266
+
267
+ def forward(self, x, encoder_padding_mask, output_attentions=False):
268
+ """
269
+ Args:
270
+ x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
271
+ encoder_padding_mask (ByteTensor): binary ByteTensor of shape
272
+ `(batch, src_len)` where padding elements are indicated by ``1``.
273
+ for t_tgt, t_src is excluded (or masked out), =0 means it is
274
+ included in attention
275
+
276
+ Returns:
277
+ encoded output of shape `(seq_len, batch, embed_dim)`
278
+ """
279
+ residual = x
280
+ if self.normalize_before:
281
+ x = self.self_attn_layer_norm(x)
282
+ x, attn_weights = self.self_attn(
283
+ query=x, key=x, key_padding_mask=encoder_padding_mask, output_attentions=output_attentions
284
+ )
285
+ x = F.dropout(x, p=self.dropout, training=self.training)
286
+
287
+ # add adapter
288
+ x = self.adapter(x)
289
+
290
+ x = residual + x
291
+ if not self.normalize_before:
292
+ x = self.self_attn_layer_norm(x)
293
+
294
+ residual = x
295
+ if self.normalize_before:
296
+ x = self.final_layer_norm(x)
297
+ x = self.activation_fn(self.fc1(x))
298
+ x = F.dropout(x, p=self.activation_dropout, training=self.training)
299
+ x = self.fc2(x)
300
+ x = F.dropout(x, p=self.dropout, training=self.training)
301
+ x = residual + x
302
+ if not self.normalize_before:
303
+ x = self.final_layer_norm(x)
304
+ return x, attn_weights
305
+
306
+
307
+ class BartEncoder(nn.Module):
308
+ """
309
+ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer
310
+ is a :class:`EncoderLayer`.
311
+
312
+ Args:
313
+ config: BartConfig
314
+ """
315
+
316
+ def __init__(self, config: BartConfig, embed_tokens):
317
+ super().__init__()
318
+
319
+ self.dropout = config.dropout
320
+ self.layerdrop = config.encoder_layerdrop
321
+
322
+ embed_dim = embed_tokens.embedding_dim
323
+ self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
324
+ self.padding_idx = embed_tokens.padding_idx
325
+ self.max_source_positions = config.max_position_embeddings
326
+
327
+ self.embed_tokens = embed_tokens
328
+ if config.static_position_embeddings:
329
+ self.embed_positions = SinusoidalPositionalEmbedding(
330
+ config.max_position_embeddings, embed_dim, self.padding_idx
331
+ )
332
+ else:
333
+ self.embed_positions = LearnedPositionalEmbedding(
334
+ config.max_position_embeddings,
335
+ embed_dim,
336
+ self.padding_idx,
337
+ config.extra_pos_embeddings,
338
+ )
339
+ self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.encoder_layers)])
340
+ self.layernorm_embedding = LayerNorm(embed_dim) if config.normalize_embedding else nn.Identity()
341
+ # mbart has one extra layer_norm
342
+ self.layer_norm = LayerNorm(config.d_model) if config.normalize_before else None
343
+
344
+ def forward(
345
+ self, input_ids, attention_mask=None, output_attentions=False, output_hidden_states=False, return_dict=False
346
+ ):
347
+ """
348
+ Args:
349
+ input_ids (LongTensor): tokens in the source language of shape
350
+ `(batch, src_len)`
351
+ attention_mask (torch.LongTensor): indicating which indices are padding tokens.
352
+ Returns:
353
+ BaseModelOutput or Tuple comprised of:
354
+ - **x** (Tensor): the last encoder layer's output of
355
+ shape `(src_len, batch, embed_dim)`
356
+ - **encoder_states** (tuple(torch.FloatTensor)): all intermediate
357
+ hidden states of shape `(src_len, batch, embed_dim)`.
358
+ Only populated if *output_hidden_states:* is True.
359
+ - **all_attentions** (tuple(torch.FloatTensor)): Attention weights for each layer.
360
+ During training might not be of length n_layers because of layer dropout.
361
+ """
362
+ # check attention mask and invert
363
+ if attention_mask is not None:
364
+ attention_mask = invert_mask(attention_mask)
365
+
366
+ inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
367
+ embed_pos = self.embed_positions(input_ids)
368
+ x = inputs_embeds + embed_pos
369
+ x = self.layernorm_embedding(x)
370
+ x = F.dropout(x, p=self.dropout, training=self.training)
371
+
372
+ # B x T x C -> T x B x C
373
+ x = x.transpose(0, 1)
374
+
375
+ encoder_states = [] if output_hidden_states else None
376
+ all_attentions = () if output_attentions else None
377
+ for encoder_layer in self.layers:
378
+ if output_hidden_states:
379
+ encoder_states.append(x)
380
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
381
+ dropout_probability = random.uniform(0, 1)
382
+ if self.training and (dropout_probability < self.layerdrop): # skip the layer
383
+ attn = None
384
+ else:
385
+ x, attn = encoder_layer(x, attention_mask, output_attentions=output_attentions)
386
+
387
+ if output_attentions:
388
+ all_attentions = all_attentions + (attn,)
389
+
390
+ if self.layer_norm:
391
+ x = self.layer_norm(x)
392
+ if output_hidden_states:
393
+ encoder_states.append(x)
394
+ # T x B x C -> B x T x C
395
+ encoder_states = tuple(hidden_state.transpose(0, 1) for hidden_state in encoder_states)
396
+
397
+ # T x B x C -> B x T x C
398
+ x = x.transpose(0, 1)
399
+
400
+ if not return_dict:
401
+ return tuple(v for v in [x, encoder_states, all_attentions] if v is not None)
402
+ return BaseModelOutput(last_hidden_state=x, hidden_states=encoder_states, attentions=all_attentions)
403
+
404
+
405
+ class DecoderLayer(nn.Module):
406
+ def __init__(self, config: BartConfig):
407
+ super().__init__()
408
+ self.embed_dim = config.d_model
409
+
410
+ self.self_attn = Attention(
411
+ embed_dim=self.embed_dim,
412
+ num_heads=config.decoder_attention_heads,
413
+ dropout=config.attention_dropout,
414
+ )
415
+ self.dropout = config.dropout
416
+ self.activation_fn = ACT2FN[config.activation_function]
417
+ self.activation_dropout = config.activation_dropout
418
+ self.normalize_before = config.normalize_before
419
+
420
+ self.self_attn_layer_norm = LayerNorm(self.embed_dim)
421
+ self.encoder_attn = Attention(
422
+ self.embed_dim,
423
+ config.decoder_attention_heads,
424
+ dropout=config.attention_dropout,
425
+ encoder_decoder_attention=True,
426
+ )
427
+ self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
428
+ self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
429
+ self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
430
+ self.final_layer_norm = LayerNorm(self.embed_dim)
431
+ config.adapter_size = 256
432
+ config.adapter_act = "gelu"
433
+ config.adapter_initializer_range=0.0002
434
+ self.adapter = Adapter(config)
435
+
436
+ def forward(
437
+ self,
438
+ x,
439
+ encoder_hidden_states,
440
+ encoder_attn_mask=None,
441
+ layer_state=None,
442
+ causal_mask=None,
443
+ decoder_padding_mask=None,
444
+ output_attentions=False,
445
+ ):
446
+ residual = x
447
+
448
+ if layer_state is None:
449
+ layer_state = {}
450
+ if self.normalize_before:
451
+ x = self.self_attn_layer_norm(x)
452
+ # Self Attention
453
+
454
+ x, self_attn_weights = self.self_attn(
455
+ query=x,
456
+ key=x,
457
+ layer_state=layer_state, # adds keys to layer state
458
+ key_padding_mask=decoder_padding_mask,
459
+ attn_mask=causal_mask,
460
+ output_attentions=output_attentions,
461
+ )
462
+ x = F.dropout(x, p=self.dropout, training=self.training)
463
+
464
+ x = self.adapter(x)
465
+
466
+ x = residual + x
467
+ if not self.normalize_before:
468
+ x = self.self_attn_layer_norm(x)
469
+
470
+ # Cross attention
471
+ residual = x
472
+ assert self.encoder_attn.cache_key != self.self_attn.cache_key
473
+ if self.normalize_before:
474
+ x = self.encoder_attn_layer_norm(x)
475
+ x, _ = self.encoder_attn(
476
+ query=x,
477
+ key=encoder_hidden_states,
478
+ key_padding_mask=encoder_attn_mask,
479
+ layer_state=layer_state, # mutates layer state
480
+ )
481
+ x = F.dropout(x, p=self.dropout, training=self.training)
482
+ x = residual + x
483
+ if not self.normalize_before:
484
+ x = self.encoder_attn_layer_norm(x)
485
+
486
+ # Fully Connected
487
+ residual = x
488
+ if self.normalize_before:
489
+ x = self.final_layer_norm(x)
490
+ x = self.activation_fn(self.fc1(x))
491
+ x = F.dropout(x, p=self.activation_dropout, training=self.training)
492
+ x = self.fc2(x)
493
+ x = F.dropout(x, p=self.dropout, training=self.training)
494
+ x = residual + x
495
+ if not self.normalize_before:
496
+ x = self.final_layer_norm(x)
497
+ return (
498
+ x,
499
+ self_attn_weights,
500
+ layer_state,
501
+ ) # just self_attn weights for now, following t5, layer_state = cache for decoding
502
+
503
+
504
+ class BartDecoder(nn.Module):
505
+ """
506
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer
507
+ is a :class:`DecoderLayer`.
508
+ Args:
509
+ config: BartConfig
510
+ embed_tokens (torch.nn.Embedding): output embedding
511
+ """
512
+
513
+ def __init__(self, config: BartConfig, embed_tokens: nn.Embedding):
514
+ super().__init__()
515
+ self.dropout = config.dropout
516
+ self.layerdrop = config.decoder_layerdrop
517
+ self.padding_idx = embed_tokens.padding_idx
518
+ self.max_target_positions = config.max_position_embeddings
519
+ self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
520
+ self.embed_tokens = embed_tokens
521
+ if config.static_position_embeddings:
522
+ self.embed_positions = SinusoidalPositionalEmbedding(
523
+ config.max_position_embeddings, config.d_model, config.pad_token_id
524
+ )
525
+ else:
526
+ self.embed_positions = LearnedPositionalEmbedding(
527
+ config.max_position_embeddings,
528
+ config.d_model,
529
+ self.padding_idx,
530
+ config.extra_pos_embeddings,
531
+ )
532
+ self.layers = nn.ModuleList(
533
+ [DecoderLayer(config) for _ in range(config.decoder_layers)]
534
+ ) # type: List[DecoderLayer]
535
+ self.layernorm_embedding = LayerNorm(config.d_model) if config.normalize_embedding else nn.Identity()
536
+ self.layer_norm = LayerNorm(config.d_model) if config.add_final_layer_norm else None
537
+
538
+ def forward(
539
+ self,
540
+ input_ids,
541
+ encoder_hidden_states,
542
+ encoder_padding_mask,
543
+ decoder_padding_mask,
544
+ decoder_causal_mask,
545
+ past_key_values=None,
546
+ use_cache=False,
547
+ output_attentions=False,
548
+ output_hidden_states=False,
549
+ return_dict=False,
550
+ **unused,
551
+ ):
552
+ """
553
+ Includes several features from "Jointly Learning to Align and
554
+ Translate with Transformer Models" (Garg et al., EMNLP 2019).
555
+
556
+ Args:
557
+ input_ids (LongTensor): previous decoder outputs of shape
558
+ `(batch, tgt_len)`, for teacher forcing
559
+ encoder_hidden_states: output from the encoder, used for
560
+ encoder-side attention
561
+ encoder_padding_mask: for ignoring pad tokens
562
+ past_key_values (dict or None): dictionary used for storing state during generation
563
+
564
+ Returns:
565
+ BaseModelOutputWithPast or tuple:
566
+ - the decoder's features of shape `(batch, tgt_len, embed_dim)`
567
+ - the cache
568
+ - hidden states
569
+ - attentions
570
+ """
571
+ if "decoder_cached_states" in unused:
572
+ warnings.warn(
573
+ "The `decoder_cached_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
574
+ FutureWarning,
575
+ )
576
+ past_key_values = unused.pop("decoder_cached_states")
577
+ if "decoder_past_key_values" in unused:
578
+ warnings.warn(
579
+ "The `decoder_past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
580
+ FutureWarning,
581
+ )
582
+ past_key_values = unused.pop("decoder_past_key_values")
583
+
584
+ # check attention mask and invert
585
+ if encoder_padding_mask is not None:
586
+ encoder_padding_mask = invert_mask(encoder_padding_mask)
587
+
588
+ # embed positions
589
+ positions = self.embed_positions(input_ids, use_cache=use_cache)
590
+
591
+ if use_cache:
592
+ input_ids = input_ids[:, -1:]
593
+ positions = positions[:, -1:] # happens after we embed them
594
+ # assert input_ids.ne(self.padding_idx).any()
595
+
596
+ x = self.embed_tokens(input_ids) * self.embed_scale
597
+ x += positions
598
+ x = self.layernorm_embedding(x)
599
+ x = F.dropout(x, p=self.dropout, training=self.training)
600
+
601
+ # Convert to Bart output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim)
602
+ x = x.transpose(0, 1)
603
+ encoder_hidden_states = encoder_hidden_states.transpose(0, 1)
604
+
605
+ # decoder layers
606
+ all_hidden_states = () if output_hidden_states else None
607
+ all_self_attns = () if output_attentions else None
608
+ next_decoder_cache = []
609
+ for idx, decoder_layer in enumerate(self.layers):
610
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
611
+ if output_hidden_states:
612
+ all_hidden_states += (x,)
613
+ dropout_probability = random.uniform(0, 1)
614
+ if self.training and (dropout_probability < self.layerdrop):
615
+ continue
616
+
617
+ layer_state = past_key_values[idx] if past_key_values is not None else None
618
+
619
+ x, layer_self_attn, layer_past = decoder_layer(
620
+ x,
621
+ encoder_hidden_states,
622
+ encoder_attn_mask=encoder_padding_mask,
623
+ decoder_padding_mask=decoder_padding_mask,
624
+ layer_state=layer_state,
625
+ causal_mask=decoder_causal_mask,
626
+ output_attentions=output_attentions,
627
+ )
628
+
629
+ if use_cache:
630
+ next_decoder_cache.append(layer_past.copy())
631
+
632
+ if self.layer_norm and (idx == len(self.layers) - 1): # if config.add_final_layer_norm (mBART)
633
+ x = self.layer_norm(x)
634
+ if output_attentions:
635
+ all_self_attns += (layer_self_attn,)
636
+
637
+ # Convert to standard output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim)
638
+ if output_hidden_states:
639
+ all_hidden_states = tuple(hidden_state.transpose(0, 1) for hidden_state in all_hidden_states)
640
+ x = x.transpose(0, 1)
641
+ encoder_hidden_states = encoder_hidden_states.transpose(0, 1)
642
+
643
+ next_cache = next_decoder_cache if use_cache else None
644
+
645
+ if not return_dict:
646
+ return tuple(v for v in [x, next_cache, all_hidden_states, all_self_attns] if v is not None)
647
+ return BaseModelOutputWithPast(
648
+ last_hidden_state=x, past_key_values=next_cache, hidden_states=all_hidden_states, attentions=all_self_attns
649
+ )
650
+
651
+
652
+ def _reorder_buffer(attn_cache, new_order):
653
+ for k, input_buffer_k in attn_cache.items():
654
+ if input_buffer_k is not None:
655
+ attn_cache[k] = input_buffer_k.index_select(0, new_order)
656
+ return attn_cache
657
+
658
+
659
+ class Attention(nn.Module):
660
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
661
+
662
+ def __init__(
663
+ self,
664
+ embed_dim,
665
+ num_heads,
666
+ dropout=0.0,
667
+ bias=True,
668
+ encoder_decoder_attention=False, # otherwise self_attention
669
+ ):
670
+ super().__init__()
671
+ self.embed_dim = embed_dim
672
+ self.num_heads = num_heads
673
+ self.dropout = dropout
674
+ self.head_dim = embed_dim // num_heads
675
+ assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
676
+ self.scaling = self.head_dim ** -0.5
677
+
678
+ self.encoder_decoder_attention = encoder_decoder_attention
679
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
680
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
681
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
682
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
683
+ self.cache_key = "encoder_decoder" if self.encoder_decoder_attention else "self"
684
+
685
+ def _shape(self, tensor, seq_len, bsz):
686
+ return tensor.contiguous().view(seq_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
687
+
688
+ def forward(
689
+ self,
690
+ query,
691
+ key: Optional[Tensor],
692
+ key_padding_mask: Optional[Tensor] = None,
693
+ layer_state: Optional[Dict[str, Optional[Tensor]]] = None,
694
+ attn_mask: Optional[Tensor] = None,
695
+ output_attentions=False,
696
+ ) -> Tuple[Tensor, Optional[Tensor]]:
697
+ """Input shape: Time(SeqLen) x Batch x Channel"""
698
+ static_kv: bool = self.encoder_decoder_attention
699
+ tgt_len, bsz, embed_dim = query.size()
700
+ assert embed_dim == self.embed_dim
701
+ assert list(query.size()) == [tgt_len, bsz, embed_dim]
702
+ # get here for encoder decoder cause of static_kv
703
+ if layer_state is not None: # reuse k,v and encoder_padding_mask
704
+ saved_state = layer_state.get(self.cache_key, {})
705
+ if "prev_key" in saved_state and static_kv:
706
+ # previous time steps are cached - no need to recompute key and value if they are static
707
+ key = None
708
+ else:
709
+ saved_state = None
710
+ layer_state = {}
711
+
712
+ q = self.q_proj(query) * self.scaling
713
+ if static_kv:
714
+ if key is None:
715
+ k = v = None
716
+ else:
717
+ k = self.k_proj(key)
718
+ v = self.v_proj(key)
719
+ else:
720
+ k = self.k_proj(query)
721
+ v = self.v_proj(query)
722
+
723
+ q = self._shape(q, tgt_len, bsz)
724
+ if k is not None:
725
+ k = self._shape(k, -1, bsz)
726
+ if v is not None:
727
+ v = self._shape(v, -1, bsz)
728
+
729
+ if saved_state is not None:
730
+ k, v, key_padding_mask = self._use_saved_state(k, v, saved_state, key_padding_mask, static_kv, bsz)
731
+
732
+ # Update cache
733
+ layer_state[self.cache_key] = {
734
+ "prev_key": k.view(bsz, self.num_heads, -1, self.head_dim),
735
+ "prev_value": v.view(bsz, self.num_heads, -1, self.head_dim),
736
+ "prev_key_padding_mask": key_padding_mask if not static_kv else None,
737
+ }
738
+
739
+ assert k is not None
740
+ src_len = k.size(1)
741
+ attn_weights = torch.bmm(q, k.transpose(1, 2))
742
+ assert attn_weights.size() == (bsz * self.num_heads, tgt_len, src_len)
743
+
744
+ if attn_mask is not None:
745
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_mask
746
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
747
+
748
+ # This is part of a workaround to get around fork/join parallelism not supporting Optional types.
749
+ if key_padding_mask is not None and key_padding_mask.dim() == 0:
750
+ key_padding_mask = None
751
+ assert key_padding_mask is None or key_padding_mask.size()[:2] == (
752
+ bsz,
753
+ src_len,
754
+ )
755
+
756
+ if key_padding_mask is not None: # don't attend to padding symbols
757
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
758
+ reshaped = key_padding_mask.unsqueeze(1).unsqueeze(2)
759
+ attn_weights = attn_weights.masked_fill(reshaped, float("-inf"))
760
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
761
+ attn_weights = F.softmax(attn_weights, dim=-1)
762
+ attn_probs = F.dropout(
763
+ attn_weights,
764
+ p=self.dropout,
765
+ training=self.training,
766
+ )
767
+
768
+ assert v is not None
769
+ attn_output = torch.bmm(attn_probs, v)
770
+ assert attn_output.size() == (bsz * self.num_heads, tgt_len, self.head_dim)
771
+ attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
772
+ attn_output = self.out_proj(attn_output)
773
+ if output_attentions:
774
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
775
+ else:
776
+ attn_weights = None
777
+ return attn_output, attn_weights
778
+
779
+ def _use_saved_state(self, k, v, saved_state, key_padding_mask, static_kv, bsz):
780
+ # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
781
+ if "prev_key" in saved_state:
782
+ _prev_key = saved_state["prev_key"]
783
+ assert _prev_key is not None
784
+ prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
785
+ if static_kv:
786
+ k = prev_key
787
+ else:
788
+ assert k is not None
789
+ k = torch.cat([prev_key, k], dim=1)
790
+ if "prev_value" in saved_state:
791
+ _prev_value = saved_state["prev_value"]
792
+ assert _prev_value is not None
793
+ prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
794
+ if static_kv:
795
+ v = prev_value
796
+ else:
797
+ assert v is not None
798
+ v = torch.cat([prev_value, v], dim=1)
799
+ assert k is not None and v is not None
800
+ prev_key_padding_mask: Optional[Tensor] = saved_state.get("prev_key_padding_mask", None)
801
+ if prev_key_padding_mask is not None:
802
+ if static_kv:
803
+ new_key_padding_mask = prev_key_padding_mask
804
+ else:
805
+ new_key_padding_mask = torch.cat([prev_key_padding_mask, key_padding_mask], dim=1)
806
+ else:
807
+ new_key_padding_mask = key_padding_mask
808
+ return k, v, new_key_padding_mask
809
+
810
+
811
+ class BartClassificationHead(nn.Module):
812
+ """Head for sentence-level classification tasks."""
813
+
814
+ # This can trivially be shared with RobertaClassificationHead
815
+
816
+ def __init__(
817
+ self,
818
+ input_dim,
819
+ inner_dim,
820
+ num_classes,
821
+ pooler_dropout,
822
+ ):
823
+ super().__init__()
824
+ self.dense = nn.Linear(input_dim, inner_dim)
825
+ self.dropout = nn.Dropout(p=pooler_dropout)
826
+ self.out_proj = nn.Linear(inner_dim, num_classes)
827
+
828
+ def forward(self, x):
829
+ x = self.dropout(x)
830
+ x = self.dense(x)
831
+ x = torch.tanh(x)
832
+ x = self.dropout(x)
833
+ x = self.out_proj(x)
834
+ return x
835
+
836
+
837
+ class LearnedPositionalEmbedding(nn.Embedding):
838
+ """
839
+ This module learns positional embeddings up to a fixed maximum size.
840
+ Padding ids are ignored by either offsetting based on padding_idx
841
+ or by setting padding_idx to None and ensuring that the appropriate
842
+ position ids are passed to the forward function.
843
+ """
844
+
845
+ def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, offset):
846
+ # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2
847
+ # and adjust num_embeddings appropriately. Other models dont have this hack
848
+ self.offset = offset
849
+ assert padding_idx is not None
850
+ num_embeddings += offset
851
+ super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx)
852
+
853
+ def forward(self, input_ids, use_cache=False):
854
+ """Input is expected to be of size [bsz x seqlen]."""
855
+ bsz, seq_len = input_ids.shape[:2]
856
+ if use_cache:
857
+ positions = input_ids.data.new(1, 1).fill_(seq_len - 1) # called before slicing
858
+ else:
859
+ # starts at 0, ends at 1-seq_len
860
+ positions = torch.arange(seq_len, dtype=torch.long, device=self.weight.device)
861
+ return super().forward(positions + self.offset)
862
+
863
+
864
+ def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True):
865
+ if torch.cuda.is_available():
866
+ try:
867
+ from apex.normalization import FusedLayerNorm
868
+
869
+ return FusedLayerNorm(normalized_shape, eps, elementwise_affine)
870
+ except ImportError:
871
+ pass
872
+ return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
873
+
874
+
875
+ def fill_with_neg_inf(t):
876
+ """FP16-compatible function that fills a input_ids with -inf."""
877
+ return t.float().fill_(float("-inf")).type_as(t)
878
+
879
+
880
+ # Public API
881
+ def _get_shape(t):
882
+ return getattr(t, "shape", None)
883
+
884
+
885
+ @add_start_docstrings(
886
+ "The bare BART Model outputting raw hidden-states without any specific head on top.",
887
+ BART_START_DOCSTRING,
888
+ )
889
+ class BartModel(PretrainedBartModel):
890
+ def __init__(self, config: BartConfig):
891
+ super().__init__(config)
892
+
893
+ padding_idx, vocab_size = config.pad_token_id, config.vocab_size
894
+ self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
895
+
896
+ self.encoder = BartEncoder(config, self.shared)
897
+ self.decoder = BartDecoder(config, self.shared)
898
+
899
+ self.init_weights()
900
+ for param in self.parameters():
901
+ param.requires_grad = False
902
+ for name, sub_module in self.named_modules():
903
+ if isinstance(sub_module, (Adapter, torch.nn.LayerNorm,
904
+ )):
905
+ for param_name, param in sub_module.named_parameters():
906
+ param.requires_grad = True
907
+
908
+ @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
909
+ @add_code_sample_docstrings(
910
+ tokenizer_class=_TOKENIZER_FOR_DOC,
911
+ checkpoint="facebook/bart-large",
912
+ output_type=BaseModelOutputWithPast,
913
+ config_class=_CONFIG_FOR_DOC,
914
+ )
915
+ def forward(
916
+ self,
917
+ input_ids,
918
+ attention_mask=None,
919
+ decoder_input_ids=None,
920
+ encoder_outputs: Optional[Tuple] = None,
921
+ decoder_attention_mask=None,
922
+ past_key_values=None,
923
+ use_cache=None,
924
+ output_attentions=None,
925
+ output_hidden_states=None,
926
+ return_dict=None,
927
+ **kwargs,
928
+ ):
929
+ if "decoder_past_key_values" in kwargs:
930
+ warnings.warn(
931
+ "The `decoder_past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
932
+ FutureWarning,
933
+ )
934
+ past_key_values = kwargs.pop("decoder_past_key_values")
935
+
936
+ if decoder_input_ids is None:
937
+ use_cache = False
938
+
939
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
940
+ output_hidden_states = (
941
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
942
+ )
943
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
944
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
945
+
946
+ # make masks if user doesn't supply
947
+ if not use_cache:
948
+ decoder_input_ids, decoder_padding_mask, causal_mask = _prepare_bart_decoder_inputs(
949
+ self.config,
950
+ input_ids,
951
+ decoder_input_ids=decoder_input_ids,
952
+ decoder_padding_mask=decoder_attention_mask,
953
+ causal_mask_dtype=self.shared.weight.dtype,
954
+ )
955
+ else:
956
+ decoder_padding_mask, causal_mask = None, None
957
+
958
+ assert decoder_input_ids is not None
959
+
960
+ if encoder_outputs is None:
961
+ encoder_outputs = self.encoder(
962
+ input_ids=input_ids,
963
+ attention_mask=attention_mask,
964
+ output_attentions=output_attentions,
965
+ output_hidden_states=output_hidden_states,
966
+ return_dict=return_dict,
967
+ )
968
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
969
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
970
+ encoder_outputs = BaseModelOutput(
971
+ last_hidden_state=encoder_outputs[0],
972
+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
973
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
974
+ )
975
+
976
+ # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
977
+ decoder_outputs = self.decoder(
978
+ decoder_input_ids,
979
+ encoder_outputs[0],
980
+ attention_mask,
981
+ decoder_padding_mask,
982
+ decoder_causal_mask=causal_mask,
983
+ past_key_values=past_key_values,
984
+ use_cache=use_cache,
985
+ output_attentions=output_attentions,
986
+ output_hidden_states=output_hidden_states,
987
+ return_dict=return_dict,
988
+ )
989
+
990
+ if not return_dict:
991
+ return decoder_outputs + encoder_outputs
992
+
993
+ return Seq2SeqModelOutput(
994
+ last_hidden_state=decoder_outputs.last_hidden_state,
995
+ past_key_values=decoder_outputs.past_key_values,
996
+ decoder_hidden_states=decoder_outputs.hidden_states,
997
+ decoder_attentions=decoder_outputs.attentions,
998
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
999
+ encoder_hidden_states=encoder_outputs.hidden_states,
1000
+ encoder_attentions=encoder_outputs.attentions,
1001
+ )
1002
+
1003
+ def get_input_embeddings(self):
1004
+ return self.shared
1005
+
1006
+ def set_input_embeddings(self, value):
1007
+ self.shared = value
1008
+ self.encoder.embed_tokens = self.shared
1009
+ self.decoder.embed_tokens = self.shared
1010
+
1011
+ def get_output_embeddings(self):
1012
+ return _make_linear_from_emb(self.shared) # make it on the fly
1013
+
1014
+
1015
+ @add_start_docstrings(
1016
+ "The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING
1017
+ )
1018
+ class BartAdapterForConditionalGeneration(PretrainedBartModel):
1019
+ base_model_prefix = "model"
1020
+ authorized_missing_keys = [r"final_logits_bias", r"encoder\.version", r"decoder\.version"]
1021
+
1022
+ def __init__(self, config: BartConfig):
1023
+ super().__init__(config)
1024
+ base_model = BartModel(config)
1025
+ self.model = base_model
1026
+ self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
1027
+
1028
+ def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding:
1029
+ old_num_tokens = self.model.shared.num_embeddings
1030
+ new_embeddings = super().resize_token_embeddings(new_num_tokens)
1031
+ self.model.shared = new_embeddings
1032
+ self._resize_final_logits_bias(new_num_tokens, old_num_tokens)
1033
+ return new_embeddings
1034
+
1035
+ def _resize_final_logits_bias(self, new_num_tokens: int, old_num_tokens: int) -> None:
1036
+ if new_num_tokens <= old_num_tokens:
1037
+ new_bias = self.final_logits_bias[:, :new_num_tokens]
1038
+ else:
1039
+ extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
1040
+ new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
1041
+ self.register_buffer("final_logits_bias", new_bias)
1042
+
1043
+ @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
1044
+ @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
1045
+ @add_end_docstrings(BART_GENERATION_EXAMPLE)
1046
+ def forward(
1047
+ self,
1048
+ input_ids,
1049
+ attention_mask=None,
1050
+ encoder_outputs=None,
1051
+ decoder_input_ids=None,
1052
+ decoder_attention_mask=None,
1053
+ past_key_values=None,
1054
+ labels=None,
1055
+ use_cache=None,
1056
+ output_attentions=None,
1057
+ output_hidden_states=None,
1058
+ return_dict=None,
1059
+ **unused,
1060
+ ):
1061
+ r"""
1062
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
1063
+ Labels for computing the masked language modeling loss.
1064
+ Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring).
1065
+ Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens
1066
+ with labels in ``[0, ..., config.vocab_size]``.
1067
+
1068
+ Returns:
1069
+
1070
+ Conditional generation example::
1071
+
1072
+ # Mask filling only works for bart-large
1073
+ from transformers import BartTokenizer, BartForConditionalGeneration
1074
+ tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
1075
+ TXT = "My friends are <mask> but they eat too many carbs."
1076
+
1077
+ model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
1078
+ input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
1079
+ logits = model(input_ids).logits
1080
+
1081
+ masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
1082
+ probs = logits[0, masked_index].softmax(dim=0)
1083
+ values, predictions = probs.topk(5)
1084
+
1085
+ tokenizer.decode(predictions).split()
1086
+ # ['good', 'great', 'all', 'really', 'very']
1087
+ """
1088
+ if "lm_labels" in unused:
1089
+ warnings.warn(
1090
+ "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
1091
+ FutureWarning,
1092
+ )
1093
+ labels = unused.pop("lm_labels")
1094
+ if "decoder_cached_states" in unused:
1095
+ warnings.warn(
1096
+ "The `decoder_cached_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
1097
+ FutureWarning,
1098
+ )
1099
+ past_key_values = unused.pop("decoder_cached_states")
1100
+ if "decoder_past_key_values" in unused:
1101
+ warnings.warn(
1102
+ "The `decoder_past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
1103
+ FutureWarning,
1104
+ )
1105
+ past_key_values = unused.pop("decoder_past_key_values")
1106
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1107
+
1108
+ if labels is not None:
1109
+ use_cache = False
1110
+ if decoder_input_ids is None:
1111
+ decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id)
1112
+
1113
+ outputs = self.model(
1114
+ input_ids,
1115
+ attention_mask=attention_mask,
1116
+ decoder_input_ids=decoder_input_ids,
1117
+ encoder_outputs=encoder_outputs,
1118
+ decoder_attention_mask=decoder_attention_mask,
1119
+ past_key_values=past_key_values,
1120
+ use_cache=use_cache,
1121
+ output_attentions=output_attentions,
1122
+ output_hidden_states=output_hidden_states,
1123
+ return_dict=return_dict,
1124
+ )
1125
+ lm_logits = F.linear(outputs[0], self.model.shared.weight, bias=self.final_logits_bias)
1126
+
1127
+ masked_lm_loss = None
1128
+ if labels is not None:
1129
+ loss_fct = CrossEntropyLoss()
1130
+ # TODO(SS): do we need to ignore pad tokens in labels?
1131
+ masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
1132
+
1133
+ if not return_dict:
1134
+ output = (lm_logits,) + outputs[1:]
1135
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
1136
+
1137
+ return Seq2SeqLMOutput(
1138
+ loss=masked_lm_loss,
1139
+ logits=lm_logits,
1140
+ past_key_values=outputs.past_key_values,
1141
+ decoder_hidden_states=outputs.decoder_hidden_states,
1142
+ decoder_attentions=outputs.decoder_attentions,
1143
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
1144
+ encoder_hidden_states=outputs.encoder_hidden_states,
1145
+ encoder_attentions=outputs.encoder_attentions,
1146
+ )
1147
+
1148
+ def prepare_inputs_for_generation(
1149
+ self, decoder_input_ids, past, attention_mask, use_cache, encoder_outputs, **kwargs
1150
+ ):
1151
+ return {
1152
+ "input_ids": None, # encoder_outputs is defined. input_ids not needed
1153
+ "encoder_outputs": encoder_outputs,
1154
+ "past_key_values": past,
1155
+ "decoder_input_ids": decoder_input_ids,
1156
+ "attention_mask": attention_mask,
1157
+ "use_cache": use_cache, # change this to avoid caching (presumably for debugging)
1158
+ }
1159
+
1160
+ def adjust_logits_during_generation(self, logits, cur_len, max_length):
1161
+ if cur_len == 1 and self.config.force_bos_token_to_be_generated:
1162
+ self._force_token_ids_generation(logits, self.config.bos_token_id)
1163
+ elif cur_len == max_length - 1 and self.config.eos_token_id is not None:
1164
+ self._force_token_ids_generation(logits, self.config.eos_token_id)
1165
+ return logits
1166
+
1167
+ def _force_token_ids_generation(self, scores, token_id) -> None:
1168
+ """force one of token_ids to be generated by setting prob of all other tokens to 0 (logprob=-float("inf"))"""
1169
+ scores[:, [x for x in range(self.config.vocab_size) if x != token_id]] = -float("inf")
1170
+
1171
+ @staticmethod
1172
+ def _reorder_cache(past, beam_idx):
1173
+ reordered_past = []
1174
+ for layer_past in past:
1175
+ # get the correct batch idx from decoder layer's batch dim for cross and self-attn
1176
+ layer_past_new = {
1177
+ attn_key: _reorder_buffer(attn_cache, beam_idx) for attn_key, attn_cache in layer_past.items()
1178
+ }
1179
+ reordered_past.append(layer_past_new)
1180
+ return reordered_past
1181
+
1182
+ def get_encoder(self):
1183
+ return self.model.encoder
1184
+
1185
+ def get_output_embeddings(self):
1186
+ return _make_linear_from_emb(self.model.shared) # make it on the fly
1187
+
1188
+
1189
+ @add_start_docstrings(
1190
+ """Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """,
1191
+ BART_START_DOCSTRING,
1192
+ )
1193
+ class BartForSequenceClassification(PretrainedBartModel):
1194
+ def __init__(self, config: BartConfig, **kwargs):
1195
+ super().__init__(config, **kwargs)
1196
+ self.model = BartModel(config)
1197
+ self.classification_head = BartClassificationHead(
1198
+ config.d_model,
1199
+ config.d_model,
1200
+ config.num_labels,
1201
+ config.classif_dropout,
1202
+ )
1203
+ self.model._init_weights(self.classification_head.dense)
1204
+ self.model._init_weights(self.classification_head.out_proj)
1205
+
1206
+ @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
1207
+ @add_code_sample_docstrings(
1208
+ tokenizer_class=_TOKENIZER_FOR_DOC,
1209
+ checkpoint="facebook/bart-large",
1210
+ output_type=Seq2SeqSequenceClassifierOutput,
1211
+ config_class=_CONFIG_FOR_DOC,
1212
+ )
1213
+ def forward(
1214
+ self,
1215
+ input_ids,
1216
+ attention_mask=None,
1217
+ encoder_outputs=None,
1218
+ decoder_input_ids=None,
1219
+ decoder_attention_mask=None,
1220
+ labels=None,
1221
+ use_cache=None,
1222
+ output_attentions=None,
1223
+ output_hidden_states=None,
1224
+ return_dict=None,
1225
+ ):
1226
+ r"""
1227
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
1228
+ Labels for computing the sequence classification/regression loss.
1229
+ Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
1230
+ If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1231
+ """
1232
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1233
+ if labels is not None:
1234
+ use_cache = False
1235
+
1236
+ outputs = self.model(
1237
+ input_ids,
1238
+ attention_mask=attention_mask,
1239
+ decoder_input_ids=decoder_input_ids,
1240
+ decoder_attention_mask=decoder_attention_mask,
1241
+ encoder_outputs=encoder_outputs,
1242
+ use_cache=use_cache,
1243
+ output_attentions=output_attentions,
1244
+ output_hidden_states=output_hidden_states,
1245
+ return_dict=return_dict,
1246
+ )
1247
+ x = outputs[0] # last hidden state
1248
+ eos_mask = input_ids.eq(self.config.eos_token_id)
1249
+ if len(torch.unique(eos_mask.sum(1))) > 1:
1250
+ raise ValueError("All examples must have the same number of <eos> tokens.")
1251
+ sentence_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :]
1252
+ logits = self.classification_head(sentence_representation)
1253
+
1254
+ loss = None
1255
+ if labels is not None:
1256
+ loss_fct = CrossEntropyLoss()
1257
+ loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
1258
+
1259
+ if not return_dict:
1260
+ output = (logits,) + outputs[1:]
1261
+ return ((loss,) + output) if loss is not None else output
1262
+
1263
+ return Seq2SeqSequenceClassifierOutput(
1264
+ loss=loss,
1265
+ logits=logits,
1266
+ past_key_values=outputs.past_key_values,
1267
+ decoder_hidden_states=outputs.decoder_hidden_states,
1268
+ decoder_attentions=outputs.decoder_attentions,
1269
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
1270
+ encoder_hidden_states=outputs.encoder_hidden_states,
1271
+ encoder_attentions=outputs.encoder_attentions,
1272
+ )
1273
+
1274
+
1275
+ @add_start_docstrings(
1276
+ """BART Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of
1277
+ the hidden-states output to compute `span start logits` and `span end logits`). """,
1278
+ BART_START_DOCSTRING,
1279
+ )
1280
+ class BartForQuestionAnswering(PretrainedBartModel):
1281
+ def __init__(self, config):
1282
+ super().__init__(config)
1283
+
1284
+ config.num_labels = 2
1285
+ self.num_labels = config.num_labels
1286
+
1287
+ self.model = BartModel(config)
1288
+ self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
1289
+
1290
+ self.model._init_weights(self.qa_outputs)
1291
+
1292
+ @add_start_docstrings_to_callable(BART_INPUTS_DOCSTRING)
1293
+ @add_code_sample_docstrings(
1294
+ tokenizer_class=_TOKENIZER_FOR_DOC,
1295
+ checkpoint="facebook/bart-large",
1296
+ output_type=Seq2SeqQuestionAnsweringModelOutput,
1297
+ config_class=_CONFIG_FOR_DOC,
1298
+ )
1299
+ def forward(
1300
+ self,
1301
+ input_ids,
1302
+ attention_mask=None,
1303
+ encoder_outputs=None,
1304
+ decoder_input_ids=None,
1305
+ decoder_attention_mask=None,
1306
+ start_positions=None,
1307
+ end_positions=None,
1308
+ use_cache=None,
1309
+ output_attentions=None,
1310
+ output_hidden_states=None,
1311
+ return_dict=None,
1312
+ ):
1313
+ r"""
1314
+ start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
1315
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
1316
+ Positions are clamped to the length of the sequence (`sequence_length`).
1317
+ Positions outside of the sequence are not taken into account when computing the loss.
1318
+ end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
1319
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
1320
+ Positions are clamped to the length of the sequence (`sequence_length`).
1321
+ Positions outside of the sequence are not taken into account when computing the loss.
1322
+ """
1323
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1324
+ if start_positions is not None and end_positions is not None:
1325
+ use_cache = False
1326
+
1327
+ outputs = self.model(
1328
+ input_ids,
1329
+ attention_mask=attention_mask,
1330
+ decoder_input_ids=decoder_input_ids,
1331
+ decoder_attention_mask=decoder_attention_mask,
1332
+ encoder_outputs=encoder_outputs,
1333
+ use_cache=use_cache,
1334
+ output_attentions=output_attentions,
1335
+ output_hidden_states=output_hidden_states,
1336
+ return_dict=return_dict,
1337
+ )
1338
+
1339
+ sequence_output = outputs[0]
1340
+
1341
+ logits = self.qa_outputs(sequence_output)
1342
+ start_logits, end_logits = logits.split(1, dim=-1)
1343
+ start_logits = start_logits.squeeze(-1)
1344
+ end_logits = end_logits.squeeze(-1)
1345
+
1346
+ total_loss = None
1347
+ if start_positions is not None and end_positions is not None:
1348
+ # If we are on multi-GPU, split add a dimension
1349
+ if len(start_positions.size()) > 1:
1350
+ start_positions = start_positions.squeeze(-1)
1351
+ if len(end_positions.size()) > 1:
1352
+ end_positions = end_positions.squeeze(-1)
1353
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
1354
+ ignored_index = start_logits.size(1)
1355
+ start_positions.clamp_(0, ignored_index)
1356
+ end_positions.clamp_(0, ignored_index)
1357
+
1358
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
1359
+ start_loss = loss_fct(start_logits, start_positions)
1360
+ end_loss = loss_fct(end_logits, end_positions)
1361
+ total_loss = (start_loss + end_loss) / 2
1362
+
1363
+ if not return_dict:
1364
+ output = (
1365
+ start_logits,
1366
+ end_logits,
1367
+ ) + outputs[1:]
1368
+ return ((total_loss,) + output) if total_loss is not None else output
1369
+
1370
+ return Seq2SeqQuestionAnsweringModelOutput(
1371
+ loss=total_loss,
1372
+ start_logits=start_logits,
1373
+ end_logits=end_logits,
1374
+ past_key_values=outputs.past_key_values,
1375
+ decoder_hidden_states=outputs.decoder_hidden_states,
1376
+ decoder_attentions=outputs.decoder_attentions,
1377
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
1378
+ encoder_hidden_states=outputs.encoder_hidden_states,
1379
+ encoder_attentions=outputs.encoder_attentions,
1380
+ )
1381
+
1382
+
1383
+ class SinusoidalPositionalEmbedding(nn.Embedding):
1384
+ """This module produces sinusoidal positional embeddings of any length."""
1385
+
1386
+ def __init__(self, num_positions, embedding_dim, padding_idx=None):
1387
+ super().__init__(num_positions, embedding_dim)
1388
+ if embedding_dim % 2 != 0:
1389
+ raise NotImplementedError(f"odd embedding_dim {embedding_dim} not supported")
1390
+ self.weight = self._init_weight(self.weight)
1391
+
1392
+ @staticmethod
1393
+ def _init_weight(out: nn.Parameter):
1394
+ """Identical to the XLM create_sinusoidal_embeddings except features are not interleaved.
1395
+ The cos features are in the 2nd half of the vector. [dim // 2:]
1396
+ """
1397
+ n_pos, dim = out.shape
1398
+ position_enc = np.array(
1399
+ [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
1400
+ )
1401
+ out[:, 0 : dim // 2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) # This line breaks for odd n_pos
1402
+ out[:, dim // 2 :] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
1403
+ out.detach_()
1404
+ out.requires_grad = False
1405
+ return out
1406
+
1407
+ @torch.no_grad()
1408
+ def forward(self, input_ids, use_cache=False):
1409
+ """Input is expected to be of size [bsz x seqlen]."""
1410
+ bsz, seq_len = input_ids.shape[:2]
1411
+ if use_cache:
1412
+ positions = input_ids.data.new(1, 1).fill_(seq_len - 1) # called before slicing
1413
+ else:
1414
+ # positions start at 0 and end at seq_len - 1
1415
+ positions = torch.arange(seq_len, dtype=torch.long, device=self.weight.device)
1416
+ return super().forward(positions)
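One detail of this modified modeling_bart.py worth calling out: BartModel.__init__ freezes every parameter and then re-enables gradients only for Adapter and LayerNorm sub-modules, the usual adapter-tuning recipe. Below is a minimal, self-contained sketch of that freezing pattern; the Adapter class here is an assumed stand-in for the one defined elsewhere in this file, not the repo's actual implementation.

import torch
from torch import nn

class Adapter(nn.Module):
    """Assumed bottleneck adapter; the real one lives elsewhere in modeling_bart.py."""
    def __init__(self, dim=768, bottleneck=64):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)
        self.up = nn.Linear(bottleneck, dim)

    def forward(self, x):
        return x + self.up(torch.relu(self.down(x)))

def freeze_all_but_adapters(model: nn.Module) -> nn.Module:
    # Same pattern as BartModel.__init__: freeze everything, then unfreeze
    # adapter and LayerNorm parameters so only they are trained.
    for p in model.parameters():
        p.requires_grad = False
    for module in model.modules():
        if isinstance(module, (Adapter, nn.LayerNorm)):
            for p in module.parameters():
                p.requires_grad = True
    return model

if __name__ == "__main__":
    toy = nn.Sequential(nn.Linear(768, 768), Adapter(), nn.LayerNorm(768))
    freeze_all_but_adapters(toy)
    trainable = sum(p.numel() for p in toy.parameters() if p.requires_grad)
    total = sum(p.numel() for p in toy.parameters())
    print(f"trainable / total parameters: {trainable} / {total}")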
datasets/LogiQA2.0/logiqa/multi-choice-prompt.py ADDED
@@ -0,0 +1,56 @@
1
+ import json
2
+ import time
3
+ import openai
4
+ import sklearn
5
+ from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
6
+ openai.api_key = ''
7
+
8
+ incontext = "Input\nWrite a multi-choice question for the following article:\nArticle: David knows Mr. Zhang's friend Jack, and Jack knows David's friend Ms. Lin. Everyone of them who knows Jack has a master's degree, and everyone of them who knows Ms. Lin is from Shanghai.\nQuestion: \nWho is from Shanghai and has a master's degree?\nOptions:\nA David\nB Jack\nC Mr Zhang\nD Ms. Lin\nAnswer:\nA\nInput\nWrite a multi-choice question for the following article:\nArticle: Jimmy asked Hank to go to the mall the next day. Hank said, If it doesn't rain tomorrow, I'll go climbing. The next day, there was a drizzle. Jimmy thought that Hank would not go climbing, so he went to pick up Henry to the mall. Nevertheless, Hank went climbing the mountain. When the two met again, Jimmy blamed Hank for not keeping his word.\nQuestion: \nWhich of the following comments is appropriate?\nOptions:\nA This argument between Jimmy and Hank is meaningless\nB Jimmy's reasoning is illogical\nC Two people have different understandings of a drizzle\nD Hank broke his promise and caused the debate\nAnswer:\nB\nInput\nWrite a multi-choice question for the following article:\nArticle: Only if the government reinforce basic education can we improve our nation's education to a new stage. In order to stand out among other nations, we need to have a strong educational enterprise.\nQuestion: \nWhich can be inferred from the statement above?\nOptions:\nA The whole society should be focused on education\nB In order to stand out among nations, we should reinforce basic education\nC In order to improve our education to a new stage, it is necessary to increase the salary of college teachers\nD In order to reinforce basic education, all primary school teachers must have a bachelor degree or above.\nAnswer:\nB\nInput\nWrite a multi-choice question for the following article:\nArticle: Last night, Mark either went to play in the gym or visited his teacher Tony. If Mark drove last night, he didn't go to play in the gym. Mark would go visit his teacher Tony only if he and his teacher had an appointment. In fact, Mark had no appointment with his teacher Tony in advance.\nQuestion: \nWhich is true based on the above statement?\nOptions:\nA Mark went to the gym with his teacher Tony last night\nB Mark visited his teacher Tony last night\nC Mark didn't drive last night\nD Mark didn't go to the gym last night.\nAnswer:\nC\nInput\nWrite a multi-choice question for the following article:\nArticle: The coach of a national football team found that the best cooperative arrangement of the players U, V, W, X, Y, and Z during the training are: (1) V and X cannot be on the field at the same time, and neither can be off the field the same time. (2) V is not on the field only if U is not on the field. (3) If W is on the field, then X is on the field. (4) If Y and Z are on the field, then W must be on the field. This arrangement can yield the best performance.\nQuestion: \nIf U and Z are both on the field, for best performance, which of the following arrangement is appropriate?\nOptions:\nA X is on the eld and Y is not on the field\nB V is on the eld and Y is not on the field\nC V and W are both on the field\nD V and Y are not on the field\nAnswer:\nB\n"
9
+ label_map = {0: "A", 1: "B", 2: "C", 3: "D"}
10
+
11
+ def gpt3_api(prompt):
12
+ response = openai.Completion.create(
13
+ model="text-davinci-002",
14
+ prompt=incontext + prompt,
15
+ temperature=0,
16
+ max_tokens=60,
17
+ top_p=1.0,
18
+ frequency_penalty=0.0,
19
+ presence_penalty=0.0
20
+ )
21
+ return response
22
+
23
+ with open('test.json') as f:
24
+ y_true = []
25
+ y_pred = []
26
+ lines = f.readlines()
27
+ for i, line in enumerate(lines):
28
+ line_dict = json.loads(line)
29
+ article = line_dict['text']
30
+ answer = line_dict['answer']
31
+ label = label_map[answer]
32
+ y_true.append(label)
33
+ question = line_dict['question']
34
+ options_old = line_dict['options']
35
+ options = ""
36
+ for j, option in enumerate(options_old):
37
+ options += label_map[j] + " " + option + "\n"
38
+ prompt_input = "Write a multi-choice question for the following article:\nArticle: " + article + "\nQuestion: " + question + "\nOptions: " + options + "\nAnswer: \n"
39
+ prompt = prompt_input
40
+ output = gpt3_api(prompt)
41
+ time.sleep(5)
42
+ pred = output.choices[0].text
43
+ y_pred.append(pred)
44
+
45
+ print(y_true)
46
+ print(y_pred)
47
+
48
+ f_score = f1_score(y_true, y_pred, average='macro')  # 'binary' is invalid for the four A/B/C/D classes
49
+ p_score = precision_score(y_true, y_pred, average='macro')
50
+ r_score = recall_score(y_true, y_pred, average='macro')
51
+ acc = accuracy_score(y_true, y_pred)
52
+
53
+ print(f_score)
54
+ print(p_score)
55
+ print(r_score)
56
+ print(acc)
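Because the GPT-3 completion is free text, output.choices[0].text may carry leading whitespace or echo part of the option text, while y_true holds bare letters A-D. The sketch below (illustrative names and toy data, not from the repo) shows one way to reduce a completion to a single option letter before scoring with macro-averaged metrics.

from sklearn.metrics import accuracy_score, f1_score

def extract_option_letter(completion, valid=("A", "B", "C", "D")):
    """Return the first A/B/C/D token found in a completion, else ''."""
    for token in completion.strip().split():
        letter = token.strip(".:)(").upper()
        if letter in valid:
            return letter
    return ""

# Toy completions standing in for output.choices[0].text (assumed examples).
raw_predictions = ["\nA", " B Jimmy's reasoning is illogical", "C", "Answer: D"]
gold = ["A", "B", "C", "A"]

predictions = [extract_option_letter(p) for p in raw_predictions]
print("accuracy:", accuracy_score(gold, predictions))
print("macro F1:", f1_score(gold, predictions, average="macro"))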
datasets/LogiQA2.0/logiqa/run_mrc.py ADDED
@@ -0,0 +1,552 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ This Script is Modified for Multi-Choice Reading Comprehension Fine-tuning.
18
+ All the datasets can be downloaded from this repo.
19
+ """
20
+
21
+ from __future__ import absolute_import, division, print_function
22
+
23
+ import argparse
24
+ import glob
25
+ import logging
26
+ import os
27
+ import random
28
+
29
+ import numpy as np
30
+ import torch
31
+ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
32
+ TensorDataset)
33
+ from torch.utils.data.distributed import DistributedSampler
34
+
35
+ try:
36
+ from torch.utils.tensorboard import SummaryWriter
37
+ except ImportError:
38
+ from tensorboardX import SummaryWriter
39
+
40
+ from tqdm import tqdm, trange
41
+
42
+ from transformers import (WEIGHTS_NAME, BertConfig,
43
+ BertForMultipleChoice, BertTokenizer,
44
+ RobertaConfig,
45
+ RobertaForMultipleChoice,
46
+ RobertaTokenizer,
47
+ XLNetConfig,
48
+ XLNetForMultipleChoice,
49
+ XLNetTokenizer,
50
+ )
51
+
52
+ from transformers import AdamW, get_linear_schedule_with_warmup
53
+
54
+ from utils_mrc import compute_metrics
55
+ from utils_mrc import output_modes
56
+ from utils_mrc import processors
57
+ from utils_mrc import convert_examples_to_features
58
+
59
+
60
+ logger = logging.getLogger(__name__)
61
+
62
+ ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig,
63
+ RobertaConfig,)), ())
64
+
65
+ MODEL_CLASSES = {
66
+ 'bert': (BertConfig, BertForMultipleChoice, BertTokenizer),
67
+ 'xlnet': (XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer),
68
+ 'roberta': (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer),
69
+ }
70
+
71
+ def select_field(features, field):
72
+ return [[choice[field] for choice in feature.choices_features] for feature in features]
73
+
74
+ def set_seed(args):
75
+ random.seed(args.seed)
76
+ np.random.seed(args.seed)
77
+ torch.manual_seed(args.seed)
78
+ if args.n_gpu > 0:
79
+ torch.cuda.manual_seed_all(args.seed)
80
+
81
+
82
+ def train(args, train_dataset, model, tokenizer):
83
+ """ Train the model """
84
+ if args.local_rank in [-1, 0]:
85
+ tb_writer = SummaryWriter()
86
+
87
+ args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
88
+ train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
89
+ train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
90
+
91
+ if args.max_steps > 0:
92
+ t_total = args.max_steps
93
+ args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
94
+ else:
95
+ t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
96
+
97
+ # Prepare optimizer and schedule (linear warmup and decay)
98
+ no_decay = ['bias', 'LayerNorm.weight']
99
+ optimizer_grouped_parameters = [
100
+ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
101
+ 'weight_decay': args.weight_decay},
102
+ {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
103
+ ]
104
+
105
+ optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
106
+ scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
107
+ num_training_steps=t_total)
108
+ if args.fp16:
109
+ try:
110
+ from apex import amp
111
+ except ImportError:
112
+ raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
113
+ model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
114
+
115
+ # multi-gpu training (should be after apex fp16 initialization)
116
+ if args.n_gpu > 1:
117
+ model = torch.nn.DataParallel(model)
118
+
119
+ # Distributed training (should be after apex fp16 initialization)
120
+ if args.local_rank != -1:
121
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
122
+ output_device=args.local_rank,
123
+ find_unused_parameters=True)
124
+
125
+ # Train!
126
+ logger.info("***** Running training *****")
127
+ logger.info(" Num examples = %d", len(train_dataset))
128
+ logger.info(" Num Epochs = %d", args.num_train_epochs)
129
+ logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
130
+ logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
131
+ args.train_batch_size * args.gradient_accumulation_steps * (
132
+ torch.distributed.get_world_size() if args.local_rank != -1 else 1))
133
+ logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
134
+ logger.info(" Total optimization steps = %d", t_total)
135
+
136
+ global_step = 0
137
+ tr_loss, logging_loss = 0.0, 0.0
138
+ model.zero_grad()
139
+ train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
140
+ set_seed(args) # Added here for reproducibility (even between python 2 and 3)
141
+ for _ in train_iterator:
142
+ epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
143
+ for step, batch in enumerate(epoch_iterator):
144
+ model.train()
145
+ batch = tuple(t.to(args.device) for t in batch)
146
+ inputs = {'input_ids': batch[0],
147
+ 'attention_mask': batch[1],
148
+ 'labels': batch[3]}
149
+ if args.model_type != 'distilbert':
150
+ inputs['token_type_ids'] = batch[2] if args.model_type in ['bert',
151
+ 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids
152
+ outputs = model(**inputs)
153
+ loss = outputs[0] # model outputs are always tuple in transformers (see doc)
154
+
155
+ if args.n_gpu > 1:
156
+ loss = loss.mean() # mean() to average on multi-gpu parallel training
157
+ if args.gradient_accumulation_steps > 1:
158
+ loss = loss / args.gradient_accumulation_steps
159
+
160
+ if args.fp16:
161
+ with amp.scale_loss(loss, optimizer) as scaled_loss:
162
+ scaled_loss.backward()
163
+ else:
164
+ loss.backward()
165
+
166
+ tr_loss += loss.item()
167
+ if (step + 1) % args.gradient_accumulation_steps == 0:
168
+ if args.fp16:
169
+ torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
170
+ else:
171
+ torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
172
+
173
+ optimizer.step()
174
+ scheduler.step() # Update learning rate schedule
175
+ model.zero_grad()
176
+ global_step += 1
177
+
178
+ if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
179
+ # Log metrics
180
+ if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
181
+ results = evaluate(args, model, tokenizer)
182
+ for key, value in results.items():
183
+ tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
184
+ tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
185
+ tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
186
+ logging_loss = tr_loss
187
+
188
+ if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
189
+ # Save model checkpoint
190
+ output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
191
+ if not os.path.exists(output_dir):
192
+ os.makedirs(output_dir)
193
+ model_to_save = model.module if hasattr(model,
194
+ 'module') else model # Take care of distributed/parallel training
195
+ model_to_save.save_pretrained(output_dir)
196
+ torch.save(args, os.path.join(output_dir, 'training_args.bin'))
197
+ logger.info("Saving model checkpoint to %s", output_dir)
198
+
199
+ if args.max_steps > 0 and global_step > args.max_steps:
200
+ epoch_iterator.close()
201
+ break
202
+ if args.max_steps > 0 and global_step > args.max_steps:
203
+ train_iterator.close()
204
+ break
205
+
206
+ if args.local_rank in [-1, 0]:
207
+ tb_writer.close()
208
+
209
+ return global_step, tr_loss / global_step
210
+
211
+
212
+ def evaluate(args, model, tokenizer, prefix=""):
213
+ # Loop to handle MNLI double evaluation (matched, mis-matched)
214
+ eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
215
+ eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
216
+
217
+ results = {}
218
+ for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
219
+ eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
220
+
221
+ if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
222
+ os.makedirs(eval_output_dir)
223
+
224
+ args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
225
+ # Note that DistributedSampler samples randomly
226
+ eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
227
+ eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
228
+
229
+ # multi-gpu eval
230
+ if args.n_gpu > 1:
231
+ model = torch.nn.DataParallel(model)
232
+
233
+ # Eval!
234
+ logger.info("***** Running evaluation {} *****".format(prefix))
235
+ logger.info(" Num examples = %d", len(eval_dataset))
236
+ logger.info(" Batch size = %d", args.eval_batch_size)
237
+ eval_loss = 0.0
238
+ nb_eval_steps = 0
239
+ preds = None
240
+ out_label_ids = None
241
+ for batch in tqdm(eval_dataloader, desc="Evaluating"):
242
+ model.eval()
243
+ batch = tuple(t.to(args.device) for t in batch)
244
+
245
+ with torch.no_grad():
246
+ inputs = {'input_ids': batch[0],
247
+ 'attention_mask': batch[1],
248
+ 'labels': batch[3]}
249
+ if args.model_type != 'distilbert':
250
+ inputs['token_type_ids'] = batch[2] if args.model_type in ['bert',
251
+ 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids
252
+ outputs = model(**inputs)
253
+ tmp_eval_loss, logits = outputs[:2]
254
+
255
+ eval_loss += tmp_eval_loss.mean().item()
256
+ nb_eval_steps += 1
257
+ if preds is None:
258
+ preds = logits.detach().cpu().numpy()
259
+ out_label_ids = inputs['labels'].detach().cpu().numpy()
260
+ else:
261
+ preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
262
+ out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
263
+
264
+ eval_loss = eval_loss / nb_eval_steps
265
+ if args.output_mode == "classification":
266
+ preds = np.argmax(preds, axis=1)
267
+ elif args.output_mode == "regression":
268
+ preds = np.squeeze(preds)
269
+ result = {"eval": compute_metrics(eval_task, preds, out_label_ids), "loss": eval_loss}
270
+ results.update(result)
271
+
272
+ output_pred_file = os.path.join(eval_output_dir, prefix, "pred_results.txt")
273
+ with open(output_pred_file, "a") as writer:
274
+ for pred in preds:
275
+ writer.write(str(pred)+"\n")
276
+
277
+ output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
278
+ with open(output_eval_file, "w") as writer:
279
+ logger.info("***** Eval results {} *****".format(prefix))
280
+ for key in sorted(result.keys()):
281
+ logger.info(" %s = %s", key, str(result[key]))
282
+ writer.write("%s = %s\n" % (key, str(result[key])))
283
+
284
+ return results
285
+
286
+
287
+ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
288
+ if args.local_rank not in [-1, 0]:
289
+ torch.distributed.barrier() # Make sure only the first process in distributed training processes the dataset; the others will use the cache
290
+
291
+ processor = processors[task]()
292
+ # Load data features from cache or dataset file
293
+ if evaluate:
294
+ cached_mode = "dev"
295
+ elif test:
296
+ cached_mode = "test"
297
+ else:
298
+ cached_mode = "train"
299
+ assert not (evaluate and test)
300
+ cached_features_file = os.path.join(
301
+ args.data_dir,
302
+ "cached_{}_{}_{}_{}".format(
303
+ cached_mode,
304
+ list(filter(None, args.model_name_or_path.split("/"))).pop(),
305
+ str(args.max_seq_length),
306
+ str(task),
307
+ ),
308
+ )
309
+ if os.path.exists(cached_features_file) and not args.overwrite_cache:
310
+ logger.info("Loading features from cached file %s", cached_features_file)
311
+ features = torch.load(cached_features_file)
312
+ else:
313
+ logger.info("Creating features from dataset file at %s", args.data_dir)
314
+ label_list = processor.get_labels()
315
+ if evaluate:
316
+ examples = processor.get_dev_examples(args.data_dir)
317
+ elif test:
318
+ examples = processor.get_test_examples(args.data_dir)
319
+ else:
320
+ examples = processor.get_train_examples(args.data_dir)
321
+ logger.info("Training number: %s", str(len(examples)))
322
+ features = convert_examples_to_features(
323
+ examples,
324
+ label_list,
325
+ args.max_seq_length,
326
+ tokenizer,
327
+ pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet
328
+ pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
329
+ )
330
+ if args.local_rank in [-1, 0]:
331
+ logger.info("Saving features into cached file %s", cached_features_file)
332
+ torch.save(features, cached_features_file)
333
+
334
+ if args.local_rank == 0:
335
+ torch.distributed.barrier() # Make sure only the first process in distributed training processes the dataset; the others will use the cache
336
+
337
+ # Convert to Tensors and build dataset
338
+ all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
339
+ all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long)
340
+ all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long)
341
+ all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long)
342
+
343
+ dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
344
+ return dataset
345
+
346
+
347
+
348
+ def main():
349
+ parser = argparse.ArgumentParser()
350
+
351
+ ## Required parameters
352
+ parser.add_argument("--data_dir", default=None, type=str, required=True,
353
+ help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
354
+ parser.add_argument("--model_type", default=None, type=str, required=True,
355
+ help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
356
+ parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
357
+ help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(
358
+ ALL_MODELS))
359
+ parser.add_argument("--task_name", default=None, type=str, required=True,
360
+ help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
361
+ parser.add_argument("--output_dir", default=None, type=str, required=True,
362
+ help="The output directory where the model predictions and checkpoints will be written.")
363
+
364
+ ## Other parameters
365
+ parser.add_argument("--config_name", default="", type=str,
366
+ help="Pretrained config name or path if not the same as model_name")
367
+ parser.add_argument("--tokenizer_name", default="", type=str,
368
+ help="Pretrained tokenizer name or path if not the same as model_name")
369
+ parser.add_argument("--cache_dir", default="", type=str,
370
+ help="Where do you want to store the pre-trained models downloaded from s3")
371
+ parser.add_argument("--max_seq_length", default=128, type=int,
372
+ help="The maximum total input sequence length after tokenization. Sequences longer "
373
+ "than this will be truncated, sequences shorter will be padded.")
374
+ parser.add_argument("--do_train", action='store_true',
375
+ help="Whether to run training.")
376
+ parser.add_argument("--do_eval", action='store_true',
377
+ help="Whether to run eval on the dev set.")
378
+ parser.add_argument("--evaluate_during_training", action='store_true',
379
+ help="Rul evaluation during training at each logging step.")
380
+ parser.add_argument("--do_lower_case", action='store_true',
381
+ help="Set this flag if you are using an uncased model.")
382
+
383
+ parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
384
+ help="Batch size per GPU/CPU for training.")
385
+ parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
386
+ help="Batch size per GPU/CPU for evaluation.")
387
+ parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
388
+ help="Number of updates steps to accumulate before performing a backward/update pass.")
389
+ parser.add_argument("--learning_rate", default=5e-5, type=float,
390
+ help="The initial learning rate for Adam.")
391
+ parser.add_argument("--weight_decay", default=0.0, type=float,
392
+ help="Weight deay if we apply some.")
393
+ parser.add_argument("--adam_epsilon", default=1e-8, type=float,
394
+ help="Epsilon for Adam optimizer.")
395
+ parser.add_argument("--max_grad_norm", default=1.0, type=float,
396
+ help="Max gradient norm.")
397
+ parser.add_argument("--num_train_epochs", default=3.0, type=float,
398
+ help="Total number of training epochs to perform.")
399
+ parser.add_argument("--max_steps", default=-1, type=int,
400
+ help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
401
+ parser.add_argument("--warmup_steps", default=0, type=int,
402
+ help="Linear warmup over warmup_steps.")
403
+
404
+ parser.add_argument('--logging_steps', type=int, default=50,
405
+ help="Log every X updates steps.")
406
+ parser.add_argument('--save_steps', type=int, default=50,
407
+ help="Save checkpoint every X updates steps.")
408
+ parser.add_argument("--eval_all_checkpoints", action='store_true',
409
+ help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
410
+ parser.add_argument("--no_cuda", action='store_true',
411
+ help="Avoid using CUDA when available")
412
+ parser.add_argument('--overwrite_output_dir', action='store_true',
413
+ help="Overwrite the content of the output directory")
414
+ parser.add_argument('--overwrite_cache', action='store_true',
415
+ help="Overwrite the cached training and evaluation sets")
416
+ parser.add_argument('--seed', type=int, default=42,
417
+ help="random seed for initialization")
418
+
419
+ parser.add_argument('--fp16', action='store_true',
420
+ help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
421
+ parser.add_argument('--fp16_opt_level', type=str, default='O1',
422
+ help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
423
+ "See details at https://nvidia.github.io/apex/amp.html")
424
+ parser.add_argument("--local_rank", type=int, default=-1,
425
+ help="For distributed training: local_rank")
426
+ parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
427
+ parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
428
+ args = parser.parse_args()
429
+
430
+ if os.path.exists(args.output_dir) and os.listdir(
431
+ args.output_dir) and args.do_train and not args.overwrite_output_dir:
432
+ raise ValueError(
433
+ "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
434
+ args.output_dir))
435
+
436
+ # Setup distant debugging if needed
437
+ if args.server_ip and args.server_port:
438
+ # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
439
+ import ptvsd
440
+ print("Waiting for debugger attach")
441
+ ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
442
+ ptvsd.wait_for_attach()
443
+
444
+ # Setup CUDA, GPU & distributed training
445
+ if args.local_rank == -1 or args.no_cuda:
446
+ device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
447
+ args.n_gpu = torch.cuda.device_count()
448
+ else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
449
+ torch.cuda.set_device(args.local_rank)
450
+ device = torch.device("cuda", args.local_rank)
451
+ torch.distributed.init_process_group(backend='nccl')
452
+ args.n_gpu = 1
453
+ args.device = device
454
+
455
+ # Setup logging
456
+ logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
457
+ datefmt='%m/%d/%Y %H:%M:%S',
458
+ level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
459
+ logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
460
+ args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
461
+
462
+ # Set seed
463
+ set_seed(args)
464
+
465
+ # Prepare GLUE task
466
+ args.task_name = args.task_name.lower()
467
+ print(processors)
468
+ if args.task_name not in processors:
469
+ raise ValueError("Task not found: %s" % (args.task_name))
470
+ processor = processors[args.task_name]()
471
+ args.output_mode = output_modes[args.task_name]
472
+ label_list = processor.get_labels()
473
+ num_labels = len(label_list)
474
+
475
+ # Load pretrained model and tokenizer
476
+ if args.local_rank not in [-1, 0]:
477
+ torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
478
+
479
+ args.model_type = args.model_type.lower()
480
+ config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
481
+ config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
482
+ num_labels=num_labels,
483
+ finetuning_task=args.task_name,
484
+ cache_dir=args.cache_dir if args.cache_dir else None)
485
+ tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
486
+ do_lower_case=args.do_lower_case,
487
+ cache_dir=args.cache_dir if args.cache_dir else None)
488
+ model = model_class.from_pretrained(args.model_name_or_path,
489
+ from_tf=bool('.ckpt' in args.model_name_or_path),
490
+ config=config,
491
+ cache_dir=args.cache_dir if args.cache_dir else None)
492
+
493
+ if args.local_rank == 0:
494
+ torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
495
+
496
+ model.to(args.device)
497
+
498
+ logger.info("Training/evaluation parameters %s", args)
499
+
500
+ # Training
501
+ if args.do_train:
502
+ train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
503
+ global_step, tr_loss = train(args, train_dataset, model, tokenizer)
504
+ logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
505
+
506
+ # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
507
+ if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
508
+ # Create output directory if needed
509
+ if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
510
+ os.makedirs(args.output_dir)
511
+
512
+ logger.info("Saving model checkpoint to %s", args.output_dir)
513
+ # Save a trained model, configuration and tokenizer using `save_pretrained()`.
514
+ # They can then be reloaded using `from_pretrained()`
515
+ model_to_save = model.module if hasattr(model,
516
+ 'module') else model # Take care of distributed/parallel training
517
+ model_to_save.save_pretrained(args.output_dir)
518
+ tokenizer.save_pretrained(args.output_dir)
519
+
520
+ # Good practice: save your training arguments together with the trained model
521
+ torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
522
+
523
+ # Load a trained model and vocabulary that you have fine-tuned
524
+ model = model_class.from_pretrained(args.output_dir)
525
+ tokenizer = tokenizer_class.from_pretrained(args.output_dir)
526
+ model.to(args.device)
527
+
528
+ # Evaluation
529
+ results = {}
530
+ if args.do_eval and args.local_rank in [-1, 0]:
531
+ tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
532
+ checkpoints = [args.output_dir]
533
+ if args.eval_all_checkpoints:
534
+ checkpoints = list(
535
+ os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
536
+ logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
537
+ logger.info("Evaluate the following checkpoints: %s", checkpoints)
538
+ for checkpoint in checkpoints:
539
+ global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
540
+ prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
541
+
542
+ model = model_class.from_pretrained(checkpoint)
543
+ model.to(args.device)
544
+ result = evaluate(args, model, tokenizer, prefix=prefix)
545
+ result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
546
+ results.update(result)
547
+
548
+ return results
549
+
550
+
551
+ if __name__ == "__main__":
552
+ main()
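One detail of run_mrc.py that is easy to miss: for the *ForMultipleChoice models, every example carries one encoded sequence per answer option, so select_field produces nested lists that become tensors of shape [num_examples, num_choices, max_seq_length]. A small self-contained sketch with dummy values (assumptions, not repo data):

import torch

class DummyFeature:
    def __init__(self):
        # four options per example, each encoded to length 4 (dummy token ids)
        self.choices_features = [
            {"input_ids": [101, 7592, 102, 0],
             "input_mask": [1, 1, 1, 0],
             "segment_ids": [0, 0, 0, 0]}
            for _ in range(4)
        ]
        self.label = 1

def select_field(features, field):
    return [[choice[field] for choice in f.choices_features] for f in features]

features = [DummyFeature(), DummyFeature()]
all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
print(all_input_ids.shape)  # torch.Size([2, 4, 4]) -> (examples, choices, seq_len)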
datasets/LogiQA2.0/logiqa/utils_mrc.py ADDED
@@ -0,0 +1,280 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ This Script is Modified for Natural Language Inference Datasets fine-tuning.
18
+ All the datasets can be downloaded from this repo.
19
+ """
20
+
21
+ import logging
22
+ import os
23
+ import sys
24
+ import json
25
+ from typing import List
26
+
27
+ import tqdm
28
+
29
+ from transformers import PreTrainedTokenizer
30
+ from transformers.file_utils import is_tf_available
31
+
32
+ if is_tf_available():
33
+ import tensorflow as tf
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
+ class InputExample(object):
39
+ """A single training/test example for multiple choice"""
40
+
41
+ def __init__(self, example_id, question, contexts, endings, label=None):
42
+ """Constructs a InputExample.
43
+
44
+ Args:
45
+ example_id: Unique id for the example.
46
+ contexts: list of str. The untokenized text of the first sequence (context of corresponding question).
47
+ question: string. The untokenized text of the second sequence (question).
48
+ endings: list of str. multiple choice's options. Its length must be equal to contexts' length.
49
+ label: (Optional) string. The label of the example. This should be
50
+ specified for train and dev examples, but not for test examples.
51
+ """
52
+ self.example_id = example_id
53
+ self.question = question
54
+ self.contexts = contexts
55
+ self.endings = endings
56
+ self.label = label
57
+
58
+
59
+ class InputFeatures(object):
60
+ def __init__(self, example_id, choices_features, label):
61
+ self.example_id = example_id
62
+ self.choices_features = [
63
+ {"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids}
64
+ for input_ids, input_mask, segment_ids in choices_features
65
+ ]
66
+ self.label = label
67
+
68
+ class DataProcessor(object):
69
+ """Base class for data converters for multiple choice data sets."""
70
+
71
+ def get_train_examples(self, data_dir):
72
+ """Gets a collection of `InputExample`s for the train set."""
73
+ raise NotImplementedError()
74
+
75
+ def get_dev_examples(self, data_dir):
76
+ """Gets a collection of `InputExample`s for the dev set."""
77
+ raise NotImplementedError()
78
+
79
+ def get_test_examples(self, data_dir):
80
+ """Gets a collection of `InputExample`s for the test set."""
81
+ raise NotImplementedError()
82
+
83
+ def get_labels(self):
84
+ """Gets the list of labels for this data set."""
85
+ raise NotImplementedError()
86
+
87
+
88
+
89
+ def convert_examples_to_features(
90
+ examples: List[InputExample],
91
+ label_list: List[str],
92
+ max_length: int,
93
+ tokenizer: PreTrainedTokenizer,
94
+ pad_token_segment_id=0,
95
+ pad_on_left=False,
96
+ pad_token=0,
97
+ mask_padding_with_zero=True,
98
+ ) -> List[InputFeatures]:
99
+ """
100
+ Loads a data file into a list of `InputFeatures`
101
+ """
102
+
103
+ label_map = {label: i for i, label in enumerate(label_list)}
104
+
105
+ features = []
106
+ for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
107
+ if ex_index % 10000 == 0:
108
+ logger.info("Writing example %d of %d" % (ex_index, len(examples)))
109
+ choices_features = []
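+ # Build one (context, question + option) input pair per answer choice; all four pairs share the example's label.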
110
+ for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)):
111
+ text_a = context
112
+ if example.question.find("_") != -1:
113
+ # this is for cloze question
114
+ text_b = example.question.replace("_", ending)
115
+ else:
116
+ text_b = example.question + " " + ending
117
+
118
+ inputs = tokenizer.encode_plus(text_a, text_b, add_special_tokens=True, max_length=max_length, return_token_type_ids=True)
119
+ if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
120
+ logger.info(
121
+ "Attention! you are cropping tokens (swag task is ok). "
122
+ "If you are training ARC and RACE and you are poping question + options,"
123
+ "you need to try to use a bigger max seq length!"
124
+ )
125
+
126
+ input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
127
+
128
+ # The mask has 1 for real tokens and 0 for padding tokens. Only real
129
+ # tokens are attended to.
130
+ attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
131
+
132
+ # Zero-pad up to the sequence length.
133
+ padding_length = max_length - len(input_ids)
134
+ if pad_on_left:
135
+ input_ids = ([pad_token] * padding_length) + input_ids
136
+ attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
137
+ token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
138
+ else:
139
+ input_ids = input_ids + ([pad_token] * padding_length)
140
+ attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
141
+ token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
142
+
143
+ assert len(input_ids) == max_length
144
+ assert len(attention_mask) == max_length
145
+ assert len(token_type_ids) == max_length
146
+ choices_features.append((input_ids, attention_mask, token_type_ids))
147
+
148
+ label = label_map[example.label]
149
+
150
+ if ex_index < 2:
151
+ logger.info("*** Example ***")
152
+ logger.info("race_id: {}".format(example.example_id))
153
+ for choice_idx, (input_ids, attention_mask, token_type_ids) in enumerate(choices_features):
154
+ logger.info("choice: {}".format(choice_idx))
155
+ logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
156
+ logger.info("attention_mask: {}".format(" ".join(map(str, attention_mask))))
157
+ logger.info("token_type_ids: {}".format(" ".join(map(str, token_type_ids))))
158
+ logger.info("label: {}".format(label))
159
+
160
+ features.append(InputFeatures(example_id=example.example_id, choices_features=choices_features, label=label,))
161
+
162
+ return features
163
+
164
+
165
+
166
+
167
+ class LogiProcessor(DataProcessor):
168
+ """Processor for the ReClor data set."""
169
+
170
+ def get_train_examples(self, data_dir):
171
+ """See base class."""
172
+ logger.info("LOOKING AT {} train".format(data_dir))
173
+ return self._create_examples(self._read_json(os.path.join(data_dir, "train.txt")), "train")
174
+
175
+ def get_dev_examples(self, data_dir):
176
+ """See base class."""
177
+ logger.info("LOOKING AT {} dev".format(data_dir))
178
+ return self._create_examples(self._read_json(os.path.join(data_dir, "dev.txt")), "dev")
179
+
180
+ def get_test_examples(self, data_dir):
181
+ logger.info("LOOKING AT {} test".format(data_dir))
182
+ return self._create_examples(self._read_json(os.path.join(data_dir, "test.txt")), "test")
183
+
184
+ def get_labels(self):
185
+ """See base class."""
186
+ return [0, 1, 2, 3]
187
+
188
+ def _read_json(self, input_file):
189
+ with open(input_file, 'r') as f:
190
+ lines = []
191
+ file = f.readlines()
192
+ for line in file:
193
+ line = json.loads(line)
194
+ lines.append(line)
195
+ return lines
196
+
197
+ # def _read_json(self, input_file):
198
+ # with open(input_file, "r") as f:
199
+ # lines = json.load(f)
200
+ # return lines
201
+
202
+ def _create_examples(self, lines, type):
203
+ """Creates examples for the training and dev sets."""
204
+ examples = []
205
+ for d in lines:
206
+ context = d['text']
207
+ question = d['question']
208
+ answers = d['options']
209
+ label = 0 if type == "test" else d['answer'] # for test set, there is no label. Just use 0 for convenience.
210
+ id_string = d['id']
211
+ examples.append(
212
+ InputExample(
213
+ example_id = id_string,
214
+ question = question,
215
+ contexts=[context, context, context, context], # this is not efficient but convenient
216
+ endings=[answers[0], answers[1], answers[2], answers[3]],
217
+ label = label
218
+ )
219
+ )
220
+ return examples
221
+
222
+
223
+ try:
224
+ from scipy.stats import pearsonr, spearmanr
225
+ from sklearn.metrics import matthews_corrcoef, f1_score, confusion_matrix
226
+
227
+ _has_sklearn = True
228
+ except (AttributeError, ImportError):
229
+ _has_sklearn = False
230
+
231
+
232
+ def is_sklearn_available():
233
+ return _has_sklearn
234
+
235
+
236
+ if _has_sklearn:
237
+
238
+ def simple_accuracy(preds, labels):
239
+ return (preds == labels).mean()
240
+
241
+ def acc_and_f1(preds, labels):
242
+ acc = simple_accuracy(preds, labels)
243
+ f1 = f1_score(y_true=labels, y_pred=preds)
244
+ return {
245
+ "acc": acc,
246
+ "f1": f1,
247
+ "acc_and_f1": (acc + f1) / 2,
248
+ }
249
+
250
+ def pearson_and_spearman(preds, labels):
251
+ pearson_corr = pearsonr(preds, labels)[0]
252
+ spearman_corr = spearmanr(preds, labels)[0]
253
+ return {
254
+ "pearson": pearson_corr,
255
+ "spearmanr": spearman_corr,
256
+ "corr": (pearson_corr + spearman_corr) / 2,
257
+ }
258
+
259
+ def compute_metrics(task_name, preds, labels):
260
+ assert len(preds) == len(labels)
261
+ if task_name == "logiqa":
262
+ return {"acc": simple_accuracy(labels, preds)}
263
+ else:
264
+ raise KeyError(task_name)
265
+
266
+
267
+ tasks_num_labels = {
268
+ "logiqa": 4,
269
+
270
+ }
271
+
272
+ processors = {
273
+ "logiqa": LogiProcessor,
274
+
275
+ }
276
+
277
+ output_modes = {
278
+ "logiqa": "classification",
279
+
280
+ }
datasets/LogiQA2.0/logiqa2nli/DATA/QA2NLI/ dev_new.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1542bf081f8020b99efade38d67552f3687e47bc47636e5a4ea90f439c508b3
3
+ size 2978446
datasets/LogiQA2.0/logiqa2nli/DATA/QA2NLI/dev.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:924ac0ea84135e8ebcc2332e093dc84bf85be722e67df77ee20ec508ecf12f36
3
+ size 2453230
datasets/LogiQA2.0/logiqa2nli/DATA/QA2NLI/readme.md ADDED
@@ -0,0 +1 @@
1
+ # LogiQA 2.0 NLI version
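+ 
+ Judging from the loading code in `stat.py` and `nli-prompt.py`, each line of train/dev/test.txt is a JSON object with `major_premise`, `minor_premise`, `conclusion` and a `label` field ("not entailed" marks non-entailment; any other value is treated as entailed).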
datasets/LogiQA2.0/logiqa2nli/DATA/QA2NLI/stat.py ADDED
@@ -0,0 +1,25 @@
1
+ import json
2
+
3
+ with open('test.txt', 'r') as f:
4
+ file = f.readlines()
5
+ n = 0
6
+ l = 0
7
+ for line in file:
8
+ line = json.loads(line)
9
+ text1 = line['major_premise']
10
+ text2 = line['minor_premise']
11
+ if type(text1) == str:
12
+ l = l + len(text1.split(" "))
13
+ else:
14
+ for text in text1:
15
+ l = l + len(text.split(" "))
16
+ if type(text2) == str:
17
+ l = l + len(text2.split(" "))
18
+ else:
19
+ for text in text2:
20
+ l = l + len(text.split(" "))
21
+
22
+ n += 1
23
+
24
+ result = l/n
25
+ print(result)
datasets/LogiQA2.0/logiqa2nli/DATA/QA2NLI/test.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf5716861d7611d7d2aeed831c4651b1da9e717d318fcee043153bba34af8cfb
3
+ size 2458217
datasets/LogiQA2.0/logiqa2nli/DATA/QA2NLI/test_new.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1d1a4db8ea93c979f7209dfa5db03598c1b645a66dfc7db89421461e85013ec
3
+ size 2484632
datasets/LogiQA2.0/logiqa2nli/DATA/QA2NLI/train.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ade56a6c7a4ee9ff68a7193b07e0630df0955f305412ef561c50cb5b0a601f2
3
+ size 19684892
datasets/LogiQA2.0/logiqa2nli/DATA/QA2NLI/train_new.txt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6669ad14ec82772471807f1231ae2b244d2be3983642c09cf00601e5c29d522
3
+ size 5325690
datasets/LogiQA2.0/logiqa2nli/nli-prompt.py ADDED
@@ -0,0 +1,51 @@
1
+ import json
2
+ import time
3
+ import openai
4
+ import sklearn
5
+ from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
6
+ openai.api_key = ''
7
+
8
+ incontext = "Given the fact: All Cantonese are southerners. Some Cantonese don't like chili. Does it follow that: Some southerners don't like chili. Yes or no? yes\nGiven the fact: It is difficult for cactus to survive in humid climates; citrus is difficult to grow in cold climates. In most parts of a province, at least one species is not difficult to survive and grow between cactus and citrus. Does it follow that: Half of the province is humid and cold. Yes or no? no\nGiven the fact: It is difficult for cactus to survive in humid climates; citrus is difficult to grow in cold climates. In most parts of a province, at least one species is not difficult to survive and grow between cactus and citrus. Does it follow that: Most of the province is hot. Yes or no? no\nGiven the fact: It is difficult for cactus to survive in humid climates; citrus is difficult to grow in cold climates. In most parts of a province, at least one species is not difficult to survive and grow between cactus and citrus. Does it follow that: Most of the province is either dry or warm. Yes or no? yes\n"
9
+ def gpt3_api(prompt):
10
+ response = openai.Completion.create(
11
+ model="text-davinci-002",
12
+ prompt=incontext + prompt,
13
+ temperature=0,
14
+ max_tokens=60,
15
+ top_p=1.0,
16
+ frequency_penalty=0.0,
17
+ presence_penalty=0.0
18
+ )
19
+ return response
20
+
21
+ with open('test1.txt') as f:
22
+ c = 0
23
+ y_true = []
24
+ y_pred = []
25
+ lines = f.readlines()
26
+ for i, line in enumerate(lines):
27
+ line_dict = json.loads(line)
28
+
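+ # binary gold label: 1 = entailed, 0 = "not entailed"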
29
+ label = 0 if line_dict['label']=="not entailed" else 1
30
+ maj_premise = ' '.join(line_dict['major_premise'])
31
+ min_premise = ' '.join(line_dict['minor_premise'])
32
+ hypo = line_dict['conclusion']
33
+ prompt_input = "Given the fact: " + maj_premise + ' ' + min_premise + " Does it follow that: " + hypo + " Yes or no?"
34
+
35
+ y_true.append(label)
36
+ prompt = prompt_input
37
+ output = gpt3_api(prompt)
38
+ time.sleep(5)
39
+ pred = output.choices[0].text.strip().lower()
40
+ y_pred.append(1 if "yes" in pred else 0)  # map the model's yes/no answer to a binary label so the sklearn metrics below can be computed
41
+
42
+ print(y_true)
43
+ print(y_pred)
44
+ f_score = f1_score(y_true, y_pred, average='binary')
45
+ p_score = precision_score(y_true, y_pred, average='binary')
46
+ r_score = recall_score(y_true, y_pred, average='binary')
47
+ acc = accuracy_score(y_true, y_pred)
48
+ print(f_score)
49
+ print(p_score)
50
+ print(r_score)
51
+ print(acc)
datasets/LogiQA2.0/logiqa2nli/qa2nli.sh ADDED
@@ -0,0 +1,20 @@
1
+ export DATA_DIR=./DATA
2
+ export TASK_NAME=QA2NLI
3
+
4
+ CUDA_VISIBLE_DEVICES=0,1 python run_nli.py \
5
+ --model_type bert \
6
+ --model_name_or_path bert-base-uncased \
7
+ --task_name $TASK_NAME \
8
+ --do_eval \
9
+ --do_lower_case \
10
+ --data_dir $DATA_DIR/$TASK_NAME \
11
+ --max_seq_length 128 \
12
+ --per_gpu_eval_batch_size=64 \
13
+ --per_gpu_train_batch_size=64 \
14
+ --gradient_accumulation_steps 2\
15
+ --learning_rate 1e-5 \
16
+ --num_train_epochs 10.0 \
17
+ --logging_steps 5000 \
18
+ --save_steps 5000 \
19
+ --output_dir ./tmp/$TASK_NAME/ \
20
+ #--overwrite_output_dir \
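+ # Evaluation-only run: --do_train is not passed, so the training hyper-parameters above are ignored.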
datasets/LogiQA2.0/logiqa2nli/run_nli.py ADDED
@@ -0,0 +1,549 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa)."""
17
+
18
+ from __future__ import absolute_import, division, print_function
19
+
20
+ import argparse
21
+ import glob
22
+ import logging
23
+ import os
24
+ import random
25
+
26
+ import numpy as np
27
+ import torch
28
+ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
29
+ TensorDataset)
30
+ from torch.utils.data.distributed import DistributedSampler
31
+
32
+ try:
33
+ from torch.utils.tensorboard import SummaryWriter
34
+ except:
35
+ from tensorboardX import SummaryWriter
36
+
37
+ from tqdm import tqdm, trange
38
+
39
+ from transformers import (WEIGHTS_NAME, BertConfig,
40
+ BertForSequenceClassification, BertTokenizer,
41
+ RobertaConfig,
42
+ RobertaForSequenceClassification,
43
+ RobertaTokenizer,
44
+ XLMConfig, XLMForSequenceClassification,
45
+ XLMTokenizer, XLNetConfig,
46
+ XLNetForSequenceClassification,
47
+ XLNetTokenizer,
48
+ DistilBertConfig,
49
+ DistilBertForSequenceClassification,
50
+ DistilBertTokenizer,
51
+ AlbertConfig,
52
+ AlbertForSequenceClassification,
53
+ AlbertTokenizer,
54
+ )
55
+
56
+ from transformers import AdamW, get_linear_schedule_with_warmup
57
+
58
+ from utils_nli import compute_metrics
59
+ from utils_nli import output_modes
60
+ from utils_nli import processors
61
+ from utils_nli import convert_examples_to_features
62
+
63
+
64
+ logger = logging.getLogger(__name__)
65
+
66
+ ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig,
67
+ RobertaConfig, DistilBertConfig)), ())
68
+
69
+ MODEL_CLASSES = {
70
+ 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
71
+ 'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
72
+ 'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
73
+ 'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
74
+ 'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
75
+ 'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer)
76
+ }
77
+
78
+
79
+ def set_seed(args):
80
+ random.seed(args.seed)
81
+ np.random.seed(args.seed)
82
+ torch.manual_seed(args.seed)
83
+ if args.n_gpu > 0:
84
+ torch.cuda.manual_seed_all(args.seed)
85
+
86
+
87
+ def train(args, train_dataset, model, tokenizer):
88
+ """ Train the model """
89
+ if args.local_rank in [-1, 0]:
90
+ tb_writer = SummaryWriter()
91
+
92
+ args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
93
+ train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
94
+ train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
95
+
96
+ if args.max_steps > 0:
97
+ t_total = args.max_steps
98
+ args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
99
+ else:
100
+ t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
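+ # t_total is the number of optimizer updates: batches per epoch divided by the accumulation steps, times the number of epochs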
101
+
102
+ # Prepare optimizer and schedule (linear warmup and decay)
103
+ no_decay = ['bias', 'LayerNorm.weight']
104
+ optimizer_grouped_parameters = [
105
+ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
106
+ 'weight_decay': args.weight_decay},
107
+ {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
108
+ ]
109
+
110
+ optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
111
+ scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
112
+ num_training_steps=t_total)
113
+ if args.fp16:
114
+ try:
115
+ from apex import amp
116
+ except ImportError:
117
+ raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
118
+ model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
119
+
120
+ # multi-gpu training (should be after apex fp16 initialization)
121
+ if args.n_gpu > 1:
122
+ model = torch.nn.DataParallel(model)
123
+
124
+ # Distributed training (should be after apex fp16 initialization)
125
+ if args.local_rank != -1:
126
+ model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
127
+ output_device=args.local_rank,
128
+ find_unused_parameters=True)
129
+
130
+ # Train!
131
+ logger.info("***** Running training *****")
132
+ logger.info(" Num examples = %d", len(train_dataset))
133
+ logger.info(" Num Epochs = %d", args.num_train_epochs)
134
+ logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
135
+ logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
136
+ args.train_batch_size * args.gradient_accumulation_steps * (
137
+ torch.distributed.get_world_size() if args.local_rank != -1 else 1))
138
+ logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
139
+ logger.info(" Total optimization steps = %d", t_total)
140
+
141
+ global_step = 0
142
+ tr_loss, logging_loss = 0.0, 0.0
143
+ model.zero_grad()
144
+ train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
145
+ set_seed(args) # Added here for reproducibility (even between python 2 and 3)
146
+ for _ in train_iterator:
147
+ epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
148
+ for step, batch in enumerate(epoch_iterator):
149
+ model.train()
150
+ batch = tuple(t.to(args.device) for t in batch)
151
+ inputs = {'input_ids': batch[0],
152
+ 'attention_mask': batch[1],
153
+ 'labels': batch[3]}
154
+ if args.model_type != 'distilbert':
155
+ inputs['token_type_ids'] = batch[2] if args.model_type in ['bert',
156
+ 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids
157
+ outputs = model(**inputs)
158
+ loss = outputs[0] # model outputs are always tuple in transformers (see doc)
159
+
160
+ if args.n_gpu > 1:
161
+ loss = loss.mean() # mean() to average on multi-gpu parallel training
162
+ if args.gradient_accumulation_steps > 1:
163
+ loss = loss / args.gradient_accumulation_steps
164
+
165
+ if args.fp16:
166
+ with amp.scale_loss(loss, optimizer) as scaled_loss:
167
+ scaled_loss.backward()
168
+ else:
169
+ loss.backward()
170
+
171
+ tr_loss += loss.item()
172
+ if (step + 1) % args.gradient_accumulation_steps == 0:
173
+ if args.fp16:
174
+ torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
175
+ else:
176
+ torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
177
+
178
+ optimizer.step()
179
+ scheduler.step() # Update learning rate schedule
180
+ model.zero_grad()
181
+ global_step += 1
182
+
183
+ if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
184
+ # Log metrics
185
+ if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
186
+ results = evaluate(args, model, tokenizer)
187
+ for key, value in results.items():
188
+ tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
189
+ tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
190
+ tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
191
+ logging_loss = tr_loss
192
+
193
+ if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
194
+ # Save model checkpoint
195
+ output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
196
+ if not os.path.exists(output_dir):
197
+ os.makedirs(output_dir)
198
+ model_to_save = model.module if hasattr(model,
199
+ 'module') else model # Take care of distributed/parallel training
200
+ model_to_save.save_pretrained(output_dir)
201
+ torch.save(args, os.path.join(output_dir, 'training_args.bin'))
202
+ logger.info("Saving model checkpoint to %s", output_dir)
203
+
204
+ if args.max_steps > 0 and global_step > args.max_steps:
205
+ epoch_iterator.close()
206
+ break
207
+ if args.max_steps > 0 and global_step > args.max_steps:
208
+ train_iterator.close()
209
+ break
210
+
211
+ if args.local_rank in [-1, 0]:
212
+ tb_writer.close()
213
+
214
+ return global_step, tr_loss / global_step
215
+
216
+
217
+ def evaluate(args, model, tokenizer, prefix=""):
218
+ # Loop to handle MNLI double evaluation (matched, mis-matched)
219
+ eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
220
+ eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
221
+
222
+ results = {}
223
+ for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
224
+ eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
225
+
226
+ if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
227
+ os.makedirs(eval_output_dir)
228
+
229
+ args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
230
+ # Note that DistributedSampler samples randomly
231
+ eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
232
+ eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
233
+
234
+ # multi-gpu eval
235
+ if args.n_gpu > 1:
236
+ model = torch.nn.DataParallel(model)
237
+
238
+ # Eval!
239
+ logger.info("***** Running evaluation {} *****".format(prefix))
240
+ logger.info(" Num examples = %d", len(eval_dataset))
241
+ logger.info(" Batch size = %d", args.eval_batch_size)
242
+ eval_loss = 0.0
243
+ nb_eval_steps = 0
244
+ preds = None
245
+ out_label_ids = None
246
+ for batch in tqdm(eval_dataloader, desc="Evaluating"):
247
+ model.eval()
248
+ batch = tuple(t.to(args.device) for t in batch)
249
+
250
+ with torch.no_grad():
251
+ inputs = {'input_ids': batch[0],
252
+ 'attention_mask': batch[1],
253
+ 'labels': batch[3]}
254
+ if args.model_type != 'distilbert':
255
+ inputs['token_type_ids'] = batch[2] if args.model_type in ['bert',
256
+ 'xlnet'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids
257
+ outputs = model(**inputs)
258
+ tmp_eval_loss, logits = outputs[:2]
259
+
260
+ eval_loss += tmp_eval_loss.mean().item()
261
+ nb_eval_steps += 1
262
+ if preds is None:
263
+ preds = logits.detach().cpu().numpy()
264
+ out_label_ids = inputs['labels'].detach().cpu().numpy()
265
+ else:
266
+ preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
267
+ out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
268
+
269
+ eval_loss = eval_loss / nb_eval_steps
270
+ if args.output_mode == "classification":
271
+ preds = np.argmax(preds, axis=1)
272
+ elif args.output_mode == "regression":
273
+ preds = np.squeeze(preds)
274
+ result = {"eval": compute_metrics(eval_task, preds, out_label_ids), "loss": eval_loss}
275
+ results.update(result)
276
+
277
+ output_pred_file = os.path.join(eval_output_dir, prefix, "pred_results.txt")
278
+ with open(output_pred_file, "a") as writer:
279
+ for pred in preds:
280
+ writer.write(str(pred)+"\n")
281
+
282
+ output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
283
+ with open(output_eval_file, "w") as writer:
284
+ logger.info("***** Eval results {} *****".format(prefix))
285
+ for key in sorted(result.keys()):
286
+ logger.info(" %s = %s", key, str(result[key]))
287
+ writer.write("%s = %s\n" % (key, str(result[key])))
288
+
289
+ return results
290
+
291
+
292
+ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
293
+ if args.local_rank not in [-1, 0] and not evaluate:
294
+ torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
295
+
296
+ processor = processors[task]()
297
+ output_mode = output_modes[task]
298
+ # Load data features from cache or dataset file
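+ # The cache file name encodes split, model, max length and task, e.g. cached_train_bert-base-uncased_128_qa2nli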
299
+ cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
300
+ 'dev' if evaluate else 'train',
301
+ list(filter(None, args.model_name_or_path.split('/'))).pop(),
302
+ str(args.max_seq_length),
303
+ str(task)))
304
+ if os.path.exists(cached_features_file) and not args.overwrite_cache:
305
+ logger.info("Loading features from cached file %s", cached_features_file)
306
+ features = torch.load(cached_features_file)
307
+ else:
308
+ logger.info("Creating features from dataset file at %s", args.data_dir)
309
+ label_list = processor.get_labels()
310
+ if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
311
+ # HACK(label indices are swapped in RoBERTa pretrained model)
312
+ label_list[1], label_list[2] = label_list[2], label_list[1]
313
+ examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(
314
+ args.data_dir)
315
+ features = convert_examples_to_features(examples,
316
+ tokenizer,
317
+ label_list=label_list,
318
+ max_length=args.max_seq_length,
319
+ output_mode=output_mode,
320
+ pad_on_left=bool(args.model_type in ['xlnet']),
321
+ # pad on the left for xlnet
322
+ pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
323
+ pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
324
+ )
325
+ if args.local_rank in [-1, 0]:
326
+ logger.info("Saving features into cached file %s", cached_features_file)
327
+ torch.save(features, cached_features_file)
328
+
329
+ if args.local_rank == 0 and not evaluate:
330
+ torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
331
+
332
+ # Convert to Tensors and build dataset
333
+ all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
334
+ all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
335
+ all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
336
+ if output_mode == "classification":
337
+ all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
338
+ elif output_mode == "regression":
339
+ all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
340
+
341
+ dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)
342
+ return dataset
343
+
344
+
345
+ def main():
346
+ parser = argparse.ArgumentParser()
347
+
348
+ ## Required parameters
349
+ parser.add_argument("--data_dir", default=None, type=str, required=True,
350
+ help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
351
+ parser.add_argument("--model_type", default=None, type=str, required=True,
352
+ help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
353
+ parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
354
+ help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(
355
+ ALL_MODELS))
356
+ parser.add_argument("--task_name", default=None, type=str, required=True,
357
+ help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
358
+ parser.add_argument("--output_dir", default=None, type=str, required=True,
359
+ help="The output directory where the model predictions and checkpoints will be written.")
360
+
361
+ ## Other parameters
362
+ parser.add_argument("--config_name", default="", type=str,
363
+ help="Pretrained config name or path if not the same as model_name")
364
+ parser.add_argument("--tokenizer_name", default="", type=str,
365
+ help="Pretrained tokenizer name or path if not the same as model_name")
366
+ parser.add_argument("--cache_dir", default="", type=str,
367
+ help="Where do you want to store the pre-trained models downloaded from s3")
368
+ parser.add_argument("--max_seq_length", default=128, type=int,
369
+ help="The maximum total input sequence length after tokenization. Sequences longer "
370
+ "than this will be truncated, sequences shorter will be padded.")
371
+ parser.add_argument("--do_train", action='store_true',
372
+ help="Whether to run training.")
373
+ parser.add_argument("--do_eval", action='store_true',
374
+ help="Whether to run eval on the dev set.")
375
+ parser.add_argument("--evaluate_during_training", action='store_true',
376
+ help="Rul evaluation during training at each logging step.")
377
+ parser.add_argument("--do_lower_case", action='store_true',
378
+ help="Set this flag if you are using an uncased model.")
379
+
380
+ parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
381
+ help="Batch size per GPU/CPU for training.")
382
+ parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
383
+ help="Batch size per GPU/CPU for evaluation.")
384
+ parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
385
+ help="Number of updates steps to accumulate before performing a backward/update pass.")
386
+ parser.add_argument("--learning_rate", default=5e-5, type=float,
387
+ help="The initial learning rate for Adam.")
388
+ parser.add_argument("--weight_decay", default=0.0, type=float,
389
+ help="Weight deay if we apply some.")
390
+ parser.add_argument("--adam_epsilon", default=1e-8, type=float,
391
+ help="Epsilon for Adam optimizer.")
392
+ parser.add_argument("--max_grad_norm", default=1.0, type=float,
393
+ help="Max gradient norm.")
394
+ parser.add_argument("--num_train_epochs", default=3.0, type=float,
395
+ help="Total number of training epochs to perform.")
396
+ parser.add_argument("--max_steps", default=-1, type=int,
397
+ help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
398
+ parser.add_argument("--warmup_steps", default=0, type=int,
399
+ help="Linear warmup over warmup_steps.")
400
+
401
+ parser.add_argument('--logging_steps', type=int, default=50,
402
+ help="Log every X updates steps.")
403
+ parser.add_argument('--save_steps', type=int, default=50,
404
+ help="Save checkpoint every X updates steps.")
405
+ parser.add_argument("--eval_all_checkpoints", action='store_true',
406
+ help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
407
+ parser.add_argument("--no_cuda", action='store_true',
408
+ help="Avoid using CUDA when available")
409
+ parser.add_argument('--overwrite_output_dir', action='store_true',
410
+ help="Overwrite the content of the output directory")
411
+ parser.add_argument('--overwrite_cache', action='store_true',
412
+ help="Overwrite the cached training and evaluation sets")
413
+ parser.add_argument('--seed', type=int, default=42,
414
+ help="random seed for initialization")
415
+
416
+ parser.add_argument('--fp16', action='store_true',
417
+ help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
418
+ parser.add_argument('--fp16_opt_level', type=str, default='O1',
419
+ help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
420
+ "See details at https://nvidia.github.io/apex/amp.html")
421
+ parser.add_argument("--local_rank", type=int, default=-1,
422
+ help="For distributed training: local_rank")
423
+ parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
424
+ parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
425
+ args = parser.parse_args()
426
+
427
+ if os.path.exists(args.output_dir) and os.listdir(
428
+ args.output_dir) and args.do_train and not args.overwrite_output_dir:
429
+ raise ValueError(
430
+ "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
431
+ args.output_dir))
432
+
433
+ # Setup distant debugging if needed
434
+ if args.server_ip and args.server_port:
435
+ # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
436
+ import ptvsd
437
+ print("Waiting for debugger attach")
438
+ ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
439
+ ptvsd.wait_for_attach()
440
+
441
+ # Setup CUDA, GPU & distributed training
442
+ if args.local_rank == -1 or args.no_cuda:
443
+ device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
444
+ args.n_gpu = torch.cuda.device_count()
445
+ else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
446
+ torch.cuda.set_device(args.local_rank)
447
+ device = torch.device("cuda", args.local_rank)
448
+ torch.distributed.init_process_group(backend='nccl')
449
+ args.n_gpu = 1
450
+ args.device = device
451
+
452
+ # Setup logging
453
+ logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
454
+ datefmt='%m/%d/%Y %H:%M:%S',
455
+ level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
456
+ logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
457
+ args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
458
+
459
+ # Set seed
460
+ set_seed(args)
461
+
462
+ # Prepare GLUE task
463
+ args.task_name = args.task_name.lower()
464
+ print(processors)
465
+ if args.task_name not in processors:
466
+ raise ValueError("Task not found: %s" % (args.task_name))
467
+ processor = processors[args.task_name]()
468
+ args.output_mode = output_modes[args.task_name]
469
+ label_list = processor.get_labels()
470
+ num_labels = len(label_list)
471
+
472
+ # Load pretrained model and tokenizer
473
+ if args.local_rank not in [-1, 0]:
474
+ torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
475
+
476
+ args.model_type = args.model_type.lower()
477
+ config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
478
+ config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
479
+ num_labels=num_labels,
480
+ finetuning_task=args.task_name,
481
+ cache_dir=args.cache_dir if args.cache_dir else None)
482
+ tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
483
+ do_lower_case=args.do_lower_case,
484
+ cache_dir=args.cache_dir if args.cache_dir else None)
485
+ model = model_class.from_pretrained(args.model_name_or_path,
486
+ from_tf=bool('.ckpt' in args.model_name_or_path),
487
+ config=config,
488
+ cache_dir=args.cache_dir if args.cache_dir else None)
489
+
490
+ if args.local_rank == 0:
491
+ torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
492
+
493
+ model.to(args.device)
494
+
495
+ logger.info("Training/evaluation parameters %s", args)
496
+
497
+ # Training
498
+ if args.do_train:
499
+ train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
500
+ global_step, tr_loss = train(args, train_dataset, model, tokenizer)
501
+ logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
502
+
503
+ # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
504
+ if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
505
+ # Create output directory if needed
506
+ if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
507
+ os.makedirs(args.output_dir)
508
+
509
+ logger.info("Saving model checkpoint to %s", args.output_dir)
510
+ # Save a trained model, configuration and tokenizer using `save_pretrained()`.
511
+ # They can then be reloaded using `from_pretrained()`
512
+ model_to_save = model.module if hasattr(model,
513
+ 'module') else model # Take care of distributed/parallel training
514
+ model_to_save.save_pretrained(args.output_dir)
515
+ tokenizer.save_pretrained(args.output_dir)
516
+
517
+ # Good practice: save your training arguments together with the trained model
518
+ torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
519
+
520
+ # Load a trained model and vocabulary that you have fine-tuned
521
+ model = model_class.from_pretrained(args.output_dir)
522
+ tokenizer = tokenizer_class.from_pretrained(args.output_dir)
523
+ model.to(args.device)
524
+
525
+ # Evaluation
526
+ results = {}
527
+ if args.do_eval and args.local_rank in [-1, 0]:
528
+ tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
529
+ checkpoints = [args.output_dir]
530
+ if args.eval_all_checkpoints:
531
+ checkpoints = list(
532
+ os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
533
+ logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
534
+ logger.info("Evaluate the following checkpoints: %s", checkpoints)
535
+ for checkpoint in checkpoints:
536
+ global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
537
+ prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
538
+
539
+ model = model_class.from_pretrained(checkpoint)
540
+ model.to(args.device)
541
+ result = evaluate(args, model, tokenizer, prefix=prefix)
542
+ result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
543
+ results.update(result)
544
+
545
+ return results
546
+
547
+
548
+ if __name__ == "__main__":
549
+ main()
datasets/LogiQA2.0/logiqa2nli/scripts/anli.sh ADDED
@@ -0,0 +1,21 @@
1
+ export DATA_DIR=../DATA
2
+ export TASK_NAME=ANLI
3
+
4
+ CUDA_VISIBLE_DEVICES=0,1 python ../run_nli.py \
5
+ --model_type bert \
6
+ --model_name_or_path bert-base-uncased \
7
+ --task_name $TASK_NAME \
8
+ --do_train \
9
+ --do_eval \
10
+ --do_lower_case \
11
+ --data_dir $DATA_DIR/$TASK_NAME \
12
+ --max_seq_length 128 \
13
+ --per_gpu_eval_batch_size=64 \
14
+ --per_gpu_train_batch_size=64 \
15
+ --gradient_accumulation_steps 2\
16
+ --learning_rate 1e-5 \
17
+ --num_train_epochs 10.0 \
18
+ --logging_steps 5000 \
19
+ --save_steps 5000 \
20
+ --output_dir ./tmp/$TASK_NAME/ \
21
+ #--overwrite_output_dir \
datasets/LogiQA2.0/logiqa2nli/scripts/cood.sh ADDED
@@ -0,0 +1,4 @@
1
+ export DATA_DIR=../DATA
2
+ export TASK_NAME=COOD
3
+
4
+ python ../run_nli.py --model_type bert --model_name_or_path bert-base-uncased --task_name $TASK_NAME --do_train --do_eval --do_lower_case --data_dir $DATA_DIR/$TASK_NAME --max_seq_length 128 --per_gpu_eval_batch_size=16 --per_gpu_train_batch_size=16 --gradient_accumulation_steps 2 --logging_steps 1000 --save_steps 1000 --learning_rate 2e-5 --eval_all_checkpoints --num_train_epochs 10.0 --output_dir ./tmp/$TASK_NAME/bert-base/
datasets/LogiQA2.0/logiqa2nli/scripts/mnli.sh ADDED
@@ -0,0 +1,21 @@
1
+ export DATA_DIR=../DATA
2
+ export TASK_NAME=MNLI
3
+
4
+ CUDA_VISIBLE_DEVICES=0,1 python ../run_nli.py \
5
+ --model_type bert \
6
+ --model_name_or_path bert-base-uncased \
7
+ --task_name $TASK_NAME \
8
+ --do_train \
9
+ --do_eval \
10
+ --do_lower_case \
11
+ --data_dir $DATA_DIR/$TASK_NAME \
12
+ --max_seq_length 128 \
13
+ --per_gpu_eval_batch_size=64 \
14
+ --per_gpu_train_batch_size=64 \
15
+ --gradient_accumulation_steps 2\
16
+ --learning_rate 2e-5 \
17
+ --num_train_epochs 2.0 \
18
+ --logging_steps 5000 \
19
+ --save_steps 5000 \
20
+ --output_dir ./tmp/$TASK_NAME/ \
21
+ #--overwrite_output_dir \
datasets/LogiQA2.0/logiqa2nli/scripts/multirun.sh ADDED
@@ -0,0 +1,8 @@
1
+ export DATA_DIR=../DATA
2
+ export TASK_NAME=QA2NLI
3
+ python ../run_nli.py --model_type bert --model_name_or_path bert-base-uncased --task_name $TASK_NAME --do_train --eval_all_checkpoints --do_eval --do_lower_case --data_dir $DATA_DIR/$TASK_NAME --max_seq_length 128 --per_gpu_eval_batch_size=16 --per_gpu_train_batch_size=8 --gradient_accumulation_steps 3 --logging_steps 5000 --save_steps 5000 --eval_all_checkpoints --learning_rate 2e-5 --num_train_epochs 2.0 --output_dir ./tmp/$TASK_NAME/bert-base/
4
+ #python ../run_nli.py --model_type bert --model_name_or_path bert-large-uncased --task_name $TASK_NAME --do_train --evaluate_during_training --do_eval --do_lower_case --data_dir $DATA_DIR/$TASK_NAME --max_seq_length 128 --per_gpu_eval_batch_size=16 --per_gpu_train_batch_size=8 --gradient_accumulation_steps 1 --learning_rate 2e-5 --save_steps 200 --adam_epsilon 1e-6 --no_clip_grad_norm --warmup_proportion 0.1 --num_train_epochs 5.0 --output_dir ./tmp/$TASK_NAME/bertlarge/
5
+ python ../run_nli.py --model_type roberta --model_name_or_path roberta-base --task_name $TASK_NAME --do_train --do_eval --eval_all_checkpoints --do_lower_case --data_dir $DATA_DIR/$TASK_NAME --max_seq_length 256 --per_gpu_eval_batch_size=16 --per_gpu_train_batch_size=8 --gradient_accumulation_steps 3 --logging_steps 5000 --save_steps 5000 --eval_all_checkpoints --learning_rate 1e-5 --num_train_epochs 2.0 --output_dir ./tmp/$TASK_NAME/roberta/
6
+ #python ../run_nli.py --model_type roberta --model_name_or_path /home/bimu/PycharmProjects/liu_nli/tmp-1/QNLI/roberta/ --task_name $TASK_NAME --do_train --do_eval --do_lower_case --data_dir $DATA_DIR/$TASK_NAME --max_seq_length 128 --per_gpu_eval_batch_size=16 --per_gpu_train_batch_size=16 --gradient_accumulation_steps 2 --learning_rate 2e-5 --num_train_epochs 5.0 --output_dir ./tmp/$TASK_NAME/roberta/
7
+ #python ../run_nli.py --model_type xlnet --model_name_or_path xlnet-base-cased --task_name $TASK_NAME --do_train --do_eval --eval_all_checkpoints --do_lower_case --data_dir $DATA_DIR/$TASK_NAME --max_seq_length 128 --per_gpu_eval_batch_size=16 --per_gpu_train_batch_size=8 --gradient_accumulation_steps 3 --logging_steps 500 --save_steps 500 --eval_all_checkpoints --learning_rate 2e-5 --adam_epsilon 1e-6 --num_train_epochs 5.0 --output_dir ./tmp/$TASK_NAME/xlnet/
8
+ #python ../run_nli.py --model_type bert --model_name_or_path bert-base-uncased --task_name $TASK_NAME --do_train --do_eval --do_lower_case --data_dir $DATA_DIR/$TASK_NAME --max_seq_length 128 --per_gpu_eval_batch_size=16 --per_gpu_train_batch_size=16 --gradient_accumulation_steps 2 --logging_steps 500 --save_steps 500 --eval_all_checkpoints --learning_rate 2e-5 --num_train_epochs 5.0 --output_dir ./tmp/$TASK_NAME/bert-base/
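+ # Only the bert-base and roberta-base runs above are active; the commented-out commands are alternative configurations (BERT-large, a local RoBERTa checkpoint, XLNet, and a second bert-base setting).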
datasets/LogiQA2.0/logiqa2nli/scripts/pnli.sh ADDED
@@ -0,0 +1,21 @@
1
+ export DATA_DIR=../DATA
2
+ export TASK_NAME=PNLI
3
+
4
+ CUDA_VISIBLE_DEVICES=0,1 python ../run_nli.py \
5
+ --model_type bert \
6
+ --model_name_or_path bert-base-uncased \
7
+ --task_name $TASK_NAME \
8
+ --do_train \
9
+ --do_eval \
10
+ --do_lower_case \
11
+ --data_dir $DATA_DIR/$TASK_NAME \
12
+ --max_seq_length 512 \
13
+ --per_gpu_eval_batch_size=8 \
14
+ --per_gpu_train_batch_size=8 \
15
+ --gradient_accumulation_steps 2\
16
+ --learning_rate 1e-5 \
17
+ --num_train_epochs 10.0 \
18
+ --logging_steps 5000 \
19
+ --save_steps 5000 \
20
+ --output_dir ./tmp/$TASK_NAME/ \
21
+ #--overwrite_output_dir \
datasets/LogiQA2.0/logiqa2nli/scripts/qa2nli.sh ADDED
@@ -0,0 +1,21 @@
1
+ export DATA_DIR=../DATA
2
+ export TASK_NAME=QA2NLI
3
+
4
+ CUDA_VISIBLE_DEVICES=0,1 python ../run_nli.py \
5
+ --model_type bert \
6
+ --model_name_or_path bert-base-uncased \
7
+ --task_name $TASK_NAME \
8
+ --do_train \
9
+ --do_eval \
10
+ --do_lower_case \
11
+ --data_dir $DATA_DIR/$TASK_NAME \
12
+ --max_seq_length 128 \
13
+ --per_gpu_eval_batch_size=64 \
14
+ --per_gpu_train_batch_size=64 \
15
+ --gradient_accumulation_steps 2\
16
+ --learning_rate 1e-5 \
17
+ --num_train_epochs 10.0 \
18
+ --logging_steps 5000 \
19
+ --save_steps 5000 \
20
+ --output_dir ./tmp/$TASK_NAME/ \
21
+ #--overwrite_output_dir \
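+ # Effective training batch size: 64 per GPU x 2 GPUs x 2 gradient-accumulation steps = 256 examples per update.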
datasets/LogiQA2.0/logiqa2nli/scripts/qnli.sh ADDED
@@ -0,0 +1,21 @@
1
+ export DATA_DIR=../DATA
2
+ export TASK_NAME=QNLI
3
+
4
+ CUDA_VISIBLE_DEVICES=0,1 python ../run_nli.py \
5
+ --model_type bert \
6
+ --model_name_or_path bert-base-uncased \
7
+ --task_name $TASK_NAME \
8
+ --do_train \
9
+ --do_eval \
10
+ --do_lower_case \
11
+ --data_dir $DATA_DIR/$TASK_NAME \
12
+ --max_seq_length 128 \
13
+ --per_gpu_eval_batch_size=64 \
14
+ --per_gpu_train_batch_size=64 \
15
+ --gradient_accumulation_steps 2\
16
+ --learning_rate 1e-5 \
17
+ --num_train_epochs 10.0 \
18
+ --logging_steps 5000 \
19
+ --save_steps 5000 \
20
+ --output_dir ./tmp/$TASK_NAME/ \
21
+ #--overwrite_output_dir \
datasets/LogiQA2.0/logiqa2nli/scripts/qood.sh ADDED
@@ -0,0 +1,20 @@
1
+ export DATA_DIR=../DATA
2
+ export TASK_NAME=QOOD
3
+
4
+ CUDA_VISIBLE_DEVICES=0,1 python run_nli.py \
5
+ --model_type bert \
6
+ --model_name_or_path bert-base-uncased \
7
+ --task_name $TASK_NAME \
8
+ --do_eval \
9
+ --do_lower_case \
10
+ --data_dir $DATA_DIR/$TASK_NAME \
11
+ --max_seq_length 128 \
12
+ --per_gpu_eval_batch_size=64 \
13
+ --per_gpu_train_batch_size=64 \
14
+ --gradient_accumulation_steps 2\
15
+ --learning_rate 1e-5 \
16
+ --num_train_epochs 10.0 \
17
+ --logging_steps 5000 \
18
+ --save_steps 5000 \
19
+ --output_dir ./tmp/$TASK_NAME/ \
20
+ #--overwrite_output_dir \
datasets/LogiQA2.0/logiqa2nli/scripts/rte.sh ADDED
@@ -0,0 +1,20 @@
1
+ export DATA_DIR=../DATA
2
+ export TASK_NAME=RTE
3
+
4
+ CUDA_VISIBLE_DEVICES=0,1 python ../run_nli.py \
5
+ --model_type bert \
6
+ --model_name_or_path bert-base-uncased \
7
+ --task_name $TASK_NAME \
8
+ --do_eval \
9
+ --do_lower_case \
10
+ --data_dir $DATA_DIR/$TASK_NAME \
11
+ --max_seq_length 256 \
12
+ --per_gpu_eval_batch_size=16 \
13
+ --per_gpu_train_batch_size=16 \
14
+ --gradient_accumulation_steps 2\
15
+ --learning_rate 1e-5 \
16
+ --num_train_epochs 10.0 \
17
+ --logging_steps 5000 \
18
+ --save_steps 5000 \
19
+ --output_dir ./tmp/$TASK_NAME/ \
20
+ #--overwrite_output_dir \
datasets/LogiQA2.0/logiqa2nli/scripts/scitail.sh ADDED
@@ -0,0 +1,22 @@
1
+ export DATA_DIR=../DATA
2
+ export TASK_NAME=SCITAIL
3
+
4
+ CUDA_VISIBLE_DEVICES=0,1 python ../run_nli.py \
5
+ --model_type bert \
6
+ --model_name_or_path bert-base-uncased \
7
+ --task_name $TASK_NAME \
8
+ --do_train \
9
+ --do_eval \
10
+ --do_lower_case \
11
+ --data_dir $DATA_DIR/$TASK_NAME \
12
+ --max_seq_length 128 \
13
+ --per_gpu_eval_batch_size=64 \
14
+ --per_gpu_train_batch_size=64 \
15
+ --gradient_accumulation_steps 2\
16
+ --evaluate_during_training \
17
+ --learning_rate 2e-5 \
18
+ --num_train_epochs 10.0 \
19
+ --logging_steps 5000 \
20
+ --save_steps 5000 \
21
+ --output_dir ./tmp/$TASK_NAME/ \
22
+ --overwrite_output_dir \
datasets/LogiQA2.0/logiqa2nli/scripts/snli.sh ADDED
@@ -0,0 +1,21 @@
1
+ export DATA_DIR=../DATA
2
+ export TASK_NAME=SNLI
3
+
4
+ CUDA_VISIBLE_DEVICES=0,1 python ../run_nli.py \
5
+ --model_type bert \
6
+ --model_name_or_path bert-base-uncased \
7
+ --task_name $TASK_NAME \
8
+ --do_train \
9
+ --do_eval \
10
+ --do_lower_case \
11
+ --data_dir $DATA_DIR/$TASK_NAME \
12
+ --max_seq_length 128 \
13
+ --per_gpu_eval_batch_size=64 \
14
+ --per_gpu_train_batch_size=64 \
15
+ --gradient_accumulation_steps 2\
16
+ --learning_rate 2e-5 \
17
+ --num_train_epochs 2.0 \
18
+ --logging_steps 5000 \
19
+ --save_steps 5000 \
20
+ --output_dir ./tmp/$TASK_NAME/ \
21
+ #--overwrite_output_dir \
datasets/LogiQA2.0/logiqa2nli/scripts/wnli.sh ADDED
@@ -0,0 +1,21 @@
1
+ export DATA_DIR=../DATA
2
+ export TASK_NAME=WNLI
3
+
4
+ CUDA_VISIBLE_DEVICES=0,1 python ../run_nli.py \
5
+ --model_type bert \
6
+ --model_name_or_path bert-base-uncased \
7
+ --task_name $TASK_NAME \
8
+ --do_train \
9
+ --do_eval \
10
+ --do_lower_case \
11
+ --data_dir $DATA_DIR/$TASK_NAME \
12
+ --max_seq_length 128 \
13
+ --per_gpu_eval_batch_size=64 \
14
+ --per_gpu_train_batch_size=64 \
15
+ --gradient_accumulation_steps 2\
16
+ --learning_rate 1e-5 \
17
+ --num_train_epochs 10.0 \
18
+ --logging_steps 5000 \
19
+ --save_steps 5000 \
20
+ --output_dir ./tmp/$TASK_NAME/ \
21
+ #--overwrite_output_dir \
datasets/LogiQA2.0/logiqa2nli/utils_nli.py ADDED
@@ -0,0 +1,1002 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ This Script is Modified for Natural Language Inference Datasets fine-tuning.
18
+ All the datasets can be downloaded from this repo.
19
+ """
20
+
21
+ import logging
22
+ import os
23
+ import sys
24
+ import json
25
+
26
+ from transformers.data.processors.utils import DataProcessor, InputExample, InputFeatures
27
+ from transformers.file_utils import is_tf_available
28
+
29
+ if is_tf_available():
30
+ import tensorflow as tf
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ def convert_examples_to_features(examples, tokenizer,
36
+ max_length=512,
37
+ task=None,
38
+ label_list=None,
39
+ output_mode=None,
40
+ pad_on_left=False,
41
+ pad_token=0,
42
+ pad_token_segment_id=0,
43
+ mask_padding_with_zero=True):
44
+
45
+ is_tf_dataset = False
46
+ if is_tf_available() and isinstance(examples, tf.data.Dataset):
47
+ is_tf_dataset = True
48
+
49
+ if task is not None:
50
+ processor = processors[task]()
51
+ if label_list is None:
52
+ label_list = processor.get_labels()
53
+ logger.info("Using label list %s for task %s" % (label_list, task))
54
+ if output_mode is None:
55
+ output_mode = output_modes[task]
56
+ logger.info("Using output mode %s for task %s" % (output_mode, task))
57
+
58
+ label_map = {label: i for i, label in enumerate(label_list)}
59
+
60
+ features = []
61
+ for (ex_index, example) in enumerate(examples):
62
+ if ex_index % 10000 == 0:
63
+ logger.info("Writing example %d" % (ex_index))
64
+ if is_tf_dataset:
65
+ example = processor.get_example_from_tensor_dict(example)
66
+ example = processor.tfds_map(example)
67
+
68
+ inputs = tokenizer.encode_plus(
69
+ example.text_a,
70
+ example.text_b,
71
+ add_special_tokens=True,
72
+ max_length=max_length,
73
+ )
74
+ input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
75
+
76
+ # The mask has 1 for real tokens and 0 for padding tokens. Only real
77
+ # tokens are attended to.
78
+ attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
79
+
80
+ # Zero-pad up to the sequence length.
81
+ padding_length = max_length - len(input_ids)
82
+ if pad_on_left:
83
+ input_ids = ([pad_token] * padding_length) + input_ids
84
+ attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
85
+ token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
86
+ else:
87
+ input_ids = input_ids + ([pad_token] * padding_length)
88
+ attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
89
+ token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
90
+
91
+ assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
92
+ assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
93
+ assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)
94
+
95
+ if output_mode == "classification":
96
+ label = label_map[example.label]
97
+ elif output_mode == "regression":
98
+ label = float(example.label)
99
+ else:
100
+ raise KeyError(output_mode)
101
+
102
+ if ex_index < 5:
103
+ logger.info("*** Example ***")
104
+ logger.info("guid: %s" % (example.guid))
105
+ logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
106
+ logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
107
+ logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
108
+ logger.info("label: %s (id = %d)" % (example.label, label))
109
+
110
+ features.append(
111
+ InputFeatures(input_ids=input_ids,
112
+ attention_mask=attention_mask,
113
+ token_type_ids=token_type_ids,
114
+ label=label))
115
+
116
+ if is_tf_available() and is_tf_dataset:
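+ # when the input examples came from a tf.data.Dataset, re-wrap the features as a
+ # tf.data.Dataset yielding ({'input_ids', 'attention_mask', 'token_type_ids'}, label) pairs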
117
+ def gen():
118
+ for ex in features:
119
+ yield ({'input_ids': ex.input_ids,
120
+ 'attention_mask': ex.attention_mask,
121
+ 'token_type_ids': ex.token_type_ids},
122
+ ex.label)
123
+
124
+ return tf.data.Dataset.from_generator(gen,
125
+ ({'input_ids': tf.int32,
126
+ 'attention_mask': tf.int32,
127
+ 'token_type_ids': tf.int32},
128
+ tf.int64),
129
+ ({'input_ids': tf.TensorShape([None]),
130
+ 'attention_mask': tf.TensorShape([None]),
131
+ 'token_type_ids': tf.TensorShape([None])},
132
+ tf.TensorShape([])))
133
+
134
+ return features
135
+
136
+
137
+ class SnliProcessor(DataProcessor):
138
+ """Processor for the SNLI dataset (converted)."""
139
+
140
+ def get_example_from_tensor_dict(self, tensor_dict):
141
+ """See base class."""
142
+ return InputExample(tensor_dict['idx'].numpy(),
143
+ tensor_dict['premise'].numpy().decode('utf-8'),
144
+ tensor_dict['hypothesis'].numpy().decode('utf-8'),
145
+ str(tensor_dict['label'].numpy()))
146
+
147
+ def get_train_examples(self, data_dir):
148
+ """See base class."""
149
+ return self._create_examples(
150
+ self._read_txt(os.path.join(data_dir, "train.jsonl")), "train")
151
+
152
+ def get_dev_examples(self, data_dir):
153
+ """See base class."""
154
+ return self._create_examples(
155
+ self._read_txt(os.path.join(data_dir, "dev.jsonl")), "dev")
156
+
157
+ def get_labels(self):
158
+ """See base class."""
159
+ return ["e", "n", "c"]
160
+
161
+ def _read_txt(self, dir):
162
+ with open(dir, "r", encoding="utf-8") as f:
163
+ lines = []
164
+ for line in f.readlines():
165
+ if sys.version_info[0] == 2:
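+ # legacy Python 2 compatibility branch; never taken under Python 3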
166
+ line = list(unicode(cell, 'utf-8') for cell in line)
167
+ lines.append(line)
168
+ return lines
169
+
170
+ def _create_examples(self, lines, set_type):
171
+ """Creates examples for the training and dev sets."""
172
+ examples = []
173
+ for (i, line) in enumerate(lines):
174
+ dict_line = json.loads(line)
175
+ guid = "%s-%s" % (set_type, i)
176
+ label = dict_line['label']
177
+ text_a = dict_line['premise'].strip()
178
+ text_b = dict_line['hypothesis'].strip()
179
+ examples.append(
180
+ InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
181
+ )
182
+ return examples
183
+
184
+
185
+ class MnliProcessor(DataProcessor):
186
+ """Processor for the MultiNLI data set (GLUE version)."""
187
+
188
+ def get_example_from_tensor_dict(self, tensor_dict):
189
+ """See base class."""
190
+ return InputExample(tensor_dict['idx'].numpy(),
191
+ tensor_dict['premise'].numpy().decode('utf-8'),
192
+ tensor_dict['hypothesis'].numpy().decode('utf-8'),
193
+ str(tensor_dict['label'].numpy()))
194
+
195
+ def get_train_examples(self, data_dir):
196
+ """See base class."""
197
+ return self._create_examples(
198
+ self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
199
+
200
+ def get_dev_examples(self, data_dir):
201
+ """See base class."""
202
+ return self._create_examples(
203
+ self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
204
+ "dev_matched")
205
+
206
+ def get_labels(self):
207
+ """See base class."""
208
+ return ["contradiction", "entailment", "neutral"]
209
+
210
+ def _create_examples(self, lines, set_type):
211
+ """Creates examples for the training and dev sets."""
212
+ examples = []
213
+ for (i, line) in enumerate(lines):
214
+ if i == 0:
215
+ continue
216
+ guid = "%s-%s" % (set_type, line[0])
217
+ text_a = line[8]
218
+ text_b = line[9]
219
+ label = line[-1]
220
+ examples.append(
221
+ InputExample(guid=guid, text_a=text_b, text_b=text_a, label=label))
222
+ return examples
223
+
224
+
225
+ class MnliMismatchedProcessor(MnliProcessor):
226
+ """Processor for the MultiNLI Mismatched data set (GLUE version)."""
227
+
228
+ def get_dev_examples(self, data_dir):
229
+ """See base class."""
230
+ return self._create_examples(
231
+ self._read_tsv(os.path.join(data_dir, "short/dev_mismatched.tsv")),
232
+ "dev_matched")
233
+
234
+
235
+ class ColaProcessor(DataProcessor):
236
+ """Processor for the CoLA data set (GLUE version). <Linguistic Acceptability>"""
237
+
238
+ def get_example_from_tensor_dict(self, tensor_dict):
239
+ """See base class."""
240
+ return InputExample(tensor_dict['idx'].numpy(),
241
+ tensor_dict['sentence'].numpy().decode('utf-8'),
242
+ None,
243
+ str(tensor_dict['label'].numpy()))
244
+
245
+ def get_train_examples(self, data_dir):
246
+ """See base class."""
247
+ return self._create_examples(
248
+ self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
249
+
250
+ def get_dev_examples(self, data_dir):
251
+ """See base class."""
252
+ return self._create_examples(
253
+ self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
254
+
255
+ def get_labels(self):
256
+ """See base class."""
257
+ return ["0", "1"]
258
+
259
+ def _create_examples(self, lines, set_type):
260
+ """Creates examples for the training and dev sets."""
261
+ examples = []
262
+ for (i, line) in enumerate(lines):
263
+ guid = "%s-%s" % (set_type, i)
264
+ text_a = line[3]
265
+ label = line[1]
266
+ examples.append(
267
+ InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
268
+ return examples
269
+
270
+ class CoodProcessor(DataProcessor):
271
+ """Processor for the CoLA-ood data set. <Linguistic Acceptability>"""
272
+
273
+ def get_example_from_tensor_dict(self, tensor_dict):
274
+ """See base class."""
275
+ return InputExample(tensor_dict['idx'].numpy(),
276
+ tensor_dict['sentence'].numpy().decode('utf-8'),
277
+ None,
278
+ str(tensor_dict['label'].numpy()))
279
+
280
+ def _read_txt(self, dir):
281
+ with open(dir, "r", encoding="utf-8") as f:
282
+ lines = []
283
+ for line in f.readlines():
284
+ if sys.version_info[0] == 2:
285
+ line = list(unicode(cell, 'utf-8') for cell in line)
286
+ lines.append(line)
287
+ return lines
288
+
289
+ def get_train_examples(self, data_dir):
290
+ """See base class."""
291
+ return self._create_examples(
292
+ self._read_txt(os.path.join(data_dir, "binary_train.txt")), "train")
293
+
294
+ def get_dev_examples(self, data_dir):
295
+ """See base class."""
296
+ return self._create_examples(
297
+ self._read_txt(os.path.join(data_dir, "binary_dev.txt")), "dev")
298
+
299
+ def get_labels(self):
300
+ """See base class."""
301
+ return [0, 1]
302
+
303
+ def _create_examples(self, lines, set_type):
304
+ """Creates examples for the training and dev sets."""
305
+ examples = []
306
+ for (i, line) in enumerate(lines):
307
+ guid = "%s-%s" % (set_type, i)
308
+ dict_line = eval(line)
309
+ print(i)
310
+ text_a = dict_line['text']
311
+ label = dict_line['label']
312
+ examples.append(
313
+ InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
314
+ return examples
315
+
316
+ class Sst2Processor(DataProcessor):
317
+ """Processor for the SST-2 data set (GLUE version). <Sentiment Analysis>"""
318
+
319
+ def get_example_from_tensor_dict(self, tensor_dict):
320
+ """See base class."""
321
+ return InputExample(tensor_dict['idx'].numpy(),
322
+ tensor_dict['sentence'].numpy().decode('utf-8'),
323
+ None,
324
+ str(tensor_dict['label'].numpy()))
325
+
326
+ def get_train_examples(self, data_dir):
327
+ """See base class."""
328
+ return self._create_examples(
329
+ self._read_tsv(os.path.join(data_dir, "short/train.tsv")), "train")
330
+
331
+ def get_dev_examples(self, data_dir):
332
+ """See base class."""
333
+ return self._create_examples(
334
+ self._read_tsv(os.path.join(data_dir, "short/dev.tsv")), "dev")
335
+
336
+ def get_labels(self):
337
+ """See base class."""
338
+ return ["0", "1"]
339
+
340
+ def _create_examples(self, lines, set_type):
341
+ """Creates examples for the training and dev sets."""
342
+ examples = []
343
+ for (i, line) in enumerate(lines):
344
+ if i == 0:
345
+ continue
346
+ guid = "%s-%s" % (set_type, i)
347
+ text_a = line[0]
348
+ label = line[1]
349
+ examples.append(
350
+ InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
351
+ return examples
352
+
353
+
354
+ class StsbProcessor(DataProcessor):
355
+ """Processor for the STS-B data set (GLUE version). <Text Similarity>"""
356
+
357
+ def get_example_from_tensor_dict(self, tensor_dict):
358
+ """See base class."""
359
+ return InputExample(tensor_dict['idx'].numpy(),
360
+ tensor_dict['sentence1'].numpy().decode('utf-8'),
361
+ tensor_dict['sentence2'].numpy().decode('utf-8'),
362
+ str(tensor_dict['label'].numpy()))
363
+
364
+ def get_train_examples(self, data_dir):
365
+ """See base class."""
366
+ return self._create_examples(
367
+ self._read_tsv(os.path.join(data_dir, "short/train.tsv")), "train")
368
+
369
+ def get_dev_examples(self, data_dir):
370
+ """See base class."""
371
+ return self._create_examples(
372
+ self._read_tsv(os.path.join(data_dir, "short/dev.tsv")), "dev")
373
+
374
+ def get_labels(self):
375
+ """See base class."""
376
+ return [None]
377
+
378
+ def _create_examples(self, lines, set_type):
379
+ """Creates examples for the training and dev sets."""
380
+ examples = []
381
+ for (i, line) in enumerate(lines):
382
+ if i == 0:
383
+ continue
384
+ guid = "%s-%s" % (set_type, line[0])
385
+ text_a = line[7]
386
+ text_b = line[8]
387
+ label = line[-1]
388
+ examples.append(
389
+ InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
390
+ return examples
391
+
392
+
393
+ class QqpProcessor(DataProcessor):
394
+ """Processor for the QQP data set (GLUE version). <Paraphrase>"""
395
+
396
+ def get_example_from_tensor_dict(self, tensor_dict):
397
+ """See base class."""
398
+ return InputExample(tensor_dict['idx'].numpy(),
399
+ tensor_dict['question1'].numpy().decode('utf-8'),
400
+ tensor_dict['question2'].numpy().decode('utf-8'),
401
+ str(tensor_dict['label'].numpy()))
402
+
403
+ def get_train_examples(self, data_dir):
404
+ """See base class."""
405
+ return self._create_examples(
406
+ self._read_tsv(os.path.join(data_dir, "short/train.tsv")), "train")
407
+
408
+ def get_dev_examples(self, data_dir):
409
+ """See base class."""
410
+ return self._create_examples(
411
+ self._read_tsv(os.path.join(data_dir, "short/dev.tsv")), "dev")
412
+
413
+ def get_labels(self):
414
+ """See base class."""
415
+ return ["0", "1"]
416
+
417
+ def _create_examples(self, lines, set_type):
418
+ """Creates examples for the training and dev sets."""
419
+ examples = []
420
+ for (i, line) in enumerate(lines):
421
+ if i == 0:
422
+ continue
423
+ guid = "%s-%s" % (set_type, line[0])
424
+ try:
425
+ text_a = line[3]
426
+ text_b = line[4]
427
+ label = line[5]
428
+ except IndexError:
429
+ continue
430
+ examples.append(
431
+ InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
432
+ return examples
433
+
434
+
435
+ class QnliProcessor(DataProcessor):
436
+ """Processor for the QNLI data set (GLUE version). <Question>"""
437
+
438
+ def get_example_from_tensor_dict(self, tensor_dict):
439
+ """See base class."""
440
+ return InputExample(tensor_dict['idx'].numpy(),
441
+ tensor_dict['question'].numpy().decode('utf-8'),
442
+ tensor_dict['sentence'].numpy().decode('utf-8'),
443
+ str(tensor_dict['label'].numpy()))
444
+
445
+ def get_train_examples(self, data_dir):
446
+ """See base class."""
447
+ return self._create_examples(
448
+ self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
449
+
450
+ def get_dev_examples(self, data_dir):
451
+ """See base class."""
452
+ return self._create_examples(
453
+ self._read_tsv(os.path.join(data_dir, "dev.tsv")),
454
+ "dev_matched")
455
+
456
+ def get_labels(self):
457
+ """See base class."""
458
+ return ["entailment", "not_entailment"]
459
+
460
+ def _create_examples(self, lines, set_type):
461
+ """Creates examples for the training and dev sets."""
462
+ examples = []
463
+ for (i, line) in enumerate(lines):
464
+ if i == 0:
465
+ continue
466
+ guid = "%s-%s" % (set_type, line[0])
467
+ text_a = line[1]
468
+ text_b = line[2]
469
+ label = line[-1]
470
+ examples.append(
471
+ InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
472
+ return examples
473
+
474
+
475
+ class RteProcessor(DataProcessor):
476
+ """Processor for the RTE data set (GLUE version)."""
477
+
478
+ def get_example_from_tensor_dict(self, tensor_dict):
479
+ """See base class."""
480
+ return InputExample(tensor_dict['idx'].numpy(),
481
+ tensor_dict['sentence1'].numpy().decode('utf-8'),
482
+ tensor_dict['sentence2'].numpy().decode('utf-8'),
483
+ str(tensor_dict['label'].numpy()))
484
+
485
+ def get_train_examples(self, data_dir):
486
+ """See base class."""
487
+ return self._create_examples(
488
+ self._read_tsv(os.path.join(data_dir, "short/train.tsv")), "train")
489
+
490
+ def get_dev_examples(self, data_dir):
491
+ """See base class."""
492
+ return self._create_examples(
493
+ self._read_tsv(os.path.join(data_dir, "short/dev.tsv")), "dev")
494
+
495
+ def get_labels(self):
496
+ """See base class."""
497
+ return ["entailment", "not_entailment"]
498
+
499
+ def _create_examples(self, lines, set_type):
500
+ """Creates examples for the training and dev sets."""
501
+ examples = []
502
+ for (i, line) in enumerate(lines):
503
+ if i == 0:
504
+ continue
505
+ guid = "%s-%s" % (set_type, line[0])
506
+ text_a = line[1]
507
+ text_b = line[2]
508
+ label = line[-1]
509
+ examples.append(
510
+ InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
511
+ return examples
512
+
513
+
514
+ class WnliProcessor(DataProcessor):
515
+ """Processor for the WNLI data set (GLUE version)."""
516
+
517
+ def get_example_from_tensor_dict(self, tensor_dict):
518
+ """See base class."""
519
+ return InputExample(tensor_dict['idx'].numpy(),
520
+ tensor_dict['sentence1'].numpy().decode('utf-8'),
521
+ tensor_dict['sentence2'].numpy().decode('utf-8'),
522
+ str(tensor_dict['label'].numpy()))
523
+
524
+ def get_train_examples(self, data_dir):
525
+ """See base class."""
526
+ return self._create_examples(
527
+ self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
528
+
529
+ def get_dev_examples(self, data_dir):
530
+ """See base class."""
531
+ return self._create_examples(
532
+ self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
533
+
534
+ def get_labels(self):
535
+ """See base class."""
536
+ return ["0", "1"]
537
+
538
+ def _create_examples(self, lines, set_type):
539
+ """Creates examples for the training and dev sets."""
540
+ examples = []
541
+ for (i, line) in enumerate(lines):
542
+ if i == 0:
543
+ continue
544
+ guid = "%s-%s" % (set_type, line[0])
545
+ text_a = line[1]
546
+ text_b = line[2]
547
+ label = line[-1]
548
+ examples.append(
549
+ InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
550
+ return examples
551
+
552
+ class PnliProcessor(DataProcessor):
553
+ """Processor for the ConTRoL dataset (multi-sentence/paragraph/passage level). """
554
+
555
+ def get_example_from_tensor_dict(self, tensor_dict):
556
+ """See base class."""
557
+ return InputExample(tensor_dict['idx'].numpy(),
558
+ tensor_dict['context'].numpy().decode('utf-8'),
559
+ tensor_dict['hypothesis'].numpy().decode('utf-8'),
+ str(tensor_dict['label'].numpy()))
560
+
561
+ def get_train_examples(self, data_dir):
562
+ """See base class."""
563
+ return self._create_examples(
564
+ self._read_txt(os.path.join(data_dir, "train.jsonl")), "train")
565
+
566
+ def get_dev_examples(self, data_dir):
567
+ """See base class."""
568
+ return self._create_examples(
569
+ self._read_txt(os.path.join(data_dir, "dev.jsonl")), "dev")
570
+
571
+ def get_labels(self):
572
+ """See base class."""
573
+ return ["c", "e", "n"]
574
+
575
+ def _read_txt(self, dir):
576
+ with open(dir, "r", encoding="utf-8") as f:
577
+ lines = []
578
+ for line in f.readlines():
579
+ if sys.version_info[0] == 2:
580
+ line = list(unicode(cell, 'utf-8') for cell in line)
581
+ lines.append(line)
582
+ return lines
583
+
584
+ def _create_examples(self, lines, set_type):
585
+ """Creates examples for the training and dev sets."""
586
+ examples = []
587
+ for (i, line) in enumerate(lines):
588
+ dict_line = json.loads(line)
589
+ guid = "%s-%s" % (set_type, i)
590
+ label = dict_line['label']
591
+ text_a = dict_line['premise'].strip()
592
+ text_b = dict_line['hypothesis'].strip()
593
+ examples.append(
594
+ InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
595
+ )
596
+ return examples
597
+ """Below is the data reader for long/short segmentation of the ConTRoL data"""
598
+ # def get_train_examples(self, data_dir):
599
+ # """See base class."""
600
+ # return self._create_examples(
601
+ # self._read_tsv(os.path.join(data_dir, "short/train.tsv")), "train")
602
+
603
+ # def get_dev_examples(self, data_dir):
604
+ # """See base class."""
605
+ # return self._create_examples(
606
+ # self._read_tsv(os.path.join(data_dir, "short/dev.tsv")), "dev")
607
+
608
+ # def get_labels(self):
609
+ # """See base class."""
610
+ # return ["c", "e", "n"]
611
+ # def _create_examples(self, lines, set_type):
612
+ # """Creates examples for the training and dev sets."""
613
+ # examples = []
614
+ # for (i, line) in enumerate(lines):
615
+ # if i == 0:
616
+ # continue
617
+ # if len(line) == 3:
618
+ # guid = "%s-%s" % (set_type, line[0])
619
+ # text_a = line[0]
620
+ # text_b = line[1]
621
+ # label = line[-1][-1].lower()
622
+
623
+ # examples.append(
624
+ # InputExample(guid=guid, text_a=text_b, text_b=text_a, label=label))
625
+ # return examples
626
+
627
+ class Qa2nliProcessor(DataProcessor):
628
+ """Processor for the logiqa2nli data set."""
629
+
630
+ def get_example_from_tensor_dict(self, tensor_dict):
631
+ """See base class."""
632
+ return InputExample(tensor_dict['idx'].numpy(),
633
+ tensor_dict['premise_par_new'].numpy().decode('utf-8'),
634
+ tensor_dict['hypothesis'].numpy().decode('utf-8'),
635
+ str(tensor_dict['label'].numpy()))
636
+
637
+ def get_train_examples(self, data_dir):
638
+ """See base class."""
639
+ return self._create_examples(
640
+ self._read_txt(os.path.join(data_dir, "train.txt")), "train")
641
+
642
+ def get_dev_examples(self, data_dir):
643
+ """See base class."""
644
+ return self._create_examples(
645
+ self._read_txt(os.path.join(data_dir, "dev.txt")), "dev")
646
+
647
+ def get_labels(self):
648
+ """See base class."""
649
+ return ['entailed', 'not entailed']
650
+
651
+ def _read_txt(self, dir):
652
+ with open(dir, "r", encoding="utf-8") as f:
653
+ lines = []
654
+ for line in f.readlines():
655
+ if sys.version_info[0] == 2:
656
+ line = list(unicode(cell, 'utf-8') for cell in line)
657
+ lines.append(line)
658
+ return lines
659
+
660
+ def _create_examples(self, lines, set_type):
661
+ """Creates examples for the training and dev sets."""
662
+ examples = []
663
+ for (i, line) in enumerate(lines):
664
+ dict_line = json.loads(line)
665
+ guid = "%s-%s" % (set_type, i)
666
+ label = dict_line['label']
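+ # 'major_premise' and 'minor_premise' appear to hold lists of premise sentences; concatenate them into one premise string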
667
+ text_a = "".join(_ for _ in dict_line['major_premise']) + " " + "".join(_ for _ in dict_line['minor_premise'])
668
+ text_a = text_a.strip()
669
+ text_b = dict_line['conclusion'].strip()
670
+ examples.append(
671
+ InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
672
+ )
673
+ return examples
674
+
675
+ class SciProcessor(DataProcessor):
676
+ """Processor for the SciTail data set."""
677
+
678
+ def get_example_from_tensor_dict(self, tensor_dict):
679
+ """See base class."""
680
+ return InputExample(tensor_dict['idx'].numpy(),
681
+ tensor_dict['premise'].numpy().decode('utf-8'),
682
+ tensor_dict['hypothesis'].numpy().decode('utf-8'),
683
+ str(tensor_dict['label'].numpy()))
684
+
685
+ def get_train_examples(self, data_dir):
686
+ """See base class."""
687
+ return self._create_examples(
688
+ self._read_txt(os.path.join(data_dir, "snli_format/train.txt")), "train")
689
+
690
+ def get_dev_examples(self, data_dir):
691
+ """See base class."""
692
+ return self._create_examples(
693
+ self._read_txt(os.path.join(data_dir, "snli_format/dev.txt")), "dev")
694
+
695
+ def get_labels(self):
696
+ """See base class."""
697
+ return ["entailment", "neutral"]
698
+
699
+ def _read_txt(self, dir):
700
+ with open(dir, "r", encoding="utf-8") as f:
701
+ lines = []
702
+ for line in f.readlines():
703
+ if sys.version_info[0] == 2:
704
+ line = list(unicode(cell, 'utf-8') for cell in line)
705
+ lines.append(line)
706
+ return lines
707
+
708
+ def _create_examples(self, lines, set_type):
709
+ """Creates examples for the training and dev sets."""
710
+ examples = []
711
+ for (i, line) in enumerate(lines):
712
+ dict_line = json.loads(line)
713
+ guid = "%s-%s" % (set_type, i)
714
+ label = dict_line['gold_label']
715
+ text_a = dict_line['sentence1'].strip()
716
+ text_b = dict_line['sentence2'].strip()
717
+ examples.append(
718
+ InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
719
+ )
720
+ return examples
721
+
722
+
723
+ class AnliProcessor(DataProcessor):
724
+ """Processor for the ANLI data set."""
725
+
726
+ def get_example_from_tensor_dict(self, tensor_dict):
727
+ """See base class."""
728
+ return InputExample(tensor_dict['idx'].numpy(),
729
+ tensor_dict['premise'].numpy().decode('utf-8'),
730
+ tensor_dict['hypothesis'].numpy().decode('utf-8'),
731
+ str(tensor_dict['label'].numpy()))
732
+
733
+ def get_train_examples(self, data_dir):
734
+ """See base class."""
735
+ return self._create_examples(
736
+ self._read_txt(os.path.join(data_dir, "r3/train.jsonl")), "train")
737
+
738
+ def get_dev_examples(self, data_dir):
739
+ """See base class."""
740
+ return self._create_examples(
741
+ self._read_txt(os.path.join(data_dir, "r3/dev.jsonl")), "dev")
742
+
743
+ def get_labels(self):
744
+ """See base class."""
745
+ return ["e", "n", "c"]
746
+
747
+ def _read_txt(self, dir):
748
+ with open(dir, "r", encoding="utf-8") as f:
749
+ lines = []
750
+ for line in f.readlines():
751
+ if sys.version_info[0] == 2:
752
+ line = list(unicode(cell, 'utf-8') for cell in line)
753
+ lines.append(line)
754
+ return lines
755
+
756
+ def _create_examples(self, lines, set_type):
757
+ """Creates examples for the training and dev sets."""
758
+ examples = []
759
+ for (i, line) in enumerate(lines):
760
+ dict_line = json.loads(line)
761
+ guid = "%s-%s" % (set_type, i)
762
+ label = dict_line['label']
763
+ text_a = dict_line['premise'].strip()
764
+ text_b = dict_line['hypothesis'].strip()
765
+ examples.append(
766
+ InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
767
+ )
768
+ return examples
769
+
770
+
771
+ class QoodProcessor(DataProcessor):
772
+ """Processor for the QNLI-ood data set."""
773
+
774
+ def get_example_from_tensor_dict(self, tensor_dict):
775
+ """See base class."""
776
+ return InputExample(tensor_dict['idx'].numpy(),
777
+ tensor_dict['premise'].numpy().decode('utf-8'),
778
+ tensor_dict['hypothesis'].numpy().decode('utf-8'),
779
+ str(tensor_dict['label'].numpy()))
780
+
781
+ def get_train_examples(self, data_dir):
782
+ """See base class."""
783
+ return self._create_examples(
784
+ self._read_txt(os.path.join(data_dir, "train.txt")), "train")
785
+
786
+ def get_dev_examples(self, data_dir):
787
+ """See base class."""
788
+ return self._create_examples(
789
+ self._read_txt(os.path.join(data_dir, "dev.txt")), "dev")
790
+
791
+ def get_labels(self):
792
+ """See base class."""
793
+ return ["entailment", "not_entailment"]
794
+
795
+ def _read_txt(self, dir):
796
+ with open(dir, "r", encoding="utf-8") as f:
797
+ lines = []
798
+ for line in f.readlines():
799
+ if sys.version_info[0] == 2:
800
+ line = list(unicode(cell, 'utf-8') for cell in line)
801
+ lines.append(line)
802
+ return lines
803
+
804
+ def _create_examples(self, lines, set_type):
805
+ """Creates examples for the training and dev sets."""
806
+ examples = []
807
+ for (i, line) in enumerate(lines):
808
+ dict_line = json.loads(line)
809
+ guid = "%s-%s" % (set_type, i)
810
+ label = dict_line['label']
811
+ text_a = dict_line['question'].strip()
812
+ text_b = dict_line['sentence'].strip()
813
+ examples.append(
814
+ InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)
815
+ )
816
+ return examples
817
+
818
+ class MrpcProcessor(DataProcessor):
819
+ """Processor for the MRPC data set (GLUE version). <Paraphrase>"""
820
+
821
+ def get_example_from_tensor_dict(self, tensor_dict):
822
+ """See base class."""
823
+ return InputExample(tensor_dict['idx'].numpy(),
824
+ tensor_dict['sentence1'].numpy().decode('utf-8'),
825
+ tensor_dict['sentence2'].numpy().decode('utf-8'),
826
+ str(tensor_dict['label'].numpy()))
827
+
828
+ def get_train_examples(self, data_dir):
829
+ """See base class."""
830
+ logger.info("LOOKING AT {}".format(os.path.join(data_dir, "short/train.tsv")))
831
+ return self._create_examples(
832
+ self._read_tsv(os.path.join(data_dir, "short/train.tsv")), "train")
833
+
834
+ def get_dev_examples(self, data_dir):
835
+ """See base class."""
836
+ return self._create_examples(
837
+ self._read_tsv(os.path.join(data_dir, "short/dev.tsv")), "dev")
838
+
839
+ def get_labels(self):
840
+ """See base class."""
841
+ return ["0", "1"]
842
+
843
+ def _create_examples(self, lines, set_type):
844
+ """Creates examples for the training and dev sets."""
845
+ examples = []
846
+ for (i, line) in enumerate(lines):
847
+ if i == 0:
848
+ continue
849
+ guid = "%s-%s" % (set_type, i)
850
+ text_a = line[3]
851
+ text_b = line[4]
852
+ label = line[0]
853
+ examples.append(
854
+ InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
855
+ return examples
856
+
857
+
858
+
859
+ try:
860
+ from scipy.stats import pearsonr, spearmanr
861
+ from sklearn.metrics import matthews_corrcoef, f1_score, confusion_matrix
862
+
863
+ _has_sklearn = True
864
+ except (AttributeError, ImportError):
865
+ _has_sklearn = False
866
+
867
+
868
+ def is_sklearn_available():
869
+ return _has_sklearn
870
+
871
+
872
+ #if _has_sklearn:
873
+
874
+ def simple_accuracy(preds, labels):
875
+ return (preds == labels).mean()
876
+
877
+ def acc_and_f1(preds, labels):
878
+ acc = simple_accuracy(preds, labels)
879
+ f1 = f1_score(y_true=labels, y_pred=preds)
880
+ return {
881
+ "acc": acc,
882
+ "f1": f1,
883
+ "acc_and_f1": (acc + f1) / 2,
884
+ }
885
+
886
+ def pearson_and_spearman(preds, labels):
887
+ pearson_corr = pearsonr(preds, labels)[0]
888
+ spearman_corr = spearmanr(preds, labels)[0]
889
+ return {
890
+ "pearson": pearson_corr,
891
+ "spearmanr": spearman_corr,
892
+ "corr": (pearson_corr + spearman_corr) / 2,
893
+ }
894
+
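+ # dispatch each task name to its evaluation metrics: accuracy for most NLI-style tasks,
+ # MCC for CoLA, and confusion matrix / MCC / F1 for the cood and qa2nli tasks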
895
+ def compute_metrics(task_name, preds, labels):
896
+ assert len(preds) == len(labels)
897
+ if task_name == "cola":
898
+ return {"mcc": matthews_corrcoef(labels, preds)}
899
+ elif task_name == "cood":
900
+ return {"confusion matrix": confusion_matrix(preds, labels), "mcc": matthews_corrcoef(labels, preds), "f1 score": acc_and_f1(preds, labels)}
901
+ elif task_name == "sst-2":
902
+ return {"acc": simple_accuracy(preds, labels)}
903
+ elif task_name == "mrpc":
904
+ return acc_and_f1(preds, labels)
905
+ elif task_name == "sts-b":
906
+ return pearson_and_spearman(preds, labels)
907
+ elif task_name == "qqp":
908
+ return acc_and_f1(preds, labels)
909
+ elif task_name == "mnli":
910
+ return {"acc": simple_accuracy(preds, labels)}
911
+ elif task_name == "mnli-mm":
912
+ return {"acc": simple_accuracy(preds, labels)}
913
+ elif task_name == "qnli":
914
+ return {"acc": simple_accuracy(preds, labels)}
915
+ elif task_name == "rte":
916
+ return {"acc": simple_accuracy(preds, labels)}
917
+ elif task_name == "wnli":
918
+ return {"acc": simple_accuracy(preds, labels)}
919
+ elif task_name == "hans":
920
+ return {"acc": simple_accuracy(preds, labels)}
921
+ elif task_name == "scitail":
922
+ return {"acc": simple_accuracy(preds, labels)}
923
+ elif task_name == "snli":
924
+ return {"acc": simple_accuracy(preds, labels)}
925
+ elif task_name == "qa2nli":
926
+ return {"confusion matrix": confusion_matrix(preds, labels), "mcc": matthews_corrcoef(labels, preds), "f1 score": acc_and_f1(preds, labels)}
927
+ elif task_name == "anli":
928
+ return {"acc": simple_accuracy(preds, labels)}
929
+ elif task_name == "pnli":
930
+ return {"acc": simple_accuracy(preds, labels)}
931
+ elif task_name == "qood":
932
+ return {"acc": simple_accuracy(preds, labels)}
933
+ else:
934
+ raise KeyError(task_name)
935
+
936
+ def xnli_compute_metrics(task_name, preds, labels):
937
+ assert len(preds) == len(labels)
938
+ if task_name == "xnli":
939
+ return {"acc": simple_accuracy(preds, labels)}
940
+ else:
941
+ raise KeyError(task_name)
942
+
943
+
944
+
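+ # number of output classes per task; sts-b is a regression task, hence a single output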
945
+ tasks_num_labels = {
946
+ "pnli": 3,
947
+ "cola": 2,
948
+ "cood": 2,
949
+ "snli": 3,
950
+ "mnli": 3,
951
+ "mrpc": 2,
952
+ "sst-2": 2,
953
+ "sts-b": 1,
954
+ "qqp": 2,
955
+ "qnli": 2,
956
+ "rte": 2,
957
+ "wnli": 2,
958
+ "qa2nli": 2,
959
+ "scitail": 2,
960
+ "anli": 3,
961
+ "qood": 2,
962
+ }
963
+
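+ # registry mapping task names to their DataProcessor classes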
964
+ processors = {
965
+ "cola": ColaProcessor,
966
+ "cood": CoodProcessor,
967
+ "snli": SnliProcessor,
968
+ "mnli": MnliProcessor,
969
+ "mnli-mm": MnliMismatchedProcessor,
970
+ "mrpc": MrpcProcessor,
971
+ "sst-2": Sst2Processor,
972
+ "sts-b": StsbProcessor,
973
+ "qqp": QqpProcessor,
974
+ "qnli": QnliProcessor,
975
+ "rte": RteProcessor,
976
+ "wnli": WnliProcessor,
977
+ "pnli": PnliProcessor,
978
+ "qa2nli": Qa2nliProcessor,
979
+ "scitail": SciProcessor,
980
+ "anli": AnliProcessor,
981
+ "qood": QoodProcessor,
982
+ }
983
+
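+ # whether each task is treated as classification or regression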
984
+ output_modes = {
985
+ "cola": "classification",
986
+ "cood": "classification",
987
+ "mnli": "classification",
988
+ "mnli-mm": "classification",
989
+ "mrpc": "classification",
990
+ "sst-2": "classification",
991
+ "sts-b": "regression",
992
+ "qqp": "classification",
993
+ "qnli": "classification",
994
+ "rte": "classification",
995
+ "wnli": "classification",
996
+ "pnli": "classification",
997
+ "qa2nli": "classification",
998
+ "scitail": "classification",
999
+ "snli": "classification",
1000
+ "anli": "classification",
1001
+ "qood": "classification",
1002
+ }
datasets/LogiQA2.0/requirements.yml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: logiqa
2
+ dependencies:
3
+ - nvidia::cudatoolkit=10.2.89
4
+ - numpy
5
+ - pillow
6
+ - pip
7
+ - python=3.6
8
+ - pytorch::pytorch=1.10.2=py3.6_cuda11.7_cudnn8.0.5_0
9
+ - scipy
10
+ - tqdm
11
+ - scikit-learn
12
+ - tensorboard
13
+ - tensorboardX
14
+ - pip:
15
+ - transformers==2.4.1
16
+ - nltk
17
+ - wandb
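+ # one way to create this environment (assumed usage, not stated in the repo): conda env create -f requirements.yml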