alignscore-safetensor / alignscore /generate_training_data.py
PANH's picture
Upload 15 files
ffca110 verified
raw
history blame contribute delete
No virus
60.3 kB
from logging import error
from datasets import load_dataset
import transformers
from random import sample
import random
import torch
import json
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu
import pandas as pd
import re
'''
data format
{text_a, text_b, label:None or 0_1, }
'''
# Maps a short dataset key to the arguments used to fetch it from the
# Hugging Face hub. DataGenerator.load_dataset_from_huggingface passes every
# element except the last positionally to `load_dataset` and uses the last
# element to select the split from the returned object.
DATASET_HUGGINGFACE = {
'cnndm': ['cnn_dailymail', '3.0.0', 'train'],
'mnli': ['multi_nli', 'default', 'train'],
'squad': ['squad', 'plain_text', 'train'],
'squad_v2': ['squad_v2', 'squad_v2', 'train'],
'paws': ['paws', 'labeled_final', 'train'],
'vitaminc': ['tals/vitaminc', 'v1.0', 'train'],
'xsum': ['xsum', 'default', 'train'],
'stsb': ['glue', 'stsb', 'train'],
'sick': ['sick', 'default', 'train'],
'race': ['race', 'all', 'train'],
'race_val': ['race', 'all', 'validation'],
'anli_r1': ['anli', 'plain_text', 'train_r1'],
'anli_r2': ['anli', 'plain_text', 'train_r2'],
'anli_r3': ['anli', 'plain_text', 'train_r3'],
'snli': ['snli', 'plain_text', 'train'],
'wikihow': ['wikihow', 'all', 'train'],
'mrpc': ['glue', 'mrpc', 'train'],
'msmarco': ['ms_marco', 'v2.1', 'train'],
'mrpc_val': ['glue', 'mrpc', 'validation'],
'paws_val': ['paws', 'labeled_final', 'validation'],
'paws_unlabeled': ['paws', 'unlabeled_final', 'train'],
'qqp': ['glue', 'qqp', 'train'],
'qqp_val': ['glue', 'qqp', 'validation'],
'squad_v2_new': ['squad_v2', 'squad_v2', 'train'],
'adversarial_qa': ['adversarial_qa', 'adversarialQA', 'train'],
# The entries below carry no configuration name: just the dataset id and the split.
'drop': ['drop', 'train'],
'duorc_self': ['duorc', 'SelfRC', 'train'],
'duorc_paraphrase': ['duorc', 'ParaphraseRC', 'train'],
'quoref': ['quoref', 'train'],
'hotpot_qa_distractor': ['hotpot_qa', 'distractor', 'train'],
'hotpot_qa_fullwiki': ['hotpot_qa', 'fullwiki', 'train'],
'ropes': ['ropes', 'train'],
'boolq': ['boolq', 'train'],
'eraser_multi_rc': ['eraser_multi_rc', 'train'],
'quail': ['quail', 'train'],
'sciq': ['sciq', 'train'],
'strategy_qa': ['metaeval/strategy-qa', 'train'],
'gap': ['gap', 'train'],
}
# Per-dataset settings, keyed by the same short names as DATASET_HUGGINGFACE.
#   'task'                      - task family tag for the dataset.
#   'text_a' / 'text_b' / 'label' - field names the process_* methods read from
#                                 each raw example ('text_b' is a list of field
#                                 names for QA-style datasets).
#   'huggingface', 'using_hf_api', 'using_pandas', 'using_json'
#                               - loader selection flags, checked in this order
#                                 by DataGenerator.load_dataset_from_huggingface.
#   'raw_json'                  - with 'using_json': keep the parsed JSON object as is.
#   'data_path' / 'data_dir'    - local file or directory used by the non-hub loaders.
# NOTE(review): 'cliff_path' (xsum) is not read by any code visible in this
# part of the file - confirm whether it is still used elsewhere.
DATASET_CONFIG = {
'cnndm': {'task': 'summarization', 'text_a': 'article', 'text_b': 'highlights', 'label': None, 'huggingface': True},
'mnli': {'task': 'nli', 'text_a': 'premise', 'text_b': 'hypothesis', 'label': 'label', 'huggingface': True},
'nli_fever': {'task': 'fact_checking', 'text_a': 'context', 'text_b': 'query', 'label': 'label','huggingface': False, 'using_hf_api': False, 'using_pandas': False, 'using_json':True, 'data_path':'data/nli_fever/train_fitems.jsonl' },
'doc_nli': {'task': 'bin_nli', 'text_a': 'premise', 'text_b': 'hypothesis', 'label': 'label','huggingface': False, 'using_hf_api': False, 'using_pandas': False, 'using_json':True, 'data_path':'data/DocNLI_dataset/train.json' },
'squad': {'task': 'extractive_qa', 'text_a': 'context', 'text_b': ['question', 'answers'], 'label': None, 'huggingface': True},
'squad_v2': {'task': 'qa', 'text_a': 'context', 'text_b': ['question', 'answers'], 'label': None, 'huggingface': True},
'paws': {'task': 'paraphrase', 'text_a': 'sentence1', 'text_b': 'sentence2', 'label': 'label', 'huggingface': True},
'vitaminc': {'task': 'fact_checking', 'text_a': 'evidence', 'text_b': 'claim', 'label': 'label', 'huggingface': True},
'xsum': {'task': 'summarization', 'text_a': 'document', 'text_b': 'summary', 'label': None, 'huggingface': True, 'cliff_path': 'data/model_generated_data/cliff_summ/xsum_train.jsonl'},
'stsb': {'task': 'sts', 'text_a': 'sentence1', 'text_b': 'sentence2', 'label': 'label', 'huggingface': True},
'sick': {'task': 'sts', 'text_a': 'sentence_A', 'text_b': 'sentence_B', 'label': 'relatedness_score', 'huggingface': True},
'race': {'task': 'qa', 'text_a': 'article', 'text_b': ['question', 'options'], 'label': 'answer', 'huggingface': True},
'race_val': {'task': 'qa', 'text_a': 'article', 'text_b': ['question', 'options'], 'label': 'answer', 'huggingface': True},
'anli_r1': {'task': 'nli', 'text_a': 'premise', 'text_b': 'hypothesis', 'label': 'label', 'huggingface': True},
'anli_r2': {'task': 'nli', 'text_a': 'premise', 'text_b': 'hypothesis', 'label': 'label', 'huggingface': True},
'anli_r3': {'task': 'nli', 'text_a': 'premise', 'text_b': 'hypothesis', 'label': 'label', 'huggingface': True},
'snli': {'task': 'nli', 'text_a': 'premise', 'text_b': 'hypothesis', 'label': 'label', 'huggingface': True},
'wikihow': {'task': 'summarization', 'text_a': 'text', 'text_b': 'headline', 'label': None, 'huggingface': False, 'using_hf_api': True, 'data_dir': 'data/wikihow_raw'},
'mrpc': {'task': 'paraphrase', 'text_a': 'sentence1', 'text_b': 'sentence2', 'label': 'label','huggingface': True},
'mrpc_val': {'task': 'paraphrase', 'text_a': 'sentence1', 'text_b': 'sentence2', 'label': 'label','huggingface': True},
'paws_val': {'task': 'paraphrase', 'text_a': 'sentence1', 'text_b': 'sentence2', 'label': 'label', 'huggingface': True},
'paws_unlabeled': {'task': 'paraphrase', 'text_a': 'sentence1', 'text_b': 'sentence2', 'label': 'label', 'huggingface': True},
'msmarco': {'task': 'ir', 'text_a': 'passages', 'text_b': ['query', 'answers'], 'label': None,'huggingface': True},
'paws_qqp': {'task': 'paraphrase', 'text_a': 'sentence1', 'text_b': 'sentence2', 'label': None,'huggingface': False, 'using_hf_api': False, 'using_pandas': True, 'data_path':'paws_qqp/output/train.tsv' },
'wiki103': {'task': 'paraphrase', 'text_a': 'original_sent', 'text_b': 'paraphrase', 'label': None,'huggingface': False, 'using_hf_api': False, 'using_pandas': False, 'using_json': True, 'data_path':'data/model_generated_data/backtranslation/wiki103_single_sent_backtranslation.json'},
'qqp': {'task': 'paraphrase', 'text_a':'question1', 'text_b':'question2', 'label': 'label', 'huggingface': True},
'qqp_val': {'task': 'paraphrase', 'text_a':'question1', 'text_b':'question2', 'label': 'label', 'huggingface': True},
'wmt17xxx': {'task': 'wmt', 'text_a': 'ref', 'text_b': 'mt', 'label': 'score','huggingface': False, 'using_hf_api': False, 'using_pandas': True, 'data_path':'data/wmt/wmt17/2017-da.csv' },
'wmt15': {'task': 'wmt', 'text_a': 'ref', 'text_b': 'mt', 'label': 'score','huggingface': False, 'using_hf_api': False, 'using_pandas': False, 'using_json':True, 'data_path':'data/eval/wmt15_eval.jsonl' },
'wmt16': {'task': 'wmt', 'text_a': 'ref', 'text_b': 'mt', 'label': 'score','huggingface': False, 'using_hf_api': False, 'using_pandas': False, 'using_json':True, 'data_path':'data/eval/wmt16_eval.jsonl' },
'wmt17': {'task': 'wmt', 'text_a': 'ref', 'text_b': 'mt', 'label': 'score','huggingface': False, 'using_hf_api': False, 'using_pandas': False, 'using_json':True, 'data_path':'data/eval/wmt17_eval.jsonl' },
'wmt18': {'task': 'wmt', 'text_a': 'ref', 'text_b': 'mt', 'label': 'score','huggingface': False, 'using_hf_api': False, 'using_pandas': False, 'using_json':True, 'data_path':'data/eval/wmt18_eval.jsonl' },
'wmt19': {'task': 'wmt', 'text_a': 'ref', 'text_b': 'mt', 'label': 'score','huggingface': False, 'using_hf_api': False, 'using_pandas': False, 'using_json':True, 'data_path':'data/eval/wmt19_eval.jsonl' },
# The QA / coreference entries below declare no text_a/text_b/label field names.
'squad_v2_new': {'task': 'qa', 'huggingface': True},
'adversarial_qa': {'task': 'qa', 'huggingface': True},
'drop': {'task': 'qa', 'huggingface': True},
'duorc_self': {'task': 'qa', 'huggingface': True},
'duorc_paraphrase': {'task': 'qa', 'huggingface': True},
'quoref': {'task': 'qa', 'huggingface': True},
'hotpot_qa_distractor': {'task': 'qa', 'huggingface': True},
'hotpot_qa_fullwiki': {'task': 'qa', 'huggingface': True},
'newsqa': {'task': 'qa', 'using_json': True, 'raw_json': True, 'data_path': 'data/newsqa_raw/combined-newsqa-data-v1.json'},
'ropes': {'task': 'qa', 'huggingface': True},
'boolq': {'task': 'qa', 'huggingface': True},
'eraser_multi_rc': {'task': 'qa', 'huggingface': True},
'quail': {'task': 'qa', 'huggingface': True},
'sciq': {'task': 'qa', 'huggingface': True},
'strategy_qa': {'task': 'qa', 'huggingface': True},
'gap': {'task': 'coreference', 'huggingface': True},
}
class QA2D():
    """Turn (question, answer) pairs into declarative sentences.

    Wraps the ``MarkS/bart-base-qa2d`` BART checkpoint and runs it over the
    input in batches of ``batch_size`` on ``device``.
    """

    def __init__(self, batch_size=32, device='cuda', verbose=True) -> None:
        """Load tokenizer and model; ``verbose`` toggles the progress bar."""
        from transformers import BartTokenizer, BartForConditionalGeneration
        self.tokenizer = BartTokenizer.from_pretrained("MarkS/bart-base-qa2d")
        self.model = BartForConditionalGeneration.from_pretrained("MarkS/bart-base-qa2d").to(device)
        self.batch_size = batch_size
        self.device = device
        self.verbose = verbose

    def generate(self, questions: list, answers: list):
        """Return one generated sentence per (question, answer) pair, in input order.

        Raises AssertionError when the two lists differ in length.
        """
        assert len(questions) == len(answers)
        qa_list = [f"question: {q} answer: {a}" for q, a in zip(questions, answers)]
        # Ceiling division: a trailing partial batch is still one batch, so the
        # progress bar total matches the number of chunks actually produced.
        num_batches = (len(qa_list) + self.batch_size - 1) // self.batch_size
        output = []
        for qa_pairs in tqdm(
            self.chunks(qa_list, self.batch_size),
            desc="QA to Declarative",
            total=num_batches,
            disable=(not self.verbose),
        ):
            input_token = self.tokenizer(
                qa_pairs, return_tensors='pt', padding=True, truncation=True).to(self.device)
            # The batch is padded, so pass the tokenizer's attention mask
            # explicitly instead of leaving it to be inferred from pad ids.
            dec_sents = self.model.generate(
                input_token.input_ids,
                attention_mask=input_token.attention_mask,
                max_length=512)
            output.extend(self.tokenizer.batch_decode(
                dec_sents, skip_special_tokens=True))
        return output

    def chunks(self, lst, n):
        """Yield successive n-sized chunks from lst."""
        for i in range(0, len(lst), n):
            yield lst[i:i + n]
class QAnswering():
    """
    To answer not-answerable questions

    Wraps the ``valhalla/t5-base-qa-qg-hl`` T5 checkpoint and generates an
    answer string for every (question, context) pair, in batches.
    """

    def __init__(self, batch_size=32, device='cuda') -> None:
        """Load tokenizer and model and move the model to ``device``."""
        from transformers import T5Tokenizer, T5ForConditionalGeneration
        self.tokenizer = T5Tokenizer.from_pretrained(
            "valhalla/t5-base-qa-qg-hl")
        self.model = T5ForConditionalGeneration.from_pretrained(
            "valhalla/t5-base-qa-qg-hl").to(device)
        self.batch_size = batch_size
        self.device = device

    def generate(self, questions: list, contexts: list):
        """Return one generated answer per (question, context) pair, in input order.

        Raises AssertionError when the two lists differ in length.
        """
        assert len(questions) == len(contexts)
        # Ceiling division so a trailing partial batch is counted in the total.
        num_batches = (len(questions) + self.batch_size - 1) // self.batch_size
        # Both lists have equal length, so chunking them in lockstep yields
        # equally sized question/context batches.
        batches = zip(self.chunks(questions, self.batch_size),
                      self.chunks(contexts, self.batch_size))
        answers = []
        for qs, cs in tqdm(batches, desc="Generating Answers for not answerable", total=num_batches):
            qc_pairs = [f"""question: {one_q} context: {one_c}"""
                        for one_q, one_c in zip(qs, cs)]
            encoded = self.tokenizer(
                qc_pairs, padding=True, truncation=True, return_tensors='pt').to(self.device)
            # Padded batch: hand the attention mask to generate() explicitly.
            outputs = self.model.generate(
                encoded.input_ids,
                attention_mask=encoded.attention_mask,
                max_length=512)
            answers.extend(self.tokenizer.batch_decode(
                outputs, skip_special_tokens=True))
        return answers

    def chunks(self, lst, n):
        """Yield successive n-sized chunks from lst."""
        for i in range(0, len(lst), n):
            yield lst[i:i + n]
class MLMGeneratorWithPairedData():
    """Create noised variants of sentences by masking tokens and refilling them.

    A random ``mask_percent`` share of each sentence's tokens is replaced by
    the mask token, and ``distilbert-base-uncased`` predicts a replacement
    for every masked slot.
    """

    def __init__(self, corpra: list, device='cuda', batch_size=8, mask_percent=0.25) -> None:
        """Load tokenizer/model and remember the sentences to be noised."""
        self.device = device
        self.tokenizer = transformers.DistilBertTokenizer.from_pretrained(
            "distilbert-base-uncased")
        self.model = transformers.DistilBertForMaskedLM.from_pretrained(
            "distilbert-base-uncased").to(self.device)
        self.mask_percent = mask_percent
        self.batch_size = batch_size
        self.dataset = corpra  # text needs to be noised

    def chunks(self, lst, n):
        """Yield successive n-sized chunks from lst."""
        for i in range(0, len(lst), n):
            yield lst[i:i + n]

    def generate(self):
        """Return the noised version of every sentence in ``self.dataset``, in order."""
        # Ceiling division so a trailing partial batch is counted in the total.
        num_batches = (len(self.dataset) + self.batch_size - 1) // self.batch_size
        sents_output = []
        for examples in tqdm(self.chunks(self.dataset, self.batch_size), total=num_batches, desc="MLM Generating"):
            sents_output.extend(self.mlm_infiller(list(examples)))
        return sents_output

    def mlm_infiller(self, batch):
        """
        input a batch of sentences, list

        Returns a list with one decoded, infilled sentence per input sentence.
        """
        mask_token = self.tokenizer.mask_token
        mask_token_id = self.tokenizer.mask_token_id

        masked_batch = []
        masked_batch_ids = []
        for each_sent in batch:
            sent_tokens = self.tokenizer.tokenize(each_sent)
            sent_token_ids = self.tokenizer(each_sent)['input_ids']
            # A set gives O(1) membership tests in the two passes below.
            mask_positions = set(sample(list(range(len(sent_tokens))), int(
                self.mask_percent * len(sent_tokens))))
            masked_batch.append(' '.join(
                mask_token if i in mask_positions else token
                for i, token in enumerate(sent_tokens)))
            # The `i - 1` shift maps token positions onto `input_ids`, which
            # presumably start with one special token - TODO confirm.
            masked_batch_ids.append([
                mask_token_id if i - 1 in mask_positions else token_id
                for i, token_id in enumerate(sent_token_ids)])

        inputs = self.tokenizer(
            masked_batch, padding=True, truncation=True, return_tensors="pt").to(self.device)
        with torch.no_grad():
            logits = self.model(**inputs).logits
        is_mask = inputs.input_ids == mask_token_id

        infilled_sent = []
        for row, masked_sent_ids in enumerate(masked_batch_ids):
            mask_token_index = is_mask[row].nonzero(as_tuple=True)[0]
            # Plain ints keep the id list homogeneous before decoding.
            predictions = iter(
                logits[row, mask_token_index].argmax(dim=-1).tolist())
            # One left-to-right pass: the k-th masked slot receives the k-th
            # prediction. Slots without a prediction (e.g. masks cut off by
            # truncation) keep the mask id, which decode() then skips.
            for pos, token_id in enumerate(masked_sent_ids):
                if token_id != mask_token_id:
                    continue
                replacement = next(predictions, None)
                if replacement is None:
                    break
                masked_sent_ids[pos] = replacement
            infilled_sent.append(self.tokenizer.decode(
                masked_sent_ids, skip_special_tokens=True))
        return infilled_sent
class ExtractiveSummarizationGenerator():
    """Produce an extractive summary for each text via ``summa``'s ``summarize``."""

    def __init__(self) -> None:
        pass

    def generate(self, texts):
        '''
        texts: list of string

        Returns one summary per text. The ratio starts at 1/20 and is raised
        in 1/20 steps (up to 19/20) until the summary is non-empty; the last
        attempt is kept even when it is still empty.
        '''
        from summa.summarizer import summarize
        summaries = []
        for text in tqdm(texts, desc="Extracting Summary"):
            step = 1
            summ = summarize(text, ratio=step / 20.)
            while len(summ) == 0 and step < 19:
                step += 1
                summ = summarize(text, ratio=step / 20.)
            summaries.append(summ)
        return summaries
class DataGenerator():
def __init__(self, dataset_names) -> None:
self.dataset_names = dataset_names
self.datasets = dict()
self.t5_qa = None
self.t5_tokenizer = None
self.load_dataset_from_huggingface()
def load_dataset_from_huggingface(self):
for each_dataset in self.dataset_names:
if DATASET_CONFIG[each_dataset].get('huggingface'):
self.datasets[each_dataset] = load_dataset(
*DATASET_HUGGINGFACE[each_dataset][:-1])[DATASET_HUGGINGFACE[each_dataset][-1]]
elif DATASET_CONFIG[each_dataset].get('using_hf_api'):
self.datasets[each_dataset] = load_dataset(
*DATASET_HUGGINGFACE[each_dataset][:-1], data_dir=DATASET_CONFIG[each_dataset]['data_dir'])[DATASET_HUGGINGFACE[each_dataset][-1]]
elif DATASET_CONFIG[each_dataset].get('using_pandas'):
if DATASET_CONFIG[each_dataset]['data_path'].split('.')[-1] == 'tsv':
self.datasets[each_dataset] = pd.read_csv(
DATASET_CONFIG[each_dataset]['data_path'], sep='\t')
elif DATASET_CONFIG[each_dataset]['data_path'].split('.')[-1] == 'csv':
self.datasets[each_dataset] = pd.read_csv(
DATASET_CONFIG[each_dataset]['data_path'])
elif DATASET_CONFIG[each_dataset].get('using_json'):
self.datasets[each_dataset] = []
if DATASET_CONFIG[each_dataset].get('raw_json'):
with open(DATASET_CONFIG[each_dataset]['data_path'], 'r', encoding='utf8') as f:
self.datasets[each_dataset] = json.load(f)
else:
try:
json_file = json.load(
open(DATASET_CONFIG[each_dataset]['data_path'], 'r', encoding='utf8'))
for example in json_file:
self.datasets[each_dataset].append(example)
except:
with open(DATASET_CONFIG[each_dataset]['data_path'], 'r', encoding='utf8') as f:
for example in f:
self.datasets[each_dataset].append(
json.loads(example))
else:
error('unable to locate raw dataset...')
def process_squad(self):
    """Build squad examples with positive and keyword-based negative hypotheses.

    text_b joins the question with each gold answer. text_c joins the
    question with each of the top RAKE phrases from the context whose BLEU
    score against the gold answers is below the threshold. Every example is
    emitted with label -1.
    """
    from rake_nltk import Rake
    rake = Rake()
    max_keywords = 5
    bleu_cutoff = 0.6
    cfg = DATASET_CONFIG['squad']
    question_key, answers_key = cfg['text_b']
    output = []
    for example in tqdm(self.datasets['squad'], desc='Constructing squad'):
        context = example[cfg['text_a']]
        question = example[question_key]
        gold_answers = example[answers_key]['text']  # a list
        positives = [question + ' ' + gold for gold in gold_answers]
        rake.extract_keywords_from_text(context)
        references = [gold.lower().split() for gold in gold_answers]
        # A phrase scoring below the cutoff is treated as an incorrect answer.
        negatives = [
            question + ' ' + phrase
            for phrase in rake.get_ranked_phrases()[:max_keywords]
            if sentence_bleu(references, phrase.split(), weights=(0.33, 0.33, 0.33)) < bleu_cutoff
        ]
        output.append({
            'text_a': context,
            'text_b': positives,
            'text_c': negatives,
            'label': -1
        })
    return output
def process_squad_v2(self):
    """Build declarative-sentence examples from squad_v2.

    Questions with a gold answer use its first answer text and get label 1.
    Questions without one get a model-generated answer (QAnswering) and
    label 0. Each question/answer pair is rewritten into a declarative
    sentence with QA2D and paired with its context.
    """
    qa_generator = QAnswering(batch_size=32, device='cuda')
    qa2d_generator = QA2D(batch_size=32, device='cuda')

    # Split the dataset into answerable and not-answerable items.
    answerable_contexts, answerable_questions, answerable_answers = [], [], []
    not_answerable_contexts, not_answerable_questions = [], []
    for example in tqdm(self.datasets['squad_v2'], desc='Collecting (not)answerable examples'):
        gold_texts = example['answers']['text']
        if len(gold_texts) == 0:
            not_answerable_contexts.append(example['context'])
            not_answerable_questions.append(example['question'])
            continue
        answerable_contexts.append(example['context'])
        answerable_questions.append(example['question'])
        answerable_answers.append(gold_texts[0])

    not_answerable_answers = qa_generator.generate(
        not_answerable_questions, not_answerable_contexts)
    answerable_declarative_sents = qa2d_generator.generate(
        answerable_questions, answerable_answers)
    not_answerable_declarative_sents = qa2d_generator.generate(
        not_answerable_questions, not_answerable_answers)

    output = [
        {'text_a': answerable_contexts[idx], 'text_b': [sent], 'text_c': [], 'label': 1}
        for idx, sent in enumerate(answerable_declarative_sents)
    ]
    output.extend(
        {'text_a': not_answerable_contexts[idx], 'text_b': [sent], 'text_c': [], 'label': 0}
        for idx, sent in enumerate(not_answerable_declarative_sents)
    )
    return output
def process_race(self):
    """Build one example per answer option of every race question.

    For cloze questions (containing "_") the option is substituted for the
    blank and the filled sentence becomes text_b, with text_c empty.
    Otherwise text_b holds the question and text_c holds the option.
    The label is 1 for the correct option and 0 for every other option.

    No QA2D model is needed here, so none is loaded (the previous version
    instantiated one on CUDA without ever using it).
    """
    cfg = DATASET_CONFIG['race']
    question_key, options_key = cfg['text_b']
    option_dict = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    output = []
    for example in tqdm(self.datasets['race'], desc='Constructing race'):
        text_a = example[cfg['text_a']]
        question = example[question_key]
        answer_id = option_dict[example[cfg['label']]]
        is_cloze = "_" in question
        for i, option in enumerate(example[options_key]):
            if is_cloze:
                # split/join collapses the extra whitespace left by the substitution.
                text_b = [' '.join(question.replace("_", " " + option + " ").split())]
                text_c = []
            else:
                text_b = [question]
                text_c = [option]
            output.append({
                'text_a': text_a,
                'text_b': text_b,
                'text_c': text_c,
                'label': 1 if i == answer_id else 0
            })
    return output
def process_race_val(self):
    """Build declarative examples from the race validation data.

    Cloze questions (containing "_") are filled in with each option
    directly. All other question/option pairs are collected and rewritten
    into declarative sentences by QA2D. The label is 1 for the correct
    option and 0 otherwise. Output order: cloze examples first, then
    QA2D sentences for correct options, then those for wrong options.
    """
    qa2d_generator = QA2D(batch_size=32, device='cuda')
    cfg = DATASET_CONFIG['race_val']
    question_key, options_key = cfg['text_b']
    letter_to_index = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    output = []
    # Parallel (contexts, questions, answers) lists, one triple per bucket.
    correct = ([], [], [])
    wrong = ([], [], [])
    for example in tqdm(self.datasets['race_val'], desc='Constructing race_val'):
        text_a = example[cfg['text_a']]
        question = example[question_key]
        answer_id = letter_to_index[example[cfg['label']]]
        candidates = example[options_key]
        if "_" in question:
            for idx, option in enumerate(candidates):
                filled = ' '.join(question.replace("_", " " + option + " ").split())
                output.append({
                    'text_a': text_a,
                    'text_b': [filled],
                    'text_c': [],
                    'label': 1 if idx == answer_id else 0
                })
            continue
        for idx, option in enumerate(candidates):
            contexts, questions, answers = correct if idx == answer_id else wrong
            contexts.append(text_a)
            questions.append(question)
            answers.append(option)

    correct_declarative = qa2d_generator.generate(correct[1], correct[2])
    wrong_declarative = qa2d_generator.generate(wrong[1], wrong[2])
    assert len(correct[0]) == len(correct_declarative)
    assert len(wrong[0]) == len(wrong_declarative)

    for label, contexts, sentences in (
        (1, correct[0], correct_declarative),
        (0, wrong[0], wrong_declarative),
    ):
        for context, sentence in zip(contexts, sentences):
            output.append({
                'text_a': context,
                'text_b': [sentence],
                'text_c': [],
                'label': label
            })
    return output
def process_race_test(self):
    """Build one example per race_test question.

    text_b holds the hypothesis built from the correct option (pos) and
    text_c those built from the other options (neg). Cloze questions
    (containing "_") have the option substituted for the blank; otherwise
    the option is appended to the question. Every example has label -1.

    Field names come from DATASET_CONFIG['race_test'] when present and fall
    back to the 'race' entry otherwise, so the method no longer fails with
    a KeyError on the missing config entry. ``self.datasets['race_test']``
    must still have been populated by the caller.
    """
    cfg = DATASET_CONFIG.get('race_test', DATASET_CONFIG['race'])
    question_key, options_key = cfg['text_b']
    option_dict = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    output = []
    for example in tqdm(self.datasets['race_test'], desc='Constructing race_test'):
        text_a = example[cfg['text_a']]
        question = example[question_key]
        answer_id = option_dict[example[cfg['label']]]
        is_cloze = "_" in question
        text_b = []  # pos
        text_c = []  # neg
        for i, option in enumerate(example[options_key]):
            if is_cloze:
                hypothesis = ' '.join(question.replace(
                    "_", " " + option + " ").split())
            else:
                hypothesis = question + " " + option + " "
            (text_b if i == answer_id else text_c).append(hypothesis)
        output.append({
            'text_a': text_a,
            'text_b': text_b,
            'text_c': text_c,
            'label': -1
        })
    return output
def process_xsum(self):
    '''
    text_a: raw document
    text_b: [gold summary, extractive summary of the document]
    text_c: [MLM-noised gold summary, MLM-noised extractive summary]
    label: always -1
    '''
    cfg = DATASET_CONFIG['xsum']
    dataset = self.datasets['xsum']
    gold_summary = [example[cfg['text_b']] for example in dataset]
    extracted_summ = ExtractiveSummarizationGenerator().generate(
        [example[cfg['text_a']] for example in dataset])
    # Noise the gold summaries first, then the extractive ones.
    gold_summary_hallucinated = MLMGeneratorWithPairedData(
        corpra=gold_summary, device='cuda:0', batch_size=64, mask_percent=0.25).generate()
    extracted_summ_hallucinated = MLMGeneratorWithPairedData(
        corpra=extracted_summ, device='cuda:0', batch_size=64, mask_percent=0.25).generate()
    assert len(dataset) == len(gold_summary_hallucinated) and len(
        dataset) == len(extracted_summ_hallucinated)
    output = []
    for idx, example in tqdm(enumerate(dataset), desc="Constructing xsum", total=len(dataset)):
        output.append({
            'text_a': example[cfg['text_a']],
            'text_b': [gold_summary[idx], extracted_summ[idx]],
            'text_c': [gold_summary_hallucinated[idx],
                       extracted_summ_hallucinated[idx]],
            'label': -1
        })
    return output
def process_cnndm(self):
    '''
    text_a: raw article
    text_b: [gold summary, extractive summary of the article]
    text_c: [MLM-noised gold summary, MLM-noised extractive summary]
    label: always -1
    '''
    name = 'cnndm'
    source_key = DATASET_CONFIG[name]['text_a']
    summary_key = DATASET_CONFIG[name]['text_b']
    records = self.datasets[name]

    gold_summary = [record[summary_key] for record in records]
    summarizer = ExtractiveSummarizationGenerator()
    extracted_summ = summarizer.generate([record[source_key] for record in records])

    def hallucinate(sentences):
        # Mask-and-refill noising with a freshly loaded MLM generator.
        return MLMGeneratorWithPairedData(
            corpra=sentences, device='cuda:0', batch_size=64, mask_percent=0.25).generate()

    gold_summary_hallucinated = hallucinate(gold_summary)
    extracted_summ_hallucinated = hallucinate(extracted_summ)
    assert len(records) == len(gold_summary_hallucinated) and len(
        records) == len(extracted_summ_hallucinated)

    output = []
    progress = tqdm(enumerate(records), desc="Constructing cnndm", total=len(records))
    for position, record in progress:
        positives = [gold_summary[position], extracted_summ[position]]
        negatives = [gold_summary_hallucinated[position],
                     extracted_summ_hallucinated[position]]
        output.append({
            'text_a': record[source_key],
            'text_b': positives,
            'text_c': negatives,
            'label': -1
        })
    return output
def process_wikihow(self):
    '''
    text_a: raw text
    text_b: [gold headline, extractive summary of the text]
    text_c: [MLM-noised gold headline, MLM-noised extractive summary]
    label: always -1
    '''
    cfg = DATASET_CONFIG['wikihow']
    data = self.datasets['wikihow']
    mlm_kwargs = {'device': 'cuda:0', 'batch_size': 64, 'mask_percent': 0.25}

    gold_summary = [item[cfg['text_b']] for item in data]
    extracted_summ = ExtractiveSummarizationGenerator().generate(
        [item[cfg['text_a']] for item in data])
    gold_summary_hallucinated = MLMGeneratorWithPairedData(
        corpra=gold_summary, **mlm_kwargs).generate()
    extracted_summ_hallucinated = MLMGeneratorWithPairedData(
        corpra=extracted_summ, **mlm_kwargs).generate()
    assert len(data) == len(gold_summary_hallucinated) and len(
        data) == len(extracted_summ_hallucinated)

    return [
        {
            'text_a': item[cfg['text_a']],
            'text_b': [gold_summary[n], extracted_summ[n]],
            'text_c': [gold_summary_hallucinated[n],
                       extracted_summ_hallucinated[n]],
            'label': -1
        }
        for n, item in tqdm(enumerate(data), desc="Constructing wikihow", total=len(data))
    ]
def process_wiki103(self, device='cuda:3'):
    """Build paraphrase examples from the wiki103 back-translation data.

    Each source sentence yields two examples: its paraphrase with label 1
    and an MLM-noised version of that paraphrase with label 0.

    device: torch device for the MLM noising model. It defaults to the
        previously hard-coded 'cuda:3'; pass another device on machines
        with fewer GPUs.
    """
    cfg = DATASET_CONFIG['wiki103']
    dataset = self.datasets['wiki103']
    paraphrases = [example[cfg['text_b']] for example in dataset]
    mlm_hallucinator = MLMGeneratorWithPairedData(
        corpra=paraphrases, device=device, batch_size=64, mask_percent=0.25)
    paraphrase_hallucinated = mlm_hallucinator.generate()
    assert len(dataset) == len(paraphrase_hallucinated)
    output = []
    # enumerate() has no length, so the total is passed explicitly to give
    # the progress bar a known end.
    for i, example in tqdm(enumerate(dataset), desc='Constructing wiki103', total=len(dataset)):
        output.append({
            'text_a': example[cfg['text_a']],
            'text_b': [example[cfg['text_b']]],
            'text_c': [],
            'label': 1
        })
        output.append({
            'text_a': example[cfg['text_a']],
            'text_b': [paraphrase_hallucinated[i]],
            'text_c': [],
            'label': 0
        })
    return output
def process_mnli(self):
    """Convert mnli examples to the common format, keeping the dataset's own label."""
    cfg = DATASET_CONFIG['mnli']
    return [
        {
            'text_a': example[cfg['text_a']],
            'text_b': [example[cfg['text_b']]],
            'text_c': [],
            'label': example[cfg['label']]
        }
        for example in tqdm(self.datasets['mnli'], desc='Constructing mnli')
    ]
def process_nli_fever(self):
    """Convert nli_fever examples to the common format with NLI-style labels.

    'SUPPORTS' maps to 0, 'REFUTES' to 2 and any other raw label to 1.
    """
    cfg = DATASET_CONFIG['nli_fever']
    output = []
    for example in tqdm(self.datasets['nli_fever'], desc='Constructing nli_fever'):
        raw_label = example[cfg['label']]
        # convert to nli style label
        label = 0 if raw_label == 'SUPPORTS' else (2 if raw_label == 'REFUTES' else 1)
        output.append({
            'text_a': example[cfg['text_a']],
            'text_b': [example[cfg['text_b']]],
            'text_c': [],
            'label': label
        })
    return output
def process_doc_nli(self):
    """Convert doc_nli examples to the common format with a binary label.

    'entailment' maps to 1, every other raw label to 0.
    """
    cfg = DATASET_CONFIG['doc_nli']
    premise_key, hypothesis_key, label_key = cfg['text_a'], cfg['text_b'], cfg['label']
    output = []
    for example in tqdm(self.datasets['doc_nli'], desc='Constructing doc_nli'):
        # convert to paraphrase style label
        label = 1 if example[label_key] == 'entailment' else 0
        output.append({
            'text_a': example[premise_key],
            'text_b': [example[hypothesis_key]],
            'text_c': [],
            'label': label
        })
    return output
def process_anli_r1(self):
    """Convert anli_r1 examples to the common format, keeping the dataset's own label."""
    cfg = DATASET_CONFIG['anli_r1']
    premise_key, hypothesis_key, label_key = cfg['text_a'], cfg['text_b'], cfg['label']
    return [
        {
            'text_a': row[premise_key],
            'text_b': [row[hypothesis_key]],
            'text_c': [],
            'label': row[label_key]
        }
        for row in tqdm(self.datasets['anli_r1'], desc='Constructing anli_r1')
    ]
def process_anli_r2(self):
    """Convert anli_r2 examples to the common format, keeping the dataset's own label."""
    cfg = DATASET_CONFIG['anli_r2']
    premise_key, hypothesis_key, label_key = cfg['text_a'], cfg['text_b'], cfg['label']
    return [
        {
            'text_a': row[premise_key],
            'text_b': [row[hypothesis_key]],
            'text_c': [],
            'label': row[label_key]
        }
        for row in tqdm(self.datasets['anli_r2'], desc='Constructing anli_r2')
    ]
def process_anli_r3(self):
    """Convert anli_r3 examples to the common format, keeping the dataset's own label."""
    cfg = DATASET_CONFIG['anli_r3']
    premise_key, hypothesis_key, label_key = cfg['text_a'], cfg['text_b'], cfg['label']
    return [
        {
            'text_a': row[premise_key],
            'text_b': [row[hypothesis_key]],
            'text_c': [],
            'label': row[label_key]
        }
        for row in tqdm(self.datasets['anli_r3'], desc='Constructing anli_r3')
    ]
def process_snli(self):
    """Convert snli examples to the common format, keeping the dataset's own label."""
    cfg = DATASET_CONFIG['snli']

    def to_record(example):
        # One premise/hypothesis pair -> one output dict.
        return {
            'text_a': example[cfg['text_a']],
            'text_b': [example[cfg['text_b']]],
            'text_c': [],
            'label': example[cfg['label']]
        }

    return [to_record(example)
            for example in tqdm(self.datasets['snli'], desc='Constructing snli')]
def process_paws(self):
    """Convert paws sentence pairs to the common format, keeping the dataset's own label."""
    cfg = DATASET_CONFIG['paws']
    first_key, second_key, label_key = cfg['text_a'], cfg['text_b'], cfg['label']
    return [
        {
            'text_a': pair[first_key],
            'text_b': [pair[second_key]],
            'text_c': [],
            'label': pair[label_key]
        }
        for pair in tqdm(self.datasets['paws'], desc='Constructing paws')
    ]
def process_vitaminc(self):
    """Convert vitaminc examples to the common format with NLI-style labels.

    'SUPPORTS' maps to 0, 'REFUTES' to 2 and any other raw label to 1.
    """
    cfg = DATASET_CONFIG['vitaminc']
    evidence_key, claim_key, label_key = cfg['text_a'], cfg['text_b'], cfg['label']
    output = []
    for example in tqdm(self.datasets['vitaminc'], desc='Constructing vitaminc'):
        evidence = example[evidence_key]
        claim = example[claim_key]
        verdict = example[label_key]
        # convert to nli style label
        if verdict == 'SUPPORTS':
            nli_label = 0
        else:
            nli_label = 2 if verdict == 'REFUTES' else 1
        output.append({
            'text_a': evidence,
            'text_b': [claim],
            'text_c': [],
            'label': nli_label
        })
    return output
def process_stsb(self):
    """Convert stsb sentence pairs to the common format.

    The label is the dataset's score divided by 5.0.
    """
    cfg = DATASET_CONFIG['stsb']
    return [
        {
            'text_a': example[cfg['text_a']],
            'text_b': [example[cfg['text_b']]],
            'text_c': [],
            'label': example[cfg['label']] / 5.0
        }
        for example in tqdm(self.datasets['stsb'], desc='Constructing stsb')
    ]
def process_sick(self):
    """Convert sick sentence pairs to the common format.

    The label is the dataset's relatedness score divided by 5.0.
    """
    cfg = DATASET_CONFIG['sick']
    first_key, second_key, score_key = cfg['text_a'], cfg['text_b'], cfg['label']
    return [
        {
            'text_a': pair[first_key],
            'text_b': [pair[second_key]],
            'text_c': [],
            'label': pair[score_key] / 5.0
        }
        for pair in tqdm(self.datasets['sick'], desc='Constructing sick')
    ]
def process_mrpc(self):
    """Convert mrpc sentence pairs to the common format, keeping the dataset's own label."""
    cfg = DATASET_CONFIG['mrpc']
    first_key, second_key, label_key = cfg['text_a'], cfg['text_b'], cfg['label']
    return [
        {
            'text_a': pair[first_key],
            'text_b': [pair[second_key]],
            'text_c': [],
            'label': pair[label_key]
        }
        for pair in tqdm(self.datasets['mrpc'], desc='Constructing mrpc')
    ]
def process_mrpc_val(self):
    """Convert mrpc_val sentence pairs to the common format, keeping the dataset's own label."""
    cfg = DATASET_CONFIG['mrpc_val']
    first_key, second_key, label_key = cfg['text_a'], cfg['text_b'], cfg['label']
    return [
        {
            'text_a': pair[first_key],
            'text_b': [pair[second_key]],
            'text_c': [],
            'label': pair[label_key]
        }
        for pair in tqdm(self.datasets['mrpc_val'], desc='Constructing mrpc_val')
    ]
def process_paws_val(self):
    """Turn the loaded 'paws_val' split into training records.

    Returns:
        A list of dicts with ``text_a``, a one-item ``text_b`` list, an
        empty ``text_c`` list and the dataset's label passed through as-is.
    """
    fields = DATASET_CONFIG['paws_val']
    return [
        {
            'text_a': row[fields['text_a']],
            'text_b': [row[fields['text_b']]],
            'text_c': [],
            'label': row[fields['label']],
        }
        for row in tqdm(self.datasets['paws_val'], desc='Constructing paws_val')
    ]
def process_paws_unlabeled(self):
    """Turn the loaded 'paws_unlabeled' split into training records.

    Returns:
        A list of dicts with ``text_a``, a one-item ``text_b`` list, an
        empty ``text_c`` list and the dataset's label passed through as-is.
    """
    fields = DATASET_CONFIG['paws_unlabeled']
    progress = tqdm(self.datasets['paws_unlabeled'],
                    desc='Constructing paws_unlabeled')
    return [
        {
            'text_a': row[fields['text_a']],
            'text_b': [row[fields['text_b']]],
            'text_c': [],
            'label': row[fields['label']],
        }
        for row in progress
    ]
def process_qqp(self):
    """Turn the loaded 'qqp' split into training records.

    Returns:
        A list of dicts with ``text_a``, a one-item ``text_b`` list, an
        empty ``text_c`` list and the dataset's label passed through as-is.
    """
    fields = DATASET_CONFIG['qqp']
    return [
        {
            'text_a': row[fields['text_a']],
            'text_b': [row[fields['text_b']]],
            'text_c': [],
            'label': row[fields['label']],
        }
        for row in tqdm(self.datasets['qqp'], desc='Constructing qqp')
    ]
def process_qqp_val(self):
    """Turn the loaded 'qqp_val' split into training records.

    Returns:
        A list of dicts with ``text_a``, a one-item ``text_b`` list, an
        empty ``text_c`` list and the dataset's label passed through as-is.
    """
    fields = DATASET_CONFIG['qqp_val']
    return [
        {
            'text_a': row[fields['text_a']],
            'text_b': [row[fields['text_b']]],
            'text_c': [],
            'label': row[fields['label']],
        }
        for row in tqdm(self.datasets['qqp_val'], desc='Constructing qqp_val')
    ]
def process_msmarco(self):
    """Build passage/query records from the loaded 'msmarco' split.

    Examples in which no passage is flagged in ``is_selected`` are skipped.
    For every remaining example, each passage produces one record whose
    ``text_a`` is the passage text and whose ``text_b`` holds the query.

    Returns:
        A list of dicts with ``text_a``, a one-item ``text_b`` list, an
        empty ``text_c`` list and ``label`` 1 for passages whose
        ``is_selected`` flag equals 1, otherwise 0.
    """
    output = []
    for example in tqdm(self.datasets['msmarco'], desc='Collecting msmarco'):
        passages = example['passages']
        if not sum(passages['is_selected']) > 0:
            # No passage was selected for this query: nothing to emit.
            continue
        for i, is_selected in enumerate(passages['is_selected']):
            output.append({
                'text_a': passages['passage_text'][i],
                'text_b': [example['query']],
                'text_c': [],
                'label': 1 if is_selected == 1 else 0,
            })
    return output
def process_paws_qqp(self):
    """Turn the 'paws_qqp' table into training records.

    The first two characters and the last character of both sentence
    columns are dropped (presumably a ``b'...'`` wrapper left in the source
    file; confirm against the loader).

    Returns:
        A list of dicts with ``text_a``, a one-item ``text_b`` list, an
        empty ``text_c`` list and ``label`` coerced to ``int``.
    """
    records = []
    for _, row in self.datasets['paws_qqp'].iterrows():
        first = row['sentence1'][2:-1]
        second = row['sentence2'][2:-1]
        raw_label = row['label']
        records.append({
            'text_a': first,
            'text_b': [second],
            'text_c': [],
            'label': int(raw_label),
        })
    return records
def process_wmt15(self):
    """Turn the loaded 'wmt15' examples into training records.

    Returns:
        A list of dicts where ``text_a`` is the example's ``reference``,
        ``text_b`` holds its ``candidate``, ``text_c`` is empty and
        ``label`` is its ``score``.
    """
    return [
        {
            'text_a': row['reference'],
            'text_b': [row['candidate']],
            'text_c': [],
            'label': row['score'],
        }
        for row in self.datasets['wmt15']
    ]
def process_wmt16(self):
    """Turn the loaded 'wmt16' examples into training records.

    Returns:
        A list of dicts where ``text_a`` is the example's ``reference``,
        ``text_b`` holds its ``candidate``, ``text_c`` is empty and
        ``label`` is its ``score``.
    """
    return [
        {
            'text_a': row['reference'],
            'text_b': [row['candidate']],
            'text_c': [],
            'label': row['score'],
        }
        for row in self.datasets['wmt16']
    ]
def process_wmt17(self):
    """Turn the loaded 'wmt17' examples into training records.

    Returns:
        A list of dicts where ``text_a`` is the example's ``reference``,
        ``text_b`` holds its ``candidate``, ``text_c`` is empty and
        ``label`` is its ``score``.
    """
    return [
        {
            'text_a': row['reference'],
            'text_b': [row['candidate']],
            'text_c': [],
            'label': row['score'],
        }
        for row in self.datasets['wmt17']
    ]
def process_wmt18(self):
    """Turn the loaded 'wmt18' examples into training records.

    Returns:
        A list of dicts where ``text_a`` is the example's ``reference``,
        ``text_b`` holds its ``candidate``, ``text_c`` is empty and
        ``label`` is its ``score``.
    """
    return [
        {
            'text_a': row['reference'],
            'text_b': [row['candidate']],
            'text_c': [],
            'label': row['score'],
        }
        for row in self.datasets['wmt18']
    ]
def process_wmt19(self):
    """Turn the loaded 'wmt19' examples into training records.

    Returns:
        A list of dicts where ``text_a`` is the example's ``reference``,
        ``text_b`` holds its ``candidate``, ``text_c`` is empty and
        ``label`` is its ``score``.
    """
    return [
        {
            'text_a': row['reference'],
            'text_b': [row['candidate']],
            'text_c': [],
            'label': row['score'],
        }
        for row in self.datasets['wmt19']
    ]
def process_boolq(self):
    """Emit one correct and one flipped yes/no record per 'boolq' example.

    Returns:
        A list with two dicts per example, in this order: the record whose
        ``text_c`` matches the example's boolean ``answer`` (label 1), then
        the record with the opposite answer string (label 0).
    """
    records = []
    for item in self.datasets['boolq']:
        passage = item['passage']
        question = item['question']
        if item['answer']:
            matching, opposite = "Yes.", "No."
        else:
            matching, opposite = "No.", "Yes."
        for answer_text, label in ((matching, 1), (opposite, 0)):
            records.append({
                'text_a': passage,
                'text_b': [question],
                'text_c': [answer_text],
                'label': label,
            })
    return records
def process_eraser_multi_rc(self):
    """Turn the loaded 'eraser_multi_rc' examples into training records.

    Returns:
        A list of dicts where ``text_a`` is the passage, ``text_b`` holds
        ``query_and_answer`` with every ``|`` character removed, ``text_c``
        is empty and ``label`` is the example's label converted to ``int``.
    """
    return [
        {
            'text_a': row['passage'],
            'text_b': [row['query_and_answer'].replace("|", "")],
            'text_c': [],
            'label': int(row['label']),
        }
        for row in self.datasets['eraser_multi_rc']
    ]
def process_quail(self):
    """Emit one record per answer option of every 'quail' example.

    Returns:
        A list of dicts where ``text_a`` is the context, ``text_b`` holds
        the question, ``text_c`` holds one answer option and ``label`` is 1
        when the option's index equals ``correct_answer_id``, otherwise 0.
    """
    return [
        {
            'text_a': item['context'],
            'text_b': [item['question']],
            'text_c': [option],
            'label': 1 if position == item['correct_answer_id'] else 0,
        }
        for item in self.datasets['quail']
        for position, option in enumerate(item['answers'])
    ]
def process_sciq(self):
    """Emit four records per 'sciq' example: three distractors, then the answer.

    Returns:
        A list of dicts where ``text_a`` is the ``support`` text, ``text_b``
        holds the question and ``text_c`` holds one candidate answer. The
        three distractors get label 0 and the correct answer gets label 1.
    """
    # (source field, label) in the order the records are emitted.
    candidates = (
        ('distractor1', 0),
        ('distractor2', 0),
        ('distractor3', 0),
        ('correct_answer', 1),
    )
    records = []
    for item in self.datasets['sciq']:
        support = item['support']
        for field, label in candidates:
            records.append({
                'text_a': support,
                'text_b': [item['question']],
                'text_c': [item[field]],
                'label': label,
            })
    return records
def process_strategy_qa(self):
    """Emit one correct and one flipped yes/no record per 'strategy_qa' example.

    The example's ``facts`` are joined with single spaces to form ``text_a``.

    Returns:
        A list with two dicts per example, in this order: the record whose
        ``text_c`` matches the example's boolean ``answer`` (label 1), then
        the record with the opposite answer string (label 0).
    """
    records = []
    for item in self.datasets['strategy_qa']:
        if item['answer']:
            matching, opposite = "Yes.", "No."
        else:
            matching, opposite = "No.", "Yes."
        for answer_text, label in ((matching, 1), (opposite, 0)):
            records.append({
                'text_a': ' '.join(item['facts']),
                'text_b': [item['question']],
                'text_c': [answer_text],
                'label': label,
            })
    return records
def process_gap(self):
    """Emit two pronoun-substitution records per 'gap' example.

    For each example the pronoun located at ``Pronoun-offset`` is replaced
    first by candidate ``A`` and then by candidate ``B``.

    Returns:
        A list of dicts where ``text_a`` is the original text, ``text_b``
        holds the text with the pronoun replaced, ``text_c`` is empty and
        ``label`` is 1 when the matching ``A-coref``/``B-coref`` flag is
        truthy, otherwise 0.
    """
    records = []
    for item in self.datasets['gap']:
        text = item['Text']
        start = item['Pronoun-offset']
        for name_key, coref_key in (('A', 'A-coref'), ('B', 'B-coref')):
            # Text before the pronoun + candidate name + text after it.
            before = text[:start]
            after = text[(start + len(item['Pronoun'])):]
            substituted = before + item[name_key] + after
            records.append({
                'text_a': text,
                'text_b': [substituted],
                'text_c': [],
                'label': 1 if item[coref_key] else 0,
            })
    return records
def init_qa_t5(self):
    """Load the ``t5-base`` tokenizer and model on first use.

    Does nothing when ``self.t5_qa`` is already set. Otherwise stores the
    tokenizer in ``self.t5_tokenizer`` and the model, switched to eval mode,
    in ``self.t5_qa``.

    The model is placed on ``cuda:1`` when at least two CUDA devices are
    visible, on ``cuda:0`` when only one is, and on the CPU otherwise, so
    hosts without a second GPU no longer fail here.
    """
    if self.t5_qa is not None:
        return
    # Imported lazily so an already-initialised generator pays nothing.
    from transformers import T5Tokenizer, T5ForConditionalGeneration
    self.t5_tokenizer = T5Tokenizer.from_pretrained(
        "t5-base", model_max_length=800)
    self.t5_qa = T5ForConditionalGeneration.from_pretrained("t5-base")
    if torch.cuda.device_count() > 1:
        device = 'cuda:1'
    elif torch.cuda.is_available():
        device = 'cuda:0'
    else:
        device = 'cpu'
    self.t5_qa.to(device)
    self.t5_qa.eval()
@staticmethod
def mask_answer(context, answers):
answers = sorted(answers, key=len, reverse=True)
for answer in answers:
pattern = f'(?<![\w\\-\u2013]){re.escape(answer)}(?![\w\\-\u2013])'
context = re.sub(pattern, '', context, flags=re.IGNORECASE)
return context
def generate_fake_answer(self, context, question, answers):
    """Ask the T5 model the question against a context with the answers removed.

    Args:
        context: Passage the question refers to.
        question: Question text.
        answers: Answer strings that are masked out of ``context`` first.

    Returns:
        The first generated sequence, decoded without special tokens.
    """
    self.init_qa_t5()
    masked_context = self.mask_answer(context, answers)
    prompt = f'question: {question} context: {masked_context}'
    encoded = self.t5_tokenizer(
        prompt,
        return_tensors="pt",
        truncation='only_first',
    )
    # Inputs must live on the same device as the model.
    prompt_ids = encoded.input_ids.to(self.t5_qa.device)
    generated = self.t5_qa.generate(
        prompt_ids,
        max_new_tokens=40,
        remove_invalid_values=True,
    )
    first_sequence = generated[0]
    return self.t5_tokenizer.decode(first_sequence, skip_special_tokens=True)
def negative_sample_qa(self, samples, negative_sample_no_ans_only=True):
    """Build positive and generated-negative QA records.

    Args:
        samples: Iterable of ``(context, question, answers)`` triples.
        negative_sample_no_ans_only: When true, a fake answer is generated
            only for samples that have no answers; when false, one is
            generated for every sample.

    Returns:
        A list of dicts. A sample with answers yields a label-1 record whose
        ``text_c`` is its answers; a generated fake answer yields a label-0
        record whose ``text_c`` holds that single answer.
    """
    records = []
    for context, question, answers in samples:
        answered = bool(answers)
        if answered:
            positive = {
                'text_a': context,
                'text_b': [question],
                'text_c': answers,
                'label': 1,
            }
            records.append(positive)
        if answered and negative_sample_no_ans_only:
            # Answered sample and negatives are restricted to unanswered ones.
            continue
        fabricated = self.generate_fake_answer(context, question, answers)
        negative = {
            'text_a': context,
            'text_b': [question],
            'text_c': [fabricated],
            'label': 0,
        }
        records.append(negative)
    return records
def process_squad_v2_new(self):
    """Build QA records from the loaded 'squad_v2_new' split.

    Each example is reduced to ``(context, question, answer texts)`` and
    handed to ``negative_sample_qa`` with its default settings.
    """
    progress = tqdm(self.datasets['squad_v2_new'], desc='squad_v2_new')
    triples = (
        (row['context'], row['question'], row['answers']['text'])
        for row in progress
    )
    return self.negative_sample_qa(triples)
def process_adversarial_qa(self):
    """Build QA records from the loaded 'adversarial_qa' split.

    Each example is reduced to ``(context, question, answer texts)`` and
    handed to ``negative_sample_qa`` with ``negative_sample_no_ans_only``
    switched off.
    """
    progress = tqdm(self.datasets['adversarial_qa'], desc='adversarial_qa')
    triples = (
        (row['context'], row['question'], row['answers']['text'])
        for row in progress
    )
    return self.negative_sample_qa(triples, negative_sample_no_ans_only=False)
def process_drop(self):
    """Build QA records from the loaded 'drop' split.

    Each example is reduced to ``(passage, question, answer spans)`` and
    handed to ``negative_sample_qa`` with ``negative_sample_no_ans_only``
    switched off.
    """
    progress = tqdm(self.datasets['drop'], desc='drop')
    triples = (
        (row['passage'], row['question'], row['answers_spans']['spans'])
        for row in progress
    )
    return self.negative_sample_qa(triples, negative_sample_no_ans_only=False)
def process_duorc_self(self):
    """Build QA records from the loaded 'duorc_self' split.

    Each example is reduced to ``(plot, question, answers)`` and handed to
    ``negative_sample_qa`` with ``negative_sample_no_ans_only`` switched off.
    """
    progress = tqdm(self.datasets['duorc_self'], desc='duorc_self')
    triples = (
        (row['plot'], row['question'], row['answers'])
        for row in progress
    )
    return self.negative_sample_qa(triples, negative_sample_no_ans_only=False)
def process_duorc_paraphrase(self):
    """Build QA records from the loaded 'duorc_paraphrase' split.

    Each example is reduced to ``(plot, question, answers)`` and handed to
    ``negative_sample_qa`` with ``negative_sample_no_ans_only`` switched off.
    """
    progress = tqdm(self.datasets['duorc_paraphrase'], desc='duorc_paraphrase')
    triples = (
        (row['plot'], row['question'], row['answers'])
        for row in progress
    )
    return self.negative_sample_qa(triples, negative_sample_no_ans_only=False)
def process_quoref(self):
    """Build QA records from the loaded 'quoref' split.

    Each example is reduced to ``(context, question, answer texts)`` and
    handed to ``negative_sample_qa`` with ``negative_sample_no_ans_only``
    switched off.
    """
    progress = tqdm(self.datasets['quoref'], desc='quoref')
    triples = (
        (row['context'], row['question'], row['answers']['text'])
        for row in progress
    )
    return self.negative_sample_qa(triples, negative_sample_no_ans_only=False)
@staticmethod
def prepare_hotpot_qa_samples(dateset):
for sample in dateset:
question = sample['question']
answer = sample['answer']
supporting_docs = set(sample['supporting_facts']['title'])
irrelevant_docs = []
context_paragraphs = []
for title, setences in zip(sample['context']['title'], sample['context']['sentences']):
doc = ''.join(setences)
if title in supporting_docs:
context_paragraphs.append(doc)
else:
irrelevant_docs.append(doc)
# Add some irrelevant documents
if irrelevant_docs and len(context_paragraphs) < 4:
context_paragraphs.append(random.choice(irrelevant_docs))
random.shuffle(context_paragraphs)
yield '\n'.join(context_paragraphs), question, [answer]
def process_hotpot_qa_distractor(self):
    """Build QA records from the loaded 'hotpot_qa_distractor' split.

    Rows are converted by ``prepare_hotpot_qa_samples`` and handed to
    ``negative_sample_qa`` with ``negative_sample_no_ans_only`` switched off.
    """
    progress = tqdm(
        self.datasets['hotpot_qa_distractor'],
        desc='hotpot_qa_distractor',
    )
    triples = self.prepare_hotpot_qa_samples(progress)
    return self.negative_sample_qa(triples, negative_sample_no_ans_only=False)
def process_hotpot_qa_fullwiki(self):
    """Build QA records from the loaded 'hotpot_qa_fullwiki' split.

    Rows are converted by ``prepare_hotpot_qa_samples`` and handed to
    ``negative_sample_qa`` with ``negative_sample_no_ans_only`` switched off.
    """
    progress = tqdm(
        self.datasets['hotpot_qa_fullwiki'],
        desc='hotpot_qa_fullwiki',
    )
    triples = self.prepare_hotpot_qa_samples(progress)
    return self.negative_sample_qa(triples, negative_sample_no_ans_only=False)
def process_newsqa(self):
    """Build QA records from the loaded 'newsqa' data.

    Only stories whose ``type`` is ``'train'`` are used, and questions whose
    ``isQuestionBad`` value exceeds 0.2 are dropped. When the question's
    ``consensus`` carries an ``'s'`` offset, the answer is the stripped
    story text between the ``'s'`` and ``'e'`` offsets; otherwise the
    question has no answers. The resulting triples go to
    ``negative_sample_qa`` with ``negative_sample_no_ans_only`` switched off.
    """
    def iter_triples(corpus):
        for story in tqdm(corpus['data'], desc='newsqa'):
            if story['type'] == 'train':
                story_text = story['text']
                for item in story['questions']:
                    if item.get('isQuestionBad', 0.) > 0.2:
                        continue
                    consensus = item['consensus']
                    if 's' in consensus:
                        span = story_text[consensus['s']:consensus['e']]
                        found = [span.strip()]
                    else:
                        found = []
                    yield story_text, item['q'], found

    triples = iter_triples(self.datasets['newsqa'])
    return self.negative_sample_qa(triples, negative_sample_no_ans_only=False)
def process_ropes(self):
    """Build QA records from the loaded 'ropes' split.

    The context of each example is its ``situation`` followed by a space
    and its ``background``. The triples go to ``negative_sample_qa`` with
    ``negative_sample_no_ans_only`` switched off.
    """
    progress = tqdm(self.datasets['ropes'], desc='ropes')
    triples = (
        (
            row['situation'] + ' ' + row['background'],
            row['question'],
            row['answers']['text'],
        )
        for row in progress
    )
    return self.negative_sample_qa(triples, negative_sample_no_ans_only=False)
def generate(self):
    """Run every dataset processor and write its records as JSON lines.

    For each name in ``self.datasets`` the method ``process_<name>`` is
    called and its records are written, one JSON object per line, to
    ``./data/training/<name>.json``. All output files are emptied before
    the first processor runs, and the output directory is created when it
    does not exist yet.

    Raises:
        AttributeError: If no ``process_<name>`` method exists for a dataset.
    """
    import os

    os.makedirs('./data/training', exist_ok=True)
    # Start from empty files so a rerun never appends to stale output.
    for each_dataset in self.datasets:
        with open(f'./data/training/{each_dataset}.json', 'w', encoding='utf8') as outfile:
            outfile.write("")
    for each_dataset in self.datasets:
        # Look the processor up by name rather than evaluating source text.
        process = getattr(self, f'process_{each_dataset}')
        outputs = process()
        # One file handle per dataset instead of reopening for every record.
        with open(f'./data/training/{each_dataset}.json', 'a', encoding='utf8') as outfile:
            for each_output in outputs:
                dict_write_to_file = {
                    'task': DATASET_CONFIG[each_dataset]['task'],
                    'text_a': each_output['text_a'],  # string
                    # list of positive examples
                    'text_b': each_output['text_b'],
                    # list of negative examples
                    'text_c': each_output['text_c'],
                    # original label, if -1 only has positive pairs and negative pairs
                    'orig_label': each_output['label'],
                }
                json.dump(dict_write_to_file, outfile, ensure_ascii=False)
                outfile.write('\n')
if __name__ == "__main__":
    # Seed Python's RNG before any processor runs.
    random.seed(42)
    # Every dataset that has a config entry is generated.
    dataset_names = list(DATASET_CONFIG)
    gen = DataGenerator(dataset_names)
    gen.generate()