import torch
from torch.utils.data import Dataset
import json
import os
import collections


class IntentDataset(Dataset):
    def __init__(self, loc, tokenizer, mode, toy=False, max_length=180):
        '''
        You can fine-tune a model with your own data!! Feel free to create
        (or collect!) your own utterances and give it a shot!

        loc: relative directory where the data lies
        tokenizer: huggingface tokenizer to preprocess utterances
        mode: one of train, val, test (should match the respective *.json files)
        toy: load a very small amount of data (for debugging purposes)
        max_length: max length of tokenized input
        '''
        self.tokenizer = tokenizer
        self.mode = mode
        self.max_length = max_length

        with open(os.path.join(loc, 'all_intents.json'), 'r') as all_intents_json:
            # Contains the written-out names of intents. Also implicitly defines
            # how many intents your chatbot's neural intent detection will support.
            self.all_intents = json.load(all_intents_json)

        with open(os.path.join(loc, mode + '.json'), 'r') as json_data:
            self.all_data = json.load(json_data)

        if toy:
            self.all_data = self.all_data[:10]

        print(f"Loaded intent detection dataset. {len(self.all_data)} examples. ({mode}). "
              f"{'Toy example' if toy else ''}")

    def __len__(self):
        # torch Datasets need a __len__ method and a __getitem__ method, with
        # __len__ returning the total number of examples...
        return len(self.all_data)

    def __getitem__(self, index):
        # ... and __getitem__ returning an example given an index >= 0 and < __len__.
        data_item = self.all_data[index]

        if len(data_item) == 3:
            # Sentence-pair example: tokenize both texts together.
            tokenized_input = self.tokenizer(data_item[0], data_item[1],
                                             return_tensors='pt',
                                             padding='max_length',
                                             truncation=True,
                                             max_length=self.max_length)
        else:
            # Single-utterance example.
            tokenized_input = self.tokenizer(data_item[0],
                                             return_tensors='pt',
                                             padding='max_length',
                                             truncation=True,
                                             max_length=self.max_length)

        output_item = {
            'input_ids': tokenized_input['input_ids'].squeeze(0),
            'attention_mask': tokenized_input['attention_mask'].squeeze(0),
            # The label is the index of the intent name within all_intents.
            'label': torch.tensor(self.all_intents.index(data_item[-1]))
        }

        # Some tokenizers (e.g. BERT's) also return token_type_ids. Note: no
        # trailing comma here, which would wrap the tensor in a tuple.
        if 'token_type_ids' in tokenized_input:
            output_item['token_type_ids'] = tokenized_input['token_type_ids'].squeeze(0)

        return output_item
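

# Minimal usage sketch. Assumptions not taken from the class itself: the
# 'bert-base-uncased' tokenizer name and the 'data' directory (which must
# contain all_intents.json and train.json in the format described above);
# adjust both to your setup.
if __name__ == '__main__':
    from transformers import AutoTokenizer
    from torch.utils.data import DataLoader

    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  # assumed model name
    train_dataset = IntentDataset(loc='data', tokenizer=tokenizer, mode='train', toy=True)

    # Batch the examples; each batch is a dict of stacked tensors.
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    batch = next(iter(train_loader))
    print(batch['input_ids'].shape, batch['attention_mask'].shape, batch['label'].shape)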