File size: 1,623 Bytes
dae67e9
c90fdd5
 
6ccd417
dae67e9
6ccd417
dae67e9
6ccd417
 
 
dae67e9
 
6ccd417
 
dae67e9
6ccd417
 
 
 
 
 
dae67e9
 
 
6ccd417
dae67e9
6ccd417
 
 
dae67e9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from transformers import T5TokenizerFast, T5ForConditionalGeneration, GenerationConfig
from model import Model
class T5(Model):
    """Text generator backed by a T5 checkpoint loaded from disk.

    The model weights, tokenizer and generation config are all read from
    ``model_dir``. Every input is wrapped in the ``'qa question: <INPUT>'``
    prompt template before being passed to the model.
    """

    def __init__(self,
            model_dir: str = './models/pko_t5_COMU_patience10',
            max_input_length: int = 64,
            max_target_length: int = 64
            ):
        """Load the checkpoint and register the extra placeholder tokens.

        Args:
            model_dir: Directory (or hub id) holding the model, the tokenizer
                and a ``gen_config.json`` generation config.
            max_input_length: Inputs are truncated to this many tokens in
                :meth:`generate`.
            max_target_length: Stored on the instance and written to
                ``model.config.max_length`` and ``tokenizer.model_max_length``.
        """
        # NOTE(review): Model.__init__ is not invoked here; confirm the base
        # class needs no initialisation of its own.
        self.model = T5ForConditionalGeneration.from_pretrained(model_dir)
        self.tokenizer = T5TokenizerFast.from_pretrained(model_dir)
        # The generation settings live in a non-default file name.
        self.gen_config = GenerationConfig.from_pretrained(model_dir, config_file_name='gen_config.json')

        self.max_input_length = max_input_length
        self.max_target_length = max_target_length
        self.INPUT_FORMAT = 'qa question: <INPUT>'

        # Register the speaker/listener placeholder tokens so the tokenizer
        # keeps each one as a single token. The Hangul was previously stored
        # as mojibake (UTF-8 bytes decoded with a legacy code page).
        # NOTE(review): the last token has no surrounding '#', unlike the
        # others; it is left as-is because changing it could desynchronise the
        # vocabulary from the trained checkpoint -- confirm against training.
        self.tokenizer.add_tokens(["#화자#", "#청자#", "#(남자)청자#", "#(남자)화자#", "#(여자)청자#", "(여자)화자"])
        self.model.resize_token_embeddings(len(self.tokenizer))
        # NOTE(review): generate() passes an explicit generation_config, which
        # presumably takes precedence over model.config.max_length -- confirm
        # that max_target_length is actually honoured.
        self.model.config.max_length = max_target_length
        self.tokenizer.model_max_length = max_target_length

    def generate(self, inputs: str) -> str:
        """Generate text for a single input string.

        Args:
            inputs: Raw user text; it is substituted into ``INPUT_FORMAT``.

        Returns:
            The decoded generation. If the model returns several sequences,
            they are joined with ``", "``.
        """
        prompt = self.INPUT_FORMAT.replace("<INPUT>", inputs)
        encoded = self.tokenizer(prompt, max_length=self.max_input_length, truncation=True, return_tensors="pt")
        generated = self.model.generate(**encoded, generation_config=self.gen_config)
        decoded = self.tokenizer.batch_decode(generated, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        # Join the decoded strings directly instead of stringifying the list
        # and stripping "[", "]" and "'", which deleted legitimate brackets
        # and apostrophes from the generated text.
        return ", ".join(decoded)