import torch
from torch import nn
from transformers import AutoProcessor, Wav2Vec2ForCTC, AutoTokenizer, AutoModelForSeq2SeqLM


class CombinedModel(nn.Module):
    """Chains a Wav2Vec2 CTC speech-to-text model with a seq2seq translation model."""

    def __init__(self, stt_model_name, nmt_model_name, device="cuda"):
        super().__init__()

        # Speech-to-text (STT) components
        self.stt_processor = AutoProcessor.from_pretrained(stt_model_name)
        self.stt_model = Wav2Vec2ForCTC.from_pretrained(stt_model_name).to(device)
        # Neural machine translation (NMT) components
        self.nmt_tokenizer = AutoTokenizer.from_pretrained(nmt_model_name)
        self.nmt_model = AutoModelForSeq2SeqLM.from_pretrained(nmt_model_name).to(device)
        self.device = device

    def forward(self, batch, *args, **kwargs):
        device = self.device

        # 1) Transcribe the audio with the STT model.
        audio = torch.tensor(batch["audio"][0]).to(device)
        input_features = self.stt_processor(
            audio, sampling_rate=16000, return_tensors="pt",
            max_length=220000, padding=True, truncation=True,
        )
        stt_output = self.stt_model(
            input_features.input_values.to(device),
            attention_mask=input_features.attention_mask.to(device),
        )
        # Greedy CTC decoding: take the argmax over the vocabulary at each frame.
        predicted_ids = torch.squeeze(stt_output.logits.argmax(dim=-1))
        transcription = self.stt_processor.decode(predicted_ids)

        # 2) Translate the transcription with the NMT model.
        input_nmt_tokens = self.nmt_tokenizer(
            transcription, return_tensors="pt", padding=True, truncation=True
        )
        output_nmt_output = self.nmt_model.generate(
            input_ids=input_nmt_tokens.input_ids.to(device),
            attention_mask=input_nmt_tokens.attention_mask.to(device),
        )
        decoded_nmt_output = self.nmt_tokenizer.batch_decode(output_nmt_output, skip_special_tokens=True)

        return transcription, decoded_nmt_output
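

# Minimal usage sketch (illustrative only, not part of the original module): the checkpoint
# names below are hypothetical placeholders; substitute the STT and NMT checkpoints you
# actually use. The STT checkpoint must be one whose processor returns an attention_mask.
if __name__ == "__main__":
    model = CombinedModel(
        stt_model_name="facebook/wav2vec2-large-960h-lv60-self",  # placeholder CTC checkpoint
        nmt_model_name="Helsinki-NLP/opus-mt-en-fr",              # placeholder translation checkpoint
        device="cuda" if torch.cuda.is_available() else "cpu",
    )

    # `batch["audio"]` is expected to hold raw 16 kHz waveforms; the first one is used.
    batch = {"audio": [[0.0] * 16000]}  # one second of silence, just to exercise the pipeline
    with torch.no_grad():
        transcription, translation = model(batch)
    print(transcription, translation)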