import tempfile
from typing import Optional

import torch
import gradio as gr
import librosa
from transformers import AutoFeatureExtractor, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer
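
# Pipeline: Spanish speech -> wav2vec2 ASR transcript -> seq2seq gender
# neutralization -> Coqui TTS synthesis of the neutralized text.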


# first_generation stays True for the lifetime of the app, so the greedy
# decoding branch in predict_and_ctc_lm_decode below is always taken.
first_generation = True
device = "cuda" if torch.cuda.is_available() else "cpu"


def load_and_fix_data(input_file, model_sampling_rate):
    # Load at the file's native sample rate; mono=False keeps stereo channels.
    speech, sample_rate = librosa.load(input_file, sr=None, mono=False)
    # librosa returns stereo as (channels, samples); downmix by averaging channels.
    if speech.ndim > 1:
        speech = speech.mean(axis=0)
    if sample_rate != model_sampling_rate:
        speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=model_sampling_rate)
    return speech
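
# Illustrative call: load_and_fix_data("Example1.wav", sampling_rate) returns a
# mono float array resampled to the ASR model's expected rate.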


feature_extractor = AutoFeatureExtractor.from_pretrained("jonatasgrosman/wav2vec2-xls-r-1b-spanish")
sampling_rate = feature_extractor.sampling_rate  # 16 kHz for wav2vec2 models

asr = pipeline("automatic-speech-recognition", model="jonatasgrosman/wav2vec2-xls-r-1b-spanish")

prefix = ''  # optional task prefix prepended to the neutralizer input (empty here)
model_checkpoint = "hackathon-pln-es/es_text_neutralizer"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Move the model to the same device its input tensors are sent to during generate().
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)
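
# Standalone sketch of the neutralizer call pattern (illustrative; the actual
# output text depends on the checkpoint):
#   batch = tokenizer(["el niño y la niña"], return_tensors="pt").to(device)
#   out = model.generate(**batch)
#   tokenizer.decode(out[0], skip_special_tokens=True)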


manager = ModelManager()
MODEL_NAMES = manager.list_tts_models()  # available Coqui TTS models (informational)


def postproc(input_sentence, preds):
    """Clean up the neutralizer output and restore capitalization from the input."""
    try:
        preds = preds.replace('De el', 'Del').replace('de el', 'del').replace('  ', ' ')
        if preds[0].islower():
            preds = preds.capitalize()
        preds = preds.replace(' . ', '. ').replace(' , ', ', ')

        # Restore proper names that were capitalized in the input sentence
        prev_letter = ''
        for word in input_sentence.split(' '):
            if word:
                if word[0].isupper():
                    if word.lower() in preds and word != input_sentence.split(' ')[0]:
                        if prev_letter == '.':
                            preds = preds.replace('. ' + word.lower() + ' ', '. ' + word + ' ')
                        else:
                            if word[-1] == '.':
                                preds = preds.replace(word.lower(), word)
                            else:
                                preds = preds.replace(word.lower() + ' ', word + ' ')
                prev_letter = word[-1]
        preds = preds.strip()  # drop trailing whitespace
    except Exception:
        # Post-processing is best-effort; fall back to the raw prediction on error.
        pass
    return preds
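
# Example (illustrative): postproc("Hola María.", "hola maría.") capitalizes the
# first word and restores "María.", so it returns "Hola María.".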
    
model_name = "es/mai/tacotron2-DDC"  # Coqui Spanish Tacotron2-DDC voice
MAX_TXT_LEN = 100  # character cap applied before synthesis

def predict_and_ctc_lm_decode(input_file, speaker_idx: Optional[str] = None):
    speech = load_and_fix_data(input_file, sampling_rate)
    # Transcribe with chunking so long recordings fit in memory.
    transcribed_text = asr(speech, chunk_length_s=10, stride_length_s=1)["text"]
    inputs = tokenizer([prefix + transcribed_text], return_tensors="pt", padding=True)
    with torch.no_grad():
        if first_generation:
            # Greedy decoding; sampling disabled so outputs are deterministic.
            output_sequence = model.generate(
                input_ids=inputs["input_ids"].to(device),
                attention_mask=inputs["attention_mask"].to(device),
                do_sample=False,
            )
        else:
            # Beam-search variant; unreachable while first_generation stays True.
            output_sequence = model.generate(
                input_ids=inputs["input_ids"].to(device),
                attention_mask=inputs["attention_mask"].to(device),
                do_sample=False,
                num_beams=2,
                repetition_penalty=2.5,
                # length_penalty=1.0,
                early_stopping=True,
            )
    text = postproc(
        transcribed_text,
        preds=tokenizer.decode(output_sequence[0], skip_special_tokens=True, clean_up_tokenization_spaces=True),
    )
    if len(text) > MAX_TXT_LEN:
        text = text[:MAX_TXT_LEN]
        print(f"Input text was cut off at the {MAX_TXT_LEN}-character limit.")
    print(text, model_name)
    # Download the TTS model plus its default vocoder and build the synthesizer.
    model_path, config_path, model_item = manager.download_model(f"tts_models/{model_name}")
    vocoder_name: Optional[str] = model_item["default_vocoder"]
    vocoder_path = None
    vocoder_config_path = None
    if vocoder_name is not None:
        vocoder_path, vocoder_config_path, _ = manager.download_model(vocoder_name)
    synthesizer = Synthesizer(
        model_path, config_path, None, None, vocoder_path, vocoder_config_path,
    )
    # Synthesize and write the result to a temp file Gradio can serve.
    wavs = synthesizer.tts(text, speaker_idx)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        synthesizer.save_wav(wavs, fp)
    return fp.name
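
# Quick local smoke test (assumes an "Example1.wav" next to this script):
#   print(predict_and_ctc_lm_decode("Example1.wav"))  # path to the synthesized wav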

description = """This is a Gradio demo for generating gender-neutralized audios. To use it, simply provide an audio input (via microphone or audio recording), which will then be transcribed and gender-neutralized using pre-trained models. Finally, with the help of Coqui's TTS model, gender neutralized audio is generated.

Pre-trained model used for Spanish ASR: [jonatasgrosman/wav2vec2-xls-r-1b-spanish](https://huggingface.co./jonatasgrosman/wav2vec2-xls-r-1b-spanish)

Pre-trained model used for Gender Neutralization: [hackathon-pln-es/es_text_neutralizer](https://huggingface.co./hackathon-pln-es/es_text_neutralizer)

Pre-trained model used for TTS: 🐸💬 Coqui TTS, model_name = "es/mai/tacotron2-DDC"

"""


article = """ **ACKNOWLEDGEMENT:**

**This project is based on the following Spaces:**

[CoquiTTS](https://huggingface.co./spaces/coqui/CoquiTTS)

[es_nlp_gender_neutralizer](https://huggingface.co./spaces/hackathon-pln-es/es_nlp_gender_neutralizer)

[Hindi_ASR](https://huggingface.co./spaces/anuragshas/Hindi_ASR)

"""


gr.Interface(
    predict_and_ctc_lm_decode,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", label="Record your audio")
    ],
    outputs=gr.outputs.Audio(label="Output"),
    examples=[["Example1.wav"], ["Example2.wav"], ["Example3.wav"]],
    title="Generate-Gender-Neutralized-Audios",
    description=description,
    article=article,
    layout="horizontal",
    theme="huggingface",
).launch(enable_queue=True, cache_examples=True)