File size: 4,598 Bytes
b4c7847
4300fed
b4c7847
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
efd546f
 
 
 
 
 
 
4300fed
f6147bb
 
 
 
 
 
 
 
 
4300fed
b4c7847
efd546f
 
 
f6147bb
b4c7847
efd546f
b4c7847
efd546f
70cbf96
 
b4c7847
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import spaces
import gradio as gr
import torch
from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
from string import punctuation
import re


from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Identifier handed to every `from_pretrained` call below.
repo_id = "PHBJT/french_parler_tts_mini_v0.1"

# Load the TTS model and move it to the selected device.
model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id)
model = model.to(device)

# The tokenizer and feature extractor are loaded from the same repository.
tokenizer = AutoTokenizer.from_pretrained(repo_id)
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)

# Sampling rate returned alongside the generated audio by `gen_tts`.
SAMPLE_RATE = feature_extractor.sampling_rate
# Seed applied before every generation call.
SEED = 42

# Default prompt text and voice description.
default_text = "La voix humaine est un instrument de musique au-dessus de tous les autres."
default_description = "A male voice speaks slowly with a very noisy background, displaying a touch of expressiveness and animation. The sound is very distant, adding an air of intrigue."

# Each example is [text, voice description, None]; the first one reuses the
# defaults defined just above.
examples = [
    [
        default_text,
        default_description,
        None,
    ],
    [
        "Tout ce qu'un homme est capable d'imaginer, d'autres hommes seront capables de le réaliser.",
        "A male voice delivers a slightly expressive and animated speech with a moderate speed. The recording features a low-pitch voice, creating a close-sounding audio experience.",
        None,
    ],
    [
        "La machine elle-même, si perfectionnée qu'on la suppose, n'est qu'un outil.",
        "A male voice provides a monotone yet slightly fast delivery, with a very close recording that almost has no background noise.",
        None,
    ],
    [
        "Le progrès fait naître plus de besoins qu'il n'en satisfait.",
        "A female voice, in a very poor recording quality, delivers slightly expressive and animated words with a fast pace. There's a high level of background noise and a very distant-sounding reverberation. The voice is slightly higher pitched than average.",
        None,
    ],
]

# Shared normalizer instance used by `preprocess`.
number_normalizer = EnglishNumberNormalizer()

def preprocess(text):
    """Normalize raw text before it is tokenized as the TTS prompt.

    Steps, in order: run the module-level ``number_normalizer`` over the text
    and strip surrounding whitespace, replace hyphens with spaces, append a
    final period when the text does not already end with an ASCII punctuation
    character, and spell out upper-case abbreviations letter by letter
    (``"SNCF"`` becomes ``"S N C F"``, dots inside the abbreviation are dropped).

    Args:
        text: The text to be read aloud.

    Returns:
        The normalized text, or an empty string when nothing is left after
        stripping.
    """
    text = number_normalizer(text).strip()
    if not text:
        # Nothing to read; also avoids indexing into an empty string below.
        return text

    text = text.replace("-", " ")
    if text[-1] not in punctuation:
        text = f"{text}."

    def separate_abb(match):
        # Drop the dots and put a space between the remaining letters.
        return " ".join(match.group(0).replace(".", ""))

    # Substitute each match in place in a single pass, so only the matched
    # spans are rewritten and not every occurrence of the same characters
    # elsewhere in the text.
    return re.sub(r'\b[A-Z][A-Z\.]+\b', separate_abb, text)

@spaces.GPU
def gen_tts(text, description):
    """Generate audio for ``text`` conditioned on a voice ``description``.

    The stripped description and the preprocessed text are tokenized
    separately and both passed to ``model.generate``. The seed is reset to
    ``SEED`` before every call.

    Args:
        text: The text to read aloud; it goes through ``preprocess`` first.
        description: Free-form description of the desired voice.

    Returns:
        A ``(SAMPLE_RATE, audio_arr)`` tuple, where ``audio_arr`` is the
        generation output moved to the CPU, converted to a NumPy array and
        squeezed.
    """
    description_tokens = tokenizer(description.strip(), return_tensors="pt").to(device)
    prompt_tokens = tokenizer(preprocess(text), return_tensors="pt").to(device)

    set_seed(SEED)
    generate_kwargs = {
        "input_ids": description_tokens.input_ids,
        "prompt_input_ids": prompt_tokens.input_ids,
        "attention_mask": description_tokens.attention_mask,
        "prompt_attention_mask": prompt_tokens.attention_mask,
        "do_sample": True,
        "temperature": 1.0,
    }
    waveform = model.generate(**generate_kwargs)

    return SAMPLE_RATE, waveform.cpu().numpy().squeeze()


def extract_text(file, max_pages=10):
    """Extract the text of the first pages of a PDF document.

    Args:
        file: The PDF to read, passed straight to ``pypdf.PdfReader``.
            ``None`` (no document provided) yields an empty string.
        max_pages: Maximum number of leading pages to extract. Defaults to 10;
            ``None`` extracts every page.

    Returns:
        The text of the selected pages, one page after another, separated by
        newlines.
    """
    if file is None:
        # Nothing was uploaded: return no text instead of failing in PdfReader.
        return ''

    from pypdf import PdfReader

    reader = PdfReader(file)
    # Separate pages with a newline so the last word of one page is not glued
    # to the first word of the next.
    return '\n'.join(page.extract_text() for page in reader.pages[:max_pages])

# Gradio UI: upload a PDF, extract its text, then have the text read aloud.
with gr.Blocks() as demo:
    # The introduction names Parler-TTS, the model family this file loads.
    gr.Markdown("""# PDF reader

Un lecteur pdf construit avec [Parler-TTS](https://github.com/huggingface/parler-tts).

### Comment l'utiliser ?

1. Téléversez le document pdf à lire.
2. Cliquez sur "Extraire le texte" pour extraire les 10 premières pages.
3. Cliquez sur "Réciter le texte" pour générer l'audio.""")
    with gr.Group():
        # Voice description forwarded to `gen_tts` together with the text.
        speaker_description = gr.Textbox(value='A male voice delivers a slightly expressive and animated speech with a quick speed. The recording features a low-pitch voice, creating a close-sounding audio experience.', label='Description de la voix')
        file = gr.File(label="Document à lire")
    btn_extract = gr.Button('Extraire le texte', variant='primary')
    # Holds the extracted text; it is also the input of the speech generation.
    text = gr.Textbox(label="Texte extrait")
    btn = gr.Button('Réciter le texte', variant='primary')
    audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")
    # Step 1: PDF -> text box. Step 2: text box + voice description -> audio.
    btn_extract.click(extract_text, inputs=[file], outputs=[text])
    btn.click(gen_tts, inputs=[text, speaker_description], outputs=[audio_out])
    gr.Markdown('Demo by [m-ric](https://x.com/AymericRoucher).')


# Enable the request queue and start the app.
demo.queue(api_open=True, default_concurrency_limit=10).launch(show_api=True, share=True)