Kushtrim committed on
Commit
c9e69f1
·
verified ·
1 Parent(s): 9f0bbf1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +163 -0
app.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor
2
+ from transformers.pipelines.audio_utils import ffmpeg_read
3
+ from huggingface_hub import login
4
+ import yt_dlp as youtube_dl
5
+ import gradio as gr
6
+ import tempfile
7
+ import spaces
8
+ import torch
9
+ import time
10
+ import os
11
+
12
# Authenticate with the Hugging Face Hub using the token stored in the "HF"
# environment variable (raises KeyError if the variable is not set).
login(os.environ["HF"], add_to_git_credential=True)

BATCH_SIZE = 16
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

# Half precision on the first CUDA device when one is available, otherwise
# full precision on the CPU.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id = "Kushtrim/whisper-base-shqip"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    use_safetensors=True,
    token=True,
)
model = model.to(device)

processor = AutoProcessor.from_pretrained(model_id, token=True)

# Speech-recognition pipeline shared by every request handler in this file.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
    torch_dtype=torch_dtype,
    device=device,
    token=os.environ["HF"],
)
27
+
28
+ # pipe = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor,
29
+ # max_new_tokens=128, chunk_length_s=15, batch_size=16, torch_dtype=torch_dtype, device=device,
30
+ # token=os.environ["HF"])
31
+
32
@spaces.GPU
def transcribe(inputs, task):
    """Transcribe an audio input with the module-level ``pipe``.

    Args:
        inputs: Audio input forwarded unchanged to ``pipe``. ``None`` means
            nothing was submitted.
        task: Value passed to generation as the ``"task"`` option.

    Returns:
        The ``"text"`` entry of the pipeline result.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is None:
        raise gr.Error(
            "No audio file submitted! Please upload or record an audio file before submitting your request."
        )

    # The language is pinned to "sq" for every request.
    generation_options = {"task": task, "language": "sq"}
    result = pipe(
        inputs,
        generate_kwargs=generation_options,
        return_timestamps=True,
    )
    return result["text"]
41
+
42
+
43
+ def _return_yt_html_embed(yt_url):
44
+ video_id = yt_url.split("?v=")[-1]
45
+ HTML_str = (
46
+ f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
47
+ " </center>"
48
+ )
49
+ return HTML_str
50
+
51
+
52
def download_yt_audio(yt_url, filename):
    """Download a YouTube video to ``filename`` after checking its length.

    Metadata is fetched first without downloading, so that videos longer than
    ``YT_LENGTH_LIMIT_S`` seconds are rejected before any media is transferred.

    Args:
        yt_url: URL of the video to download.
        filename: Output path handed to yt-dlp as its ``outtmpl`` option.

    Raises:
        gr.Error: If metadata extraction or the download fails, if the length
            of the video cannot be determined, or if the video is longer than
            ``YT_LENGTH_LIMIT_S``.
    """
    try:
        # Context manager so the metadata-only instance is closed again.
        with youtube_dl.YoutubeDL() as info_loader:
            info = info_loader.extract_info(yt_url, download=False)
    except youtube_dl.utils.DownloadError as err:
        raise gr.Error(str(err)) from err

    # Prefer the numeric "duration" field (seconds) and fall back to parsing
    # the "[[H:]M:]S" string. Either may be absent when no duration is
    # reported, which previously surfaced as a bare KeyError.
    duration = info.get("duration")
    if duration is not None:
        file_length_s = int(duration)
    else:
        duration_string = info.get("duration_string")
        if not duration_string:
            raise gr.Error("Could not determine the length of the YouTube video.")
        file_length_s = 0
        for sub_length in duration_string.split(":"):
            file_length_s = file_length_s * 60 + int(sub_length)

    if file_length_s > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime(
            "%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime(
            "%HH:%MM:%SS", time.gmtime(file_length_s))
        raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")

    ydl_opts = {
        "outtmpl": filename,
        "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
    }

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        except (youtube_dl.utils.DownloadError, youtube_dl.utils.ExtractorError) as err:
            # Catch DownloadError here as well, matching the metadata step
            # above, so download failures reach the UI as gr.Error too.
            raise gr.Error(str(err)) from err
85
+
86
+
87
def yt_transcribe(yt_url, task, max_filesize=75.0):
    """Download a YouTube video and transcribe its audio track.

    Args:
        yt_url: URL of the YouTube video.
        task: Value passed to generation as the ``"task"`` option.
        max_filesize: Accepted for interface compatibility; not used by this
            function.

    Returns:
        A ``(html, text)`` tuple: the embed snippet produced by
        ``_return_yt_html_embed`` and the ``"text"`` entry of the pipeline
        result.

    Raises:
        gr.Error: If no URL was submitted, or if ``download_yt_audio``
            rejects or fails to fetch the video.
    """
    if not yt_url or not yt_url.strip():
        raise gr.Error(
            "No YouTube URL submitted! Please paste a YouTube URL before submitting your request.")

    html_embed_str = _return_yt_html_embed(yt_url)

    # The raw bytes are read while the temporary directory still exists; the
    # directory and the downloaded file are removed when the block exits.
    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "video.mp4")
        download_yt_audio(yt_url, filepath)
        with open(filepath, "rb") as f:
            inputs = f.read()

    sampling_rate = pipe.feature_extractor.sampling_rate
    inputs = ffmpeg_read(inputs, sampling_rate)
    inputs = {"array": inputs, "sampling_rate": sampling_rate}

    # Pin the language to "sq", matching transcribe(), so both entry points
    # decode with the same settings.
    text = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task, "language": "sq"},
        return_timestamps=True,
    )["text"]

    return html_embed_str, text
104
+
105
+
106
demo = gr.Blocks()

# Title and description text shared by all three tabs.
_TITLE = "Whisper Large V3 Turbo Shqip: Transcribe Audio"
_DESCRIPTION = (
    "Easily transcribe long-form audio inputs in Albanian with high accuracy! This demo utilizes the fine-tuned "
    f"Whisper model [{model_id}](https://huggingface.co/{model_id}), specially adapted for the Albanian language, "
    "powered by 🤗 Transformers. With just a click, transform microphone or audio file inputs of any length into "
    "text with exceptional transcription quality."
)

# Tab for uploaded audio files.
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["upload"], type="filepath", label="Audio file"),
        gr.Radio(choices=["transcribe"], label="Task"),
    ],
    outputs="text",
    title=_TITLE,
    description=_DESCRIPTION,
    allow_flagging="never",
)

# Tab for microphone recordings.
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["microphone"], type="filepath"),
        gr.Radio(choices=["transcribe"], label="Task"),
    ],
    outputs="text",
    title=_TITLE,
    description=_DESCRIPTION,
    allow_flagging="never",
)

# Tab for YouTube URLs. The function is passed as ``fn`` before the name
# ``yt_transcribe`` is rebound to the resulting Interface object.
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(
            lines=1,
            placeholder="Paste the URL to a YouTube video here",
            label="YouTube URL",
        ),
        gr.Radio(choices=["transcribe"], label="Task"),
    ],
    outputs=["html", "text"],
    title=_TITLE,
    description=_DESCRIPTION,
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface(
        [mf_transcribe, file_transcribe, yt_transcribe],
        ["Microphone", "Audio file", "YouTube"],
    )

demo.launch()