# ytdlp-whisper / app.py
import os
import re
import subprocess

import gradio as gr
import whisper
import yt_dlp
from fastapi import FastAPI
from fastapi.responses import PlainTextResponse
from pytube import YouTube

app = FastAPI()

# "None" means no forced language; Whisper auto-detects in that case.
langs = ["None"] + sorted(list(whisper.tokenizer.LANGUAGES.values()))
model_size = list(whisper._MODELS.keys())

@app.get("/subtitle")
async def subtitle_endpoint(url: str):
    # Look up the subtitle text with get_subtitle() (defined below)
    subtitle = get_subtitle(url)
    if subtitle is None:
        return PlainTextResponse("No subtitles found for this video.", status_code=404)
    # Return the cleaned subtitle content as plain text
    return PlainTextResponse(subtitle)

def get_subtitle(url, lang='en'):
    # Download only the subtitle track (skip the media itself), if one is available
    ydl_opts = {
        'writesubtitles': True,
        'outtmpl': '%(id)s.%(ext)s',
        'subtitleslangs': [lang],
        'skip_download': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=True)
        video_id = info_dict.get("id", None)
        if video_id is None:
            return None
        subtitle_file = f"{video_id}.{lang}.vtt"
        # The .vtt file is only written when the video has subtitles in `lang`
        if os.path.exists(subtitle_file):
            with open(subtitle_file, 'r') as f:
                subtitle_content = f.read()
            # Strip inline WebVTT/HTML tags such as <c> and cue timestamps
            subtitle_content = re.sub(r"<[^>]+>", "", subtitle_content)
            return subtitle_content
    return None
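
# Illustrative usage (the URL and the availability of an English track are assumptions):
#   text = get_subtitle("https://www.youtube.com/watch?v=<video_id>", lang="en")
#   returns the WebVTT content with tags stripped, or None when no subtitle file was written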

def download_audio(video_url, quality: str = '128', speed: float = None):
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': '%(title)s.%(ext)s',
        'quiet': True,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',  # 'opus'
            'preferredquality': quality,
        }]
    }
    if speed:
        # yt-dlp has no 'FFmpegFilter' postprocessor; hand the atempo filter
        # to ffmpeg via postprocessor_args instead.
        ydl_opts['postprocessor_args'] = ['-filter:a', f'atempo={speed}']
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(video_url, download=True)
        # prepare_filename() reports the pre-postprocessing name; the extracted
        # audio is written alongside it with an .mp3 extension.
        audio_file = os.path.splitext(ydl.prepare_filename(info))[0] + '.mp3'
    print('audio_file', audio_file)
    return audio_file
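
# Illustrative usage (the output file name depends on the video title):
#   path = download_audio("https://www.youtube.com/watch?v=<video_id>", quality='128', speed=1.25)
#   downloads the best audio stream, converts it to mp3, and returns e.g. "Some Title.mp3"
# Note: ffmpeg's atempo filter only accepts speed factors of at least 0.5.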

def get_audio(url):
    # pytube fallback: download the first audio-only stream as tmp.mp4
    yt = YouTube(url)
    return yt.streams.filter(only_audio=True)[0].download(filename="tmp.mp4")

def get_transcript(url, model_size, lang, format):
    if lang == "None":
        lang = None
    # Prefer an existing subtitle track; fall back to Whisper transcription if none is found.
    # Note: get_subtitle() expects an ISO 639-1 code (e.g. 'en'), while the dropdown
    # supplies full language names, so the subtitle lookup may return None.
    subtitle = get_subtitle(url, lang) if lang else get_subtitle(url)
    print(subtitle)
    if subtitle:
        return subtitle
    model = whisper.load_model(model_size)
    result = model.transcribe(download_audio(url), fp16=False, language=lang)
    if format == "None":
        return result["text"]
    elif format == ".srt":
        return format_to_srt(result["segments"])

def format_to_srt(segments):
    # Render Whisper segments as SubRip (.srt) blocks: index, time range, text
    output = ""
    for i, segment in enumerate(segments):
        output += f"{i + 1}\n"
        output += f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
        output += f"{segment['text']}\n\n"
    return output
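
# Example of the SubRip layout produced above (illustrative values):
#   1
#   00:00:00,000 --> 00:00:02,500
#   Hello world.
#
#   2
#   00:00:02,500 --> 00:00:05,000
#   Second segment.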

def format_timestamp(t):
    # Convert a time in seconds to the SRT timestamp form HH:MM:SS,mmm
    hh = t // 3600
    mm = (t - hh * 3600) // 60
    ss = t - hh * 3600 - mm * 60
    mi = (t - int(t)) * 1000
    return f"{int(hh):02d}:{int(mm):02d}:{int(ss):02d},{int(mi):03d}"

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            with gr.Row():
                url = gr.Textbox(placeholder='Youtube video URL', label='URL')
            with gr.Row():
                model_size = gr.Dropdown(choices=model_size, value='tiny', label="Model")
                lang = gr.Dropdown(choices=langs, value="None", label="Language (Optional)")
                format = gr.Dropdown(choices=["None", ".srt"], value="None", label="Timestamps? (Optional)")
            with gr.Row():
                gr.Markdown("Larger models are more accurate, but slower. For a 1-minute video, expect roughly 30s (tiny), 1min (base), 3min (small), 5min (medium), and so on.")
                transcribe_btn = gr.Button('Transcribe')
        with gr.Column():
            outputs = gr.Textbox(placeholder='Transcription of the video', label='Transcription')
    transcribe_btn.click(get_transcript, inputs=[url, model_size, lang, format], outputs=outputs)

demo.launch(debug=True)
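
# Note: demo.launch() only serves the Gradio UI. To also expose the /subtitle route,
# one option (a sketch, not part of the original setup) is to mount the Blocks app
# onto the FastAPI instance and serve both with uvicorn instead of demo.launch():
#   app = gr.mount_gradio_app(app, demo, path="/")
#   run with: uvicorn app:app --host 0.0.0.0 --port 7860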