File size: 2,350 Bytes
52f8ce8
5d656c3
9cdc5fa
 
5d656c3
52f8ce8
f46c80d
da90409
52f8ce8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f46c80d
9cdc5fa
 
 
 
 
f46c80d
52f8ce8
 
9cdc5fa
52f8ce8
5d656c3
f115402
5d656c3
9cdc5fa
5d656c3
52f8ce8
 
9cdc5fa
5d656c3
 
52f8ce8
5d656c3
52f8ce8
48b0c57
9cdc5fa
5d656c3
 
 
52f8ce8
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import pathlib
from faster_whisper import WhisperModel
import yt_dlp
import uuid
import os
import gradio as gr


# The full list of video sites supported by yt-dlp is at
# https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md
def download_convert_video_to_audio(
    yt_dlp,
    video_url: str,
    destination_path: pathlib.Path,
) -> None:
    """Download the best available audio of a video and extract it as MP3.

    Args:
        yt_dlp: Object exposing a ``YoutubeDL`` class usable as a context
            manager. It is injected rather than referenced globally, so a
            stand-in can be supplied.
        video_url: URL of the video to download.
        destination_path: Output location *without* a file extension. It is
            interpolated into the output template, so any value whose string
            form is a path works; the extension is filled in by the
            downloader / post-processor.

    Raises:
        Any exception raised by ``YoutubeDL`` is propagated unchanged.
    """
    ydl_opts = {
        "format": "bestaudio/best",
        "postprocessors": [
            {  # Extract audio using ffmpeg
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
            }
        ],
        "outtmpl": f"{destination_path}.%(ext)s",
    }
    print(f"Downloading video from {video_url}")
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # download() is documented to take a list of URLs; wrap the single
        # URL explicitly instead of relying on the library to coerce a string.
        ydl.download([video_url])
    print(f"Downloaded video from {video_url} to {destination_path}")

def segment_to_dict(segment):
    """Turn a segment into a plain dict, converting its words as well.

    ``segment`` must expose ``_asdict()`` and the resulting mapping must have
    a ``"words"`` entry. When that entry is not ``None``, every word in it is
    converted through its own ``_asdict()``.
    """
    as_dict = segment._asdict()
    words = as_dict["words"]
    if words is None:
        # No word-level data: nothing further to convert.
        return as_dict
    as_dict["words"] = list(map(lambda w: w._asdict(), words))
    return as_dict

def download_video(video_url: str):
    """Download the audio of ``video_url`` to a uniquely named file under ``/content``."""
    # A random hex name keeps separate downloads from overwriting each other.
    unique_name = uuid.uuid4().hex
    target = f"/content/{unique_name}"
    download_convert_video_to_audio(yt_dlp, video_url, target)

def transcribe_video(video_url: str, beam_size: int = 5, model_size: str = "tiny", word_timestamps: bool = True):
    """Download a video's audio and transcribe it with faster-whisper.

    Args:
        video_url: URL of the video to transcribe.
        beam_size: Beam size forwarded to ``model.transcribe``.
        model_size: Model identifier passed to ``WhisperModel``.
        word_timestamps: Forwarded to ``model.transcribe``; controls whether
            each segment carries per-word entries.

    Returns:
        A list with one dict per segment, as produced by ``segment_to_dict``.

    Raises:
        Any exception from the download or the transcription is propagated;
        the temporary MP3 is removed either way once the download has run.
    """
    print("loading model")
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    print("getting hex")
    rand_id = uuid.uuid4().hex
    audio_path = f"/content/{rand_id}.mp3"
    print("doing download")
    download_convert_video_to_audio(yt_dlp, video_url, f"/content/{rand_id}")
    try:
        segments, info = model.transcribe(audio_path, beam_size=beam_size, word_timestamps=word_timestamps)
        # Materialize the segments while the audio file still exists: the
        # segments iterable must be fully consumed before the file is removed.
        segments = [segment_to_dict(segment) for segment in segments]
    finally:
        # Always clean up the temporary audio, even when transcription fails,
        # so failed requests do not leave MP3 files behind.
        try:
            os.remove(audio_path)
        except FileNotFoundError:
            # Nothing to clean up; do not mask the original error.
            pass
    print(info)
    print(f"Detected language '{info.language}' with probability {info.language_probability:f}")
    print(segments)
    return segments

# print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

# for segment in segments:
#     print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

# Gradio UI: a single text input (the video URL) wired to transcribe_video,
# with the returned segments rendered as text.
demo = gr.Interface(fn=transcribe_video, inputs="text", outputs="text")

if __name__ == "__main__":
    # Start the web server only when executed as a script, so importing this
    # module (e.g. to reuse or test the functions) does not launch the app.
    demo.launch()