ayaanzaveri committed
Commit a40289c
1 parent: 615514e

Create app.py

Files changed (1)
  1. app.py +65 -0
app.py ADDED
@@ -0,0 +1,65 @@
+ import pathlib
+ from faster_whisper import WhisperModel
+ import yt_dlp
+ import uuid
+ import os
+ import gradio as gr
+
+
+ # List of all supported video sites here https://github.com/yt-dlp/yt-dlp/blob/master/supportedsites.md
+ def download_convert_video_to_audio(
+     yt_dlp,
+     video_url: str,
+     destination_path: pathlib.Path,
+ ) -> None:
+     # Download the best available audio stream and convert it to mp3 with ffmpeg.
+     ydl_opts = {
+         "format": "bestaudio/best",
+         "postprocessors": [
+             {  # Extract audio using ffmpeg
+                 "key": "FFmpegExtractAudio",
+                 "preferredcodec": "mp3",
+             }
+         ],
+         "outtmpl": f"{destination_path}.%(ext)s",
+     }
+     try:
+         print(f"Downloading video from {video_url}")
+         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+             ydl.download([video_url])  # download() expects a list of URLs
+         print(f"Downloaded video from {video_url} to {destination_path}")
+     except Exception as e:
+         raise e  # re-raise so the caller surfaces the download error
+
+ # Convert a faster-whisper Segment (a named tuple) and its Word entries to plain dicts.
+ def segment_to_dict(segment):
+     segment = segment._asdict()
+     if segment["words"] is not None:
+         segment["words"] = [word._asdict() for word in segment["words"]]
+     return segment
+
+ def download_video(video_url: str):
+     download_convert_video_to_audio(yt_dlp, video_url, f"/content/{uuid.uuid4().hex}")
+
+ def transcribe_video(video_url: str, beam_size: int = 5, model_size: str = "tiny", word_timestamps: bool = True):
+     print("loading model")
+     model = WhisperModel(model_size, device="cpu", compute_type="int8")
+     print("getting hex")
+     rand_id = uuid.uuid4().hex
+     print("doing download")
+     # Paths assume a writable /content directory (as on Colab).
+     download_convert_video_to_audio(yt_dlp, video_url, f"/content/{rand_id}")
+     segments, info = model.transcribe(f"/content/{rand_id}.mp3", beam_size=beam_size, word_timestamps=word_timestamps)
+     segments = [segment_to_dict(segment) for segment in segments]
+     total_duration = round(info.duration, 2)  # Same precision as the Whisper timestamps.
+     print(info)
+     os.remove(f"/content/{rand_id}.mp3")
+     print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
+     print(segments)
+     return segments
+
+ # print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
+
+ # for segment in segments:
+ #     print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
+
+ demo = gr.Interface(fn=transcribe_video, inputs="text", outputs="text")
+
+ demo.launch()
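
For a quick sanity check without the Gradio UI, the transcription flow above could be exercised directly. A minimal sketch, assuming the functions above are in scope (e.g. run in a notebook cell), ffmpeg is installed, a writable /content directory exists, and using a placeholder URL:

test_url = "https://www.youtube.com/watch?v=XXXXXXXXXXX"  # placeholder URL, not from the commit
segments = transcribe_video(test_url, beam_size=5, model_size="tiny", word_timestamps=True)
for seg in segments:
    # Each entry is the dict produced by segment_to_dict above.
    print("[%.2fs -> %.2fs] %s" % (seg["start"], seg["end"], seg["text"]))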