lanbogao committed on
Commit
d030b89
1 Parent(s): de8dee7

Add get subtitle with type support.

Browse files
Files changed (2) hide show
  1. fetchYoutubeSubtitle.py +85 -16
  2. main.py +2 -2
fetchYoutubeSubtitle.py CHANGED
@@ -1,16 +1,51 @@
1
  import json
 
 
2
  from typing import Optional
 
 
3
  import yt_dlp
4
 
5
- def getVttUrlFromSubtitles(item, lang='en', vttType="vtt"):
 
 
 
 
 
 
 
 
 
 
 
 
6
  langs = item.keys()
7
- key = lang if lang in langs else ('en' if 'en' in langs else list(langs)[0] )
8
- for subtitle in item[key]:
9
- if(subtitle.get("ext") == vttType):
 
 
 
 
 
 
 
 
10
  return subtitle.get("url")
11
  return None
12
 
13
- async def fetchSubtitle(url: str, lang: Optional[str] = 'en', vttType="vtt") -> Optional[str]:
 
 
 
 
 
 
 
 
 
 
 
14
  ydl_opts = {
15
  "writesubtitles": True,
16
  "allsubtitles": True,
@@ -21,19 +56,53 @@ async def fetchSubtitle(url: str, lang: Optional[str] = 'en', vttType="vtt") ->
21
 
22
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
23
  info_dict = ydl.extract_info(url, download=False)
24
- if info_dict.get("subtitles"):
25
- # get first available subtitle
26
- subtitle_url = getVttUrlFromSubtitles(info_dict.get("subtitles"), lang, vttType)
27
- if subtitle_url:
28
- with ydl.urlopen(subtitle_url) as subtitle:
29
- return subtitle.read().decode()
30
- if info_dict.get("automatic_captions"):
31
- subtitle_url = getVttUrlFromSubtitles(info_dict.get("automatic_captions"), lang, vttType)
32
- if subtitle_url:
33
- with ydl.urlopen(subtitle_url) as subtitle:
34
- return subtitle.read().decode()
35
  return None
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  async def fetchSubtitleUrls(url: str) -> json:
38
  ydl_opts = {
39
  "writesubtitles": True,
 
1
  import json
2
+ import math
3
+ import time
4
  from typing import Optional
5
+ import xml.etree.ElementTree as ElementTree
6
+ from html import unescape
7
  import yt_dlp
8
 
9
+ # yt-dlp subtitle types: json3, srv1, srv2, srv3, ttml, vtt; "xml" means the YouTube subtitle URL with its fmt=<ext> argument removed
10
+
11
+ # "subtitles": {
12
+ # "live_chat": [
13
+ # {
14
+ # "url": "https://www.youtube.com/watch?v=ANtM2bHRz04&bpctr=9999999999&has_verified=1",
15
+ # "ext": "json",
16
+ # "video_id": "ANtM2bHRz04",
17
+ # "protocol": "youtube_live_chat_replay"
18
+ # }
19
+ # ]
20
+ # }
21
def getUrlFromSubtitles(item, lang='en', subType="vtt"):
    """Pick a subtitle URL of the requested type from a subtitle map.

    :param item: mapping of language code -> list of subtitle entries, where
        each entry is a dict read via its ``"url"`` and ``"ext"`` keys
        (presumably yt-dlp's ``subtitles`` / ``automatic_captions`` value;
        confirm against the caller).
    :param lang: preferred language code. Falls back to ``'en'`` and then to
        the first language present in ``item``.
    :param subType: wanted ``ext`` value. The pseudo-type ``"xml"`` returns
        the first usable entry's URL with its ``fmt=<ext>`` text removed
        (not applied to the ``"live_chat"`` track).
    :returns: the selected URL, or ``None`` when nothing matches.
    """
    # Local import keeps this block self-contained; debug output goes through
    # logging instead of unconditionally printing the whole map to stdout.
    import logging

    if not item:
        return None

    if lang in item:
        key = lang
    elif 'en' in item:
        key = 'en'
    else:
        key = next(iter(item))
    logging.getLogger(__name__).debug("getUrlFromSubtitles lang: %s", key)

    for subtitle in item[key]:
        url = subtitle.get("url")
        if url is None:
            # An entry without a URL can never be returned; try the next one.
            continue
        ext = subtitle.get("ext")
        if key != "live_chat" and subType == "xml":
            # Dropping the fmt argument yields the plain XML caption track.
            return url.replace("fmt=" + ext, "") if ext else url
        if ext == subType:
            return url
    return None
36
 
37
async def fetchSubtitle(url: str, lang: Optional[str] = 'en', subType: Optional[str] = "vtt") -> Optional[str]:
    """Fetch the subtitle text for ``url`` in the requested type.

    For every type the track is requested directly via
    ``fetchSubtitlebyType``. For ``"srt"`` only, if that direct request yields
    nothing, the ``"xml"`` track is fetched and converted with
    ``xml_caption_to_srt``.

    :param url: video URL handed to ``fetchSubtitlebyType``.
    :param lang: preferred subtitle language.
    :param subType: subtitle type to return (e.g. ``"vtt"``, ``"srt"``).
    :returns: the decoded subtitle text, or ``None`` when no track is found.
    """
    subtitle = await fetchSubtitlebyType(url, lang, subType, True)
    if subType != "srt" or subtitle:
        return subtitle

    # No ready-made srt track: build one from the XML captions instead.
    xml_captions = await fetchSubtitlebyType(url, lang, "xml", True)
    if not xml_captions:
        # Nothing to convert; parsing None/"" would raise inside the converter.
        return None
    return xml_caption_to_srt(xml_captions)
47
+
48
+ async def fetchSubtitlebyType(url: str, lang: Optional[str] = 'en', subType="vtt", decode: bool = False) -> Optional[str]:
49
  ydl_opts = {
50
  "writesubtitles": True,
51
  "allsubtitles": True,
 
56
 
57
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
58
  info_dict = ydl.extract_info(url, download=False)
59
+ for subtitle_item in ["subtitles", "automatic_captions"]: # "requested_subtitles" item is dict
60
+ if info_dict.get(subtitle_item) :
61
+ subtitle_url = getUrlFromSubtitles(info_dict.get(subtitle_item), lang, subType)
62
+ if subtitle_url:
63
+ with ydl.urlopen(subtitle_url) as subtitle:
64
+ return subtitle.read().decode() if decode else subtitle.read()
65
+
 
 
 
 
66
  return None
67
 
68
def float_to_srt_time_format(d: float) -> str:
    """Convert a duration in seconds into a SubRip (srt) timestamp.

    The value is rounded to whole milliseconds first, so a fraction that
    rounds up carries into the seconds field (3.9996 -> '00:00:04,000').
    Hours are not wrapped at 24. Negative input is clamped to zero because
    srt has no representation for negative times.

    :param d: time offset in seconds.
    :returns: timestamp formatted as ``HH:MM:SS,mmm``.

    float_to_srt_time_format(3.89) -> '00:00:03,890'
    """
    total_ms = max(0, round(d * 1000))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
79
+
80
def xml_caption_to_srt(xml_captions: str) -> str:
    """Convert xml caption tracks to "SubRip Subtitle (srt)".

    Each direct child of the XML root becomes one numbered srt cue. Its
    ``start`` attribute (seconds) is required; ``dur`` is optional and
    defaults to 0. Only the child's own text is used, with newlines turned
    into spaces and HTML entities unescaped.

    :param str xml_captions:
        XML formatted caption tracks.
    :returns: the srt document as a single string (empty if the root has no
        children).
    :raises xml.etree.ElementTree.ParseError: if ``xml_captions`` is not
        well-formed XML.
    :raises KeyError: if a child element has no ``start`` attribute.
    """
    root = ElementTree.fromstring(xml_captions)
    segments = []
    # srt cue numbers are 1-based.
    for sequence_number, child in enumerate(root, start=1):
        text = child.text or ""
        # Newlines become spaces; a resulting double space is collapsed.
        caption = unescape(text.replace("\n", " ").replace("  ", " "))
        duration = float(child.attrib.get("dur", 0.0))
        start = float(child.attrib["start"])
        end = start + duration
        segments.append(
            "{seq}\n{start} --> {end}\n{text}\n".format(
                seq=sequence_number,
                start=float_to_srt_time_format(start),
                end=float_to_srt_time_format(end),
                text=caption,
            )
        )
    return "\n".join(segments).strip()
105
+
106
  async def fetchSubtitleUrls(url: str) -> json:
107
  ydl_opts = {
108
  "writesubtitles": True,
main.py CHANGED
@@ -15,8 +15,8 @@ def read_json():
15
 
16
 
17
  @app.get("/subtitle/")
18
- async def get_subtitle(url: str):
19
- subtitle = await fetchSubtitle(url)
20
  return JSONResponse(content=subtitle)
21
 
22
 
 
15
 
16
 
17
@app.get("/subtitle/")
async def get_subtitle(url: str, subtype: str = "srt"):
    # GET /subtitle/ handler: fetch the subtitle for `url` in the requested
    # `subtype` (defaults to "srt") and return it as the JSON response body.
    # The content is whatever fetchSubtitle returns, which may be None.
    return JSONResponse(content=await fetchSubtitle(url, subType=subtype))
21
 
22