lanbogao committed on
Commit
73a9bbe
1 Parent(s): 3e21f52

1. Fix empty automatic_captions for some videos (e.g. VSFg5LZYsyc).

Browse files

3. Support subtitle languages given in lang-code format.
2. Add requested_subtitles and parse it first.

Files changed (1) hide show
  1. fetchYoutubeSubtitle.py +166 -46
fetchYoutubeSubtitle.py CHANGED
@@ -1,13 +1,17 @@
 
1
  import json
2
  import math
3
  import time
 
4
  from typing import Optional
5
  import xml.etree.ElementTree as ElementTree
6
  from html import unescape
7
  import yt_dlp
8
 
 
9
  # yt-dlp subtitle types: json3, srv1, srv2, srv3, ttml, vtt, xml (YouTube URL without the &fmt=<ext> argument)
10
 
 
11
  # "subtitles": {
12
  # "live_chat": [
13
  # {
@@ -18,53 +22,134 @@ import yt_dlp
18
  # }
19
  # ]
20
  # }
21
- def getUrlFromSubtitles(item, lang='en', subType="vtt"):
22
- langs = item.keys()
23
- if len(langs) == 0:
24
- return None
25
-
26
- l = lang if lang in langs else ('en' if 'en' in langs else list(langs)[0] )
27
- if l is None:
28
- return
29
-
30
- for subtitle in item[l]:
31
- # print("getUrlFromSubtitles subtitle: %s" % subtitle)
32
- if l != "live_chat" and subType =="xml":
33
- # print("subtitle source url: {}".format(subtitle.get("url")))
34
- return subtitle.get("url").replace("&fmt="+subtitle.get("ext"),"")
35
  if subtitle.get("ext") == subType:
 
 
36
  return subtitle.get("url")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  return None
38
 
39
- async def fetchSubtitle(url: str, lang: Optional[str] = 'en', subType: Optional[str] = "vtt", proxy: Optional[str] = None) -> dict:
 
 
 
 
 
 
40
  return await fetchSubtitlebyType(url, lang, subType, proxy)
41
 
42
- async def fetchSubtitlebyType(url: str, lang: Optional[str] = 'en', subType: Optional[str] = "vtt", proxy: Optional[str] = None) -> dict:
 
 
 
 
 
 
 
 
 
43
  ydl_opts = {
44
  "noplaylist": True,
45
- "writesubtitles": False,
46
- "allsubtitles": True,
47
- "subtitleslangs": [lang] if lang else [],
 
 
48
  "skip_download": True,
49
  "socket_timeout": 10,
50
  "extractor_retries": 0,
51
  "extractor_args": {
52
  "youtube": {
53
  "player_skip": ["configs", "initial"], # "webpage",
54
- "player_client": ["android"],
55
- "skip": ["hls", "dash", "translated_subs"],
56
  }
57
  },
58
  }
59
 
60
  if proxy:
61
  ydl_opts.update({"proxy": proxy, "socket_timeout": 20})
62
-
63
  title = "unknow"
64
  duration = ""
65
  try:
66
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
67
  info_dict = ydl.extract_info(url, download=False)
 
68
  title = info_dict.get("title", "unknow")
69
  seconds = info_dict.get("duration")
70
  duration = str(seconds) if seconds else ""
@@ -72,21 +157,46 @@ async def fetchSubtitlebyType(url: str, lang: Optional[str] = 'en', subType: Opt
72
  if info_dict.get("extractor") == "youtube" and subType == "srt":
73
  subType = "xml"
74
  isSrt = True
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- # print("subtitles.keys(): {} automatic_captions: {}".format( info_dict.get("subtitles").keys(),info_dict.get("automatic_captions").keys()))
77
- for subtitle_item in ["subtitles", "automatic_captions"]: # "requested_subtitles" item is dict
78
- if info_dict.get(subtitle_item):
79
- subtitle_url = getUrlFromSubtitles(info_dict.get(subtitle_item), lang, subType)
80
- if subtitle_url:
81
- # print("subtitle_url: {}".format(subtitle_url))
82
- with ydl.urlopen(subtitle_url) as response:
83
- subtitle = xml_caption_to_srt(response.read().decode()) if isSrt else response.read().decode()
84
- print("url{}, title:{}, duration:{} len(subtitle): {}".format(url, title, duration, len(subtitle)))
85
- return {"title": title, "duration": duration,"subtitle": subtitle, "chapters":info_dict.get("chapters", None) }
 
 
 
 
 
 
 
 
 
86
  except Exception as e:
 
 
87
  return {"error": str(e)}
88
  return {"title": title, "duration": duration, "error": "No subtitles"}
89
 
 
90
  def float_to_srt_time_format(d: float) -> str:
91
  """Convert decimal durations into proper srt format.
92
  :rtype: str
@@ -99,7 +209,8 @@ def float_to_srt_time_format(d: float) -> str:
99
  ms = f"{fraction:.3f}".replace("0.", "")
100
  return time_fmt + ms
101
 
102
- def xml_caption_to_srt( xml_captions: str) -> str:
 
103
  """Convert xml caption tracks to "SubRip Subtitle (srt)".
104
  :param str xml_captions:
105
  XML formatted caption tracks.
@@ -108,7 +219,9 @@ def xml_caption_to_srt( xml_captions: str) -> str:
108
  root = ElementTree.fromstring(xml_captions)
109
  for i, child in enumerate(list(root)):
110
  text = child.text or ""
111
- caption = unescape(text.replace("\n", " ").replace(" ", " "),)
 
 
112
  try:
113
  duration = float(child.attrib["dur"])
114
  except KeyError:
@@ -125,25 +238,27 @@ def xml_caption_to_srt( xml_captions: str) -> str:
125
  segments.append(line)
126
  return "\n".join(segments).strip()
127
 
 
128
  async def fetchSubtitleUrls(url: str, proxy: Optional[str] = None) -> json:
129
  ydl_opts = {
130
  "noplaylist": True,
131
- "writesubtitles": False,
132
- "allsubtitles": True,
133
- "skip_download": True,
 
134
  "socket_timeout": 10,
135
  "extractor_retries": 0,
136
- # "extractor_args": {
137
- # "youtube": {
138
- # "player_skip": ["webpage", "configs", "initial"],
139
- # "player_client": ["android"],
140
- # "skip": ["hls", "dash", "translated_subs"],
141
- # }
142
- # },
143
  }
144
  if proxy:
145
  ydl_opts.update({"proxy": proxy, "socket_timeout": 20})
146
-
147
  title = "unknow"
148
  duration = ""
149
  try:
@@ -153,7 +268,12 @@ async def fetchSubtitleUrls(url: str, proxy: Optional[str] = None) -> json:
153
  seconds = info_dict.get("duration")
154
  duration = str(seconds) if seconds else ""
155
 
156
- return {"title": title, "duration": duration, "subtitles": info_dict.get("subtitles"),"automatic_captions": info_dict.get("automatic_captions")}
 
 
 
 
 
157
 
158
  except Exception as e:
159
  return {"error": str(e)}
 
1
+ import os
2
  import json
3
  import math
4
  import time
5
+ import traceback
6
  from typing import Optional
7
  import xml.etree.ElementTree as ElementTree
8
  from html import unescape
9
  import yt_dlp
10
 
11
+ debug = os.getenv("DEBUG")
12
  # yt-dlp subtitle types: json3, srv1, srv2, srv3, ttml, vtt, xml (YouTube URL without the &fmt=<ext> argument)
13
 
14
+
15
  # "subtitles": {
16
  # "live_chat": [
17
  # {
 
22
  # }
23
  # ]
24
  # }
25
def getUrlFromSubtitleItem(item, lang="en", subType="vtt"):
    """Return the URL of the first usable track listed under ``lang``.

    :param item: mapping of language code -> list of track dicts. Each track
        is read through its ``"url"`` and ``"ext"`` keys.
    :param lang: language key to look up in ``item``.
    :param subType: wanted track extension. The pseudo type ``"xml"`` means
        "take the first track that has a URL and strip its ``&fmt=<ext>``
        query argument" (not applied to the ``"live_chat"`` entry).
    :returns: the URL string, or ``None`` when ``lang`` is absent, has no
        tracks, or no track qualifies.
    """
    # A missing language (or a None value) is "no match", not an error.
    tracks = (item or {}).get(lang) or ()
    wantsXml = subType == "xml" and lang != "live_chat"

    for subtitle in tracks:
        url = subtitle.get("url")
        ext = subtitle.get("ext")
        if not url:
            # A track without a URL can never be fetched; try the next one.
            continue

        if wantsXml:
            if debug:
                print("subtitle source lang:{} url: {}".format(lang, url))
            # Without a known ext there is no "&fmt=" argument to strip.
            return url.replace("&fmt=" + ext, "") if ext else url

        if ext == subType:
            if debug:
                print("subtitle lang:{} url: {}".format(lang, url))
            return url

    return None
39
+
40
+
41
def getRequestedSubtitlesUrl(info_dict, lang, subType):
    """Look for a subtitle URL in ``info_dict["requested_subtitles"]``.

    Every language key that starts with ``lang`` is tried in order, and the
    first URL returned by ``getUrlFromSubtitleItem`` wins.

    :param info_dict: extraction result; only ``"requested_subtitles"`` is read.
    :param lang: language prefix to match against the language keys.
    :param subType: track type forwarded to ``getUrlFromSubtitleItem``.
    :returns: the URL, or ``None`` when nothing was requested or nothing matched.
    """
    # The key may be absent or None; treat both as "nothing requested" so the
    # caller can move on to its other lookups instead of hitting an exception.
    requested = info_dict.get("requested_subtitles") or {}

    for code, entry in requested.items():
        if not code.startswith(lang):
            continue
        # An entry may be a single track dict, while the helper expects a
        # list of tracks per language. Build a fresh one-key mapping for each
        # candidate instead of rebinding the mapping being iterated.
        tracks = [entry] if isinstance(entry, dict) else entry
        url = getUrlFromSubtitleItem({code: tracks}, code, subType)
        if url:
            if debug:
                print("getRequestedSubtitlesUrl lang:{} url:{}".format(code, url))
            return url

    return None
53
+
54
+
55
def getSubtitleLangUrl(
    info_dict,
    lang="en",
    subType="vtt",
    subTitleKeys=("subtitles", "automatic_captions"),
):
    """Find a subtitle URL for ``lang`` in the given ``info_dict`` sections.

    Two passes are made over ``subTitleKeys``: first an exact match on the
    language key, then any language key that starts with ``lang``.

    :param info_dict: extraction result holding the subtitle sections.
    :param lang: wanted language (exact key first, then used as a prefix).
    :param subType: track type forwarded to ``getUrlFromSubtitleItem``.
    :param subTitleKeys: names of the ``info_dict`` sections to search, in
        priority order.
    :returns: the first URL found, or ``None``.
    """
    # Pass 1: exact language key in any section.
    for sectionName in subTitleKeys:
        # A section may be missing or None; treat it as empty.
        section = info_dict.get(sectionName) or {}
        if lang not in section:
            continue
        url = getUrlFromSubtitleItem(section, lang, subType)
        if url:
            if debug:
                print("getSubtitleLangUrl lang:{}".format(lang))
            return url

    # Pass 2: language keys that merely start with the wanted language.
    for sectionName in subTitleKeys:
        section = info_dict.get(sectionName) or {}
        for code in section:
            if not code.startswith(lang):
                continue
            url = getUrlFromSubtitleItem(section, code, subType)
            if url:
                if debug:
                    print("getSubtitleLangUrl lang:{} url:{}".format(code, url))
                return url

    return None
81
+
82
+
83
def getSubtitleOtherUrl(
    info_dict,
    lang="en",
    subType="vtt",
    subTitleKeys=("subtitles", "automatic_captions"),
):
    """Last-resort lookup: take whatever language a section can offer.

    For each section in ``subTitleKeys`` one language is chosen: ``lang`` if
    present, else ``"en"`` if present, else the first language key. Only that
    language is tried before moving on to the next section.

    :param info_dict: extraction result holding the subtitle sections.
    :param lang: preferred language.
    :param subType: track type forwarded to ``getUrlFromSubtitleItem``.
    :param subTitleKeys: names of the ``info_dict`` sections to search, in
        priority order.
    :returns: the first URL found, or ``None``.
    """
    for sectionName in subTitleKeys:
        # A section may be missing, None or empty; skip it in all three cases.
        section = info_dict.get(sectionName) or {}
        if not section:
            continue

        if lang in section:
            chosen = lang
        elif "en" in section:
            chosen = "en"
        else:
            chosen = next(iter(section))

        url = getUrlFromSubtitleItem(section, chosen, subType)
        if url:
            if debug:
                print("getSubtitleOtherUrl lang:{} url:{}".format(chosen, url))
            return url

    return None
105
 
106
+
107
async def fetchSubtitle(
    url: str,
    lang: Optional[str] = "en",
    subType: Optional[str] = "vtt",
    proxy: Optional[str] = None,
) -> dict:
    """Fetch a subtitle for ``url`` by delegating to ``fetchSubtitlebyType``.

    All four arguments are passed through unchanged, and the awaited result
    of ``fetchSubtitlebyType`` is returned as-is.
    """
    result = await fetchSubtitlebyType(url, lang, subType, proxy)
    return result
114
 
115
+
116
+ async def fetchSubtitlebyType(
117
+ url: str,
118
+ lang: Optional[str] = "en",
119
+ subType: Optional[str] = "vtt",
120
+ proxy: Optional[str] = None,
121
+ ) -> dict:
122
+ # lang-code or lang.* .* is regex
123
+ reqLang = lang if len(lang.split("-")) > 1 or lang.endswith(".*") else lang + ".*"
124
+
125
  ydl_opts = {
126
  "noplaylist": True,
127
+ "writesubtitles": True,
128
+ "writeautomaticsub": True,
129
+ # "listsubtitles": True,
130
+ # "subtitlesformat": subType, # mark due to default youtube no srt and xml format
131
+ "subtitleslangs": [reqLang],
132
  "skip_download": True,
133
  "socket_timeout": 10,
134
  "extractor_retries": 0,
135
  "extractor_args": {
136
  "youtube": {
137
  "player_skip": ["configs", "initial"], # "webpage",
138
+ "player_client": ["web"],
139
+ "skip": ["hls", "dash"], # don't skip "translated_subs"
140
  }
141
  },
142
  }
143
 
144
  if proxy:
145
  ydl_opts.update({"proxy": proxy, "socket_timeout": 20})
146
+ # print(ydl_opts)
147
  title = "unknow"
148
  duration = ""
149
  try:
150
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
151
  info_dict = ydl.extract_info(url, download=False)
152
+ # print(json.dumps(info_dict))
153
  title = info_dict.get("title", "unknow")
154
  seconds = info_dict.get("duration")
155
  duration = str(seconds) if seconds else ""
 
157
  if info_dict.get("extractor") == "youtube" and subType == "srt":
158
  subType = "xml"
159
  isSrt = True
160
+ if debug:
161
+ print(
162
+ "subtitles.keys(): {} automatic_captions: {}".format(
163
+ info_dict.get("subtitles").keys(),
164
+ info_dict.get("automatic_captions").keys(),
165
+ )
166
+ )
167
+
168
+ subtitle_url = getRequestedSubtitlesUrl(info_dict, lang, subType)
169
+ if not subtitle_url:
170
+ subtitle_url = getSubtitleLangUrl(info_dict, lang, subType)
171
+ if not subtitle_url:
172
+ subtitle_url = getSubtitleOtherUrl(info_dict, lang, subType)
173
 
174
+ if subtitle_url:
175
+ # print("subtitle_url: {}".format(subtitle_url))
176
+ with ydl.urlopen(subtitle_url) as response:
177
+ subtitle = (
178
+ xml_caption_to_srt(response.read().decode())
179
+ if isSrt
180
+ else response.read().decode()
181
+ )
182
+ print(
183
+ "url:{}, title:{}, duration:{} len(subtitle): {}".format(
184
+ url, title, duration, len(subtitle)
185
+ )
186
+ )
187
+ return {
188
+ "title": title,
189
+ "duration": duration,
190
+ "subtitle": subtitle,
191
+ "chapters": info_dict.get("chapters", None),
192
+ }
193
  except Exception as e:
194
+ print(e)
195
+ traceback.print_exc()
196
  return {"error": str(e)}
197
  return {"title": title, "duration": duration, "error": "No subtitles"}
198
 
199
+
200
  def float_to_srt_time_format(d: float) -> str:
201
  """Convert decimal durations into proper srt format.
202
  :rtype: str
 
209
  ms = f"{fraction:.3f}".replace("0.", "")
210
  return time_fmt + ms
211
 
212
+
213
+ def xml_caption_to_srt(xml_captions: str) -> str:
214
  """Convert xml caption tracks to "SubRip Subtitle (srt)".
215
  :param str xml_captions:
216
  XML formatted caption tracks.
 
219
  root = ElementTree.fromstring(xml_captions)
220
  for i, child in enumerate(list(root)):
221
  text = child.text or ""
222
+ caption = unescape(
223
+ text.replace("\n", " ").replace(" ", " "),
224
+ )
225
  try:
226
  duration = float(child.attrib["dur"])
227
  except KeyError:
 
238
  segments.append(line)
239
  return "\n".join(segments).strip()
240
 
241
+
242
  async def fetchSubtitleUrls(url: str, proxy: Optional[str] = None) -> json:
243
  ydl_opts = {
244
  "noplaylist": True,
245
+ # "writesubtitles": False,
246
+ # "allsubtitles": False,
247
+ "listsubtitles": True,
248
+ # "skip_download": True,
249
  "socket_timeout": 10,
250
  "extractor_retries": 0,
251
+ "extractor_args": {
252
+ "youtube": {
253
+ "player_skip": ["configs", "initial"], # "webpage",
254
+ "player_client": ["web"],
255
+ "skip": ["hls", "dash"], # , "translated_subs"
256
+ }
257
+ },
258
  }
259
  if proxy:
260
  ydl_opts.update({"proxy": proxy, "socket_timeout": 20})
261
+
262
  title = "unknow"
263
  duration = ""
264
  try:
 
268
  seconds = info_dict.get("duration")
269
  duration = str(seconds) if seconds else ""
270
 
271
+ return {
272
+ "title": title,
273
+ "duration": duration,
274
+ "subtitles": info_dict.get("subtitles"),
275
+ "automatic_captions": info_dict.get("automatic_captions"),
276
+ }
277
 
278
  except Exception as e:
279
  return {"error": str(e)}