ytdlp_subtitle_stranscript / fetchYoutubeSubtitle.py
lanbogao's picture
1. Fix: for some videos automatic_captions is empty (e.g. VSFg5LZYsyc).
73a9bbe
raw
history blame
9.2 kB
import os
import json
import math
import time
import traceback
from typing import Optional
import xml.etree.ElementTree as ElementTree
from html import unescape
import yt_dlp
# Verbose diagnostics switch: any non-empty DEBUG environment value makes the
# helpers below print which subtitle track/URL they selected.
debug = os.environ.get("DEBUG")
# yt-dlp subtitle types: json3, srv1, srv2, srv3, ttml, vtt, xml (the YouTube url without the extra "fmt" argument)
# "subtitles": {
# "live_chat": [
# {
# "url": "https://www.youtube.com/watch?v=ANtM2bHRz04&bpctr=9999999999&has_verified=1",
# "ext": "json",
# "video_id": "ANtM2bHRz04",
# "protocol": "youtube_live_chat_replay"
# }
# ]
# }
def getUrlFromSubtitleItem(item, lang="en", subType="vtt"):
    """Pick a subtitle URL for ``lang`` out of a language -> tracks mapping.

    :param item: mapping of language code to a list of track dicts; each
        track is read through its ``url`` and ``ext`` keys.
    :param lang: key of ``item`` to look at (must be present in ``item``).
    :param subType: wanted track type. ``"xml"`` is special: for any language
        other than ``live_chat`` the first track that has a url is taken and
        its ``&fmt=<ext>`` query argument is removed.
    :returns: the selected url, or ``None`` when no track qualifies.
    """
    strip_fmt = lang != "live_chat" and subType == "xml"
    for subtitle in item[lang]:
        track_url = subtitle.get("url")
        track_ext = subtitle.get("ext")
        if strip_fmt:
            if not track_url:
                # No url to rewrite on this track (previously an
                # AttributeError on None); try the next one.
                continue
            if debug:
                print("subtitle source lang:{} url: {}".format(lang, track_url))
            if track_ext:
                # Only strip when the ext is known; concatenating None
                # used to raise TypeError.
                track_url = track_url.replace("&fmt=" + track_ext, "")
            return track_url
        if track_ext == subType:
            if debug:
                print("subtitle lang:{} url: {}".format(lang, track_url))
            return track_url
    return None
def getRequestedSubtitlesUrl(info_dict, lang, subType):
    """Look for a subtitle URL among ``info_dict["requested_subtitles"]``.

    Every language code that starts with ``lang`` is tried in mapping order.
    A value that is a single track dict is wrapped into a one-element list so
    it has the shape ``getUrlFromSubtitleItem`` expects.

    :param info_dict: yt-dlp info dict.
    :param lang: language prefix to match (e.g. ``"en"`` matches ``"en-US"``).
    :param subType: wanted subtitle type, forwarded unchanged.
    :returns: the first url found, or ``None``.
    """
    requested = info_dict.get("requested_subtitles")
    if not requested:
        # Key missing, None, or empty: nothing was selected, so let the
        # caller fall back instead of crashing on ``None.keys()``.
        return None
    for code, entry in requested.items():
        if not code.startswith(lang):
            continue
        # Build the per-language view in a fresh local. Rebinding the mapping
        # being iterated (as before) made the next matching language raise
        # KeyError whenever the first one yielded no url.
        tracks = {code: [entry]} if isinstance(entry, dict) else requested
        url = getUrlFromSubtitleItem(tracks, code, subType)
        if url:
            if debug:
                print("getRequestedSubtitlesUrl lang:{} url:{}".format(code, url))
            return url
    return None
def getSubtitleLangUrl(
    info_dict,
    lang="en",
    subType="vtt",
    subTitleKeys=("subtitles", "automatic_captions"),
):
    """Find a subtitle URL for ``lang`` in the info dict's subtitle sections.

    Two passes are made over ``subTitleKeys`` (in the given order):

    1. an exact match of ``lang`` against the language codes;
    2. any language code that starts with ``lang``.

    :param info_dict: yt-dlp info dict.
    :param lang: language code or prefix.
    :param subType: wanted subtitle type, forwarded to
        ``getUrlFromSubtitleItem``.
    :param subTitleKeys: info-dict keys to search; only iterated, so an
        immutable tuple is used as the default.
    :returns: the first url found, or ``None``.
    """
    # A section may be missing or None; treat it as empty rather than
    # failing on ``None.keys()``.
    sections = [info_dict.get(key) or {} for key in subTitleKeys]

    # Pass 1: exact language code.
    for tracks in sections:
        if lang in tracks:
            url = getUrlFromSubtitleItem(tracks, lang, subType)
            if url:
                if debug:
                    print("getSubtitleLangUrl lang:{}".format(lang))
                return url

    # Pass 2: language codes sharing the prefix (e.g. "en" -> "en-US").
    for tracks in sections:
        for code in tracks:
            if code.startswith(lang):
                url = getUrlFromSubtitleItem(tracks, code, subType)
                if url:
                    if debug:
                        print("getSubtitleLangUrl lang:{} url:{}".format(code, url))
                    return url
    return None
def getSubtitleOtherUrl(
    info_dict,
    lang="en",
    subType="vtt",
    subTitleKeys=("subtitles", "automatic_captions"),
):
    """Last-resort lookup: accept a subtitle track in another language.

    For each section named in ``subTitleKeys`` the language is chosen as
    ``lang`` if present, else ``"en"`` if present, else the first language
    listed in that section.

    :param info_dict: yt-dlp info dict.
    :param lang: preferred language code.
    :param subType: wanted subtitle type, forwarded to
        ``getUrlFromSubtitleItem``.
    :param subTitleKeys: info-dict keys to search; only iterated, so an
        immutable tuple is used as the default.
    :returns: the first url found, or ``None``.
    """
    for key in subTitleKeys:
        # A section may be missing or None; both count as "no tracks".
        tracks = info_dict.get(key) or {}
        if not tracks:
            continue
        if lang in tracks:
            code = lang
        elif "en" in tracks:
            code = "en"
        else:
            code = next(iter(tracks))
        url = getUrlFromSubtitleItem(tracks, code, subType)
        if url:
            if debug:
                print("getSubtitleOtherUrl lang:{} url:{}".format(code, url))
            return url
    return None
async def fetchSubtitle(
    url: str,
    lang: Optional[str] = "en",
    subType: Optional[str] = "vtt",
    proxy: Optional[str] = None,
) -> dict:
    """Public entry point; forwards all arguments to ``fetchSubtitlebyType``.

    :returns: whatever ``fetchSubtitlebyType`` returns for the same arguments.
    """
    result = await fetchSubtitlebyType(url, lang, subType, proxy)
    return result
async def fetchSubtitlebyType(
    url: str,
    lang: Optional[str] = "en",
    subType: Optional[str] = "vtt",
    proxy: Optional[str] = None,
) -> dict:
    """Extract video metadata with yt-dlp and download one subtitle track.

    The subtitle URL is searched in this order: the requested subtitles, the
    subtitle sections for ``lang``, then any other available language.

    :param url: video page url handed to ``YoutubeDL.extract_info``.
    :param lang: language code or prefix; ``None`` is treated as ``"en"``.
    :param subType: wanted subtitle type; ``None`` is treated as ``"vtt"``.
        For the ``youtube`` extractor, ``"srt"`` is produced by downloading
        the xml track and converting it with ``xml_caption_to_srt``.
    :param proxy: optional proxy url; when set the socket timeout is raised
        from 10 to 20 seconds.
    :returns: on success ``{"title", "duration", "subtitle", "chapters"}``;
        ``{"title", "duration", "error": "No subtitles"}`` when no track url
        was found; ``{"error": str(exc)}`` when anything raised.

    NOTE: ``extract_info`` and ``urlopen`` are called synchronously inside
    this coroutine.
    """
    # The annotations allow None; fall back to the declared defaults instead
    # of raising AttributeError on ``None.split`` outside the try block.
    lang = lang or "en"
    subType = subType or "vtt"

    # "subtitleslangs" entries are regexes: a full code ("en-US") or an
    # explicit pattern ("en.*") is passed through, a bare code gets ".*".
    reqLang = lang if "-" in lang or lang.endswith(".*") else lang + ".*"
    ydl_opts = {
        "noplaylist": True,
        "writesubtitles": True,
        "writeautomaticsub": True,
        # "subtitlesformat" is intentionally not set: by default YouTube has
        # no srt or xml format to request.
        "subtitleslangs": [reqLang],
        "skip_download": True,
        "socket_timeout": 10,
        "extractor_retries": 0,
        "extractor_args": {
            "youtube": {
                "player_skip": ["configs", "initial"],
                "player_client": ["web"],
                # "translated_subs" is deliberately not skipped.
                "skip": ["hls", "dash"],
            }
        },
    }
    if proxy:
        ydl_opts.update({"proxy": proxy, "socket_timeout": 20})

    title = "unknow"
    duration = ""
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=False)
            title = info_dict.get("title", "unknow")
            seconds = info_dict.get("duration")
            duration = str(seconds) if seconds else ""

            # YouTube srt is derived from the xml track.
            isSrt = info_dict.get("extractor") == "youtube" and subType == "srt"
            if isSrt:
                subType = "xml"

            if debug:
                # Either section may be missing or None.
                print(
                    "subtitles.keys(): {} automatic_captions: {}".format(
                        (info_dict.get("subtitles") or {}).keys(),
                        (info_dict.get("automatic_captions") or {}).keys(),
                    )
                )

            subtitle_url = None
            # requested_subtitles can be None when nothing was selected; skip
            # that lookup so the fallbacks (and the "No subtitles" result)
            # are still reached instead of an error dict.
            if info_dict.get("requested_subtitles"):
                subtitle_url = getRequestedSubtitlesUrl(info_dict, lang, subType)
            if not subtitle_url:
                subtitle_url = getSubtitleLangUrl(info_dict, lang, subType)
            if not subtitle_url:
                subtitle_url = getSubtitleOtherUrl(info_dict, lang, subType)

            if subtitle_url:
                with ydl.urlopen(subtitle_url) as response:
                    body = response.read().decode()
                subtitle = xml_caption_to_srt(body) if isSrt else body
                print(
                    "url:{}, title:{}, duration:{} len(subtitle): {}".format(
                        url, title, duration, len(subtitle)
                    )
                )
                return {
                    "title": title,
                    "duration": duration,
                    "subtitle": subtitle,
                    "chapters": info_dict.get("chapters", None),
                }
    except Exception as e:
        # Boundary handler: report the failure to the caller as data.
        print(e)
        traceback.print_exc()
        return {"error": str(e)}
    return {"title": title, "duration": duration, "error": "No subtitles"}
def float_to_srt_time_format(d: float) -> str:
    """Convert a duration in seconds into an srt timestamp.

    :param d: number of seconds (fractions allowed); negative values are
        clamped to zero.
    :rtype: str
    :returns:
        SubRip Subtitle (str) formatted time, ``HH:MM:SS,mmm``.

    float_to_srt_time_format(3.89) -> '00:00:03,890'
    """
    # Work in whole milliseconds so rounding carries into the seconds
    # (3.9996 -> 00:00:04,000, not "00:00:03,1.000") and hours are not
    # wrapped at 24 as a clock-time conversion would do.
    total_ms = max(0, int(round(d * 1000)))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
def xml_caption_to_srt(xml_captions: str) -> str:
    """Convert xml caption tracks to "SubRip Subtitle (srt)".

    Every direct child of the xml root becomes one numbered srt cue. The cue
    starts at the child's ``start`` attribute and lasts ``dur`` seconds
    (0 when ``dur`` is absent).

    :param str xml_captions:
        XML formatted caption tracks.
    :returns: the srt document as one string.
    :raises KeyError: if a child element has no ``start`` attribute.
    """
    segments = []
    root = ElementTree.fromstring(xml_captions)
    # srt sequence numbers are 1-based.
    for sequence_number, child in enumerate(root, start=1):
        text = child.text or ""
        # Newlines become spaces, then the double spaces this can leave
        # behind are collapsed (the second replace was a no-op before).
        flattened = text.replace("\n", " ").replace(" " * 2, " ")
        # Unescape entities that are still present in the element text.
        caption = unescape(flattened)
        start = float(child.attrib["start"])
        end = start + float(child.attrib.get("dur", 0.0))
        segments.append(
            "{seq}\n{start} --> {end}\n{text}\n".format(
                seq=sequence_number,
                start=float_to_srt_time_format(start),
                end=float_to_srt_time_format(end),
                text=caption,
            )
        )
    return "\n".join(segments).strip()
async def fetchSubtitleUrls(url: str, proxy: Optional[str] = None) -> dict:
    """Return the subtitle and automatic-caption listings of a video.

    :param url: video page url handed to ``YoutubeDL.extract_info``.
    :param proxy: optional proxy url; when set the socket timeout is raised
        from 10 to 20 seconds.
    :returns: ``{"title", "duration", "subtitles", "automatic_captions"}``
        taken from the yt-dlp info dict, or ``{"error": str(exc)}`` when
        extraction raised.
    """
    ydl_opts = {
        "noplaylist": True,
        "listsubtitles": True,
        "socket_timeout": 10,
        "extractor_retries": 0,
        "extractor_args": {
            "youtube": {
                "player_skip": ["configs", "initial"],
                "player_client": ["web"],
                # "translated_subs" is deliberately not skipped.
                "skip": ["hls", "dash"],
            }
        },
    }
    if proxy:
        ydl_opts.update({"proxy": proxy, "socket_timeout": 20})

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=False)
            seconds = info_dict.get("duration")
            return {
                "title": info_dict.get("title", "unknow"),
                "duration": str(seconds) if seconds else "",
                "subtitles": info_dict.get("subtitles"),
                "automatic_captions": info_dict.get("automatic_captions"),
            }
    except Exception as e:
        # Boundary handler: log the traceback (as fetchSubtitlebyType does)
        # and report the failure to the caller as data.
        traceback.print_exc()
        return {"error": str(e)}