ytdlp_subtitle / fetchYoutubeSubtitle.py
lanbogao's picture
1. Set socket_timeout default 10 and set 20 when proxy.
8a3c63b
raw
history blame
5.79 kB
import json
import math
import time
from typing import Optional
import xml.etree.ElementTree as ElementTree
from html import unescape
import yt_dlp
# yt-dlp subtitle types: json3, srv1, srv2, srv3, ttml, vtt, xml (the YouTube URL without its "&fmt=" argument)
# "subtitles": {
# "live_chat": [
# {
# "url": "https://www.youtube.com/watch?v=ANtM2bHRz04&bpctr=9999999999&has_verified=1",
# "ext": "json",
# "video_id": "ANtM2bHRz04",
# "protocol": "youtube_live_chat_replay"
# }
# ]
# }
def getUrlFromSubtitles(item, lang='en', subType="vtt"):
    """Pick a subtitle download URL out of a language -> tracks mapping.

    :param item: mapping of language code to a list of track dicts; each
        track dict is read for its ``"url"`` and ``"ext"`` keys.
    :param lang: preferred language code. Falls back to ``'en'`` and then to
        the first language present in ``item``.
    :param subType: wanted track format (compared against ``"ext"``). The
        special value ``"xml"`` returns the first track URL of the chosen
        language with its ``"&fmt=<ext>"`` argument removed, except for the
        ``"live_chat"`` pseudo-language, which is matched by ``"ext"`` only.
    :returns: the URL string, or ``None`` when nothing suitable exists.
    """
    if not item:
        return None

    # Language preference: requested one, then English, then whatever is first.
    if lang in item:
        chosen = lang
    elif 'en' in item:
        chosen = 'en'
    else:
        chosen = next(iter(item))

    for subtitle in item[chosen] or ():
        url = subtitle.get("url")
        if not url:
            # A track without a URL cannot be downloaded; try the next one
            # instead of failing on ``None.replace``.
            continue
        ext = subtitle.get("ext")
        if chosen != "live_chat" and subType == "xml":
            # Dropping the fmt argument yields the plain XML variant of the URL.
            return url.replace("&fmt=" + ext, "") if ext else url
        if ext == subType:
            return url
    return None
async def fetchSubtitle(url: str, lang: Optional[str] = 'en', subType: Optional[str] = "vtt", proxy: Optional[str] = None) -> dict:
    """Fetch a subtitle track for ``url``.

    Thin alias that forwards every argument unchanged to
    ``fetchSubtitlebyType`` and returns its result dict as-is.
    """
    result = await fetchSubtitlebyType(url=url, lang=lang, subType=subType, proxy=proxy)
    return result
async def fetchSubtitlebyType(url: str, lang: Optional[str] = 'en', subType: Optional[str] = "vtt", proxy: Optional[str] = None) -> dict:
    """Extract video info with yt-dlp and download one subtitle track.

    :param url: video page URL handed to ``YoutubeDL.extract_info``.
    :param lang: preferred subtitle language (see ``getUrlFromSubtitles`` for
        the fallback order).
    :param subType: wanted subtitle format. ``"srt"`` on a YouTube video is
        served by downloading the XML track and converting it locally.
    :param proxy: optional proxy URL; when given, the socket timeout is
        raised from 10 to 20 seconds.
    :returns: on success a dict with ``title``, ``duration``, ``subtitle`` and
        ``chapters``; ``{"error": ...}`` when anything raises; or a dict with
        ``title``, ``duration`` and ``error == "No subtitles"`` when neither
        ``subtitles`` nor ``automatic_captions`` yields a usable URL.
    """
    # Local import so this block does not depend on the file's import header.
    import asyncio

    def _extract() -> dict:
        # Blocking yt-dlp work; runs in a worker thread (see below).
        ydl_opts = {
            "noplaylist": True,
            "writesubtitles": False,
            "allsubtitles": True,
            "subtitleslangs": [lang] if lang else [],
            "skip_download": True,
            "socket_timeout": 10,
            "extractor_retries": 0,
            "extractor_args": {
                "youtube": {
                    "player_skip": ["webpage", "configs", "initial"],
                    "player_client": ["android"],
                    "skip": ["hls", "dash", "translated_subs"],
                }
            },
        }
        if proxy:
            ydl_opts.update({"proxy": proxy, "socket_timeout": 20})

        title = "unknow"
        duration = ""
        # Kept separate from the ``subType`` argument so the caller's value
        # is never rebound.
        wanted_type = subType
        convert_to_srt = False
        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info_dict = ydl.extract_info(url, download=False)
                title = info_dict.get("title", "unknow")
                seconds = info_dict.get("duration")
                duration = str(seconds) if seconds else ""

                if info_dict.get("extractor") == "youtube" and subType == "srt":
                    # Fetch the XML track and convert it to srt ourselves.
                    wanted_type = "xml"
                    convert_to_srt = True

                # Manually provided subtitles take priority over automatic captions.
                for subtitle_item in ("subtitles", "automatic_captions"):
                    tracks = info_dict.get(subtitle_item)
                    if not tracks:
                        continue
                    subtitle_url = getUrlFromSubtitles(tracks, lang, wanted_type)
                    if not subtitle_url:
                        continue
                    with ydl.urlopen(subtitle_url) as response:
                        body = response.read().decode()
                    subtitle = xml_caption_to_srt(body) if convert_to_srt else body
                    print("url{}, title:{}, duration:{} len(subtitle): {}".format(url, title, duration, len(subtitle)))
                    return {"title": title, "duration": duration,"subtitle": subtitle, "chapters":info_dict.get("chapters", None) }
        except Exception as e:
            # Boundary handler: every failure is reported to the caller as data.
            return {"error": str(e)}
        return {"title": title, "duration": duration, "error": "No subtitles"}

    # yt-dlp performs synchronous network I/O; running it in the default
    # executor keeps the event loop free while the request is in flight.
    return await asyncio.get_running_loop().run_in_executor(None, _extract)
def float_to_srt_time_format(d: float) -> str:
    """Convert a duration in seconds into an srt timestamp.

    The value is rounded to whole milliseconds first, so a fraction that
    rounds up (e.g. ``3.9996``) carries into the seconds field instead of
    producing a malformed millisecond part. Hours are not wrapped at 24.
    Negative inputs are clamped to zero.

    :param d: duration in seconds.
    :rtype: str
    :returns:
        SubRip Subtitle (str) formatted time duration, ``HH:MM:SS,mmm``.

    float_to_srt_time_format(3.89) -> '00:00:03,890'
    """
    total_ms = max(0, int(round(d * 1000)))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
def xml_caption_to_srt( xml_captions: str) -> str:
    """Convert xml caption tracks to "SubRip Subtitle (srt)".

    Every direct child of the XML root becomes one numbered srt cue. The
    cue's start comes from the child's ``start`` attribute and its length
    from ``dur`` (treated as 0 when absent).

    :param str xml_captions:
        XML formatted caption tracks.
    :returns: the srt document as a single string.
    :raises KeyError: if a child element has no ``start`` attribute.
    :raises xml.etree.ElementTree.ParseError: if the input is not valid XML.
    """
    segments = []
    root = ElementTree.fromstring(xml_captions)
    for sequence_number, child in enumerate(root, start=1):  # srt cues are 1-indexed
        raw_text = child.text or ""
        # Collapse newlines and runs of whitespace into single spaces before
        # unescaping HTML entities.
        caption = unescape(" ".join(raw_text.split()))
        start = float(child.attrib["start"])
        end = start + float(child.attrib.get("dur", 0.0))
        segments.append(
            "{seq}\n{start} --> {end}\n{text}\n".format(
                seq=sequence_number,
                start=float_to_srt_time_format(start),
                end=float_to_srt_time_format(end),
                text=caption,
            )
        )
    # Each cue already ends in a newline, so joining on "\n" leaves the blank
    # separator line srt expects between cues.
    return "\n".join(segments).strip()
async def fetchSubtitleUrls(url: str, proxy: Optional[str] = None) -> dict:
    """Extract video info with yt-dlp and return its subtitle track listings.

    :param url: video page URL handed to ``YoutubeDL.extract_info``.
    :param proxy: optional proxy URL; when given, the socket timeout is
        raised from 10 to 20 seconds.
    :returns: a dict with ``title``, ``duration``, ``subtitles`` and
        ``automatic_captions`` (the last two taken verbatim from the yt-dlp
        info dict), or ``{"error": ...}`` when extraction raises.
    """
    # Local import so this block does not depend on the file's import header.
    import asyncio

    def _extract() -> dict:
        # Blocking yt-dlp work; runs in a worker thread (see below).
        ydl_opts = {
            "noplaylist": True,
            "writesubtitles": False,
            "allsubtitles": True,
            "skip_download": True,
            "socket_timeout": 10,
            "extractor_retries": 0,
        }
        if proxy:
            ydl_opts.update({"proxy": proxy, "socket_timeout": 20})

        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info_dict = ydl.extract_info(url, download=False)
                title = info_dict.get("title", "unknow")
                seconds = info_dict.get("duration")
                duration = str(seconds) if seconds else ""
                return {"title": title, "duration": duration, "subtitles": info_dict.get("subtitles"),"automatic_captions": info_dict.get("automatic_captions")}
        except Exception as e:
            # Boundary handler: every failure is reported to the caller as data.
            return {"error": str(e)}

    # yt-dlp performs synchronous network I/O; running it in the default
    # executor keeps the event loop free while the request is in flight.
    return await asyncio.get_running_loop().run_in_executor(None, _extract)