ytdlp_subtitle_stranscript / fetchYoutubeSubtitle.py
lanbogao's picture
Add proxy & update fetchSubtitleUrls.
d8804e5
raw
history blame
5.4 kB
import asyncio
import json
import math
import time
import xml.etree.ElementTree as ElementTree
from html import unescape
from typing import Optional

import yt_dlp
# yt-dlp subtitle types: json3, srv1, srv2, srv3, ttml, vtt, xml (the YouTube track URL without its "&fmt=" argument)
# "subtitles": {
# "live_chat": [
# {
# "url": "https://www.youtube.com/watch?v=ANtM2bHRz04&bpctr=9999999999&has_verified=1",
# "ext": "json",
# "video_id": "ANtM2bHRz04",
# "protocol": "youtube_live_chat_replay"
# }
# ]
# }
def getUrlFromSubtitles(item, lang='en', subType="vtt"):
    """Pick the download URL of one subtitle track.

    :param item:
        Mapping of language code -> list of track dicts, each expected to
        carry ``"url"`` and ``"ext"`` keys. ``None`` or an empty mapping
        yields ``None``.
    :param str lang:
        Preferred language. If it is absent, ``'en'`` is used, and failing
        that the first language present in ``item``.
    :param str subType:
        Wanted track format, matched against each track's ``"ext"``. The
        special value ``"xml"`` returns the first usable track URL with its
        ``"&fmt=<ext>"`` argument removed (not applied to ``"live_chat"``).
    :returns:
        The track URL, or ``None`` when nothing matches.
    """
    if not item:
        return None

    if lang in item:
        chosen = lang
    elif 'en' in item:
        chosen = 'en'
    else:
        chosen = next(iter(item))

    for subtitle in item[chosen] or []:
        track_url = subtitle.get("url")
        if not track_url:
            # A track without a URL cannot be downloaded; try the next one
            # instead of failing on None.
            continue
        ext = subtitle.get("ext")
        if chosen != "live_chat" and subType == "xml":
            # Dropping the fmt argument turns the URL into the plain XML one.
            return track_url.replace("&fmt=" + ext, "") if ext else track_url
        if ext == subType:
            return track_url
    return None
async def fetchSubtitle(url: str, lang: Optional[str] = 'en', subType: Optional[str] = "vtt", proxy: Optional[str] = None) -> dict:
    """Fetch a subtitle track for ``url``.

    Convenience entry point: every argument is forwarded unchanged to
    ``fetchSubtitlebyType`` and its result dict is returned as-is.

    :param str url: Address of the video page.
    :param lang: Preferred subtitle language.
    :param subType: Wanted subtitle format.
    :param proxy: Optional proxy URL.
    :rtype: dict
    """
    result = await fetchSubtitlebyType(url=url, lang=lang, subType=subType, proxy=proxy)
    return result
def _fetch_subtitle_blocking(url, lang, subType, proxy):
    """Synchronous worker for ``fetchSubtitlebyType``: extract info and download one track."""
    ydl_opts = {
        "noplaylist": True,
        "writesubtitles": False,
        "allsubtitles": True,
        "subtitleslangs": [lang] if lang else [],
        "skip_download": True,
        "socket_timeout": 20,
    }
    if proxy:
        ydl_opts["proxy"] = proxy

    title = "unknow"
    duration = ""
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=False)
            title = info_dict.get("title", "unknow")
            seconds = info_dict.get("duration")
            duration = str(seconds) if seconds else ""

            # For the YouTube extractor an "srt" request is served by
            # downloading the XML track and converting it locally.
            convert_to_srt = info_dict.get("extractor") == "youtube" and subType == "srt"
            wanted_type = "xml" if convert_to_srt else subType

            # "requested_subtitles" is a dict of a different shape, so only
            # these two categories are inspected.
            categories = [
                info_dict[key]
                for key in ("subtitles", "automatic_captions")
                if info_dict.get(key)
            ]
            # First try only the requested language in every category, so an
            # automatic caption in that language is preferred over a manual
            # subtitle in some other language; then fall back to the
            # language substitution done by getUrlFromSubtitles.
            exact_language = [{lang: tracks[lang]} for tracks in categories if lang in tracks]
            for tracks in exact_language + categories:
                subtitle_url = getUrlFromSubtitles(tracks, lang, wanted_type)
                if not subtitle_url:
                    continue
                with ydl.urlopen(subtitle_url) as response:
                    body = response.read().decode()
                subtitle = xml_caption_to_srt(body) if convert_to_srt else body
                print("url{}, title:{}, duration:{} len(subtitle): {}".format(url, title, duration, len(subtitle)))
                return {"title": title, "duration": duration, "subtitle": subtitle, "chapters": info_dict.get("chapters", None)}
    except Exception as e:
        # Best-effort API: failures are reported in the result dict rather
        # than raised. Title/duration are included when already known.
        return {"title": title, "duration": duration, "error": str(e)}
    return {"title": title, "duration": duration, "error": "No subtitles"}


async def fetchSubtitlebyType(url: str, lang: Optional[str] = 'en', subType: Optional[str] = "vtt", proxy: Optional[str] = None) -> dict:
    """Fetch one subtitle track of ``url`` in the requested format.

    The yt-dlp calls are synchronous, so they run in the event loop's
    default executor instead of blocking the loop.

    :param str url: Address of the video page.
    :param lang: Preferred subtitle language (see ``getUrlFromSubtitles``
        for the fallback order).
    :param subType: Wanted subtitle format. ``"srt"`` on YouTube is
        produced by converting the XML track.
    :param proxy: Optional proxy URL passed to yt-dlp.
    :rtype: dict
    :returns:
        On success ``{"title", "duration", "subtitle", "chapters"}``;
        otherwise ``{"title", "duration", "error"}``. Errors are never
        raised to the caller.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _fetch_subtitle_blocking, url, lang, subType, proxy)
def float_to_srt_time_format(d: float) -> str:
    """Convert decimal durations into proper srt format.

    :param float d:
        Offset in seconds. Negative values are clamped to zero.
    :rtype: str
    :returns:
        SubRip Subtitle (str) formatted time duration, ``HH:MM:SS,mmm``.
        Hours are not wrapped at 24.

    float_to_srt_time_format(3.89) -> '00:00:03,890'
    """
    # Round to milliseconds on the full value so that a fraction rounding up
    # to 1.000 carries into the seconds instead of corrupting the ms field.
    whole_text, millis = f"{max(float(d), 0.0):.3f}".split(".")
    minutes, seconds = divmod(int(whole_text), 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis}"
def xml_caption_to_srt(xml_captions: str) -> str:
    """Convert xml caption tracks to "SubRip Subtitle (srt)".

    Each direct child of the XML root is treated as one cue whose timing
    comes from its ``start`` and ``dur`` attributes (seconds).

    :param str xml_captions:
        XML formatted caption tracks.
    :rtype: str
    :returns:
        The cues as numbered SRT blocks separated by blank lines.
    :raises xml.etree.ElementTree.ParseError:
        If ``xml_captions`` is not well-formed XML.
    """
    segments = []
    root = ElementTree.fromstring(xml_captions)
    for child in root:
        try:
            start = float(child.attrib["start"])
        except (KeyError, ValueError):
            # A cue without a usable start time cannot be placed on the
            # timeline; skip it rather than aborting the whole conversion.
            continue
        try:
            duration = float(child.attrib["dur"])
        except (KeyError, ValueError):
            duration = 0.0
        end = start + duration

        # Collapse newlines and runs of whitespace into single spaces, then
        # decode any HTML entities left in the text.
        caption = unescape(" ".join((child.text or "").split()))

        line = "{seq}\n{start} --> {end}\n{text}\n".format(
            seq=len(segments) + 1,  # SRT sequence numbers are 1-based.
            start=float_to_srt_time_format(start),
            end=float_to_srt_time_format(end),
            text=caption,
        )
        segments.append(line)
    return "\n".join(segments).strip()
def _fetch_subtitle_urls_blocking(url, proxy):
    """Synchronous worker for ``fetchSubtitleUrls``: extract info and list its tracks."""
    ydl_opts = {
        "noplaylist": True,
        "writesubtitles": False,
        "allsubtitles": True,
        "skip_download": True,
        # Same timeout as fetchSubtitlebyType so a stalled connection
        # cannot hang the request indefinitely.
        "socket_timeout": 20,
    }
    if proxy:
        ydl_opts["proxy"] = proxy

    title = "unknow"
    duration = ""
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=False)
            title = info_dict.get("title", "unknow")
            seconds = info_dict.get("duration")
            duration = str(seconds) if seconds else ""
            return {
                "title": title,
                "duration": duration,
                "subtitles": info_dict.get("subtitles"),
                "automatic_captions": info_dict.get("automatic_captions"),
            }
    except Exception as e:
        # Best-effort API: failures are reported in the result dict rather
        # than raised. Title/duration are included when already known.
        return {"title": title, "duration": duration, "error": str(e)}


async def fetchSubtitleUrls(url: str, proxy: Optional[str] = None) -> dict:
    """List the subtitle tracks yt-dlp reports for ``url``.

    The yt-dlp calls are synchronous, so they run in the event loop's
    default executor instead of blocking the loop.

    :param str url: Address of the video page.
    :param proxy: Optional proxy URL passed to yt-dlp.
    :rtype: dict
    :returns:
        On success ``{"title", "duration", "subtitles",
        "automatic_captions"}``; otherwise ``{"title", "duration",
        "error"}``. Errors are never raised to the caller.
    """
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _fetch_subtitle_urls_blocking, url, proxy)