"""Helpers for locating, downloading and converting video subtitles via yt-dlp."""
import json
import math
import time
import xml.etree.ElementTree as ElementTree
from html import unescape
from typing import Optional, Union

import yt_dlp

# yt-dlp subtitle types: json3, srv1, srv2, srv3, ttml, vtt,
# and "xml" (the YouTube subtitle URL with its "fmt=<ext>" argument removed).
#
# Besides real languages, the subtitle mapping may contain a "live_chat" entry:
# "subtitles": {
#     "live_chat": [
#         {
#             "url": "https://www.youtube.com/watch?v=ANtM2bHRz04&bpctr=9999999999&has_verified=1",
#             "ext": "json",
#             "video_id": "ANtM2bHRz04",
#             "protocol": "youtube_live_chat_replay"
#         }
#     ]
# }


def getUrlFromSubtitles(item, lang='en', subType="vtt"):
    """Pick a subtitle download URL out of a yt-dlp subtitle mapping.

    :param item: Mapping of language code -> list of subtitle entries, where
        each entry is a dict read through its ``"url"`` and ``"ext"`` keys.
    :param lang: Preferred language. Falls back to ``'en'`` and then to the
        first language in the mapping when it is not available.
    :param subType: Wanted subtitle format. ``"xml"`` is special: it takes the
        first entry's URL and strips its ``fmt=<ext>`` argument.
    :returns: The URL, or ``None`` when nothing matches.
    """
    if not item:
        return None

    if lang in item:
        chosen = lang
    elif 'en' in item:
        chosen = 'en'
    else:
        chosen = next(iter(item))
    print("getUrlFromSubtitles l: %s, item: %s" % (chosen, item))

    for subtitle in item[chosen]:
        print("getUrlFromSubtitles subtitle: %s" % subtitle)
        url = subtitle.get("url")
        ext = subtitle.get("ext")
        if chosen != "live_chat" and subType == "xml":
            if not url:
                # Entry carries no URL to derive the xml variant from.
                continue
            # Without a known ext there is no "fmt=<ext>" argument to strip.
            return url.replace("fmt=" + ext, "") if ext else url
        if ext == subType:
            return url
    return None


async def fetchSubtitle(url: str, lang: Optional[str] = 'en', subType: Optional[str] = "vtt") -> Optional[str]:
    """Fetch the subtitle text of a video in the requested format.

    For ``subType == "srt"`` a native srt track is tried first; when there is
    none, the xml track is downloaded and converted to srt.

    :param url: Video URL handed to yt-dlp.
    :param lang: Preferred subtitle language.
    :param subType: Wanted subtitle format.
    :returns: Decoded subtitle text, or ``None`` when no subtitle was found.
    """
    if subType != "srt":
        return await fetchSubtitlebyType(url, lang, subType, True)

    subtitle = await fetchSubtitlebyType(url, lang, subType, True)
    if subtitle:
        return subtitle

    subtitle = await fetchSubtitlebyType(url, lang, "xml", True)
    print(subtitle)
    if not subtitle:
        # Nothing to convert; parsing None/"" as XML would raise.
        return None
    return xml_caption_to_srt(subtitle)


async def fetchSubtitlebyType(url: str, lang: Optional[str] = 'en', subType="vtt", decode: bool = False) -> Optional[Union[str, bytes]]:
    """Download one subtitle track of a video through yt-dlp.

    Manual subtitles are searched before automatic captions; the first URL
    that ``getUrlFromSubtitles`` yields is downloaded.

    NOTE: nothing in this coroutine is awaited; the yt-dlp calls run
    synchronously inside it.

    :param url: Video URL handed to yt-dlp.
    :param lang: Preferred subtitle language.
    :param subType: Wanted subtitle format (see ``getUrlFromSubtitles``).
    :param decode: Decode the payload to ``str`` when true, else return bytes.
    :returns: The subtitle payload, or ``None`` when no track was found.
    """
    ydl_opts = {
        "writesubtitles": True,
        "allsubtitles": True,
        "subtitleslangs": [lang] if lang else [],
        "skip_download": True,
        "socket_timeout": 20,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=False)
        if not info_dict:
            return None
        # The "requested_subtitles" item is a dict, so it is not handled here.
        for subtitle_item in ("subtitles", "automatic_captions"):
            tracks = info_dict.get(subtitle_item)
            if not tracks:
                continue
            subtitle_url = getUrlFromSubtitles(tracks, lang, subType)
            if subtitle_url:
                with ydl.urlopen(subtitle_url) as subtitle:
                    payload = subtitle.read()
                return payload.decode() if decode else payload
    return None


def float_to_srt_time_format(d: float) -> str:
    """Convert decimal durations into proper srt format.

    Works on whole milliseconds so that rounding carries into the seconds
    (1.9996 -> '00:00:02,000') and hours keep counting past 24.

    :param d: Time offset in seconds; negative values are clamped to zero.
    :rtype: str
    :returns:
        SubRip Subtitle (str) formatted time duration.

    float_to_srt_time_format(3.89) -> '00:00:03,890'
    """
    total_ms = max(0, round(d * 1000))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"


def xml_caption_to_srt(xml_captions: str) -> str:
    """Convert xml caption tracks to "SubRip Subtitle (srt)".

    Each child of the root element becomes one srt cue, using its ``start``
    attribute and its optional ``dur`` attribute (treated as 0 when absent).

    :param str xml_captions:
        XML formatted caption tracks.
    :returns: The srt document, cues separated by a blank line.
    :raises KeyError: If a caption element has no ``start`` attribute.
    """
    segments = []
    root = ElementTree.fromstring(xml_captions)
    # srt sequence numbers are 1-based.
    for sequence_number, child in enumerate(root, start=1):
        text = child.text or ""
        caption = unescape(text.replace("\n", " ").replace("  ", " "))
        start = float(child.attrib["start"])
        end = start + float(child.attrib.get("dur", 0.0))
        segments.append(
            "{seq}\n{start} --> {end}\n{text}\n".format(
                seq=sequence_number,
                start=float_to_srt_time_format(start),
                end=float_to_srt_time_format(end),
                text=caption,
            )
        )
    return "\n".join(segments).strip()


async def fetchSubtitleUrls(url: str) -> Optional[dict]:
    """Return the subtitle mapping yt-dlp reports for a video.

    Manual subtitles are preferred unless their only entry is ``"live_chat"``;
    otherwise the automatic captions are returned.

    NOTE: nothing in this coroutine is awaited; the yt-dlp call runs
    synchronously inside it.

    :param url: Video URL handed to yt-dlp.
    :returns: Mapping of language code -> subtitle entries, or ``None``.
    """
    ydl_opts = {
        "writesubtitles": True,
        "allsubtitles": True,
        "skip_download": True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=False)
        if not info_dict:
            return None
        subtitles = info_dict.get("subtitles")
        if subtitles and not (len(subtitles) == 1 and "live_chat" in subtitles):
            return subtitles
        automatic = info_dict.get("automatic_captions")
        if automatic:
            return automatic
    return None