File size: 4,413 Bytes
093a866 d030b89 ba9fae4 d030b89 ba9fae4 093a866 d030b89 ba9fae4 d030b89 49a4a29 ba9fae4 093a866 d030b89 ba9fae4 4797dae 093a866 ba9fae4 d030b89 ba9fae4 093a866 d030b89 3dadd80 ba9fae4 093a866 ba9fae4 56ace91 ba9fae4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
import json
import math
import time
from typing import Optional
import xml.etree.ElementTree as ElementTree
from html import unescape
import yt_dlp
# yt-dlp subtitle formats: json3, srv1, srv2, srv3, ttml, vtt; "xml" is the YouTube subtitle URL with its "fmt=<ext>" argument removed
# "subtitles": {
# "live_chat": [
# {
# "url": "https://www.youtube.com/watch?v=ANtM2bHRz04&bpctr=9999999999&has_verified=1",
# "ext": "json",
# "video_id": "ANtM2bHRz04",
# "protocol": "youtube_live_chat_replay"
# }
# ]
# }
def getUrlFromSubtitles(item, lang='en', subType="vtt"):
    """Pick a subtitle download URL out of a subtitle mapping.

    :param item:
        Mapping of language code -> list of track dicts. Each track is
        expected to carry ``"url"`` and ``"ext"`` keys (see the sample
        structure in the comment above this function).
    :param lang:
        Preferred language code. Falls back to ``'en'`` and then to the first
        language present in ``item``.
    :param subType:
        Wanted format, matched against each track's ``"ext"``. ``"xml"`` is
        special-cased: for any language other than ``"live_chat"`` the first
        usable track URL is returned with its ``fmt=<ext>`` argument removed.
    :returns:
        The URL string, or ``None`` when nothing suitable is found.
    """
    if not item:
        return None

    # Language preference: requested -> English -> whatever comes first.
    if lang in item:
        l = lang
    elif 'en' in item:
        l = 'en'
    else:
        l = next(iter(item))
    print("getUrlFromSubtitles l: %s, item: %s" % (l, item))

    for subtitle in item[l] or []:
        print("getUrlFromSubtitles subtitle: %s" % subtitle)
        track_url = subtitle.get("url")
        if not track_url:
            # A track without a URL is unusable; keep looking instead of
            # crashing or returning None prematurely.
            continue
        ext = subtitle.get("ext")
        if l != "live_chat" and subType == "xml":
            # Dropping the explicit "fmt=<ext>" argument is how the "xml"
            # variant is requested; with no ext there is nothing to strip.
            if isinstance(ext, str):
                return track_url.replace("fmt=" + ext, "")
            return track_url
        if ext == subType:
            return track_url
    return None
async def fetchSubtitle(url: str, lang: Optional[str] = 'en', subType: Optional[str] = "vtt") -> Optional[str]:
    """Fetch the subtitle text of ``url`` in the requested format.

    :param str url:
        Video URL handed to ``fetchSubtitlebyType``.
    :param lang:
        Preferred subtitle language code.
    :param subType:
        Wanted subtitle format. For ``"srt"`` a native srt track is tried
        first; if none is available the xml captions are fetched and
        converted with ``xml_caption_to_srt``.
    :returns:
        Decoded subtitle text, or ``None`` when no subtitle could be fetched.
    """
    if subType != "srt":
        return await fetchSubtitlebyType(url, lang, subType, True)

    subtitle = await fetchSubtitlebyType(url, lang, subType, True)
    if subtitle:
        return subtitle

    # No native srt track: fall back to the xml captions and convert them.
    subtitle = await fetchSubtitlebyType(url, lang, "xml", True)
    print(subtitle)
    if not subtitle:
        # Nothing was fetched; handing None to the XML parser would raise
        # instead of honouring the Optional[str] contract.
        return None
    return xml_caption_to_srt(subtitle)
def _fetchSubtitleBlocking(url, lang, subType, decode):
    """Blocking worker for fetchSubtitlebyType: extract info and download one track."""
    ydl_opts = {
        "writesubtitles": True,
        "allsubtitles": True,
        "subtitleslangs": [lang] if lang else [],
        "skip_download": True,
        "socket_timeout": 20,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # Guard against a None result so the lookups below cannot crash.
        info_dict = ydl.extract_info(url, download=False) or {}
        # Manually provided subtitles take priority over automatic captions.
        # ("requested_subtitles" is a dict of a different shape, so it is not used.)
        for subtitle_item in ("subtitles", "automatic_captions"):
            tracks = info_dict.get(subtitle_item)
            if not tracks:
                continue
            subtitle_url = getUrlFromSubtitles(tracks, lang, subType)
            if not subtitle_url:
                continue
            with ydl.urlopen(subtitle_url) as subtitle:
                payload = subtitle.read()
            return payload.decode() if decode else payload
    return None


async def fetchSubtitlebyType(url: str, lang: Optional[str] = 'en', subType="vtt", decode: bool = False) -> Optional[str]:
    """Download the subtitle track of ``url`` in format ``subType``.

    The extraction and the download are blocking network calls, so they run
    in the event loop's default executor instead of stalling the loop.

    :param str url:
        Video URL passed to ``yt_dlp.YoutubeDL.extract_info``.
    :param lang:
        Preferred subtitle language code; a falsy value requests no specific
        language.
    :param subType:
        Wanted subtitle format, resolved by ``getUrlFromSubtitles``.
    :param bool decode:
        When true the payload is decoded to ``str``; otherwise the raw bytes
        read from the response are returned.
    :returns:
        The subtitle payload, or ``None`` when no matching track exists.
    """
    import asyncio  # kept local so this block does not rely on a module-level import

    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        None, _fetchSubtitleBlocking, url, lang, subType, decode
    )
def float_to_srt_time_format(d: float) -> str:
    """Convert decimal durations into proper srt format.

    The value is rounded to whole milliseconds first, so a fraction such as
    ``0.9996`` carries into the seconds field instead of producing a
    malformed millisecond part. Hours are not wrapped at 24. Negative input
    is clamped to zero.

    :param float d:
        Duration or offset in seconds.
    :rtype: str
    :returns:
        SubRip Subtitle (str) formatted time duration.

    float_to_srt_time_format(3.89) -> '00:00:03,890'
    """
    total_ms = max(0, round(d * 1000))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
def xml_caption_to_srt(xml_captions: str) -> str:
    """Convert xml caption tracks to "SubRip Subtitle (srt)".

    Every direct child of the XML root is treated as one cue, with a
    ``start`` attribute and an optional ``dur`` attribute, both in seconds.
    Children without a usable ``start`` (or with a non-numeric ``start`` /
    ``dur``) are skipped, and cue numbers count only the cues emitted.

    :param str xml_captions:
        XML formatted caption tracks.
    :rtype: str
    :returns:
        The srt document, or an empty string when ``xml_captions`` is empty
        or contains no usable cue.
    """
    if not xml_captions:
        return ""

    segments = []
    root = ElementTree.fromstring(xml_captions)
    for child in root:
        attrib = child.attrib
        try:
            start = float(attrib["start"])
            duration = float(attrib.get("dur", 0.0))
        except (KeyError, ValueError):
            # Not a timed caption element; skip it instead of aborting
            # the whole conversion.
            continue

        text = child.text or ""
        # Flatten line breaks, then collapse the double spaces that leaves.
        caption = unescape(text.replace("\n", " ").replace("  ", " "))
        segments.append(
            "{seq}\n{start} --> {end}\n{text}\n".format(
                seq=len(segments) + 1,  # srt cue numbers are 1-based
                start=float_to_srt_time_format(start),
                end=float_to_srt_time_format(start + duration),
                text=caption,
            )
        )
    return "\n".join(segments).strip()
def _fetchSubtitleUrlsBlocking(url):
    """Blocking worker for fetchSubtitleUrls: extract info and pick the subtitle mapping."""
    ydl_opts = {
        "writesubtitles": True,
        "allsubtitles": True,
        "skip_download": True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # Guard against a None result so the lookups below cannot crash.
        info_dict = ydl.extract_info(url, download=False) or {}

    subtitles = info_dict.get("subtitles")
    # A mapping holding only "live_chat" has no real subtitle track, so it
    # does not count as having subtitles.
    if subtitles and set(subtitles) != {"live_chat"}:
        return subtitles
    return info_dict.get("automatic_captions") or None


async def fetchSubtitleUrls(url: str) -> Optional[dict]:
    """Return the subtitle track mapping yt-dlp reports for ``url``.

    The extraction is a blocking network call, so it runs in the event
    loop's default executor instead of stalling the loop.

    :param str url:
        Video URL passed to ``yt_dlp.YoutubeDL.extract_info``.
    :returns:
        The ``"subtitles"`` mapping when it has anything besides
        ``"live_chat"``; otherwise the ``"automatic_captions"`` mapping;
        ``None`` when neither is available.
    """
    import asyncio  # kept local so this block does not rely on a module-level import

    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _fetchSubtitleUrlsBlocking, url)
|