|
import json |
|
import math |
|
import time |
|
from typing import Optional |
|
import xml.etree.ElementTree as ElementTree |
|
from html import unescape |
|
import yt_dlp |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def getUrlFromSubtitles(item, lang='en', subType="vtt"):
    """Pick a subtitle URL out of a language -> tracks mapping.

    :param item:
        Mapping whose keys are language codes and whose values are iterables
        of track dicts; each track dict is read through its ``"url"`` and
        ``"ext"`` keys.
    :param lang:
        Preferred language code. When it is not a key of ``item`` the
        function falls back to ``'en'`` and then to the first key.
    :param subType:
        Wanted track extension. The special value ``"xml"`` (for any language
        other than ``"live_chat"``) returns the first track's URL with its
        ``fmt=<ext>`` fragment removed instead of matching on extension.
    :returns:
        The selected URL, or ``None`` when ``item`` is empty or no track
        matches.
    """
    available = item.keys()
    if not available:
        return None

    # Language fallback chain: requested -> English -> whatever comes first.
    if lang in available:
        chosen = lang
    elif 'en' in available:
        chosen = 'en'
    else:
        chosen = next(iter(available))
    print("getUrlFromSubtitles l: %s, item: %s" % (chosen, item))

    # The XML shortcut never applies to the live-chat pseudo language.
    wants_plain_xml = chosen != "live_chat" and subType == "xml"

    for track in item[chosen]:
        print("getUrlFromSubtitles subtitle: %s" % track)
        if wants_plain_xml:
            # Strip the format selector from the first track's URL.
            return track.get("url").replace("fmt=" + track.get("ext"), "")
        if track.get("ext") == subType:
            return track.get("url")
    return None
|
|
|
async def fetchSubtitle(url: str, lang: Optional[str] = 'en', subType: Optional[str] = "vtt") -> Optional[str]:
    """Fetch a decoded subtitle document for ``url``.

    For every ``subType`` other than ``"srt"`` the request is delegated
    directly to ``fetchSubtitlebyType``. For ``"srt"`` a native SRT track is
    tried first; when none is returned the XML track is fetched and converted
    with ``xml_caption_to_srt``.

    :param str url:
        Video URL handed to ``fetchSubtitlebyType``.
    :param lang:
        Preferred subtitle language code.
    :param subType:
        Requested subtitle format.
    :returns:
        The subtitle text, or ``None`` when no track could be fetched.
    """
    if subType != "srt":
        return await fetchSubtitlebyType(url, lang, subType, True)

    subtitle = await fetchSubtitlebyType(url, lang, subType, True)
    if subtitle:
        return subtitle

    # No native SRT track: fall back to the XML track and convert it.
    subtitle = await fetchSubtitlebyType(url, lang, "xml", True)
    print(subtitle)
    if not subtitle:
        # Nothing to convert; passing None/"" to the XML parser would raise.
        return None
    return xml_caption_to_srt(subtitle)
|
|
|
async def fetchSubtitlebyType(url: str, lang: Optional[str] = 'en', subType="vtt", decode: bool = False) -> Optional[str]:
    """Download one subtitle track of the given type through yt-dlp.

    The extracted info dict is searched under ``"subtitles"`` first and
    ``"automatic_captions"`` second; the first URL that
    ``getUrlFromSubtitles`` yields is downloaded.

    :param str url:
        Video URL passed to ``YoutubeDL.extract_info``.
    :param lang:
        Preferred subtitle language code; a falsy value requests no specific
        language in the yt-dlp options.
    :param subType:
        Subtitle format forwarded to ``getUrlFromSubtitles``.
    :param bool decode:
        When true the downloaded payload is decoded to ``str``; otherwise the
        raw result of ``read()`` is returned unchanged.
    :returns:
        The subtitle payload, or ``None`` when no matching track URL exists.
    """
    options = {
        "writesubtitles": True,
        "allsubtitles": True,
        "subtitleslangs": [lang] if lang else [],
        "skip_download": True,
        "socket_timeout": 20
    }

    with yt_dlp.YoutubeDL(options) as ydl:
        info = ydl.extract_info(url, download=False)
        # Manually provided subtitles take priority over automatic captions.
        for source_key in ("subtitles", "automatic_captions"):
            tracks = info.get(source_key)
            if not tracks:
                continue
            track_url = getUrlFromSubtitles(tracks, lang, subType)
            if not track_url:
                continue
            with ydl.urlopen(track_url) as response:
                payload = response.read()
                return payload.decode() if decode else payload

    return None
|
|
|
def float_to_srt_time_format(d: float) -> str:
    """Convert a duration in seconds into an SRT timestamp.

    The value is rounded to whole milliseconds before being split into
    fields, so a fraction that rounds up (e.g. ``3.9996``) carries into the
    seconds field instead of producing a malformed millisecond part. Hours
    are not wrapped at 24. Negative input is clamped to zero because SRT
    has no representation for negative offsets.

    :param float d:
        Duration or offset in seconds.
    :rtype: str
    :returns:
        SubRip Subtitle (str) formatted time duration, ``HH:MM:SS,mmm``.

    float_to_srt_time_format(3.89) -> '00:00:03,890'
    """
    # Work in integer milliseconds so rounding and carries happen once.
    total_ms = max(0, int(round(d * 1000)))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"
|
|
|
def xml_caption_to_srt(xml_captions: str) -> str:
    """Convert xml caption tracks to "SubRip Subtitle (srt)".

    Every direct child of the XML root is treated as one caption. Its
    ``start`` attribute is required, its ``dur`` attribute is optional
    (a missing ``dur`` is treated as ``0.0``), and both are formatted with
    ``float_to_srt_time_format``. Captions are numbered from 1 in document
    order.

    :param str xml_captions:
        XML formatted caption tracks.
    :rtype: str
    :returns:
        The SRT document, with cues separated by a blank line and no
        leading or trailing whitespace.
    :raises KeyError:
        If a caption element has no ``start`` attribute.
    """
    segments = []
    root = ElementTree.fromstring(xml_captions)
    for sequence_number, child in enumerate(root, start=1):
        text = child.text or ""
        # Collapse every run of whitespace (newlines included) to a single
        # space; a one-pass replace leaves longer runs only partly reduced.
        caption = unescape(" ".join(text.split()))
        duration = float(child.attrib.get("dur", 0.0))
        start = float(child.attrib["start"])
        end = start + duration
        segments.append(
            "{seq}\n{start} --> {end}\n{text}\n".format(
                seq=sequence_number,
                start=float_to_srt_time_format(start),
                end=float_to_srt_time_format(end),
                text=caption,
            )
        )
    return "\n".join(segments).strip()
|
|
|
async def fetchSubtitleUrls(url: str) -> Optional[dict]:
    """Return the subtitle track mapping yt-dlp reports for ``url``.

    The ``"subtitles"`` entry of the extracted info is returned unless it is
    empty or its only key is ``"live_chat"``; in those cases the
    ``"automatic_captions"`` entry is returned when it is non-empty.

    :param str url:
        Video URL passed to ``YoutubeDL.extract_info``.
    :returns:
        The selected mapping taken from the info dict, or ``None`` when
        neither entry qualifies.
    """
    ydl_opts = {
        "writesubtitles": True,
        "allsubtitles": True,
        "skip_download": True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=False)

        subtitles = info_dict.get("subtitles")
        if subtitles:
            langs = subtitles.keys()
            # A lone "live_chat" entry is not a real subtitle track.
            only_live_chat = len(langs) == 1 and "live_chat" in langs
            if not only_live_chat:
                return subtitles

        automatic = info_dict.get("automatic_captions")
        if automatic:
            return automatic
        return None
|
|