Joshua Lochner
committed on
Commit
·
a45bd3f
1
Parent(s):
6e9c369
Add youtube transcript api
Browse files- youtube_transcript_api2/__init__.py +16 -0
- youtube_transcript_api2/__main__.py +15 -0
- youtube_transcript_api2/_api.py +140 -0
- youtube_transcript_api2/_cli.py +135 -0
- youtube_transcript_api2/_errors.py +112 -0
- youtube_transcript_api2/_html_unescaping.py +21 -0
- youtube_transcript_api2/_settings.py +1 -0
- youtube_transcript_api2/_transcripts.py +332 -0
- youtube_transcript_api2/formatters.py +165 -0
youtube_transcript_api2/__init__.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from ._api import YouTubeTranscriptApi
|
2 |
+
from ._transcripts import TranscriptList, Transcript
|
3 |
+
from ._errors import (
|
4 |
+
TranscriptsDisabled,
|
5 |
+
NoTranscriptFound,
|
6 |
+
CouldNotRetrieveTranscript,
|
7 |
+
VideoUnavailable,
|
8 |
+
TooManyRequests,
|
9 |
+
NotTranslatable,
|
10 |
+
TranslationLanguageNotAvailable,
|
11 |
+
NoTranscriptAvailable,
|
12 |
+
CookiePathInvalid,
|
13 |
+
CookiesInvalid,
|
14 |
+
FailedToCreateConsentCookie,
|
15 |
+
YouTubeRequestFailed,
|
16 |
+
)
|
youtube_transcript_api2/__main__.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
|
3 |
+
import logging
|
4 |
+
|
5 |
+
from ._cli import YouTubeTranscriptCli
|
6 |
+
|
7 |
+
|
8 |
+
def main():
    """
    Entry point of the command line interface.

    Sets up the default logging configuration, runs the CLI with the arguments given on the command line (without
    the program name) and prints whatever the CLI returns.
    """
    logging.basicConfig()

    cli = YouTubeTranscriptCli(sys.argv[1:])
    print(cli.run())
|
12 |
+
|
13 |
+
|
14 |
+
if __name__ == '__main__':
|
15 |
+
main()
|
youtube_transcript_api2/_api.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
try: # pragma: no cover
|
3 |
+
import http.cookiejar as cookiejar
|
4 |
+
CookieLoadError = (FileNotFoundError, cookiejar.LoadError)
|
5 |
+
except ImportError: # pragma: no cover
|
6 |
+
import cookielib as cookiejar
|
7 |
+
CookieLoadError = IOError
|
8 |
+
|
9 |
+
from ._transcripts import TranscriptListFetcher
|
10 |
+
|
11 |
+
from ._errors import (
|
12 |
+
CookiePathInvalid,
|
13 |
+
CookiesInvalid
|
14 |
+
)
|
15 |
+
|
16 |
+
|
17 |
+
class YouTubeTranscriptApi(object):
    """
    Entry point of the library. All methods are classmethods, so the class does not have to be instantiated.
    """
    @classmethod
    def list_transcripts(cls, video_id, proxies=None, cookies=None):
        """
        Retrieves the list of transcripts which are available for a given video. It returns a `TranscriptList` object
        which is iterable and provides methods to filter the list of transcripts for specific languages. While iterating
        over the `TranscriptList` the individual transcripts are represented by `Transcript` objects, which provide
        metadata and can either be fetched by calling `transcript.fetch()` or translated by calling
        `transcript.translate('en')`. Example::

            # retrieve the available transcripts
            transcript_list = YouTubeTranscriptApi.list_transcripts('video_id')

            # iterate over all available transcripts
            for transcript in transcript_list:
                # the Transcript object provides metadata properties
                print(
                    transcript.video_id,
                    transcript.language,
                    transcript.language_code,
                    # whether it has been manually created or generated by YouTube
                    transcript.is_generated,
                    # a list of languages the transcript can be translated to
                    transcript.translation_languages,
                )

                # fetch the actual transcript data
                print(transcript.fetch())

                # translating the transcript will return another transcript object
                print(transcript.translate('en').fetch())

            # you can also directly filter for the language you are looking for, using the transcript list
            transcript = transcript_list.find_transcript(['de', 'en'])

            # or just filter for manually created transcripts
            transcript = transcript_list.find_manually_created_transcript(['de', 'en'])

            # or automatically generated ones
            transcript = transcript_list.find_generated_transcript(['de', 'en'])

        :param video_id: the youtube video id
        :type video_id: str
        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
        :param cookies: a string of the path to a text file containing youtube authorization cookies
        :type cookies: str
        :return: the list of available transcripts
        :rtype TranscriptList:
        """
        with requests.Session() as http_client:
            if cookies:
                http_client.cookies = cls._load_cookies(cookies, video_id)
            http_client.proxies = proxies if proxies else {}
            return TranscriptListFetcher(http_client).fetch(video_id)

    @classmethod
    def get_transcripts(cls, video_ids, languages=('en',), continue_after_error=False, proxies=None, cookies=None):
        """
        Retrieves the transcripts for a list of videos.

        :param video_ids: a list of youtube video ids
        :type video_ids: list[str]
        :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
        it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
        do so.
        :type languages: list[str]
        :param continue_after_error: if this is set the execution won't be stopped, if an error occurs while retrieving
        one of the video transcripts
        :type continue_after_error: bool
        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
        :param cookies: a string of the path to a text file containing youtube authorization cookies
        :type cookies: str
        :return: a tuple containing a dictionary mapping video ids onto their corresponding transcripts, and a list of
        video ids, which could not be retrieved
        :rtype ({str: [{'text': str, 'start': float, 'duration': float}]}, [str]):
        :raises: whatever `get_transcript` raised for the first failing video, unless `continue_after_error` is set
        """
        data = {}
        unretrievable_videos = []

        for video_id in video_ids:
            try:
                data[video_id] = cls.get_transcript(video_id, languages, proxies, cookies)
            except Exception:
                if not continue_after_error:
                    # a bare raise re-raises the active exception with its original traceback (also on Python 2)
                    raise

                unretrievable_videos.append(video_id)

        return data, unretrievable_videos

    @classmethod
    def get_transcript(cls, video_id, languages=('en',), proxies=None, cookies=None):
        """
        Retrieves the transcript for a single video. This is just a shortcut for calling::

            YouTubeTranscriptApi.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch()

        :param video_id: the youtube video id
        :type video_id: str
        :param languages: A list of language codes in a descending priority. For example, if this is set to ['de', 'en']
        it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails to
        do so.
        :type languages: list[str]
        :param proxies: a dictionary mapping of http and https proxies to be used for the network requests
        :type proxies: {'http': str, 'https': str} - http://docs.python-requests.org/en/master/user/advanced/#proxies
        :param cookies: a string of the path to a text file containing youtube authorization cookies
        :type cookies: str
        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
        :rtype [{'text': str, 'start': float, 'duration': float}]:
        """
        return cls.list_transcripts(video_id, proxies, cookies).find_transcript(languages).fetch()

    @classmethod
    def _load_cookies(cls, cookies, video_id):
        """
        Loads the cookie file at the given path into a `MozillaCookieJar`.

        :param cookies: a string of the path to a text file containing youtube authorization cookies
        :type cookies: str
        :param video_id: the youtube video id, only used to build the error raised on failure
        :type video_id: str
        :return: the loaded cookie jar
        :raises: CookiesInvalid if the jar is empty after loading, CookiePathInvalid if the file could not be loaded
        """
        try:
            cookie_jar = cookiejar.MozillaCookieJar()
            cookie_jar.load(cookies)
            if not cookie_jar:
                # the file could be read, but no cookie was kept by the jar
                raise CookiesInvalid(video_id)
            return cookie_jar
        except CookieLoadError:
            raise CookiePathInvalid(video_id)
|
youtube_transcript_api2/_cli.py
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
|
3 |
+
from ._api import YouTubeTranscriptApi
|
4 |
+
|
5 |
+
from .formatters import FormatterLoader
|
6 |
+
|
7 |
+
|
8 |
+
class YouTubeTranscriptCli(object):
    """
    Command line front end: parses the given argument list, retrieves the requested transcripts and renders them
    (or the errors which occurred) as a single string.
    """
    def __init__(self, args):
        """
        :param args: the command line arguments, without the program name
        :type args: list[str]
        """
        self._args = args

    def run(self):
        """
        Executes the command described by the arguments.

        :return: the error messages of all failed videos followed by the formatted transcripts, separated by blank
        lines; an empty string if both kinds of transcripts have been excluded
        :rtype str:
        """
        parsed_args = self._parse_args()

        # excluding manually created as well as generated transcripts leaves nothing to retrieve
        if parsed_args.exclude_manually_created and parsed_args.exclude_generated:
            return ''

        if parsed_args.http_proxy == '' and parsed_args.https_proxy == '':
            proxies = None
        else:
            proxies = {"http": parsed_args.http_proxy, "https": parsed_args.https_proxy}

        cookies = parsed_args.cookies

        transcripts = []
        exceptions = []

        for video_id in parsed_args.video_ids:
            try:
                transcripts.append(self._fetch_transcript(parsed_args, proxies, cookies, video_id))
            except Exception as exception:
                # a failing video must not stop the others, its error is reported in the output instead
                exceptions.append(exception)

        output_parts = [str(exception) for exception in exceptions]
        if transcripts:
            formatter = FormatterLoader().load(parsed_args.format)
            output_parts.append(formatter.format_transcripts(transcripts))

        return '\n\n'.join(output_parts)

    def _fetch_transcript(self, parsed_args, proxies, cookies, video_id):
        """
        Retrieves the output for a single video: the description of the transcript list if `--list-transcripts` is
        set, otherwise the fetched (and optionally translated) transcript.
        """
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id, proxies=proxies, cookies=cookies)

        if parsed_args.list_transcripts:
            return str(transcript_list)

        # pick the lookup which matches the exclusion flags
        if parsed_args.exclude_manually_created:
            find = transcript_list.find_generated_transcript
        elif parsed_args.exclude_generated:
            find = transcript_list.find_manually_created_transcript
        else:
            find = transcript_list.find_transcript
        transcript = find(parsed_args.languages)

        if parsed_args.translate:
            transcript = transcript.translate(parsed_args.translate)

        return transcript.fetch()

    def _parse_args(self):
        """
        Builds the argument parser and parses the arguments this CLI has been created with.

        :return: the parsed arguments with sanitized video ids
        :rtype argparse.Namespace:
        """
        parser = argparse.ArgumentParser(
            description=(
                'This is an python API which allows you to get the transcripts/subtitles for a given YouTube video. '
                'It also works for automatically generated subtitles and it does not require a headless browser, like '
                'other selenium based solutions do!'
            )
        )
        parser.add_argument(
            '--list-transcripts',
            action='store_true',
            help='This will list the languages in which the given videos are available in.',
        )
        parser.add_argument('video_ids', nargs='+', type=str, help='List of YouTube video IDs.')
        parser.add_argument(
            '--languages',
            nargs='*',
            default=['en'],
            type=str,
            help=(
                'A list of language codes in a descending priority. For example, if this is set to "de en" it will '
                'first try to fetch the german transcript (de) and then fetch the english transcript (en) if it fails '
                'to do so. As I can\'t provide a complete list of all working language codes with full certainty, you '
                'may have to play around with the language codes a bit, to find the one which is working for you!'
            ),
        )
        parser.add_argument(
            '--exclude-generated',
            action='store_true',
            help='If this flag is set transcripts which have been generated by YouTube will not be retrieved.',
        )
        parser.add_argument(
            '--exclude-manually-created',
            action='store_true',
            help='If this flag is set transcripts which have been manually created will not be retrieved.',
        )
        parser.add_argument(
            '--format',
            type=str,
            default='pretty',
            choices=tuple(FormatterLoader.TYPES.keys()),
        )
        parser.add_argument(
            '--translate',
            default='',
            help=(
                'The language code for the language you want this transcript to be translated to. Use the '
                '--list-transcripts feature to find out which languages are translatable and which translation '
                'languages are available.'
            )
        )
        parser.add_argument('--http-proxy', default='', metavar='URL', help='Use the specified HTTP proxy.')
        parser.add_argument('--https-proxy', default='', metavar='URL', help='Use the specified HTTPS proxy.')
        parser.add_argument(
            '--cookies',
            default=None,
            help='The cookie file that will be used for authorization with youtube.'
        )

        parsed_args = parser.parse_args(self._args)
        return self._sanitize_video_ids(parsed_args)

    def _sanitize_video_ids(self, args):
        """
        Removes all backslashes from the video ids of the parsed arguments (in place) and returns the arguments.
        """
        sanitized_ids = []
        for video_id in args.video_ids:
            sanitized_ids.append(video_id.replace('\\', ''))
        args.video_ids = sanitized_ids
        return args
|
youtube_transcript_api2/_errors.py
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from ._settings import WATCH_URL
|
2 |
+
|
3 |
+
|
4 |
+
class CouldNotRetrieveTranscript(Exception):
    """
    Raised if a transcript could not be retrieved. Subclasses describe the reason by overriding `CAUSE_MESSAGE`
    or the `cause` property.
    """
    ERROR_MESSAGE = '\nCould not retrieve a transcript for the video {video_url}!'
    CAUSE_MESSAGE_INTRO = ' This is most likely caused by:\n\n{cause}'
    CAUSE_MESSAGE = ''
    GITHUB_REFERRAL = (
        '\n\nIf you are sure that the described cause is not responsible for this error '
        'and that a transcript should be retrievable, please create an issue at '
        'https://github.com/jdepoix/youtube-transcript-api/issues. '
        'Please add which version of youtube_transcript_api you are using '
        'and provide the information needed to replicate the error. '
        'Also make sure that there are no open issues which already describe your problem!'
    )

    def __init__(self, video_id):
        """
        :param video_id: the id of the video the transcript could not be retrieved for
        :type video_id: str
        """
        self.video_id = video_id
        super(CouldNotRetrieveTranscript, self).__init__(self._build_error_message())

    def _build_error_message(self):
        """
        Builds the exception message: the generic error for the video url, followed by the cause and the GitHub
        referral if a cause is available.
        """
        cause = self.cause
        video_url = WATCH_URL.format(video_id=self.video_id)
        message_parts = [self.ERROR_MESSAGE.format(video_url=video_url)]

        if cause:
            message_parts.append(self.CAUSE_MESSAGE_INTRO.format(cause=cause))
            message_parts.append(self.GITHUB_REFERRAL)

        return ''.join(message_parts)

    @property
    def cause(self):
        """
        The description of what most likely caused the error, an empty string if there is none.
        """
        return self.CAUSE_MESSAGE
|
36 |
+
|
37 |
+
|
38 |
+
class YouTubeRequestFailed(CouldNotRetrieveTranscript):
    """
    Raised if a request to YouTube failed. The string representation of the given HTTP error is used as the reason.
    """
    CAUSE_MESSAGE = 'Request to YouTube failed: {reason}'

    def __init__(self, video_id, http_error):
        """
        :param video_id: the id of the video the request was made for
        :param http_error: the error which made the request fail, only its string representation is kept
        """
        # the reason has to be set first, as the base constructor reads `cause` to build the message
        self.reason = str(http_error)
        super(YouTubeRequestFailed, self).__init__(video_id)

    @property
    def cause(self):
        failure_reason = self.reason
        return self.CAUSE_MESSAGE.format(reason=failure_reason)
|
50 |
+
|
51 |
+
|
52 |
+
class VideoUnavailable(CouldNotRetrieveTranscript):
    """
    Raised if the video is no longer available.
    """
    CAUSE_MESSAGE = 'The video is no longer available'
|
54 |
+
|
55 |
+
|
56 |
+
class TooManyRequests(CouldNotRetrieveTranscript):
    """
    Raised if YouTube is receiving too many requests from this IP and requires solving a captcha to continue.
    """
    # Every line of the list is an explicit literal. Continuing a string literal with a trailing backslash would
    # embed the source indentation of the following line into the message shown to the user.
    CAUSE_MESSAGE = (
        'YouTube is receiving too many requests from this IP and now requires solving a captcha to continue. '
        'One of the following things can be done to work around this:\n'
        ' - Manually solve the captcha in a browser and export the cookie. '
        'Read here how to use that cookie with '
        'youtube-transcript-api: https://github.com/jdepoix/youtube-transcript-api#cookies\n'
        ' - Use a different IP address\n'
        ' - Wait until the ban on your IP has been lifted'
    )
|
66 |
+
|
67 |
+
|
68 |
+
class TranscriptsDisabled(CouldNotRetrieveTranscript):
    """
    Raised if subtitles are disabled for the video.
    """
    CAUSE_MESSAGE = 'Subtitles are disabled for this video'
|
70 |
+
|
71 |
+
|
72 |
+
class NoTranscriptAvailable(CouldNotRetrieveTranscript):
    """
    Raised if no transcripts are available for the video.
    """
    CAUSE_MESSAGE = 'No transcripts are available for this video'
|
74 |
+
|
75 |
+
|
76 |
+
class NotTranslatable(CouldNotRetrieveTranscript):
    """
    Raised if the requested language is not translatable.
    """
    CAUSE_MESSAGE = 'The requested language is not translatable'
|
78 |
+
|
79 |
+
|
80 |
+
class TranslationLanguageNotAvailable(CouldNotRetrieveTranscript):
    """
    Raised if the requested translation language is not available.
    """
    CAUSE_MESSAGE = 'The requested translation language is not available'
|
82 |
+
|
83 |
+
|
84 |
+
class CookiePathInvalid(CouldNotRetrieveTranscript):
    """
    Raised if the provided cookie file could not be loaded.
    """
    CAUSE_MESSAGE = 'The provided cookie file was unable to be loaded'
|
86 |
+
|
87 |
+
|
88 |
+
class CookiesInvalid(CouldNotRetrieveTranscript):
    """
    Raised if the provided cookies are not valid, for example because they have expired.
    """
    CAUSE_MESSAGE = 'The cookies provided are not valid (may have expired)'
|
90 |
+
|
91 |
+
|
92 |
+
class FailedToCreateConsentCookie(CouldNotRetrieveTranscript):
    """
    Raised if consent to saving cookies could not be given automatically.
    """
    CAUSE_MESSAGE = 'Failed to automatically give consent to saving cookies'
|
94 |
+
|
95 |
+
|
96 |
+
class NoTranscriptFound(CouldNotRetrieveTranscript):
    """
    Raised if no transcript was found for any of the requested language codes. The message lists the requested
    language codes followed by the string representation of the given transcript data.
    """
    CAUSE_MESSAGE = (
        'No transcripts were found for any of the requested language codes: {requested_language_codes}\n\n'
        '{transcript_data}'
    )

    def __init__(self, video_id, requested_language_codes, transcript_data):
        """
        :param video_id: the id of the video no transcript was found for
        :param requested_language_codes: the language codes which have been searched for
        :param transcript_data: object whose string representation is appended to the message
        """
        # both values have to be set first, as the base constructor reads `cause` to build the message
        self._requested_language_codes = requested_language_codes
        self._transcript_data = transcript_data
        super(NoTranscriptFound, self).__init__(video_id)

    @property
    def cause(self):
        transcript_description = str(self._transcript_data)
        return self.CAUSE_MESSAGE.format(
            requested_language_codes=self._requested_language_codes,
            transcript_data=transcript_description,
        )
|
youtube_transcript_api2/_html_unescaping.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
|
3 |
+
|
4 |
+
# Which branch is taken depends on the running python version, so this can only be tested by using different python
# versions. Therefore it is not covered by coverage.py
if sys.version_info.major == 3 and sys.version_info.minor >= 4:  # pragma: no cover
    # Python 3.4+
    from html import unescape
else:  # pragma: no cover
    if sys.version_info.major > 2:
        # Python 3.0 - 3.3
        import html.parser

        html_parser = html.parser.HTMLParser()
    else:
        # Python 2
        import HTMLParser

        html_parser = HTMLParser.HTMLParser()

    def unescape(string):
        """
        Replaces the HTML character references in the given string using the module level `html_parser`.
        """
        return html_parser.unescape(string)
|
youtube_transcript_api2/_settings.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
WATCH_URL = 'https://www.youtube.com/watch?v={video_id}'
|
youtube_transcript_api2/_transcripts.py
ADDED
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
|
3 |
+
# This can only be tested by using different python versions, therefore it is not covered by coverage.py
|
4 |
+
if sys.version_info.major == 2: # pragma: no cover
|
5 |
+
reload(sys)
|
6 |
+
sys.setdefaultencoding('utf-8')
|
7 |
+
|
8 |
+
import json
|
9 |
+
|
10 |
+
from xml.etree import ElementTree
|
11 |
+
|
12 |
+
import re
|
13 |
+
|
14 |
+
from requests import HTTPError
|
15 |
+
|
16 |
+
from ._html_unescaping import unescape
|
17 |
+
from ._errors import (
|
18 |
+
VideoUnavailable,
|
19 |
+
TooManyRequests,
|
20 |
+
YouTubeRequestFailed,
|
21 |
+
NoTranscriptFound,
|
22 |
+
TranscriptsDisabled,
|
23 |
+
NotTranslatable,
|
24 |
+
TranslationLanguageNotAvailable,
|
25 |
+
NoTranscriptAvailable,
|
26 |
+
FailedToCreateConsentCookie,
|
27 |
+
)
|
28 |
+
from ._settings import WATCH_URL
|
29 |
+
|
30 |
+
|
31 |
+
def _raise_http_errors(response, video_id):
|
32 |
+
try:
|
33 |
+
response.raise_for_status()
|
34 |
+
return response
|
35 |
+
except HTTPError as error:
|
36 |
+
raise YouTubeRequestFailed(error, video_id)
|
37 |
+
|
38 |
+
|
39 |
+
class TranscriptListFetcher(object):
    """
    Retrieves the watch page of a video and builds a `TranscriptList` from the captions data embedded in it.
    """
    def __init__(self, http_client):
        """
        :param http_client: http client which is used to make the http calls
        """
        self._http_client = http_client

    def fetch(self, video_id):
        """
        Fetches the watch page of the given video and builds the `TranscriptList` from it.
        """
        html = self._fetch_video_html(video_id)
        captions_json = self._extract_captions_json(html, video_id)
        return TranscriptList.build(self._http_client, video_id, captions_json)

    def _extract_captions_json(self, html, video_id):
        """
        Extracts the `playerCaptionsTracklistRenderer` object from the html of the watch page.

        :raises: TooManyRequests, VideoUnavailable, TranscriptsDisabled or NoTranscriptAvailable, depending on why
        the captions data could not be found
        """
        html_parts = html.split('"captions":')

        if len(html_parts) < 2:
            # without a captions section, the rest of the page tells why it is missing
            if 'class="g-recaptcha"' in html:
                raise TooManyRequests(video_id)
            if '"playabilityStatus":' not in html:
                raise VideoUnavailable(video_id)

            raise TranscriptsDisabled(video_id)

        # the captions object ends where the "videoDetails" key begins
        raw_captions = html_parts[1].split(',"videoDetails')[0].replace('\n', '')
        captions_json = json.loads(raw_captions).get('playerCaptionsTracklistRenderer')

        if captions_json is None:
            raise TranscriptsDisabled(video_id)
        if 'captionTracks' not in captions_json:
            raise NoTranscriptAvailable(video_id)

        return captions_json

    def _create_consent_cookie(self, html, video_id):
        """
        Sets the CONSENT cookie on the http client, using the value of the `v` field found in the given html.

        :raises: FailedToCreateConsentCookie if the html does not contain the field
        """
        match = re.search('name="v" value="(.*?)"', html)
        if match is None:
            raise FailedToCreateConsentCookie(video_id)

        consent_value = 'YES+' + match.group(1)
        self._http_client.cookies.set('CONSENT', consent_value, domain='.youtube.com')

    def _fetch_video_html(self, video_id):
        """
        Fetches the html of the watch page. If the consent form is returned instead, the consent cookie is created
        and the page is fetched once more.

        :raises: FailedToCreateConsentCookie if the consent form is still returned on the second attempt
        """
        consent_form_marker = 'action="https://consent.youtube.com/s"'

        html = self._fetch_html(video_id)
        if consent_form_marker not in html:
            return html

        self._create_consent_cookie(html, video_id)
        html = self._fetch_html(video_id)
        if consent_form_marker in html:
            raise FailedToCreateConsentCookie(video_id)

        return html

    def _fetch_html(self, video_id):
        """
        Requests the watch page of the given video and returns its text with the HTML character references unescaped.
        """
        watch_url = WATCH_URL.format(video_id=video_id)
        response = self._http_client.get(watch_url)
        return unescape(_raise_http_errors(response, video_id).text)
|
90 |
+
|
91 |
+
|
92 |
+
class TranscriptList(object):
|
93 |
+
"""
|
94 |
+
This object represents a list of transcripts. It can be iterated over to list all transcripts which are available
|
95 |
+
for a given YouTube video. Also it provides functionality to search for a transcript in a given language.
|
96 |
+
"""
|
97 |
+
def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
|
98 |
+
"""
|
99 |
+
The constructor is only for internal use. Use the static build method instead.
|
100 |
+
|
101 |
+
:param video_id: the id of the video this TranscriptList is for
|
102 |
+
:type video_id: str
|
103 |
+
:param manually_created_transcripts: dict mapping language codes to the manually created transcripts
|
104 |
+
:type manually_created_transcripts: dict[str, Transcript]
|
105 |
+
:param generated_transcripts: dict mapping language codes to the generated transcripts
|
106 |
+
:type generated_transcripts: dict[str, Transcript]
|
107 |
+
:param translation_languages: list of languages which can be used for translatable languages
|
108 |
+
:type translation_languages: list[dict[str, str]]
|
109 |
+
"""
|
110 |
+
self.video_id = video_id
|
111 |
+
self._manually_created_transcripts = manually_created_transcripts
|
112 |
+
self._generated_transcripts = generated_transcripts
|
113 |
+
self._translation_languages = translation_languages
|
114 |
+
|
115 |
+
@staticmethod
|
116 |
+
def build(http_client, video_id, captions_json):
|
117 |
+
"""
|
118 |
+
Factory method for TranscriptList.
|
119 |
+
|
120 |
+
:param http_client: http client which is used to make the transcript retrieving http calls
|
121 |
+
:type http_client: requests.Session
|
122 |
+
:param video_id: the id of the video this TranscriptList is for
|
123 |
+
:type video_id: str
|
124 |
+
:param captions_json: the JSON parsed from the YouTube pages static HTML
|
125 |
+
:type captions_json: dict
|
126 |
+
:return: the created TranscriptList
|
127 |
+
:rtype TranscriptList:
|
128 |
+
"""
|
129 |
+
translation_languages = [
|
130 |
+
{
|
131 |
+
'language': translation_language['languageName']['simpleText'],
|
132 |
+
'language_code': translation_language['languageCode'],
|
133 |
+
} for translation_language in captions_json['translationLanguages']
|
134 |
+
]
|
135 |
+
|
136 |
+
manually_created_transcripts = {}
|
137 |
+
generated_transcripts = {}
|
138 |
+
|
139 |
+
for caption in captions_json['captionTracks']:
|
140 |
+
if caption.get('kind', '') == 'asr':
|
141 |
+
transcript_dict = generated_transcripts
|
142 |
+
else:
|
143 |
+
transcript_dict = manually_created_transcripts
|
144 |
+
|
145 |
+
transcript_dict[caption['languageCode']] = Transcript(
|
146 |
+
http_client,
|
147 |
+
video_id,
|
148 |
+
caption['baseUrl'],
|
149 |
+
caption['name']['simpleText'],
|
150 |
+
caption['languageCode'],
|
151 |
+
caption.get('kind', '') == 'asr',
|
152 |
+
translation_languages if caption.get('isTranslatable', False) else []
|
153 |
+
)
|
154 |
+
|
155 |
+
return TranscriptList(
|
156 |
+
video_id,
|
157 |
+
manually_created_transcripts,
|
158 |
+
generated_transcripts,
|
159 |
+
translation_languages,
|
160 |
+
)
|
161 |
+
|
162 |
+
def __iter__(self):
|
163 |
+
return iter(list(self._manually_created_transcripts.values()) + list(self._generated_transcripts.values()))
|
164 |
+
|
165 |
+
def find_transcript(self, language_codes):
|
166 |
+
"""
|
167 |
+
Finds a transcript for a given language code. Manually created transcripts are returned first and only if none
|
168 |
+
are found, generated transcripts are used. If you only want generated transcripts use
|
169 |
+
`find_manually_created_transcript` instead.
|
170 |
+
|
171 |
+
:param language_codes: A list of language codes in a descending priority. For example, if this is set to
|
172 |
+
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
|
173 |
+
it fails to do so.
|
174 |
+
:type languages: list[str]
|
175 |
+
:return: the found Transcript
|
176 |
+
:rtype Transcript:
|
177 |
+
:raises: NoTranscriptFound
|
178 |
+
"""
|
179 |
+
return self._find_transcript(language_codes, [self._manually_created_transcripts, self._generated_transcripts])
|
180 |
+
|
181 |
+
def find_generated_transcript(self, language_codes):
|
182 |
+
"""
|
183 |
+
Finds a automatically generated transcript for a given language code.
|
184 |
+
|
185 |
+
:param language_codes: A list of language codes in a descending priority. For example, if this is set to
|
186 |
+
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
|
187 |
+
it fails to do so.
|
188 |
+
:type languages: list[str]
|
189 |
+
:return: the found Transcript
|
190 |
+
:rtype Transcript:
|
191 |
+
:raises: NoTranscriptFound
|
192 |
+
"""
|
193 |
+
return self._find_transcript(language_codes, [self._generated_transcripts,])
|
194 |
+
|
195 |
+
def find_manually_created_transcript(self, language_codes):
|
196 |
+
"""
|
197 |
+
Finds a manually created transcript for a given language code.
|
198 |
+
|
199 |
+
:param language_codes: A list of language codes in a descending priority. For example, if this is set to
|
200 |
+
['de', 'en'] it will first try to fetch the german transcript (de) and then fetch the english transcript (en) if
|
201 |
+
it fails to do so.
|
202 |
+
:type languages: list[str]
|
203 |
+
:return: the found Transcript
|
204 |
+
:rtype Transcript:
|
205 |
+
:raises: NoTranscriptFound
|
206 |
+
"""
|
207 |
+
return self._find_transcript(language_codes, [self._manually_created_transcripts,])
|
208 |
+
|
209 |
+
def _find_transcript(self, language_codes, transcript_dicts):
|
210 |
+
for language_code in language_codes:
|
211 |
+
for transcript_dict in transcript_dicts:
|
212 |
+
if language_code in transcript_dict:
|
213 |
+
return transcript_dict[language_code]
|
214 |
+
|
215 |
+
raise NoTranscriptFound(
|
216 |
+
self.video_id,
|
217 |
+
language_codes,
|
218 |
+
self
|
219 |
+
)
|
220 |
+
|
221 |
+
def __str__(self):
    """
    Builds a human readable overview of the available transcripts.

    The overview lists the manually created transcripts, the generated transcripts and the translation
    languages of this video, each section rendered by _get_language_description.
    """
    manually_created = self._get_language_description(
        str(transcript) for transcript in self._manually_created_transcripts.values()
    )
    generated = self._get_language_description(
        str(transcript) for transcript in self._generated_transcripts.values()
    )
    translations = self._get_language_description(
        '{language_code} ("{language}")'.format(
            language=entry['language'],
            language_code=entry['language_code'],
        ) for entry in self._translation_languages
    )

    template = (
        'For this video ({video_id}) transcripts are available in the following languages:\n\n'
        '(MANUALLY CREATED)\n'
        '{available_manually_created_transcript_languages}\n\n'
        '(GENERATED)\n'
        '{available_generated_transcripts}\n\n'
        '(TRANSLATION LANGUAGES)\n'
        '{available_translation_languages}'
    )
    return template.format(
        video_id=self.video_id,
        available_manually_created_transcript_languages=manually_created,
        available_generated_transcripts=generated,
        available_translation_languages=translations,
    )
|
245 |
+
|
246 |
+
def _get_language_description(self, transcript_strings):
    """
    Renders the given strings as an indented bullet list, one entry per line.

    :param transcript_strings: iterable of already formatted entries
    :return: the bullet list, or the string 'None' if there are no entries
    """
    bullet_lines = [' - {transcript}'.format(transcript=entry) for entry in transcript_strings]
    return '\n'.join(bullet_lines) or 'None'
|
249 |
+
|
250 |
+
|
251 |
+
class Transcript(object):
    """Metadata of a single transcript plus the means to fetch or translate it."""

    def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
        """
        You probably don't want to initialize this directly. Usually you'll access Transcript objects using a
        TranscriptList.

        :param http_client: http client which is used to make the transcript retrieving http calls
        :type http_client: requests.Session
        :param video_id: the id of the video this Transcript is for
        :type video_id: str
        :param url: the url which needs to be called to fetch the transcript
        :param language: the name of the language this transcript uses
        :param language_code: the language code of this transcript
        :param is_generated: flag stored as-is; presumably True for automatically generated transcripts
        :param translation_languages: iterable of dicts with the keys 'language' and 'language_code' describing
        the languages this transcript can be translated to
        """
        self._http_client = http_client
        self._url = url
        self.video_id = video_id
        self.language = language
        self.language_code = language_code
        self.is_generated = is_generated
        self.translation_languages = translation_languages
        # Lookup table: language code -> language name.
        self._translation_languages_dict = dict(
            (entry['language_code'], entry['language'])
            for entry in translation_languages
        )

    def fetch(self):
        """
        Loads the actual transcript data.

        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
        :rtype [{'text': str, 'start': float, 'duration': float}]:
        """
        response = self._http_client.get(self._url)
        raw_xml = _raise_http_errors(response, self.video_id).text
        return _TranscriptParser().parse(raw_xml)

    def __str__(self):
        """Returns 'code ("language")', suffixed with '[TRANSLATABLE]' if translations are available."""
        suffix = '[TRANSLATABLE]' if self.is_translatable else ''
        return '{language_code} ("{language}"){translation_description}'.format(
            language_code=self.language_code,
            language=self.language,
            translation_description=suffix,
        )

    @property
    def is_translatable(self):
        """True if at least one translation language is available."""
        return len(self.translation_languages) != 0

    def translate(self, language_code):
        """
        Creates a Transcript object for the translation of this transcript into another language.

        :param language_code: code of the language to translate to
        :return: a new Transcript whose url requests the translation; it is marked as generated and has no
        translation languages of its own
        :raises: NotTranslatable if this transcript has no translation languages
        :raises: TranslationLanguageNotAvailable if language_code is not one of the translation languages
        """
        if not self.is_translatable:
            raise NotTranslatable(self.video_id)

        if language_code not in self._translation_languages_dict:
            raise TranslationLanguageNotAvailable(self.video_id)

        translated_url = '{url}&tlang={language_code}'.format(url=self._url, language_code=language_code)
        return Transcript(
            http_client=self._http_client,
            video_id=self.video_id,
            url=translated_url,
            language=self._translation_languages_dict[language_code],
            language_code=language_code,
            is_generated=True,
            translation_languages=[],
        )
|
318 |
+
|
319 |
+
|
320 |
+
class _TranscriptParser(object):
    """Turns the XML of a transcript into a list of snippet dictionaries."""

    # Matches anything that looks like a markup tag, so it can be stripped from the snippet text.
    HTML_TAG_REGEX = re.compile(r'<[^>]*>', re.IGNORECASE)

    def parse(self, plain_data):
        """
        Parses the transcript XML.

        Child elements without text are skipped. A missing 'dur' attribute yields a duration of 0.0.

        :param plain_data: the XML document as a string
        :return: a list of dictionaries containing the 'text', 'start' and 'duration' keys
        """
        snippets = []
        for node in ElementTree.fromstring(plain_data):
            if node.text is None:
                continue
            snippets.append({
                'text': self.HTML_TAG_REGEX.sub('', unescape(node.text)),
                'start': float(node.attrib['start']),
                'duration': float(node.attrib.get('dur', '0.0')),
            })
        return snippets
|
youtube_transcript_api2/formatters.py
ADDED
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
|
3 |
+
import pprint
|
4 |
+
|
5 |
+
|
6 |
+
class Formatter(object):
    """Abstract base class for transcript formatters.

    Concrete formatters inherit from this class and override both
    .format_transcript() and .format_transcripts(), each of which
    should return a string. A transcript is represented by a List
    of Dictionary items.
    """

    def format_transcript(self, transcript, **kwargs):
        """Format a single transcript; must be overridden by subclasses."""
        message = ('A subclass of Formatter must implement '
                   'their own .format_transcript() method.')
        raise NotImplementedError(message)

    def format_transcripts(self, transcripts, **kwargs):
        """Format a list of transcripts; must be overridden by subclasses."""
        message = ('A subclass of Formatter must implement '
                   'their own .format_transcripts() method.')
        raise NotImplementedError(message)
|
21 |
+
|
22 |
+
|
23 |
+
class PrettyPrintFormatter(Formatter):
    """Formatter that renders transcripts with pprint.pformat."""

    def format_transcript(self, transcript, **kwargs):
        """Pretty prints a transcript.

        :param transcript: the transcript to render
        :param kwargs: forwarded to pprint.pformat
        :return: A pretty printed string representation of the transcript.
        :rtype str
        """
        rendered = pprint.pformat(transcript, **kwargs)
        return rendered

    def format_transcripts(self, transcripts, **kwargs):
        """Pretty prints a list of transcripts.

        The list is rendered as one object, exactly like a single transcript.

        :param transcripts: the transcripts to render
        :param kwargs: forwarded to pprint.pformat
        :return: A pretty printed string representation of the transcripts.
        :rtype str
        """
        rendered = self.format_transcript(transcripts, **kwargs)
        return rendered
|
41 |
+
|
42 |
+
|
43 |
+
class JSONFormatter(Formatter):
    """Formatter that serializes transcripts with json.dumps."""

    def format_transcript(self, transcript, **kwargs):
        """Converts a transcript into a JSON string.

        :param transcript: the transcript to serialize
        :param kwargs: forwarded to json.dumps
        :return: A JSON string representation of the transcript.
        :rtype str
        """
        serialized = json.dumps(transcript, **kwargs)
        return serialized

    def format_transcripts(self, transcripts, **kwargs):
        """Converts a list of transcripts into a JSON string.

        The list is serialized as one object, exactly like a single transcript.

        :param transcripts: the transcripts to serialize
        :param kwargs: forwarded to json.dumps
        :return: A JSON string representation of the transcripts.
        :rtype str
        """
        serialized = self.format_transcript(transcripts, **kwargs)
        return serialized
|
61 |
+
|
62 |
+
|
63 |
+
class TextFormatter(Formatter):
    """Formatter that keeps only the text of each transcript line."""

    def format_transcript(self, transcript, **kwargs):
        """Converts a transcript into plain text with no timestamps.

        :param transcript: iterable of dictionaries with a 'text' key
        :param kwargs: accepted for interface compatibility, not used
        :return: all transcript text lines separated by newline breaks.
        :rtype str
        """
        text_lines = [entry['text'] for entry in transcript]
        return '\n'.join(text_lines)

    def format_transcripts(self, transcripts, **kwargs):
        """Converts a list of transcripts into plain text with no timestamps.

        :param transcripts: iterable of transcripts
        :param kwargs: forwarded to format_transcript
        :return: the plain text of every transcript, separated from the next by two blank lines.
        :rtype str
        """
        rendered = []
        for transcript in transcripts:
            rendered.append(self.format_transcript(transcript, **kwargs))
        return '\n\n\n'.join(rendered)
|
81 |
+
|
82 |
+
|
83 |
+
class WebVTTFormatter(Formatter):
|
84 |
+
def _seconds_to_timestamp(self, time):
    """Helper that converts `time` into a transcript cue timestamp.

    The value is first rounded to whole milliseconds and then split with
    integer arithmetic. Formatting the fractional seconds directly with
    '{:02.0f}' would round the seconds field up whenever the fraction is
    >= 0.5 (6.93 -> '00:00:07.930', 59.6 -> '00:00:60.600').

    :reference: https://www.w3.org/TR/webvtt1/#webvtt-timestamp

    :param time: a float (or anything float() accepts) representing time in seconds.
    :type time: float
    :return: a string formatted as a cue timestamp, 'HH:MM:SS.MS'
    :rtype str
    :example:
    >>> self._seconds_to_timestamp(6.93)
    '00:00:06.930'
    """
    # int() keeps this working where round() returns a float (Python 2).
    total_ms = int(round(float(time) * 1000))
    total_secs, ms = divmod(total_ms, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, mins = divmod(total_mins, 60)
    return "{:02d}:{:02d}:{:02d}.{:03d}".format(hours, mins, secs, ms)
|
102 |
+
|
103 |
+
def format_transcript(self, transcript, **kwargs):
    """A basic implementation of WEBVTT formatting.

    Every cue ends where the next cue starts, because using the duration
    would make consecutive cues overlap. Only the last cue, which has no
    successor, ends at its start plus its duration.

    :param transcript: sequence of dictionaries with 'text', 'start' and 'duration' keys
    :param kwargs: accepted for interface compatibility, not used
    :return: the WEBVTT document as a string
    :reference: https://www.w3.org/TR/webvtt1/#introduction-caption
    """
    last_index = len(transcript) - 1
    cues = []
    for index, entry in enumerate(transcript):
        if index == last_index:
            # Reached the end, cannot look ahead, fall back to the duration.
            end = entry['start'] + entry['duration']
        else:
            end = transcript[index + 1]['start']
        timing = "{} --> {}".format(
            self._seconds_to_timestamp(entry['start']),
            self._seconds_to_timestamp(end),
        )
        cues.append("{}\n{}".format(timing, entry['text']))

    return "WEBVTT\n\n" + "\n\n".join(cues) + "\n"
|
128 |
+
|
129 |
+
def format_transcripts(self, transcripts, **kwargs):
    """A basic implementation of WEBVTT formatting for a list of transcripts.

    Each transcript is formatted on its own and the results are joined,
    separated by two blank lines.

    :param transcripts: iterable of transcripts
    :param kwargs: forwarded to format_transcript
    :reference: https://www.w3.org/TR/webvtt1/#introduction-caption
    """
    documents = []
    for transcript in transcripts:
        documents.append(self.format_transcript(transcript, **kwargs))
    return '\n\n\n'.join(documents)
|
136 |
+
|
137 |
+
|
138 |
+
class FormatterLoader(object):
    """Maps formatter type names to Formatter classes and instantiates them."""

    # Supported formatter type names and the classes implementing them.
    TYPES = {
        'json': JSONFormatter,
        'pretty': PrettyPrintFormatter,
        'text': TextFormatter,
        'webvtt': WebVTTFormatter,
    }

    class UnknownFormatterType(Exception):
        """Raised when a formatter type is requested that is not in TYPES."""

        def __init__(self, formatter_type):
            supported = ', '.join(FormatterLoader.TYPES.keys())
            message = (
                'The format \'{formatter_type}\' is not supported. '
                'Choose one of the following formats: {supported_formatter_types}'
            ).format(
                formatter_type=formatter_type,
                supported_formatter_types=supported,
            )
            super(FormatterLoader.UnknownFormatterType, self).__init__(message)

    def load(self, formatter_type='pretty'):
        """
        Loads the Formatter for the given formatter type.

        :param formatter_type: one of the keys of TYPES
        :return: Formatter object
        :raises: FormatterLoader.UnknownFormatterType if formatter_type is not a key of TYPES
        """
        if formatter_type in FormatterLoader.TYPES:
            return FormatterLoader.TYPES[formatter_type]()
        raise FormatterLoader.UnknownFormatterType(formatter_type)
|