lanbogao committed on
Commit
ba9fae4
1 Parent(s): 093a866

Update readme.

Browse files
Files changed (4) hide show
  1. README.md +1 -1
  2. fetchYoutubeSubtitle.py +67 -87
  3. main.py +10 -4
  4. requirements.txt +1 -1
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Docker Fastapi
3
  emoji: 🐢
4
  colorFrom: purple
5
  colorTo: blue
 
1
  ---
2
+ title: Ytdlp Subtitle
3
  emoji: 🐢
4
  colorFrom: purple
5
  colorTo: blue
fetchYoutubeSubtitle.py CHANGED
@@ -1,97 +1,77 @@
1
  import json
2
- import os
3
- import requests
4
- import logging
5
 
6
- logger = logging.getLogger(__name__)
 
 
 
 
 
 
7
 
8
- from typing import List, Dict
9
-
10
- SUBTITLE_DOWNLOADER_URL = 'https://savesubs.com'
11
-
12
def fetchYoutubeSubtitleUrls(video_id):
    """Ask savesubs.com which subtitle formats exist for a YouTube video.

    Args:
        video_id: YouTube video id; interpolated into the watch URL that is
            posted to the ``/action/extract`` endpoint.

    Returns:
        On success, ``{'title': ..., 'subtitleList': ...}`` taken from the
        ``title`` and ``formats`` fields of the service's ``response`` object.
        On any failure, ``{'title': None, 'subtitleList': None, 'error': ...}``.
    """
    headers = {
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
        'cache-control': 'no-cache',
        'Content-Type': 'application/json; charset=UTF-8',
        'pragma': 'no-cache',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
        'x-auth-token': os.environ.get('SAVESUBS_X_AUTH_TOKEN', ''),
        'x-requested-domain': 'savesubs.com',
        'X-requested-with': 'xmlhttprequest',
        'sec-ch-ua': '"Google Chrome";v="111", "Not(A:Brand";v="8", "Chromium";v="111"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': 'Linux',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'authority': 'savesubs.com',
        'origin': 'https://savesubs.com',
        'referer': f'https://savesubs.com/process?url=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3D{video_id}'
    }
    payload = {
        'data': {'url': f'https://www.youtube.com/watch?v={video_id}'}
    }
    proxy = os.environ.get('PROXY', None)

    try:
        # The context manager closes the session's connection pool; the
        # timeout keeps a stalled upstream from hanging the caller forever.
        with requests.Session() as session:
            if proxy:
                session.proxies = {
                    "http": proxy,
                    "https": proxy,
                }
            response = session.post(
                SUBTITLE_DOWNLOADER_URL + '/action/extract',
                json=payload,
                headers=headers,
                timeout=30,
            )
    except requests.RequestException as error:
        # Report transport failures in the same shape as HTTP failures.
        logger.error(error)
        return {'title': None, 'subtitleList': None, 'error': str(error)}

    if response.status_code != 200:
        logger.error("response.status_code: {}".format(response.status_code))
        return {'title': None, 'subtitleList': None, 'error': response.reason}

    try:
        # Named `extracted` so the imported `json` module is not shadowed.
        extracted = response.json().get('response', {})
        logger.info('subtitle url json: {}'.format(extracted))
        return {'title': extracted.get('title'), 'subtitleList': extracted.get('formats')}
    except Exception as error:
        logger.error(error)
        return {'title': None, 'subtitleList': None, 'error': str(error)}
59
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
async def find(subtitleList: List[Dict], args: Dict) -> Dict:
    """Return the first subtitle entry whose field equals the wanted value.

    Only the first key/value pair of ``args`` is used as the filter.

    Args:
        subtitleList: Candidate subtitle entries (dicts).
        args: Mapping whose first item is the ``{field: wanted_value}`` filter.

    Returns:
        The first matching entry, or ``None`` when nothing matches or when
        ``args`` is empty.
    """
    if not args:
        # No filter given: nothing can match (previously raised IndexError).
        return None
    key = next(iter(args))
    wanted = args[key]
    return next((entry for entry in subtitleList if entry.get(key, None) == wanted), None)
64
-
65
async def fetchYoutubeSubtitle(videoId: str) -> Dict:
    """Download one subtitle track for a YouTube video as SRT text.

    The track is chosen by its ``quality`` label in this order: "English",
    "English (auto", "zh-CN", otherwise the first listed track.

    Args:
        videoId: YouTube video id.

    Returns:
        ``{"title": ..., "subtitle": <srt text>}`` on success, or
        ``{"title": ..., "subtitle": None, "error": ...}`` when no track is
        listed or the download fails.
    """
    result = fetchYoutubeSubtitleUrls(videoId)
    title = result["title"]
    subtitleList = result["subtitleList"]
    error = result.get("error", None)

    if not subtitleList:
        return {"title": title, "subtitle": None, "error": error}

    # NOTE(review): `find` compares labels for exact equality, so
    # "English (auto" only matches a label spelled exactly that way - confirm
    # against the labels the service actually returns.
    betterSubtitle = (
        await find(subtitleList, {"quality": "English"})
        or await find(subtitleList, {"quality": "English (auto"})
        or await find(subtitleList, {"quality": "zh-CN"})
        or subtitleList[0]
    )

    subtitleUrl = f"{SUBTITLE_DOWNLOADER_URL}{betterSubtitle['url']}?ext=srt"
    proxy = os.environ.get('PROXY', None)

    try:
        # Close the session when done and bound the wait with a timeout.
        with requests.Session() as session:
            if proxy:
                session.proxies = {
                    "http": proxy,
                    "https": proxy,
                }
            response = session.get(subtitleUrl, timeout=30)
    except requests.RequestException as download_error:
        return {"title": title, "subtitle": None, "error": str(download_error)}

    if response.status_code != 200:
        # Do not hand an HTTP error page back to the caller as subtitle text.
        return {"title": title, "subtitle": None, "error": response.reason}

    return {"title": title, "subtitle": response.text}
 
1
  import json
2
+ from typing import Optional
3
+ import yt_dlp
 
4
 
5
def getVttUrlFromSubtitles(item, lang='en', vttType="vtt"):
    """Pick the URL of one subtitle track out of a language -> tracks mapping.

    Args:
        item: Mapping of language code to a list of track dicts; each track
            is read through its ``"ext"`` and ``"url"`` keys.
        lang: Preferred language code. If it has no tracks, ``'en'`` is tried,
            then the first language in the mapping.
        vttType: Wanted track format, compared against each track's ``"ext"``.

    Returns:
        The ``"url"`` of the first track of the chosen language whose ``"ext"``
        equals ``vttType``, or ``None`` when there is no such track or the
        mapping is empty.
    """
    if not item:
        return None

    # Language fallback chain: requested -> English -> whatever comes first.
    if item.get(lang):
        key = lang
    elif item.get('en'):
        key = 'en'
    else:
        key = next(iter(item))

    for track in item[key] or []:
        if track.get("ext") == vttType:
            return track.get("url")
    return None
12
 
13
def getSubtitle(url: str, lang: Optional[str] = 'en', vttType="vtt") -> Optional[str]:
    """Fetch the text of one subtitle track for a video without downloading it.

    Uploaded subtitles are preferred; automatic captions are used when no
    uploaded track in the requested format can be found.

    Args:
        url: Video page URL handed to yt-dlp.
        lang: Preferred subtitle language; ``None`` leaves the language list
            passed to yt-dlp empty.
        vttType: Subtitle format to look for among the available tracks.

    Returns:
        The decoded subtitle file contents, or ``None`` when no matching
        track exists.
    """
    ydl_opts = {
        "writesubtitles": True,
        "allsubtitles": True,
        "subtitleslangs": [lang] if lang else [],
        "skip_download": True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=False)
        if not info_dict:
            return None

        # Try uploaded subtitles first, then automatic captions.
        for source in ("subtitles", "automatic_captions"):
            tracks = info_dict.get(source)
            if not tracks:
                continue
            subtitle_url = getVttUrlFromSubtitles(tracks, lang, vttType)
            if not subtitle_url:
                # No track in the wanted format here; never open a None URL.
                continue
            with ydl.urlopen(subtitle_url) as subtitle:
                return subtitle.read().decode()
    return None
33
 
34
def fetchSubtitleUrls(url: str) -> Optional[dict]:
    """Return the subtitle track listing yt-dlp reports for a video.

    Args:
        url: Video page URL handed to yt-dlp.

    Returns:
        The ``"subtitles"`` mapping from the extracted info, unless it is
        empty or its only entry is ``"live_chat"``; in that case the
        ``"automatic_captions"`` mapping. ``None`` when neither is available.
    """
    ydl_opts = {
        "writesubtitles": True,
        "allsubtitles": True,
        "skip_download": True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=False)

    if not info_dict:
        return None

    subtitles = info_dict.get("subtitles")
    # A lone "live_chat" entry is skipped in favour of automatic captions
    # (presumably a chat replay rather than spoken captions - confirm).
    if subtitles and not (len(subtitles) == 1 and "live_chat" in subtitles):
        return subtitles

    automatic_captions = info_dict.get("automatic_captions")
    if automatic_captions:
        return automatic_captions
    return None
50
 
51
def get_subtitle(url, lang='en'):
    """Download a video's subtitle file with yt-dlp and return it tag-free.

    yt-dlp writes the subtitle into the current working directory using the
    ``%(id)s.%(ext)s`` template; the file ``<id>.<lang>.vtt`` is then read
    back and every ``<...>`` tag is stripped from its text.

    Args:
        url: Video page URL handed to yt-dlp.
        lang: Subtitle language code; ``None`` is treated as ``'en'``.

    Returns:
        The subtitle text without markup tags, or ``None`` when the video id
        is missing or anything fails (the error is printed).
    """
    # Function-scope import: the module does not import `re` at the top.
    import re

    if lang is None:
        lang = 'en'
    # Download subtitles if available
    ydl_opts = {
        'writesubtitles': True,
        'outtmpl': '%(id)s.%(ext)s',
        'subtitleslangs': [lang],
        'skip_download': True,
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info_dict = ydl.extract_info(url, download=True)

        video_id = info_dict.get("id", None) if info_dict else None
        if video_id is None:
            return None

        subtitle_file = f"{video_id}.{lang}.vtt"
        with open(subtitle_file, 'r', encoding='utf-8') as f:
            subtitle_content = f.read()
        return re.sub(r"<[^>]+>", "", subtitle_content)
    except Exception as error:
        # Deliberately broad: this helper is best-effort and reports any
        # extraction or file error as "no subtitle".
        print(error)
        return None
main.py CHANGED
@@ -1,6 +1,6 @@
1
  from fastapi import FastAPI
2
  from fastapi.responses import JSONResponse
3
- from fetchYoutubeSubtitle import fetchYoutubeSubtitle
4
 
5
  app = FastAPI()
6
 
@@ -15,6 +15,12 @@ def read_json():
15
 
16
 
17
  @app.get("/subtitle/")
18
async def get_subtitle(vid: str):
    """Serve the subtitle lookup result for YouTube video ``vid`` as JSON."""
    return JSONResponse(content=await fetchYoutubeSubtitle(vid))
 
 
 
 
 
 
 
1
  from fastapi import FastAPI
2
  from fastapi.responses import JSONResponse
3
+ from fetchYoutubeSubtitle import getSubtitle, fetchSubtitleUrls
4
 
5
  app = FastAPI()
6
 
 
15
 
16
 
17
  @app.get("/subtitle/")
18
def get_subtitle(url: str):
    """Serve the subtitle text for the video at ``url`` as a JSON response.

    The JSON body is the subtitle text, or ``null`` when none was found.
    """
    # Plain `def` on purpose: getSubtitle is a synchronous, blocking call and
    # cannot be awaited. FastAPI runs non-async path functions in a worker
    # thread, so the event loop is not blocked.
    subtitle = getSubtitle(url)
    return JSONResponse(content=subtitle)
21
+
22
+
23
+ @app.get("/subtitle-urls/")
24
def get_subtitleUrls(url: str):
    """Serve the subtitle track listing for the video at ``url`` as JSON.

    The JSON body is the mapping returned by ``fetchSubtitleUrls``, or
    ``null`` when the video has no subtitle tracks.
    """
    # Plain `def` on purpose: fetchSubtitleUrls is synchronous and returns a
    # plain value, so awaiting it raises TypeError. FastAPI runs non-async
    # path functions in a worker thread.
    subtitles = fetchSubtitleUrls(url)
    return JSONResponse(content=subtitles)
requirements.txt CHANGED
@@ -4,4 +4,4 @@ sentencepiece==0.1.*
4
  torch==1.11.*
5
  transformers==4.*
6
  uvicorn[standard]==0.17.*
7
- langchain
 
4
  torch==1.11.*
5
  transformers==4.*
6
  uvicorn[standard]==0.17.*
7
+ yt-dlp