Lookimi committed on
Commit
bda77e6
1 Parent(s): 9337fa5

Create App.py

Browse files
Files changed (1) hide show
  1. App.py +59 -0
App.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ #changing the code to have a Gradio Blocks App Menu on Huggingface Space prompting the channel URL
4
+ #importing the necessary modules
5
+ import os
6
+ import urllib.request
7
+ import re
8
+ import time
9
+ import gradio as gr
10
+
11
+ #Creating a Gradio App Menu
12
def _read_page(url):
    """Fetch *url* and return its body as text.

    The response is closed as soon as it has been read; bytes that are not
    valid UTF-8 are replaced instead of raising.
    """
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8", errors="replace")


def _first_match(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or None."""
    found = re.search(pattern, text)
    if found is None:
        return None
    # Links copied out of HTML markup carry '&amp;' where the URL needs '&'.
    return found.group(1).replace("&amp;", "&")


def _save_transcript(site, video_link, file_name):
    """Download the transcript for *video_link* to *file_name*.

    Returns True when a transcript link was found and saved, False when the
    video page (or the transcript page it points to) exposes no such link.
    """
    # The character classes include '=', '&', '%' and ';' so that a whole
    # query string is captured; without them every match stops at the first
    # parameter name and the resulting URL is useless.
    # NOTE(review): both endpoints are scraped from page markup - confirm
    # that the pages still expose them.
    editor_link = _first_match(
        r'(\/timedtext_editor\?[A-Za-z0-9_.=&%;-]*)',
        _read_page(site + video_link),
    )
    if editor_link is None:
        return False

    download_link = _first_match(
        r'(\/api\/timedtext\?[A-Za-z0-9_.=&%;-]*)',
        _read_page(site + editor_link),
    )
    if download_link is None:
        return False

    urllib.request.urlretrieve(site + download_link, file_name)
    return True


def transcript_extract(channel_url=""):
    """Download the transcripts of the videos linked from a channel page.

    Every distinct ``/watch?v=...`` link found on the page at *channel_url*
    is visited once; when a transcript link is found it is saved as
    ``Transcripts/<video id>.xml``.

    Args:
        channel_url: Address of the page to scan. It is a parameter (with a
            default, so a call without arguments still works) because the
            value has to come from the caller; a UI component object cannot
            be opened as a URL.

    Returns:
        A text summary with one line per video, or a short message when the
        URL is empty, cannot be opened, or contains no video links.
    """
    site = 'http://www.youtube.com'
    watch_prefix = '/watch?v='
    out_dir = 'Transcripts'

    channel_url = (channel_url or "").strip()
    if not channel_url:
        return "Please enter a channel URL."

    try:
        channel_html = _read_page(channel_url)
    except (ValueError, OSError) as err:
        # ValueError: malformed URL; OSError covers urllib's URLError/HTTPError.
        return "Could not open channel URL: " + str(err)

    # exist_ok: a second run must not fail because the folder is already there.
    os.makedirs(out_dir, exist_ok=True)

    # The same video is usually linked several times on one page;
    # dict.fromkeys drops the repeats while keeping first-seen order.
    video_links = dict.fromkeys(
        re.findall(r'(\/watch\?v=[A-Za-z0-9_.-]*)', channel_html)
    )

    report = []
    for link in video_links:
        video_id = link[len(watch_prefix):]
        if not video_id:
            continue

        file_name = os.path.join(out_dir, video_id + ".xml")
        try:
            saved = _save_transcript(site, link, file_name)
        except (ValueError, OSError) as err:
            # One unreachable video should not abort the rest of the run.
            message = "Failed to fetch video " + video_id + ": " + str(err)
        else:
            if saved:
                message = "Downloading transcript for video " + video_id + "..."
            else:
                message = "Transcript not available for video " + video_id

        print(message)
        report.append(message)

        if message.startswith("Downloading"):
            # Pause between downloads to avoid hammering the server.
            time.sleep(3)

    if not report:
        return "No video links found at " + channel_url
    return "\n".join(report)
57
+
58
# Build the Gradio UI and serve it: one textbox feeds the channel URL to
# transcript_extract, and a second textbox shows whatever the function returns.
# `outputs` is given explicitly because gr.Interface needs an output component;
# `force_reload` was dropped because it is not a documented Interface parameter.
# NOTE(review): share=True is kept from the original; it is likely unnecessary
# when the app is hosted on a Hugging Face Space - confirm.
gr.Interface(fn=transcript_extract, inputs="textbox", outputs="textbox").launch(share=True)