smakamali committed on
Commit 7c42e43
1 Parent(s): 7330707

updating the app to v1.2

Files changed (1):
  1. app.py +306 -141
app.py CHANGED
@@ -1,52 +1,116 @@
-def transcribe_youtube_video(url, force_transcribe=False):
-
+import os
+save_dir="./docs/youtube/"
+if not os.path.exists(save_dir):
+    os.mkdir(save_dir)
+
+transcription_model_id = "openai/whisper-large"
+llm_model_id = "tiiuae/falcon-7b-instruct"
+
+from youtube_transcript_api import YouTubeTranscriptApi
+import pytube
+
+# get the transcript from YouTube
+def get_yt_transcript(url):
     text = ''
-    try:
-        from youtube_transcript_api import YouTubeTranscriptApi
-        import pytube
-        from pytube import YouTube
-
-        vid_id = pytube.extract.video_id(url)
-        temp = YouTubeTranscriptApi.get_transcript(vid_id)
-        for t in temp:
-            text+=t['text']+' '
-        yt = YouTube(str(url))
-
-    except:
-        pass
+    vid_id = pytube.extract.video_id(url)
+    temp = YouTubeTranscriptApi.get_transcript(vid_id)
+    for t in temp:
+        text+=t['text']+' '
+    return text
 
-    if text == '' or force_transcribe:
-        from pytube import YouTube
-        import torch
-        import os
-
-        save_dir="./docs/youtube/"
-        os.mkdir(save_dir)
-        yt = YouTube(str(url))
-        audio = yt.streams.filter(only_audio = True).first()
-        out_file = audio.download(filename="audio.mp3",output_path = save_dir)
-
-        import transformers
+from pytube import YouTube
+import transformers
+import torch
 
-        whisper_asr = transformers.pipeline(
-            "automatic-speech-recognition", model="openai/whisper-large", device_map= 'auto',
-        )
-
-        whisper_asr.model.config.forced_decoder_ids = (
-            whisper_asr.tokenizer.get_decoder_prompt_ids(
-                language="en",
-                task="transcribe"
-            )
+# transcribes the video
+def transcribe_yt_vid(url):
+    # download YouTube video's audio
+    yt = YouTube(str(url))
+    audio = yt.streams.filter(only_audio = True).first()
+    out_file = audio.download(filename="audio.mp3",
+                              output_path = save_dir)
+
+    # defining an automatic-speech-recognition pipeline
+    asr = transformers.pipeline(
+        "automatic-speech-recognition",
+        model=transcription_model_id,
+        device_map= 'auto',
+    )
+
+    # setting model config parameters
+    asr.model.config.forced_decoder_ids = (
+        asr.tokenizer.get_decoder_prompt_ids(
+            language="en",
+            task="transcribe"
         )
-        temp = whisper_asr(out_file,chunk_length_s=20)
-        text = temp['text']
+    )
 
-        del(whisper_asr)
-        torch.cuda.empty_cache()
+    # invoking the Whisper model
+    temp = asr(out_file,chunk_length_s=20)
+    text = temp['text']
 
-        return yt.title, text
+    # we can do this at the end to release GPU memory
+    del(asr)
+    torch.cuda.empty_cache()
+
+    return text
 
-def summarize_text(title,text):
+from pytube import YouTube
+from huggingface_hub import InferenceClient
+
+# transcribes the video using the Hugging Face Hub API
+def transcribe_yt_vid_api(url,api_token):
+    # download YouTube video's audio
+    yt = YouTube(str(url))
+    audio = yt.streams.filter(only_audio = True).first()
+    out_file = audio.download(filename="audio.mp3",
+                              output_path = save_dir)
+
+    # Initialize client for the Whisper model
+    client = InferenceClient(model=transcription_model_id,
+                             token=api_token)
+
+    import librosa
+    import soundfile as sf
+
+    text = ''
+    t=20 # audio chunk length in seconds
+    x, sr = librosa.load(out_file, sr=None)
+    # This gives x as an audio array in numpy and sr as the original sampling rate.
+    # The audio needs to be split into 20-second chunks since the API call truncates the response.
+    for i in range(0, len(x)//(t * sr)):
+        y = x[t * sr * i: t * sr *(i+1)]
+        split_path = save_dir+"audio_split.mp3"
+        sf.write(split_path, y, sr)
+        text += client.automatic_speech_recognition(split_path)
+
+    return text
+
+def transcribe_youtube_video(url, force_transcribe=False,use_api=False,api_token=None):
+
+    yt = YouTube(str(url))
+    text = ''
+    # get the transcript from YouTube if available
+    try:
+        text = get_yt_transcript(url)
+    except:
+        pass
+
+    # transcribes the video if YouTube did not provide a transcription
+    # or if you want to force_transcribe anyway
+    if text == '' or force_transcribe:
+        if use_api:
+            text = transcribe_yt_vid_api(url,api_token=api_token)
+            transcript_source = 'The transcript was generated using {} via the Hugging Face Hub API.'.format(transcription_model_id)
+        else:
+            text = transcribe_yt_vid(url)
+            transcript_source = 'The transcript was generated using {} hosted locally.'.format(transcription_model_id)
+    else:
+        transcript_source = 'The transcript was downloaded from YouTube.'
+
+    return yt.title, text, transcript_source
+
+def summarize_text(title,text,temperature,words,use_api=False,api_token=None,do_sample=False):
 
     from langchain.chains.llm import LLMChain
     from langchain.prompts import PromptTemplate
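Note on the new chunked API transcription above: `transcribe_yt_vid_api` splits the audio into 20-second windows because a single `InferenceClient.automatic_speech_recognition` call truncates long inputs. The floor division `len(x)//(t * sr)` does, however, skip a trailing partial window. A minimal standalone sketch of the same idea that also transcribes the remainder; the helper name and the WAV scratch file are illustrative, not part of the commit:

```python
import librosa
import soundfile as sf
from huggingface_hub import InferenceClient

def transcribe_in_chunks(audio_path, api_token, chunk_s=20):
    # hypothetical helper mirroring transcribe_yt_vid_api from this commit
    client = InferenceClient(model="openai/whisper-large", token=api_token)
    x, sr = librosa.load(audio_path, sr=None)  # keep the native sampling rate
    step = chunk_s * sr
    text = ''
    # stepping through the array (rather than floor-dividing its length)
    # also yields the final partial window, so no trailing audio is dropped
    for start in range(0, len(x), step):
        sf.write("chunk.wav", x[start:start + step], sr)  # WAV sidesteps mp3 codec support issues
        text += client.automatic_speech_recognition("chunk.wav")
    return text
```

As in the diff, this assumes the endpoint returns the transcribed text for each chunk as a plain string.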
@@ -56,108 +120,153 @@ def summarize_text(title,text):
     import transformers
     from transformers import BitsAndBytesConfig
     from transformers import AutoTokenizer, AutoModelForCausalLM
-
-    # quantization_config = BitsAndBytesConfig(
-    #     load_in_4bit=True,
-    #     bnb_4bit_compute_dtype=torch.float16,
-    #     bnb_4bit_quant_type="nf4",
-    #     bnb_4bit_use_double_quant=True,
-    # )
-
-    # model = "nomic-ai/gpt4all-falcon"
-    model = "tiiuae/falcon-7b-instruct"
-
-    tokenizer = AutoTokenizer.from_pretrained(model,trust_remote_code=True,)
-    model = AutoModelForCausalLM.from_pretrained(model,
-        # trust_remote_code=True,
-        # quantization_config=quantization_config,
-    )
-
+
     from langchain import HuggingFacePipeline
     import torch
 
-    pipeline = transformers.pipeline(
-        "text-generation",
-        model=model,
-        tokenizer=tokenizer,
-        torch_dtype=torch.bfloat16,
-        device_map="auto",
-        max_new_tokens = 150,
-        pad_token_id=tokenizer.eos_token_id,
-        # device=-1,
-    )
+    model_kwargs1 = {"temperature":temperature ,
+                     "do_sample":do_sample,
+                     "min_new_tokens":200-25,
+                     "max_new_tokens":200+25
+                     }
+    model_kwargs2 = {"temperature":temperature ,
+                     "do_sample":do_sample,
+                     "min_new_tokens":words-25,
+                     "max_new_tokens":words+25,
+                     'repetition_penalty':2.0
+                     }
+    if not do_sample:
+        del model_kwargs1["temperature"]
+        del model_kwargs2["temperature"]
+
+    if use_api:
+
+        from langchain import HuggingFaceHub
 
-    llm = HuggingFacePipeline(pipeline=pipeline)
-
-    pipeline2 = transformers.pipeline(
-        "text-generation",
-        model=model,
-        tokenizer=tokenizer,
-        torch_dtype=torch.bfloat16,
-        device_map="auto",
-        max_new_tokens = 250,
-        pad_token_id=tokenizer.eos_token_id,
-        repetition_penalty= 2.0,
-        # device=-1,
-    )
+        # os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_token
+        llm=HuggingFaceHub(
+            repo_id=llm_model_id, model_kwargs=model_kwargs1,
+            huggingfacehub_api_token=api_token
+        )
+        llm2=HuggingFaceHub(
+            repo_id=llm_model_id, model_kwargs=model_kwargs2,
+            huggingfacehub_api_token=api_token
+        )
+        summary_source = 'The summary was generated using {} via Hugging Face API.'.format(llm_model_id)
+
+    else:
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_use_double_quant=True,
+        )
+
+        tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
+        model = AutoModelForCausalLM.from_pretrained(llm_model_id,
+                                                     quantization_config=quantization_config)
+        model.to_bettertransformer()
+
+        pipeline = transformers.pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+            pad_token_id=tokenizer.eos_token_id,
+            **model_kwargs1,
+        )
+        pipeline2 = transformers.pipeline(
+            "text-generation",
+            model=model,
+            tokenizer=tokenizer,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+            pad_token_id=tokenizer.eos_token_id,
+            **model_kwargs2,
+        )
+        llm = HuggingFacePipeline(pipeline=pipeline)
+        llm2 = HuggingFacePipeline(pipeline=pipeline2)
 
-    llm2 = HuggingFacePipeline(pipeline=pipeline2)
+        summary_source = 'The summary was generated using {} hosted locally.'.format(llm_model_id)
 
     # Map
     map_template = """
-    Summarize the following text in a clear and concise way:
-    TITLE: `{title}`
-    TEXT:`{docs}`
-    Brief Summary:
+    You are an intelligent AI assistant that is tasked to review the content of a video and provide a concise and accurate summary.\n
+    You do not provide information that is not mentioned in the video. You only provide information that you are absolutely sure about.\n
+    Summarize the following text in a clear and concise way:\n
+    ----------------------- \n
+    TITLE: `{title}`\n
+    TEXT:\n
+    `{docs}`\n
+    ----------------------- \n
+    BRIEF SUMMARY:\n
     """
-    map_prompt = PromptTemplate(template = map_template,
-                                input_variables = ['title','docs'])
+    map_prompt = PromptTemplate(
+        template = map_template,
+        input_variables = ['title','docs']
+    )
     map_chain = LLMChain(llm=llm, prompt=map_prompt)
 
     # Reduce - Collapse
-    reduce_template = """
-    The following is set of partial summaries of a video titled {title}:
-    partial summaries: {doc_summaries}
-    Take these and distill them into a consolidated summary.
-    Summary:
+    collapse_template = """
+    You are an intelligent AI assistant that is tasked to review the content of a video and provide a concise and accurate summary.\n
+    You do not provide information that is not mentioned in the video. You only provide information that you are absolutely sure about.\n
+    The following is a set of partial summaries of a video:\n
+    ----------------------- \n
+    TITLE: `{title}`\n
+    PARTIAL SUMMARIES:\n
+    `{doc_summaries}`\n
+    ----------------------- \n
+    Take these and distill them into a consolidated summary.\n
+    SUMMARY:\n
     """
 
-    reduce_prompt = PromptTemplate(template = reduce_template,
-                                   input_variables = ['title','doc_summaries'])
-    reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)
+    collapse_prompt = PromptTemplate(
+        template = collapse_template,
+        input_variables = ['title','doc_summaries']
+    )
+    collapse_chain = LLMChain(llm=llm, prompt=collapse_prompt)
 
     # Takes a list of documents, combines them into a single string, and passes this to an LLMChain
     collapse_documents_chain = StuffDocumentsChain(
-        llm_chain=reduce_chain, document_variable_name="doc_summaries"
-    )
+        llm_chain=collapse_chain, document_variable_name="doc_summaries"
+    )
 
     # Final Reduce - Combine
-    final_reduce_template = """
-    The following is set of partial summaries of a video titled '{title}':
-    partial summaries:
-
-    {doc_summaries}
-
-    Generate a summary of the whole text that includes `Video Subject`, and the `Key Highlights` as maximum 10 pullet points listing the main facts, arguments, or points:
+    combine_template = """\n
+    You are an intelligent AI assistant that is tasked to review the content of a video and provide a concise and accurate summary.\n
+    You do not provide information that is not mentioned in the video. You only provide information that you are absolutely sure about.\n
+    The following is a set of partial summaries of a video:\n
+    ----------------------- \n
+    TITLE: `{title}`\n
+    PARTIAL SUMMARIES:\n
+    `{doc_summaries}`\n
+    ----------------------- \n
+    Generate an executive summary of the whole text in maximum {words} words that contains the main messages, points, and arguments presented in the video.\n
+    EXECUTIVE SUMMARY:\n
     """
-    final_reduce_prompt = PromptTemplate(template = final_reduce_template,
-                                         input_variables = ['title','doc_summaries'])
-    final_reduce_chain = LLMChain(llm=llm2, prompt=final_reduce_prompt)
+    combine_prompt = PromptTemplate(
+        template = combine_template,
+        input_variables = ['title','doc_summaries','words']
+    )
+    combine_chain = LLMChain(llm=llm2, prompt=combine_prompt)
 
     # Takes a list of documents, combines them into a single string, and passes this to an LLMChain
     combine_documents_chain = StuffDocumentsChain(
-        llm_chain=final_reduce_chain, document_variable_name="doc_summaries"
-    )
+        llm_chain=combine_chain, document_variable_name="doc_summaries"
+    )
 
-    # Combines and iteravely reduces the mapped documents
+    # Combines and iteratively reduces the mapped documents
     reduce_documents_chain = ReduceDocumentsChain(
         # This is final chain that is called.
         combine_documents_chain=combine_documents_chain,
         # If documents exceed context for `StuffDocumentsChain`
         collapse_documents_chain=collapse_documents_chain,
         # The maximum number of tokens to group documents into.
-        token_max=500,
-    )
+        token_max=800,
+    )
 
     # Combining documents by mapping a chain over them, then combining results
     map_reduce_chain = MapReduceDocumentsChain(
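The rewritten `summarize_text` wires up LangChain's map-reduce summarization: `map_chain` summarizes each transcript chunk, `collapse_chain` folds partial summaries together whenever they exceed `token_max`, and `combine_chain` writes the final executive summary. A self-contained sketch of the same wiring that runs offline using LangChain's `FakeListLLM` stand-in; the prompts and documents here are toy placeholders, not the commit's:

```python
from langchain.llms.fake import FakeListLLM
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
from langchain.docstore.document import Document

# canned responses instead of Falcon-7B, so the wiring runs without a GPU or token
llm = FakeListLLM(responses=["partial summary", "partial summary", "final summary"])

map_chain = LLMChain(llm=llm, prompt=PromptTemplate(
    template="Summarize `{docs}` from the video `{title}`:",
    input_variables=["title", "docs"]))
reduce_chain = LLMChain(llm=llm, prompt=PromptTemplate(
    template="Combine `{doc_summaries}` from the video `{title}`:",
    input_variables=["title", "doc_summaries"]))

stuff_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="doc_summaries")
reduce_documents_chain = ReduceDocumentsChain(
    combine_documents_chain=stuff_chain,   # final combine step
    collapse_documents_chain=stuff_chain,  # reused here; the commit uses a separate collapse prompt
    token_max=800)
map_reduce_chain = MapReduceDocumentsChain(
    llm_chain=map_chain,
    reduce_documents_chain=reduce_documents_chain,
    document_variable_name="docs")

docs = [Document(page_content="chunk one"), Document(page_content="chunk two")]
print(map_reduce_chain.run({"input_documents": docs, "title": "demo"}))
```

Extra keys passed to `run` (here `title`; `title` and `words` in the commit) are forwarded to every prompt in the chain, which is why the new templates can reference them.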
@@ -169,27 +278,27 @@ def summarize_text(title,text):
         document_variable_name="docs",
         # Return the results of the map steps in the output
         return_intermediate_steps=False,
-    )
+    )
 
     from langchain.document_loaders import TextLoader
     from langchain.text_splitter import TokenTextSplitter
 
-    with open('./docs/transcript.txt','w') as f:
+    with open(save_dir+'/transcript.txt','w') as f:
         f.write(text)
-    loader = TextLoader("./docs/transcript.txt")
+    loader = TextLoader(save_dir+"/transcript.txt")
     doc = loader.load()
-    text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=0)
+    text_splitter = TokenTextSplitter(chunk_size=800, chunk_overlap=100)
     docs = text_splitter.split_documents(doc)
 
-    summary = map_reduce_chain.run({'input_documents':docs, 'title':title})
+    summary = map_reduce_chain.run({'input_documents':docs, 'title':title, 'words':words})
 
-    # del(llm)
-    # del(llm2)
-    # del(model)
-    # del(tokenizer)
-    # torch.cuda.empty_cache()
+    try:
+        del(map_reduce_chain,reduce_documents_chain,combine_chain,collapse_documents_chain,map_chain,collapse_chain,llm,llm2,pipeline,pipeline2,model,tokenizer)
+    except:
+        pass
+    torch.cuda.empty_cache()
 
-    return summary
+    return summary, summary_source
 
 import gradio as gr
 import pytube
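The splitter settings above change from 500-token chunks with no overlap to 800-token chunks with a 100-token overlap, so sentences cut at a chunk boundary still appear whole in one of the two neighbouring chunks. `TokenTextSplitter` counts tokens, not characters. A quick sketch of the effect (chunk counts are approximate, since they depend on the tokenizer):

```python
from langchain.text_splitter import TokenTextSplitter

text = "word " * 3000  # roughly 3000 tokens of dummy transcript

old_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=0)
new_splitter = TokenTextSplitter(chunk_size=800, chunk_overlap=100)

# fewer, larger chunks mean fewer map calls, and the 100-token overlap
# duplicates a little text between neighbouring chunks to preserve context
print(len(old_splitter.split_text(text)))  # ~6 chunks (3000 / 500)
print(len(new_splitter.split_text(text)))  # ~5 chunks (step of 800 - 100 = 700)
```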
@@ -204,29 +313,85 @@ def get_video(url):
     embed_html = '<iframe width="100%" height="315" src="https://www.youtube.com/embed/{}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>'.format(vid_id)
     return embed_html
 
-def summarize_youtube_video(url,force_transcribe):
-    title,text = transcribe_youtube_video(url,force_transcribe)
-    Summary = summarize_text(title,text)
-    return Summary
+def summarize_youtube_video(url,force_transcribe,use_transcribe_api=False,api_token="",
+                            temperature=1,words=250,use_llm_api=False,do_sample=False):
+    title,text,transcript_source = transcribe_youtube_video(url,force_transcribe,use_transcribe_api,api_token)
+    summary, summary_source = summarize_text(title,text,temperature,words,use_llm_api,api_token,do_sample)
+    return summary, text, transcript_source, summary_source
 
 html = '<iframe width="100%" height="315" src="https://www.youtube.com/embed/" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>'
 
+def change_transcribe_api(vis):
+    return gr.Checkbox(value=False, visible=vis)
+
+def change_api_token(vis):
+    return gr.Textbox(visible=vis)
+
+def update_source(source):
+    return gr.Textbox(info=source)
+
+def show_temp(vis):
+    return gr.Slider(visible=vis)
+
+# Defining the structure of the UI
 with gr.Blocks() as demo:
-    # gr.Markdown("Transribe a YouTube video using this demo.")
     with gr.Row():
-        with gr.Column(scale=3):
+        gr.Markdown("# Summarize a YouTube Video")
+
+    with gr.Row():
+        with gr.Column(scale=4):
             url = gr.Textbox(label="Enter YouTube video URL here:",placeholder="https://www.youtube.com/watch?v=")
-            force_transcribe = gr.Checkbox(label="Transcribe even if transcription is available.")
         with gr.Column(scale=1):
-            gr.Markdown("# Summarize a YouTube video using this demo!",scale=2)
-            sum_btn = gr.Button("Summarize!",scale=1)
+            api_token = gr.Textbox(label="Paste your Hugging Face API token here:",placeholder="hf_...",visible=False,show_label=True,info='The API token passed via this field is not stored. It is only passed through the Hugging Face Hub API for inference.')
+        with gr.Column(scale=1):
+            sum_btn = gr.Button("Summarize!")
+
+    with gr.Accordion("Transcription Settings",open=False):
+        with gr.Row():
+            force_transcribe = gr.Checkbox(label="Transcribe even if transcription is available.", info='If unchecked, the app attempts to download the transcript from YouTube first. Check this if the transcript does not seem accurate.')
+            use_transcribe_api = gr.Checkbox(label="Transcribe using the HuggingFaceHub API.",visible=False)
+
+    with gr.Accordion("Summarization Settings",open=False):
+        with gr.Row():
+            use_llm_api = gr.Checkbox(label="Summarize using the HuggingFaceHub API.",visible=True)
+            do_sample = gr.Checkbox(label="Set the Temperature",value=True,visible=True)
+            temperature = gr.Slider(minimum=0,maximum=1,value=0.9,label="Generation temperature",visible=True)
+            words = gr.Slider(minimum=100,maximum=500,value=250,label="Length of the summary")
+
+    gr.Markdown("# Results")
+
     title = gr.Textbox(label="Video Title",placeholder="title...")
+
+    with gr.Row():
+        video = gr.HTML(html,scale=1)
+        summary_source = gr.Textbox(visible=False,scale=0)
+        summary = gr.Textbox(label="Summary",placeholder="summary...",scale=1)
+
     with gr.Row():
-        video = gr.HTML(html)
-        output = gr.Textbox(label="Summary",placeholder="summary...")
-    sum_btn.click(fn=get_youtube_title, inputs=url, outputs=title, api_name="get_youtube_title")
-    sum_btn.click(fn=summarize_youtube_video, inputs=[url,force_transcribe], outputs=output, api_name="summarize_youtube_video", queue=True)
-    sum_btn.click(fn=get_video, inputs=url, outputs=video, api_name="get_youtube_video",queue=False)
-
-    demo.queue()
-    demo.launch()
+        with gr.Group():
+            transcript_source = gr.Textbox(visible=False)
+            transcript = gr.Textbox(label="Full Transcript",placeholder="transcript...",show_label=True)
+
+    with gr.Accordion("Notes",open=False):
+        gr.Markdown("""
+        1. This app attempts to download the transcript from YouTube first. If the transcript is not available, or the prompts require, the video will be transcribed.\n
+        2. The app performs best on videos in which the number of speakers is limited or when the YouTube transcript includes annotations of the speakers.\n
+        3. The transcription does not annotate the speakers, which may degrade the quality of the summary if there is more than one speaker.\n
+        """)
+
+    # Defining the interactivity of the UI elements
+    force_transcribe.change(fn=change_transcribe_api,inputs=force_transcribe,outputs=use_transcribe_api)
+    use_transcribe_api.change(fn=change_api_token,inputs=use_transcribe_api,outputs=api_token)
+    use_llm_api.change(fn=change_api_token,inputs=use_llm_api,outputs=api_token)
+    transcript_source.change(fn=update_source,inputs=transcript_source,outputs=transcript)
+    summary_source.change(fn=update_source,inputs=summary_source,outputs=summary)
+    do_sample.change(fn=show_temp,inputs=do_sample,outputs=temperature)
+
+    # Defining the functions to call on clicking the button
+    sum_btn.click(fn=get_youtube_title, inputs=url, outputs=title, api_name="get_youtube_title", queue=False)
+    sum_btn.click(fn=summarize_youtube_video, inputs=[url,force_transcribe,use_transcribe_api,api_token,temperature,words,use_llm_api,do_sample],
+                  outputs=[summary,transcript, transcript_source, summary_source], api_name="summarize_youtube_video", queue=True)
+    sum_btn.click(fn=get_video, inputs=url, outputs=video, api_name="get_youtube_video", queue=False)
+
+demo.queue()
+demo.launch(share=False)
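On the UI side, the commit toggles widget visibility by returning freshly constructed components from `.change` handlers (`change_transcribe_api`, `change_api_token`, `show_temp`), e.g. hiding the API-token box until an API checkbox is ticked. A minimal sketch of the pattern, assuming a Gradio version in which a component instance returned from an event handler is applied as an update to the output component (older 3.x releases expressed this as `gr.update(visible=...)`):

```python
import gradio as gr

def toggle_token_box(use_api):
    # returning a component instance updates the existing output component;
    # only the properties set here (visibility) are changed
    return gr.Textbox(visible=use_api)

with gr.Blocks() as demo:
    use_api = gr.Checkbox(label="Use the Hugging Face Hub API")
    api_token = gr.Textbox(label="API token", placeholder="hf_...", visible=False)
    use_api.change(fn=toggle_token_box, inputs=use_api, outputs=api_token)

demo.launch()
```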
 