Spaces:

DuckyBlender
/

phi3-youtube-summarizer

Paused

App Files Files Community

DuckyBlender committed on Aug 14, 2024

Commit

766eb70

1 Parent(s): 257b9bf

done!

Browse files

Files changed (3) hide show

README.md +1 -1
main.py +150 -0
requirements.txt +8 -0

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 title: Phi3 Youtube Summarizer
-emoji: 😻
 colorFrom: blue
 colorTo: yellow
 sdk: gradio

 ---
 title: Phi3 Youtube Summarizer
+emoji: 📹
 colorFrom: blue
 colorTo: yellow
 sdk: gradio

main.py ADDED Viewed

	@@ -0,0 +1,150 @@

+import gradio as gr
+from youtube_transcript_api import YouTubeTranscriptApi
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
+import re
+import os
+import torch
+# import dotenv
+# dotenv.load_dotenv()
+import subprocess
+subprocess.run(
+    "pip install flash_attn --no-build-isolation --break-system-packages",
+    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+    shell=True,
+)
+# Uncomment and set your Hugging Face token if needed
+token = os.environ["HF_TOKEN"]
+# Configure 4-bit quantization for model loading
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
+)
+# Load the Phi-3 model and tokenizer
+print("Loading model and tokenizer...")
+model_id = "microsoft/Phi-3-mini-128k-instruct"
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="cuda",
+    quantization_config=bnb_config,
+    trust_remote_code=True,
+).eval()
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+# Define the system prompt and generation pipeline
+system_prompt = "Summarize this YouTube video. Give a brief summary of the video content with the key points and main takeaways."
+messages = [{"role": "system", "content": system_prompt}]
+pipe = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+)
+generation_args = {
+    "max_new_tokens": 32767,
+    "return_full_text": False,
+    "do_sample": True,
+    "temperature": 0.2,
+}
+# Function to extract the video ID from a YouTube URL
+def extract_video_id(url):
+    video_id_match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11}).*", url)
+    if video_id_match:
+        print(f"Extracted video ID: {video_id_match.group(1)}")
+        return video_id_match.group(1)
+    return None
+# Function to get the transcript of a YouTube video
+def get_transcript(video_id):
+    try:
+        transcript = YouTubeTranscriptApi.get_transcript(video_id)
+        transcription = [entry['text'] for entry in transcript]
+        print(f"Transcript: {transcription}")
+        return " ".join(transcription)
+    except Exception as e:
+        return f"Error fetching transcript: {str(e)}"
+# Function to summarize the text using the model
+def summarize_text(text):
+    messages.append({"role": "user", "content": text})
+    output = pipe(messages, **generation_args)
+    output = output[0]['generated_text'].strip() # type: ignore
+    print(f"Summary: {output}")
+    return output
+# Main function to process the video URL
+def process_video(url):
+    video_id = extract_video_id(url)
+    if not video_id:
+        print("Invalid YouTube URL")
+        return "Invalid YouTube URL"
+    transcript = get_transcript(video_id)
+    if transcript.startswith("Error"):
+        return transcript
+    summary = summarize_text(transcript)
+    return summary, transcript
+# Function to update the embedded video player
+def update_embed(url):
+    video_id = extract_video_id(url)
+    if video_id:
+        embed_url = f"https://www.youtube.com/embed/{video_id}"
+        return f"<div class='gradio-embed-container'><iframe class='gradio-embed' src='{embed_url}' frameborder='0' allowfullscreen></iframe></div>"
+    return "<div class='gradio-embed-container'><iframe class='gradio-embed' src='' frameborder='0' allowfullscreen></iframe></div>"
+# Gradio UI setup
+with gr.Blocks(css="""
+    .gradio-embed-container { position: relative; width: 100%; padding-bottom: 56.25%; height: 0; }
+    .gradio-embed { position: absolute; top: 0; left: 0; width: 100%; height: 100%; }
+    .small-font { font-size: 0.6em; }
+    """) as demo:
+    gr.Markdown("""
+    # YouTube Video Summarizer using Phi-3-mini-128k-instruct
+    Summarize any YouTube video using the Phi-3-mini-128k-instruct model.
+    """)
+    with gr.Row():
+        with gr.Column(scale=1):
+            url = gr.Textbox(
+                label="YouTube URL",
+                placeholder="https://www.youtube.com/watch?v=dQw4w9WgXcQ",
+                max_lines=1
+            )
+            summary = gr.Textbox(
+                label="Summary",
+                placeholder="Summary will appear here...",
+                lines=10,
+                show_label=True,
+                show_copy_button=True,
+                elem_classes="small-font"
+            )
+            transcript = gr.Textbox(
+                label="Transcript",
+                placeholder="Transcript will appear here...",
+                lines=1,
+                show_label=True,
+                show_copy_button=True,
+                elem_classes="small-font"
+            )
+            btn = gr.Button("Summarize")
+            btn.click(fn=process_video, inputs=url, outputs=[summary, transcript])
+        with gr.Column(scale=1):
+            video_embed = gr.HTML("<div class='gradio-embed-container'><iframe class='gradio-embed' src='' frameborder='0' allowfullscreen></iframe></div>")
+    url.change(fn=update_embed, inputs=url, outputs=video_embed)
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+gradio
+torch
+transformers
+youtube_transcript_api
+google-api-python-client
+bitsandbytes
+# python-dotenv
+accelerate