# app.py — YouTube video summarizer (Gradio demo)
# Hugging Face Space by smakamali — commit b313c5d (8.53 kB)
def transcribe_youtube_video(url, force_transcribe=False):
    """Return (title, transcript) for the YouTube video at *url*.

    First tries the video's existing captions via youtube_transcript_api
    (fast, no GPU needed). If no captions are available — or
    *force_transcribe* is True — the audio is downloaded with pytube and
    transcribed locally with OpenAI Whisper.

    Args:
        url: Full YouTube video URL.
        force_transcribe: When True, run Whisper even if captions exist.

    Returns:
        Tuple of (video title, transcript text).
    """
    text = ''
    yt = None
    # Best-effort caption fetch: any failure (no captions, bad URL,
    # network error) just falls through to the Whisper branch below.
    try:
        from youtube_transcript_api import YouTubeTranscriptApi
        import pytube
        from pytube import YouTube
        vid_id = pytube.extract.video_id(url)
        for segment in YouTubeTranscriptApi.get_transcript(vid_id):
            text += segment['text'] + ' '
        yt = YouTube(str(url))
    except Exception:
        pass
    if text == '' or force_transcribe:
        from pytube import YouTube
        import torch
        import os
        save_dir = "./docs/youtube/"
        # makedirs(exist_ok=True) instead of mkdir: creates the missing
        # "./docs" parent and does not crash on repeat invocations.
        os.makedirs(save_dir, exist_ok=True)
        yt = YouTube(str(url))
        audio = yt.streams.filter(only_audio=True).first()
        out_file = audio.download(filename="audio.mp3", output_path=save_dir)
        import transformers
        whisper_asr = transformers.pipeline(
            "automatic-speech-recognition", model="openai/whisper-large", device_map='auto',
        )
        # Force English transcription regardless of Whisper's own
        # language detection.
        whisper_asr.model.config.forced_decoder_ids = (
            whisper_asr.tokenizer.get_decoder_prompt_ids(
                language="en",
                task="transcribe"
            )
        )
        temp = whisper_asr(out_file, chunk_length_s=20)
        text = temp['text']
        # Free GPU memory before the summarization model is loaded.
        del whisper_asr
        torch.cuda.empty_cache()
    if yt is None:
        # Captions succeeded but YouTube(url) raised inside the try block:
        # retry here instead of raising NameError on `yt.title` below.
        from pytube import YouTube
        yt = YouTube(str(url))
    return yt.title, text
def summarize_text(title, text):
    """Summarize a video transcript with a map-reduce LLM chain.

    Loads Falcon-7B-Instruct in 4-bit NF4 quantization, splits the
    transcript into 500-token chunks, summarizes each chunk (map), then
    consolidates the partial summaries (reduce) into a final summary with
    a video subject and key highlights.

    Args:
        title: Video title, interpolated into the prompts.
        text: Full transcript text to summarize.

    Returns:
        The generated summary string.
    """
    from langchain.chains.llm import LLMChain
    from langchain.prompts import PromptTemplate
    from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
    from langchain.chains.combine_documents.stuff import StuffDocumentsChain
    from langchain import HuggingFacePipeline
    from langchain.document_loaders import TextLoader
    from langchain.text_splitter import TokenTextSplitter
    import os
    import torch
    import transformers
    from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM

    # 4-bit NF4 quantization so the 7B model fits in limited GPU memory.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )
    # model_id (not `model`) so the checkpoint name is not shadowed by the
    # loaded model object below.
    # model_id = "nomic-ai/gpt4all-falcon"
    model_id = "tiiuae/falcon-7b-instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization_config,
    )

    # Short-output pipeline for the map / collapse steps.
    pipeline = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        max_new_tokens=150,
        pad_token_id=tokenizer.eos_token_id,
    )
    llm = HuggingFacePipeline(pipeline=pipeline)

    # Longer-output pipeline (with repetition penalty) for the final summary.
    pipeline2 = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        max_new_tokens=250,
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=2.0,
    )
    llm2 = HuggingFacePipeline(pipeline=pipeline2)

    # Map: summarize each transcript chunk independently.
    map_template = """
Summarize the following text in a clear and concise way:
TITLE: `{title}`
TEXT:`{docs}`
Brief Summary:
"""
    map_prompt = PromptTemplate(template=map_template,
                                input_variables=['title', 'docs'])
    map_chain = LLMChain(llm=llm, prompt=map_prompt)

    # Reduce / collapse: merge partial summaries when they exceed token_max.
    reduce_template = """
The following is set of partial summaries of a video titled {title}:
partial summaries: {doc_summaries}
Take these and distill them into a consolidated summary.
Summary:
"""
    reduce_prompt = PromptTemplate(template=reduce_template,
                                   input_variables=['title', 'doc_summaries'])
    reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)
    # Takes a list of documents, combines them into a single string, and
    # passes this to an LLMChain.
    collapse_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="doc_summaries"
    )

    # Final reduce: produce the user-facing summary.
    # ("bullet points" — fixed a "pullet" typo in the original prompt.)
    final_reduce_template = """
The following is set of partial summaries of a video titled '{title}':
partial summaries:
{doc_summaries}
Generate a summary of the whole text that includes `Video Subject`, and the `Key Highlights` as maximum 10 bullet points listing the main facts, arguments, or points:
"""
    final_reduce_prompt = PromptTemplate(template=final_reduce_template,
                                         input_variables=['title', 'doc_summaries'])
    final_reduce_chain = LLMChain(llm=llm2, prompt=final_reduce_prompt)
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=final_reduce_chain, document_variable_name="doc_summaries"
    )

    # Combines and iteratively reduces the mapped documents.
    reduce_documents_chain = ReduceDocumentsChain(
        # Final chain that is called.
        combine_documents_chain=combine_documents_chain,
        # Used when documents exceed the context for `StuffDocumentsChain`.
        collapse_documents_chain=collapse_documents_chain,
        # Maximum number of tokens to group documents into.
        token_max=500,
    )
    # Map a chain over the documents, then combine the results.
    map_reduce_chain = MapReduceDocumentsChain(
        llm_chain=map_chain,
        reduce_documents_chain=reduce_documents_chain,
        # Variable name in the llm_chain to put the documents in.
        document_variable_name="docs",
        return_intermediate_steps=False,
    )

    # Round-trip the transcript through a file so TextLoader produces
    # langchain Document objects for the splitter.
    os.makedirs('./docs', exist_ok=True)  # fix: directory may not exist yet
    with open('./docs/transcript.txt', 'w') as f:
        f.write(text)
    loader = TextLoader("./docs/transcript.txt")
    doc = loader.load()
    text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=0)
    docs = text_splitter.split_documents(doc)
    summary = map_reduce_chain.run({'input_documents': docs, 'title': title})

    # Release GPU memory held by the model before returning.
    del llm, llm2, model, tokenizer
    torch.cuda.empty_cache()
    return summary
import gradio as gr
import pytube
from pytube import YouTube
def get_youtube_title(url):
    """Fetch and return the title of the YouTube video at *url*."""
    video = YouTube(str(url))
    return video.title
def get_video(url):
    """Return an HTML iframe that embeds the YouTube video at *url*."""
    video_id = pytube.extract.video_id(url)
    return f'<iframe width="100%" height="315" src="https://www.youtube.com/embed/{video_id}" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>'
def summarize_youtube_video(url, force_transcribe):
    """Transcribe the video at *url* and return a generated summary."""
    video_title, transcript = transcribe_youtube_video(url, force_transcribe)
    return summarize_text(video_title, transcript)
# Placeholder embed (no video id) shown before the first summarization.
html = '<iframe width="100%" height="315" src="https://www.youtube.com/embed/" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>'

# Gradio UI: URL input and force-transcribe toggle on the left, heading and
# button on the right; title, embedded video, and summary shown below.
with gr.Blocks() as demo:
    # gr.Markdown("Transribe a YouTube video using this demo.")
    with gr.Row():
        with gr.Column(scale=3):
            url = gr.Textbox(label="Enter YouTube video URL here:",placeholder="https://www.youtube.com/watch?v=")
            force_transcribe = gr.Checkbox(label="Transcribe even if transcription is available.")
        with gr.Column(scale=1):
            gr.Markdown("# Summarize a YouTube video using this demo!",scale=2)
            sum_btn = gr.Button("Summarize!",scale=1)
    title = gr.Textbox(label="Video Title",placeholder="title...")
    with gr.Row():
        video = gr.HTML(html)
        output = gr.Textbox(label="Summary",placeholder="summary...")
    # One click fans out to three handlers: title fetch, the slow
    # summarization (queued), and the instant video embed (unqueued).
    sum_btn.click(fn=get_youtube_title, inputs=url, outputs=title, api_name="get_youtube_title")
    sum_btn.click(fn=summarize_youtube_video, inputs=[url,force_transcribe], outputs=output, api_name="summarize_youtube_video", queue=True)
    sum_btn.click(fn=get_video, inputs=url, outputs=video, api_name="get_youtube_video",queue=False)
demo.queue()
demo.launch(share=True)