# Spaces: Running
import argparse | |
import datetime | |
import os | |
import gradio as gr | |
from signal import SIGINT, signal | |
from utils.log import debug, info, logger, breakPoint as bc | |
import requests | |
from constants import * | |
# Module-level defaults.
# NOTE: gradio_run() assigns same-named *local* variables (it has no
# `global` statement), so these values are not modified by a request.
CHUNK_SIZE: int = 512
VIDEO_ID: str = ""
# PPTX_DEST is not defined in this file; presumably it is provided by the
# `from constants import *` star import -- verify against constants.py.
OUT_PPT_NAME = PPTX_DEST
NO_IMAGES: bool = False
QUESTIONS: int = 5
def init_check():
    """Check that google-chrome, npm, npx and ffmpeg can be executed.

    Each tool is probed by running its version command through
    ``os.system``; a non-zero exit status is logged as critical. Nothing is
    raised and nothing is returned -- the caller is expected to read the
    log. If Chrome is missing and ``scripts/chrome-setup.sh`` exists, that
    script is run to try to install it.
    """
    # check for google-chrome
    if os.system("google-chrome --version") != 0:
        logger.critical("Google Chrome is not installed")
        if os.path.exists("scripts/chrome-setup.sh"):
            logger.info("Trying to install chrome..")
            os.system("bash scripts/chrome-setup.sh")

    # ffmpeg's version option is spelled with a single dash (`-version`);
    # `--version` is rejected as an unrecognized option and exits non-zero,
    # which made this check report ffmpeg as missing even when installed.
    version_probes = (
        ("npm --version", "npm"),
        ("npx --version", "npx"),
        ("ffmpeg -version", "ffmpeg"),
    )
    for command, tool in version_probes:
        if os.system(command) != 0:
            logger.critical(f"{tool} is not installed")

    logger.info("Init check done, look for errors above..")
def gradio_run(
        video_id, chunk_size: int,
        no_images: bool, no_chapters: bool, out_type="pdf"):
    """Summarize a video's subtitles into a slide deck and return its path.

    Downloads the video, summarizes its subtitles (in fixed-size chunks or
    grouped by chapter), writes the slides as Marp markdown to ``MD_DEST``
    and converts that file with ``generate_ppt``.

    Args:
        video_id: Identifier handed to ``video``/``subs``; also embedded in
            the output and image file names.
        chunk_size: Chunk size forwarded to the subtitle chunkers.
        no_images: When true, no per-slide frame images are extracted or
            added (honoured in both chunking modes).
        no_chapters: When true, chunk by size instead of by chapters.
        out_type: Extension of the generated file. Falls back to ``"pdf"``
            when empty or ``None`` (Gradio passes ``None`` for a dropdown
            with no selection).

    Returns:
        Absolute path of the generated file.

    Raises:
        gr.Error: If no subtitles are available for the video.
    """
    # do init check
    init_check()

    # All of these are function-local; the module-level names with the
    # upper-case spelling are intentionally left untouched.
    chunk_size = int(chunk_size)
    out_type = out_type or "pdf"
    out_ppt_name = f"{OUTDIR}/gradio-out{video_id}.{out_type}"

    # Heavy modules are imported lazily so the UI can start without them.
    info("Loading modules..")
    from langchain.chains.summarize import load_summarize_chain
    from langchain.docstore.document import Document
    from rich.progress import track

    import utils.markdown as md
    from models.lamini import lamini as model
    from utils.marp_wrapper import marp
    from utils.ppt import generate_ppt
    from utils.subtitles import subs
    from utils.video import video
    from utils.chunk import ChunkByChapters

    # initialize marp
    out = marp(MD_DEST)
    out.add_header(config=MARP_GAIA)

    # initialize video
    vid = video(video_id, f"{OUTDIR}/vid-{video_id}")
    vid.download()

    # initialize model
    llm = model.load_model(
        max_length=400,
        temperature=0,
        top_p=0.95,
        repetition_penalty=1.15
    )

    # slice subtitle and chunk them
    # to chunk_size based on chapters
    info(f"Getting subtitles {video_id}..")
    raw_subs = vid.getSubtitles()
    if raw_subs is None:
        logger.critical("No subtitles found, exiting..")
        # Raise instead of exit(): this runs inside a Gradio request
        # handler, and gr.Error surfaces the message in the UI.
        raise gr.Error("No subtitles found for this video.")

    info(f"got {len(raw_subs)} length subtitles")

    if no_chapters:
        chunks = subs(video_id).getSubsList(size=chunk_size)
        templates = model.templates()
        summarizer = templates.summarize
        title_gen = templates.generate_title

        # title Photo
        first_pic = str(datetime.timedelta(seconds=chunks[0][1]))
        img_name = f"vid-{video_id}_{first_pic}.png"
        img_path = f"{PNG_DEST}/{img_name}"
        vid.getframe(first_pic, img_path)
        out.add_page(md.h1(video_id), md.image(url=img_name))
        out.marp_end()

        total = len(chunks)  # full chunk length
        progress = track(chunks, description="(processing chunks) Summarizing..")
        for done, chunk in enumerate(progress, start=1):
            logger.info(f"{done}/{total} - {(done/total)*100:.2f}% - PROCESSING CHUNKS.")
            # Put every "-" on its own line so the summary renders as bullets.
            summary = summarizer(chunk[0])[0]["generated_text"].replace("-", "\n-")
            title = title_gen(chunk[0])[0]["generated_text"]

            # Long titles get a smaller heading.
            heading = md.h2 if len(title) < 40 else md.h3
            out.add_page(heading(title), summary)

            # Skip the image on text-heavy slides to avoid clipping.
            if not no_images and len(summary + title) < 270:
                timestamp = str(datetime.timedelta(seconds=chunk[1]))
                frame_name = f"vid-{video_id}_{timestamp}.png"
                frame_path = f"{PNG_DEST}/{frame_name}"
                vid.getframe(timestamp, frame_path)
                out.add_body(md.image(frame_name, align="left", setAsBackground=True))

            out.marp_end()
    else:
        raw_chapters = vid.getChapters(f"{YT_CHAPTER_ENDPOINT}{video_id}")
        chunk_dict = ChunkByChapters(raw_chapters, raw_subs, chunk_size)
        chain = load_summarize_chain(llm, chain_type="stuff")
        # TODO: ( use refine chain type to summarize all chapters )
        for title, subchunks in track(chunk_dict.items(), description="(processing chunks) Summarizing.."):
            # Typecast subchunks to Document for every topic,
            # get the summary for every topic with the stuff/refine chain
            # and add it to the final summary.
            debug(subchunks)
            docs = [Document(page_content=t[0]) for t in subchunks[0]]
            summary = chain.run(docs)

            # Respect no_images here too (it was previously ignored in this
            # branch) and skip images on long slides to avoid clipping.
            if not no_images and len(summary + title) < 270:
                # NOTE(review): subchunks[0][1][0] indexes the same sequence
                # whose items' [0] is used as page text above -- confirm the
                # layout returned by ChunkByChapters.
                ts = str(datetime.timedelta(seconds=subchunks[0][1][0]))
                img_path = f"{PNG_DEST}/vid-{video_id}_{ts}.png"
                vid.getframe(ts, img_path)
                if os.path.exists(img_path):
                    out.add_body(md.image(
                        img_path.replace(f"{OUTEXTRA}/", ""),
                        align="left",
                        setAsBackground=True
                    ))

            out.add_page(md.h2(title), summary)
            out.marp_end()

    info(f"Generating {out_ppt_name}..")
    out.close_file()
    generate_ppt(MD_DEST, out_ppt_name)
    print(f"Done! {out_ppt_name}")

    return os.path.abspath(out_ppt_name)
def gradio_Interface():
    """Run the environment check, then build and launch the Gradio UI.

    The interface wraps ``gradio_run``; its inputs map positionally onto
    ``gradio_run``'s parameters (video id, chunk size, no-images flag,
    no-chapters flag, output format) and its output is the generated file.
    """
    init_check()
    app = gr.Interface(
        fn=gradio_run,
        inputs=[
            "text",
            # The third positional argument of gr.Slider is the initial
            # value, so the old `gr.Slider(1, 2000, 1, ...)` defaulted the
            # chunk size to 1. Start from the module default and step by 1
            # so whole numbers are produced.
            gr.Slider(
                1, 2000, value=CHUNK_SIZE, step=1,
                label="Chunk Size",
                info="More chunk size = longer text & shorter number of slides"),
            gr.Checkbox(label="No Images", info="Don't keep images in output ( gives more spaces for larger text)"),
            gr.Checkbox(label="No Chapters", info="Don't use chapter based chunking"),
            # Preselect "pdf" (gradio_run's own default); without a value an
            # untouched dropdown hands None to gradio_run.
            gr.Dropdown(
                ["pptx", "pdf", "html"], value="pdf",
                label="file format",
                info="which file format to generate.")
        ],
        outputs="file"
    )
    app.launch()
if __name__ == "__main__":
    logger.info("Starting gradio interface..")
    # Create both output directories before the UI is launched.
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pairs: it has no
    # check-then-create race and also creates missing parent directories.
    for directory in (OUTDIR, OUTEXTRA):
        os.makedirs(directory, exist_ok=True)
    gradio_Interface()