AmithAdiraju1694 committed on
Commit 533891e · verified · 1 Parent(s): 2eeab11

Upload 10 files

Files changed (9)
  1. __init__.py +1 -0
  2. base_frame_medoid.npz +3 -0
  3. model_inference.py +168 -0
  4. packages.txt +3 -0
  5. pages.py +107 -0
  6. preprocessing.py +109 -0
  7. requirements.txt +38 -0
  8. runtime.txt +1 -0
  9. utils.py +143 -0
__init__.py ADDED
@@ -0,0 +1 @@
+RuntimeError: cannot reshape tensor of 0 elements into shape [1, 0, 8, -1] because the unspecified dimension size -1 can be any value and is ambiguous
base_frame_medoid.npz ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a14d1f517dcc8296c0f67402f44ffb071ed751749c27f4641f84e20f4e99ff1
+size 717
model_inference.py ADDED
@@ -0,0 +1,168 @@
+from transformers import pipeline
+import torch
+from PIL import Image
+
+import torch.nn as nn
+import torchvision.models as models
+import torch.nn.functional as F
+
+from utils import prompt_frame_summarization, assistant_role, prompt_audio_summarization
+import streamlit as st
+from utils import timer
+
+import numpy as np
+import whisper
+from utils import batch_generator, cosine_sim
+
+
+class SiameseNetwork(nn.Module):
+    def __init__(self, model_name="vit_b_16"):
+        super(SiameseNetwork, self).__init__()
+
+        self.encoder = models.vit_b_16(weights="IMAGENET1K_V1")  # Pretrained ViT
+        self.encoder.heads = nn.Identity()  # Remove classification head
+
+        self.fc = nn.Linear(768, 128)  # Reduce to 128-d embedding
+
+    def forward(self, video_frames1, video_frames2):
+        """
+        video_frames1: (B, num_frames, H, W, C)  # Batch of videos (num_frames frames each)
+        video_frames2: (B, num_frames, H, W, C)
+        """
+        B, num_frames, H, W, C = video_frames1.shape  # (Batch, num_frames, Height, Width, Channels)
+
+        # Flatten frames into the batch dimension for ViT: (B*num_frames, C, H, W)
+        video_frames1 = video_frames1.permute(0, 1, 4, 2, 3).reshape(B * num_frames, C, H, W)
+        video_frames2 = video_frames2.permute(0, 1, 4, 2, 3).reshape(B * num_frames, C, H, W)
+
+        # Extract frame-level embeddings
+        emb1 = self.encoder(video_frames1)  # (B*num_frames, 768)
+        emb2 = self.encoder(video_frames2)
+
+        # Reshape back to (B, num_frames, 768) and average over frames
+        # TODO: Change this to use an LSTM instead of averaging
+        emb1 = emb1.reshape(B, num_frames, -1).mean(dim=1)  # (B, 768)
+        emb2 = emb2.reshape(B, num_frames, -1).mean(dim=1)
+
+        # Pass through fully connected layer
+        emb1 = self.fc(emb1)  # (B, 128)
+        emb2 = self.fc(emb2)
+
+        return emb1, emb2
+
+    def inference(self, video_frames):
+        """
+        video_frames: (B, num_frames, H, W, C)
+        """
+        B, num_frames, H, W, C = video_frames.shape
+
+        video_frames = video_frames.permute(0, 1, 4, 2, 3).reshape(B * num_frames, C, H, W)
+        emb = self.encoder(video_frames)
+        emb = emb.reshape(B, num_frames, -1).mean(dim=1)
+        emb = self.fc(emb)
+
+        return emb
+
+
+@timer
+def summarize_from_audio(audio_tensor):
+
+    # Transcribe the in-memory audio
+    result = audio_transcriber_model.transcribe(audio_tensor)
+    all_transcription_segments = result["text"]
+
+    summary = text_summarizer(prompt_audio_summarization + all_transcription_segments,
+                              max_length=108,
+                              min_length=36, do_sample=False)[0]['summary_text']
+
+    return summary
+
+
+def get_important_frames_ML(frame):
+    """
+    Placeholder: classifies frames using a second ML model.
+    """
+    # Implement the model's logic here
+    # ...
+    return None
+
+
+def Vit_Summarize_Video(video_frames):
+    """
+    Placeholder: summarizes video frames into a text sentence.
+    The processor, model and tokenizer below still need to be loaded.
+    """
+
+    processor = None
+    messages = None
+    model = None
+    tokenizer = None
+
+    if video_frames is None or len(video_frames) == 0:
+        return "Error: No video frames available."
+
+    # Ensure frames are properly formatted
+    video_frames = [Image.fromarray(frame.astype("uint8")) for frame in video_frames]
+
+    # Ensure correct format for processor
+    inputs = processor(messages, images=None, videos=[video_frames])
+
+    inputs.update({
+        "tokenizer": tokenizer,
+        "max_new_tokens": 54,
+        "decode_text": True,
+    })
+
+    summary_text = model.generate(**inputs)
+
+    return summary_text
+
+
+@timer
+def rate_video_frames(video_frames):
+    """
+    Rates video frames as family-friendly or not, based on their similarity
+    to a reference (base) frame embedding.
+    """
+
+    tensor = torch.tensor(
+        np.array(video_frames),
+        dtype=torch.float32
+    ).reshape(len(video_frames) // 5, 5, 224, 224, 3)  # e.g. (20, 5, 224, 224, 3)
+    video_frame_emb = video_rating_model.inference(tensor)  # e.g. (20, 128)
+
+    overall_sim, count_upg = cosine_sim(emb1=base_frame_emb,
+                                        emb2=video_frame_emb,
+                                        threshold=0.4
+                                        )
+
+    if count_upg / (len(video_frames) // 5) > 0.5:
+        return f"Out of {len(video_frames)} important moments of this video, {count_upg*5} moments contain content rated PG or below. Hence this video is suitable for kids & family."
+    else:
+        return f"Out of {len(video_frames)} important moments of this video, {(len(video_frames)//5 - count_upg)*5} moments contain at least PG-13 content. Hence parental guidance is strongly suggested for this video."
+
+
+@st.cache_resource
+def load_models():
+
+    transcriber = whisper.load_model("base")
+    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+
+    base_frame_emb = torch.tensor(
+        np.load('base_frame_medoid.npz')['arr'],
+        dtype=torch.float32
+    )
+
+    video_rating_model = SiameseNetwork()
+    # video_rating_model.load_state_dict(
+    #     torch.load('/Users/amithadiraju/Desktop/Video_Summary_App/video_contrastive-siamese_v3.pt',
+    #                weights_only=True
+    #                )
+    #     )
+    video_rating_model.eval()
+
+    return (
+        transcriber, summarizer, video_rating_model, base_frame_emb
+    )
+
+
+audio_transcriber_model, text_summarizer, video_rating_model, base_frame_emb = load_models()
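
For context, a minimal shape check for SiameseNetwork.inference (not part of the commit): it expects channels-last clips of shape (B, num_frames, H, W, C), which is exactly how rate_video_frames regroups its flat frame list into clips of 5. Random pixels stand in for real frames here, and note that importing model_inference also runs load_models() at module level, so Whisper and BART are fetched on first import.

import torch
from model_inference import SiameseNetwork  # importing this module also triggers load_models()

model = SiameseNetwork()
model.eval()

with torch.no_grad():
    # 4 clips of 5 frames each, 224x224 RGB, channels-last as inference() expects
    clips = torch.rand(4, 5, 224, 224, 3)
    emb = model.inference(clips)

print(emb.shape)  # torch.Size([4, 128]) -- one 128-d embedding per clip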
packages.txt ADDED
@@ -0,0 +1,3 @@
+ffmpeg
+libsndfile1
+pkgconfig
pages.py ADDED
@@ -0,0 +1,107 @@
+import streamlit as st
+from streamlit import session_state as sst
+import time
+
+import pandas as pd
+from utils import navigate_to
+
+from model_inference import rate_video_frames, summarize_from_audio
+from utils import read_important_frames, extract_audio
+import numpy as np
+
+
+# Define size limits (adjust based on your system)
+SMALL_VIDEO_LIMIT_MB = 35  # Files ≤ 35MB are small
+LARGE_VIDEO_LIMIT_MB = 50  # Max file upload allowed
+
+# Convert MB to bytes
+SMALL_VIDEO_LIMIT_BYTES = SMALL_VIDEO_LIMIT_MB * 1024 * 1024
+LARGE_VIDEO_LIMIT_BYTES = LARGE_VIDEO_LIMIT_MB * 1024 * 1024
+
+
+async def landing_page():
+
+    uploaded_file = st.file_uploader("Upload a video",
+                                     type=["mp4", "avi", "mov"])
+
+    if uploaded_file is not None:
+        file_size = uploaded_file.size  # File size in bytes
+
+        # Restrict max file upload size
+        if file_size > LARGE_VIDEO_LIMIT_BYTES:
+            st.error(f"File is too large! Max allowed size is {LARGE_VIDEO_LIMIT_MB}MB. Please upload a smaller version of it.")
+
+        else:
+            # Bytes object that can be decoded into audio or video
+            video_bytes = uploaded_file.read()
+
+            with st.spinner("Getting the most important moments from your video."):
+                important_frames = read_important_frames(video_bytes, 100)
+                st.success("Got important moments.")
+
+            print(f"Time taken to extract {len(important_frames)} important frames: {read_important_frames.total_time}")
+
+            with st.spinner("Getting the audio transcript from your video for the summary."):
+                audio_transcript_bytes = extract_audio(video_bytes)
+                st.success("Got audio transcript.")
+
+            print(f"Time taken to extract audio data: {extract_audio.total_time}")
+
+            # Add important frames to session state and redirect to the model inference page
+            sst["important_frames"] = important_frames
+
+            # Add audio transcript to session state
+            sst["audio_transcript"] = audio_transcript_bytes
+
+            st.button("Summarize & Analyze Video",
+                      on_click=navigate_to,
+                      args=("model_inference_page",)
+                      )
+
+
+async def model_inference_page():
+
+    df = pd.DataFrame(columns=['Video_Text_Summary', 'Video_Rating_Scale'])
+    sl_df = st.table(df)
+
+    # Fallback messages in case session state is missing the audio or the frames
+    video_summary_text = "Sorry, we couldn't find any audio data from your video, hence couldn't generate any summary."
+    video_rating_scale = "Sorry, we couldn't find any images from your video, hence couldn't generate any rating."
+
+    # Check that audio is present and non-empty
+    if "audio_transcript" in sst:
+
+        video_summary_text = summarize_from_audio(sst["audio_transcript"])
+
+        if len(video_summary_text) == 0:
+            video_summary_text = "Sorry, we couldn't find any audio data from your video, hence couldn't generate any summary."
+
+        print("Time taken to generate text summary from audio in seconds: ", summarize_from_audio.total_time)
+
+    # Check that frames are present and non-empty
+    if "important_frames" in sst:
+
+        important_frames = sst["important_frames"]
+        with st.spinner("Rating the content of your video."):
+            video_rating_scale = rate_video_frames(important_frames)
+
+        if len(video_rating_scale) == 0:
+            video_rating_scale = "Sorry, we couldn't find any images from your video, hence couldn't generate any rating."
+
+        print("Time taken to generate video rating in seconds: ", rate_video_frames.total_time)
+
+    sl_df.add_rows(
+        [(video_summary_text, video_rating_scale)]
+    )
+
+    st.button("Go Home",
+              on_click=navigate_to,
+              args=("landing_page",)
+              )
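
The app's entry script is not part of this diff, so the following is only a hypothetical sketch of how the async pages and utils.navigate_to (which stores the target page name in session state) might be wired together; the actual entry point may differ.

# app.py (hypothetical, not in this commit)
import asyncio
from streamlit import session_state as sst

from pages import landing_page, model_inference_page

PAGES = {
    "landing_page": landing_page,
    "model_inference_page": model_inference_page,
}

if "page" not in sst:
    sst["page"] = "landing_page"  # default page; navigate_to() overwrites this on button clicks

# Each Streamlit rerun renders whichever page is currently stored in session state.
asyncio.run(PAGES[sst["page"]]())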
preprocessing.py ADDED
@@ -0,0 +1,109 @@
+import numpy as np
+
+def pad_to_center(img, img_height, img_width, req_height, req_width):
+    """
+    Pads the original image up to the required height and width, keeping the
+    original content centered.
+
+    Parameters:
+        img -> 3D numpy array of shape Height x Width x Channels
+        img_height -> height of the current image
+        img_width -> width of the current image
+        req_height -> height the image should be padded to.
+        req_width -> width the image should be padded to.
+    """
+
+    # How many rows and columns need to be added to bring the current image up to
+    # the required size (clamped at 0 if a dimension is already large enough)
+    rem_height = max(req_height - img_height, 0)
+    rem_width = max(req_width - img_width, 0)
+
+    # Split the remaining height evenly between the top and bottom
+    pad_top = rem_height // 2
+    pad_bottom = rem_height - pad_top
+
+    # Split the remaining width evenly between the left and right
+    pad_left = rem_width // 2
+    pad_right = rem_width - pad_left
+
+    # Don't pad along the channels dimension; pad height and width to the required size.
+    # The tuples specify how many values to add on each of the 4 sides of the image.
+    return np.pad(
+        img,
+        (
+            (pad_top, pad_bottom),
+            (pad_left, pad_right),
+            (0, 0)
+        ),
+        mode='reflect'
+    )
+
+
+def crop_to_center(img, img_height, img_width, req_height, req_width, req_channel=3):
+    """
+    Reduces the original image size to the required height and width,
+    trimming only the edges and keeping the middle part of the image.
+
+    Parameters:
+        img -> 3D numpy array of shape Height x Width x Channels
+    """
+
+    # Difference in height and width of the image, divided into equal halves
+    toph = (img_height - req_height) // 2
+    leftw = (img_width - req_width) // 2
+
+    # Bottom boundary of the centered crop
+    bothen = toph + req_height
+
+    # Right boundary of the centered crop
+    rightwen = leftw + req_width
+
+    cropped_image = img[toph:bothen, leftw:rightwen, :]
+
+    assert cropped_image.shape == (req_height, req_width, req_channel)
+    return cropped_image
+
+
+def preprocess_images(img, req_height, req_width):
+    """
+    Pads or crops the input image array to the specified height and width,
+    centered around the middle.
+
+    Args:
+        img (np.ndarray): The image to resize, represented as a NumPy array
+            (height, width, channels).
+        req_height (int): The desired height of the output image.
+        req_width (int): The desired width of the output image.
+
+    Returns:
+        np.ndarray: The center-padded or center-cropped image.
+    """
+
+    image_shape_tuple = img.shape
+    assert len(image_shape_tuple) == 3, f"Please pass a 3D image with height, width and channels; you passed: {image_shape_tuple}"
+
+    # Assuming the layout is H, W, C
+    img_height, img_width, img_channel = image_shape_tuple
+
+    # If the original height is less than req_height or the original width is less than
+    # req_width, pad the image up to the required dimensions and return it
+    if img_height < req_height or img_width < req_width:
+        return pad_to_center(img, img_height, img_width, req_height, req_width)
+
+    # If the image size already matches the required size, there is nothing to crop
+    elif img_height == req_height and img_width == req_width:
+        return img
+
+    # The image is larger than the required height and width, so crop it around the center
+    else:
+        return crop_to_center(img,
+                              img_height,
+                              img_width,
+                              req_height,
+                              req_width,
+                              img_channel
+                              )
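
An illustrative check (not part of the commit): preprocess_images reflect-pads inputs smaller than the target and center-crops larger ones, always returning the requested spatial size, which is how the pipeline normalizes frames to 224x224.

import numpy as np
from preprocessing import preprocess_images

small = np.random.rand(200, 160, 3)   # smaller than 224x224 in both dimensions -> reflect-padded
large = np.random.rand(480, 640, 3)   # larger than 224x224 in both dimensions -> center-cropped

print(preprocess_images(small, 224, 224).shape)  # (224, 224, 3)
print(preprocess_images(large, 224, 224).shape)  # (224, 224, 3)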
requirements.txt ADDED
@@ -0,0 +1,38 @@
+aiohappyeyeballs==2.4.0
+aiohttp==3.10.5
+aiosignal==1.3.1
+async-timeout==4.0.3
+av==14.1.0
+ctranslate2==4.5.0
+ffmpeg-python==0.2.0
+httpcore==1.0.7
+huggingface-hub==0.24.6
+Jinja2==3.1.5
+networkx==3.2.1
+nltk==3.9.1
+num2words==0.5.14
+numba==0.60.0
+numpy==1.26.3
+openai-whisper==20240930
+opencv-python==4.11.0.86
+peft==0.12.0
+pillow==10.4.0
+protobuf==5.29.3
+pydantic==1.10.21
+PyYAML==6.0.2
+safetensors==0.4.5
+scipy==1.13.1
+sentencepiece==0.2.0
+smmap==5.0.2
+sniffio==1.3.1
+soundfile==0.13.1
+sseclient-py==1.8.0
+streamlit==1.42.0
+tiktoken==0.8.0
+tokenizers==0.21.0
+torch==2.6.0
+torchaudio==2.6.0
+torchvision==0.21.0
+transformers==4.48.3
+typeguard==4.4.1
+typing_extensions==4.12.2
runtime.txt ADDED
@@ -0,0 +1 @@
+3.9.6
utils.py ADDED
@@ -0,0 +1,143 @@
+from streamlit import session_state as sst
+import time
+import torch.nn.functional as F
+
+import cv2
+import av
+import heapq
+
+import numpy as np
+from preprocessing import preprocess_images
+
+import io
+from io import BytesIO
+import torch
+import soundfile as sf
+import subprocess
+from typing import List
+
+
+prompt_frame_summarization = "These are important frames of a video file. Please generate a summary such that the end user gets the gist of what the video is about."
+prompt_audio_summarization = "This is a video transcript, tell me what this is about: "
+assistant_role = "You are an agent who summarizes videos from important frames; use domain-specific language to generate the summary: sports, cartoon, education, finance, etc."
+
+def timer(func):
+    """Decorator that prints each call's duration and accumulates it in `wrapper.total_time`."""
+    def wrapper(*args, **kwargs):
+        start = time.time()
+        result = func(*args, **kwargs)
+        duration = time.time() - start
+        wrapper.total_time += duration
+        print(f"Execution time of {func.__name__}: {duration}")
+        return result
+
+    wrapper.total_time = 0
+    return wrapper
+
+def navigate_to(page: str) -> None:
+    """
+    Sets the current page in Streamlit's session state; a helper for
+    simulating navigation in Streamlit.
+
+    Parameters:
+        page: str, required.
+
+    Returns:
+        None
+    """
+
+    sst["page"] = page
+
+@timer
+def read_important_frames(video_bytes, top_k_frames) -> List:
+
+    # Read the uploaded video in memory
+    video_io = io.BytesIO(video_bytes)
+
+    # Open the uploaded video frames
+    container = av.open(video_io, format='mp4')
+
+    prev_frame = None; important_frames = []
+
+    # For each frame, compute a movement score and push it onto a heap that keeps the top_k movement frames
+    for frameId, frame in enumerate(container.decode(video=0)):  # Decode all frames
+
+        img = frame.to_ndarray(format="bgr24")  # Convert frame to NumPy array (BGR format)
+        assert len(img.shape) == 3, f"Expected a 3D frame; instead it is: {img.shape}"
+
+        if prev_frame is not None:
+
+            # Compute frame difference in gray scale for efficiency
+            diff = cv2.absdiff(prev_frame, img)
+            gray_diff = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)
+
+            movement_score = np.sum(gray_diff)  # Sum of pixel differences
+            processed_frame = preprocess_images(frame.to_ndarray(format="rgb24"),
+                                                224,
+                                                224
+                                                )
+
+            # Keep only the top_k_frames frames with the highest movement scores (min-heap)
+            if len(important_frames) < top_k_frames:
+                heapq.heappush(important_frames,
+                               (movement_score, frameId, processed_frame)
+                               )
+            else:
+                heapq.heappushpop(important_frames,
+                                  (movement_score, frameId, processed_frame)
+                                  )
+
+        prev_frame = img  # Update previous frame
+
+    # Sort the retained top_k frames back into chronological order of appearance
+    important_frames = [item[2] for item in sorted(important_frames, key=lambda x: x[1])]
+    return important_frames
+
+@timer
+def extract_audio(video_bytes):
+    """Extracts raw audio from a video file given as bytes, without writing temp files."""
+
+    # Run FFmpeg to extract raw WAV audio without writing a file
+    process = subprocess.run(
+        ["ffmpeg", "-i", "pipe:0", "-ac", "1", "-ar", "16000", "-c:a", "pcm_s16le", "-f", "wav", "pipe:1"],
+        input=video_bytes,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.DEVNULL
+    )
+
+    # Convert FFmpeg output to a BytesIO stream
+    audio_stream = BytesIO(process.stdout)
+
+    # Read the audio stream into a NumPy array
+    audio_array, sample_rate = sf.read(audio_stream, dtype="float32")
+
+    # Convert to a PyTorch tensor (Whisper accepts a torch.Tensor)
+    audio_tensor = torch.tensor(audio_array)
+
+    return audio_tensor
+
+def batch_generator(array_list, batch_size=5):
+    """
+    Generator that yields batches of `batch_size` NumPy arrays stacked along the first
+    dimension. A trailing partial batch is dropped.
+
+    Parameters:
+        array_list (list of np.ndarray): List of NumPy arrays of shape (H, W, C).
+        batch_size (int): Number of arrays per batch (default is 5).
+
+    Yields:
+        np.ndarray: A batch of shape (batch_size, H, W, C).
+    """
+    for i in range(0, len(array_list), batch_size):
+        batch = array_list[i:i + batch_size]
+        if len(batch) == batch_size:
+            yield np.stack(batch, axis=0)
+
+@timer
+def cosine_sim(emb1, emb2, threshold=0.5):
+    # Cosine similarity between the base embedding and each clip embedding,
+    # plus the count of clips whose similarity exceeds the threshold.
+    cosine_sim = F.cosine_similarity(emb1, emb2)
+    counts = torch.count_nonzero(cosine_sim > threshold).numpy()
+    return (cosine_sim.mean(), counts)
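
A short illustrative usage of these helpers (not part of the commit): the @timer decorator accumulates total_time across calls, batch_generator drops a trailing partial batch, and cosine_sim counts embeddings whose similarity to the base embedding exceeds the threshold. The toy embeddings below are stand-ins for real model outputs.

import numpy as np
import torch
from utils import batch_generator, cosine_sim

# 12 frames with batch_size=5 -> two full batches, the leftover 2 frames are dropped
frames = [np.zeros((224, 224, 3)) for _ in range(12)]
batches = list(batch_generator(frames, batch_size=5))
print(len(batches), batches[0].shape)  # 2 (5, 224, 224, 3)

# One base embedding broadcast against 5 clip embeddings: 3 similar, 2 dissimilar
base = torch.ones(1, 128)
clips = torch.cat([torch.ones(3, 128), -torch.ones(2, 128)])
mean_sim, count_above = cosine_sim(base, clips, threshold=0.4)
print(mean_sim.item(), count_above)  # ~0.2 3
print(cosine_sim.total_time)         # accumulated runtime, courtesy of @timer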