Amith Adiraju committed
Commit · ef39fae
Parent(s): d3acc32

Changed ONNX to checkpoint-based inference, as the ONNX file is corrupted.
Replaced the medoid array with the right embeddings and increased the PG classification threshold to 0.95.
Modified the rating status message.
Fixed issues with audio text summarization.

Signed-off-by: Amith Adiraju <[email protected]>
- base_frame_medoid.npz +1 -1
- video_rating_siamesev2.onnx → checkpoint__fl_batch_480_epoch_0.pt +2 -2
- model_inference.py +32 -25
- pages.py +3 -3
- utils.py +31 -0
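Taken together, the diffs below amount to the following swap (a condensed sketch, not the verbatim commit; the onnxruntime setup is an assumption, since the removed ONNX lines are truncated in this view):

# Before (assumed from the truncated ONNX lines): an onnxruntime session
# produced the clip embeddings.
#   session = onnxruntime.InferenceSession('video_rating_siamesev2.onnx')
#   video_frame_emb = session.run(['emb'], inputs_dict)[0]

# After: the Siamese model is rebuilt in PyTorch, restored from a training
# checkpoint, and run in eval mode under no_grad.
import torch
from utils import SiameseNetwork

session = SiameseNetwork()
checkpoint = torch.load('./checkpoint__fl_batch_480_epoch_0.pt', weights_only=True)
session.load_state_dict(checkpoint['model_state_dict'])
session.eval()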
base_frame_medoid.npz
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:9dad2de4ef28891c9cf509177d3ff24beb0eea068fe3d8159b4b1050d4f55139
 size 772
video_rating_siamesev2.onnx → checkpoint__fl_batch_480_epoch_0.pt
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:15a5a088b3fc06010de0b00ccbf00f0be0058c96d667c417c9a3188fcfb6e6dc
+size 382240253
model_inference.py
CHANGED
@@ -3,20 +3,21 @@ import torch
 from utils import (
     prompt_audio_summarization,
     timer,
-    cosine_sim
+    cosine_sim,
+    SiameseNetwork
 )
 from transformers import BartForConditionalGeneration, BartTokenizer
 import numpy as np
 import whisper
 from streamlit import session_state as sst
-import
+import math
 
 
 @timer
 def get_text_from_audio(audio_tensors) -> str:
     """Transcribe multiple audio tensors in parallel using Whisper's batch processing."""
     # Transcribe the in-memory audio
-    audio_tensors = audio_tensors
+    audio_tensors = audio_tensors
     result = audio_transcriber_model.transcribe(audio_tensors
                                                 )
     all_transcription_segments = result["text"]
@@ -28,17 +29,23 @@ def summarize_from_text(raw_transcription):
     inputs = text_summarizer[0](prompt_audio_summarization + raw_transcription,
                                 return_tensors="pt",
                                 max_length=1024,
-                                truncation=True)
-
-
+                                truncation=True)
+
     summary_ids = text_summarizer[1].generate(**inputs,
                                               max_length=150,
                                               min_length=30,
                                               length_penalty=2.0,
-                                              num_beams=4
+                                              num_beams=4,
+                                              early_stopping = True
                                               )
 
-
+    prediction_string = text_summarizer[0].decode(summary_ids[0], skip_special_tokens=True)
+
+
+    if prompt_audio_summarization[:15] == prediction_string[:15]:
+        prediction_string = prediction_string[50: ]
+
+    return prediction_string
 
 @timer
 def rate_video_frames(video_frames):
@@ -47,39 +54,39 @@ def rate_video_frames(video_frames):
     """
 
     inp_frames = np.array(video_frames, dtype = np.float32).reshape(len(video_frames)//5, 5, 224,224,3)# 20,5,224,224,3
-
-
-    video_frame_emb = video_rating_model.run(['emb'], inputs_dict)[0]
+    with torch.no_grad():
+        video_frame_emb = video_rating_model(torch.tensor(inp_frames) )
 
     overall_sim, count_upg = cosine_sim(emb1 = base_frame_emb,
-                                        emb2 =
-                                        threshold=0.
+                                        emb2 = video_frame_emb,
+                                        threshold=0.95
                                         )
 
     perc_of_upg = count_upg / (len(video_frames)//5)
 
-
-
-
-
+    perc_of_upg = math.floor(perc_of_upg*100)
+    non_upg_perc = math.ceil(100 - perc_of_upg)
+
+    response_string = f"{perc_of_upg} % of important moments from this video contain Under or PG content, rest of {non_upg_perc} % moments contain atleast PG-13, R or even NC-17 content."
+    return response_string
 
 @st.cache_resource
 def load_models():
-
-    transcriber = whisper.load_model("base", device = sst['device'])
+    transcriber = whisper.load_model("base")
 
-    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
+    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
     tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
 
     base_frame_emb = torch.tensor(
         np.load('base_frame_medoid.npz')['arr'],
-        dtype = torch.float32,
-        device = sst['device']
+        dtype = torch.float32
     )
 
-    session =
-
-
+    session = SiameseNetwork()
+    checkpoint = torch.load('./checkpoint__fl_batch_480_epoch_0.pt', weights_only=True)
+    session.load_state_dict(checkpoint['model_state_dict'])
+    _ = session.eval()
+
 
     return (
         transcriber, (tokenizer, model), session, base_frame_emb
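For reference, the rating arithmetic introduced in rate_video_frames works out as below; a minimal sketch with stand-in random embeddings, mirroring the cosine_sim helper from utils.py (the 0.95 threshold and the 5-frame clip grouping come from the diff):

import math
import torch
import torch.nn.functional as F

def cosine_sim(emb1, emb2, threshold=0.95):
    sims = F.cosine_similarity(emb1, emb2)          # one score per 5-frame clip
    counts = torch.count_nonzero(sims > threshold)  # clips matching the Under/PG medoid
    return sims.mean(), counts

base_frame_emb = torch.randn(20, 128)   # stand-in for base_frame_medoid.npz
video_frame_emb = torch.randn(20, 128)  # stand-in for SiameseNetwork output (20 clips)
_, count_upg = cosine_sim(base_frame_emb, video_frame_emb)

perc_of_upg = math.floor(int(count_upg) / 20 * 100)  # % of clips rated Under/PG
non_upg_perc = math.ceil(100 - perc_of_upg)          # remainder reported as PG-13/R/NC-17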
pages.py
CHANGED
@@ -76,7 +76,7 @@ async def model_inference_page():
     try:
         video_rating_scale = rate_video_frames(important_frames)
     except Exception as e:
-        video_rating_scale =
+        video_rating_scale = "Sorry, we couldn't generate rating of your video because of this error "
 
     st.toast("Done")
     st.header("Movie Scale Rating of Your Video: ", divider = True)
@@ -91,7 +91,7 @@ async def model_inference_page():
     try:
         video_summary_text = get_text_from_audio(sst["audio_transcript"])
     except Exception as e:
-        video_summary_text =
+        video_summary_text = "Sorry, we couldn't extract text from audio of this file because of this error"
     st.toast("Done")
 
     if video_summary_text[:5] != "Sorry":
@@ -99,7 +99,7 @@ async def model_inference_page():
     try:
         video_summary_text = summarize_from_text(video_summary_text)
     except Exception as e:
-        video_summary_text =
+        video_summary_text = "Sorry, we couldn't summarize text from audio of this file"
     st.toast("Done")
 
     st.header("Audio Transcript summary of your video: ", divider = True)
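Note the coupling these fallbacks rely on: the guard if video_summary_text[:5] != "Sorry" keys on the literal "Sorry" prefix, so every fallback string must begin with it. A minimal sketch of the sentinel pattern, using the same names as pages.py:

try:
    video_summary_text = get_text_from_audio(sst["audio_transcript"])
except Exception as e:
    # Fallback doubles as the user-facing message; the "Sorry" prefix is the sentinel.
    video_summary_text = "Sorry, we couldn't extract text from audio of this file because of this error"

if video_summary_text[:5] != "Sorry":   # only summarize a real transcript
    video_summary_text = summarize_from_text(video_summary_text)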
utils.py
CHANGED
@@ -12,6 +12,8 @@ import time
 
 from io import BytesIO
 import torch
+import torchvision.models as models
+import torch.nn as nn
 import soundfile as sf
 import subprocess
 from typing import List
@@ -19,6 +21,34 @@ from typing import List
 
 prompt_audio_summarization = "This is a video transcript, tell me what is this about: "
 
 
+class SiameseNetwork(nn.Module):
+    def __init__(self, model_name="vit_b_16"):
+        super(SiameseNetwork, self).__init__()
+
+        self.encoder = models.vit_b_16(weights="IMAGENET1K_V1")  # Pretrained ViT
+        self.encoder.heads = nn.Identity()  # Remove classification head
+
+        self.fc = nn.Linear(768, 128)  # Reduce to 128-d embedding
+
+    def forward(self, frames):
+
+        B, num_frames, H, W, C = frames.shape  # (Batch, num_frames, H, W, C)
+
+        # Flatten frames into batch dimension for ViT
+        frames = frames.permute(0, 1, 4, 2, 3).reshape(B * num_frames, C, H, W)
+
+        # Extract frame-level embeddings
+        emb = self.encoder(frames)
+
+        # Reshape back to (B, T, 768) and average over T
+        # TODO: Change this to use LSTM instead of averaging
+        emb = emb.reshape(B, num_frames, -1).mean(dim=1)  # (B, 768)
+
+        # Pass through fully connected layer
+        emb = self.fc(emb)
+
+        return emb
+
 def timer(func):
     def wrapper(*args, **kwargs):
         start = time.time()
@@ -138,3 +168,4 @@ def cosine_sim(emb1, emb2, threshold = 0.5):
     cosine_sim = F.cosine_similarity(emb1, emb2)
     counts = torch.count_nonzero(cosine_sim > threshold).numpy()
     return (cosine_sim.mean(), counts)
+
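A hypothetical smoke test tying the new class to the committed checkpoint (the file name and the 'model_state_dict' key are from the diffs above; the input layout matches rate_video_frames):

import torch
from utils import SiameseNetwork

model = SiameseNetwork()
ckpt = torch.load('./checkpoint__fl_batch_480_epoch_0.pt', weights_only=True)
model.load_state_dict(ckpt['model_state_dict'])
model.eval()

frames = torch.randn(2, 5, 224, 224, 3)  # two clips of five 224x224 RGB frames
with torch.no_grad():
    emb = model(frames)
print(emb.shape)  # torch.Size([2, 128]) -- one 128-d embedding per clip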