change_medoid_inference #3
by AmithAdiraju1694 - opened

Files changed:
- base_frame_medoid.npz +1 -1
- video_rating_siamesev2.onnx → checkpoint__fl_batch_480_epoch_0.pt +2 -2
- model_inference.py +32 -25
- pages.py +3 -3
- utils.py +31 -0
base_frame_medoid.npz
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:9dad2de4ef28891c9cf509177d3ff24beb0eea068fe3d8159b4b1050d4f55139
 size 772
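Note: load_models in this PR reads this file via np.load('base_frame_medoid.npz')['arr']. A minimal sanity check after pulling the LFS object, as a sketch; the 128-d shape is an assumption inferred from SiameseNetwork's output dimension below, not something the diff states:

# Hedged sanity check of the updated medoid file; the key 'arr' matches load_models.
import numpy as np

arr = np.load('base_frame_medoid.npz')['arr']
print(arr.shape, arr.dtype)  # assumption: a 128-d float32 embedding, per SiameseNetwork.fc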
video_rating_siamesev2.onnx → checkpoint__fl_batch_480_epoch_0.pt
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:15a5a088b3fc06010de0b00ccbf00f0be0058c96d667c417c9a3188fcfb6e6dc
+size 382240253
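Note: the ONNX model is replaced by a raw PyTorch checkpoint. A quick way to inspect it after `git lfs pull`, sketched under the assumption the LFS object has been fetched; the 'model_state_dict' key and weights_only=True usage are confirmed by load_models in this PR:

# Inspect the new checkpoint the same way load_models consumes it.
import torch

ckpt = torch.load('./checkpoint__fl_batch_480_epoch_0.pt', weights_only=True)
print(list(ckpt.keys()))  # expect 'model_state_dict' among the keys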
model_inference.py
CHANGED
@@ -3,20 +3,21 @@ import torch
 from utils import (
     prompt_audio_summarization,
     timer,
-    cosine_sim
+    cosine_sim,
+    SiameseNetwork
 )
 from transformers import BartForConditionalGeneration, BartTokenizer
 import numpy as np
 import whisper
 from streamlit import session_state as sst
-import 
+import math
 
 
 @timer
 def get_text_from_audio(audio_tensors) -> str:
     """Transcribe multiple audio tensors in parallel using Whisper's batch processing."""
     # Transcribe the in-memory audio
-    audio_tensors = audio_tensors
+    audio_tensors = audio_tensors
     result = audio_transcriber_model.transcribe(audio_tensors
     )
     all_transcription_segments = result["text"]
@@ -28,17 +29,23 @@ def summarize_from_text(raw_transcription):
     inputs = text_summarizer[0](prompt_audio_summarization + raw_transcription,
                                 return_tensors="pt",
                                 max_length=1024,
-                                truncation=True)
-
-
+                                truncation=True)
+
     summary_ids = text_summarizer[1].generate(**inputs,
                                               max_length=150,
                                               min_length=30,
                                               length_penalty=2.0,
-                                              num_beams=4
+                                              num_beams=4,
+                                              early_stopping = True
     )
 
-
+    prediction_string = text_summarizer[0].decode(summary_ids[0], skip_special_tokens=True)
+
+
+    if prompt_audio_summarization[:15] == prediction_string[:15]:
+        prediction_string = prediction_string[50: ]
+
+    return prediction_string
 
 @timer
 def rate_video_frames(video_frames):
@@ -47,39 +54,39 @@ def rate_video_frames(video_frames):
     """
 
     inp_frames = np.array(video_frames, dtype = np.float32).reshape(len(video_frames)//5, 5, 224,224,3)# 20,5,224,224,3
-
-
-    video_frame_emb = video_rating_model.run(['emb'], inputs_dict)[0]
+    with torch.no_grad():
+        video_frame_emb = video_rating_model(torch.tensor(inp_frames) )
 
     overall_sim, count_upg = cosine_sim(emb1 = base_frame_emb,
-                                        emb2 = 
-                                        threshold=0.
+                                        emb2 = video_frame_emb,
+                                        threshold=0.95
     )
 
     perc_of_upg = count_upg / (len(video_frames)//5)
 
-
-
-
-
+    perc_of_upg = math.floor(perc_of_upg*100)
+    non_upg_perc = math.ceil(100 - perc_of_upg)
+
+    response_string = f"{perc_of_upg} % of important moments from this video contain Under or PG content, rest of {non_upg_perc} % moments contain atleast PG-13, R or even NC-17 content."
+    return response_string
 
 @st.cache_resource
 def load_models():
-
-    transcriber = whisper.load_model("base", device = sst['device'])
+    transcriber = whisper.load_model("base")
 
-    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
+    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
     tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
 
     base_frame_emb = torch.tensor(
         np.load('base_frame_medoid.npz')['arr'],
-        dtype = torch.float32
-        device = sst['device']
+        dtype = torch.float32
     )
 
-    session = 
-
-
+    session = SiameseNetwork()
+    checkpoint = torch.load('./checkpoint__fl_batch_480_epoch_0.pt', weights_only=True)
+    session.load_state_dict(checkpoint['model_state_dict'])
+    _ = session.eval()
+
 
     return (
         transcriber, (tokenizer, model), session, base_frame_emb
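Note: rate_video_frames now runs the PyTorch SiameseNetwork directly under torch.no_grad() instead of an ONNX Runtime session.run call. A hypothetical smoke test, assuming load_models has already populated the module-level video_rating_model and base_frame_emb, and that the frame count is a multiple of 5 (the reshape to (N//5, 5, 224, 224, 3) requires it):

# Hypothetical smoke test of the new inference path: 20 dummy 224x224 RGB frames.
import numpy as np

dummy_frames = [np.zeros((224, 224, 3), dtype=np.float32) for _ in range(20)]
print(rate_video_frames(dummy_frames))  # e.g. "NN % of important moments ..."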
pages.py
CHANGED
@@ -76,7 +76,7 @@ async def model_inference_page():
     try:
         video_rating_scale = rate_video_frames(important_frames)
     except Exception as e:
-        video_rating_scale = 
+        video_rating_scale = "Sorry, we couldn't generate rating of your video because of this error "
 
     st.toast("Done")
     st.header("Movie Scale Rating of Your Video: ", divider = True)
@@ -91,7 +91,7 @@ async def model_inference_page():
     try:
         video_summary_text = get_text_from_audio(sst["audio_transcript"])
     except Exception as e:
-        video_summary_text = 
+        video_summary_text = "Sorry, we couldn't extract text from audio of this file because of this error"
     st.toast("Done")
 
     if video_summary_text[:5] != "Sorry":
@@ -99,7 +99,7 @@ async def model_inference_page():
     try:
         video_summary_text = summarize_from_text(video_summary_text)
     except Exception as e:
-        video_summary_text = 
+        video_summary_text = "Sorry, we couldn't summarize text from audio of this file"
     st.toast("Done")
 
     st.header("Audio Transcript summary of your video: ", divider = True)
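Note: the guard video_summary_text[:5] != "Sorry" above only skips the summarization step because every new fallback string starts with "Sorry"; a one-line illustration of the invariant:

# The prefix check in pages.py depends on all fallback strings sharing this prefix.
assert "Sorry, we couldn't extract text from audio of this file because of this error"[:5] == "Sorry"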
utils.py
CHANGED
@@ -12,6 +12,8 @@ import time
 
 from io import BytesIO
 import torch
+import torchvision.models as models
+import torch.nn as nn
 import soundfile as sf
 import subprocess
 from typing import List
@@ -19,6 +21,34 @@ from typing import List
 prompt_audio_summarization = "This is a video transcript, tell me what is this about: "
 
 
+class SiameseNetwork(nn.Module):
+    def __init__(self, model_name="vit_b_16"):
+        super(SiameseNetwork, self).__init__()
+
+        self.encoder = models.vit_b_16(weights="IMAGENET1K_V1") # Pretrained ViT
+        self.encoder.heads = nn.Identity() # Remove classification head
+
+        self.fc = nn.Linear(768, 128) # Reduce to 128-d embedding
+
+    def forward(self, frames):
+
+        B,num_frames,H,W,C = frames.shape # (Batch,num_frames, H, W, C)
+
+        # Flatten frames into batch dimension for ViT
+        frames = frames.permute(0,1,4,2,3).reshape(B * num_frames, C,H,W)
+
+        # Extract frame-level embeddings
+        emb = self.encoder(frames)
+
+        # Reshape back to (B, T, 768) and average over T
+        #TODO: Change this to use LSTM instead of averaging
+        emb = emb.reshape(B, num_frames, -1).mean(dim=1) # (B, 768)
+
+        # Pass through fully connected layer
+        emb = self.fc(emb)
+
+        return emb
+
 def timer(func):
     def wrapper(*args, **kwargs):
         start = time.time()
@@ -138,3 +168,4 @@ def cosine_sim(emb1, emb2, threshold = 0.5):
     cosine_sim = F.cosine_similarity(emb1, emb2)
     counts = torch.count_nonzero(cosine_sim > threshold).numpy()
     return (cosine_sim.mean(), counts)
+
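Note: a minimal shape check for SiameseNetwork as defined above, assuming the channels-last (B, num_frames, H, W, C) input layout that rate_video_frames produces:

# Sketch: verify the embedding contract (B, num_frames, 224, 224, 3) -> (B, 128).
import torch

net = SiameseNetwork()
net.eval()
with torch.no_grad():
    clips = torch.zeros(2, 5, 224, 224, 3)  # 2 clips of 5 RGB frames each
    emb = net(clips)
print(emb.shape)  # torch.Size([2, 128])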