Spaces:
Runtime error
Runtime error
Monius
committed on
Commit
·
dc465b0
1
Parent(s):
1416e63
v0.3 by <M0n-ius>
Browse files- constraint.py +77 -4
- run.py +17 -14
- test.py +34 -0
- utils/__init__.py +5 -1
- utils/anthropic.py +17 -0
- utils/azure.py +13 -33
- utils/base.py +47 -0
- utils/google.py +16 -0
- utils/openai.py +10 -0
- utils/video.py +46 -9
constraint.py
CHANGED
@@ -1,9 +1,82 @@
|
|
1 |
-
SYS_PROMPT = ""
|
2 |
-
"""
|
3 |
|
4 |
-
USER_PROMPT = "
|
|
|
5 |
|
6 |
SKIP = 2
|
7 |
TEMP = 0.3
|
8 |
TOP = 0.75
|
9 |
-
MAX_TOKEN = 512
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# System message sent to the captioning model (empty by default).
SYS_PROMPT = ""

# User instruction asking the model for a detailed video description.
USER_PROMPT = """Create a detailed and accurate video description, starting from a specific scene and possibly transitioning through various themes and settings. Begin by describing the initial scene in detail, including the environment, key objects, any characters or creatures and their actions, and the overall atmosphere, considering specific aspects such as shot sizes (extreme close-up, close-up, medium, full, wide, etc.), camera movements (push, pull, shake, pan, tilt, rise, descend, etc.), and more. For example, if the scene involves a person like a young man sitting on a chair reading a book, describe his appearance and the surrounding environment, including basic features such as the character's gender, age, race, etc., as well as actions, emotions, dialogues, and performance content. If the scene includes animals or natural elements such as cats, the sky, or landscapes, vividly describe these elements and their behaviors or states, and consider the emotions and thematic elements introduced in this opening scene. Then, as the video progresses, describe the evolving visual effects, how they present a more vivid and rich picture through camera movements and special effects, considering aesthetics (style, tone, color palette, atmosphere, emotions, etc.). If the scene changes, explain how it transitions, what new elements are introduced, whether the atmosphere remains consistent or changes, and how this affects the overall narrative or theme of the video. If the video contains multiple scenes, describe the connections between them, whether creating a story, presenting a contrast, or highlighting different aspects of a theme, considering scenes (day, night, indoor, outdoor, etc.), props (relationship with characters and scenes, relationship with camera and scheduling), and scene scheduling (single character, multiple characters with camera and narrative association, and how they relate to scene props). 
Finally, conclude with a summary that encapsulates the essence of the video, combining all the described elements into a cohesive narrative or message, emphasizing the sensory and emotional experience provided by the video, and speculating on the impact or message intended for the audience, allowing viewers to engage in profound reflection and insight during the viewing process, thus achieving a deeper impact. The generated description should adhere to English grammar and be no less than 120 words in length.
"""

# Default sampling / frame parameters.
SKIP = 2
TEMP = 0.3
TOP = 0.75
MAX_TOKEN = 512

# Provider name -> name of the class implementing that backend.
API_CLASSES = {
    'Azure': 'AzureAPI',
    'Google': 'GoogleAPI',
    'Anthropic': 'AnthropicAPI',
    'OpenAI': 'OpenAIAPI'
}

# Per-provider UI metadata: selectable models plus labels for the key/endpoint inputs.
PROVIDERS_CONFIG = {
    'Azure': {
        'model': ['GPT-4o', 'GPT-4v'],
        'key_label': 'Azure API Key',
        'endpoint_label': 'Azure Endpoint'
    },
    'Google': {
        'model': ['Gemini-1.5-Flash', 'Gemini-1.5-Pro'],
        'key_label': 'Google API Key',
        'endpoint_label': 'Google API Endpoint'
    },
    'Anthropic': {
        'model': ['Claude-3-Opus', 'Claude-3-Sonnet'],
        'key_label': 'Anthropic API Key',
        'endpoint_label': 'Anthropic Endpoint'
    },
    'OpenAI': {
        'model': ['GPT-4o', 'GPT-4v'],
        'key_label': 'OpenAI API Key',
        'endpoint_label': 'OpenAI Endpoint'
    }
}

# Widget settings (sliders/dropdowns) shared by every provider tab.
GENERAL_CONFIG = {
    'temp': {'label': 'Temperature', 'default': 0.3, 'min': 0, 'max': 1, 'step': 0.1},
    'top_p': {'label': 'Top-P', 'default': 0.75, 'min': 0, 'max': 1, 'step': 0.1},
    'max_tokens': {'label': 'Max Tokens', 'default': 4096, 'min': 512, 'max': 4096, 'step': 1},
    'frame_format': {'label': 'Frame Format', 'default': 'JPEG', 'choices': ['JPEG', 'PNG']},
    'frame_skip': {'label': 'Frame Skip', 'default': 2, 'min': 2, 'max': 100, 'step': 1},
    'group_size': {'label': 'Group Size', 'default': 10, 'min': 1, 'max': 100, 'step': 1}
}
|
run.py
CHANGED
@@ -1,20 +1,20 @@
|
|
1 |
# app.py
|
2 |
import gradio as gr
|
3 |
-
from utils import VideoProcessor, AzureAPI
|
4 |
from constraint import SYS_PROMPT, USER_PROMPT
|
5 |
|
6 |
-
def
|
7 |
-
processor = VideoProcessor(frame_format=frame_format,
|
8 |
-
frames = processor.
|
9 |
-
|
10 |
-
base64_list = processor.to_base64_list(
|
11 |
-
debug_image = processor.concatenate(
|
12 |
|
13 |
if not key or not endpoint:
|
14 |
return "", f"API key or endpoint is missing. Processed {len(frames)} frames.", debug_image
|
15 |
|
16 |
api = AzureAPI(key=key, endpoint=endpoint, model=model, temp=temp, top_p=top_p, max_tokens=max_tokens)
|
17 |
-
caption = api.get_caption(
|
18 |
return f"{caption}", f"Using model '{model}' with {len(frames)} frames extracted.", debug_image
|
19 |
|
20 |
with gr.Blocks() as Core:
|
@@ -27,12 +27,15 @@ with gr.Blocks() as Core:
|
|
27 |
with gr.Row():
|
28 |
temp = gr.Slider(0, 1, 0.3, step=0.1, label="Temperature")
|
29 |
top_p = gr.Slider(0, 1, 0.75, step=0.1, label="Top-P")
|
30 |
-
max_tokens = gr.Slider(512, 4096,
|
31 |
with gr.Row():
|
32 |
frame_format = gr.Dropdown(label="Frame Format", value="JPEG", choices=["JPEG", "PNG"], interactive=False)
|
33 |
-
|
34 |
-
|
35 |
-
|
|
|
|
|
|
|
36 |
with gr.Tabs():
|
37 |
with gr.Tab("Azure"):
|
38 |
result = gr.Textbox(label="Result", lines=15, max_lines=100, show_copy_button=True, interactive=False)
|
@@ -78,8 +81,8 @@ with gr.Blocks() as Core:
|
|
78 |
video_gd_auth = gr.Text(label="Google Drive Access Token")
|
79 |
caption_button = gr.Button("Caption", variant="primary", size="lg")
|
80 |
caption_button.click(
|
81 |
-
|
82 |
-
inputs=[
|
83 |
outputs=[result, info, frame]
|
84 |
)
|
85 |
|
|
|
1 |
# app.py
|
2 |
import gradio as gr
|
3 |
+
from utils import VideoProcessor, AzureAPI, GoogleAPI, AnthropicAPI, OpenAIAPI
|
4 |
from constraint import SYS_PROMPT, USER_PROMPT
|
5 |
|
6 |
+
def fast_caption(sys_prompt, usr_prompt, temp, top_p, max_tokens, model, key, endpoint, video, frame_format, frame_limit, provider='Azure'):
    """Extract frames from `video`, send them to a caption backend, and return
    (caption, status message, debug image).

    The `provider` parameter is new and defaults to 'Azure', preserving the
    old hard-wired behavior; all four backend classes are already imported at
    the top of this file, so callers may now select any of them.
    """
    processor = VideoProcessor(frame_format=frame_format, frame_limit=frame_limit)
    frames = processor._decode(video)

    base64_list = processor.to_base64_list(frames)
    debug_image = processor.concatenate(frames)

    # Bail out early (with the debug image still shown) when credentials are absent.
    if not key or not endpoint:
        return "", f"API key or endpoint is missing. Processed {len(frames)} frames.", debug_image

    # Dispatch on provider instead of hard-coding AzureAPI.
    backends = {
        'Azure': AzureAPI,
        'Google': GoogleAPI,
        'Anthropic': AnthropicAPI,
        'OpenAI': OpenAIAPI,
    }
    api = backends[provider](key=key, endpoint=endpoint, model=model, temp=temp, top_p=top_p, max_tokens=max_tokens)
    caption = api.get_caption(sys_prompt, usr_prompt, base64_list)
    return f"{caption}", f"Using model '{model}' with {len(frames)} frames extracted.", debug_image
|
19 |
|
20 |
with gr.Blocks() as Core:
|
|
|
27 |
with gr.Row():
|
28 |
temp = gr.Slider(0, 1, 0.3, step=0.1, label="Temperature")
|
29 |
top_p = gr.Slider(0, 1, 0.75, step=0.1, label="Top-P")
|
30 |
+
max_tokens = gr.Slider(512, 4096, 1024, step=1, label="Max Tokens")
|
31 |
with gr.Row():
|
32 |
frame_format = gr.Dropdown(label="Frame Format", value="JPEG", choices=["JPEG", "PNG"], interactive=False)
|
33 |
+
frame_limit = gr.Slider(1, 100, 10, step=1, label="Frame Limits")
|
34 |
+
with gr.Tabs():
|
35 |
+
with gr.Tab("User"):
|
36 |
+
usr_prompt = gr.Textbox(USER_PROMPT, label="User Prompt", lines=10, max_lines=100, show_copy_button=True)
|
37 |
+
with gr.Tab("System"):
|
38 |
+
sys_prompt = gr.Textbox(SYS_PROMPT, label="System Prompt", lines=10, max_lines=100, show_copy_button=True)
|
39 |
with gr.Tabs():
|
40 |
with gr.Tab("Azure"):
|
41 |
result = gr.Textbox(label="Result", lines=15, max_lines=100, show_copy_button=True, interactive=False)
|
|
|
81 |
video_gd_auth = gr.Text(label="Google Drive Access Token")
|
82 |
caption_button = gr.Button("Caption", variant="primary", size="lg")
|
83 |
caption_button.click(
|
84 |
+
fast_caption,
|
85 |
+
inputs=[sys_prompt, usr_prompt, temp, top_p, max_tokens, model, key, endpoint, video_src, frame_format, frame_limit],
|
86 |
outputs=[result, info, frame]
|
87 |
)
|
88 |
|
test.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import av
|
2 |
+
import time
|
3 |
+
|
4 |
+
def sample(N, K):
    """Pick K roughly evenly spaced indices from range(N), always keeping the last one."""
    indices = list(range(N))
    n = len(indices)
    # Degenerate cases: fewer elements than samples requested, or trivially small K.
    if K >= n or K < 2:
        return indices

    step = n // K
    picked = indices[::step][:K - 1]
    picked.append(indices[-1])
    return picked
|
14 |
+
|
15 |
+
# NOTE(review): path is hardcoded to one developer's machine — override when running.
DEFAULT_VIDEO = '/Users/monius/Documents/YueZhang/Video/Sora/1.mp4'


def time_sampled_decode(video_path=DEFAULT_VIDEO, k=10):
    """Decode k evenly spaced frames from `video_path` and time the whole operation.

    Returns (frames, elapsed_seconds). Previously this ran unconditionally at
    import time; it is now a function behind a __main__ guard so importing the
    module has no side effects.
    """
    start = time.time()
    frames = []
    with av.open(video_path) as container:
        src = container.streams.video[0]
        time_base = src.time_base
        framerate = src.average_rate

        for i in sample(src.frames, k):
            # Frame index -> stream timestamp; seek lands on the keyframe at or
            # before that timestamp, then decode the next frame.
            n = round((i / framerate) / time_base)
            container.seek(n, backward=True, stream=src)
            frame = next(container.decode(video=0))
            frames.append(frame.to_image())
    return frames, time.time() - start


if __name__ == '__main__':
    _, elapsed = time_sampled_decode()
    print(elapsed)
|
utils/__init__.py
CHANGED
@@ -1,2 +1,6 @@
|
|
1 |
from .azure import AzureAPI
|
2 |
-
from .
|
|
|
|
|
|
|
|
|
|
1 |
from .azure import AzureAPI
|
2 |
+
from .google import GoogleAPI
|
3 |
+
from .anthropic import AnthropicAPI
|
4 |
+
from .openai import OpenAIAPI
|
5 |
+
|
6 |
+
from .video import VideoProcessor
|
utils/anthropic.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .base import BaseAPI
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
class AnthropicAPI(BaseAPI):
    """Caption backend targeting Anthropic's Messages API (/v1/messages)."""

    def get_headers(self):
        # Anthropic authenticates with `x-api-key`, and every request to the
        # Messages API must also carry an `anthropic-version` header.
        return {
            'Content-Type': 'application/json',
            'x-api-key': self.key,
            'anthropic-version': '2023-06-01'
        }

    def get_caption(self, prompt: str, user_prompt: str, images_base64: List[str]) -> str:
        # NOTE(review): BaseAPI builds an OpenAI-style `messages` payload with
        # `image_url` parts; Anthropic expects its own content-block format —
        # confirm the payload is translated upstream.
        url_suffix = '/v1/messages'
        return super().get_caption(prompt, user_prompt, images_base64, url_suffix)

    def parse_response(self, response: dict) -> str:
        # /v1/messages returns a list of content blocks; the previous
        # `response['completion']` key only exists on the legacy /v1/complete
        # endpoint and would raise KeyError here.
        return response['content'][0]['text']
|
17 |
+
|
utils/azure.py
CHANGED
@@ -1,36 +1,16 @@
|
|
1 |
-
|
2 |
-
import requests
|
3 |
from typing import List
|
4 |
|
5 |
-
class AzureAPI:
|
6 |
-
def
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
self.temp = temp
|
11 |
-
self.top_p = top_p
|
12 |
-
self.max_tokens = max_tokens
|
13 |
-
self.version = "2024-02-15-preview"
|
14 |
-
|
15 |
-
def get_caption(self, prompt: str, user_prompt: str, images_base64: List[str]) -> str:
|
16 |
-
headers = {
|
17 |
-
'content-type': 'application/json',
|
18 |
-
"api-key": self.key,
|
19 |
-
}
|
20 |
-
system_msg = {"role": "system", "content": prompt}
|
21 |
-
user_msg = [{"type": "text", "text": user_prompt}]
|
22 |
-
img_msg = [
|
23 |
-
{"type": "image_url", "image_url": {"url": f'data:image/jpg;base64,{image}'}}
|
24 |
-
for image in images_base64
|
25 |
-
]
|
26 |
-
payload = {
|
27 |
-
'messages': [system_msg, {"role": "user", "content": user_msg + img_msg}],
|
28 |
-
'temperature': self.temp,
|
29 |
-
'top_p': self.top_p,
|
30 |
-
'max_tokens': self.max_tokens,
|
31 |
-
'model': self.model
|
32 |
}
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
return
|
|
|
|
|
|
|
|
1 |
+
from .base import BaseAPI
|
|
|
2 |
from typing import List
|
3 |
|
4 |
+
class AzureAPI(BaseAPI):
    """Caption backend for Azure OpenAI chat-completions deployments."""

    # Pinned Azure OpenAI REST API version.
    _API_VERSION = '2024-02-15-preview'

    def get_headers(self):
        # Azure authenticates with an `api-key` header rather than a Bearer token.
        return {
            'Content-Type': 'application/json',
            'api-key': self.key
        }

    def get_caption(self, sys_prompt: str, user_prompt: str, images_base64: List[str]) -> str:
        # Azure routes requests per-deployment; the deployment name is the model.
        suffix = (
            f'/openai/deployments/{self.model}'
            f'/chat/completions?api-version={self._API_VERSION}'
        )
        return super().get_caption(sys_prompt, user_prompt, images_base64, suffix)

    def parse_response(self, response: dict) -> str:
        # Standard chat-completions response shape.
        return response['choices'][0]['message']['content']
|
utils/base.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
class BaseAPI:
    """Shared plumbing for the vision-caption REST backends.

    Subclasses override get_headers()/parse_response() (and usually wrap
    get_caption() with a provider-specific URL suffix).
    """

    def __init__(self, key: str, endpoint: str, model: str, temp: float = 0.3,
                 top_p: float = 0.75, max_tokens: int = 1024, timeout=None):
        """Store credentials, target model, and sampling parameters.

        Args:
            key: API key/token for the provider.
            endpoint: Base URL; the per-call URL suffix is appended to it.
            model: Model (or deployment) identifier.
            temp: Sampling temperature.
            top_p: Nucleus-sampling cutoff.
            max_tokens: Response token budget.
            timeout: Seconds before requests.post gives up. New, optional;
                the default None preserves the old wait-forever behavior,
                but callers are encouraged to pass a bound.
        """
        self.key = key
        self.endpoint = endpoint
        self.model = model
        self.temp = temp
        self.top_p = top_p
        self.max_tokens = max_tokens
        self.timeout = timeout

    def get_headers(self):
        """Default OpenAI-style bearer auth; providers override as needed."""
        return {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {self.key}'
        }

    def get_payload(self, sys_prompt: str, user_prompt: str, images_base64: List[str]) -> dict:
        """Build an OpenAI-chat-format payload: system message, then a user
        message containing the prompt text followed by the base64 images."""
        _txt = [{"type": "text", "text": user_prompt}]
        _img = [
            {"type": "image_url", "image_url": {"url": f'data:image/jpg;base64,{image}'}}
            for image in images_base64
        ]

        sys_msg = {"role": "system", "content": sys_prompt}
        usr_msg = {"role": "user", "content": _txt + _img}

        return {
            'messages': [sys_msg, usr_msg],
            'temperature': self.temp,
            'top_p': self.top_p,
            'max_tokens': self.max_tokens,
            'model': self.model
        }

    def get_caption(self, sys_prompt: str, user_prompt: str, images_base64: List[str], url_suffix: str) -> str:
        """POST the payload to endpoint+url_suffix and return the parsed caption.

        Raises requests.HTTPError on a non-2xx response.
        """
        headers = self.get_headers()
        payload = self.get_payload(sys_prompt, user_prompt, images_base64)
        url = f'{self.endpoint}{url_suffix}'

        # A missing timeout makes requests wait forever; self.timeout (default
        # None for backward compatibility) lets callers bound the call.
        response = requests.post(url, headers=headers, json=payload, timeout=self.timeout)
        response.raise_for_status()
        return self.parse_response(response.json())

    def parse_response(self, response: dict) -> str:
        """Extract the caption text from a provider response; must be overridden."""
        raise NotImplementedError("Subclasses should implement this method")
|
utils/google.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .base import BaseAPI
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
class GoogleAPI(BaseAPI):
    """Caption backend targeting Google's generateContent endpoint."""

    def get_headers(self):
        # NOTE(review): both a Bearer header (here) and a ?key= query parameter
        # (in get_caption) are sent; Gemini needs only one auth mechanism —
        # confirm which one is intended and drop the other.
        return {
            'Authorization': f'Bearer {self.key}'
        }

    def get_caption(self, prompt: str, user_prompt: str, images_base64: List[str]) -> str:
        # NOTE(review): BaseAPI builds an OpenAI-style payload; generateContent
        # expects a `contents`/`parts` body — confirm translation upstream.
        url_suffix = f'/v1/models/{self.model}:generateContent?key={self.key}'
        return super().get_caption(prompt, user_prompt, images_base64, url_suffix)

    def parse_response(self, response: dict) -> str:
        # generateContent responses carry text under candidates[].content.parts[].text;
        # there is no top-level 'predictions' key (that belongs to the Vertex
        # predict API), so the old lookup raised KeyError.
        return response['candidates'][0]['content']['parts'][0]['text']
|
16 |
+
|
utils/openai.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .base import BaseAPI
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
class OpenAIAPI(BaseAPI):
    """Caption backend for the OpenAI Chat Completions API."""

    def get_caption(self, prompt: str, user_prompt: str, images_base64: List[str]) -> str:
        # BaseAPI builds a chat-format payload ('messages' with image parts).
        # The legacy /v1/completions endpoint does not accept that body; the
        # chat completions endpoint is the correct target for it.
        url_suffix = '/v1/chat/completions'
        return super().get_caption(prompt, user_prompt, images_base64, url_suffix)

    def parse_response(self, response: dict) -> str:
        # Chat responses nest the text under choices[].message.content, not a
        # top-level choices[].text (which is the legacy completions shape).
        return response['choices'][0]['message']['content']
|
utils/video.py
CHANGED
@@ -6,12 +6,54 @@ from PIL import Image
|
|
6 |
from typing import List
|
7 |
from dataclasses import dataclass
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
@dataclass
|
10 |
class VideoProcessor:
|
11 |
frame_format: str = "JPEG"
|
12 |
-
|
13 |
-
group_size: int = 10
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
def decode(self, video_path: str) -> List[Image.Image]:
|
16 |
frames = []
|
17 |
container = av.open(video_path)
|
@@ -44,13 +86,8 @@ class VideoProcessor:
|
|
44 |
|
45 |
return concatenated_image
|
46 |
|
47 |
-
def
|
48 |
-
|
49 |
-
groups = [frames[i:i + xs] for i in range(0, len(frames), xs)]
|
50 |
-
sampled_groups = []
|
51 |
-
for group in groups:
|
52 |
-
interval = max(1, len(group) // limit)
|
53 |
-
sampled_groups.append([group[i] for i in range(0, len(group), interval)])
|
54 |
return [self.concatenate(group) for group in sampled_groups]
|
55 |
|
56 |
def to_base64_list(self, images: List[Image.Image]) -> List[str]:
|
|
|
6 |
from typing import List
|
7 |
from dataclasses import dataclass
|
8 |
|
9 |
+
def sample(N, K):
    """Return K near-evenly spaced indices out of range(N), last index always included."""
    idx = list(range(N))
    if K >= len(idx) or K < 2:
        # Nothing to thin out (or trivial K): return every index.
        return idx

    stride = len(idx) // K
    chosen = [idx[pos] for pos in range(0, len(idx), stride)][:K - 1]
    chosen.append(idx[-1])
    return chosen
|
19 |
+
|
20 |
+
def grid_sample(array, N, K):
    """Split `array` into K contiguous groups (sizes differing by at most one)
    and keep at most N evenly spaced items from each group."""
    base, extra = divmod(len(array), K)
    out = []
    start = 0

    for g in range(K):
        # The first `extra` groups absorb one leftover element each.
        size = base + (1 if g < extra else 0)
        group = array[start:start + size]
        start += size

        if N >= len(group):
            out.append(group)
        else:
            step = len(group) // N
            out.append([group[j * step] for j in range(N)])

    return out
|
36 |
+
|
37 |
@dataclass
|
38 |
class VideoProcessor:
|
39 |
frame_format: str = "JPEG"
|
40 |
+
frame_limit: int = 10
|
|
|
41 |
|
42 |
+
def _decode(self, video_path: str) -> List[Image.Image]:
|
43 |
+
frames = []
|
44 |
+
with av.open(video_path) as container:
|
45 |
+
src = container.streams.video[0]
|
46 |
+
time_base = src.time_base
|
47 |
+
framerate = src.average_rate
|
48 |
+
|
49 |
+
for i in sample(src.frames, self.frame_limit):
|
50 |
+
n = round((i / framerate) / time_base)
|
51 |
+
container.seek(n, backward=True, stream=src)
|
52 |
+
frame = next(container.decode(video=0))
|
53 |
+
im = frame.to_image()
|
54 |
+
frames.append(im)
|
55 |
+
return frames
|
56 |
+
|
57 |
def decode(self, video_path: str) -> List[Image.Image]:
|
58 |
frames = []
|
59 |
container = av.open(video_path)
|
|
|
86 |
|
87 |
return concatenated_image
|
88 |
|
89 |
+
def grid_concatenate(self, frames: List[Image.Image], group_size, limit=10) -> List[Image.Image]:
|
90 |
+
sampled_groups = grid_sample(frames, group_size, limit)
|
|
|
|
|
|
|
|
|
|
|
91 |
return [self.concatenate(group) for group in sampled_groups]
|
92 |
|
93 |
def to_base64_list(self, images: List[Image.Image]) -> List[str]:
|