Spaces:
Runtime error
Runtime error
Monius
committed on
Commit
·
dc465b0
1
Parent(s):
1416e63
v0.3 by <M0n-ius>
Browse files- constraint.py +77 -4
- run.py +17 -14
- test.py +34 -0
- utils/__init__.py +5 -1
- utils/anthropic.py +17 -0
- utils/azure.py +13 -33
- utils/base.py +47 -0
- utils/google.py +16 -0
- utils/openai.py +10 -0
- utils/video.py +46 -9
constraint.py
CHANGED
@@ -1,9 +1,82 @@
|
|
1 |
-
SYS_PROMPT = ""
|
2 |
-
"""
|
3 |
|
4 |
-
USER_PROMPT = "
|
|
|
5 |
|
6 |
SKIP = 2
|
7 |
TEMP = 0.3
|
8 |
TOP = 0.75
|
9 |
-
MAX_TOKEN = 512
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# System message sent to the captioning model (empty by default).
SYS_PROMPT = ""

# User instruction asking the model for a detailed video description.
USER_PROMPT = """Create a detailed and accurate video description, starting from a specific scene and possibly transitioning through various themes and settings. Begin by describing the initial scene in detail, including the environment, key objects, any characters or creatures and their actions, and the overall atmosphere, considering specific aspects such as shot sizes (extreme close-up, close-up, medium, full, wide, etc.), camera movements (push, pull, shake, pan, tilt, rise, descend, etc.), and more. For example, if the scene involves a person like a young man sitting on a chair reading a book, describe his appearance and the surrounding environment, including basic features such as the character's gender, age, race, etc., as well as actions, emotions, dialogues, and performance content. If the scene includes animals or natural elements such as cats, the sky, or landscapes, vividly describe these elements and their behaviors or states, and consider the emotions and thematic elements introduced in this opening scene. Then, as the video progresses, describe the evolving visual effects, how they present a more vivid and rich picture through camera movements and special effects, considering aesthetics (style, tone, color palette, atmosphere, emotions, etc.). If the scene changes, explain how it transitions, what new elements are introduced, whether the atmosphere remains consistent or changes, and how this affects the overall narrative or theme of the video. If the video contains multiple scenes, describe the connections between them, whether creating a story, presenting a contrast, or highlighting different aspects of a theme, considering scenes (day, night, indoor, outdoor, etc.), props (relationship with characters and scenes, relationship with camera and scheduling), and scene scheduling (single character, multiple characters with camera and narrative association, and how they relate to scene props). 
Finally, conclude with a summary that encapsulates the essence of the video, combining all the described elements into a cohesive narrative or message, emphasizing the sensory and emotional experience provided by the video, and speculating on the impact or message intended for the audience, allowing viewers to engage in profound reflection and insight during the viewing process, thus achieving a deeper impact. The generated description should adhere to English grammar and be no less than 120 words in length.
"""

# Default sampling / frame parameters.
SKIP = 2
TEMP = 0.3
TOP = 0.75
MAX_TOKEN = 512

# Provider name -> name of the class implementing that backend.
API_CLASSES = {
    'Azure': 'AzureAPI',
    'Google': 'GoogleAPI',
    'Anthropic': 'AnthropicAPI',
    'OpenAI': 'OpenAIAPI'
}

# Per-provider UI metadata: selectable models plus labels for the key/endpoint inputs.
PROVIDERS_CONFIG = {
    'Azure': {
        'model': ['GPT-4o', 'GPT-4v'],
        'key_label': 'Azure API Key',
        'endpoint_label': 'Azure Endpoint'
    },
    'Google': {
        'model': ['Gemini-1.5-Flash', 'Gemini-1.5-Pro'],
        'key_label': 'Google API Key',
        'endpoint_label': 'Google API Endpoint'
    },
    'Anthropic': {
        'model': ['Claude-3-Opus', 'Claude-3-Sonnet'],
        'key_label': 'Anthropic API Key',
        'endpoint_label': 'Anthropic Endpoint'
    },
    'OpenAI': {
        'model': ['GPT-4o', 'GPT-4v'],
        'key_label': 'OpenAI API Key',
        'endpoint_label': 'OpenAI Endpoint'
    }
}

# Widget settings (sliders/dropdowns) shared by every provider tab.
GENERAL_CONFIG = {
    'temp': {'label': 'Temperature', 'default': 0.3, 'min': 0, 'max': 1, 'step': 0.1},
    'top_p': {'label': 'Top-P', 'default': 0.75, 'min': 0, 'max': 1, 'step': 0.1},
    'max_tokens': {'label': 'Max Tokens', 'default': 4096, 'min': 512, 'max': 4096, 'step': 1},
    'frame_format': {'label': 'Frame Format', 'default': 'JPEG', 'choices': ['JPEG', 'PNG']},
    'frame_skip': {'label': 'Frame Skip', 'default': 2, 'min': 2, 'max': 100, 'step': 1},
    'group_size': {'label': 'Group Size', 'default': 10, 'min': 1, 'max': 100, 'step': 1}
}
|
run.py
CHANGED
@@ -1,20 +1,20 @@
|
|
1 |
# app.py
|
2 |
import gradio as gr
|
3 |
-
from utils import VideoProcessor, AzureAPI
|
4 |
from constraint import SYS_PROMPT, USER_PROMPT
|
5 |
|
6 |
-
def
|
7 |
-
processor = VideoProcessor(frame_format=frame_format,
|
8 |
-
frames = processor.
|
9 |
-
|
10 |
-
base64_list = processor.to_base64_list(
|
11 |
-
debug_image = processor.concatenate(
|
12 |
|
13 |
if not key or not endpoint:
|
14 |
return "", f"API key or endpoint is missing. Processed {len(frames)} frames.", debug_image
|
15 |
|
16 |
api = AzureAPI(key=key, endpoint=endpoint, model=model, temp=temp, top_p=top_p, max_tokens=max_tokens)
|
17 |
-
caption = api.get_caption(
|
18 |
return f"{caption}", f"Using model '{model}' with {len(frames)} frames extracted.", debug_image
|
19 |
|
20 |
with gr.Blocks() as Core:
|
@@ -27,12 +27,15 @@ with gr.Blocks() as Core:
|
|
27 |
with gr.Row():
|
28 |
temp = gr.Slider(0, 1, 0.3, step=0.1, label="Temperature")
|
29 |
top_p = gr.Slider(0, 1, 0.75, step=0.1, label="Top-P")
|
30 |
-
max_tokens = gr.Slider(512, 4096,
|
31 |
with gr.Row():
|
32 |
frame_format = gr.Dropdown(label="Frame Format", value="JPEG", choices=["JPEG", "PNG"], interactive=False)
|
33 |
-
|
34 |
-
|
35 |
-
|
|
|
|
|
|
|
36 |
with gr.Tabs():
|
37 |
with gr.Tab("Azure"):
|
38 |
result = gr.Textbox(label="Result", lines=15, max_lines=100, show_copy_button=True, interactive=False)
|
@@ -78,8 +81,8 @@ with gr.Blocks() as Core:
|
|
78 |
video_gd_auth = gr.Text(label="Google Drive Access Token")
|
79 |
caption_button = gr.Button("Caption", variant="primary", size="lg")
|
80 |
caption_button.click(
|
81 |
-
|
82 |
-
inputs=[
|
83 |
outputs=[result, info, frame]
|
84 |
)
|
85 |
|
|
|
1 |
# app.py
|
2 |
import gradio as gr
|
3 |
+
from utils import VideoProcessor, AzureAPI, GoogleAPI, AnthropicAPI, OpenAIAPI
|
4 |
from constraint import SYS_PROMPT, USER_PROMPT
|
5 |
|
6 |
+
def fast_caption(sys_prompt, usr_prompt, temp, top_p, max_tokens, model, key, endpoint, video, frame_format, frame_limit, provider='Azure'):
    """Extract frames from `video`, send them to a caption backend, and return
    (caption, status message, debug image).

    The `provider` parameter is new and defaults to 'Azure', preserving the
    old hard-wired behavior; all four backend classes are already imported at
    the top of this file, so callers may now select any of them.
    """
    processor = VideoProcessor(frame_format=frame_format, frame_limit=frame_limit)
    frames = processor._decode(video)

    base64_list = processor.to_base64_list(frames)
    debug_image = processor.concatenate(frames)

    # Bail out early (with the debug image still shown) when credentials are absent.
    if not key or not endpoint:
        return "", f"API key or endpoint is missing. Processed {len(frames)} frames.", debug_image

    # Dispatch on provider instead of hard-coding AzureAPI.
    backends = {
        'Azure': AzureAPI,
        'Google': GoogleAPI,
        'Anthropic': AnthropicAPI,
        'OpenAI': OpenAIAPI,
    }
    api = backends[provider](key=key, endpoint=endpoint, model=model, temp=temp, top_p=top_p, max_tokens=max_tokens)
    caption = api.get_caption(sys_prompt, usr_prompt, base64_list)
    return f"{caption}", f"Using model '{model}' with {len(frames)} frames extracted.", debug_image
|
19 |
|
20 |
with gr.Blocks() as Core:
|
|
|
27 |
with gr.Row():
|
28 |
temp = gr.Slider(0, 1, 0.3, step=0.1, label="Temperature")
|
29 |
top_p = gr.Slider(0, 1, 0.75, step=0.1, label="Top-P")
|
30 |
+
max_tokens = gr.Slider(512, 4096, 1024, step=1, label="Max Tokens")
|
31 |
with gr.Row():
|
32 |
frame_format = gr.Dropdown(label="Frame Format", value="JPEG", choices=["JPEG", "PNG"], interactive=False)
|
33 |
+
frame_limit = gr.Slider(1, 100, 10, step=1, label="Frame Limits")
|
34 |
+
with gr.Tabs():
|
35 |
+
with gr.Tab("User"):
|
36 |
+
usr_prompt = gr.Textbox(USER_PROMPT, label="User Prompt", lines=10, max_lines=100, show_copy_button=True)
|
37 |
+
with gr.Tab("System"):
|
38 |
+
sys_prompt = gr.Textbox(SYS_PROMPT, label="System Prompt", lines=10, max_lines=100, show_copy_button=True)
|
39 |
with gr.Tabs():
|
40 |
with gr.Tab("Azure"):
|
41 |
result = gr.Textbox(label="Result", lines=15, max_lines=100, show_copy_button=True, interactive=False)
|
|
|
81 |
video_gd_auth = gr.Text(label="Google Drive Access Token")
|
82 |
caption_button = gr.Button("Caption", variant="primary", size="lg")
|
83 |
caption_button.click(
|
84 |
+
fast_caption,
|
85 |
+
inputs=[sys_prompt, usr_prompt, temp, top_p, max_tokens, model, key, endpoint, video_src, frame_format, frame_limit],
|
86 |
outputs=[result, info, frame]
|
87 |
)
|
88 |
|
test.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import av
|
2 |
+
import time
|
3 |
+
|
4 |
+
def sample(N, K):
    """Pick K roughly evenly spaced indices from range(N), always keeping the last one."""
    indices = list(range(N))
    n = len(indices)
    # Degenerate cases: fewer elements than samples requested, or trivially small K.
    if K >= n or K < 2:
        return indices

    step = n // K
    picked = indices[::step][:K - 1]
    picked.append(indices[-1])
    return picked
|
14 |
+
|
15 |
+
# NOTE(review): path is hardcoded to one developer's machine — override when running.
DEFAULT_VIDEO = '/Users/monius/Documents/YueZhang/Video/Sora/1.mp4'


def time_sampled_decode(video_path=DEFAULT_VIDEO, k=10):
    """Decode k evenly spaced frames from `video_path` and time the whole operation.

    Returns (frames, elapsed_seconds). Previously this ran unconditionally at
    import time; it is now a function behind a __main__ guard so importing the
    module has no side effects.
    """
    start = time.time()
    frames = []
    with av.open(video_path) as container:
        src = container.streams.video[0]
        time_base = src.time_base
        framerate = src.average_rate

        for i in sample(src.frames, k):
            # Frame index -> stream timestamp; seek lands on the keyframe at or
            # before that timestamp, then decode the next frame.
            n = round((i / framerate) / time_base)
            container.seek(n, backward=True, stream=src)
            frame = next(container.decode(video=0))
            frames.append(frame.to_image())
    return frames, time.time() - start


if __name__ == '__main__':
    _, elapsed = time_sampled_decode()
    print(elapsed)
|
utils/__init__.py
CHANGED
@@ -1,2 +1,6 @@
|
|
1 |
from .azure import AzureAPI
|
2 |
-
from .
|
|
|
|
|
|
|
|
|
|
1 |
from .azure import AzureAPI
|
2 |
+
from .google import GoogleAPI
|
3 |
+
from .anthropic import AnthropicAPI
|
4 |
+
from .openai import OpenAIAPI
|
5 |
+
|
6 |
+
from .video import VideoProcessor
|
utils/anthropic.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .base import BaseAPI
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
class AnthropicAPI(BaseAPI):
    """Caption backend targeting Anthropic's Messages API (/v1/messages)."""

    def get_headers(self):
        # Anthropic authenticates with `x-api-key`, and every request to the
        # Messages API must also carry an `anthropic-version` header.
        return {
            'Content-Type': 'application/json',
            'x-api-key': self.key,
            'anthropic-version': '2023-06-01'
        }

    def get_caption(self, prompt: str, user_prompt: str, images_base64: List[str]) -> str:
        # NOTE(review): BaseAPI builds an OpenAI-style `messages` payload with
        # `image_url` parts; Anthropic expects its own content-block format —
        # confirm the payload is translated upstream.
        url_suffix = '/v1/messages'
        return super().get_caption(prompt, user_prompt, images_base64, url_suffix)

    def parse_response(self, response: dict) -> str:
        # /v1/messages returns a list of content blocks; the previous
        # `response['completion']` key only exists on the legacy /v1/complete
        # endpoint and would raise KeyError here.
        return response['content'][0]['text']
|
17 |
+
|
utils/azure.py
CHANGED
@@ -1,36 +1,16 @@
|
|
1 |
-
|
2 |
-
import requests
|
3 |
from typing import List
|
4 |
|
5 |
-
class AzureAPI:
|
6 |
-
def
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
self.temp = temp
|
11 |
-
self.top_p = top_p
|
12 |
-
self.max_tokens = max_tokens
|
13 |
-
self.version = "2024-02-15-preview"
|
14 |
-
|
15 |
-
def get_caption(self, prompt: str, user_prompt: str, images_base64: List[str]) -> str:
|
16 |
-
headers = {
|
17 |
-
'content-type': 'application/json',
|
18 |
-
"api-key": self.key,
|
19 |
-
}
|
20 |
-
system_msg = {"role": "system", "content": prompt}
|
21 |
-
user_msg = [{"type": "text", "text": user_prompt}]
|
22 |
-
img_msg = [
|
23 |
-
{"type": "image_url", "image_url": {"url": f'data:image/jpg;base64,{image}'}}
|
24 |
-
for image in images_base64
|
25 |
-
]
|
26 |
-
payload = {
|
27 |
-
'messages': [system_msg, {"role": "user", "content": user_msg + img_msg}],
|
28 |
-
'temperature': self.temp,
|
29 |
-
'top_p': self.top_p,
|
30 |
-
'max_tokens': self.max_tokens,
|
31 |
-
'model': self.model
|
32 |
}
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
return
|
|
|
|
|
|
|
|
1 |
+
from .base import BaseAPI
|
|
|
2 |
from typing import List
|
3 |
|
4 |
+
class AzureAPI(BaseAPI):
    """Caption backend for Azure OpenAI chat-completions deployments."""

    # Pinned Azure OpenAI REST API version.
    _API_VERSION = '2024-02-15-preview'

    def get_headers(self):
        # Azure authenticates with an `api-key` header rather than a Bearer token.
        return {
            'Content-Type': 'application/json',
            'api-key': self.key
        }

    def get_caption(self, sys_prompt: str, user_prompt: str, images_base64: List[str]) -> str:
        # Azure routes requests per-deployment; the deployment name is the model.
        suffix = (
            f'/openai/deployments/{self.model}'
            f'/chat/completions?api-version={self._API_VERSION}'
        )
        return super().get_caption(sys_prompt, user_prompt, images_base64, suffix)

    def parse_response(self, response: dict) -> str:
        # Standard chat-completions response shape.
        return response['choices'][0]['message']['content']
|
utils/base.py
ADDED
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
class BaseAPI:
    """Shared plumbing for the vision-caption REST backends.

    Subclasses override get_headers()/parse_response() (and usually wrap
    get_caption() with a provider-specific URL suffix).
    """

    def __init__(self, key: str, endpoint: str, model: str, temp: float = 0.3,
                 top_p: float = 0.75, max_tokens: int = 1024, timeout=None):
        """Store credentials, target model, and sampling parameters.

        Args:
            key: API key/token for the provider.
            endpoint: Base URL; the per-call URL suffix is appended to it.
            model: Model (or deployment) identifier.
            temp: Sampling temperature.
            top_p: Nucleus-sampling cutoff.
            max_tokens: Response token budget.
            timeout: Seconds before requests.post gives up. New, optional;
                the default None preserves the old wait-forever behavior,
                but callers are encouraged to pass a bound.
        """
        self.key = key
        self.endpoint = endpoint
        self.model = model
        self.temp = temp
        self.top_p = top_p
        self.max_tokens = max_tokens
        self.timeout = timeout

    def get_headers(self):
        """Default OpenAI-style bearer auth; providers override as needed."""
        return {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {self.key}'
        }

    def get_payload(self, sys_prompt: str, user_prompt: str, images_base64: List[str]) -> dict:
        """Build an OpenAI-chat-format payload: system message, then a user
        message containing the prompt text followed by the base64 images."""
        _txt = [{"type": "text", "text": user_prompt}]
        _img = [
            {"type": "image_url", "image_url": {"url": f'data:image/jpg;base64,{image}'}}
            for image in images_base64
        ]

        sys_msg = {"role": "system", "content": sys_prompt}
        usr_msg = {"role": "user", "content": _txt + _img}

        return {
            'messages': [sys_msg, usr_msg],
            'temperature': self.temp,
            'top_p': self.top_p,
            'max_tokens': self.max_tokens,
            'model': self.model
        }

    def get_caption(self, sys_prompt: str, user_prompt: str, images_base64: List[str], url_suffix: str) -> str:
        """POST the payload to endpoint+url_suffix and return the parsed caption.

        Raises requests.HTTPError on a non-2xx response.
        """
        headers = self.get_headers()
        payload = self.get_payload(sys_prompt, user_prompt, images_base64)
        url = f'{self.endpoint}{url_suffix}'

        # A missing timeout makes requests wait forever; self.timeout (default
        # None for backward compatibility) lets callers bound the call.
        response = requests.post(url, headers=headers, json=payload, timeout=self.timeout)
        response.raise_for_status()
        return self.parse_response(response.json())

    def parse_response(self, response: dict) -> str:
        """Extract the caption text from a provider response; must be overridden."""
        raise NotImplementedError("Subclasses should implement this method")
|
utils/google.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .base import BaseAPI
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
class GoogleAPI(BaseAPI):
    """Caption backend targeting Google's generateContent endpoint."""

    def get_headers(self):
        # NOTE(review): both a Bearer header (here) and a ?key= query parameter
        # (in get_caption) are sent; Gemini needs only one auth mechanism —
        # confirm which one is intended and drop the other.
        return {
            'Authorization': f'Bearer {self.key}'
        }

    def get_caption(self, prompt: str, user_prompt: str, images_base64: List[str]) -> str:
        # NOTE(review): BaseAPI builds an OpenAI-style payload; generateContent
        # expects a `contents`/`parts` body — confirm translation upstream.
        url_suffix = f'/v1/models/{self.model}:generateContent?key={self.key}'
        return super().get_caption(prompt, user_prompt, images_base64, url_suffix)

    def parse_response(self, response: dict) -> str:
        # generateContent responses carry text under candidates[].content.parts[].text;
        # there is no top-level 'predictions' key (that belongs to the Vertex
        # predict API), so the old lookup raised KeyError.
        return response['candidates'][0]['content']['parts'][0]['text']
|
16 |
+
|
utils/openai.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .base import BaseAPI
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
class OpenAIAPI(BaseAPI):
    """Caption backend for the OpenAI Chat Completions API."""

    def get_caption(self, prompt: str, user_prompt: str, images_base64: List[str]) -> str:
        # BaseAPI builds a chat-format payload ('messages' with image parts).
        # The legacy /v1/completions endpoint does not accept that body; the
        # chat completions endpoint is the correct target for it.
        url_suffix = '/v1/chat/completions'
        return super().get_caption(prompt, user_prompt, images_base64, url_suffix)

    def parse_response(self, response: dict) -> str:
        # Chat responses nest the text under choices[].message.content, not a
        # top-level choices[].text (which is the legacy completions shape).
        return response['choices'][0]['message']['content']
|
utils/video.py
CHANGED
@@ -6,12 +6,54 @@ from PIL import Image
|
|
6 |
from typing import List
|
7 |
from dataclasses import dataclass
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
@dataclass
|
10 |
class VideoProcessor:
|
11 |
frame_format: str = "JPEG"
|
12 |
-
|
13 |
-
group_size: int = 10
|
14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
def decode(self, video_path: str) -> List[Image.Image]:
|
16 |
frames = []
|
17 |
container = av.open(video_path)
|
@@ -44,13 +86,8 @@ class VideoProcessor:
|
|
44 |
|
45 |
return concatenated_image
|
46 |
|
47 |
-
def
|
48 |
-
|
49 |
-
groups = [frames[i:i + xs] for i in range(0, len(frames), xs)]
|
50 |
-
sampled_groups = []
|
51 |
-
for group in groups:
|
52 |
-
interval = max(1, len(group) // limit)
|
53 |
-
sampled_groups.append([group[i] for i in range(0, len(group), interval)])
|
54 |
return [self.concatenate(group) for group in sampled_groups]
|
55 |
|
56 |
def to_base64_list(self, images: List[Image.Image]) -> List[str]:
|
|
|
6 |
from typing import List
|
7 |
from dataclasses import dataclass
|
8 |
|
9 |
+
def sample(N, K):
    """Return K near-evenly spaced indices out of range(N), last index always included."""
    idx = list(range(N))
    if K >= len(idx) or K < 2:
        # Nothing to thin out (or trivial K): return every index.
        return idx

    stride = len(idx) // K
    chosen = [idx[pos] for pos in range(0, len(idx), stride)][:K - 1]
    chosen.append(idx[-1])
    return chosen
|
19 |
+
|
20 |
+
def grid_sample(array, N, K):
    """Split `array` into K contiguous groups (sizes differing by at most one)
    and keep at most N evenly spaced items from each group."""
    base, extra = divmod(len(array), K)
    out = []
    start = 0

    for g in range(K):
        # The first `extra` groups absorb one leftover element each.
        size = base + (1 if g < extra else 0)
        group = array[start:start + size]
        start += size

        if N >= len(group):
            out.append(group)
        else:
            step = len(group) // N
            out.append([group[j * step] for j in range(N)])

    return out
|
36 |
+
|
37 |
@dataclass
|
38 |
class VideoProcessor:
|
39 |
frame_format: str = "JPEG"
|
40 |
+
frame_limit: int = 10
|
|
|
41 |
|
42 |
+
def _decode(self, video_path: str) -> List[Image.Image]:
|
43 |
+
frames = []
|
44 |
+
with av.open(video_path) as container:
|
45 |
+
src = container.streams.video[0]
|
46 |
+
time_base = src.time_base
|
47 |
+
framerate = src.average_rate
|
48 |
+
|
49 |
+
for i in sample(src.frames, self.frame_limit):
|
50 |
+
n = round((i / framerate) / time_base)
|
51 |
+
container.seek(n, backward=True, stream=src)
|
52 |
+
frame = next(container.decode(video=0))
|
53 |
+
im = frame.to_image()
|
54 |
+
frames.append(im)
|
55 |
+
return frames
|
56 |
+
|
57 |
def decode(self, video_path: str) -> List[Image.Image]:
|
58 |
frames = []
|
59 |
container = av.open(video_path)
|
|
|
86 |
|
87 |
return concatenated_image
|
88 |
|
89 |
+
def grid_concatenate(self, frames: List[Image.Image], group_size, limit=10) -> List[Image.Image]:
|
90 |
+
sampled_groups = grid_sample(frames, group_size, limit)
|
|
|
|
|
|
|
|
|
|
|
91 |
return [self.concatenate(group) for group in sampled_groups]
|
92 |
|
93 |
def to_base64_list(self, images: List[Image.Image]) -> List[str]:
|