chenjoya committed on
Commit
4cc5cdc
1 Parent(s): eed2301

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +94 -0
app.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, torchvision, transformers
2
+ torchvision.set_video_backend('video_reader')
3
+ from functools import partial
4
+ import gradio as gr
5
+
6
+ from data.utils import ffmpeg_once
7
+
8
+ from .inference import LiveInfer
9
+ logger = transformers.logging.get_logger('liveinfer')
10
+
11
+ # python -m demo.app --resume_from_checkpoint ...
12
+
13
+ liveinfer = LiveInfer()
14
+
# Page styling: center the title and cap the video / chatbot panel heights.
css = """
#gr_title {text-align: center;}
#gr_video {max-height: 480px;}
#gr_chatbot {max-height: 480px;}
"""

# Client-side hook attached to gr_video_time.change: returns the video value
# unchanged plus the <video> element's current playback position in seconds.
get_gr_video_current_time = """async (video, _) => {
    const videoEl = document.querySelector("#gr_video video");
    return [video, videoEl.currentTime];
}"""
# Build the Gradio UI: video player + examples on the left, streaming chat on
# the right, plus two hidden Number components that drive the polling loop.
with gr.Blocks(title="VideoLLM-online", css=css) as demo:
    gr.Markdown("# VideoLLM-online: Online Video Large Language Model for Streaming Video", elem_id='gr_title')
    with gr.Row():
        with gr.Column():
            gr_video = gr.Video(label="video stream", elem_id="gr_video", visible=True, sources=['upload'], autoplay=True)
            gr_examples = gr.Examples(
                examples=[["demo/assets/cooking.mp4"], ["demo/assets/bicycle.mp4"], ["demo/assets/egoexo4d.mp4"]],
                inputs=gr_video,
                outputs=gr_video,
                label="Examples"
            )
            gr.Markdown("## Tips:")
            gr.Markdown("- When you upload/click a video, the model starts processing the video stream. You can input a query before or after that, at any point during the video as you like.")
            gr.Markdown("- **Gradio refreshes the chatbot box to update the answer, which will delay the program. If you want to enjoy faster demo as we show in teaser video, please use https://github.com/showlab/videollm-online/blob/main/demo/cli.py.**")
            gr.Markdown("- This work is primarily done at a university, and our resources are limited. Our model is trained with limited data, so it may not solve very complicated questions. However, we have seen the potential of 'learning in streaming'. We are working on new data method to scale streaming dialogue data to our next model.")

        with gr.Column():
            # User text queries are streamed straight into the inference engine.
            gr_chat_interface = gr.ChatInterface(
                fn=liveinfer.input_query_stream,
                chatbot=gr.Chatbot(
                    elem_id="gr_chatbot",
                    label='chatbot',
                    avatar_images=('demo/user_avatar.png', 'demo/assistant_avatar.png'),
                    render=False
                ),
                examples=['Please narrate the video in real time.', 'Please describe what I am doing.', 'Could you summarize what have been done?', 'Hi, guide me the next step.'],
            )

            def gr_frame_token_interval_threshold_change(frame_token_interval_threshold):
                # Propagate the slider value to the shared inference engine.
                liveinfer.frame_token_interval_threshold = frame_token_interval_threshold
            gr_frame_token_interval_threshold = gr.Slider(minimum=0, maximum=1, step=0.05, value=liveinfer.frame_token_interval_threshold, interactive=True, label="Streaming Threshold")
            gr_frame_token_interval_threshold.change(gr_frame_token_interval_threshold_change, inputs=[gr_frame_token_interval_threshold])

    # Hidden state: current playback time, and a boolean "gate" whose every
    # flip re-triggers the response-queue refresher below.
    gr_video_time = gr.Number(value=0, visible=False)
    gr_liveinfer_queue_refresher = gr.Number(value=False, visible=False)

    def gr_video_change(src_video_path, history, video_time, gate):
        """On video upload/selection: transcode once into the demo cache at the
        model's fps/resolution, load it into the engine, and emit the first turn.

        Returns the updated chat history, the advanced video time, and the
        toggled gate (which kicks off the streaming refresher).
        """
        # FIX: key the cache on the basename. Gradio uploads arrive as
        # absolute temp paths, and os.path.join discards 'demo/assets/cache'
        # entirely when the second component is absolute.
        name, ext = os.path.splitext(os.path.basename(src_video_path))
        ffmpeg_video_path = os.path.join('demo/assets/cache', name + f'_{liveinfer.frame_fps}fps_{liveinfer.frame_resolution}' + ext)
        if not os.path.exists(ffmpeg_video_path):
            os.makedirs(os.path.dirname(ffmpeg_video_path), exist_ok=True)
            ffmpeg_once(src_video_path, ffmpeg_video_path, fps=liveinfer.frame_fps, resolution=liveinfer.frame_resolution)
            logger.warning(f'{src_video_path} -> {ffmpeg_video_path}, {liveinfer.frame_fps} FPS, {liveinfer.frame_resolution} Resolution')

        liveinfer.load_video(ffmpeg_video_path)
        liveinfer.input_video_stream(0)
        query, response = liveinfer()
        if query or response:
            # FIX: append a mutable list, not a tuple —
            # gr_liveinfer_queue_refresher_change mutates history[-1][1]
            # in place, which raises TypeError on a tuple.
            history.append([query, response])
        return history, video_time + 1 / liveinfer.frame_fps, not gate
    gr_video.change(
        gr_video_change, inputs=[gr_video, gr_chat_interface.chatbot, gr_video_time, gr_liveinfer_queue_refresher],
        outputs=[gr_chat_interface.chatbot, gr_video_time, gr_liveinfer_queue_refresher]
    )

    def gr_video_time_change(_, video_time):
        # Feed the client-reported playback position (from the JS hook) into
        # the engine so it can keep pace with the real video stream.
        liveinfer.input_video_stream(video_time)
        return video_time
    gr_video_time.change(gr_video_time_change, [gr_video, gr_video_time], [gr_video_time], js=get_gr_video_current_time)

    def gr_liveinfer_queue_refresher_change(history):
        """Generator: keep draining the engine's response queue forever,
        yielding the growing history so Gradio re-renders the chatbot.
        Runs until Gradio cancels the event (e.g. a new video is loaded).
        """
        while True:
            query, response = liveinfer()
            if query or response:
                # Assumes gr_video_change already appended the first turn, so
                # history is non-empty — TODO(review): confirm no path reaches
                # here with an empty history.
                history[-1][1] += f'\n{response}'
                yield history
    gr_liveinfer_queue_refresher.change(gr_liveinfer_queue_refresher_change, inputs=[gr_chat_interface.chatbot], outputs=[gr_chat_interface.chatbot])

demo.queue()
demo.launch(share=False)