File size: 6,287 Bytes
0a54cb2
2ec5927
 
 
 
 
 
 
e5d21c3
 
24aba9b
2ec5927
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342d9ff
2ec5927
9056339
 
2ec5927
 
 
342d9ff
2ec5927
 
 
 
 
 
 
 
529525e
b5ea36b
2ec5927
 
 
 
 
 
 
 
529525e
2ec5927
5dc1a2c
2ec5927
 
 
 
 
 
529525e
2ec5927
529525e
c5185c8
2ec5927
 
24aba9b
 
2ec5927
 
 
9056339
 
24aba9b
0fb50a1
 
 
 
 
 
 
 
24aba9b
0fb50a1
 
 
2ec5927
529525e
2ec5927
6dcb124
24aba9b
6dcb124
9056339
 
f1dea0c
 
 
 
 
9056339
 
b5ea36b
9056339
 
 
f1dea0c
9056339
 
 
 
 
2ec5927
 
 
 
 
 
 
 
 
 
 
 
9056339
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import gradio as gr
import torch
import math
import cv2
import os
import sys
import FFV1MT_MS
import flow_tools
import spaces

@spaces.GPU # for dynamic GPU resource
def process_images(videos, x, y):
    """Run the model on the first frames of a video and return its visualizations.

    Relies on the module-level ``model`` and ``device`` that are set up in the
    ``__main__`` section of this file.

    Args:
        videos: Path of the video file; it is handed to ``cv2.VideoCapture``.
        x: X location of the attention visualizer, forwarded unchanged to
            ``model.forward_viz``.
        y: Y location of the attention visualizer, forwarded unchanged to
            ``model.forward_viz``.

    Returns:
        ``[flow, activation, attention]``, taken from the ``'flow'``,
        ``'activation'`` and ``'attention'`` entries of the result of
        ``model.forward_viz``.

    Raises:
        gr.Error: If fewer than 11 frames can be read from the video (this
            also covers a file that cannot be opened at all).
    """
    num_frames = 11  # only the first 11 frames are fed to the model
    max_side = 768   # frames whose longer side exceeds this are downscaled

    # Read only as many frames as are needed and always release the capture
    # handle, even if decoding fails part-way through.
    cap = cv2.VideoCapture(videos)
    frames = []
    try:
        while len(frames) < num_frames:
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(frame)
    finally:
        cap.release()

    if len(frames) < num_frames:
        print('video is too short')
        # The interface expects three outputs, so report the problem to the
        # user explicitly instead of returning nothing.
        raise gr.Error(
            'Video is too short: at least {} frames are required.'.format(num_frames)
        )

    # Convert each frame to a float tensor scaled to [0, 1]: axes are
    # reordered with permute(2, 0, 1) and a leading axis is added
    # (presumably HxWxC -> 1xCxHxW; confirm against the model's input spec).
    images = [
        torch.from_numpy(img).permute(2, 0, 1).float().to(device).unsqueeze(0) / 255.0
        for img in frames
    ]

    # If the longer spatial side exceeds 768, shrink every frame so that side
    # becomes 768 while keeping the aspect ratio.
    max_size = max(images[0].shape[2], images[0].shape[3])
    if max_size > max_side:
        ratio = max_side / max_size
        images = [
            torch.nn.functional.interpolate(
                img, scale_factor=ratio, mode='bicubic', align_corners=True
            )
            for img in images
        ]

    result = model.forward_viz(images, layer=7, x=x, y=y)
    flow = result['flow']
    attention = result['attention']
    activation = result['activation']

    return [flow, activation, attention]


# Heading shown at the top of the Gradio interface (passed as ``title=``).
title = "Modelling Human Visual Motion Processing with Trainable Motion Energy Sensing and a Self-attention Network 🤗 "
# Markdown introduction rendered by the Gradio interface (passed as
# ``description=``). Every bullet of the requirements list must end with a
# newline, otherwise adjacent bullets are fused into one line.
description = (
    "## Introduction 🔥🔥🔥\n"
    " The intersection of cognitive neuroscience and computer vision offers exciting advancements in "
    "how machines perceive motion. Our research bridges the gap between these fields by proposing a novel "
    "image-computable model that aligns with human motion perception mechanisms. By integrating trainable"
    " motion energy sensing with recurrent self-attention networks, we can simulate the complex motion "
    "processing of the human visual cortex, particularly the V1-MT pathway. Our model not only parallels"
    " physiological responses in V1 and MT neurons but also replicates human psychophysical responses "
    "to dynamic stimuli. \n\n\n"
    "![](https://drive.google.com/uc?id=10PcKzQ9X1nsXKUi8OPR0jN_ZsjlCAV47) \n"
    "## Environment Configuration 🐡 \n"
    "To run our model, the basic environment configuration is required:\n"
    "- gradio == 4.7.1\n"
    "- Python 3.8 or higher \n"
    "- PyTorch 2.0 \n"
    "- CUDA Toolkit 11.x (for GPU acceleration)\n"
    "- opencv-python \n"
    "- Imageio \n"
    "- Matplotlib \n\n"
    "## Preprint Paper 📝 \n"
    "The paper is available at [arXiv](https://arxiv.org/abs/2305.09156) \n"
    "## Video Presentation 📹 \n"
    "The video presentation is available at [Video Record](https://recorder-v3.slideslive.com/?share=85662&s=6afe157c-e764-4e3c-9302-2c6dd6887db1). \n"
    "## Conference Website \n"
    "The project is presented at [NeurIPS 2023](https://neurips.cc/virtual/2023/poster/70202). \n"
    "## Below is the interactive demo of our model.  You can select the video examples below or upload your own videos. "
    "The model outputs the motion flow field, the activation of the first stage, and the attention map of the second stage. "
    "We also provide two sliders to adjust the location of the attention visualizer. \n"
    " **Note**: The demo is running on CPU, so it may take a while to process the video.  \n"
)

# Rows offered as ready-made inputs in the interface (passed as ``examples=``).
# Each row lines up with the interface inputs: video file, X slider, Y slider.
examples = [
    ["example_1.mp4", 62, 56],
    ["example_2.mp4", 59, 55],
    ["example_3.mp4", 50, 50],
    ["example_4.mp4", 50, 50],
    ["example_5.mp4", 39, 72],
]

# Markdown footer rendered below the demo (passed as ``article=``): citation,
# author contact, license and lab address.
md = (
    "## Citation \n"
    "If you do think this work helps your research, please cite our work as:\n"
    "```\n"
    "@inproceedings{ \n"
    "sun2023modeling,\n"
    "title={Modeling Human Visual Motion Processing with Trainable Motion Energy Sensing and a Self-attention Network},\n"
    "author={Zitang Sun and Yen-Ju Chen and Yung-Hao Yang and Shin'ya Nishida},\n"
    "booktitle={Thirty-seventh Conference on Neural Information Processing Systems},\n"
    "year={2023},\n"
    "url={https://openreview.net/forum?id=tRKimbAk5D}\n"
    "}\n"
    "```\n"
    "## Author \n"
    "This project page is developed by Zitang Sun 📧 (zitangsun96 @ gmail.com)\n"
    "## LICENSE \n"
    "This project is licensed under the terms of the MIT license. \n"
    "## Address 🏡 \n"
    "[Cognitive Informatics Lab](http://www.cog.ist.i.kyoto-u.ac.jp/en/index.html), Graduate School of Informatics, Kyoto University, Japan \n"
)

if __name__ == '__main__':
    # Build the network and choose the device. `process_images` reads both
    # `model` and `device` as module-level globals, so they must be bound
    # here before the interface starts serving requests.
    model = FFV1MT_MS.FFV1DNN()
    device = "cuda" if torch.cuda.is_available() else "cpu"

    print(f'Number fo parameters: {model.num_parameters()}')
    model.to(device)

    # The checkpoint is deserialized onto the CPU; only its 'state_dict'
    # entry is used, and it must match the model's parameters exactly.
    checkpoint = torch.load('Model_example.pth.tar', map_location="cpu")
    model_dict = checkpoint['state_dict']
    model.load_state_dict(model_dict, strict=True)
    model.eval()

    # Inputs: one video plus the two sliders handed to `process_images`.
    demo_inputs = [
        gr.Video(label="Upload video or use the example images below"),
        gr.Slider(0, 100, label='X location of attention visualizer'),
        gr.Slider(0, 100, label='Y location of attention visualizer'),
    ]
    # Outputs: three images, in the order `process_images` returns them.
    demo_outputs = [
        gr.Image(type="numpy", label="Motion flow field"),
        gr.Image(type="numpy", label="Activation of Stage I"),
        gr.Image(type="numpy", label="Attention map of Stage II"),
    ]

    iface = gr.Interface(
        fn=process_images,
        inputs=demo_inputs,
        outputs=demo_outputs,
        title=title,
        description=description,
        article=md,
        examples=examples,
    )

    iface.launch(debug=True)