xhwang00 commited on
Commit
5f72936
·
verified ·
1 Parent(s): 374db79

Upload 4 files

Browse files
.gitattributes CHANGED
@@ -37,3 +37,5 @@ random_10_sample_of_vidstg/random_10_sample_of_test.json filter=lfs diff=lfs mer
37
  random_10_sample_of_vidstg/random_10_sample_of_train.json filter=lfs diff=lfs merge=lfs -text
38
  random_10_sample_of_vidstg/random_10_sample_of_val.json filter=lfs diff=lfs merge=lfs -text
39
  Processed_Dataset/vidstg_train_processed.json filter=lfs diff=lfs merge=lfs -text
 
 
 
37
  random_10_sample_of_vidstg/random_10_sample_of_train.json filter=lfs diff=lfs merge=lfs -text
38
  random_10_sample_of_vidstg/random_10_sample_of_val.json filter=lfs diff=lfs merge=lfs -text
39
  Processed_Dataset/vidstg_train_processed.json filter=lfs diff=lfs merge=lfs -text
40
+ xitong/train_annotations.json filter=lfs diff=lfs merge=lfs -text
41
+ xitong/vidstg_train_processed.json filter=lfs diff=lfs merge=lfs -text
xitong/train_annotations.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a8d13641e9445ccbd5cfc97a3a9b82585460d529ad748de95305c5b4c97c775
3
+ size 29011250
xitong/vidstg_train_processed.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cf502f7ee5ab0c1c9502d6b992b41c4e4a62f7ba36e0e935544bd64edcb5b38
3
+ size 14731937
xitong/xl_main.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ import openai
4
+ import os
5
+ from openai import OpenAI
6
+
7
+
8
+
9
# Prompt template sent to the chat model for each VidSTG sample.
# Filled with str.format using 7 positional values, in order:
#   (question, description/answer, target, start_bbox, start_frame, end_bbox, end_frame)
# NOTE: the literal braces in the JSON example are doubled ({{ }}) so that
# .format() leaves them intact and only substitutes the bare {} slots.
prompt = '''
A Referential Dialogue question-and-answer pair is like this:

"conversations": [
{{
"from": "human",
"value": "Is there an event "a man in sunglasses hugging a woman in a black and white blouse" in the video?"
}},
{{
"from": "gpt",
"value": "In the image, there is a man [100.0, 116.16000000000001, 477.0, 437.8] and a woman in the video. The man [100.0, 116.16000000000001, 477.0, 437.8] has his arm around the woman , which suggests that he is hugging her. Given the appearance of these individuals and their positioning, the event "a man in sunglasses hugging a woman in a black and white blouse" is present in the video. Therefore, the answer is yes."
}}
]

Vidstg is a spatio-temporal video grounding dataset. Each video is annotated with a question-and-answer pair, and a series of bounding boxes that contain the trajectories of the corresponding target object.

Generate a referential dialogue question-answer pair from a video's question "{}" and the description "{}"

In this case, the target object is "{}".

The target object is initially at {} in frame {} and stops appearing at position {} in frame {}. Mention the position and frame of the target person in the answer.

Generate the JSON only.
'''
33
+
34
+
35
+
36
def main():
    """Generate Referential Dialogue QA pairs for the first 100 VidSTG samples.

    Reads the processed VidSTG samples and the per-video annotations,
    prompts the OpenAI chat model once per sample using the module-level
    ``prompt`` template, attaches a ``meta`` record (vid, frame_count,
    start/end frames) to each parsed response, and writes all results to
    ``xl_results.json``.

    Raises:
        RuntimeError: if OPENAI_API_KEY is not set in the environment.
    """
    with open("vidstg_train_processed.json", 'r') as json_file:
        data = json.load(json_file)

    with open("train_annotations.json", 'r') as json_file:
        annotation = json.load(json_file)

    # Build the vid -> frame_count index once instead of linearly scanning
    # the annotation list for every sample (was O(samples * annotations)).
    frame_counts = {ann['vid']: ann['frame_count'] for ann in annotation}

    all_results = []

    # SECURITY: never hard-code API keys in source (the previous revision
    # embedded a live-looking key, which must be revoked). The OpenAI client
    # reads OPENAI_API_KEY from the environment automatically; fail fast with
    # a clear message if it is missing.
    if not os.getenv("OPENAI_API_KEY"):
        raise RuntimeError("Set the OPENAI_API_KEY environment variable before running.")
    client = OpenAI()

    for a_list in data[0:100]:
        vid = a_list['vid']
        question = a_list['question']
        answer = a_list['answer']
        target = a_list['target']
        start_bbox = a_list['start_bbox']['bbox']
        start_frame = a_list['start_frame']
        end_bbox = a_list['end_bbox']['bbox']
        end_frame = a_list['end_frame']

        # None if this vid has no annotation entry; the original code raised
        # NameError (or silently reused the previous sample's value) here.
        frame_count = frame_counts.get(vid)

        my_prompt = prompt.format(question, answer, target, start_bbox,
                                  start_frame, end_bbox, end_frame)

        print(my_prompt)

        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": my_prompt}
            ]
        )

        result_json = response.choices[0].message.content
        print(result_json)

        # The model occasionally emits non-JSON text; skip that sample
        # instead of crashing and losing every result gathered so far.
        try:
            json_obj = json.loads(result_json)
        except json.JSONDecodeError as err:
            print("Skipping vid {}: model returned invalid JSON ({})".format(vid, err))
            continue

        json_obj["meta"] = {"vid": vid, "frame_count": frame_count,
                            "start_frame": start_frame, "end_frame": end_frame}

        all_results.append(json_obj)

    with open('xl_results.json', 'w') as f:
        json.dump(all_results, f, indent=4)


if __name__=='__main__':
    main()
xitong/xl_results.json ADDED
@@ -0,0 +1,1802 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "conversations": [
4
+ {
5
+ "from": "human",
6
+ "value": "What does the adult man in black hold?"
7
+ },
8
+ {
9
+ "from": "gpt",
10
+ "value": "In the video, the adult man in black holds a glass cup. The target object, which is the adult man, appears in frame 124 at position [0, 31, 84, 239] and stops appearing in frame 604 at position [93, 27, 187, 239]."
11
+ }
12
+ ],
13
+ "meta": {
14
+ "vid": "2727682922",
15
+ "frame_count": 604,
16
+ "start_frame": 124,
17
+ "end_frame": 604
18
+ }
19
+ },
20
+ {
21
+ "conversations": [
22
+ {
23
+ "from": "human",
24
+ "value": "What is beneath the adult man in black?"
25
+ },
26
+ {
27
+ "from": "gpt",
28
+ "value": "In the video, there is a white horse beneath an adult man in black. The horse appears at [0, 109, 34, 269] in frame 180 and stops appearing at position [364, 159, 508, 461] in frame 1710."
29
+ }
30
+ ],
31
+ "meta": {
32
+ "vid": "2769996915",
33
+ "frame_count": 1725,
34
+ "start_frame": 180,
35
+ "end_frame": 1710
36
+ }
37
+ },
38
+ {
39
+ "conversations": [
40
+ {
41
+ "from": "human",
42
+ "value": "Who is towards the adult wearing a mask?"
43
+ },
44
+ {
45
+ "from": "gpt",
46
+ "value": "In frame 869, a child wearing blue pants can be seen towards the adult wearing a mask at position [198, 32, 242, 164]. The child stops appearing in frame 1109 at position [0, 79, 27, 243]. Therefore, the child is the one towards the adult wearing a mask."
47
+ }
48
+ ],
49
+ "meta": {
50
+ "vid": "4655213495",
51
+ "frame_count": 1824,
52
+ "start_frame": 869,
53
+ "end_frame": 1109
54
+ }
55
+ },
56
+ {
57
+ "conversations": [
58
+ {
59
+ "from": "human",
60
+ "value": "Who is next to the toy on the ground?"
61
+ },
62
+ {
63
+ "from": "gpt",
64
+ "value": "In frame 515, a child is next to the toy on the ground at position [448, 0, 639, 169]. The child stops appearing next to the toy on the ground at position [526, 0, 589, 50] in frame 569."
65
+ }
66
+ ],
67
+ "meta": {
68
+ "vid": "4598213889",
69
+ "frame_count": 855,
70
+ "start_frame": 515,
71
+ "end_frame": 569
72
+ }
73
+ },
74
+ {
75
+ "conversations": [
76
+ {
77
+ "from": "human",
78
+ "value": "Who hugs the other child in black in the room?"
79
+ },
80
+ {
81
+ "from": "gpt",
82
+ "value": "In the video, a child in white hugs another child in black in the room. The child in white initially appears at position [252, 139, 352, 375] in frame 252 and stops appearing at position [383, 206, 525, 294] in frame 359. Therefore, the child in white hugs the other child in black in the room."
83
+ }
84
+ ],
85
+ "meta": {
86
+ "vid": "4284560730",
87
+ "frame_count": 386,
88
+ "start_frame": 252,
89
+ "end_frame": 359
90
+ }
91
+ },
92
+ {
93
+ "conversations": [
94
+ {
95
+ "from": "human",
96
+ "value": "Who is away from the adult?"
97
+ },
98
+ {
99
+ "from": "gpt",
100
+ "value": "In the video, a child wearing black pants is away from an adult. The child is initially located at position [251, 90, 437, 290] in frame 420 and stops appearing at position [456, 151, 513, 264] in frame 570."
101
+ }
102
+ ],
103
+ "meta": {
104
+ "vid": "6391532457",
105
+ "frame_count": 607,
106
+ "start_frame": 420,
107
+ "end_frame": 570
108
+ }
109
+ },
110
+ {
111
+ "conversations": [
112
+ {
113
+ "from": "human",
114
+ "value": "Who hugs the toy?"
115
+ },
116
+ {
117
+ "from": "gpt",
118
+ "value": "In the video, a child is seen hugging the toy. The child is initially located at position [126, 66, 161, 189] in frame 393 and stops appearing at position [100, 54, 177, 239] in frame 836."
119
+ }
120
+ ],
121
+ "meta": {
122
+ "vid": "2440057130",
123
+ "frame_count": 836,
124
+ "start_frame": 393,
125
+ "end_frame": 836
126
+ }
127
+ },
128
+ {
129
+ "conversations": [
130
+ {
131
+ "from": "human",
132
+ "value": "who is in front of the other child in black clothes?"
133
+ },
134
+ {
135
+ "from": "gpt",
136
+ "value": "In the video, a child in yellow clothes is in front of another child in black clothes. The child in yellow clothes is initially located at position [237, 105, 383, 341] in frame 0 and the other child in black clothes is behind them. Therefore, the answer to the question is that a child in yellow clothes is in front of the child in black clothes."
137
+ }
138
+ ],
139
+ "meta": {
140
+ "vid": "6811523608",
141
+ "frame_count": 309,
142
+ "start_frame": 0,
143
+ "end_frame": 270
144
+ }
145
+ },
146
+ {
147
+ "conversations": [
148
+ {
149
+ "from": "human",
150
+ "value": "Who opens the bus outdoors?"
151
+ },
152
+ {
153
+ "from": "gpt",
154
+ "value": "In frame 207 of the video, an adult in green can be seen opening a bus outdoors. The adult is located at position [238, 127, 278, 228]. This individual stops appearing at position [232, 106, 278, 197] in frame 253."
155
+ }
156
+ ],
157
+ "meta": {
158
+ "vid": "7791533260",
159
+ "frame_count": 649,
160
+ "start_frame": 207,
161
+ "end_frame": 253
162
+ }
163
+ },
164
+ {
165
+ "conversations": [
166
+ {
167
+ "from": "human",
168
+ "value": "Who is in front of the black sofa on the blanket?"
169
+ },
170
+ {
171
+ "from": "gpt",
172
+ "value": "In frame 1248, there is a black sofa [113, 32, 270, 193] behind a child in gray and white on the blanket. The sofa stops appearing in frame 1316 at position [0, 0, 235, 102]. Therefore, the answer is nobody is in front of the black sofa on the blanket."
173
+ }
174
+ ],
175
+ "meta": {
176
+ "vid": "8212773822",
177
+ "frame_count": 1461,
178
+ "start_frame": 1248,
179
+ "end_frame": 1316
180
+ }
181
+ },
182
+ {
183
+ "conversations": [
184
+ {
185
+ "from": "human",
186
+ "value": "What does the adult in orange carry in the snow?"
187
+ },
188
+ {
189
+ "from": "gpt",
190
+ "value": "The adult in orange carries a black backpack in the snow. The adult is initially located at position [454, 161, 523, 260] in frame 0 and stops appearing at position [481, 28, 548, 156] in frame 511."
191
+ }
192
+ ],
193
+ "meta": {
194
+ "vid": "3281490756",
195
+ "frame_count": 511,
196
+ "start_frame": 0,
197
+ "end_frame": 511
198
+ }
199
+ },
200
+ {
201
+ "conversations": [
202
+ {
203
+ "from": "human",
204
+ "value": "Who is inside the car on the road?"
205
+ },
206
+ {
207
+ "from": "gpt",
208
+ "value": "In the video, an adult is inside a car on the road. The adult is initially located at position [386, 140, 399, 160] in frame 120 and stops appearing at position [0, 174, 48, 205] in frame 159."
209
+ }
210
+ ],
211
+ "meta": {
212
+ "vid": "5643227230",
213
+ "frame_count": 175,
214
+ "start_frame": 120,
215
+ "end_frame": 159
216
+ }
217
+ },
218
+ {
219
+ "conversations": [
220
+ {
221
+ "from": "human",
222
+ "value": "What is away the other dog in the room?"
223
+ },
224
+ {
225
+ "from": "gpt",
226
+ "value": "In the video, there is a dog initially located at position [0, 312, 166, 484] in frame 1531. The dog stops appearing at position [219, 262, 386, 469] in frame 1589. Therefore, the other dog in the room is located at [219, 262, 386, 469] in frame 1589."
227
+ }
228
+ ],
229
+ "meta": {
230
+ "vid": "6341769390",
231
+ "frame_count": 1728,
232
+ "start_frame": 1531,
233
+ "end_frame": 1589
234
+ }
235
+ },
236
+ {
237
+ "conversations": [
238
+ {
239
+ "from": "human",
240
+ "value": "Who is the gray cat next to?"
241
+ },
242
+ {
243
+ "from": "gpt",
244
+ "value": "In the video, there is a gray cat next to a female adult. The gray cat is initially located at position [164, 19, 495, 477] in frame 0 and stops appearing at position [214, 0, 576, 295] in frame 694. Therefore, the answer to your question is that the gray cat is next to a female adult."
245
+ }
246
+ ],
247
+ "meta": {
248
+ "vid": "3447876447",
249
+ "frame_count": 729,
250
+ "start_frame": 0,
251
+ "end_frame": 694
252
+ }
253
+ },
254
+ {
255
+ "conversations": [
256
+ {
257
+ "from": "human",
258
+ "value": "Who is next to the adult in white on the sofa?"
259
+ },
260
+ {
261
+ "from": "gpt",
262
+ "value": "In frame 600, a child in black is next to an adult in white on the sofa at position [0, 306, 55, 359]. The child stops appearing at position [53, 0, 129, 248] in frame 690."
263
+ }
264
+ ],
265
+ "meta": {
266
+ "vid": "8890945814",
267
+ "frame_count": 702,
268
+ "start_frame": 600,
269
+ "end_frame": 690
270
+ }
271
+ },
272
+ {
273
+ "conversations": [
274
+ {
275
+ "from": "human",
276
+ "value": "Who hugs the red toy?"
277
+ },
278
+ {
279
+ "from": "gpt",
280
+ "value": "In the video, a baby [356, 112, 613, 477] with blond hair can be seen hugging the red toy. The baby is initially located at position [356, 112, 613, 477] in frame 523 and stops appearing at position [331, 43, 528, 410] in frame 578."
281
+ }
282
+ ],
283
+ "meta": {
284
+ "vid": "6314288870",
285
+ "frame_count": 1593,
286
+ "start_frame": 523,
287
+ "end_frame": 578
288
+ }
289
+ },
290
+ {
291
+ "conversations": [
292
+ {
293
+ "from": "human",
294
+ "value": "Who is beneath the other adult under the water?"
295
+ },
296
+ {
297
+ "from": "gpt",
298
+ "value": "In the video, an adult is beneath another adult under the water. The target adult starts at position [280, 147, 484, 249] in frame 0 and stops appearing at position [208, 121, 401, 206] in frame 150."
299
+ }
300
+ ],
301
+ "meta": {
302
+ "vid": "3975461732",
303
+ "frame_count": 333,
304
+ "start_frame": 0,
305
+ "end_frame": 150
306
+ }
307
+ },
308
+ {
309
+ "conversations": [
310
+ {
311
+ "from": "human",
312
+ "value": "Who hugs the toy in red on the bed?"
313
+ },
314
+ {
315
+ "from": "gpt",
316
+ "value": "In frame 925, a baby [108, 32, 410, 361] is hugging a toy in red on the bed. The baby's position changes to [116, 13, 417, 355] in frame 959, where the baby stops appearing. Therefore, the baby is the one who hugs the toy in red on the bed."
317
+ }
318
+ ],
319
+ "meta": {
320
+ "vid": "8109383060",
321
+ "frame_count": 959,
322
+ "start_frame": 925,
323
+ "end_frame": 959
324
+ }
325
+ },
326
+ {
327
+ "conversations": [
328
+ {
329
+ "from": "human",
330
+ "value": "What is in front of the penguin outdoors?"
331
+ },
332
+ {
333
+ "from": "gpt",
334
+ "value": "In the video, there is a penguin [358, 400, 451, 544] behind another penguin outdoors. The target penguin is initially at position [358, 400, 451, 544] in frame 0 and stops appearing at position [0, 717, 186, 854] in frame 330. Therefore, there is another penguin in front of the target penguin outdoors."
335
+ }
336
+ ],
337
+ "meta": {
338
+ "vid": "7536107096",
339
+ "frame_count": 479,
340
+ "start_frame": 0,
341
+ "end_frame": 330
342
+ }
343
+ },
344
+ {
345
+ "conversations": [
346
+ {
347
+ "from": "human",
348
+ "value": "What is beneath the cup outdoors?"
349
+ },
350
+ {
351
+ "from": "gpt",
352
+ "value": "In the video, there is a cup positioned above a table outdoors. The cup initially appears at [0, 224, 86, 367] in frame 0 and stops appearing at [0, 40, 54, 235] in frame 107. Therefore, beneath the cup outdoors, there is a table."
353
+ }
354
+ ],
355
+ "meta": {
356
+ "vid": "2748674628",
357
+ "frame_count": 954,
358
+ "start_frame": 0,
359
+ "end_frame": 107
360
+ }
361
+ },
362
+ {
363
+ "conversations": [
364
+ {
365
+ "from": "human",
366
+ "value": "Who bites the yellow bread?"
367
+ },
368
+ {
369
+ "from": "gpt",
370
+ "value": "In the video, the target object 'adult' bites the yellow bread. The adult starts appearing at position [0, 0, 486, 479] in frame 1824 and stops appearing at position [236, 115, 495, 479] in frame 1931."
371
+ }
372
+ ],
373
+ "meta": {
374
+ "vid": "3127391012",
375
+ "frame_count": 2249,
376
+ "start_frame": 1824,
377
+ "end_frame": 1931
378
+ }
379
+ },
380
+ {
381
+ "conversations": [
382
+ {
383
+ "from": "human",
384
+ "value": "Who holds the toy in the video?"
385
+ },
386
+ {
387
+ "from": "gpt",
388
+ "value": "In the video, a child in green clothes is holding a toy. The child appears at position [0, 0, 639, 1137] in frame 124 and stops appearing at position [0, 0, 635, 1137] in frame 190."
389
+ }
390
+ ],
391
+ "meta": {
392
+ "vid": "13497317534",
393
+ "frame_count": 355,
394
+ "start_frame": 124,
395
+ "end_frame": 190
396
+ }
397
+ },
398
+ {
399
+ "conversations": [
400
+ {
401
+ "from": "human",
402
+ "value": "Who is next to a handbag?"
403
+ },
404
+ {
405
+ "from": "gpt",
406
+ "value": "In the video, the adult is next to a handbag on a motorcycle. The adult is initially located at [13, 0, 193, 359] in frame 392 and stops appearing at position [590, 0, 639, 240] in frame 509."
407
+ }
408
+ ],
409
+ "meta": {
410
+ "vid": "7499375636",
411
+ "frame_count": 1600,
412
+ "start_frame": 392,
413
+ "end_frame": 509
414
+ }
415
+ },
416
+ {
417
+ "conversations": [
418
+ {
419
+ "from": "human",
420
+ "value": "Who is above the motorcycle on the wild?"
421
+ },
422
+ {
423
+ "from": "gpt",
424
+ "value": "In frame 0, there is a motorcycle located at position [91, 176, 147, 234]. Above the motorcycle, there is an adult on the wild. The adult is positioned at [0, 195, 45, 267], indicating that they are above the motorcycle. Therefore, the answer is an adult."
425
+ }
426
+ ],
427
+ "meta": {
428
+ "vid": "5344750146",
429
+ "frame_count": 282,
430
+ "start_frame": 0,
431
+ "end_frame": 124
432
+ }
433
+ },
434
+ {
435
+ "conversations": [
436
+ {
437
+ "from": "human",
438
+ "value": "Who pushes the toy outside the house?"
439
+ },
440
+ {
441
+ "from": "gpt",
442
+ "value": "In the video, a baby is pushing a toy outside the house. The baby appears at position [134, 43, 243, 224] in frame 215 and stops appearing at position [99, 95, 179, 202] in frame 248."
443
+ }
444
+ ],
445
+ "meta": {
446
+ "vid": "3837521252",
447
+ "frame_count": 716,
448
+ "start_frame": 215,
449
+ "end_frame": 248
450
+ }
451
+ },
452
+ {
453
+ "conversations": [
454
+ {
455
+ "from": "human",
456
+ "value": "Who pushes the toy outside the house?"
457
+ },
458
+ {
459
+ "from": "gpt",
460
+ "value": "In frame 376, a baby is located at position [134, 43, 243, 224] pushing a toy outside the house. The baby stops appearing at position [99, 95, 179, 202] in frame 406."
461
+ }
462
+ ],
463
+ "meta": {
464
+ "vid": "3837521252",
465
+ "frame_count": 716,
466
+ "start_frame": 376,
467
+ "end_frame": 406
468
+ }
469
+ },
470
+ {
471
+ "conversations": [
472
+ {
473
+ "from": "human",
474
+ "value": "Who pushes the toy outside the house?"
475
+ },
476
+ {
477
+ "from": "gpt",
478
+ "value": "In the video, a baby is pushing a toy outside the house. The baby is initially located at position [134, 43, 243, 224] in frame 457 and stops appearing at position [99, 95, 179, 202] in frame 533."
479
+ }
480
+ ],
481
+ "meta": {
482
+ "vid": "3837521252",
483
+ "frame_count": 716,
484
+ "start_frame": 457,
485
+ "end_frame": 533
486
+ }
487
+ },
488
+ {
489
+ "conversations": [
490
+ {
491
+ "from": "human",
492
+ "value": "What is away from the child in the swimming pool?"
493
+ },
494
+ {
495
+ "from": "gpt",
496
+ "value": "In the video, there is a ball away from the child in the swimming pool. The ball is initially located at position [310, 206, 333, 224] in frame 60 and stops appearing at position [621, 266, 639, 293] in frame 317."
497
+ }
498
+ ],
499
+ "meta": {
500
+ "vid": "3557607884",
501
+ "frame_count": 1044,
502
+ "start_frame": 60,
503
+ "end_frame": 317
504
+ }
505
+ },
506
+ {
507
+ "conversations": [
508
+ {
509
+ "from": "human",
510
+ "value": "Who is next to the adult?"
511
+ },
512
+ {
513
+ "from": "gpt",
514
+ "value": "In the video, there is an adult standing next to another adult. The first adult is located at position [0, 139, 127, 341] in frame 220 and stops appearing at position [144, 164, 292, 350] in frame 260. Therefore, the answer to your question is that another adult is next to the adult in the video."
515
+ }
516
+ ],
517
+ "meta": {
518
+ "vid": "5017783246",
519
+ "frame_count": 261,
520
+ "start_frame": 220,
521
+ "end_frame": 260
522
+ }
523
+ },
524
+ {
525
+ "conversations": [
526
+ {
527
+ "from": "human",
528
+ "value": "Who is above the watercraft in the river?"
529
+ },
530
+ {
531
+ "from": "gpt",
532
+ "value": "In frame 420, a child is above the watercraft in the river at position [180, 184, 245, 233]. The child stops appearing at position [181, 171, 311, 224] in frame 586."
533
+ }
534
+ ],
535
+ "meta": {
536
+ "vid": "7896372830",
537
+ "frame_count": 586,
538
+ "start_frame": 420,
539
+ "end_frame": 586
540
+ }
541
+ },
542
+ {
543
+ "conversations": [
544
+ {
545
+ "from": "human",
546
+ "value": "Who touches the toy on the watercraft?"
547
+ },
548
+ {
549
+ "from": "gpt",
550
+ "value": "In frame 435 of the video, an adult with green pants is touching a toy on the watercraft. The adult is located at position [0, 0, 365, 325]. The adult stops appearing at position [0, 0, 385, 384] in frame 481."
551
+ }
552
+ ],
553
+ "meta": {
554
+ "vid": "4855999801",
555
+ "frame_count": 902,
556
+ "start_frame": 435,
557
+ "end_frame": 481
558
+ }
559
+ },
560
+ {
561
+ "conversations": [
562
+ {
563
+ "from": "human",
564
+ "value": "Who touches the toy on the watercraft?"
565
+ },
566
+ {
567
+ "from": "gpt",
568
+ "value": "In frame 521, there is an adult with green pants at position [0, 0, 365, 325] touching a toy on the watercraft. The adult continues to appear in subsequent frames until frame 616 at position [0, 0, 385, 384]. Therefore, the answer is an adult with green pants."
569
+ }
570
+ ],
571
+ "meta": {
572
+ "vid": "4855999801",
573
+ "frame_count": 902,
574
+ "start_frame": 521,
575
+ "end_frame": 616
576
+ }
577
+ },
578
+ {
579
+ "conversations": [
580
+ {
581
+ "from": "human",
582
+ "value": "Who is above the green sofa?"
583
+ },
584
+ {
585
+ "from": "gpt",
586
+ "value": "In frame 0, there is a green sofa beneath a child in green at position [0, 0, 639, 479]. The child remains above the green sofa until frame 1197 when they are still at position [0, 0, 639, 479]. Therefore, the answer is a child in green."
587
+ }
588
+ ],
589
+ "meta": {
590
+ "vid": "2475104074",
591
+ "frame_count": 1197,
592
+ "start_frame": 0,
593
+ "end_frame": 1197
594
+ }
595
+ },
596
+ {
597
+ "conversations": [
598
+ {
599
+ "from": "human",
600
+ "value": "What is beneath the bottle in the room?"
601
+ },
602
+ {
603
+ "from": "gpt",
604
+ "value": "In frame 1680, the target object 'bottle' is located at position [517, 193, 555, 264] above a table in the room. As the bottle moves, in frame 2697, it is positioned at [508, 195, 546, 264]. Therefore, there is nothing specifically mentioned as being beneath the bottle in the room."
605
+ }
606
+ ],
607
+ "meta": {
608
+ "vid": "8614946329",
609
+ "frame_count": 2697,
610
+ "start_frame": 1680,
611
+ "end_frame": 2697
612
+ }
613
+ },
614
+ {
615
+ "conversations": [
616
+ {
617
+ "from": "human",
618
+ "value": "Who does the adult in red hold 1?"
619
+ },
620
+ {
621
+ "from": "gpt",
622
+ "value": "In the video, the adult in red, initially located at position [412, 0, 603, 359] in frame 76, is holding a baby with few hair 1. The adult stops appearing at position [400, 0, 639, 359] in frame 178."
623
+ }
624
+ ],
625
+ "meta": {
626
+ "vid": "3898784753",
627
+ "frame_count": 1740,
628
+ "start_frame": 76,
629
+ "end_frame": 178
630
+ }
631
+ },
632
+ {
633
+ "conversations": [
634
+ {
635
+ "from": "human",
636
+ "value": "Who does the adult in red hold 1?"
637
+ },
638
+ {
639
+ "from": "gpt",
640
+ "value": "The adult in red holds a baby with few hair 1. The adult is initially located at position [412, 0, 603, 359] in frame 229 and stops appearing at position [400, 0, 639, 359] in frame 825."
641
+ }
642
+ ],
643
+ "meta": {
644
+ "vid": "3898784753",
645
+ "frame_count": 1740,
646
+ "start_frame": 229,
647
+ "end_frame": 825
648
+ }
649
+ },
650
+ {
651
+ "conversations": [
652
+ {
653
+ "from": "human",
654
+ "value": "Who does the adult in red hold 1?"
655
+ },
656
+ {
657
+ "from": "gpt",
658
+ "value": "In the video, the adult in red is holding a baby with few hairs 1. The adult is initially located at position [412, 0, 603, 359] in frame 911 and stops appearing at position [400, 0, 639, 359] in frame 1105."
659
+ }
660
+ ],
661
+ "meta": {
662
+ "vid": "3898784753",
663
+ "frame_count": 1740,
664
+ "start_frame": 911,
665
+ "end_frame": 1105
666
+ }
667
+ },
668
+ {
669
+ "conversations": [
670
+ {
671
+ "from": "human",
672
+ "value": "Who does the adult in red hold?"
673
+ },
674
+ {
675
+ "from": "gpt",
676
+ "value": "In the video, an adult in red holds a baby with few hair. The adult is initially located at [412, 0, 603, 359] in frame 1195 and stops appearing at position [400, 0, 639, 359] in frame 1439. Therefore, the adult in red holds a baby with few hair in the video."
677
+ }
678
+ ],
679
+ "meta": {
680
+ "vid": "3898784753",
681
+ "frame_count": 1740,
682
+ "start_frame": 1195,
683
+ "end_frame": 1439
684
+ }
685
+ },
686
+ {
687
+ "conversations": [
688
+ {
689
+ "from": "human",
690
+ "value": "Who does the adult in red hold 1?"
691
+ },
692
+ {
693
+ "from": "gpt",
694
+ "value": "In the video, an adult in red is holding a baby with few hair 1. The adult is located at position [412, 0, 603, 359] in frame 1541 and stops appearing at position [400, 0, 639, 359] in frame 1582."
695
+ }
696
+ ],
697
+ "meta": {
698
+ "vid": "3898784753",
699
+ "frame_count": 1740,
700
+ "start_frame": 1541,
701
+ "end_frame": 1582
702
+ }
703
+ },
704
+ {
705
+ "conversations": [
706
+ {
707
+ "from": "human",
708
+ "value": "Who does the adult in red hold 1?"
709
+ },
710
+ {
711
+ "from": "gpt",
712
+ "value": "In the video, an adult in red is holding a baby with few hair 1. The adult starts appearing at position [412, 0, 603, 359] in frame 1705 and stops appearing at position [400, 0, 639, 359] in frame 1740."
713
+ }
714
+ ],
715
+ "meta": {
716
+ "vid": "3898784753",
717
+ "frame_count": 1740,
718
+ "start_frame": 1705,
719
+ "end_frame": 1740
720
+ }
721
+ },
722
+ {
723
+ "conversations": [
724
+ {
725
+ "from": "human",
726
+ "value": "what does the adult in black ride on the road?"
727
+ },
728
+ {
729
+ "from": "gpt",
730
+ "value": "In the video, an adult in black is riding a motorcycle on a road. The adult is initially located at position [378, 161, 406, 193] in frame 0 and stops appearing at position [371, 150, 399, 189] in frame 210."
731
+ }
732
+ ],
733
+ "meta": {
734
+ "vid": "3439709092",
735
+ "frame_count": 210,
736
+ "start_frame": 0,
737
+ "end_frame": 210
738
+ }
739
+ },
740
+ {
741
+ "conversations": [
742
+ {
743
+ "from": "human",
744
+ "value": "What is beneath the child on the ground?"
745
+ },
746
+ {
747
+ "from": "gpt",
748
+ "value": "In the video, the child is initially above a skateboard on the ground at position [217, 65, 328, 219] in frame 55. The child stops appearing at position [415, 114, 533, 300] in frame 269. Therefore, beneath the child on the ground is a skateboard. This is confirmed by the trajectory of the child's position in the video."
749
+ }
750
+ ],
751
+ "meta": {
752
+ "vid": "13372177653",
753
+ "frame_count": 288,
754
+ "start_frame": 55,
755
+ "end_frame": 269
756
+ }
757
+ },
758
+ {
759
+ "conversations": [
760
+ {
761
+ "from": "human",
762
+ "value": "What is beneath the cup in the room?"
763
+ },
764
+ {
765
+ "from": "gpt",
766
+ "value": "In the video, there is a table beneath the cup in the room. The table is initially located at position [0, 358, 149, 479] in frame 942 and stops appearing at position [228, 429, 483, 479] in frame 1159."
767
+ }
768
+ ],
769
+ "meta": {
770
+ "vid": "2521681557",
771
+ "frame_count": 2013,
772
+ "start_frame": 942,
773
+ "end_frame": 1159
774
+ }
775
+ },
776
+ {
777
+ "conversations": [
778
+ {
779
+ "from": "human",
780
+ "value": "who hugs the child in white on the chair?"
781
+ },
782
+ {
783
+ "from": "gpt",
784
+ "value": "In the video, an adult man in white hugs a child in white on a chair. The adult man is initially located at [216, 10, 404, 347] in frame 0 and stops appearing at position [64, 17, 381, 381] in frame 129."
785
+ }
786
+ ],
787
+ "meta": {
788
+ "vid": "4760962392",
789
+ "frame_count": 2697,
790
+ "start_frame": 0,
791
+ "end_frame": 129
792
+ }
793
+ },
794
+ {
795
+ "conversations": [
796
+ {
797
+ "from": "human",
798
+ "value": "What does the child in striped clothes watch?"
799
+ },
800
+ {
801
+ "from": "gpt",
802
+ "value": "The child in striped clothes is located at [227, 34, 315, 234] in frame 1197 and watches a red and purple toy. The child stops appearing at position [0, 24, 209, 374] in frame 1453."
803
+ }
804
+ ],
805
+ "meta": {
806
+ "vid": "2677153721",
807
+ "frame_count": 1453,
808
+ "start_frame": 1197,
809
+ "end_frame": 1453
810
+ }
811
+ },
812
+ {
813
+ "conversations": [
814
+ {
815
+ "from": "human",
816
+ "value": "Who grabs the cup in the classroom?"
817
+ },
818
+ {
819
+ "from": "gpt",
820
+ "value": "In the video, an adult is seen grabbing a cup in the classroom. The adult is initially located at position [159, 100, 230, 173] in frame 651 and stops appearing at position [134, 77, 202, 144] in frame 695. Therefore, the answer is the adult."
821
+ }
822
+ ],
823
+ "meta": {
824
+ "vid": "3113067503",
825
+ "frame_count": 1237,
826
+ "start_frame": 651,
827
+ "end_frame": 695
828
+ }
829
+ },
830
+ {
831
+ "conversations": [
832
+ {
833
+ "from": "human",
834
+ "value": "Who carries the green toy at home?"
835
+ },
836
+ {
837
+ "from": "gpt",
838
+ "value": "In the video, a child with black hair can be seen carrying a green toy at home. The child is initially located at position [0, 177, 27, 429] in frame 40 and stops appearing at position [209, 330, 430, 479] in frame 210."
839
+ }
840
+ ],
841
+ "meta": {
842
+ "vid": "3487539480",
843
+ "frame_count": 967,
844
+ "start_frame": 40,
845
+ "end_frame": 210
846
+ }
847
+ },
848
+ {
849
+ "conversations": [
850
+ {
851
+ "from": "human",
852
+ "value": "Who carries the green toy at home?"
853
+ },
854
+ {
855
+ "from": "gpt",
856
+ "value": "In the video, a child with black hair can be seen carrying a green toy at home. The child is initially located at position [0, 177, 27, 429] in frame 458 and stops appearing at position [209, 330, 430, 479] in frame 495. Therefore, the answer to your question is the child."
857
+ }
858
+ ],
859
+ "meta": {
860
+ "vid": "3487539480",
861
+ "frame_count": 967,
862
+ "start_frame": 458,
863
+ "end_frame": 495
864
+ }
865
+ },
866
+ {
867
+ "conversations": [
868
+ {
869
+ "from": "human",
870
+ "value": "Who leans on the sofa at home?"
871
+ },
872
+ {
873
+ "from": "gpt",
874
+ "value": "In the video, there is an adult leaning on a sofa at home. The adult is initially located at position [27, 0, 479, 639] in frame 290 and stops appearing at position [0, 0, 440, 459] in frame 362. Therefore, the answer is an adult."
875
+ }
876
+ ],
877
+ "meta": {
878
+ "vid": "5773250639",
879
+ "frame_count": 1122,
880
+ "start_frame": 290,
881
+ "end_frame": 362
882
+ }
883
+ },
884
+ {
885
+ "conversations": [
886
+ {
887
+ "from": "human",
888
+ "value": "Who leans on the sofa at home?"
889
+ },
890
+ {
891
+ "from": "gpt",
892
+ "value": "In the video, an adult in grey is leaning on the sofa at home. The adult is initially located at position [27, 0, 479, 639] in frame 423 and stops appearing at position [0, 0, 440, 459] in frame 832."
893
+ }
894
+ ],
895
+ "meta": {
896
+ "vid": "5773250639",
897
+ "frame_count": 1122,
898
+ "start_frame": 423,
899
+ "end_frame": 832
900
+ }
901
+ },
902
+ {
903
+ "conversations": [
904
+ {
905
+ "from": "human",
906
+ "value": "Who kisses the baby on the bed?"
907
+ },
908
+ {
909
+ "from": "gpt",
910
+ "value": "In frame 326 of the video, an adult is located at position [414, 0, 639, 122] and is kissing the baby on the bed. The adult stops appearing at position [375, 0, 639, 72] in frame 341."
911
+ }
912
+ ],
913
+ "meta": {
914
+ "vid": "6989151729",
915
+ "frame_count": 403,
916
+ "start_frame": 326,
917
+ "end_frame": 341
918
+ }
919
+ },
920
+ {
921
+ "conversations": [
922
+ {
923
+ "from": "human",
924
+ "value": "What above the child with blond-hair?"
925
+ },
926
+ {
927
+ "from": "gpt",
928
+ "value": "In frame 660, above the child with blond-hair, there is a guitar at position [224, 189, 639, 373]. The guitar stops appearing in frame 750 at position [0, 225, 522, 467]."
929
+ }
930
+ ],
931
+ "meta": {
932
+ "vid": "6591879935",
933
+ "frame_count": 1783,
934
+ "start_frame": 660,
935
+ "end_frame": 750
936
+ }
937
+ },
938
+ {
939
+ "conversations": [
940
+ {
941
+ "from": "human",
942
+ "value": "Who does the 1 adult chase on the grass?"
943
+ },
944
+ {
945
+ "from": "gpt",
946
+ "value": "In the video, a 1 adult is seen chasing another adult on the grass. The target adult is initially located at [348, 104, 488, 359] in frame 129 and stops appearing at position [291, 95, 365, 234] in frame 157."
947
+ }
948
+ ],
949
+ "meta": {
950
+ "vid": "6859274622",
951
+ "frame_count": 263,
952
+ "start_frame": 129,
953
+ "end_frame": 157
954
+ }
955
+ },
956
+ {
957
+ "conversations": [
958
+ {
959
+ "from": "human",
960
+ "value": "Who is towards the other child in the yard?"
961
+ },
962
+ {
963
+ "from": "gpt",
964
+ "value": "In frame 1920, a child in white is towards another child in a yard. The target child is initially located at position [552, 61, 639, 274] and stops appearing at position [277, 72, 358, 276] in frame 2031."
965
+ }
966
+ ],
967
+ "meta": {
968
+ "vid": "7971444414",
969
+ "frame_count": 2031,
970
+ "start_frame": 1920,
971
+ "end_frame": 2031
972
+ }
973
+ },
974
+ {
975
+ "conversations": [
976
+ {
977
+ "from": "human",
978
+ "value": "Who is playing the guitar in the video?"
979
+ },
980
+ {
981
+ "from": "gpt",
982
+ "value": "In the video, an adult in a black hat is playing the guitar. The adult can be seen initially at position [436, 95, 511, 304] in frame 118 and stops appearing at position [402, 91, 478, 277] in frame 1922."
983
+ }
984
+ ],
985
+ "meta": {
986
+ "vid": "5776386760",
987
+ "frame_count": 1922,
988
+ "start_frame": 118,
989
+ "end_frame": 1922
990
+ }
991
+ },
992
+ {
993
+ "conversations": [
994
+ {
995
+ "from": "human",
996
+ "value": "Who grabs the sports ball?"
997
+ },
998
+ {
999
+ "from": "gpt",
1000
+ "value": "In the video, an adult is seen grabbing the sports ball. The adult is initially located at position [345, 76, 407, 218] in frame 0 and stops appearing at position [604, 46, 639, 183] in frame 14. Therefore, the answer is the adult."
1001
+ }
1002
+ ],
1003
+ "meta": {
1004
+ "vid": "6977471721",
1005
+ "frame_count": 297,
1006
+ "start_frame": 0,
1007
+ "end_frame": 14
1008
+ }
1009
+ },
1010
+ {
1011
+ "conversations": [
1012
+ {
1013
+ "from": "human",
1014
+ "value": "Who is in front of the screen?"
1015
+ },
1016
+ {
1017
+ "from": "gpt",
1018
+ "value": "In frame 1001, an adult [0, 156, 98, 417] wearing a red hat is in front of the screen. The adult stops appearing at position [0, 257, 42, 383] in frame 1214."
1019
+ }
1020
+ ],
1021
+ "meta": {
1022
+ "vid": "3182862956",
1023
+ "frame_count": 1259,
1024
+ "start_frame": 1001,
1025
+ "end_frame": 1214
1026
+ }
1027
+ },
1028
+ {
1029
+ "conversations": [
1030
+ {
1031
+ "from": "human",
1032
+ "value": "Who is behind the other adult in the snow?"
1033
+ },
1034
+ {
1035
+ "from": "gpt",
1036
+ "value": "In the video, there is an adult man in black who is behind another adult in the snow. The target object, which is an adult, initially appears at position [623, 226, 639, 276] in frame 196 and stops appearing at position [617, 212, 639, 265] in frame 248."
1037
+ }
1038
+ ],
1039
+ "meta": {
1040
+ "vid": "5312674396",
1041
+ "frame_count": 388,
1042
+ "start_frame": 196,
1043
+ "end_frame": 248
1044
+ }
1045
+ },
1046
+ {
1047
+ "conversations": [
1048
+ {
1049
+ "from": "human",
1050
+ "value": "What is above the white baby seat at home?"
1051
+ },
1052
+ {
1053
+ "from": "gpt",
1054
+ "value": "In the video, there is a green toy above the white baby seat at home. The toy is initially located at position [356, 153, 438, 231] in frame 660 and stops appearing at position [187, 265, 325, 312] in frame 2045."
1055
+ }
1056
+ ],
1057
+ "meta": {
1058
+ "vid": "14040889372",
1059
+ "frame_count": 2045,
1060
+ "start_frame": 660,
1061
+ "end_frame": 2045
1062
+ }
1063
+ },
1064
+ {
1065
+ "conversations": [
1066
+ {
1067
+ "from": "human",
1068
+ "value": "Who pushes the stool on the floor?"
1069
+ },
1070
+ {
1071
+ "from": "gpt",
1072
+ "value": "In the video, a child in green clothes pushes a stool on the floor. The child is initially located at position [262, 144, 354, 359] in frame 296 and stops appearing at position [321, 181, 639, 359] in frame 345. Therefore, the child is the one who pushes the stool on the floor."
1073
+ }
1074
+ ],
1075
+ "meta": {
1076
+ "vid": "5907883532",
1077
+ "frame_count": 1882,
1078
+ "start_frame": 296,
1079
+ "end_frame": 345
1080
+ }
1081
+ },
1082
+ {
1083
+ "conversations": [
1084
+ {
1085
+ "from": "human",
1086
+ "value": "Who pushes the stool on the floor?"
1087
+ },
1088
+ {
1089
+ "from": "gpt",
1090
+ "value": "In the video, a child in green clothes pushes a stool on the floor. The child is initially located at [262, 144, 354, 359] in frame 426 and stops appearing at position [321, 181, 639, 359] in frame 485."
1091
+ }
1092
+ ],
1093
+ "meta": {
1094
+ "vid": "5907883532",
1095
+ "frame_count": 1882,
1096
+ "start_frame": 426,
1097
+ "end_frame": 485
1098
+ }
1099
+ },
1100
+ {
1101
+ "conversations": [
1102
+ {
1103
+ "from": "human",
1104
+ "value": "Who pushes the stool on the floor?"
1105
+ },
1106
+ {
1107
+ "from": "gpt",
1108
+ "value": "In the video, a child in green clothes is seen pushing a stool on the floor. The child starts pushing the stool at position [262, 144, 354, 359] in frame 559 and continues until position [321, 181, 639, 359] in frame 644. Therefore, the answer is the child in green clothes."
1109
+ }
1110
+ ],
1111
+ "meta": {
1112
+ "vid": "5907883532",
1113
+ "frame_count": 1882,
1114
+ "start_frame": 559,
1115
+ "end_frame": 644
1116
+ }
1117
+ },
1118
+ {
1119
+ "conversations": [
1120
+ {
1121
+ "from": "human",
1122
+ "value": "Who pushes the stool on the floor?"
1123
+ },
1124
+ {
1125
+ "from": "gpt",
1126
+ "value": "In the video, a child in green clothes pushes a stool on the floor. The child is initially located at [262, 144, 354, 359] in frame 777 and stops appearing at position [321, 181, 639, 359] in frame 808."
1127
+ }
1128
+ ],
1129
+ "meta": {
1130
+ "vid": "5907883532",
1131
+ "frame_count": 1882,
1132
+ "start_frame": 777,
1133
+ "end_frame": 808
1134
+ }
1135
+ },
1136
+ {
1137
+ "conversations": [
1138
+ {
1139
+ "from": "human",
1140
+ "value": "Who pushes the stool on the floor?"
1141
+ },
1142
+ {
1143
+ "from": "gpt",
1144
+ "value": "In the video, a child in green clothes starts pushing a stool on the floor at frame 890 with bounding box [262, 144, 354, 359]. The child continues to push the stool until frame 914 where the child is last seen at position [321, 181, 639, 359]. Therefore, the answer is the child."
1145
+ }
1146
+ ],
1147
+ "meta": {
1148
+ "vid": "5907883532",
1149
+ "frame_count": 1882,
1150
+ "start_frame": 890,
1151
+ "end_frame": 914
1152
+ }
1153
+ },
1154
+ {
1155
+ "conversations": [
1156
+ {
1157
+ "from": "human",
1158
+ "value": "who pushes the stool on the floor?"
1159
+ },
1160
+ {
1161
+ "from": "gpt",
1162
+ "value": "In the video, a child in green clothes can be seen pushing a stool on the floor. The child appears in frame 1013 at position [262, 144, 354, 359] and stops appearing in frame 1129 at position [321, 181, 639, 359]. Therefore, the answer to the question 'who pushes the stool on the floor?' is the child in green clothes."
1163
+ }
1164
+ ],
1165
+ "meta": {
1166
+ "vid": "5907883532",
1167
+ "frame_count": 1882,
1168
+ "start_frame": 1013,
1169
+ "end_frame": 1129
1170
+ }
1171
+ },
1172
+ {
1173
+ "conversations": [
1174
+ {
1175
+ "from": "human",
1176
+ "value": "Who pushes the stool on the floor?"
1177
+ },
1178
+ {
1179
+ "from": "gpt",
1180
+ "value": "In the video, a child in green clothes is pushing the stool on the floor. The child can be seen at position [262, 144, 354, 359] in frame 1268 and stops appearing at position [321, 181, 639, 359] in frame 1322."
1181
+ }
1182
+ ],
1183
+ "meta": {
1184
+ "vid": "5907883532",
1185
+ "frame_count": 1882,
1186
+ "start_frame": 1268,
1187
+ "end_frame": 1322
1188
+ }
1189
+ },
1190
+ {
1191
+ "conversations": [
1192
+ {
1193
+ "from": "human",
1194
+ "value": "Who pushes the stool on the floor?"
1195
+ },
1196
+ {
1197
+ "from": "gpt",
1198
+ "value": "In the video, a child in green clothes can be seen pushing a stool on the floor. The child is initially located at position [262, 144, 354, 359] in frame 1450, and stops appearing at position [321, 181, 639, 359] in frame 1536."
1199
+ }
1200
+ ],
1201
+ "meta": {
1202
+ "vid": "5907883532",
1203
+ "frame_count": 1882,
1204
+ "start_frame": 1450,
1205
+ "end_frame": 1536
1206
+ }
1207
+ },
1208
+ {
1209
+ "conversations": [
1210
+ {
1211
+ "from": "human",
1212
+ "value": "Who is the adult towards on the ground?"
1213
+ },
1214
+ {
1215
+ "from": "gpt",
1216
+ "value": "In the video, an adult is towards another adult on the ground. The target adult is initially located at position [306, 165, 398, 428] in frame 1104 and stops appearing at position [232, 145, 315, 438] in frame 1169."
1217
+ }
1218
+ ],
1219
+ "meta": {
1220
+ "vid": "4747203885",
1221
+ "frame_count": 1732,
1222
+ "start_frame": 1104,
1223
+ "end_frame": 1169
1224
+ }
1225
+ },
1226
+ {
1227
+ "conversations": [
1228
+ {
1229
+ "from": "human",
1230
+ "value": "What does the child watch inside the car?"
1231
+ },
1232
+ {
1233
+ "from": "gpt",
1234
+ "value": "In the video, the child is located inside the car at position [265, 50, 523, 479] in frame 520. The child is watching a toy inside the car at this position. The child stops appearing at position [218, 42, 500, 479] in frame 578. Therefore, the child was watching a toy inside the car."
1235
+ }
1236
+ ],
1237
+ "meta": {
1238
+ "vid": "6783572295",
1239
+ "frame_count": 627,
1240
+ "start_frame": 520,
1241
+ "end_frame": 578
1242
+ }
1243
+ },
1244
+ {
1245
+ "conversations": [
1246
+ {
1247
+ "from": "human",
1248
+ "value": "Who is away the child in the room?"
1249
+ },
1250
+ {
1251
+ "from": "gpt",
1252
+ "value": "In the video, an adult in white can be seen away a child in the room. The adult is initially located at [210, 238, 334, 479] in frame 240 and is no longer visible at position [188, 132, 269, 479] in frame 312."
1253
+ }
1254
+ ],
1255
+ "meta": {
1256
+ "vid": "4242713789",
1257
+ "frame_count": 1395,
1258
+ "start_frame": 240,
1259
+ "end_frame": 312
1260
+ }
1261
+ },
1262
+ {
1263
+ "conversations": [
1264
+ {
1265
+ "from": "human",
1266
+ "value": "What is behind the adult man in white inside the subway station?"
1267
+ },
1268
+ {
1269
+ "from": "gpt",
1270
+ "value": "In frame 528, an adult man in white is located at position [18, 86, 38, 209] inside the subway station. As per the description, the man is in front of a red chair. There is no specific object mentioned to be behind the adult man in the given scenario."
1271
+ }
1272
+ ],
1273
+ "meta": {
1274
+ "vid": "2420718744",
1275
+ "frame_count": 839,
1276
+ "start_frame": 528,
1277
+ "end_frame": 585
1278
+ }
1279
+ },
1280
+ {
1281
+ "conversations": [
1282
+ {
1283
+ "from": "human",
1284
+ "value": "What does the adult in red carry outside the room?"
1285
+ },
1286
+ {
1287
+ "from": "gpt",
1288
+ "value": "In frame 137, there is an adult [105, 294, 138, 440] in red carrying a handbag outside the room. The adult stops appearing at position [0, 272, 35, 477] in frame 170."
1289
+ }
1290
+ ],
1291
+ "meta": {
1292
+ "vid": "4997032186",
1293
+ "frame_count": 420,
1294
+ "start_frame": 137,
1295
+ "end_frame": 170
1296
+ }
1297
+ },
1298
+ {
1299
+ "conversations": [
1300
+ {
1301
+ "from": "human",
1302
+ "value": "Who is the adult away from in the room?"
1303
+ },
1304
+ {
1305
+ "from": "gpt",
1306
+ "value": "In the video, an adult is away from another adult in the room. The target object, which is an adult, is initially located at position [108, 175, 188, 318] in frame 300 and stops appearing at position [22, 73, 96, 257] in frame 407."
1307
+ }
1308
+ ],
1309
+ "meta": {
1310
+ "vid": "7162680320",
1311
+ "frame_count": 407,
1312
+ "start_frame": 300,
1313
+ "end_frame": 407
1314
+ }
1315
+ },
1316
+ {
1317
+ "conversations": [
1318
+ {
1319
+ "from": "human",
1320
+ "value": "Who is away by adult?"
1321
+ },
1322
+ {
1323
+ "from": "gpt",
1324
+ "value": "In the video, there is an adult in white away a squatting adult. The target object, the adult, is initially located at position [296, 66, 449, 479] in frame 192 and stops appearing at position [380, 82, 608, 479] in frame 312."
1325
+ }
1326
+ ],
1327
+ "meta": {
1328
+ "vid": "3347531832",
1329
+ "frame_count": 863,
1330
+ "start_frame": 192,
1331
+ "end_frame": 312
1332
+ }
1333
+ },
1334
+ {
1335
+ "conversations": [
1336
+ {
1337
+ "from": "human",
1338
+ "value": "What does the adult in grey clothes carry?"
1339
+ },
1340
+ {
1341
+ "from": "gpt",
1342
+ "value": "The adult in grey clothes is carrying a handbag. The adult is initially located at position [40, 87, 200, 479] in frame 0 and stops appearing at position [0, 0, 619, 479] in frame 122."
1343
+ }
1344
+ ],
1345
+ "meta": {
1346
+ "vid": "5988355975",
1347
+ "frame_count": 224,
1348
+ "start_frame": 0,
1349
+ "end_frame": 122
1350
+ }
1351
+ },
1352
+ {
1353
+ "conversations": [
1354
+ {
1355
+ "from": "human",
1356
+ "value": "Who leans on the chair?"
1357
+ },
1358
+ {
1359
+ "from": "gpt",
1360
+ "value": "In the video, an adult in black clothes is leaning on a chair. The adult is initially positioned at [108, 264, 317, 479] in frame 58 and stops appearing at position [87, 0, 330, 432] in frame 210."
1361
+ }
1362
+ ],
1363
+ "meta": {
1364
+ "vid": "6140084750",
1365
+ "frame_count": 210,
1366
+ "start_frame": 58,
1367
+ "end_frame": 210
1368
+ }
1369
+ },
1370
+ {
1371
+ "conversations": [
1372
+ {
1373
+ "from": "human",
1374
+ "value": "Who is in front of the adult in gray?"
1375
+ },
1376
+ {
1377
+          "from": "gpt",
1378
+ "value": "In the video, the child with a dress is in front of the adult in gray. The child is initially located at position [388, 248, 511, 359] in frame 94 and stops appearing at position [432, 231, 545, 359] in frame 173."
1379
+ }
1380
+ ],
1381
+ "meta": {
1382
+ "vid": "7604112162",
1383
+ "frame_count": 398,
1384
+ "start_frame": 94,
1385
+ "end_frame": 173
1386
+ }
1387
+ },
1388
+ {
1389
+ "conversations": [
1390
+ {
1391
+ "from": "human",
1392
+ "value": "What is above the blue table?"
1393
+ },
1394
+ {
1395
+ "from": "gpt",
1396
+ "value": "In the video, there is a brown cake above a blue table. The cake is initially located at position [80, 164, 364, 328] in frame 0 and stops appearing at position [77, 153, 376, 313] in frame 792."
1397
+ }
1398
+ ],
1399
+ "meta": {
1400
+ "vid": "4748191834",
1401
+ "frame_count": 798,
1402
+ "start_frame": 0,
1403
+ "end_frame": 792
1404
+ }
1405
+ },
1406
+ {
1407
+ "conversations": [
1408
+ {
1409
+ "from": "human",
1410
+ "value": "Who is above the sink in the bathroom?"
1411
+ },
1412
+ {
1413
+ "from": "gpt",
1414
+ "value": "In the video, there is a sink located at position [0, 105, 585, 359] in frame 0 and it stops appearing at position [0, 30, 639, 359] in frame 325. There is a baby above the sink in the bathroom."
1415
+ }
1416
+ ],
1417
+ "meta": {
1418
+ "vid": "5561006511",
1419
+ "frame_count": 530,
1420
+ "start_frame": 0,
1421
+ "end_frame": 325
1422
+ }
1423
+ },
1424
+ {
1425
+ "conversations": [
1426
+ {
1427
+ "from": "human",
1428
+ "value": "What is away from the child?"
1429
+ },
1430
+ {
1431
+ "from": "gpt",
1432
+ "value": "In the video, there is a purple ball away from the child. The ball is initially located at [191, 95, 251, 154] in frame 0 and stops appearing at position [615, 202, 639, 245] in frame 67."
1433
+ }
1434
+ ],
1435
+ "meta": {
1436
+ "vid": "7368321214",
1437
+ "frame_count": 90,
1438
+ "start_frame": 0,
1439
+ "end_frame": 67
1440
+ }
1441
+ },
1442
+ {
1443
+ "conversations": [
1444
+ {
1445
+ "from": "human",
1446
+ "value": "Who rides the ski in the snow?"
1447
+ },
1448
+ {
1449
+ "from": "gpt",
1450
+ "value": "In the video, an adult in black clothes rides a ski in the snow. The adult is initially located at position [323, 161, 352, 242] in frame 87 and stops appearing at position [0, 0, 252, 479] in frame 330."
1451
+ }
1452
+ ],
1453
+ "meta": {
1454
+ "vid": "8728461458",
1455
+ "frame_count": 420,
1456
+ "start_frame": 87,
1457
+ "end_frame": 330
1458
+ }
1459
+ },
1460
+ {
1461
+ "conversations": [
1462
+ {
1463
+ "from": "human",
1464
+ "value": "What is beneath the child in blue on the water?"
1465
+ },
1466
+ {
1467
+ "from": "gpt",
1468
+ "value": "In the video, the child in blue is initially located at position [327, 220, 364, 288] in frame 0 and stops appearing at position [294, 149, 380, 296] in frame 906. Beneath the child in blue on the water is a black surfboard. Therefore, the answer is a black surfboard."
1469
+ }
1470
+ ],
1471
+ "meta": {
1472
+ "vid": "4807956569",
1473
+ "frame_count": 906,
1474
+ "start_frame": 0,
1475
+ "end_frame": 906
1476
+ }
1477
+ },
1478
+ {
1479
+ "conversations": [
1480
+ {
1481
+ "from": "human",
1482
+ "value": "What is beneath the child wearing the blue hat?"
1483
+ },
1484
+ {
1485
+ "from": "gpt",
1486
+ "value": "In frame 1320, the child wearing a blue hat is located at position [249, 54, 319, 239] which is above a skateboard. As the child moves, in frame 1612, the child stops appearing at position [169, 0, 293, 318]. Therefore, beneath the child wearing the blue hat is a skateboard."
1487
+ }
1488
+ ],
1489
+ "meta": {
1490
+ "vid": "4805511103",
1491
+ "frame_count": 1621,
1492
+ "start_frame": 1320,
1493
+ "end_frame": 1612
1494
+ }
1495
+ },
1496
+ {
1497
+ "conversations": [
1498
+ {
1499
+ "from": "human",
1500
+ "value": "What is away from the car?"
1501
+ },
1502
+ {
1503
+ "from": "gpt",
1504
+ "value": "In the video, there is a black dog away from a car. The dog is initially located at position [242, 126, 305, 158] in frame 840 and stops appearing at position [276, 99, 319, 213] in frame 990. Therefore, the object away from the car is a black dog."
1505
+ }
1506
+ ],
1507
+ "meta": {
1508
+ "vid": "3190710669",
1509
+ "frame_count": 1064,
1510
+ "start_frame": 840,
1511
+ "end_frame": 990
1512
+ }
1513
+ },
1514
+ {
1515
+ "conversations": [
1516
+ {
1517
+ "from": "human",
1518
+ "value": "Who is the white dog away from?"
1519
+ },
1520
+ {
1521
+ "from": "gpt",
1522
+ "value": "In the video, there is a white dog [0, 139, 245, 359] away from a child with blond-hair. The white dog remains at [0, 139, 245, 359] from frame 770 to frame 811. Therefore, the white dog is away from the child with blond-hair."
1523
+ }
1524
+ ],
1525
+ "meta": {
1526
+ "vid": "8212773822",
1527
+ "frame_count": 1461,
1528
+ "start_frame": 770,
1529
+ "end_frame": 811
1530
+ }
1531
+ },
1532
+ {
1533
+ "conversations": [
1534
+ {
1535
+ "from": "human",
1536
+ "value": "What is the sofa towards to in the room?"
1537
+ },
1538
+ {
1539
+ "from": "gpt",
1540
+ "value": "The sofa in the room is initially located towards a screen at position [0, 142, 123, 479] in frame 540. It stops appearing towards the screen at position [0, 134, 100, 244] in frame 630."
1541
+ }
1542
+ ],
1543
+ "meta": {
1544
+ "vid": "5220256786",
1545
+ "frame_count": 2100,
1546
+ "start_frame": 540,
1547
+ "end_frame": 630
1548
+ }
1549
+ },
1550
+ {
1551
+ "conversations": [
1552
+ {
1553
+ "from": "human",
1554
+ "value": "Who is towards the adult outdoors?"
1555
+ },
1556
+ {
1557
+ "from": "gpt",
1558
+ "value": "In the video, a child in black is towards an adult outdoors. The child is initially located at position [324, 168, 391, 344] in frame 120 and stops appearing at position [207, 180, 239, 254] in frame 210."
1559
+ }
1560
+ ],
1561
+ "meta": {
1562
+ "vid": "8577770314",
1563
+ "frame_count": 2644,
1564
+ "start_frame": 120,
1565
+ "end_frame": 210
1566
+ }
1567
+ },
1568
+ {
1569
+ "conversations": [
1570
+ {
1571
+ "from": "human",
1572
+ "value": "Who is inside the car on the road?"
1573
+ },
1574
+ {
1575
+ "from": "gpt",
1576
+ "value": "In the video, an adult with sunglasses is inside a car on the road. The adult is initially located at position [257, 84, 442, 284] in frame 0 and stops appearing at position [135, 135, 240, 337] in frame 830."
1577
+ }
1578
+ ],
1579
+ "meta": {
1580
+ "vid": "2746688672",
1581
+ "frame_count": 1103,
1582
+ "start_frame": 0,
1583
+ "end_frame": 830
1584
+ }
1585
+ },
1586
+ {
1587
+ "conversations": [
1588
+ {
1589
+ "from": "human",
1590
+ "value": "What is behind the child in pink?"
1591
+ },
1592
+ {
1593
+ "from": "gpt",
1594
+ "value": "In frame 557, the chair is located behind the child in pink at position [0, 0, 168, 168]. The chair continues to be behind the child in pink until frame 669, where it is positioned at [373, 85, 479, 271]. Therefore, the answer is 'the chair'."
1595
+ }
1596
+ ],
1597
+ "meta": {
1598
+ "vid": "4930421543",
1599
+ "frame_count": 794,
1600
+ "start_frame": 557,
1601
+ "end_frame": 669
1602
+ }
1603
+ },
1604
+ {
1605
+ "conversations": [
1606
+ {
1607
+ "from": "human",
1608
+ "value": "What does the child in gray hold at home?"
1609
+ },
1610
+ {
1611
+ "from": "gpt",
1612
+ "value": "In frame 654, the child in gray is located at position [0, 0, 599, 359]. The child holds a silver cellphone at home. The child stops appearing at position [248, 106, 382, 359] in frame 718."
1613
+ }
1614
+ ],
1615
+ "meta": {
1616
+ "vid": "3152504882",
1617
+ "frame_count": 2403,
1618
+ "start_frame": 654,
1619
+ "end_frame": 718
1620
+ }
1621
+ },
1622
+ {
1623
+ "conversations": [
1624
+ {
1625
+ "from": "human",
1626
+ "value": "What does the child in gray hold at home?"
1627
+ },
1628
+ {
1629
+ "from": "gpt",
1630
+ "value": "In frame 831, the child in gray is located at [0, 0, 599, 359] and holds a silver cellphone at home. The child stops appearing at position [248, 106, 382, 359] in frame 913."
1631
+ }
1632
+ ],
1633
+ "meta": {
1634
+ "vid": "3152504882",
1635
+ "frame_count": 2403,
1636
+ "start_frame": 831,
1637
+ "end_frame": 913
1638
+ }
1639
+ },
1640
+ {
1641
+ "conversations": [
1642
+ {
1643
+ "from": "human",
1644
+ "value": "What does the child in gray hold at home?"
1645
+ },
1646
+ {
1647
+ "from": "gpt",
1648
+ "value": "In the video, a child in gray holds a silver cellphone at home. The child is initially located at [0, 0, 599, 359] in frame 985 and stops appearing at position [248, 106, 382, 359] in frame 1065."
1649
+ }
1650
+ ],
1651
+ "meta": {
1652
+ "vid": "3152504882",
1653
+ "frame_count": 2403,
1654
+ "start_frame": 985,
1655
+ "end_frame": 1065
1656
+ }
1657
+ },
1658
+ {
1659
+ "conversations": [
1660
+ {
1661
+ "from": "human",
1662
+ "value": "What does the child in gray hold at home?"
1663
+ },
1664
+ {
1665
+ "from": "gpt",
1666
+ "value": "In the video, the child in gray holds a silver cellphone at home. The child is initially located at position [0, 0, 599, 359] in frame 1088 and stops appearing at position [248, 106, 382, 359] in frame 1160."
1667
+ }
1668
+ ],
1669
+ "meta": {
1670
+ "vid": "3152504882",
1671
+ "frame_count": 2403,
1672
+ "start_frame": 1088,
1673
+ "end_frame": 1160
1674
+ }
1675
+ },
1676
+ {
1677
+ "conversations": [
1678
+ {
1679
+ "from": "human",
1680
+ "value": "What does the child in gray hold at home?"
1681
+ },
1682
+ {
1683
+ "from": "gpt",
1684
+ "value": "The child in gray holds a silver cellphone at home. The child is initially located at [0, 0, 599, 359] in frame 1177 and stops appearing at position [248, 106, 382, 359] in frame 1563."
1685
+ }
1686
+ ],
1687
+ "meta": {
1688
+ "vid": "3152504882",
1689
+ "frame_count": 2403,
1690
+ "start_frame": 1177,
1691
+ "end_frame": 1563
1692
+ }
1693
+ },
1694
+ {
1695
+ "conversations": [
1696
+ {
1697
+ "from": "human",
1698
+ "value": "Who caresses the black cat?"
1699
+ },
1700
+ {
1701
+ "from": "gpt",
1702
+ "value": "In the video, an adult in striped clothes is caressing the black cat. The adult is initially located at position [538, 86, 639, 357] in frame 150 and stops appearing at position [571, 54, 639, 332] in frame 273."
1703
+ }
1704
+ ],
1705
+ "meta": {
1706
+ "vid": "7499914046",
1707
+ "frame_count": 1760,
1708
+ "start_frame": 150,
1709
+ "end_frame": 273
1710
+ }
1711
+ },
1712
+ {
1713
+ "conversations": [
1714
+ {
1715
+ "from": "human",
1716
+ "value": "Who holds hand of the other adult?"
1717
+ },
1718
+ {
1719
+ "from": "gpt",
1720
+ "value": "In the video, there is an adult in red clothes holding hands with another adult. The adult holding the hand of the other adult is initially located at position [105, 118, 339, 479] in frame 0 and stops appearing at position [517, 45, 639, 479] in frame 87. Therefore, the answer is the adult in red clothes."
1721
+ }
1722
+ ],
1723
+ "meta": {
1724
+ "vid": "5076916325",
1725
+ "frame_count": 315,
1726
+ "start_frame": 0,
1727
+ "end_frame": 87
1728
+ }
1729
+ },
1730
+ {
1731
+ "conversations": [
1732
+ {
1733
+ "from": "human",
1734
+ "value": "who is in front of the adult in white?"
1735
+ },
1736
+ {
1737
+ "from": "gpt",
1738
+ "value": "The child in gray is in front of the adult in white. The child is initially located at position [0, 152, 36, 359] in frame 0 and stops appearing at position [0, 152, 49, 359] in frame 181."
1739
+ }
1740
+ ],
1741
+ "meta": {
1742
+ "vid": "12064423875",
1743
+ "frame_count": 377,
1744
+ "start_frame": 0,
1745
+ "end_frame": 181
1746
+ }
1747
+ },
1748
+ {
1749
+ "conversations": [
1750
+ {
1751
+ "from": "human",
1752
+ "value": "What is in front of the moving adult?"
1753
+ },
1754
+ {
1755
+ "from": "gpt",
1756
+ "value": "In the video, there is a gray car in front of the moving adult. The car's trajectory starts at [599, 180, 639, 213] in frame 360 and ends at [599, 215, 639, 249] in frame 630."
1757
+ }
1758
+ ],
1759
+ "meta": {
1760
+ "vid": "5148466016",
1761
+ "frame_count": 2535,
1762
+ "start_frame": 360,
1763
+ "end_frame": 630
1764
+ }
1765
+ },
1766
+ {
1767
+ "conversations": [
1768
+ {
1769
+ "from": "human",
1770
+ "value": "Who is the child towards to at home?"
1771
+ },
1772
+ {
1773
+ "from": "gpt",
1774
+ "value": "In the video, the child is initially located at [177, 23, 248, 239] in frame 1020 and is seen moving towards an adult at home. The child stops appearing at position [180, 34, 253, 239] in frame 1110. Therefore, the child is towards an adult at home."
1775
+ }
1776
+ ],
1777
+ "meta": {
1778
+ "vid": "3609482906",
1779
+ "frame_count": 1149,
1780
+ "start_frame": 1020,
1781
+ "end_frame": 1110
1782
+ }
1783
+ },
1784
+ {
1785
+ "conversations": [
1786
+ {
1787
+ "from": "human",
1788
+ "value": "Who closes the laptop in the study?"
1789
+ },
1790
+ {
1791
+ "from": "gpt",
1792
+ "value": "In the video, a baby is closing a laptop in the study. The baby is initially at position [80, 0, 639, 479] in frame 19 and stops appearing at position [331, 55, 639, 479] in frame 63."
1793
+ }
1794
+ ],
1795
+ "meta": {
1796
+ "vid": "3096078017",
1797
+ "frame_count": 1281,
1798
+ "start_frame": 19,
1799
+ "end_frame": 63
1800
+ }
1801
+ }
1802
+ ]