diff --git "a/eval_milebench/ActionSequence/pred_with_extracted.json" "b/eval_milebench/ActionSequence/pred_with_extracted.json" new file mode 100644--- /dev/null +++ "b/eval_milebench/ActionSequence/pred_with_extracted.json" @@ -0,0 +1,10591 @@ +[ + { + "sample_id": 0, + "question": "Assess the progression of actions in the presented graphics and answer the connected query. You must choose your answer from the Choice List.\nWhat happened after the person took the food?\nChoice list: \nA. Ate the medicine.\nB. Tidied up the blanket.\nC. Put down the cup/glass/bottle.\nD. Took the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "ate the medicine", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "ate the medicine", + "tidied up the blanket", + "put down the cup glass bottle", + "took the box" + ], + "image_quantity_level": "Many", + "image": [ + "ZS9XR/ZS9XR_0.jpeg", + "ZS9XR/ZS9XR_1.jpeg", + "ZS9XR/ZS9XR_2.jpeg", + "ZS9XR/ZS9XR_3.jpeg", + "ZS9XR/ZS9XR_4.jpeg", + "ZS9XR/ZS9XR_5.jpeg", + "ZS9XR/ZS9XR_6.jpeg", + "ZS9XR/ZS9XR_7.jpeg", + "ZS9XR/ZS9XR_8.jpeg", + "ZS9XR/ZS9XR_9.jpeg", + "ZS9XR/ZS9XR_10.jpeg", + "ZS9XR/ZS9XR_11.jpeg", + "ZS9XR/ZS9XR_12.jpeg", + "ZS9XR/ZS9XR_13.jpeg", + "ZS9XR/ZS9XR_14.jpeg", + "ZS9XR/ZS9XR_15.jpeg", + "ZS9XR/ZS9XR_16.jpeg", + "ZS9XR/ZS9XR_17.jpeg", + "ZS9XR/ZS9XR_18.jpeg", + "ZS9XR/ZS9XR_19.jpeg", + "ZS9XR/ZS9XR_20.jpeg", + "ZS9XR/ZS9XR_21.jpeg", + "ZS9XR/ZS9XR_22.jpeg", + "ZS9XR/ZS9XR_23.jpeg", + "ZS9XR/ZS9XR_24.jpeg", + "ZS9XR/ZS9XR_25.jpeg", + "ZS9XR/ZS9XR_26.jpeg", + "ZS9XR/ZS9XR_27.jpeg", + "ZS9XR/ZS9XR_28.jpeg", + "ZS9XR/ZS9XR_29.jpeg", + "ZS9XR/ZS9XR_30.jpeg", + "ZS9XR/ZS9XR_31.jpeg", + "ZS9XR/ZS9XR_32.jpeg", + "ZS9XR/ZS9XR_33.jpeg", + "ZS9XR/ZS9XR_34.jpeg", + "ZS9XR/ZS9XR_35.jpeg", + "ZS9XR/ZS9XR_36.jpeg", + "ZS9XR/ZS9XR_37.jpeg", + "ZS9XR/ZS9XR_38.jpeg", + "ZS9XR/ZS9XR_39.jpeg", + "ZS9XR/ZS9XR_40.jpeg", + "ZS9XR/ZS9XR_41.jpeg", + "ZS9XR/ZS9XR_42.jpeg", + "ZS9XR/ZS9XR_43.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 36, + "question": "Evaluate the sequence of actions depicted in the images and provide an answer to the associated question. You must choose your answer from the Choice List.\nWhat happened after the person took the food?\nChoice list: \nA. Ate the medicine.\nB. Tidied up the blanket.\nC. Put down the cup/glass/bottle.\nD. Took the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "ate the medicine", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "ate the medicine", + "tidied up the blanket", + "put down the cup glass bottle", + "took the box" + ], + "image_quantity_level": "Many", + "image": [ + "ZS9XR/ZS9XR_0.jpeg", + "ZS9XR/ZS9XR_1.jpeg", + "ZS9XR/ZS9XR_2.jpeg", + "ZS9XR/ZS9XR_3.jpeg", + "ZS9XR/ZS9XR_4.jpeg", + "ZS9XR/ZS9XR_5.jpeg", + "ZS9XR/ZS9XR_6.jpeg", + "ZS9XR/ZS9XR_7.jpeg", + "ZS9XR/ZS9XR_8.jpeg", + "ZS9XR/ZS9XR_9.jpeg", + "ZS9XR/ZS9XR_10.jpeg", + "ZS9XR/ZS9XR_11.jpeg", + "ZS9XR/ZS9XR_12.jpeg", + "ZS9XR/ZS9XR_13.jpeg", + "ZS9XR/ZS9XR_14.jpeg", + "ZS9XR/ZS9XR_15.jpeg", + "ZS9XR/ZS9XR_16.jpeg", + "ZS9XR/ZS9XR_17.jpeg", + "ZS9XR/ZS9XR_18.jpeg", + "ZS9XR/ZS9XR_19.jpeg", + "ZS9XR/ZS9XR_20.jpeg", + "ZS9XR/ZS9XR_21.jpeg", + "ZS9XR/ZS9XR_22.jpeg", + "ZS9XR/ZS9XR_23.jpeg", + "ZS9XR/ZS9XR_24.jpeg", + "ZS9XR/ZS9XR_25.jpeg", + "ZS9XR/ZS9XR_26.jpeg", + "ZS9XR/ZS9XR_27.jpeg", + "ZS9XR/ZS9XR_28.jpeg", + "ZS9XR/ZS9XR_29.jpeg", + "ZS9XR/ZS9XR_30.jpeg", + "ZS9XR/ZS9XR_31.jpeg", + "ZS9XR/ZS9XR_32.jpeg", + "ZS9XR/ZS9XR_33.jpeg", + "ZS9XR/ZS9XR_34.jpeg", + "ZS9XR/ZS9XR_35.jpeg", + "ZS9XR/ZS9XR_36.jpeg", + "ZS9XR/ZS9XR_37.jpeg", + "ZS9XR/ZS9XR_38.jpeg", + "ZS9XR/ZS9XR_39.jpeg", + "ZS9XR/ZS9XR_40.jpeg", + "ZS9XR/ZS9XR_41.jpeg", + "ZS9XR/ZS9XR_42.jpeg", + "ZS9XR/ZS9XR_43.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 1, + "question": "Evaluate the sequence of actions depicted in the images and provide an answer to the associated question. You must choose your answer from the Choice List.\nWhat happened before the person watched at the book?\nChoice list: \nA. Tidied up the table.\nB. Took the phone/camera.\nC. Opened the closet/cabinet.\nD. Washed the table.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "opened the closet cabinet", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "tidied up the table", + "took the phone camera", + "opened the closet cabinet", + "washed the table" + ], + "image_quantity_level": "Medium", + "image": [ + "EY6P4/EY6P4_0.jpeg", + "EY6P4/EY6P4_1.jpeg", + "EY6P4/EY6P4_2.jpeg", + "EY6P4/EY6P4_3.jpeg", + "EY6P4/EY6P4_4.jpeg", + "EY6P4/EY6P4_5.jpeg", + "EY6P4/EY6P4_6.jpeg", + "EY6P4/EY6P4_7.jpeg", + "EY6P4/EY6P4_8.jpeg", + "EY6P4/EY6P4_9.jpeg", + "EY6P4/EY6P4_10.jpeg", + "EY6P4/EY6P4_11.jpeg", + "EY6P4/EY6P4_12.jpeg", + "EY6P4/EY6P4_13.jpeg", + "EY6P4/EY6P4_14.jpeg", + "EY6P4/EY6P4_15.jpeg", + "EY6P4/EY6P4_16.jpeg", + "EY6P4/EY6P4_17.jpeg", + "EY6P4/EY6P4_18.jpeg", + "EY6P4/EY6P4_19.jpeg", + "EY6P4/EY6P4_20.jpeg", + "EY6P4/EY6P4_21.jpeg", + "EY6P4/EY6P4_22.jpeg", + "EY6P4/EY6P4_23.jpeg", + "EY6P4/EY6P4_24.jpeg", + "EY6P4/EY6P4_25.jpeg", + "EY6P4/EY6P4_26.jpeg", + "EY6P4/EY6P4_27.jpeg", + "EY6P4/EY6P4_28.jpeg", + "EY6P4/EY6P4_29.jpeg", + "EY6P4/EY6P4_30.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 4, + "question": "Based on the provided images, answer the question related to the sequence of action You must choose your answer from the Choice List.\nWhat happened after the person held the shoe?\nChoice list: \nA. Opened the closet/cabinet.\nB. Threw the broom.\nC. Sat on the sofa/couch.\nD. Washed the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sat on the sofa couch", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "opened the closet cabinet", + "threw the broom", + "sat on the sofa couch", + "washed the clothes" + ], + "image_quantity_level": "Medium", + "image": [ + "9LHP3/9LHP3_0.jpeg", + "9LHP3/9LHP3_1.jpeg", + "9LHP3/9LHP3_2.jpeg", + "9LHP3/9LHP3_3.jpeg", + "9LHP3/9LHP3_4.jpeg", + "9LHP3/9LHP3_5.jpeg", + "9LHP3/9LHP3_6.jpeg", + "9LHP3/9LHP3_7.jpeg", + "9LHP3/9LHP3_8.jpeg", + "9LHP3/9LHP3_9.jpeg", + "9LHP3/9LHP3_10.jpeg", + "9LHP3/9LHP3_11.jpeg", + "9LHP3/9LHP3_12.jpeg", + "9LHP3/9LHP3_13.jpeg", + "9LHP3/9LHP3_14.jpeg", + "9LHP3/9LHP3_15.jpeg", + "9LHP3/9LHP3_16.jpeg", + "9LHP3/9LHP3_17.jpeg", + "9LHP3/9LHP3_18.jpeg", + "9LHP3/9LHP3_19.jpeg", + "9LHP3/9LHP3_20.jpeg", + "9LHP3/9LHP3_21.jpeg", + "9LHP3/9LHP3_22.jpeg", + "9LHP3/9LHP3_23.jpeg", + "9LHP3/9LHP3_24.jpeg", + "9LHP3/9LHP3_25.jpeg", + "9LHP3/9LHP3_26.jpeg", + "9LHP3/9LHP3_27.jpeg", + "9LHP3/9LHP3_28.jpeg", + "9LHP3/9LHP3_29.jpeg", + "9LHP3/9LHP3_30.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 13, + "question": "Review the timeline of actions in the supplied imagery and answer the associated inquiry. You must choose your answer from the Choice List.\nWhat happened after the person held the dish?\nChoice list: \nA. Sat on the bed.\nB. Ate the sandwich.\nC. Put down the food.\nD. Took the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the food", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "sat on the bed", + "ate the sandwich", + "put down the food", + "took the food" + ], + "image_quantity_level": "Medium", + "image": [ + "KFGXC/KFGXC_0.jpeg", + "KFGXC/KFGXC_1.jpeg", + "KFGXC/KFGXC_2.jpeg", + "KFGXC/KFGXC_3.jpeg", + "KFGXC/KFGXC_4.jpeg", + "KFGXC/KFGXC_5.jpeg", + "KFGXC/KFGXC_6.jpeg", + "KFGXC/KFGXC_7.jpeg", + "KFGXC/KFGXC_8.jpeg", + "KFGXC/KFGXC_9.jpeg", + "KFGXC/KFGXC_10.jpeg", + "KFGXC/KFGXC_11.jpeg", + "KFGXC/KFGXC_12.jpeg", + "KFGXC/KFGXC_13.jpeg", + "KFGXC/KFGXC_14.jpeg", + "KFGXC/KFGXC_15.jpeg", + "KFGXC/KFGXC_16.jpeg", + "KFGXC/KFGXC_17.jpeg", + "KFGXC/KFGXC_18.jpeg", + "KFGXC/KFGXC_19.jpeg", + "KFGXC/KFGXC_20.jpeg", + "KFGXC/KFGXC_21.jpeg", + "KFGXC/KFGXC_22.jpeg", + "KFGXC/KFGXC_23.jpeg", + "KFGXC/KFGXC_24.jpeg", + "KFGXC/KFGXC_25.jpeg", + "KFGXC/KFGXC_26.jpeg", + "KFGXC/KFGXC_27.jpeg", + "KFGXC/KFGXC_28.jpeg", + "KFGXC/KFGXC_29.jpeg", + "KFGXC/KFGXC_30.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 22, + "question": "Inspect the flow of events in the presented pictures and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person held the medicine?\nChoice list: \nA. Took the dish.\nB. Opened the box.\nC. Threw the box.\nD. Washed the window.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "opened the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "took the dish", + "opened the box", + "threw the box", + "washed the window" + ], + "image_quantity_level": "Medium", + "image": [ + "MD6P2/MD6P2_0.jpeg", + "MD6P2/MD6P2_1.jpeg", + "MD6P2/MD6P2_2.jpeg", + "MD6P2/MD6P2_3.jpeg", + "MD6P2/MD6P2_4.jpeg", + "MD6P2/MD6P2_5.jpeg", + "MD6P2/MD6P2_6.jpeg", + "MD6P2/MD6P2_7.jpeg", + "MD6P2/MD6P2_8.jpeg", + "MD6P2/MD6P2_9.jpeg", + "MD6P2/MD6P2_10.jpeg", + "MD6P2/MD6P2_11.jpeg", + "MD6P2/MD6P2_12.jpeg", + "MD6P2/MD6P2_13.jpeg", + "MD6P2/MD6P2_14.jpeg", + "MD6P2/MD6P2_15.jpeg", + "MD6P2/MD6P2_16.jpeg", + "MD6P2/MD6P2_17.jpeg", + "MD6P2/MD6P2_18.jpeg", + "MD6P2/MD6P2_19.jpeg", + "MD6P2/MD6P2_20.jpeg", + "MD6P2/MD6P2_21.jpeg", + "MD6P2/MD6P2_22.jpeg", + "MD6P2/MD6P2_23.jpeg", + "MD6P2/MD6P2_24.jpeg", + "MD6P2/MD6P2_25.jpeg", + "MD6P2/MD6P2_26.jpeg", + "MD6P2/MD6P2_27.jpeg", + "MD6P2/MD6P2_28.jpeg", + "MD6P2/MD6P2_29.jpeg", + "MD6P2/MD6P2_30.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 24, + "question": "Assess the progression of actions in the presented graphics and answer the connected query. You must choose your answer from the Choice List.\nWhat happened after the person drank from the cup/glass/bottle?\nChoice list: \nA. Put down the laptop.\nB. Put down the cup/glass/bottle.\nC. Put down the phone/camera.\nD. Opened the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "opened the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "put down the laptop", + "put down the cup glass bottle", + "put down the phone camera", + "opened the book" + ], + "image_quantity_level": "Medium", + "image": [ + "8IPWO/8IPWO_0.jpeg", + "8IPWO/8IPWO_1.jpeg", + "8IPWO/8IPWO_2.jpeg", + "8IPWO/8IPWO_3.jpeg", + "8IPWO/8IPWO_4.jpeg", + "8IPWO/8IPWO_5.jpeg", + "8IPWO/8IPWO_6.jpeg", + "8IPWO/8IPWO_7.jpeg", + "8IPWO/8IPWO_8.jpeg", + "8IPWO/8IPWO_9.jpeg", + "8IPWO/8IPWO_10.jpeg", + "8IPWO/8IPWO_11.jpeg", + "8IPWO/8IPWO_12.jpeg", + "8IPWO/8IPWO_13.jpeg", + "8IPWO/8IPWO_14.jpeg", + "8IPWO/8IPWO_15.jpeg", + "8IPWO/8IPWO_16.jpeg", + "8IPWO/8IPWO_17.jpeg", + "8IPWO/8IPWO_18.jpeg", + "8IPWO/8IPWO_19.jpeg", + "8IPWO/8IPWO_20.jpeg", + "8IPWO/8IPWO_21.jpeg", + "8IPWO/8IPWO_22.jpeg", + "8IPWO/8IPWO_23.jpeg", + "8IPWO/8IPWO_24.jpeg", + "8IPWO/8IPWO_25.jpeg", + "8IPWO/8IPWO_26.jpeg", + "8IPWO/8IPWO_27.jpeg", + "8IPWO/8IPWO_28.jpeg", + "8IPWO/8IPWO_29.jpeg", + "8IPWO/8IPWO_30.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 33, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened after the person opened the bag?\nChoice list: \nA. Put down the book.\nB. Opened the book.\nC. Took the book.\nD. Put down the blanket.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down the book", + "opened the book", + "took the book", + "put down the blanket" + ], + "image_quantity_level": "Medium", + "image": [ + "4P13T/4P13T_0.jpeg", + "4P13T/4P13T_1.jpeg", + "4P13T/4P13T_2.jpeg", + "4P13T/4P13T_3.jpeg", + "4P13T/4P13T_4.jpeg", + "4P13T/4P13T_5.jpeg", + "4P13T/4P13T_6.jpeg", + "4P13T/4P13T_7.jpeg", + "4P13T/4P13T_8.jpeg", + "4P13T/4P13T_9.jpeg", + "4P13T/4P13T_10.jpeg", + "4P13T/4P13T_11.jpeg", + "4P13T/4P13T_12.jpeg", + "4P13T/4P13T_13.jpeg", + "4P13T/4P13T_14.jpeg", + "4P13T/4P13T_15.jpeg", + "4P13T/4P13T_16.jpeg", + "4P13T/4P13T_17.jpeg", + "4P13T/4P13T_18.jpeg", + "4P13T/4P13T_19.jpeg", + "4P13T/4P13T_20.jpeg", + "4P13T/4P13T_21.jpeg", + "4P13T/4P13T_22.jpeg", + "4P13T/4P13T_23.jpeg", + "4P13T/4P13T_24.jpeg", + "4P13T/4P13T_25.jpeg", + "4P13T/4P13T_26.jpeg", + "4P13T/4P13T_27.jpeg", + "4P13T/4P13T_28.jpeg", + "4P13T/4P13T_29.jpeg", + "4P13T/4P13T_30.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 2, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened before the person watched at the book?\nChoice list: \nA. Put down the sandwich.\nB. Washed the table.\nC. Opened the closet/cabinet.\nD. Put down the pillow.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "opened the closet cabinet", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down the sandwich", + "washed the table", + "opened the closet cabinet", + "put down the pillow" + ], + "image_quantity_level": "Many", + "image": [ + "WBS4I/WBS4I_0.jpeg", + "WBS4I/WBS4I_1.jpeg", + "WBS4I/WBS4I_2.jpeg", + "WBS4I/WBS4I_3.jpeg", + "WBS4I/WBS4I_4.jpeg", + "WBS4I/WBS4I_5.jpeg", + "WBS4I/WBS4I_6.jpeg", + "WBS4I/WBS4I_7.jpeg", + "WBS4I/WBS4I_8.jpeg", + "WBS4I/WBS4I_9.jpeg", + "WBS4I/WBS4I_10.jpeg", + "WBS4I/WBS4I_11.jpeg", + "WBS4I/WBS4I_12.jpeg", + "WBS4I/WBS4I_13.jpeg", + "WBS4I/WBS4I_14.jpeg", + "WBS4I/WBS4I_15.jpeg", + "WBS4I/WBS4I_16.jpeg", + "WBS4I/WBS4I_17.jpeg", + "WBS4I/WBS4I_18.jpeg", + "WBS4I/WBS4I_19.jpeg", + "WBS4I/WBS4I_20.jpeg", + "WBS4I/WBS4I_21.jpeg", + "WBS4I/WBS4I_22.jpeg", + "WBS4I/WBS4I_23.jpeg", + "WBS4I/WBS4I_24.jpeg", + "WBS4I/WBS4I_25.jpeg", + "WBS4I/WBS4I_26.jpeg", + "WBS4I/WBS4I_27.jpeg", + "WBS4I/WBS4I_28.jpeg", + "WBS4I/WBS4I_29.jpeg", + "WBS4I/WBS4I_30.jpeg", + "WBS4I/WBS4I_31.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 14, + "question": "Analyze the series of events in the provided pictures and answer the related inquiry. You must choose your answer from the Choice List.\nWhat happened after the person sat on the sofa/couch?\nChoice list: \nA. Ate the sandwich.\nB. Put down the pillow.\nC. Washed the clothes.\nD. Took the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "ate the sandwich", + "put down the pillow", + "washed the clothes", + "took the bag" + ], + "image_quantity_level": "Many", + "image": [ + "GKH0F/GKH0F_0.jpeg", + "GKH0F/GKH0F_1.jpeg", + "GKH0F/GKH0F_2.jpeg", + "GKH0F/GKH0F_3.jpeg", + "GKH0F/GKH0F_4.jpeg", + "GKH0F/GKH0F_5.jpeg", + "GKH0F/GKH0F_6.jpeg", + "GKH0F/GKH0F_7.jpeg", + "GKH0F/GKH0F_8.jpeg", + "GKH0F/GKH0F_9.jpeg", + "GKH0F/GKH0F_10.jpeg", + "GKH0F/GKH0F_11.jpeg", + "GKH0F/GKH0F_12.jpeg", + "GKH0F/GKH0F_13.jpeg", + "GKH0F/GKH0F_14.jpeg", + "GKH0F/GKH0F_15.jpeg", + "GKH0F/GKH0F_16.jpeg", + "GKH0F/GKH0F_17.jpeg", + "GKH0F/GKH0F_18.jpeg", + "GKH0F/GKH0F_19.jpeg", + "GKH0F/GKH0F_20.jpeg", + "GKH0F/GKH0F_21.jpeg", + "GKH0F/GKH0F_22.jpeg", + "GKH0F/GKH0F_23.jpeg", + "GKH0F/GKH0F_24.jpeg", + "GKH0F/GKH0F_25.jpeg", + "GKH0F/GKH0F_26.jpeg", + "GKH0F/GKH0F_27.jpeg", + "GKH0F/GKH0F_28.jpeg", + "GKH0F/GKH0F_29.jpeg", + "GKH0F/GKH0F_30.jpeg", + "GKH0F/GKH0F_31.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 44, + "question": "Analyze the series of events in the provided pictures and answer the related inquiry. You must choose your answer from the Choice List.\nWhat happened before the person took the cup/glass/bottle?\nChoice list: \nA. Put down the dish.\nB. Opened the book.\nC. Put down the phone/camera.\nD. Took the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the phone camera", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the dish", + "opened the book", + "put down the phone camera", + "took the food" + ], + "image_quantity_level": "Many", + "image": [ + "J7BOV/J7BOV_0.jpeg", + "J7BOV/J7BOV_1.jpeg", + "J7BOV/J7BOV_2.jpeg", + "J7BOV/J7BOV_3.jpeg", + "J7BOV/J7BOV_4.jpeg", + "J7BOV/J7BOV_5.jpeg", + "J7BOV/J7BOV_6.jpeg", + "J7BOV/J7BOV_7.jpeg", + "J7BOV/J7BOV_8.jpeg", + "J7BOV/J7BOV_9.jpeg", + "J7BOV/J7BOV_10.jpeg", + "J7BOV/J7BOV_11.jpeg", + "J7BOV/J7BOV_12.jpeg", + "J7BOV/J7BOV_13.jpeg", + "J7BOV/J7BOV_14.jpeg", + "J7BOV/J7BOV_15.jpeg", + "J7BOV/J7BOV_16.jpeg", + "J7BOV/J7BOV_17.jpeg", + "J7BOV/J7BOV_18.jpeg", + "J7BOV/J7BOV_19.jpeg", + "J7BOV/J7BOV_20.jpeg", + "J7BOV/J7BOV_21.jpeg", + "J7BOV/J7BOV_22.jpeg", + "J7BOV/J7BOV_23.jpeg", + "J7BOV/J7BOV_24.jpeg", + "J7BOV/J7BOV_25.jpeg", + "J7BOV/J7BOV_26.jpeg", + "J7BOV/J7BOV_27.jpeg", + "J7BOV/J7BOV_28.jpeg", + "J7BOV/J7BOV_29.jpeg", + "J7BOV/J7BOV_30.jpeg", + "J7BOV/J7BOV_31.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 82, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person tidied up the clothes?\nChoice list: \nA. Opened the bag.\nB. Took the sandwich.\nC. Took the blanket.\nD. Sat on the floor.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "opened the bag", + "took the sandwich", + "took the blanket", + "sat on the floor" + ], + "image_quantity_level": "Many", + "image": [ + "D1NT7/D1NT7_0.jpeg", + "D1NT7/D1NT7_1.jpeg", + "D1NT7/D1NT7_2.jpeg", + "D1NT7/D1NT7_3.jpeg", + "D1NT7/D1NT7_4.jpeg", + "D1NT7/D1NT7_5.jpeg", + "D1NT7/D1NT7_6.jpeg", + "D1NT7/D1NT7_7.jpeg", + "D1NT7/D1NT7_8.jpeg", + "D1NT7/D1NT7_9.jpeg", + "D1NT7/D1NT7_10.jpeg", + "D1NT7/D1NT7_11.jpeg", + "D1NT7/D1NT7_12.jpeg", + "D1NT7/D1NT7_13.jpeg", + "D1NT7/D1NT7_14.jpeg", + "D1NT7/D1NT7_15.jpeg", + "D1NT7/D1NT7_16.jpeg", + "D1NT7/D1NT7_17.jpeg", + "D1NT7/D1NT7_18.jpeg", + "D1NT7/D1NT7_19.jpeg", + "D1NT7/D1NT7_20.jpeg", + "D1NT7/D1NT7_21.jpeg", + "D1NT7/D1NT7_22.jpeg", + "D1NT7/D1NT7_23.jpeg", + "D1NT7/D1NT7_24.jpeg", + "D1NT7/D1NT7_25.jpeg", + "D1NT7/D1NT7_26.jpeg", + "D1NT7/D1NT7_27.jpeg", + "D1NT7/D1NT7_28.jpeg", + "D1NT7/D1NT7_29.jpeg", + "D1NT7/D1NT7_30.jpeg", + "D1NT7/D1NT7_31.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 3, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened after the person put down the clothes?\nChoice list: \nA. Took the broom.\nB. Took the blanket.\nC. Opened the door.\nD. Threw the towel.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "threw the towel", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "took the broom", + "took the blanket", + "opened the door", + "threw the towel" + ], + "image_quantity_level": "Medium", + "image": [ + "43FG9/43FG9_0.jpeg", + "43FG9/43FG9_1.jpeg", + "43FG9/43FG9_2.jpeg", + "43FG9/43FG9_3.jpeg", + "43FG9/43FG9_4.jpeg", + "43FG9/43FG9_5.jpeg", + "43FG9/43FG9_6.jpeg", + "43FG9/43FG9_7.jpeg", + "43FG9/43FG9_8.jpeg", + "43FG9/43FG9_9.jpeg", + "43FG9/43FG9_10.jpeg", + "43FG9/43FG9_11.jpeg", + "43FG9/43FG9_12.jpeg", + "43FG9/43FG9_13.jpeg", + "43FG9/43FG9_14.jpeg", + "43FG9/43FG9_15.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 5, + "question": "Observe the continuum of events in the provided pictorials and reply to the pertinent question. You must choose your answer from the Choice List.\nWhat happened after the person held the phone/camera?\nChoice list: \nA. Closed the refrigerator.\nB. Put down the towel.\nC. Put down the laptop.\nD. Opened the window.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the towel", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "closed the refrigerator", + "put down the towel", + "put down the laptop", + "opened the window" + ], + "image_quantity_level": "Many", + "image": [ + "PQYWB/PQYWB_0.jpeg", + "PQYWB/PQYWB_1.jpeg", + "PQYWB/PQYWB_2.jpeg", + "PQYWB/PQYWB_3.jpeg", + "PQYWB/PQYWB_4.jpeg", + "PQYWB/PQYWB_5.jpeg", + "PQYWB/PQYWB_6.jpeg", + "PQYWB/PQYWB_7.jpeg", + "PQYWB/PQYWB_8.jpeg", + "PQYWB/PQYWB_9.jpeg", + "PQYWB/PQYWB_10.jpeg", + "PQYWB/PQYWB_11.jpeg", + "PQYWB/PQYWB_12.jpeg", + "PQYWB/PQYWB_13.jpeg", + "PQYWB/PQYWB_14.jpeg", + "PQYWB/PQYWB_15.jpeg", + "PQYWB/PQYWB_16.jpeg", + "PQYWB/PQYWB_17.jpeg", + "PQYWB/PQYWB_18.jpeg", + "PQYWB/PQYWB_19.jpeg", + "PQYWB/PQYWB_20.jpeg", + "PQYWB/PQYWB_21.jpeg", + "PQYWB/PQYWB_22.jpeg", + "PQYWB/PQYWB_23.jpeg", + "PQYWB/PQYWB_24.jpeg", + "PQYWB/PQYWB_25.jpeg", + "PQYWB/PQYWB_26.jpeg", + "PQYWB/PQYWB_27.jpeg", + "PQYWB/PQYWB_28.jpeg", + "PQYWB/PQYWB_29.jpeg", + "PQYWB/PQYWB_30.jpeg", + "PQYWB/PQYWB_31.jpeg", + "PQYWB/PQYWB_32.jpeg", + "PQYWB/PQYWB_33.jpeg", + "PQYWB/PQYWB_34.jpeg", + "PQYWB/PQYWB_35.jpeg", + "PQYWB/PQYWB_36.jpeg", + "PQYWB/PQYWB_37.jpeg", + "PQYWB/PQYWB_38.jpeg", + "PQYWB/PQYWB_39.jpeg", + "PQYWB/PQYWB_40.jpeg", + "PQYWB/PQYWB_41.jpeg", + "PQYWB/PQYWB_42.jpeg", + "PQYWB/PQYWB_43.jpeg", + "PQYWB/PQYWB_44.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 6, + "question": "Inspect the flow of events in the presented pictures and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened after the person took the phone/camera?\nChoice list: \nA. Took the book.\nB. Put down the pillow.\nC. Closed the window.\nD. Took the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "closed the window", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "took the book", + "put down the pillow", + "closed the window", + "took the food" + ], + "image_quantity_level": "Many", + "image": [ + "GLGQJ/GLGQJ_0.jpeg", + "GLGQJ/GLGQJ_1.jpeg", + "GLGQJ/GLGQJ_2.jpeg", + "GLGQJ/GLGQJ_3.jpeg", + "GLGQJ/GLGQJ_4.jpeg", + "GLGQJ/GLGQJ_5.jpeg", + "GLGQJ/GLGQJ_6.jpeg", + "GLGQJ/GLGQJ_7.jpeg", + "GLGQJ/GLGQJ_8.jpeg", + "GLGQJ/GLGQJ_9.jpeg", + "GLGQJ/GLGQJ_10.jpeg", + "GLGQJ/GLGQJ_11.jpeg", + "GLGQJ/GLGQJ_12.jpeg", + "GLGQJ/GLGQJ_13.jpeg", + "GLGQJ/GLGQJ_14.jpeg", + "GLGQJ/GLGQJ_15.jpeg", + "GLGQJ/GLGQJ_16.jpeg", + "GLGQJ/GLGQJ_17.jpeg", + "GLGQJ/GLGQJ_18.jpeg", + "GLGQJ/GLGQJ_19.jpeg", + "GLGQJ/GLGQJ_20.jpeg", + "GLGQJ/GLGQJ_21.jpeg", + "GLGQJ/GLGQJ_22.jpeg", + "GLGQJ/GLGQJ_23.jpeg", + "GLGQJ/GLGQJ_24.jpeg", + "GLGQJ/GLGQJ_25.jpeg", + "GLGQJ/GLGQJ_26.jpeg", + "GLGQJ/GLGQJ_27.jpeg", + "GLGQJ/GLGQJ_28.jpeg", + "GLGQJ/GLGQJ_29.jpeg", + "GLGQJ/GLGQJ_30.jpeg", + "GLGQJ/GLGQJ_31.jpeg", + "GLGQJ/GLGQJ_32.jpeg", + "GLGQJ/GLGQJ_33.jpeg", + "GLGQJ/GLGQJ_34.jpeg", + "GLGQJ/GLGQJ_35.jpeg", + "GLGQJ/GLGQJ_36.jpeg", + "GLGQJ/GLGQJ_37.jpeg", + "GLGQJ/GLGQJ_38.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 7, + "question": "Analyze the series of events in the provided pictures and answer the related inquiry. You must choose your answer from the Choice List.\nWhat happened after the person opened the box?\nChoice list: \nA. Threw the clothes.\nB. Put down the food.\nC. Sat at the table.\nD. Put down the paper/notebook.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "threw the clothes", + "put down the food", + "sat at the table", + "put down the paper notebook" + ], + "image_quantity_level": "Many", + "image": [ + "VKXLL/VKXLL_0.jpeg", + "VKXLL/VKXLL_1.jpeg", + "VKXLL/VKXLL_2.jpeg", + "VKXLL/VKXLL_3.jpeg", + "VKXLL/VKXLL_4.jpeg", + "VKXLL/VKXLL_5.jpeg", + "VKXLL/VKXLL_6.jpeg", + "VKXLL/VKXLL_7.jpeg", + "VKXLL/VKXLL_8.jpeg", + "VKXLL/VKXLL_9.jpeg", + "VKXLL/VKXLL_10.jpeg", + "VKXLL/VKXLL_11.jpeg", + "VKXLL/VKXLL_12.jpeg", + "VKXLL/VKXLL_13.jpeg", + "VKXLL/VKXLL_14.jpeg", + "VKXLL/VKXLL_15.jpeg", + "VKXLL/VKXLL_16.jpeg", + "VKXLL/VKXLL_17.jpeg", + "VKXLL/VKXLL_18.jpeg", + "VKXLL/VKXLL_19.jpeg", + "VKXLL/VKXLL_20.jpeg", + "VKXLL/VKXLL_21.jpeg", + "VKXLL/VKXLL_22.jpeg", + "VKXLL/VKXLL_23.jpeg", + "VKXLL/VKXLL_24.jpeg", + "VKXLL/VKXLL_25.jpeg", + "VKXLL/VKXLL_26.jpeg", + "VKXLL/VKXLL_27.jpeg", + "VKXLL/VKXLL_28.jpeg", + "VKXLL/VKXLL_29.jpeg", + "VKXLL/VKXLL_30.jpeg", + "VKXLL/VKXLL_31.jpeg", + "VKXLL/VKXLL_32.jpeg", + "VKXLL/VKXLL_33.jpeg", + "VKXLL/VKXLL_34.jpeg", + "VKXLL/VKXLL_35.jpeg", + "VKXLL/VKXLL_36.jpeg", + "VKXLL/VKXLL_37.jpeg", + "VKXLL/VKXLL_38.jpeg", + "VKXLL/VKXLL_39.jpeg", + "VKXLL/VKXLL_40.jpeg", + "VKXLL/VKXLL_41.jpeg", + "VKXLL/VKXLL_42.jpeg", + "VKXLL/VKXLL_43.jpeg", + "VKXLL/VKXLL_44.jpeg", + "VKXLL/VKXLL_45.jpeg", + "VKXLL/VKXLL_46.jpeg", + "VKXLL/VKXLL_47.jpeg", + "VKXLL/VKXLL_48.jpeg", + "VKXLL/VKXLL_49.jpeg", + "VKXLL/VKXLL_50.jpeg", + "VKXLL/VKXLL_51.jpeg", + "VKXLL/VKXLL_52.jpeg", + "VKXLL/VKXLL_53.jpeg", + "VKXLL/VKXLL_54.jpeg", + "VKXLL/VKXLL_55.jpeg", + "VKXLL/VKXLL_56.jpeg", + "VKXLL/VKXLL_57.jpeg", + "VKXLL/VKXLL_58.jpeg", + "VKXLL/VKXLL_59.jpeg", + "VKXLL/VKXLL_60.jpeg", + "VKXLL/VKXLL_61.jpeg", + "VKXLL/VKXLL_62.jpeg", + "VKXLL/VKXLL_63.jpeg", + "VKXLL/VKXLL_64.jpeg", + "VKXLL/VKXLL_65.jpeg", + "VKXLL/VKXLL_66.jpeg", + "VKXLL/VKXLL_67.jpeg", + "VKXLL/VKXLL_68.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 8, + "question": "Assess the progression of actions in the presented graphics and answer the connected query. You must choose your answer from the Choice List.\nWhat happened after the person sat on the bed?\nChoice list: \nA. Took the phone/camera.\nB. Opened the window.\nC. Took the pillow.\nD. Put down the broom.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the broom", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "took the phone camera", + "opened the window", + "took the pillow", + "put down the broom" + ], + "image_quantity_level": "Medium", + "image": [ + "0OSJY/0OSJY_0.jpeg", + "0OSJY/0OSJY_1.jpeg", + "0OSJY/0OSJY_2.jpeg", + "0OSJY/0OSJY_3.jpeg", + "0OSJY/0OSJY_4.jpeg", + "0OSJY/0OSJY_5.jpeg", + "0OSJY/0OSJY_6.jpeg", + "0OSJY/0OSJY_7.jpeg", + "0OSJY/0OSJY_8.jpeg", + "0OSJY/0OSJY_9.jpeg", + "0OSJY/0OSJY_10.jpeg", + "0OSJY/0OSJY_11.jpeg", + "0OSJY/0OSJY_12.jpeg", + "0OSJY/0OSJY_13.jpeg", + "0OSJY/0OSJY_14.jpeg", + "0OSJY/0OSJY_15.jpeg", + "0OSJY/0OSJY_16.jpeg", + "0OSJY/0OSJY_17.jpeg", + "0OSJY/0OSJY_18.jpeg", + "0OSJY/0OSJY_19.jpeg", + "0OSJY/0OSJY_20.jpeg", + "0OSJY/0OSJY_21.jpeg", + "0OSJY/0OSJY_22.jpeg", + "0OSJY/0OSJY_23.jpeg", + "0OSJY/0OSJY_24.jpeg", + "0OSJY/0OSJY_25.jpeg", + "0OSJY/0OSJY_26.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 23, + "question": "Based on the provided images, answer the question related to the sequence of action You must choose your answer from the Choice List.\nWhat happened before the person put down the clothes?\nChoice list: \nA. Opened the window.\nB. Tidied up the closet/cabinet.\nC. Took the food.\nD. Tidied up the blanket.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "tidied up the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "opened the window", + "tidied up the closet cabinet", + "took the food", + "tidied up the blanket" + ], + "image_quantity_level": "Medium", + "image": [ + "3064K/3064K_0.jpeg", + "3064K/3064K_1.jpeg", + "3064K/3064K_2.jpeg", + "3064K/3064K_3.jpeg", + "3064K/3064K_4.jpeg", + "3064K/3064K_5.jpeg", + "3064K/3064K_6.jpeg", + "3064K/3064K_7.jpeg", + "3064K/3064K_8.jpeg", + "3064K/3064K_9.jpeg", + "3064K/3064K_10.jpeg", + "3064K/3064K_11.jpeg", + "3064K/3064K_12.jpeg", + "3064K/3064K_13.jpeg", + "3064K/3064K_14.jpeg", + "3064K/3064K_15.jpeg", + "3064K/3064K_16.jpeg", + "3064K/3064K_17.jpeg", + "3064K/3064K_18.jpeg", + "3064K/3064K_19.jpeg", + "3064K/3064K_20.jpeg", + "3064K/3064K_21.jpeg", + "3064K/3064K_22.jpeg", + "3064K/3064K_23.jpeg", + "3064K/3064K_24.jpeg", + "3064K/3064K_25.jpeg", + "3064K/3064K_26.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 26, + "question": "Observe the continuum of events in the provided pictorials and reply to the pertinent question. You must choose your answer from the Choice List.\nWhat happened before the person held the phone/camera?\nChoice list: \nA. Took the laptop.\nB. Sat on the sofa/couch.\nC. Put down the shoe.\nD. Put down the cup/glass/bottle.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "took the laptop", + "sat on the sofa couch", + "put down the shoe", + "put down the cup glass bottle" + ], + "image_quantity_level": "Medium", + "image": [ + "8G9A9/8G9A9_0.jpeg", + "8G9A9/8G9A9_1.jpeg", + "8G9A9/8G9A9_2.jpeg", + "8G9A9/8G9A9_3.jpeg", + "8G9A9/8G9A9_4.jpeg", + "8G9A9/8G9A9_5.jpeg", + "8G9A9/8G9A9_6.jpeg", + "8G9A9/8G9A9_7.jpeg", + "8G9A9/8G9A9_8.jpeg", + "8G9A9/8G9A9_9.jpeg", + "8G9A9/8G9A9_10.jpeg", + "8G9A9/8G9A9_11.jpeg", + "8G9A9/8G9A9_12.jpeg", + "8G9A9/8G9A9_13.jpeg", + "8G9A9/8G9A9_14.jpeg", + "8G9A9/8G9A9_15.jpeg", + "8G9A9/8G9A9_16.jpeg", + "8G9A9/8G9A9_17.jpeg", + "8G9A9/8G9A9_18.jpeg", + "8G9A9/8G9A9_19.jpeg", + "8G9A9/8G9A9_20.jpeg", + "8G9A9/8G9A9_21.jpeg", + "8G9A9/8G9A9_22.jpeg", + "8G9A9/8G9A9_23.jpeg", + "8G9A9/8G9A9_24.jpeg", + "8G9A9/8G9A9_25.jpeg", + "8G9A9/8G9A9_26.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 47, + "question": "Analyze the series of events in the provided pictures and answer the related inquiry. You must choose your answer from the Choice List.\nWhat happened before the person held the mirror?\nChoice list: \nA. Took the cup/glass/bottle.\nB. Sat on the table.\nC. Closed the closet/cabinet.\nD. Opened the window.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "closed the closet cabinet", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "took the cup glass bottle", + "sat on the table", + "closed the closet cabinet", + "opened the window" + ], + "image_quantity_level": "Medium", + "image": [ + "1UI6I/1UI6I_0.jpeg", + "1UI6I/1UI6I_1.jpeg", + "1UI6I/1UI6I_2.jpeg", + "1UI6I/1UI6I_3.jpeg", + "1UI6I/1UI6I_4.jpeg", + "1UI6I/1UI6I_5.jpeg", + "1UI6I/1UI6I_6.jpeg", + "1UI6I/1UI6I_7.jpeg", + "1UI6I/1UI6I_8.jpeg", + "1UI6I/1UI6I_9.jpeg", + "1UI6I/1UI6I_10.jpeg", + "1UI6I/1UI6I_11.jpeg", + "1UI6I/1UI6I_12.jpeg", + "1UI6I/1UI6I_13.jpeg", + "1UI6I/1UI6I_14.jpeg", + "1UI6I/1UI6I_15.jpeg", + "1UI6I/1UI6I_16.jpeg", + "1UI6I/1UI6I_17.jpeg", + "1UI6I/1UI6I_18.jpeg", + "1UI6I/1UI6I_19.jpeg", + "1UI6I/1UI6I_20.jpeg", + "1UI6I/1UI6I_21.jpeg", + "1UI6I/1UI6I_22.jpeg", + "1UI6I/1UI6I_23.jpeg", + "1UI6I/1UI6I_24.jpeg", + "1UI6I/1UI6I_25.jpeg", + "1UI6I/1UI6I_26.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 9, + "question": "Observe the continuum of events in the provided pictorials and reply to the pertinent question. You must choose your answer from the Choice List.\nWhat happened before the person took the clothes?\nChoice list: \nA. Threw the book.\nB. Put down the bag.\nC. Opened the window.\nD. Put down the cup/glass/bottle.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "threw the book", + "put down the bag", + "opened the window", + "put down the cup glass bottle" + ], + "image_quantity_level": "Many", + "image": [ + "HPAYB/HPAYB_0.jpeg", + "HPAYB/HPAYB_1.jpeg", + "HPAYB/HPAYB_2.jpeg", + "HPAYB/HPAYB_3.jpeg", + "HPAYB/HPAYB_4.jpeg", + "HPAYB/HPAYB_5.jpeg", + "HPAYB/HPAYB_6.jpeg", + "HPAYB/HPAYB_7.jpeg", + "HPAYB/HPAYB_8.jpeg", + "HPAYB/HPAYB_9.jpeg", + "HPAYB/HPAYB_10.jpeg", + "HPAYB/HPAYB_11.jpeg", + "HPAYB/HPAYB_12.jpeg", + "HPAYB/HPAYB_13.jpeg", + "HPAYB/HPAYB_14.jpeg", + "HPAYB/HPAYB_15.jpeg", + "HPAYB/HPAYB_16.jpeg", + "HPAYB/HPAYB_17.jpeg", + "HPAYB/HPAYB_18.jpeg", + "HPAYB/HPAYB_19.jpeg", + "HPAYB/HPAYB_20.jpeg", + "HPAYB/HPAYB_21.jpeg", + "HPAYB/HPAYB_22.jpeg", + "HPAYB/HPAYB_23.jpeg", + "HPAYB/HPAYB_24.jpeg", + "HPAYB/HPAYB_25.jpeg", + "HPAYB/HPAYB_26.jpeg", + "HPAYB/HPAYB_27.jpeg", + "HPAYB/HPAYB_28.jpeg", + "HPAYB/HPAYB_29.jpeg", + "HPAYB/HPAYB_30.jpeg", + "HPAYB/HPAYB_31.jpeg", + "HPAYB/HPAYB_32.jpeg", + "HPAYB/HPAYB_33.jpeg", + "HPAYB/HPAYB_34.jpeg", + "HPAYB/HPAYB_35.jpeg", + "HPAYB/HPAYB_36.jpeg", + "HPAYB/HPAYB_37.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 10, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened before the person sat on the sofa/couch?\nChoice list: \nA. Put down the laptop.\nB. Took the paper/notebook.\nC. Put down the pillow.\nD. Ate the medicine.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the laptop", + "took the paper notebook", + "put down the pillow", + "ate the medicine" + ], + "image_quantity_level": "Medium", + "image": [ + "M2XIS/M2XIS_0.jpeg", + "M2XIS/M2XIS_1.jpeg", + "M2XIS/M2XIS_2.jpeg", + "M2XIS/M2XIS_3.jpeg", + "M2XIS/M2XIS_4.jpeg", + "M2XIS/M2XIS_5.jpeg", + "M2XIS/M2XIS_6.jpeg", + "M2XIS/M2XIS_7.jpeg", + "M2XIS/M2XIS_8.jpeg", + "M2XIS/M2XIS_9.jpeg", + "M2XIS/M2XIS_10.jpeg", + "M2XIS/M2XIS_11.jpeg", + "M2XIS/M2XIS_12.jpeg", + "M2XIS/M2XIS_13.jpeg", + "M2XIS/M2XIS_14.jpeg", + "M2XIS/M2XIS_15.jpeg", + "M2XIS/M2XIS_16.jpeg", + "M2XIS/M2XIS_17.jpeg", + "M2XIS/M2XIS_18.jpeg", + "M2XIS/M2XIS_19.jpeg", + "M2XIS/M2XIS_20.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 41, + "question": "Based on the provided images, answer the question related to the sequence of action You must choose your answer from the Choice List.\nWhat happened after the person opened the refrigerator?\nChoice list: \nA. Closed the window.\nB. Put down the cup/glass/bottle.\nC. Took the paper/notebook.\nD. Sat at the table.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "closed the window", + "put down the cup glass bottle", + "took the paper notebook", + "sat at the table" + ], + "image_quantity_level": "Medium", + "image": [ + "0F453/0F453_0.jpeg", + "0F453/0F453_1.jpeg", + "0F453/0F453_2.jpeg", + "0F453/0F453_3.jpeg", + "0F453/0F453_4.jpeg", + "0F453/0F453_5.jpeg", + "0F453/0F453_6.jpeg", + "0F453/0F453_7.jpeg", + "0F453/0F453_8.jpeg", + "0F453/0F453_9.jpeg", + "0F453/0F453_10.jpeg", + "0F453/0F453_11.jpeg", + "0F453/0F453_12.jpeg", + "0F453/0F453_13.jpeg", + "0F453/0F453_14.jpeg", + "0F453/0F453_15.jpeg", + "0F453/0F453_16.jpeg", + "0F453/0F453_17.jpeg", + "0F453/0F453_18.jpeg", + "0F453/0F453_19.jpeg", + "0F453/0F453_20.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 11, + "question": "Assess the progression of actions in the presented graphics and answer the connected query. You must choose your answer from the Choice List.\nWhat happened after the person held the blanket?\nChoice list: \nA. Tidied up the table.\nB. Took the dish.\nC. Opened the window.\nD. Took the pillow.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "tidied up the table", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "tidied up the table", + "took the dish", + "opened the window", + "took the pillow" + ], + "image_quantity_level": "Medium", + "image": [ + "YOCI8/YOCI8_0.jpeg", + "YOCI8/YOCI8_1.jpeg", + "YOCI8/YOCI8_2.jpeg", + "YOCI8/YOCI8_3.jpeg", + "YOCI8/YOCI8_4.jpeg", + "YOCI8/YOCI8_5.jpeg", + "YOCI8/YOCI8_6.jpeg", + "YOCI8/YOCI8_7.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 12, + "question": "Review the timeline of actions in the supplied imagery and answer the associated inquiry. You must choose your answer from the Choice List.\nWhat happened before the person held the food?\nChoice list: \nA. Put down the clothes.\nB. Opened the laptop.\nC. Put down the food.\nD. Took the dish.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "opened the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the clothes", + "opened the laptop", + "put down the food", + "took the dish" + ], + "image_quantity_level": "Medium", + "image": [ + "3Q6N1/3Q6N1_0.jpeg", + "3Q6N1/3Q6N1_1.jpeg", + "3Q6N1/3Q6N1_2.jpeg", + "3Q6N1/3Q6N1_3.jpeg", + "3Q6N1/3Q6N1_4.jpeg", + "3Q6N1/3Q6N1_5.jpeg", + "3Q6N1/3Q6N1_6.jpeg", + "3Q6N1/3Q6N1_7.jpeg", + "3Q6N1/3Q6N1_8.jpeg", + "3Q6N1/3Q6N1_9.jpeg", + "3Q6N1/3Q6N1_10.jpeg", + "3Q6N1/3Q6N1_11.jpeg", + "3Q6N1/3Q6N1_12.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 15, + "question": "Assess the progression of actions in the presented graphics and answer the connected query. You must choose your answer from the Choice List.\nWhat happened before the person threw the clothes?\nChoice list: \nA. Opened the bag.\nB. Took the shoe.\nC. Threw the food.\nD. Tidied up the blanket.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "tidied up the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "opened the bag", + "took the shoe", + "threw the food", + "tidied up the blanket" + ], + "image_quantity_level": "Medium", + "image": [ + "84893/84893_0.jpeg", + "84893/84893_1.jpeg", + "84893/84893_2.jpeg", + "84893/84893_3.jpeg", + "84893/84893_4.jpeg", + "84893/84893_5.jpeg", + "84893/84893_6.jpeg", + "84893/84893_7.jpeg", + "84893/84893_8.jpeg", + "84893/84893_9.jpeg", + "84893/84893_10.jpeg", + "84893/84893_11.jpeg", + "84893/84893_12.jpeg", + "84893/84893_13.jpeg", + "84893/84893_14.jpeg", + "84893/84893_15.jpeg", + "84893/84893_16.jpeg", + "84893/84893_17.jpeg", + "84893/84893_18.jpeg", + "84893/84893_19.jpeg", + "84893/84893_20.jpeg", + "84893/84893_21.jpeg", + "84893/84893_22.jpeg", + "84893/84893_23.jpeg", + "84893/84893_24.jpeg", + "84893/84893_25.jpeg", + "84893/84893_26.jpeg", + "84893/84893_27.jpeg", + "84893/84893_28.jpeg", + "84893/84893_29.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 18, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person took the towel?\nChoice list: \nA. Closed the closet/cabinet.\nB. Threw the clothes.\nC. Lied on the bed.\nD. Lied on the sofa/couch.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "closed the closet cabinet", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "closed the closet cabinet", + "threw the clothes", + "lied on the bed", + "lied on the sofa couch" + ], + "image_quantity_level": "Medium", + "image": [ + "HOYUT/HOYUT_0.jpeg", + "HOYUT/HOYUT_1.jpeg", + "HOYUT/HOYUT_2.jpeg", + "HOYUT/HOYUT_3.jpeg", + "HOYUT/HOYUT_4.jpeg", + "HOYUT/HOYUT_5.jpeg", + "HOYUT/HOYUT_6.jpeg", + "HOYUT/HOYUT_7.jpeg", + "HOYUT/HOYUT_8.jpeg", + "HOYUT/HOYUT_9.jpeg", + "HOYUT/HOYUT_10.jpeg", + "HOYUT/HOYUT_11.jpeg", + "HOYUT/HOYUT_12.jpeg", + "HOYUT/HOYUT_13.jpeg", + "HOYUT/HOYUT_14.jpeg", + "HOYUT/HOYUT_15.jpeg", + "HOYUT/HOYUT_16.jpeg", + "HOYUT/HOYUT_17.jpeg", + "HOYUT/HOYUT_18.jpeg", + "HOYUT/HOYUT_19.jpeg", + "HOYUT/HOYUT_20.jpeg", + "HOYUT/HOYUT_21.jpeg", + "HOYUT/HOYUT_22.jpeg", + "HOYUT/HOYUT_23.jpeg", + "HOYUT/HOYUT_24.jpeg", + "HOYUT/HOYUT_25.jpeg", + "HOYUT/HOYUT_26.jpeg", + "HOYUT/HOYUT_27.jpeg", + "HOYUT/HOYUT_28.jpeg", + "HOYUT/HOYUT_29.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 19, + "question": "Assess the progression of actions in the presented graphics and answer the connected query. You must choose your answer from the Choice List.\nWhat happened after the person put down the food?\nChoice list: \nA. Opened the bag.\nB. Took the paper/notebook.\nC. Threw the clothes.\nD. Ate the sandwich.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "opened the bag", + "took the paper notebook", + "threw the clothes", + "ate the sandwich" + ], + "image_quantity_level": "Medium", + "image": [ + "L9J5I/L9J5I_0.jpeg", + "L9J5I/L9J5I_1.jpeg", + "L9J5I/L9J5I_2.jpeg", + "L9J5I/L9J5I_3.jpeg", + "L9J5I/L9J5I_4.jpeg", + "L9J5I/L9J5I_5.jpeg", + "L9J5I/L9J5I_6.jpeg", + "L9J5I/L9J5I_7.jpeg", + "L9J5I/L9J5I_8.jpeg", + "L9J5I/L9J5I_9.jpeg", + "L9J5I/L9J5I_10.jpeg", + "L9J5I/L9J5I_11.jpeg", + "L9J5I/L9J5I_12.jpeg", + "L9J5I/L9J5I_13.jpeg", + "L9J5I/L9J5I_14.jpeg", + "L9J5I/L9J5I_15.jpeg", + "L9J5I/L9J5I_16.jpeg", + "L9J5I/L9J5I_17.jpeg", + "L9J5I/L9J5I_18.jpeg", + "L9J5I/L9J5I_19.jpeg", + "L9J5I/L9J5I_20.jpeg", + "L9J5I/L9J5I_21.jpeg", + "L9J5I/L9J5I_22.jpeg", + "L9J5I/L9J5I_23.jpeg", + "L9J5I/L9J5I_24.jpeg", + "L9J5I/L9J5I_25.jpeg", + "L9J5I/L9J5I_26.jpeg", + "L9J5I/L9J5I_27.jpeg", + "L9J5I/L9J5I_28.jpeg", + "L9J5I/L9J5I_29.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 28, + "question": "Analyze the series of events in the provided pictures and answer the related inquiry. You must choose your answer from the Choice List.\nWhat happened before the person closed the door?\nChoice list: \nA. Put down the box.\nB. Took the paper/notebook.\nC. Took the towel.\nD. Sat at the table.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the towel", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down the box", + "took the paper notebook", + "took the towel", + "sat at the table" + ], + "image_quantity_level": "Medium", + "image": [ + "JLGLU/JLGLU_0.jpeg", + "JLGLU/JLGLU_1.jpeg", + "JLGLU/JLGLU_2.jpeg", + "JLGLU/JLGLU_3.jpeg", + "JLGLU/JLGLU_4.jpeg", + "JLGLU/JLGLU_5.jpeg", + "JLGLU/JLGLU_6.jpeg", + "JLGLU/JLGLU_7.jpeg", + "JLGLU/JLGLU_8.jpeg", + "JLGLU/JLGLU_9.jpeg", + "JLGLU/JLGLU_10.jpeg", + "JLGLU/JLGLU_11.jpeg", + "JLGLU/JLGLU_12.jpeg", + "JLGLU/JLGLU_13.jpeg", + "JLGLU/JLGLU_14.jpeg", + "JLGLU/JLGLU_15.jpeg", + "JLGLU/JLGLU_16.jpeg", + "JLGLU/JLGLU_17.jpeg", + "JLGLU/JLGLU_18.jpeg", + "JLGLU/JLGLU_19.jpeg", + "JLGLU/JLGLU_20.jpeg", + "JLGLU/JLGLU_21.jpeg", + "JLGLU/JLGLU_22.jpeg", + "JLGLU/JLGLU_23.jpeg", + "JLGLU/JLGLU_24.jpeg", + "JLGLU/JLGLU_25.jpeg", + "JLGLU/JLGLU_26.jpeg", + "JLGLU/JLGLU_27.jpeg", + "JLGLU/JLGLU_28.jpeg", + "JLGLU/JLGLU_29.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 29, + "question": "Evaluate the sequence of actions depicted in the images and provide an answer to the associated question. You must choose your answer from the Choice List.\nWhat happened after the person held the dish?\nChoice list: \nA. Put down the cup/glass/bottle.\nB. Sat on the floor.\nC. Took the cup/glass/bottle.\nD. Threw the towel.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the cup glass bottle", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the cup glass bottle", + "sat on the floor", + "took the cup glass bottle", + "threw the towel" + ], + "image_quantity_level": "Medium", + "image": [ + "DSZYT/DSZYT_0.jpeg", + "DSZYT/DSZYT_1.jpeg", + "DSZYT/DSZYT_2.jpeg", + "DSZYT/DSZYT_3.jpeg", + "DSZYT/DSZYT_4.jpeg", + "DSZYT/DSZYT_5.jpeg", + "DSZYT/DSZYT_6.jpeg", + "DSZYT/DSZYT_7.jpeg", + "DSZYT/DSZYT_8.jpeg", + "DSZYT/DSZYT_9.jpeg", + "DSZYT/DSZYT_10.jpeg", + "DSZYT/DSZYT_11.jpeg", + "DSZYT/DSZYT_12.jpeg", + "DSZYT/DSZYT_13.jpeg", + "DSZYT/DSZYT_14.jpeg", + "DSZYT/DSZYT_15.jpeg", + "DSZYT/DSZYT_16.jpeg", + "DSZYT/DSZYT_17.jpeg", + "DSZYT/DSZYT_18.jpeg", + "DSZYT/DSZYT_19.jpeg", + "DSZYT/DSZYT_20.jpeg", + "DSZYT/DSZYT_21.jpeg", + "DSZYT/DSZYT_22.jpeg", + "DSZYT/DSZYT_23.jpeg", + "DSZYT/DSZYT_24.jpeg", + "DSZYT/DSZYT_25.jpeg", + "DSZYT/DSZYT_26.jpeg", + "DSZYT/DSZYT_27.jpeg", + "DSZYT/DSZYT_28.jpeg", + "DSZYT/DSZYT_29.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 34, + "question": "Inspect the flow of events in the presented pictures and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person held the food?\nChoice list: \nA. Took the food.\nB. Opened the bag.\nC. Took the sandwich.\nD. Tidied up the table.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "opened the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "took the food", + "opened the bag", + "took the sandwich", + "tidied up the table" + ], + "image_quantity_level": "Medium", + "image": [ + "CJ58B/CJ58B_0.jpeg", + "CJ58B/CJ58B_1.jpeg", + "CJ58B/CJ58B_2.jpeg", + "CJ58B/CJ58B_3.jpeg", + "CJ58B/CJ58B_4.jpeg", + "CJ58B/CJ58B_5.jpeg", + "CJ58B/CJ58B_6.jpeg", + "CJ58B/CJ58B_7.jpeg", + "CJ58B/CJ58B_8.jpeg", + "CJ58B/CJ58B_9.jpeg", + "CJ58B/CJ58B_10.jpeg", + "CJ58B/CJ58B_11.jpeg", + "CJ58B/CJ58B_12.jpeg", + "CJ58B/CJ58B_13.jpeg", + "CJ58B/CJ58B_14.jpeg", + "CJ58B/CJ58B_15.jpeg", + "CJ58B/CJ58B_16.jpeg", + "CJ58B/CJ58B_17.jpeg", + "CJ58B/CJ58B_18.jpeg", + "CJ58B/CJ58B_19.jpeg", + "CJ58B/CJ58B_20.jpeg", + "CJ58B/CJ58B_21.jpeg", + "CJ58B/CJ58B_22.jpeg", + "CJ58B/CJ58B_23.jpeg", + "CJ58B/CJ58B_24.jpeg", + "CJ58B/CJ58B_25.jpeg", + "CJ58B/CJ58B_26.jpeg", + "CJ58B/CJ58B_27.jpeg", + "CJ58B/CJ58B_28.jpeg", + "CJ58B/CJ58B_29.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 38, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened before the person held the food?\nChoice list: \nA. Opened the bag.\nB. Closed the closet/cabinet.\nC. Opened the laptop.\nD. Ate the sandwich.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "closed the closet cabinet", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "opened the bag", + "closed the closet cabinet", + "opened the laptop", + "ate the sandwich" + ], + "image_quantity_level": "Medium", + "image": [ + "XFRYR/XFRYR_0.jpeg", + "XFRYR/XFRYR_1.jpeg", + "XFRYR/XFRYR_2.jpeg", + "XFRYR/XFRYR_3.jpeg", + "XFRYR/XFRYR_4.jpeg", + "XFRYR/XFRYR_5.jpeg", + "XFRYR/XFRYR_6.jpeg", + "XFRYR/XFRYR_7.jpeg", + "XFRYR/XFRYR_8.jpeg", + "XFRYR/XFRYR_9.jpeg", + "XFRYR/XFRYR_10.jpeg", + "XFRYR/XFRYR_11.jpeg", + "XFRYR/XFRYR_12.jpeg", + "XFRYR/XFRYR_13.jpeg", + "XFRYR/XFRYR_14.jpeg", + "XFRYR/XFRYR_15.jpeg", + "XFRYR/XFRYR_16.jpeg", + "XFRYR/XFRYR_17.jpeg", + "XFRYR/XFRYR_18.jpeg", + "XFRYR/XFRYR_19.jpeg", + "XFRYR/XFRYR_20.jpeg", + "XFRYR/XFRYR_21.jpeg", + "XFRYR/XFRYR_22.jpeg", + "XFRYR/XFRYR_23.jpeg", + "XFRYR/XFRYR_24.jpeg", + "XFRYR/XFRYR_25.jpeg", + "XFRYR/XFRYR_26.jpeg", + "XFRYR/XFRYR_27.jpeg", + "XFRYR/XFRYR_28.jpeg", + "XFRYR/XFRYR_29.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 48, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened after the person put down the clothes?\nChoice list: \nA. Tidied up the closet/cabinet.\nB. Tidied up the blanket.\nC. Took the phone/camera.\nD. Sat on the table.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "tidied up the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "tidied up the closet cabinet", + "tidied up the blanket", + "took the phone camera", + "sat on the table" + ], + "image_quantity_level": "Medium", + "image": [ + "84893/84893_0.jpeg", + "84893/84893_1.jpeg", + "84893/84893_2.jpeg", + "84893/84893_3.jpeg", + "84893/84893_4.jpeg", + "84893/84893_5.jpeg", + "84893/84893_6.jpeg", + "84893/84893_7.jpeg", + "84893/84893_8.jpeg", + "84893/84893_9.jpeg", + "84893/84893_10.jpeg", + "84893/84893_11.jpeg", + "84893/84893_12.jpeg", + "84893/84893_13.jpeg", + "84893/84893_14.jpeg", + "84893/84893_15.jpeg", + "84893/84893_16.jpeg", + "84893/84893_17.jpeg", + "84893/84893_18.jpeg", + "84893/84893_19.jpeg", + "84893/84893_20.jpeg", + "84893/84893_21.jpeg", + "84893/84893_22.jpeg", + "84893/84893_23.jpeg", + "84893/84893_24.jpeg", + "84893/84893_25.jpeg", + "84893/84893_26.jpeg", + "84893/84893_27.jpeg", + "84893/84893_28.jpeg", + "84893/84893_29.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 51, + "question": "Observe the continuum of events in the provided pictorials and reply to the pertinent question. You must choose your answer from the Choice List.\nWhat happened after the person closed the door?\nChoice list: \nA. Threw the broom.\nB. Took the book.\nC. Put down the phone/camera.\nD. Put down the picture.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "threw the broom", + "took the book", + "put down the phone camera", + "put down the picture" + ], + "image_quantity_level": "Medium", + "image": [ + "OK2AF/OK2AF_0.jpeg", + "OK2AF/OK2AF_1.jpeg", + "OK2AF/OK2AF_2.jpeg", + "OK2AF/OK2AF_3.jpeg", + "OK2AF/OK2AF_4.jpeg", + "OK2AF/OK2AF_5.jpeg", + "OK2AF/OK2AF_6.jpeg", + "OK2AF/OK2AF_7.jpeg", + "OK2AF/OK2AF_8.jpeg", + "OK2AF/OK2AF_9.jpeg", + "OK2AF/OK2AF_10.jpeg", + "OK2AF/OK2AF_11.jpeg", + "OK2AF/OK2AF_12.jpeg", + "OK2AF/OK2AF_13.jpeg", + "OK2AF/OK2AF_14.jpeg", + "OK2AF/OK2AF_15.jpeg", + "OK2AF/OK2AF_16.jpeg", + "OK2AF/OK2AF_17.jpeg", + "OK2AF/OK2AF_18.jpeg", + "OK2AF/OK2AF_19.jpeg", + "OK2AF/OK2AF_20.jpeg", + "OK2AF/OK2AF_21.jpeg", + "OK2AF/OK2AF_22.jpeg", + "OK2AF/OK2AF_23.jpeg", + "OK2AF/OK2AF_24.jpeg", + "OK2AF/OK2AF_25.jpeg", + "OK2AF/OK2AF_26.jpeg", + "OK2AF/OK2AF_27.jpeg", + "OK2AF/OK2AF_28.jpeg", + "OK2AF/OK2AF_29.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 58, + "question": "Observe the continuum of events in the provided pictorials and reply to the pertinent question. You must choose your answer from the Choice List.\nWhat happened before the person closed the book?\nChoice list: \nA. Put down the phone/camera.\nB. Sat at the table.\nC. Put down the cup/glass/bottle.\nD. Put down the broom.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the phone camera", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "put down the phone camera", + "sat at the table", + "put down the cup glass bottle", + "put down the broom" + ], + "image_quantity_level": "Medium", + "image": [ + "UDGRS/UDGRS_0.jpeg", + "UDGRS/UDGRS_1.jpeg", + "UDGRS/UDGRS_2.jpeg", + "UDGRS/UDGRS_3.jpeg", + "UDGRS/UDGRS_4.jpeg", + "UDGRS/UDGRS_5.jpeg", + "UDGRS/UDGRS_6.jpeg", + "UDGRS/UDGRS_7.jpeg", + "UDGRS/UDGRS_8.jpeg", + "UDGRS/UDGRS_9.jpeg", + "UDGRS/UDGRS_10.jpeg", + "UDGRS/UDGRS_11.jpeg", + "UDGRS/UDGRS_12.jpeg", + "UDGRS/UDGRS_13.jpeg", + "UDGRS/UDGRS_14.jpeg", + "UDGRS/UDGRS_15.jpeg", + "UDGRS/UDGRS_16.jpeg", + "UDGRS/UDGRS_17.jpeg", + "UDGRS/UDGRS_18.jpeg", + "UDGRS/UDGRS_19.jpeg", + "UDGRS/UDGRS_20.jpeg", + "UDGRS/UDGRS_21.jpeg", + "UDGRS/UDGRS_22.jpeg", + "UDGRS/UDGRS_23.jpeg", + "UDGRS/UDGRS_24.jpeg", + "UDGRS/UDGRS_25.jpeg", + "UDGRS/UDGRS_26.jpeg", + "UDGRS/UDGRS_27.jpeg", + "UDGRS/UDGRS_28.jpeg", + "UDGRS/UDGRS_29.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 16, + "question": "Analyze the series of events in the provided pictures and answer the related inquiry. You must choose your answer from the Choice List.\nWhat happened after the person took the dish?\nChoice list: \nA. Threw the shoe.\nB. Put down the food.\nC. Threw the pillow.\nD. Tidied up the table.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "tidied up the table", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "threw the shoe", + "put down the food", + "threw the pillow", + "tidied up the table" + ], + "image_quantity_level": "Many", + "image": [ + "HL5OP/HL5OP_0.jpeg", + "HL5OP/HL5OP_1.jpeg", + "HL5OP/HL5OP_2.jpeg", + "HL5OP/HL5OP_3.jpeg", + "HL5OP/HL5OP_4.jpeg", + "HL5OP/HL5OP_5.jpeg", + "HL5OP/HL5OP_6.jpeg", + "HL5OP/HL5OP_7.jpeg", + "HL5OP/HL5OP_8.jpeg", + "HL5OP/HL5OP_9.jpeg", + "HL5OP/HL5OP_10.jpeg", + "HL5OP/HL5OP_11.jpeg", + "HL5OP/HL5OP_12.jpeg", + "HL5OP/HL5OP_13.jpeg", + "HL5OP/HL5OP_14.jpeg", + "HL5OP/HL5OP_15.jpeg", + "HL5OP/HL5OP_16.jpeg", + "HL5OP/HL5OP_17.jpeg", + "HL5OP/HL5OP_18.jpeg", + "HL5OP/HL5OP_19.jpeg", + "HL5OP/HL5OP_20.jpeg", + "HL5OP/HL5OP_21.jpeg", + "HL5OP/HL5OP_22.jpeg", + "HL5OP/HL5OP_23.jpeg", + "HL5OP/HL5OP_24.jpeg", + "HL5OP/HL5OP_25.jpeg", + "HL5OP/HL5OP_26.jpeg", + "HL5OP/HL5OP_27.jpeg", + "HL5OP/HL5OP_28.jpeg", + "HL5OP/HL5OP_29.jpeg", + "HL5OP/HL5OP_30.jpeg", + "HL5OP/HL5OP_31.jpeg", + "HL5OP/HL5OP_32.jpeg", + "HL5OP/HL5OP_33.jpeg", + "HL5OP/HL5OP_34.jpeg", + "HL5OP/HL5OP_35.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 17, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened after the person sat at the table?\nChoice list: \nA. Put down the cup/glass/bottle.\nB. Opened the box.\nC. Opened the window.\nD. Closed the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "closed the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "put down the cup glass bottle", + "opened the box", + "opened the window", + "closed the book" + ], + "image_quantity_level": "Medium", + "image": [ + "FQS7O/FQS7O_0.jpeg", + "FQS7O/FQS7O_1.jpeg", + "FQS7O/FQS7O_2.jpeg", + "FQS7O/FQS7O_3.jpeg", + "FQS7O/FQS7O_4.jpeg", + "FQS7O/FQS7O_5.jpeg", + "FQS7O/FQS7O_6.jpeg", + "FQS7O/FQS7O_7.jpeg", + "FQS7O/FQS7O_8.jpeg", + "FQS7O/FQS7O_9.jpeg", + "FQS7O/FQS7O_10.jpeg", + "FQS7O/FQS7O_11.jpeg", + "FQS7O/FQS7O_12.jpeg", + "FQS7O/FQS7O_13.jpeg", + "FQS7O/FQS7O_14.jpeg", + "FQS7O/FQS7O_15.jpeg", + "FQS7O/FQS7O_16.jpeg", + "FQS7O/FQS7O_17.jpeg", + "FQS7O/FQS7O_18.jpeg", + "FQS7O/FQS7O_19.jpeg", + "FQS7O/FQS7O_20.jpeg", + "FQS7O/FQS7O_21.jpeg", + "FQS7O/FQS7O_22.jpeg", + "FQS7O/FQS7O_23.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 20, + "question": "Inspect the flow of events in the presented pictures and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person watched at the book?\nChoice list: \nA. Closed the window.\nB. Took the paper/notebook.\nC. Took the book.\nD. Threw the shoe.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "closed the window", + "took the paper notebook", + "took the book", + "threw the shoe" + ], + "image_quantity_level": "Many", + "image": [ + "EFR6I/EFR6I_0.jpeg", + "EFR6I/EFR6I_1.jpeg", + "EFR6I/EFR6I_2.jpeg", + "EFR6I/EFR6I_3.jpeg", + "EFR6I/EFR6I_4.jpeg", + "EFR6I/EFR6I_5.jpeg", + "EFR6I/EFR6I_6.jpeg", + "EFR6I/EFR6I_7.jpeg", + "EFR6I/EFR6I_8.jpeg", + "EFR6I/EFR6I_9.jpeg", + "EFR6I/EFR6I_10.jpeg", + "EFR6I/EFR6I_11.jpeg", + "EFR6I/EFR6I_12.jpeg", + "EFR6I/EFR6I_13.jpeg", + "EFR6I/EFR6I_14.jpeg", + "EFR6I/EFR6I_15.jpeg", + "EFR6I/EFR6I_16.jpeg", + "EFR6I/EFR6I_17.jpeg", + "EFR6I/EFR6I_18.jpeg", + "EFR6I/EFR6I_19.jpeg", + "EFR6I/EFR6I_20.jpeg", + "EFR6I/EFR6I_21.jpeg", + "EFR6I/EFR6I_22.jpeg", + "EFR6I/EFR6I_23.jpeg", + "EFR6I/EFR6I_24.jpeg", + "EFR6I/EFR6I_25.jpeg", + "EFR6I/EFR6I_26.jpeg", + "EFR6I/EFR6I_27.jpeg", + "EFR6I/EFR6I_28.jpeg", + "EFR6I/EFR6I_29.jpeg", + "EFR6I/EFR6I_30.jpeg", + "EFR6I/EFR6I_31.jpeg", + "EFR6I/EFR6I_32.jpeg", + "EFR6I/EFR6I_33.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 31, + "question": "Inspect the flow of events in the presented pictures and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened after the person sat at the table?\nChoice list: \nA. Opened the book.\nB. Opened the refrigerator.\nC. Took the clothes.\nD. Sat on the sofa/couch.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sat on the sofa couch", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "opened the book", + "opened the refrigerator", + "took the clothes", + "sat on the sofa couch" + ], + "image_quantity_level": "Many", + "image": [ + "HOI88/HOI88_0.jpeg", + "HOI88/HOI88_1.jpeg", + "HOI88/HOI88_2.jpeg", + "HOI88/HOI88_3.jpeg", + "HOI88/HOI88_4.jpeg", + "HOI88/HOI88_5.jpeg", + "HOI88/HOI88_6.jpeg", + "HOI88/HOI88_7.jpeg", + "HOI88/HOI88_8.jpeg", + "HOI88/HOI88_9.jpeg", + "HOI88/HOI88_10.jpeg", + "HOI88/HOI88_11.jpeg", + "HOI88/HOI88_12.jpeg", + "HOI88/HOI88_13.jpeg", + "HOI88/HOI88_14.jpeg", + "HOI88/HOI88_15.jpeg", + "HOI88/HOI88_16.jpeg", + "HOI88/HOI88_17.jpeg", + "HOI88/HOI88_18.jpeg", + "HOI88/HOI88_19.jpeg", + "HOI88/HOI88_20.jpeg", + "HOI88/HOI88_21.jpeg", + "HOI88/HOI88_22.jpeg", + "HOI88/HOI88_23.jpeg", + "HOI88/HOI88_24.jpeg", + "HOI88/HOI88_25.jpeg", + "HOI88/HOI88_26.jpeg", + "HOI88/HOI88_27.jpeg", + "HOI88/HOI88_28.jpeg", + "HOI88/HOI88_29.jpeg", + "HOI88/HOI88_30.jpeg", + "HOI88/HOI88_31.jpeg", + "HOI88/HOI88_32.jpeg", + "HOI88/HOI88_33.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 21, + "question": "Observe the continuum of events in the provided pictorials and reply to the pertinent question. You must choose your answer from the Choice List.\nWhat happened before the person took the towel?\nChoice list: \nA. Opened the bag.\nB. Put down the phone/camera.\nC. Threw the clothes.\nD. Opened the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "threw the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "opened the bag", + "put down the phone camera", + "threw the clothes", + "opened the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "0OE6M/0OE6M_0.jpeg", + "0OE6M/0OE6M_1.jpeg", + "0OE6M/0OE6M_2.jpeg", + "0OE6M/0OE6M_3.jpeg", + "0OE6M/0OE6M_4.jpeg", + "0OE6M/0OE6M_5.jpeg", + "0OE6M/0OE6M_6.jpeg", + "0OE6M/0OE6M_7.jpeg", + "0OE6M/0OE6M_8.jpeg", + "0OE6M/0OE6M_9.jpeg", + "0OE6M/0OE6M_10.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 25, + "question": "Analyze the series of events in the provided pictures and answer the related inquiry. You must choose your answer from the Choice List.\nWhat happened before the person took the pillow?\nChoice list: \nA. Sat on the floor.\nB. Threw the blanket.\nC. Threw the shoe.\nD. Closed the door.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sat on the floor", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "sat on the floor", + "threw the blanket", + "threw the shoe", + "closed the door" + ], + "image_quantity_level": "Medium", + "image": [ + "DPLMM/DPLMM_0.jpeg", + "DPLMM/DPLMM_1.jpeg", + "DPLMM/DPLMM_2.jpeg", + "DPLMM/DPLMM_3.jpeg", + "DPLMM/DPLMM_4.jpeg", + "DPLMM/DPLMM_5.jpeg", + "DPLMM/DPLMM_6.jpeg", + "DPLMM/DPLMM_7.jpeg", + "DPLMM/DPLMM_8.jpeg", + "DPLMM/DPLMM_9.jpeg", + "DPLMM/DPLMM_10.jpeg", + "DPLMM/DPLMM_11.jpeg", + "DPLMM/DPLMM_12.jpeg", + "DPLMM/DPLMM_13.jpeg", + "DPLMM/DPLMM_14.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 30, + "question": "Evaluate the sequence of actions depicted in the images and provide an answer to the associated question. You must choose your answer from the Choice List.\nWhat happened before the person took the blanket?\nChoice list: \nA. Sat on the sofa/couch.\nB. Threw the box.\nC. Put down the broom.\nD. Opened the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sat on the sofa couch", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "sat on the sofa couch", + "threw the box", + "put down the broom", + "opened the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "YMGGV/YMGGV_0.jpeg", + "YMGGV/YMGGV_1.jpeg", + "YMGGV/YMGGV_2.jpeg", + "YMGGV/YMGGV_3.jpeg", + "YMGGV/YMGGV_4.jpeg", + "YMGGV/YMGGV_5.jpeg", + "YMGGV/YMGGV_6.jpeg", + "YMGGV/YMGGV_7.jpeg", + "YMGGV/YMGGV_8.jpeg", + "YMGGV/YMGGV_9.jpeg", + "YMGGV/YMGGV_10.jpeg", + "YMGGV/YMGGV_11.jpeg", + "YMGGV/YMGGV_12.jpeg", + "YMGGV/YMGGV_13.jpeg", + "YMGGV/YMGGV_14.jpeg", + "YMGGV/YMGGV_15.jpeg", + "YMGGV/YMGGV_16.jpeg", + "YMGGV/YMGGV_17.jpeg", + "YMGGV/YMGGV_18.jpeg", + "YMGGV/YMGGV_19.jpeg", + "YMGGV/YMGGV_20.jpeg", + "YMGGV/YMGGV_21.jpeg", + "YMGGV/YMGGV_22.jpeg", + "YMGGV/YMGGV_23.jpeg", + "YMGGV/YMGGV_24.jpeg", + "YMGGV/YMGGV_25.jpeg", + "YMGGV/YMGGV_26.jpeg", + "YMGGV/YMGGV_27.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 35, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened before the person took the clothes?\nChoice list: \nA. Washed the clothes.\nB. Threw the blanket.\nC. Took the towel.\nD. Opened the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "threw the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "washed the clothes", + "threw the blanket", + "took the towel", + "opened the bag" + ], + "image_quantity_level": "Medium", + "image": [ + "V3RAX/V3RAX_0.jpeg", + "V3RAX/V3RAX_1.jpeg", + "V3RAX/V3RAX_2.jpeg", + "V3RAX/V3RAX_3.jpeg", + "V3RAX/V3RAX_4.jpeg", + "V3RAX/V3RAX_5.jpeg", + "V3RAX/V3RAX_6.jpeg", + "V3RAX/V3RAX_7.jpeg", + "V3RAX/V3RAX_8.jpeg", + "V3RAX/V3RAX_9.jpeg", + "V3RAX/V3RAX_10.jpeg", + "V3RAX/V3RAX_11.jpeg", + "V3RAX/V3RAX_12.jpeg", + "V3RAX/V3RAX_13.jpeg", + "V3RAX/V3RAX_14.jpeg", + "V3RAX/V3RAX_15.jpeg", + "V3RAX/V3RAX_16.jpeg", + "V3RAX/V3RAX_17.jpeg", + "V3RAX/V3RAX_18.jpeg", + "V3RAX/V3RAX_19.jpeg", + "V3RAX/V3RAX_20.jpeg", + "V3RAX/V3RAX_21.jpeg", + "V3RAX/V3RAX_22.jpeg", + "V3RAX/V3RAX_23.jpeg", + "V3RAX/V3RAX_24.jpeg", + "V3RAX/V3RAX_25.jpeg", + "V3RAX/V3RAX_26.jpeg", + "V3RAX/V3RAX_27.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 32, + "question": "Based on the provided images, answer the question related to the sequence of action You must choose your answer from the Choice List.\nWhat happened after the person opened the closet/cabinet?\nChoice list: \nA. Took the sandwich.\nB. Took the dish.\nC. Took the phone/camera.\nD. Closed the door.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the dish", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "took the sandwich", + "took the dish", + "took the phone camera", + "closed the door" + ], + "image_quantity_level": "Medium", + "image": [ + "LH6LE/LH6LE_0.jpeg", + "LH6LE/LH6LE_1.jpeg", + "LH6LE/LH6LE_2.jpeg", + "LH6LE/LH6LE_3.jpeg", + "LH6LE/LH6LE_4.jpeg", + "LH6LE/LH6LE_5.jpeg", + "LH6LE/LH6LE_6.jpeg", + "LH6LE/LH6LE_7.jpeg", + "LH6LE/LH6LE_8.jpeg", + "LH6LE/LH6LE_9.jpeg", + "LH6LE/LH6LE_10.jpeg", + "LH6LE/LH6LE_11.jpeg", + "LH6LE/LH6LE_12.jpeg", + "LH6LE/LH6LE_13.jpeg", + "LH6LE/LH6LE_14.jpeg", + "LH6LE/LH6LE_15.jpeg", + "LH6LE/LH6LE_16.jpeg", + "LH6LE/LH6LE_17.jpeg", + "LH6LE/LH6LE_18.jpeg", + "LH6LE/LH6LE_19.jpeg", + "LH6LE/LH6LE_20.jpeg", + "LH6LE/LH6LE_21.jpeg", + "LH6LE/LH6LE_22.jpeg", + "LH6LE/LH6LE_23.jpeg", + "LH6LE/LH6LE_24.jpeg", + "LH6LE/LH6LE_25.jpeg", + "LH6LE/LH6LE_26.jpeg", + "LH6LE/LH6LE_27.jpeg", + "LH6LE/LH6LE_28.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 66, + "question": "Assess the progression of actions in the presented graphics and answer the connected query. You must choose your answer from the Choice List.\nWhat happened after the person took the phone/camera?\nChoice list: \nA. Threw the bag.\nB. Opened the bag.\nC. Put down the food.\nD. Put down the shoe.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "threw the bag", + "opened the bag", + "put down the food", + "put down the shoe" + ], + "image_quantity_level": "Medium", + "image": [ + "7TA23/7TA23_0.jpeg", + "7TA23/7TA23_1.jpeg", + "7TA23/7TA23_2.jpeg", + "7TA23/7TA23_3.jpeg", + "7TA23/7TA23_4.jpeg", + "7TA23/7TA23_5.jpeg", + "7TA23/7TA23_6.jpeg", + "7TA23/7TA23_7.jpeg", + "7TA23/7TA23_8.jpeg", + "7TA23/7TA23_9.jpeg", + "7TA23/7TA23_10.jpeg", + "7TA23/7TA23_11.jpeg", + "7TA23/7TA23_12.jpeg", + "7TA23/7TA23_13.jpeg", + "7TA23/7TA23_14.jpeg", + "7TA23/7TA23_15.jpeg", + "7TA23/7TA23_16.jpeg", + "7TA23/7TA23_17.jpeg", + "7TA23/7TA23_18.jpeg", + "7TA23/7TA23_19.jpeg", + "7TA23/7TA23_20.jpeg", + "7TA23/7TA23_21.jpeg", + "7TA23/7TA23_22.jpeg", + "7TA23/7TA23_23.jpeg", + "7TA23/7TA23_24.jpeg", + "7TA23/7TA23_25.jpeg", + "7TA23/7TA23_26.jpeg", + "7TA23/7TA23_27.jpeg", + "7TA23/7TA23_28.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 37, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person took the cup/glass/bottle?\nChoice list: \nA. Threw the blanket.\nB. Threw the box.\nC. Put down the shoe.\nD. Took the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "threw the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "threw the blanket", + "threw the box", + "put down the shoe", + "took the clothes" + ], + "image_quantity_level": "Medium", + "image": [ + "MLS4T/MLS4T_0.jpeg", + "MLS4T/MLS4T_1.jpeg", + "MLS4T/MLS4T_2.jpeg", + "MLS4T/MLS4T_3.jpeg", + "MLS4T/MLS4T_4.jpeg", + "MLS4T/MLS4T_5.jpeg", + "MLS4T/MLS4T_6.jpeg", + "MLS4T/MLS4T_7.jpeg", + "MLS4T/MLS4T_8.jpeg", + "MLS4T/MLS4T_9.jpeg", + "MLS4T/MLS4T_10.jpeg", + "MLS4T/MLS4T_11.jpeg", + "MLS4T/MLS4T_12.jpeg", + "MLS4T/MLS4T_13.jpeg", + "MLS4T/MLS4T_14.jpeg", + "MLS4T/MLS4T_15.jpeg", + "MLS4T/MLS4T_16.jpeg", + "MLS4T/MLS4T_17.jpeg", + "MLS4T/MLS4T_18.jpeg", + "MLS4T/MLS4T_19.jpeg", + "MLS4T/MLS4T_20.jpeg", + "MLS4T/MLS4T_21.jpeg", + "MLS4T/MLS4T_22.jpeg", + "MLS4T/MLS4T_23.jpeg", + "MLS4T/MLS4T_24.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 40, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person opened the door?\nChoice list: \nA. Put down the paper/notebook.\nB. Tidied up the blanket.\nC. Sat on the floor.\nD. Took the shoe.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sat on the floor", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the paper notebook", + "tidied up the blanket", + "sat on the floor", + "took the shoe" + ], + "image_quantity_level": "Medium", + "image": [ + "HCM5Y/HCM5Y_0.jpeg", + "HCM5Y/HCM5Y_1.jpeg", + "HCM5Y/HCM5Y_2.jpeg", + "HCM5Y/HCM5Y_3.jpeg", + "HCM5Y/HCM5Y_4.jpeg", + "HCM5Y/HCM5Y_5.jpeg", + "HCM5Y/HCM5Y_6.jpeg", + "HCM5Y/HCM5Y_7.jpeg", + "HCM5Y/HCM5Y_8.jpeg", + "HCM5Y/HCM5Y_9.jpeg", + "HCM5Y/HCM5Y_10.jpeg", + "HCM5Y/HCM5Y_11.jpeg", + "HCM5Y/HCM5Y_12.jpeg", + "HCM5Y/HCM5Y_13.jpeg", + "HCM5Y/HCM5Y_14.jpeg", + "HCM5Y/HCM5Y_15.jpeg", + "HCM5Y/HCM5Y_16.jpeg", + "HCM5Y/HCM5Y_17.jpeg", + "HCM5Y/HCM5Y_18.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 42, + "question": "Assess the progression of actions in the presented graphics and answer the connected query. You must choose your answer from the Choice List.\nWhat happened after the person drank from the cup/glass/bottle?\nChoice list: \nA. Tidied up the table.\nB. Took the bag.\nC. Took the dish.\nD. Threw the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "tidied up the table", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "tidied up the table", + "took the bag", + "took the dish", + "threw the book" + ], + "image_quantity_level": "Many", + "image": [ + "V9W9C/V9W9C_0.jpeg", + "V9W9C/V9W9C_1.jpeg", + "V9W9C/V9W9C_2.jpeg", + "V9W9C/V9W9C_3.jpeg", + "V9W9C/V9W9C_4.jpeg", + "V9W9C/V9W9C_5.jpeg", + "V9W9C/V9W9C_6.jpeg", + "V9W9C/V9W9C_7.jpeg", + "V9W9C/V9W9C_8.jpeg", + "V9W9C/V9W9C_9.jpeg", + "V9W9C/V9W9C_10.jpeg", + "V9W9C/V9W9C_11.jpeg", + "V9W9C/V9W9C_12.jpeg", + "V9W9C/V9W9C_13.jpeg", + "V9W9C/V9W9C_14.jpeg", + "V9W9C/V9W9C_15.jpeg", + "V9W9C/V9W9C_16.jpeg", + "V9W9C/V9W9C_17.jpeg", + "V9W9C/V9W9C_18.jpeg", + "V9W9C/V9W9C_19.jpeg", + "V9W9C/V9W9C_20.jpeg", + "V9W9C/V9W9C_21.jpeg", + "V9W9C/V9W9C_22.jpeg", + "V9W9C/V9W9C_23.jpeg", + "V9W9C/V9W9C_24.jpeg", + "V9W9C/V9W9C_25.jpeg", + "V9W9C/V9W9C_26.jpeg", + "V9W9C/V9W9C_27.jpeg", + "V9W9C/V9W9C_28.jpeg", + "V9W9C/V9W9C_29.jpeg", + "V9W9C/V9W9C_30.jpeg", + "V9W9C/V9W9C_31.jpeg", + "V9W9C/V9W9C_32.jpeg", + "V9W9C/V9W9C_33.jpeg", + "V9W9C/V9W9C_34.jpeg", + "V9W9C/V9W9C_35.jpeg", + "V9W9C/V9W9C_36.jpeg", + "V9W9C/V9W9C_37.jpeg", + "V9W9C/V9W9C_38.jpeg", + "V9W9C/V9W9C_39.jpeg", + "V9W9C/V9W9C_40.jpeg", + "V9W9C/V9W9C_41.jpeg", + "V9W9C/V9W9C_42.jpeg", + "V9W9C/V9W9C_43.jpeg", + "V9W9C/V9W9C_44.jpeg", + "V9W9C/V9W9C_45.jpeg", + "V9W9C/V9W9C_46.jpeg", + "V9W9C/V9W9C_47.jpeg", + "V9W9C/V9W9C_48.jpeg", + "V9W9C/V9W9C_49.jpeg", + "V9W9C/V9W9C_50.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 49, + "question": "Based on the provided images, answer the question related to the sequence of action You must choose your answer from the Choice List.\nWhat happened after the person opened the bag?\nChoice list: \nA. Sat at the table.\nB. Tidied up the closet/cabinet.\nC. Took the laptop.\nD. Put down the cup/glass/bottle.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "sat at the table", + "tidied up the closet cabinet", + "took the laptop", + "put down the cup glass bottle" + ], + "image_quantity_level": "Many", + "image": [ + "T1CQE/T1CQE_0.jpeg", + "T1CQE/T1CQE_1.jpeg", + "T1CQE/T1CQE_2.jpeg", + "T1CQE/T1CQE_3.jpeg", + "T1CQE/T1CQE_4.jpeg", + "T1CQE/T1CQE_5.jpeg", + "T1CQE/T1CQE_6.jpeg", + "T1CQE/T1CQE_7.jpeg", + "T1CQE/T1CQE_8.jpeg", + "T1CQE/T1CQE_9.jpeg", + "T1CQE/T1CQE_10.jpeg", + "T1CQE/T1CQE_11.jpeg", + "T1CQE/T1CQE_12.jpeg", + "T1CQE/T1CQE_13.jpeg", + "T1CQE/T1CQE_14.jpeg", + "T1CQE/T1CQE_15.jpeg", + "T1CQE/T1CQE_16.jpeg", + "T1CQE/T1CQE_17.jpeg", + "T1CQE/T1CQE_18.jpeg", + "T1CQE/T1CQE_19.jpeg", + "T1CQE/T1CQE_20.jpeg", + "T1CQE/T1CQE_21.jpeg", + "T1CQE/T1CQE_22.jpeg", + "T1CQE/T1CQE_23.jpeg", + "T1CQE/T1CQE_24.jpeg", + "T1CQE/T1CQE_25.jpeg", + "T1CQE/T1CQE_26.jpeg", + "T1CQE/T1CQE_27.jpeg", + "T1CQE/T1CQE_28.jpeg", + "T1CQE/T1CQE_29.jpeg", + "T1CQE/T1CQE_30.jpeg", + "T1CQE/T1CQE_31.jpeg", + "T1CQE/T1CQE_32.jpeg", + "T1CQE/T1CQE_33.jpeg", + "T1CQE/T1CQE_34.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 62, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened after the person sat at the table?\nChoice list: \nA. Put down the broom.\nB. Put down the box.\nC. Opened the door.\nD. Closed the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the broom", + "put down the box", + "opened the door", + "closed the closet cabinet" + ], + "image_quantity_level": "Many", + "image": [ + "0HGNK/0HGNK_0.jpeg", + "0HGNK/0HGNK_1.jpeg", + "0HGNK/0HGNK_2.jpeg", + "0HGNK/0HGNK_3.jpeg", + "0HGNK/0HGNK_4.jpeg", + "0HGNK/0HGNK_5.jpeg", + "0HGNK/0HGNK_6.jpeg", + "0HGNK/0HGNK_7.jpeg", + "0HGNK/0HGNK_8.jpeg", + "0HGNK/0HGNK_9.jpeg", + "0HGNK/0HGNK_10.jpeg", + "0HGNK/0HGNK_11.jpeg", + "0HGNK/0HGNK_12.jpeg", + "0HGNK/0HGNK_13.jpeg", + "0HGNK/0HGNK_14.jpeg", + "0HGNK/0HGNK_15.jpeg", + "0HGNK/0HGNK_16.jpeg", + "0HGNK/0HGNK_17.jpeg", + "0HGNK/0HGNK_18.jpeg", + "0HGNK/0HGNK_19.jpeg", + "0HGNK/0HGNK_20.jpeg", + "0HGNK/0HGNK_21.jpeg", + "0HGNK/0HGNK_22.jpeg", + "0HGNK/0HGNK_23.jpeg", + "0HGNK/0HGNK_24.jpeg", + "0HGNK/0HGNK_25.jpeg", + "0HGNK/0HGNK_26.jpeg", + "0HGNK/0HGNK_27.jpeg", + "0HGNK/0HGNK_28.jpeg", + "0HGNK/0HGNK_29.jpeg", + "0HGNK/0HGNK_30.jpeg", + "0HGNK/0HGNK_31.jpeg", + "0HGNK/0HGNK_32.jpeg", + "0HGNK/0HGNK_33.jpeg", + "0HGNK/0HGNK_34.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 50, + "question": "Analyze the series of events in the provided pictures and answer the related inquiry. You must choose your answer from the Choice List.\nWhat happened before the person closed the box?\nChoice list: \nA. Put down the cup/glass/bottle.\nB. Put down the towel.\nC. Threw the box.\nD. Sat on the sofa/couch.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the towel", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the cup glass bottle", + "put down the towel", + "threw the box", + "sat on the sofa couch" + ], + "image_quantity_level": "Many", + "image": [ + "OEE36/OEE36_0.jpeg", + "OEE36/OEE36_1.jpeg", + "OEE36/OEE36_2.jpeg", + "OEE36/OEE36_3.jpeg", + "OEE36/OEE36_4.jpeg", + "OEE36/OEE36_5.jpeg", + "OEE36/OEE36_6.jpeg", + "OEE36/OEE36_7.jpeg", + "OEE36/OEE36_8.jpeg", + "OEE36/OEE36_9.jpeg", + "OEE36/OEE36_10.jpeg", + "OEE36/OEE36_11.jpeg", + "OEE36/OEE36_12.jpeg", + "OEE36/OEE36_13.jpeg", + "OEE36/OEE36_14.jpeg", + "OEE36/OEE36_15.jpeg", + "OEE36/OEE36_16.jpeg", + "OEE36/OEE36_17.jpeg", + "OEE36/OEE36_18.jpeg", + "OEE36/OEE36_19.jpeg", + "OEE36/OEE36_20.jpeg", + "OEE36/OEE36_21.jpeg", + "OEE36/OEE36_22.jpeg", + "OEE36/OEE36_23.jpeg", + "OEE36/OEE36_24.jpeg", + "OEE36/OEE36_25.jpeg", + "OEE36/OEE36_26.jpeg", + "OEE36/OEE36_27.jpeg", + "OEE36/OEE36_28.jpeg", + "OEE36/OEE36_29.jpeg", + "OEE36/OEE36_30.jpeg", + "OEE36/OEE36_31.jpeg", + "OEE36/OEE36_32.jpeg", + "OEE36/OEE36_33.jpeg", + "OEE36/OEE36_34.jpeg", + "OEE36/OEE36_35.jpeg", + "OEE36/OEE36_36.jpeg", + "OEE36/OEE36_37.jpeg", + "OEE36/OEE36_38.jpeg", + "OEE36/OEE36_39.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 52, + "question": "Analyze the series of events in the provided pictures and answer the related inquiry. You must choose your answer from the Choice List.\nWhat happened after the person put down the pillow?\nChoice list: \nA. Took the picture.\nB. Tidied up the table.\nC. Sat on the floor.\nD. Sat at the table.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the picture", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "took the picture", + "tidied up the table", + "sat on the floor", + "sat at the table" + ], + "image_quantity_level": "Many", + "image": [ + "ZAJAJ/ZAJAJ_0.jpeg", + "ZAJAJ/ZAJAJ_1.jpeg", + "ZAJAJ/ZAJAJ_2.jpeg", + "ZAJAJ/ZAJAJ_3.jpeg", + "ZAJAJ/ZAJAJ_4.jpeg", + "ZAJAJ/ZAJAJ_5.jpeg", + "ZAJAJ/ZAJAJ_6.jpeg", + "ZAJAJ/ZAJAJ_7.jpeg", + "ZAJAJ/ZAJAJ_8.jpeg", + "ZAJAJ/ZAJAJ_9.jpeg", + "ZAJAJ/ZAJAJ_10.jpeg", + "ZAJAJ/ZAJAJ_11.jpeg", + "ZAJAJ/ZAJAJ_12.jpeg", + "ZAJAJ/ZAJAJ_13.jpeg", + "ZAJAJ/ZAJAJ_14.jpeg", + "ZAJAJ/ZAJAJ_15.jpeg", + "ZAJAJ/ZAJAJ_16.jpeg", + "ZAJAJ/ZAJAJ_17.jpeg", + "ZAJAJ/ZAJAJ_18.jpeg", + "ZAJAJ/ZAJAJ_19.jpeg", + "ZAJAJ/ZAJAJ_20.jpeg", + "ZAJAJ/ZAJAJ_21.jpeg", + "ZAJAJ/ZAJAJ_22.jpeg", + "ZAJAJ/ZAJAJ_23.jpeg", + "ZAJAJ/ZAJAJ_24.jpeg", + "ZAJAJ/ZAJAJ_25.jpeg", + "ZAJAJ/ZAJAJ_26.jpeg", + "ZAJAJ/ZAJAJ_27.jpeg", + "ZAJAJ/ZAJAJ_28.jpeg", + "ZAJAJ/ZAJAJ_29.jpeg", + "ZAJAJ/ZAJAJ_30.jpeg", + "ZAJAJ/ZAJAJ_31.jpeg", + "ZAJAJ/ZAJAJ_32.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 64, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened after the person put down the blanket?\nChoice list: \nA. Took the pillow.\nB. Washed the cup/glass/bottle.\nC. Took the bag.\nD. Washed the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "washed the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "took the pillow", + "washed the cup glass bottle", + "took the bag", + "washed the clothes" + ], + "image_quantity_level": "Many", + "image": [ + "9BCZU/9BCZU_0.jpeg", + "9BCZU/9BCZU_1.jpeg", + "9BCZU/9BCZU_2.jpeg", + "9BCZU/9BCZU_3.jpeg", + "9BCZU/9BCZU_4.jpeg", + "9BCZU/9BCZU_5.jpeg", + "9BCZU/9BCZU_6.jpeg", + "9BCZU/9BCZU_7.jpeg", + "9BCZU/9BCZU_8.jpeg", + "9BCZU/9BCZU_9.jpeg", + "9BCZU/9BCZU_10.jpeg", + "9BCZU/9BCZU_11.jpeg", + "9BCZU/9BCZU_12.jpeg", + "9BCZU/9BCZU_13.jpeg", + "9BCZU/9BCZU_14.jpeg", + "9BCZU/9BCZU_15.jpeg", + "9BCZU/9BCZU_16.jpeg", + "9BCZU/9BCZU_17.jpeg", + "9BCZU/9BCZU_18.jpeg", + "9BCZU/9BCZU_19.jpeg", + "9BCZU/9BCZU_20.jpeg", + "9BCZU/9BCZU_21.jpeg", + "9BCZU/9BCZU_22.jpeg", + "9BCZU/9BCZU_23.jpeg", + "9BCZU/9BCZU_24.jpeg", + "9BCZU/9BCZU_25.jpeg", + "9BCZU/9BCZU_26.jpeg", + "9BCZU/9BCZU_27.jpeg", + "9BCZU/9BCZU_28.jpeg", + "9BCZU/9BCZU_29.jpeg", + "9BCZU/9BCZU_30.jpeg", + "9BCZU/9BCZU_31.jpeg", + "9BCZU/9BCZU_32.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 53, + "question": "Based on the provided images, answer the question related to the sequence of action You must choose your answer from the Choice List.\nWhat happened before the person took the blanket?\nChoice list: \nA. Took the book.\nB. Put down the dish.\nC. Opened the closet/cabinet.\nD. Put down the laptop.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "took the book", + "put down the dish", + "opened the closet cabinet", + "put down the laptop" + ], + "image_quantity_level": "Many", + "image": [ + "CCCUJ/CCCUJ_0.jpeg", + "CCCUJ/CCCUJ_1.jpeg", + "CCCUJ/CCCUJ_2.jpeg", + "CCCUJ/CCCUJ_3.jpeg", + "CCCUJ/CCCUJ_4.jpeg", + "CCCUJ/CCCUJ_5.jpeg", + "CCCUJ/CCCUJ_6.jpeg", + "CCCUJ/CCCUJ_7.jpeg", + "CCCUJ/CCCUJ_8.jpeg", + "CCCUJ/CCCUJ_9.jpeg", + "CCCUJ/CCCUJ_10.jpeg", + "CCCUJ/CCCUJ_11.jpeg", + "CCCUJ/CCCUJ_12.jpeg", + "CCCUJ/CCCUJ_13.jpeg", + "CCCUJ/CCCUJ_14.jpeg", + "CCCUJ/CCCUJ_15.jpeg", + "CCCUJ/CCCUJ_16.jpeg", + "CCCUJ/CCCUJ_17.jpeg", + "CCCUJ/CCCUJ_18.jpeg", + "CCCUJ/CCCUJ_19.jpeg", + "CCCUJ/CCCUJ_20.jpeg", + "CCCUJ/CCCUJ_21.jpeg", + "CCCUJ/CCCUJ_22.jpeg", + "CCCUJ/CCCUJ_23.jpeg", + "CCCUJ/CCCUJ_24.jpeg", + "CCCUJ/CCCUJ_25.jpeg", + "CCCUJ/CCCUJ_26.jpeg", + "CCCUJ/CCCUJ_27.jpeg", + "CCCUJ/CCCUJ_28.jpeg", + "CCCUJ/CCCUJ_29.jpeg", + "CCCUJ/CCCUJ_30.jpeg", + "CCCUJ/CCCUJ_31.jpeg", + "CCCUJ/CCCUJ_32.jpeg", + "CCCUJ/CCCUJ_33.jpeg", + "CCCUJ/CCCUJ_34.jpeg", + "CCCUJ/CCCUJ_35.jpeg", + "CCCUJ/CCCUJ_36.jpeg", + "CCCUJ/CCCUJ_37.jpeg", + "CCCUJ/CCCUJ_38.jpeg", + "CCCUJ/CCCUJ_39.jpeg", + "CCCUJ/CCCUJ_40.jpeg", + "CCCUJ/CCCUJ_41.jpeg", + "CCCUJ/CCCUJ_42.jpeg", + "CCCUJ/CCCUJ_43.jpeg", + "CCCUJ/CCCUJ_44.jpeg", + "CCCUJ/CCCUJ_45.jpeg", + "CCCUJ/CCCUJ_46.jpeg", + "CCCUJ/CCCUJ_47.jpeg", + "CCCUJ/CCCUJ_48.jpeg", + "CCCUJ/CCCUJ_49.jpeg", + "CCCUJ/CCCUJ_50.jpeg", + "CCCUJ/CCCUJ_51.jpeg", + "CCCUJ/CCCUJ_52.jpeg", + "CCCUJ/CCCUJ_53.jpeg", + "CCCUJ/CCCUJ_54.jpeg", + "CCCUJ/CCCUJ_55.jpeg", + "CCCUJ/CCCUJ_56.jpeg", + "CCCUJ/CCCUJ_57.jpeg", + "CCCUJ/CCCUJ_58.jpeg", + "CCCUJ/CCCUJ_59.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 56, + "question": "Evaluate the sequence of actions depicted in the images and provide an answer to the associated question. You must choose your answer from the Choice List.\nWhat happened after the person opened the book?\nChoice list: \nA. Took the paper/notebook.\nB. Put down the food.\nC. Threw the clothes.\nD. Took the dish.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "took the paper notebook", + "put down the food", + "threw the clothes", + "took the dish" + ], + "image_quantity_level": "Many", + "image": [ + "NW0KT/NW0KT_0.jpeg", + "NW0KT/NW0KT_1.jpeg", + "NW0KT/NW0KT_2.jpeg", + "NW0KT/NW0KT_3.jpeg", + "NW0KT/NW0KT_4.jpeg", + "NW0KT/NW0KT_5.jpeg", + "NW0KT/NW0KT_6.jpeg", + "NW0KT/NW0KT_7.jpeg", + "NW0KT/NW0KT_8.jpeg", + "NW0KT/NW0KT_9.jpeg", + "NW0KT/NW0KT_10.jpeg", + "NW0KT/NW0KT_11.jpeg", + "NW0KT/NW0KT_12.jpeg", + "NW0KT/NW0KT_13.jpeg", + "NW0KT/NW0KT_14.jpeg", + "NW0KT/NW0KT_15.jpeg", + "NW0KT/NW0KT_16.jpeg", + "NW0KT/NW0KT_17.jpeg", + "NW0KT/NW0KT_18.jpeg", + "NW0KT/NW0KT_19.jpeg", + "NW0KT/NW0KT_20.jpeg", + "NW0KT/NW0KT_21.jpeg", + "NW0KT/NW0KT_22.jpeg", + "NW0KT/NW0KT_23.jpeg", + "NW0KT/NW0KT_24.jpeg", + "NW0KT/NW0KT_25.jpeg", + "NW0KT/NW0KT_26.jpeg", + "NW0KT/NW0KT_27.jpeg", + "NW0KT/NW0KT_28.jpeg", + "NW0KT/NW0KT_29.jpeg", + "NW0KT/NW0KT_30.jpeg", + "NW0KT/NW0KT_31.jpeg", + "NW0KT/NW0KT_32.jpeg", + "NW0KT/NW0KT_33.jpeg", + "NW0KT/NW0KT_34.jpeg", + "NW0KT/NW0KT_35.jpeg", + "NW0KT/NW0KT_36.jpeg", + "NW0KT/NW0KT_37.jpeg", + "NW0KT/NW0KT_38.jpeg", + "NW0KT/NW0KT_39.jpeg", + "NW0KT/NW0KT_40.jpeg", + "NW0KT/NW0KT_41.jpeg", + "NW0KT/NW0KT_42.jpeg", + "NW0KT/NW0KT_43.jpeg", + "NW0KT/NW0KT_44.jpeg", + "NW0KT/NW0KT_45.jpeg", + "NW0KT/NW0KT_46.jpeg", + "NW0KT/NW0KT_47.jpeg", + "NW0KT/NW0KT_48.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 59, + "question": "Assess the progression of actions in the presented graphics and answer the connected query. You must choose your answer from the Choice List.\nWhat happened before the person closed the door?\nChoice list: \nA. Lied on the bed.\nB. Took the cup/glass/bottle.\nC. Put down the blanket.\nD. Closed the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "lied on the bed", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "lied on the bed", + "took the cup glass bottle", + "put down the blanket", + "closed the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "E002I/E002I_0.jpeg", + "E002I/E002I_1.jpeg", + "E002I/E002I_2.jpeg", + "E002I/E002I_3.jpeg", + "E002I/E002I_4.jpeg", + "E002I/E002I_5.jpeg", + "E002I/E002I_6.jpeg", + "E002I/E002I_7.jpeg", + "E002I/E002I_8.jpeg", + "E002I/E002I_9.jpeg", + "E002I/E002I_10.jpeg", + "E002I/E002I_11.jpeg", + "E002I/E002I_12.jpeg", + "E002I/E002I_13.jpeg", + "E002I/E002I_14.jpeg", + "E002I/E002I_15.jpeg", + "E002I/E002I_16.jpeg", + "E002I/E002I_17.jpeg", + "E002I/E002I_18.jpeg", + "E002I/E002I_19.jpeg", + "E002I/E002I_20.jpeg", + "E002I/E002I_21.jpeg", + "E002I/E002I_22.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 105, + "question": "Analyze the series of events in the provided pictures and answer the related inquiry. You must choose your answer from the Choice List.\nWhat happened after the person threw the pillow?\nChoice list: \nA. Tidied up the clothes.\nB. Put down the laptop.\nC. Tidied up the towel.\nD. Washed the table.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "tidied up the clothes", + "put down the laptop", + "tidied up the towel", + "washed the table" + ], + "image_quantity_level": "Medium", + "image": [ + "FL6DF/FL6DF_0.jpeg", + "FL6DF/FL6DF_1.jpeg", + "FL6DF/FL6DF_2.jpeg", + "FL6DF/FL6DF_3.jpeg", + "FL6DF/FL6DF_4.jpeg", + "FL6DF/FL6DF_5.jpeg", + "FL6DF/FL6DF_6.jpeg", + "FL6DF/FL6DF_7.jpeg", + "FL6DF/FL6DF_8.jpeg", + "FL6DF/FL6DF_9.jpeg", + "FL6DF/FL6DF_10.jpeg", + "FL6DF/FL6DF_11.jpeg", + "FL6DF/FL6DF_12.jpeg", + "FL6DF/FL6DF_13.jpeg", + "FL6DF/FL6DF_14.jpeg", + "FL6DF/FL6DF_15.jpeg", + "FL6DF/FL6DF_16.jpeg", + "FL6DF/FL6DF_17.jpeg", + "FL6DF/FL6DF_18.jpeg", + "FL6DF/FL6DF_19.jpeg", + "FL6DF/FL6DF_20.jpeg", + "FL6DF/FL6DF_21.jpeg", + "FL6DF/FL6DF_22.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 75, + "question": "Based on the provided images, answer the question related to the sequence of action You must choose your answer from the Choice List.\nWhat happened after the person held the sandwich?\nChoice list: \nA. Put down the dish.\nB. Put down the food.\nC. Put down the towel.\nD. Closed the refrigerator.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the dish", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the dish", + "put down the food", + "put down the towel", + "closed the refrigerator" + ], + "image_quantity_level": "Many", + "image": [ + "52MV9/52MV9_0.jpeg", + "52MV9/52MV9_1.jpeg", + "52MV9/52MV9_2.jpeg", + "52MV9/52MV9_3.jpeg", + "52MV9/52MV9_4.jpeg", + "52MV9/52MV9_5.jpeg", + "52MV9/52MV9_6.jpeg", + "52MV9/52MV9_7.jpeg", + "52MV9/52MV9_8.jpeg", + "52MV9/52MV9_9.jpeg", + "52MV9/52MV9_10.jpeg", + "52MV9/52MV9_11.jpeg", + "52MV9/52MV9_12.jpeg", + "52MV9/52MV9_13.jpeg", + "52MV9/52MV9_14.jpeg", + "52MV9/52MV9_15.jpeg", + "52MV9/52MV9_16.jpeg", + "52MV9/52MV9_17.jpeg", + "52MV9/52MV9_18.jpeg", + "52MV9/52MV9_19.jpeg", + "52MV9/52MV9_20.jpeg", + "52MV9/52MV9_21.jpeg", + "52MV9/52MV9_22.jpeg", + "52MV9/52MV9_23.jpeg", + "52MV9/52MV9_24.jpeg", + "52MV9/52MV9_25.jpeg", + "52MV9/52MV9_26.jpeg", + "52MV9/52MV9_27.jpeg", + "52MV9/52MV9_28.jpeg", + "52MV9/52MV9_29.jpeg", + "52MV9/52MV9_30.jpeg", + "52MV9/52MV9_31.jpeg", + "52MV9/52MV9_32.jpeg", + "52MV9/52MV9_33.jpeg", + "52MV9/52MV9_34.jpeg", + "52MV9/52MV9_35.jpeg", + "52MV9/52MV9_36.jpeg", + "52MV9/52MV9_37.jpeg", + "52MV9/52MV9_38.jpeg", + "52MV9/52MV9_39.jpeg", + "52MV9/52MV9_40.jpeg", + "52MV9/52MV9_41.jpeg", + "52MV9/52MV9_42.jpeg", + "52MV9/52MV9_43.jpeg", + "52MV9/52MV9_44.jpeg", + "52MV9/52MV9_45.jpeg", + "52MV9/52MV9_46.jpeg", + "52MV9/52MV9_47.jpeg", + "52MV9/52MV9_48.jpeg", + "52MV9/52MV9_49.jpeg", + "52MV9/52MV9_50.jpeg", + "52MV9/52MV9_51.jpeg", + "52MV9/52MV9_52.jpeg", + "52MV9/52MV9_53.jpeg", + "52MV9/52MV9_54.jpeg", + "52MV9/52MV9_55.jpeg", + "52MV9/52MV9_56.jpeg", + "52MV9/52MV9_57.jpeg", + "52MV9/52MV9_58.jpeg", + "52MV9/52MV9_59.jpeg", + "52MV9/52MV9_60.jpeg", + "52MV9/52MV9_61.jpeg", + "52MV9/52MV9_62.jpeg", + "52MV9/52MV9_63.jpeg", + "52MV9/52MV9_64.jpeg", + "52MV9/52MV9_65.jpeg", + "52MV9/52MV9_66.jpeg", + "52MV9/52MV9_67.jpeg", + "52MV9/52MV9_68.jpeg", + "52MV9/52MV9_69.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 79, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person watched at the picture?\nChoice list: \nA. Opened the book.\nB. Lied on the floor.\nC. Washed the clothes.\nD. Took the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "lied on the floor", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "opened the book", + "lied on the floor", + "washed the clothes", + "took the food" + ], + "image_quantity_level": "Many", + "image": [ + "5I4UT/5I4UT_0.jpeg", + "5I4UT/5I4UT_1.jpeg", + "5I4UT/5I4UT_2.jpeg", + "5I4UT/5I4UT_3.jpeg", + "5I4UT/5I4UT_4.jpeg", + "5I4UT/5I4UT_5.jpeg", + "5I4UT/5I4UT_6.jpeg", + "5I4UT/5I4UT_7.jpeg", + "5I4UT/5I4UT_8.jpeg", + "5I4UT/5I4UT_9.jpeg", + "5I4UT/5I4UT_10.jpeg", + "5I4UT/5I4UT_11.jpeg", + "5I4UT/5I4UT_12.jpeg", + "5I4UT/5I4UT_13.jpeg", + "5I4UT/5I4UT_14.jpeg", + "5I4UT/5I4UT_15.jpeg", + "5I4UT/5I4UT_16.jpeg", + "5I4UT/5I4UT_17.jpeg", + "5I4UT/5I4UT_18.jpeg", + "5I4UT/5I4UT_19.jpeg", + "5I4UT/5I4UT_20.jpeg", + "5I4UT/5I4UT_21.jpeg", + "5I4UT/5I4UT_22.jpeg", + "5I4UT/5I4UT_23.jpeg", + "5I4UT/5I4UT_24.jpeg", + "5I4UT/5I4UT_25.jpeg", + "5I4UT/5I4UT_26.jpeg", + "5I4UT/5I4UT_27.jpeg", + "5I4UT/5I4UT_28.jpeg", + "5I4UT/5I4UT_29.jpeg", + "5I4UT/5I4UT_30.jpeg", + "5I4UT/5I4UT_31.jpeg", + "5I4UT/5I4UT_32.jpeg", + "5I4UT/5I4UT_33.jpeg", + "5I4UT/5I4UT_34.jpeg", + "5I4UT/5I4UT_35.jpeg", + "5I4UT/5I4UT_36.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 106, + "question": "Analyze the series of events in the provided pictures and answer the related inquiry. You must choose your answer from the Choice List.\nWhat happened after the person sat at the table?\nChoice list: \nA. Washed the dish.\nB. Threw the broom.\nC. Put down the book.\nD. Took the paper/notebook.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "washed the dish", + "threw the broom", + "put down the book", + "took the paper notebook" + ], + "image_quantity_level": "Medium", + "image": [ + "G87XG/G87XG_0.jpeg", + "G87XG/G87XG_1.jpeg", + "G87XG/G87XG_2.jpeg", + "G87XG/G87XG_3.jpeg", + "G87XG/G87XG_4.jpeg", + "G87XG/G87XG_5.jpeg", + "G87XG/G87XG_6.jpeg", + "G87XG/G87XG_7.jpeg", + "G87XG/G87XG_8.jpeg", + "G87XG/G87XG_9.jpeg", + "G87XG/G87XG_10.jpeg", + "G87XG/G87XG_11.jpeg", + "G87XG/G87XG_12.jpeg", + "G87XG/G87XG_13.jpeg", + "G87XG/G87XG_14.jpeg", + "G87XG/G87XG_15.jpeg", + "G87XG/G87XG_16.jpeg", + "G87XG/G87XG_17.jpeg", + "G87XG/G87XG_18.jpeg", + "G87XG/G87XG_19.jpeg", + "G87XG/G87XG_20.jpeg", + "G87XG/G87XG_21.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 117, + "question": "Based on the provided images, answer the question related to the sequence of action You must choose your answer from the Choice List.\nWhat happened before the person lied on the sofa/couch?\nChoice list: \nA. Took the pillow.\nB. Opened the door.\nC. Took the phone/camera.\nD. Threw the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the pillow", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "took the pillow", + "opened the door", + "took the phone camera", + "threw the box" + ], + "image_quantity_level": "Medium", + "image": [ + "RKGG5/RKGG5_0.jpeg", + "RKGG5/RKGG5_1.jpeg", + "RKGG5/RKGG5_2.jpeg", + "RKGG5/RKGG5_3.jpeg", + "RKGG5/RKGG5_4.jpeg", + "RKGG5/RKGG5_5.jpeg", + "RKGG5/RKGG5_6.jpeg", + "RKGG5/RKGG5_7.jpeg", + "RKGG5/RKGG5_8.jpeg", + "RKGG5/RKGG5_9.jpeg", + "RKGG5/RKGG5_10.jpeg", + "RKGG5/RKGG5_11.jpeg", + "RKGG5/RKGG5_12.jpeg", + "RKGG5/RKGG5_13.jpeg", + "RKGG5/RKGG5_14.jpeg", + "RKGG5/RKGG5_15.jpeg", + "RKGG5/RKGG5_16.jpeg", + "RKGG5/RKGG5_17.jpeg", + "RKGG5/RKGG5_18.jpeg", + "RKGG5/RKGG5_19.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 134, + "question": "Based on the provided images, answer the question related to the sequence of action You must choose your answer from the Choice List.\nWhat happened before the person closed the book?\nChoice list: \nA. Put down the phone/camera.\nB. Ate the medicine.\nC. Put down the blanket.\nD. Tidied up the table.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down the phone camera", + "ate the medicine", + "put down the blanket", + "tidied up the table" + ], + "image_quantity_level": "Medium", + "image": [ + "GMMVC/GMMVC_0.jpeg", + "GMMVC/GMMVC_1.jpeg", + "GMMVC/GMMVC_2.jpeg", + "GMMVC/GMMVC_3.jpeg", + "GMMVC/GMMVC_4.jpeg", + "GMMVC/GMMVC_5.jpeg", + "GMMVC/GMMVC_6.jpeg", + "GMMVC/GMMVC_7.jpeg", + "GMMVC/GMMVC_8.jpeg", + "GMMVC/GMMVC_9.jpeg", + "GMMVC/GMMVC_10.jpeg", + "GMMVC/GMMVC_11.jpeg", + "GMMVC/GMMVC_12.jpeg", + "GMMVC/GMMVC_13.jpeg", + "GMMVC/GMMVC_14.jpeg", + "GMMVC/GMMVC_15.jpeg", + "GMMVC/GMMVC_16.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 139, + "question": "Evaluate the sequence of actions depicted in the images and provide an answer to the associated question. You must choose your answer from the Choice List.\nWhat happened after the person drank from the cup/glass/bottle?\nChoice list: \nA. Took the shoe.\nB. Washed the dish.\nC. Took the bag.\nD. Took the phone/camera.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "took the shoe", + "washed the dish", + "took the bag", + "took the phone camera" + ], + "image_quantity_level": "Medium", + "image": [ + "TIPFG/TIPFG_0.jpeg", + "TIPFG/TIPFG_1.jpeg", + "TIPFG/TIPFG_2.jpeg", + "TIPFG/TIPFG_3.jpeg", + "TIPFG/TIPFG_4.jpeg", + "TIPFG/TIPFG_5.jpeg", + "TIPFG/TIPFG_6.jpeg", + "TIPFG/TIPFG_7.jpeg", + "TIPFG/TIPFG_8.jpeg", + "TIPFG/TIPFG_9.jpeg", + "TIPFG/TIPFG_10.jpeg", + "TIPFG/TIPFG_11.jpeg", + "TIPFG/TIPFG_12.jpeg", + "TIPFG/TIPFG_13.jpeg", + "TIPFG/TIPFG_14.jpeg", + "TIPFG/TIPFG_15.jpeg", + "TIPFG/TIPFG_16.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 135, + "question": "Review the timeline of actions in the supplied imagery and answer the associated inquiry. You must choose your answer from the Choice List.\nWhat happened after the person took the shoe?\nChoice list: \nA. Closed the window.\nB. Sat on the sofa/couch.\nC. Closed the closet/cabinet.\nD. Opened the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sat on the sofa couch", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "closed the window", + "sat on the sofa couch", + "closed the closet cabinet", + "opened the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "86GSE/86GSE_0.jpeg", + "86GSE/86GSE_1.jpeg", + "86GSE/86GSE_2.jpeg", + "86GSE/86GSE_3.jpeg", + "86GSE/86GSE_4.jpeg", + "86GSE/86GSE_5.jpeg", + "86GSE/86GSE_6.jpeg", + "86GSE/86GSE_7.jpeg", + "86GSE/86GSE_8.jpeg", + "86GSE/86GSE_9.jpeg", + "86GSE/86GSE_10.jpeg", + "86GSE/86GSE_11.jpeg", + "86GSE/86GSE_12.jpeg", + "86GSE/86GSE_13.jpeg", + "86GSE/86GSE_14.jpeg", + "86GSE/86GSE_15.jpeg", + "86GSE/86GSE_16.jpeg", + "86GSE/86GSE_17.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 163, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened after the person put down the food?\nChoice list: \nA. Opened the closet/cabinet.\nB. Put down the paper/notebook.\nC. Closed the refrigerator.\nD. Took the towel.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "closed the refrigerator", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "opened the closet cabinet", + "put down the paper notebook", + "closed the refrigerator", + "took the towel" + ], + "image_quantity_level": "Many", + "image": [ + "S3TZ1/S3TZ1_0.jpeg", + "S3TZ1/S3TZ1_1.jpeg", + "S3TZ1/S3TZ1_2.jpeg", + "S3TZ1/S3TZ1_3.jpeg", + "S3TZ1/S3TZ1_4.jpeg", + "S3TZ1/S3TZ1_5.jpeg", + "S3TZ1/S3TZ1_6.jpeg", + "S3TZ1/S3TZ1_7.jpeg", + "S3TZ1/S3TZ1_8.jpeg", + "S3TZ1/S3TZ1_9.jpeg", + "S3TZ1/S3TZ1_10.jpeg", + "S3TZ1/S3TZ1_11.jpeg", + "S3TZ1/S3TZ1_12.jpeg", + "S3TZ1/S3TZ1_13.jpeg", + "S3TZ1/S3TZ1_14.jpeg", + "S3TZ1/S3TZ1_15.jpeg", + "S3TZ1/S3TZ1_16.jpeg", + "S3TZ1/S3TZ1_17.jpeg", + "S3TZ1/S3TZ1_18.jpeg", + "S3TZ1/S3TZ1_19.jpeg", + "S3TZ1/S3TZ1_20.jpeg", + "S3TZ1/S3TZ1_21.jpeg", + "S3TZ1/S3TZ1_22.jpeg", + "S3TZ1/S3TZ1_23.jpeg", + "S3TZ1/S3TZ1_24.jpeg", + "S3TZ1/S3TZ1_25.jpeg", + "S3TZ1/S3TZ1_26.jpeg", + "S3TZ1/S3TZ1_27.jpeg", + "S3TZ1/S3TZ1_28.jpeg", + "S3TZ1/S3TZ1_29.jpeg", + "S3TZ1/S3TZ1_30.jpeg", + "S3TZ1/S3TZ1_31.jpeg", + "S3TZ1/S3TZ1_32.jpeg", + "S3TZ1/S3TZ1_33.jpeg", + "S3TZ1/S3TZ1_34.jpeg", + "S3TZ1/S3TZ1_35.jpeg", + "S3TZ1/S3TZ1_36.jpeg", + "S3TZ1/S3TZ1_37.jpeg", + "S3TZ1/S3TZ1_38.jpeg", + "S3TZ1/S3TZ1_39.jpeg", + "S3TZ1/S3TZ1_40.jpeg", + "S3TZ1/S3TZ1_41.jpeg", + "S3TZ1/S3TZ1_42.jpeg", + "S3TZ1/S3TZ1_43.jpeg", + "S3TZ1/S3TZ1_44.jpeg", + "S3TZ1/S3TZ1_45.jpeg", + "S3TZ1/S3TZ1_46.jpeg", + "S3TZ1/S3TZ1_47.jpeg", + "S3TZ1/S3TZ1_48.jpeg", + "S3TZ1/S3TZ1_49.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 196, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person took the dish?\nChoice list: \nA. Put down the paper/notebook.\nB. Opened the laptop.\nC. Lied on the floor.\nD. Opened the refrigerator.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the paper notebook", + "opened the laptop", + "lied on the floor", + "opened the refrigerator" + ], + "image_quantity_level": "Medium", + "image": [ + "UY0SK/UY0SK_0.jpeg", + "UY0SK/UY0SK_1.jpeg", + "UY0SK/UY0SK_2.jpeg", + "UY0SK/UY0SK_3.jpeg", + "UY0SK/UY0SK_4.jpeg", + "UY0SK/UY0SK_5.jpeg", + "UY0SK/UY0SK_6.jpeg", + "UY0SK/UY0SK_7.jpeg", + "UY0SK/UY0SK_8.jpeg", + "UY0SK/UY0SK_9.jpeg", + "UY0SK/UY0SK_10.jpeg", + "UY0SK/UY0SK_11.jpeg", + "UY0SK/UY0SK_12.jpeg", + "UY0SK/UY0SK_13.jpeg", + "UY0SK/UY0SK_14.jpeg", + "UY0SK/UY0SK_15.jpeg", + "UY0SK/UY0SK_16.jpeg", + "UY0SK/UY0SK_17.jpeg", + "UY0SK/UY0SK_18.jpeg", + "UY0SK/UY0SK_19.jpeg", + "UY0SK/UY0SK_20.jpeg", + "UY0SK/UY0SK_21.jpeg", + "UY0SK/UY0SK_22.jpeg", + "UY0SK/UY0SK_23.jpeg", + "UY0SK/UY0SK_24.jpeg", + "UY0SK/UY0SK_25.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 131, + "question": "Assess the progression of actions in the presented graphics and answer the connected query. You must choose your answer from the Choice List.\nWhat happened after the person held the food?\nChoice list: \nA. Took the sandwich.\nB. Took the phone/camera.\nC. Ate the medicine.\nD. Sat on the sofa/couch.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the sandwich", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "took the sandwich", + "took the phone camera", + "ate the medicine", + "sat on the sofa couch" + ], + "image_quantity_level": "Many", + "image": [ + "9B93K/9B93K_0.jpeg", + "9B93K/9B93K_1.jpeg", + "9B93K/9B93K_2.jpeg", + "9B93K/9B93K_3.jpeg", + "9B93K/9B93K_4.jpeg", + "9B93K/9B93K_5.jpeg", + "9B93K/9B93K_6.jpeg", + "9B93K/9B93K_7.jpeg", + "9B93K/9B93K_8.jpeg", + "9B93K/9B93K_9.jpeg", + "9B93K/9B93K_10.jpeg", + "9B93K/9B93K_11.jpeg", + "9B93K/9B93K_12.jpeg", + "9B93K/9B93K_13.jpeg", + "9B93K/9B93K_14.jpeg", + "9B93K/9B93K_15.jpeg", + "9B93K/9B93K_16.jpeg", + "9B93K/9B93K_17.jpeg", + "9B93K/9B93K_18.jpeg", + "9B93K/9B93K_19.jpeg", + "9B93K/9B93K_20.jpeg", + "9B93K/9B93K_21.jpeg", + "9B93K/9B93K_22.jpeg", + "9B93K/9B93K_23.jpeg", + "9B93K/9B93K_24.jpeg", + "9B93K/9B93K_25.jpeg", + "9B93K/9B93K_26.jpeg", + "9B93K/9B93K_27.jpeg", + "9B93K/9B93K_28.jpeg", + "9B93K/9B93K_29.jpeg", + "9B93K/9B93K_30.jpeg", + "9B93K/9B93K_31.jpeg", + "9B93K/9B93K_32.jpeg", + "9B93K/9B93K_33.jpeg", + "9B93K/9B93K_34.jpeg", + "9B93K/9B93K_35.jpeg", + "9B93K/9B93K_36.jpeg", + "9B93K/9B93K_37.jpeg", + "9B93K/9B93K_38.jpeg", + "9B93K/9B93K_39.jpeg", + "9B93K/9B93K_40.jpeg", + "9B93K/9B93K_41.jpeg", + "9B93K/9B93K_42.jpeg", + "9B93K/9B93K_43.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 45, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened after the person held the food?\nChoice list: \nA. Took the blanket.\nB. Tidied up the table.\nC. Took the book.\nD. Threw the broom.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "took the blanket", + "tidied up the table", + "took the book", + "threw the broom" + ], + "image_quantity_level": "Medium", + "image": [ + "983K8/983K8_0.jpeg", + "983K8/983K8_1.jpeg", + "983K8/983K8_2.jpeg", + "983K8/983K8_3.jpeg", + "983K8/983K8_4.jpeg", + "983K8/983K8_5.jpeg", + "983K8/983K8_6.jpeg", + "983K8/983K8_7.jpeg", + "983K8/983K8_8.jpeg", + "983K8/983K8_9.jpeg", + "983K8/983K8_10.jpeg", + "983K8/983K8_11.jpeg", + "983K8/983K8_12.jpeg", + "983K8/983K8_13.jpeg", + "983K8/983K8_14.jpeg", + "983K8/983K8_15.jpeg", + "983K8/983K8_16.jpeg", + "983K8/983K8_17.jpeg", + "983K8/983K8_18.jpeg", + "983K8/983K8_19.jpeg", + "983K8/983K8_20.jpeg", + "983K8/983K8_21.jpeg", + "983K8/983K8_22.jpeg", + "983K8/983K8_23.jpeg", + "983K8/983K8_24.jpeg", + "983K8/983K8_25.jpeg", + "983K8/983K8_26.jpeg", + "983K8/983K8_27.jpeg", + "983K8/983K8_28.jpeg", + "983K8/983K8_29.jpeg", + "983K8/983K8_30.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 46, + "question": "Observe the continuum of events in the provided pictorials and reply to the pertinent question. You must choose your answer from the Choice List.\nWhat happened before the person washed the dish?\nChoice list: \nA. Washed the table.\nB. Took the cup/glass/bottle.\nC. Tidied up the towel.\nD. Put down the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "washed the table", + "took the cup glass bottle", + "tidied up the towel", + "put down the box" + ], + "image_quantity_level": "Medium", + "image": [ + "VO45S/VO45S_0.jpeg", + "VO45S/VO45S_1.jpeg", + "VO45S/VO45S_2.jpeg", + "VO45S/VO45S_3.jpeg", + "VO45S/VO45S_4.jpeg", + "VO45S/VO45S_5.jpeg", + "VO45S/VO45S_6.jpeg", + "VO45S/VO45S_7.jpeg", + "VO45S/VO45S_8.jpeg", + "VO45S/VO45S_9.jpeg", + "VO45S/VO45S_10.jpeg", + "VO45S/VO45S_11.jpeg", + "VO45S/VO45S_12.jpeg", + "VO45S/VO45S_13.jpeg", + "VO45S/VO45S_14.jpeg", + "VO45S/VO45S_15.jpeg", + "VO45S/VO45S_16.jpeg", + "VO45S/VO45S_17.jpeg", + "VO45S/VO45S_18.jpeg", + "VO45S/VO45S_19.jpeg", + "VO45S/VO45S_20.jpeg", + "VO45S/VO45S_21.jpeg", + "VO45S/VO45S_22.jpeg", + "VO45S/VO45S_23.jpeg", + "VO45S/VO45S_24.jpeg", + "VO45S/VO45S_25.jpeg", + "VO45S/VO45S_26.jpeg", + "VO45S/VO45S_27.jpeg", + "VO45S/VO45S_28.jpeg", + "VO45S/VO45S_29.jpeg", + "VO45S/VO45S_30.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 55, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened after the person put down the clothes?\nChoice list: \nA. Tidied up the blanket.\nB. Threw the shoe.\nC. Opened the door.\nD. Put down the paper/notebook.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "threw the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "tidied up the blanket", + "threw the shoe", + "opened the door", + "put down the paper notebook" + ], + "image_quantity_level": "Medium", + "image": [ + "NV4FC/NV4FC_0.jpeg", + "NV4FC/NV4FC_1.jpeg", + "NV4FC/NV4FC_2.jpeg", + "NV4FC/NV4FC_3.jpeg", + "NV4FC/NV4FC_4.jpeg", + "NV4FC/NV4FC_5.jpeg", + "NV4FC/NV4FC_6.jpeg", + "NV4FC/NV4FC_7.jpeg", + "NV4FC/NV4FC_8.jpeg", + "NV4FC/NV4FC_9.jpeg", + "NV4FC/NV4FC_10.jpeg", + "NV4FC/NV4FC_11.jpeg", + "NV4FC/NV4FC_12.jpeg", + "NV4FC/NV4FC_13.jpeg", + "NV4FC/NV4FC_14.jpeg", + "NV4FC/NV4FC_15.jpeg", + "NV4FC/NV4FC_16.jpeg", + "NV4FC/NV4FC_17.jpeg", + "NV4FC/NV4FC_18.jpeg", + "NV4FC/NV4FC_19.jpeg", + "NV4FC/NV4FC_20.jpeg", + "NV4FC/NV4FC_21.jpeg", + "NV4FC/NV4FC_22.jpeg", + "NV4FC/NV4FC_23.jpeg", + "NV4FC/NV4FC_24.jpeg", + "NV4FC/NV4FC_25.jpeg", + "NV4FC/NV4FC_26.jpeg", + "NV4FC/NV4FC_27.jpeg", + "NV4FC/NV4FC_28.jpeg", + "NV4FC/NV4FC_29.jpeg", + "NV4FC/NV4FC_30.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 68, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person held the phone/camera?\nChoice list: \nA. Put down the broom.\nB. Threw the towel.\nC. Opened the bag.\nD. Threw the pillow.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "threw the pillow", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "put down the broom", + "threw the towel", + "opened the bag", + "threw the pillow" + ], + "image_quantity_level": "Medium", + "image": [ + "TBRZ5/TBRZ5_0.jpeg", + "TBRZ5/TBRZ5_1.jpeg", + "TBRZ5/TBRZ5_2.jpeg", + "TBRZ5/TBRZ5_3.jpeg", + "TBRZ5/TBRZ5_4.jpeg", + "TBRZ5/TBRZ5_5.jpeg", + "TBRZ5/TBRZ5_6.jpeg", + "TBRZ5/TBRZ5_7.jpeg", + "TBRZ5/TBRZ5_8.jpeg", + "TBRZ5/TBRZ5_9.jpeg", + "TBRZ5/TBRZ5_10.jpeg", + "TBRZ5/TBRZ5_11.jpeg", + "TBRZ5/TBRZ5_12.jpeg", + "TBRZ5/TBRZ5_13.jpeg", + "TBRZ5/TBRZ5_14.jpeg", + "TBRZ5/TBRZ5_15.jpeg", + "TBRZ5/TBRZ5_16.jpeg", + "TBRZ5/TBRZ5_17.jpeg", + "TBRZ5/TBRZ5_18.jpeg", + "TBRZ5/TBRZ5_19.jpeg", + "TBRZ5/TBRZ5_20.jpeg", + "TBRZ5/TBRZ5_21.jpeg", + "TBRZ5/TBRZ5_22.jpeg", + "TBRZ5/TBRZ5_23.jpeg", + "TBRZ5/TBRZ5_24.jpeg", + "TBRZ5/TBRZ5_25.jpeg", + "TBRZ5/TBRZ5_26.jpeg", + "TBRZ5/TBRZ5_27.jpeg", + "TBRZ5/TBRZ5_28.jpeg", + "TBRZ5/TBRZ5_29.jpeg", + "TBRZ5/TBRZ5_30.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 73, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened after the person opened the door?\nChoice list: \nA. Put down the pillow.\nB. Tidied up the closet/cabinet.\nC. Sat on the bed.\nD. Closed the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "tidied up the closet cabinet", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the pillow", + "tidied up the closet cabinet", + "sat on the bed", + "closed the box" + ], + "image_quantity_level": "Medium", + "image": [ + "T2J3M/T2J3M_0.jpeg", + "T2J3M/T2J3M_1.jpeg", + "T2J3M/T2J3M_2.jpeg", + "T2J3M/T2J3M_3.jpeg", + "T2J3M/T2J3M_4.jpeg", + "T2J3M/T2J3M_5.jpeg", + "T2J3M/T2J3M_6.jpeg", + "T2J3M/T2J3M_7.jpeg", + "T2J3M/T2J3M_8.jpeg", + "T2J3M/T2J3M_9.jpeg", + "T2J3M/T2J3M_10.jpeg", + "T2J3M/T2J3M_11.jpeg", + "T2J3M/T2J3M_12.jpeg", + "T2J3M/T2J3M_13.jpeg", + "T2J3M/T2J3M_14.jpeg", + "T2J3M/T2J3M_15.jpeg", + "T2J3M/T2J3M_16.jpeg", + "T2J3M/T2J3M_17.jpeg", + "T2J3M/T2J3M_18.jpeg", + "T2J3M/T2J3M_19.jpeg", + "T2J3M/T2J3M_20.jpeg", + "T2J3M/T2J3M_21.jpeg", + "T2J3M/T2J3M_22.jpeg", + "T2J3M/T2J3M_23.jpeg", + "T2J3M/T2J3M_24.jpeg", + "T2J3M/T2J3M_25.jpeg", + "T2J3M/T2J3M_26.jpeg", + "T2J3M/T2J3M_27.jpeg", + "T2J3M/T2J3M_28.jpeg", + "T2J3M/T2J3M_29.jpeg", + "T2J3M/T2J3M_30.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 74, + "question": "Review the timeline of actions in the supplied imagery and answer the associated inquiry. You must choose your answer from the Choice List.\nWhat happened after the person put down the bag?\nChoice list: \nA. Closed the window.\nB. Closed the closet/cabinet.\nC. Sat on the bed.\nD. Closed the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sat on the bed", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "closed the window", + "closed the closet cabinet", + "sat on the bed", + "closed the book" + ], + "image_quantity_level": "Medium", + "image": [ + "BE51K/BE51K_0.jpeg", + "BE51K/BE51K_1.jpeg", + "BE51K/BE51K_2.jpeg", + "BE51K/BE51K_3.jpeg", + "BE51K/BE51K_4.jpeg", + "BE51K/BE51K_5.jpeg", + "BE51K/BE51K_6.jpeg", + "BE51K/BE51K_7.jpeg", + "BE51K/BE51K_8.jpeg", + "BE51K/BE51K_9.jpeg", + "BE51K/BE51K_10.jpeg", + "BE51K/BE51K_11.jpeg", + "BE51K/BE51K_12.jpeg", + "BE51K/BE51K_13.jpeg", + "BE51K/BE51K_14.jpeg", + "BE51K/BE51K_15.jpeg", + "BE51K/BE51K_16.jpeg", + "BE51K/BE51K_17.jpeg", + "BE51K/BE51K_18.jpeg", + "BE51K/BE51K_19.jpeg", + "BE51K/BE51K_20.jpeg", + "BE51K/BE51K_21.jpeg", + "BE51K/BE51K_22.jpeg", + "BE51K/BE51K_23.jpeg", + "BE51K/BE51K_24.jpeg", + "BE51K/BE51K_25.jpeg", + "BE51K/BE51K_26.jpeg", + "BE51K/BE51K_27.jpeg", + "BE51K/BE51K_28.jpeg", + "BE51K/BE51K_29.jpeg", + "BE51K/BE51K_30.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 87, + "question": "Based on the provided images, answer the question related to the sequence of action You must choose your answer from the Choice List.\nWhat happened after the person sat on the floor?\nChoice list: \nA. Took the bag.\nB. Took the phone/camera.\nC. Took the blanket.\nD. Took the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "took the bag", + "took the phone camera", + "took the blanket", + "took the box" + ], + "image_quantity_level": "Many", + "image": [ + "CR5G1/CR5G1_0.jpeg", + "CR5G1/CR5G1_1.jpeg", + "CR5G1/CR5G1_2.jpeg", + "CR5G1/CR5G1_3.jpeg", + "CR5G1/CR5G1_4.jpeg", + "CR5G1/CR5G1_5.jpeg", + "CR5G1/CR5G1_6.jpeg", + "CR5G1/CR5G1_7.jpeg", + "CR5G1/CR5G1_8.jpeg", + "CR5G1/CR5G1_9.jpeg", + "CR5G1/CR5G1_10.jpeg", + "CR5G1/CR5G1_11.jpeg", + "CR5G1/CR5G1_12.jpeg", + "CR5G1/CR5G1_13.jpeg", + "CR5G1/CR5G1_14.jpeg", + "CR5G1/CR5G1_15.jpeg", + "CR5G1/CR5G1_16.jpeg", + "CR5G1/CR5G1_17.jpeg", + "CR5G1/CR5G1_18.jpeg", + "CR5G1/CR5G1_19.jpeg", + "CR5G1/CR5G1_20.jpeg", + "CR5G1/CR5G1_21.jpeg", + "CR5G1/CR5G1_22.jpeg", + "CR5G1/CR5G1_23.jpeg", + "CR5G1/CR5G1_24.jpeg", + "CR5G1/CR5G1_25.jpeg", + "CR5G1/CR5G1_26.jpeg", + "CR5G1/CR5G1_27.jpeg", + "CR5G1/CR5G1_28.jpeg", + "CR5G1/CR5G1_29.jpeg", + "CR5G1/CR5G1_30.jpeg", + "CR5G1/CR5G1_31.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 107, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened after the person sat on the sofa/couch?\nChoice list: \nA. Ate the sandwich.\nB. Put down the pillow.\nC. Washed the clothes.\nD. Took the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "ate the sandwich", + "put down the pillow", + "washed the clothes", + "took the bag" + ], + "image_quantity_level": "Many", + "image": [ + "GKH0F/GKH0F_0.jpeg", + "GKH0F/GKH0F_1.jpeg", + "GKH0F/GKH0F_2.jpeg", + "GKH0F/GKH0F_3.jpeg", + "GKH0F/GKH0F_4.jpeg", + "GKH0F/GKH0F_5.jpeg", + "GKH0F/GKH0F_6.jpeg", + "GKH0F/GKH0F_7.jpeg", + "GKH0F/GKH0F_8.jpeg", + "GKH0F/GKH0F_9.jpeg", + "GKH0F/GKH0F_10.jpeg", + "GKH0F/GKH0F_11.jpeg", + "GKH0F/GKH0F_12.jpeg", + "GKH0F/GKH0F_13.jpeg", + "GKH0F/GKH0F_14.jpeg", + "GKH0F/GKH0F_15.jpeg", + "GKH0F/GKH0F_16.jpeg", + "GKH0F/GKH0F_17.jpeg", + "GKH0F/GKH0F_18.jpeg", + "GKH0F/GKH0F_19.jpeg", + "GKH0F/GKH0F_20.jpeg", + "GKH0F/GKH0F_21.jpeg", + "GKH0F/GKH0F_22.jpeg", + "GKH0F/GKH0F_23.jpeg", + "GKH0F/GKH0F_24.jpeg", + "GKH0F/GKH0F_25.jpeg", + "GKH0F/GKH0F_26.jpeg", + "GKH0F/GKH0F_27.jpeg", + "GKH0F/GKH0F_28.jpeg", + "GKH0F/GKH0F_29.jpeg", + "GKH0F/GKH0F_30.jpeg", + "GKH0F/GKH0F_31.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 108, + "question": "Assess the progression of actions in the presented graphics and answer the connected query. You must choose your answer from the Choice List.\nWhat happened before the person put down the clothes?\nChoice list: \nA. Closed the door.\nB. Took the blanket.\nC. Took the paper/notebook.\nD. Closed the laptop.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "closed the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "closed the door", + "took the blanket", + "took the paper notebook", + "closed the laptop" + ], + "image_quantity_level": "Many", + "image": [ + "JBJGX/JBJGX_0.jpeg", + "JBJGX/JBJGX_1.jpeg", + "JBJGX/JBJGX_2.jpeg", + "JBJGX/JBJGX_3.jpeg", + "JBJGX/JBJGX_4.jpeg", + "JBJGX/JBJGX_5.jpeg", + "JBJGX/JBJGX_6.jpeg", + "JBJGX/JBJGX_7.jpeg", + "JBJGX/JBJGX_8.jpeg", + "JBJGX/JBJGX_9.jpeg", + "JBJGX/JBJGX_10.jpeg", + "JBJGX/JBJGX_11.jpeg", + "JBJGX/JBJGX_12.jpeg", + "JBJGX/JBJGX_13.jpeg", + "JBJGX/JBJGX_14.jpeg", + "JBJGX/JBJGX_15.jpeg", + "JBJGX/JBJGX_16.jpeg", + "JBJGX/JBJGX_17.jpeg", + "JBJGX/JBJGX_18.jpeg", + "JBJGX/JBJGX_19.jpeg", + "JBJGX/JBJGX_20.jpeg", + "JBJGX/JBJGX_21.jpeg", + "JBJGX/JBJGX_22.jpeg", + "JBJGX/JBJGX_23.jpeg", + "JBJGX/JBJGX_24.jpeg", + "JBJGX/JBJGX_25.jpeg", + "JBJGX/JBJGX_26.jpeg", + "JBJGX/JBJGX_27.jpeg", + "JBJGX/JBJGX_28.jpeg", + "JBJGX/JBJGX_29.jpeg", + "JBJGX/JBJGX_30.jpeg", + "JBJGX/JBJGX_31.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 78, + "question": "Inspect the flow of events in the presented pictures and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened after the person washed the clothes?\nChoice list: \nA. Put down the blanket.\nB. Took the clothes.\nC. Took the cup/glass/bottle.\nD. Threw the pillow.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the blanket", + "took the clothes", + "took the cup glass bottle", + "threw the pillow" + ], + "image_quantity_level": "Medium", + "image": [ + "IFQS1/IFQS1_0.jpeg", + "IFQS1/IFQS1_1.jpeg", + "IFQS1/IFQS1_2.jpeg", + "IFQS1/IFQS1_3.jpeg", + "IFQS1/IFQS1_4.jpeg", + "IFQS1/IFQS1_5.jpeg", + "IFQS1/IFQS1_6.jpeg", + "IFQS1/IFQS1_7.jpeg", + "IFQS1/IFQS1_8.jpeg", + "IFQS1/IFQS1_9.jpeg", + "IFQS1/IFQS1_10.jpeg", + "IFQS1/IFQS1_11.jpeg", + "IFQS1/IFQS1_12.jpeg", + "IFQS1/IFQS1_13.jpeg", + "IFQS1/IFQS1_14.jpeg", + "IFQS1/IFQS1_15.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 109, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened after the person washed the table?\nChoice list: \nA. Threw the clothes.\nB. Washed the cup/glass/bottle.\nC. Took the phone/camera.\nD. Put down the towel.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the towel", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "threw the clothes", + "washed the cup glass bottle", + "took the phone camera", + "put down the towel" + ], + "image_quantity_level": "Many", + "image": [ + "EO6OI/EO6OI_0.jpeg", + "EO6OI/EO6OI_1.jpeg", + "EO6OI/EO6OI_2.jpeg", + "EO6OI/EO6OI_3.jpeg", + "EO6OI/EO6OI_4.jpeg", + "EO6OI/EO6OI_5.jpeg", + "EO6OI/EO6OI_6.jpeg", + "EO6OI/EO6OI_7.jpeg", + "EO6OI/EO6OI_8.jpeg", + "EO6OI/EO6OI_9.jpeg", + "EO6OI/EO6OI_10.jpeg", + "EO6OI/EO6OI_11.jpeg", + "EO6OI/EO6OI_12.jpeg", + "EO6OI/EO6OI_13.jpeg", + "EO6OI/EO6OI_14.jpeg", + "EO6OI/EO6OI_15.jpeg", + "EO6OI/EO6OI_16.jpeg", + "EO6OI/EO6OI_17.jpeg", + "EO6OI/EO6OI_18.jpeg", + "EO6OI/EO6OI_19.jpeg", + "EO6OI/EO6OI_20.jpeg", + "EO6OI/EO6OI_21.jpeg", + "EO6OI/EO6OI_22.jpeg", + "EO6OI/EO6OI_23.jpeg", + "EO6OI/EO6OI_24.jpeg", + "EO6OI/EO6OI_25.jpeg", + "EO6OI/EO6OI_26.jpeg", + "EO6OI/EO6OI_27.jpeg", + "EO6OI/EO6OI_28.jpeg", + "EO6OI/EO6OI_29.jpeg", + "EO6OI/EO6OI_30.jpeg", + "EO6OI/EO6OI_31.jpeg", + "EO6OI/EO6OI_32.jpeg", + "EO6OI/EO6OI_33.jpeg", + "EO6OI/EO6OI_34.jpeg", + "EO6OI/EO6OI_35.jpeg", + "EO6OI/EO6OI_36.jpeg", + "EO6OI/EO6OI_37.jpeg", + "EO6OI/EO6OI_38.jpeg", + "EO6OI/EO6OI_39.jpeg", + "EO6OI/EO6OI_40.jpeg", + "EO6OI/EO6OI_41.jpeg", + "EO6OI/EO6OI_42.jpeg", + "EO6OI/EO6OI_43.jpeg", + "EO6OI/EO6OI_44.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 63, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened after the person held the dish?\nChoice list: \nA. Ate the medicine.\nB. Took the towel.\nC. Took the phone/camera.\nD. Took the paper/notebook.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "ate the medicine", + "took the towel", + "took the phone camera", + "took the paper notebook" + ], + "image_quantity_level": "Many", + "image": [ + "3IRHH/3IRHH_0.jpeg", + "3IRHH/3IRHH_1.jpeg", + "3IRHH/3IRHH_2.jpeg", + "3IRHH/3IRHH_3.jpeg", + "3IRHH/3IRHH_4.jpeg", + "3IRHH/3IRHH_5.jpeg", + "3IRHH/3IRHH_6.jpeg", + "3IRHH/3IRHH_7.jpeg", + "3IRHH/3IRHH_8.jpeg", + "3IRHH/3IRHH_9.jpeg", + "3IRHH/3IRHH_10.jpeg", + "3IRHH/3IRHH_11.jpeg", + "3IRHH/3IRHH_12.jpeg", + "3IRHH/3IRHH_13.jpeg", + "3IRHH/3IRHH_14.jpeg", + "3IRHH/3IRHH_15.jpeg", + "3IRHH/3IRHH_16.jpeg", + "3IRHH/3IRHH_17.jpeg", + "3IRHH/3IRHH_18.jpeg", + "3IRHH/3IRHH_19.jpeg", + "3IRHH/3IRHH_20.jpeg", + "3IRHH/3IRHH_21.jpeg", + "3IRHH/3IRHH_22.jpeg", + "3IRHH/3IRHH_23.jpeg", + "3IRHH/3IRHH_24.jpeg", + "3IRHH/3IRHH_25.jpeg", + "3IRHH/3IRHH_26.jpeg", + "3IRHH/3IRHH_27.jpeg", + "3IRHH/3IRHH_28.jpeg", + "3IRHH/3IRHH_29.jpeg", + "3IRHH/3IRHH_30.jpeg", + "3IRHH/3IRHH_31.jpeg", + "3IRHH/3IRHH_32.jpeg", + "3IRHH/3IRHH_33.jpeg", + "3IRHH/3IRHH_34.jpeg", + "3IRHH/3IRHH_35.jpeg", + "3IRHH/3IRHH_36.jpeg", + "3IRHH/3IRHH_37.jpeg", + "3IRHH/3IRHH_38.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 57, + "question": "Review the timeline of actions in the supplied imagery and answer the associated inquiry. You must choose your answer from the Choice List.\nWhat happened after the person sat on the floor?\nChoice list: \nA. Put down the book.\nB. Opened the bag.\nC. Washed the clothes.\nD. Took the picture.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "opened the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the book", + "opened the bag", + "washed the clothes", + "took the picture" + ], + "image_quantity_level": "Medium", + "image": [ + "SUI1X/SUI1X_0.jpeg", + "SUI1X/SUI1X_1.jpeg", + "SUI1X/SUI1X_2.jpeg", + "SUI1X/SUI1X_3.jpeg", + "SUI1X/SUI1X_4.jpeg", + "SUI1X/SUI1X_5.jpeg", + "SUI1X/SUI1X_6.jpeg", + "SUI1X/SUI1X_7.jpeg", + "SUI1X/SUI1X_8.jpeg", + "SUI1X/SUI1X_9.jpeg", + "SUI1X/SUI1X_10.jpeg", + "SUI1X/SUI1X_11.jpeg", + "SUI1X/SUI1X_12.jpeg", + "SUI1X/SUI1X_13.jpeg", + "SUI1X/SUI1X_14.jpeg", + "SUI1X/SUI1X_15.jpeg", + "SUI1X/SUI1X_16.jpeg", + "SUI1X/SUI1X_17.jpeg", + "SUI1X/SUI1X_18.jpeg", + "SUI1X/SUI1X_19.jpeg", + "SUI1X/SUI1X_20.jpeg", + "SUI1X/SUI1X_21.jpeg", + "SUI1X/SUI1X_22.jpeg", + "SUI1X/SUI1X_23.jpeg", + "SUI1X/SUI1X_24.jpeg", + "SUI1X/SUI1X_25.jpeg", + "SUI1X/SUI1X_26.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 60, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened after the person closed the door?\nChoice list: \nA. Took the food.\nB. Put down the phone/camera.\nC. Threw the book.\nD. Put down the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "took the food", + "put down the phone camera", + "threw the book", + "put down the book" + ], + "image_quantity_level": "Medium", + "image": [ + "IZ2XX/IZ2XX_0.jpeg", + "IZ2XX/IZ2XX_1.jpeg", + "IZ2XX/IZ2XX_2.jpeg", + "IZ2XX/IZ2XX_3.jpeg", + "IZ2XX/IZ2XX_4.jpeg", + "IZ2XX/IZ2XX_5.jpeg", + "IZ2XX/IZ2XX_6.jpeg", + "IZ2XX/IZ2XX_7.jpeg", + "IZ2XX/IZ2XX_8.jpeg", + "IZ2XX/IZ2XX_9.jpeg", + "IZ2XX/IZ2XX_10.jpeg", + "IZ2XX/IZ2XX_11.jpeg", + "IZ2XX/IZ2XX_12.jpeg", + "IZ2XX/IZ2XX_13.jpeg", + "IZ2XX/IZ2XX_14.jpeg", + "IZ2XX/IZ2XX_15.jpeg", + "IZ2XX/IZ2XX_16.jpeg", + "IZ2XX/IZ2XX_17.jpeg", + "IZ2XX/IZ2XX_18.jpeg", + "IZ2XX/IZ2XX_19.jpeg", + "IZ2XX/IZ2XX_20.jpeg", + "IZ2XX/IZ2XX_21.jpeg", + "IZ2XX/IZ2XX_22.jpeg", + "IZ2XX/IZ2XX_23.jpeg", + "IZ2XX/IZ2XX_24.jpeg", + "IZ2XX/IZ2XX_25.jpeg", + "IZ2XX/IZ2XX_26.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 90, + "question": "Analyze the series of events in the provided pictures and answer the related inquiry. You must choose your answer from the Choice List.\nWhat happened after the person threw the clothes?\nChoice list: \nA. Washed the dish.\nB. Opened the box.\nC. Took the laptop.\nD. Opened the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "opened the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "washed the dish", + "opened the box", + "took the laptop", + "opened the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "JCNHL/JCNHL_0.jpeg", + "JCNHL/JCNHL_1.jpeg", + "JCNHL/JCNHL_2.jpeg", + "JCNHL/JCNHL_3.jpeg", + "JCNHL/JCNHL_4.jpeg", + "JCNHL/JCNHL_5.jpeg", + "JCNHL/JCNHL_6.jpeg", + "JCNHL/JCNHL_7.jpeg", + "JCNHL/JCNHL_8.jpeg", + "JCNHL/JCNHL_9.jpeg", + "JCNHL/JCNHL_10.jpeg", + "JCNHL/JCNHL_11.jpeg", + "JCNHL/JCNHL_12.jpeg", + "JCNHL/JCNHL_13.jpeg", + "JCNHL/JCNHL_14.jpeg", + "JCNHL/JCNHL_15.jpeg", + "JCNHL/JCNHL_16.jpeg", + "JCNHL/JCNHL_17.jpeg", + "JCNHL/JCNHL_18.jpeg", + "JCNHL/JCNHL_19.jpeg", + "JCNHL/JCNHL_20.jpeg", + "JCNHL/JCNHL_21.jpeg", + "JCNHL/JCNHL_22.jpeg", + "JCNHL/JCNHL_23.jpeg", + "JCNHL/JCNHL_24.jpeg", + "JCNHL/JCNHL_25.jpeg", + "JCNHL/JCNHL_26.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 104, + "question": "Review the timeline of actions in the supplied imagery and answer the associated inquiry. You must choose your answer from the Choice List.\nWhat happened before the person took the phone/camera?\nChoice list: \nA. Opened the window.\nB. Put down the clothes.\nC. Took the paper/notebook.\nD. Put down the shoe.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "opened the window", + "put down the clothes", + "took the paper notebook", + "put down the shoe" + ], + "image_quantity_level": "Medium", + "image": [ + "YFI1M/YFI1M_0.jpeg", + "YFI1M/YFI1M_1.jpeg", + "YFI1M/YFI1M_2.jpeg", + "YFI1M/YFI1M_3.jpeg", + "YFI1M/YFI1M_4.jpeg", + "YFI1M/YFI1M_5.jpeg", + "YFI1M/YFI1M_6.jpeg", + "YFI1M/YFI1M_7.jpeg", + "YFI1M/YFI1M_8.jpeg", + "YFI1M/YFI1M_9.jpeg", + "YFI1M/YFI1M_10.jpeg", + "YFI1M/YFI1M_11.jpeg", + "YFI1M/YFI1M_12.jpeg", + "YFI1M/YFI1M_13.jpeg", + "YFI1M/YFI1M_14.jpeg", + "YFI1M/YFI1M_15.jpeg", + "YFI1M/YFI1M_16.jpeg", + "YFI1M/YFI1M_17.jpeg", + "YFI1M/YFI1M_18.jpeg", + "YFI1M/YFI1M_19.jpeg", + "YFI1M/YFI1M_20.jpeg", + "YFI1M/YFI1M_21.jpeg", + "YFI1M/YFI1M_22.jpeg", + "YFI1M/YFI1M_23.jpeg", + "YFI1M/YFI1M_24.jpeg", + "YFI1M/YFI1M_25.jpeg", + "YFI1M/YFI1M_26.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 39, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person opened the door?\nChoice list: \nA. Tidied up the clothes.\nB. Put down the bag.\nC. Put down the food.\nD. Opened the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "tidied up the clothes", + "put down the bag", + "put down the food", + "opened the book" + ], + "image_quantity_level": "Many", + "image": [ + "56XKK/56XKK_0.jpeg", + "56XKK/56XKK_1.jpeg", + "56XKK/56XKK_2.jpeg", + "56XKK/56XKK_3.jpeg", + "56XKK/56XKK_4.jpeg", + "56XKK/56XKK_5.jpeg", + "56XKK/56XKK_6.jpeg", + "56XKK/56XKK_7.jpeg", + "56XKK/56XKK_8.jpeg", + "56XKK/56XKK_9.jpeg", + "56XKK/56XKK_10.jpeg", + "56XKK/56XKK_11.jpeg", + "56XKK/56XKK_12.jpeg", + "56XKK/56XKK_13.jpeg", + "56XKK/56XKK_14.jpeg", + "56XKK/56XKK_15.jpeg", + "56XKK/56XKK_16.jpeg", + "56XKK/56XKK_17.jpeg", + "56XKK/56XKK_18.jpeg", + "56XKK/56XKK_19.jpeg", + "56XKK/56XKK_20.jpeg", + "56XKK/56XKK_21.jpeg", + "56XKK/56XKK_22.jpeg", + "56XKK/56XKK_23.jpeg", + "56XKK/56XKK_24.jpeg", + "56XKK/56XKK_25.jpeg", + "56XKK/56XKK_26.jpeg", + "56XKK/56XKK_27.jpeg", + "56XKK/56XKK_28.jpeg", + "56XKK/56XKK_29.jpeg", + "56XKK/56XKK_30.jpeg", + "56XKK/56XKK_31.jpeg", + "56XKK/56XKK_32.jpeg", + "56XKK/56XKK_33.jpeg", + "56XKK/56XKK_34.jpeg", + "56XKK/56XKK_35.jpeg", + "56XKK/56XKK_36.jpeg", + "56XKK/56XKK_37.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 67, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened before the person held the sandwich?\nChoice list: \nA. Sat at the table.\nB. Put down the shoe.\nC. Opened the laptop.\nD. Ate the medicine.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sat at the table", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "sat at the table", + "put down the shoe", + "opened the laptop", + "ate the medicine" + ], + "image_quantity_level": "Medium", + "image": [ + "1ZBUS/1ZBUS_0.jpeg", + "1ZBUS/1ZBUS_1.jpeg", + "1ZBUS/1ZBUS_2.jpeg", + "1ZBUS/1ZBUS_3.jpeg", + "1ZBUS/1ZBUS_4.jpeg", + "1ZBUS/1ZBUS_5.jpeg", + "1ZBUS/1ZBUS_6.jpeg", + "1ZBUS/1ZBUS_7.jpeg", + "1ZBUS/1ZBUS_8.jpeg", + "1ZBUS/1ZBUS_9.jpeg", + "1ZBUS/1ZBUS_10.jpeg", + "1ZBUS/1ZBUS_11.jpeg", + "1ZBUS/1ZBUS_12.jpeg", + "1ZBUS/1ZBUS_13.jpeg", + "1ZBUS/1ZBUS_14.jpeg", + "1ZBUS/1ZBUS_15.jpeg", + "1ZBUS/1ZBUS_16.jpeg", + "1ZBUS/1ZBUS_17.jpeg", + "1ZBUS/1ZBUS_18.jpeg", + "1ZBUS/1ZBUS_19.jpeg", + "1ZBUS/1ZBUS_20.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 188, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened before the person held the food?\nChoice list: \nA. Put down the clothes.\nB. Opened the laptop.\nC. Put down the food.\nD. Took the dish.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "opened the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the clothes", + "opened the laptop", + "put down the food", + "took the dish" + ], + "image_quantity_level": "Medium", + "image": [ + "3Q6N1/3Q6N1_0.jpeg", + "3Q6N1/3Q6N1_1.jpeg", + "3Q6N1/3Q6N1_2.jpeg", + "3Q6N1/3Q6N1_3.jpeg", + "3Q6N1/3Q6N1_4.jpeg", + "3Q6N1/3Q6N1_5.jpeg", + "3Q6N1/3Q6N1_6.jpeg", + "3Q6N1/3Q6N1_7.jpeg", + "3Q6N1/3Q6N1_8.jpeg", + "3Q6N1/3Q6N1_9.jpeg", + "3Q6N1/3Q6N1_10.jpeg", + "3Q6N1/3Q6N1_11.jpeg", + "3Q6N1/3Q6N1_12.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 61, + "question": "Review the timeline of actions in the supplied imagery and answer the associated inquiry. You must choose your answer from the Choice List.\nWhat happened after the person watched at the picture?\nChoice list: \nA. Opened the laptop.\nB. Took the broom.\nC. Opened the book.\nD. Opened the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "opened the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "opened the laptop", + "took the broom", + "opened the book", + "opened the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "FX1T5/FX1T5_0.jpeg", + "FX1T5/FX1T5_1.jpeg", + "FX1T5/FX1T5_2.jpeg", + "FX1T5/FX1T5_3.jpeg", + "FX1T5/FX1T5_4.jpeg", + "FX1T5/FX1T5_5.jpeg", + "FX1T5/FX1T5_6.jpeg", + "FX1T5/FX1T5_7.jpeg", + "FX1T5/FX1T5_8.jpeg", + "FX1T5/FX1T5_9.jpeg", + "FX1T5/FX1T5_10.jpeg", + "FX1T5/FX1T5_11.jpeg", + "FX1T5/FX1T5_12.jpeg", + "FX1T5/FX1T5_13.jpeg", + "FX1T5/FX1T5_14.jpeg", + "FX1T5/FX1T5_15.jpeg", + "FX1T5/FX1T5_16.jpeg", + "FX1T5/FX1T5_17.jpeg", + "FX1T5/FX1T5_18.jpeg", + "FX1T5/FX1T5_19.jpeg", + "FX1T5/FX1T5_20.jpeg", + "FX1T5/FX1T5_21.jpeg", + "FX1T5/FX1T5_22.jpeg", + "FX1T5/FX1T5_23.jpeg", + "FX1T5/FX1T5_24.jpeg", + "FX1T5/FX1T5_25.jpeg", + "FX1T5/FX1T5_26.jpeg", + "FX1T5/FX1T5_27.jpeg", + "FX1T5/FX1T5_28.jpeg", + "FX1T5/FX1T5_29.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 69, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened before the person opened the door?\nChoice list: \nA. Put down the book.\nB. Threw the shoe.\nC. Took the book.\nD. Took the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "threw the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the book", + "threw the shoe", + "took the book", + "took the clothes" + ], + "image_quantity_level": "Medium", + "image": [ + "4GLAP/4GLAP_0.jpeg", + "4GLAP/4GLAP_1.jpeg", + "4GLAP/4GLAP_2.jpeg", + "4GLAP/4GLAP_3.jpeg", + "4GLAP/4GLAP_4.jpeg", + "4GLAP/4GLAP_5.jpeg", + "4GLAP/4GLAP_6.jpeg", + "4GLAP/4GLAP_7.jpeg", + "4GLAP/4GLAP_8.jpeg", + "4GLAP/4GLAP_9.jpeg", + "4GLAP/4GLAP_10.jpeg", + "4GLAP/4GLAP_11.jpeg", + "4GLAP/4GLAP_12.jpeg", + "4GLAP/4GLAP_13.jpeg", + "4GLAP/4GLAP_14.jpeg", + "4GLAP/4GLAP_15.jpeg", + "4GLAP/4GLAP_16.jpeg", + "4GLAP/4GLAP_17.jpeg", + "4GLAP/4GLAP_18.jpeg", + "4GLAP/4GLAP_19.jpeg", + "4GLAP/4GLAP_20.jpeg", + "4GLAP/4GLAP_21.jpeg", + "4GLAP/4GLAP_22.jpeg", + "4GLAP/4GLAP_23.jpeg", + "4GLAP/4GLAP_24.jpeg", + "4GLAP/4GLAP_25.jpeg", + "4GLAP/4GLAP_26.jpeg", + "4GLAP/4GLAP_27.jpeg", + "4GLAP/4GLAP_28.jpeg", + "4GLAP/4GLAP_29.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 72, + "question": "Inspect the flow of events in the presented pictures and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened after the person lied on the bed?\nChoice list: \nA. Tidied up the clothes.\nB. Took the food.\nC. Opened the book.\nD. Put down the laptop.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the food", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "tidied up the clothes", + "took the food", + "opened the book", + "put down the laptop" + ], + "image_quantity_level": "Medium", + "image": [ + "8W31Y/8W31Y_0.jpeg", + "8W31Y/8W31Y_1.jpeg", + "8W31Y/8W31Y_2.jpeg", + "8W31Y/8W31Y_3.jpeg", + "8W31Y/8W31Y_4.jpeg", + "8W31Y/8W31Y_5.jpeg", + "8W31Y/8W31Y_6.jpeg", + "8W31Y/8W31Y_7.jpeg", + "8W31Y/8W31Y_8.jpeg", + "8W31Y/8W31Y_9.jpeg", + "8W31Y/8W31Y_10.jpeg", + "8W31Y/8W31Y_11.jpeg", + "8W31Y/8W31Y_12.jpeg", + "8W31Y/8W31Y_13.jpeg", + "8W31Y/8W31Y_14.jpeg", + "8W31Y/8W31Y_15.jpeg", + "8W31Y/8W31Y_16.jpeg", + "8W31Y/8W31Y_17.jpeg", + "8W31Y/8W31Y_18.jpeg", + "8W31Y/8W31Y_19.jpeg", + "8W31Y/8W31Y_20.jpeg", + "8W31Y/8W31Y_21.jpeg", + "8W31Y/8W31Y_22.jpeg", + "8W31Y/8W31Y_23.jpeg", + "8W31Y/8W31Y_24.jpeg", + "8W31Y/8W31Y_25.jpeg", + "8W31Y/8W31Y_26.jpeg", + "8W31Y/8W31Y_27.jpeg", + "8W31Y/8W31Y_28.jpeg", + "8W31Y/8W31Y_29.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 89, + "question": "Based on the provided images, answer the question related to the sequence of action You must choose your answer from the Choice List.\nWhat happened after the person held the sandwich?\nChoice list: \nA. Took the food.\nB. Put down the food.\nC. Threw the towel.\nD. Tidied up the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "tidied up the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "took the food", + "put down the food", + "threw the towel", + "tidied up the clothes" + ], + "image_quantity_level": "Medium", + "image": [ + "IBX56/IBX56_0.jpeg", + "IBX56/IBX56_1.jpeg", + "IBX56/IBX56_2.jpeg", + "IBX56/IBX56_3.jpeg", + "IBX56/IBX56_4.jpeg", + "IBX56/IBX56_5.jpeg", + "IBX56/IBX56_6.jpeg", + "IBX56/IBX56_7.jpeg", + "IBX56/IBX56_8.jpeg", + "IBX56/IBX56_9.jpeg", + "IBX56/IBX56_10.jpeg", + "IBX56/IBX56_11.jpeg", + "IBX56/IBX56_12.jpeg", + "IBX56/IBX56_13.jpeg", + "IBX56/IBX56_14.jpeg", + "IBX56/IBX56_15.jpeg", + "IBX56/IBX56_16.jpeg", + "IBX56/IBX56_17.jpeg", + "IBX56/IBX56_18.jpeg", + "IBX56/IBX56_19.jpeg", + "IBX56/IBX56_20.jpeg", + "IBX56/IBX56_21.jpeg", + "IBX56/IBX56_22.jpeg", + "IBX56/IBX56_23.jpeg", + "IBX56/IBX56_24.jpeg", + "IBX56/IBX56_25.jpeg", + "IBX56/IBX56_26.jpeg", + "IBX56/IBX56_27.jpeg", + "IBX56/IBX56_28.jpeg", + "IBX56/IBX56_29.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 92, + "question": "Evaluate the sequence of actions depicted in the images and provide an answer to the associated question. You must choose your answer from the Choice List.\nWhat happened before the person held the food?\nChoice list: \nA. Opened the bag.\nB. Closed the closet/cabinet.\nC. Opened the laptop.\nD. Ate the sandwich.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "closed the closet cabinet", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "opened the bag", + "closed the closet cabinet", + "opened the laptop", + "ate the sandwich" + ], + "image_quantity_level": "Medium", + "image": [ + "XFRYR/XFRYR_0.jpeg", + "XFRYR/XFRYR_1.jpeg", + "XFRYR/XFRYR_2.jpeg", + "XFRYR/XFRYR_3.jpeg", + "XFRYR/XFRYR_4.jpeg", + "XFRYR/XFRYR_5.jpeg", + "XFRYR/XFRYR_6.jpeg", + "XFRYR/XFRYR_7.jpeg", + "XFRYR/XFRYR_8.jpeg", + "XFRYR/XFRYR_9.jpeg", + "XFRYR/XFRYR_10.jpeg", + "XFRYR/XFRYR_11.jpeg", + "XFRYR/XFRYR_12.jpeg", + "XFRYR/XFRYR_13.jpeg", + "XFRYR/XFRYR_14.jpeg", + "XFRYR/XFRYR_15.jpeg", + "XFRYR/XFRYR_16.jpeg", + "XFRYR/XFRYR_17.jpeg", + "XFRYR/XFRYR_18.jpeg", + "XFRYR/XFRYR_19.jpeg", + "XFRYR/XFRYR_20.jpeg", + "XFRYR/XFRYR_21.jpeg", + "XFRYR/XFRYR_22.jpeg", + "XFRYR/XFRYR_23.jpeg", + "XFRYR/XFRYR_24.jpeg", + "XFRYR/XFRYR_25.jpeg", + "XFRYR/XFRYR_26.jpeg", + "XFRYR/XFRYR_27.jpeg", + "XFRYR/XFRYR_28.jpeg", + "XFRYR/XFRYR_29.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 93, + "question": "Assess the progression of actions in the presented graphics and answer the connected query. You must choose your answer from the Choice List.\nWhat happened before the person threw the clothes?\nChoice list: \nA. Closed the closet/cabinet.\nB. Put down the towel.\nC. Put down the paper/notebook.\nD. Ate the sandwich.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "closed the closet cabinet", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "closed the closet cabinet", + "put down the towel", + "put down the paper notebook", + "ate the sandwich" + ], + "image_quantity_level": "Medium", + "image": [ + "3CLVI/3CLVI_0.jpeg", + "3CLVI/3CLVI_1.jpeg", + "3CLVI/3CLVI_2.jpeg", + "3CLVI/3CLVI_3.jpeg", + "3CLVI/3CLVI_4.jpeg", + "3CLVI/3CLVI_5.jpeg", + "3CLVI/3CLVI_6.jpeg", + "3CLVI/3CLVI_7.jpeg", + "3CLVI/3CLVI_8.jpeg", + "3CLVI/3CLVI_9.jpeg", + "3CLVI/3CLVI_10.jpeg", + "3CLVI/3CLVI_11.jpeg", + "3CLVI/3CLVI_12.jpeg", + "3CLVI/3CLVI_13.jpeg", + "3CLVI/3CLVI_14.jpeg", + "3CLVI/3CLVI_15.jpeg", + "3CLVI/3CLVI_16.jpeg", + "3CLVI/3CLVI_17.jpeg", + "3CLVI/3CLVI_18.jpeg", + "3CLVI/3CLVI_19.jpeg", + "3CLVI/3CLVI_20.jpeg", + "3CLVI/3CLVI_21.jpeg", + "3CLVI/3CLVI_22.jpeg", + "3CLVI/3CLVI_23.jpeg", + "3CLVI/3CLVI_24.jpeg", + "3CLVI/3CLVI_25.jpeg", + "3CLVI/3CLVI_26.jpeg", + "3CLVI/3CLVI_27.jpeg", + "3CLVI/3CLVI_28.jpeg", + "3CLVI/3CLVI_29.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 96, + "question": "Inspect the flow of events in the presented pictures and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person opened the book?\nChoice list: \nA. Put down the towel.\nB. Took the paper/notebook.\nC. Took the bag.\nD. Tidied up the table.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the towel", + "took the paper notebook", + "took the bag", + "tidied up the table" + ], + "image_quantity_level": "Medium", + "image": [ + "FETME/FETME_0.jpeg", + "FETME/FETME_1.jpeg", + "FETME/FETME_2.jpeg", + "FETME/FETME_3.jpeg", + "FETME/FETME_4.jpeg", + "FETME/FETME_5.jpeg", + "FETME/FETME_6.jpeg", + "FETME/FETME_7.jpeg", + "FETME/FETME_8.jpeg", + "FETME/FETME_9.jpeg", + "FETME/FETME_10.jpeg", + "FETME/FETME_11.jpeg", + "FETME/FETME_12.jpeg", + "FETME/FETME_13.jpeg", + "FETME/FETME_14.jpeg", + "FETME/FETME_15.jpeg", + "FETME/FETME_16.jpeg", + "FETME/FETME_17.jpeg", + "FETME/FETME_18.jpeg", + "FETME/FETME_19.jpeg", + "FETME/FETME_20.jpeg", + "FETME/FETME_21.jpeg", + "FETME/FETME_22.jpeg", + "FETME/FETME_23.jpeg", + "FETME/FETME_24.jpeg", + "FETME/FETME_25.jpeg", + "FETME/FETME_26.jpeg", + "FETME/FETME_27.jpeg", + "FETME/FETME_28.jpeg", + "FETME/FETME_29.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 102, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened after the person put down the food?\nChoice list: \nA. Opened the closet/cabinet.\nB. Closed the book.\nC. Put down the cup/glass/bottle.\nD. Put down the phone/camera.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "closed the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "opened the closet cabinet", + "closed the book", + "put down the cup glass bottle", + "put down the phone camera" + ], + "image_quantity_level": "Medium", + "image": [ + "2KGV3/2KGV3_0.jpeg", + "2KGV3/2KGV3_1.jpeg", + "2KGV3/2KGV3_2.jpeg", + "2KGV3/2KGV3_3.jpeg", + "2KGV3/2KGV3_4.jpeg", + "2KGV3/2KGV3_5.jpeg", + "2KGV3/2KGV3_6.jpeg", + "2KGV3/2KGV3_7.jpeg", + "2KGV3/2KGV3_8.jpeg", + "2KGV3/2KGV3_9.jpeg", + "2KGV3/2KGV3_10.jpeg", + "2KGV3/2KGV3_11.jpeg", + "2KGV3/2KGV3_12.jpeg", + "2KGV3/2KGV3_13.jpeg", + "2KGV3/2KGV3_14.jpeg", + "2KGV3/2KGV3_15.jpeg", + "2KGV3/2KGV3_16.jpeg", + "2KGV3/2KGV3_17.jpeg", + "2KGV3/2KGV3_18.jpeg", + "2KGV3/2KGV3_19.jpeg", + "2KGV3/2KGV3_20.jpeg", + "2KGV3/2KGV3_21.jpeg", + "2KGV3/2KGV3_22.jpeg", + "2KGV3/2KGV3_23.jpeg", + "2KGV3/2KGV3_24.jpeg", + "2KGV3/2KGV3_25.jpeg", + "2KGV3/2KGV3_26.jpeg", + "2KGV3/2KGV3_27.jpeg", + "2KGV3/2KGV3_28.jpeg", + "2KGV3/2KGV3_29.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 103, + "question": "Observe the continuum of events in the provided pictorials and reply to the pertinent question. You must choose your answer from the Choice List.\nWhat happened after the person washed the clothes?\nChoice list: \nA. Sat on the floor.\nB. Took the laptop.\nC. Put down the towel.\nD. Put down the laptop.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sat on the floor", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "sat on the floor", + "took the laptop", + "put down the towel", + "put down the laptop" + ], + "image_quantity_level": "Medium", + "image": [ + "Z6HEA/Z6HEA_0.jpeg", + "Z6HEA/Z6HEA_1.jpeg", + "Z6HEA/Z6HEA_2.jpeg", + "Z6HEA/Z6HEA_3.jpeg", + "Z6HEA/Z6HEA_4.jpeg", + "Z6HEA/Z6HEA_5.jpeg", + "Z6HEA/Z6HEA_6.jpeg", + "Z6HEA/Z6HEA_7.jpeg", + "Z6HEA/Z6HEA_8.jpeg", + "Z6HEA/Z6HEA_9.jpeg", + "Z6HEA/Z6HEA_10.jpeg", + "Z6HEA/Z6HEA_11.jpeg", + "Z6HEA/Z6HEA_12.jpeg", + "Z6HEA/Z6HEA_13.jpeg", + "Z6HEA/Z6HEA_14.jpeg", + "Z6HEA/Z6HEA_15.jpeg", + "Z6HEA/Z6HEA_16.jpeg", + "Z6HEA/Z6HEA_17.jpeg", + "Z6HEA/Z6HEA_18.jpeg", + "Z6HEA/Z6HEA_19.jpeg", + "Z6HEA/Z6HEA_20.jpeg", + "Z6HEA/Z6HEA_21.jpeg", + "Z6HEA/Z6HEA_22.jpeg", + "Z6HEA/Z6HEA_23.jpeg", + "Z6HEA/Z6HEA_24.jpeg", + "Z6HEA/Z6HEA_25.jpeg", + "Z6HEA/Z6HEA_26.jpeg", + "Z6HEA/Z6HEA_27.jpeg", + "Z6HEA/Z6HEA_28.jpeg", + "Z6HEA/Z6HEA_29.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 76, + "question": "Assess the progression of actions in the presented graphics and answer the connected query. You must choose your answer from the Choice List.\nWhat happened before the person closed the laptop?\nChoice list: \nA. Took the bag.\nB. Took the sandwich.\nC. Took the food.\nD. Put down the broom.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the broom", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "took the bag", + "took the sandwich", + "took the food", + "put down the broom" + ], + "image_quantity_level": "Many", + "image": [ + "7MRKY/7MRKY_0.jpeg", + "7MRKY/7MRKY_1.jpeg", + "7MRKY/7MRKY_2.jpeg", + "7MRKY/7MRKY_3.jpeg", + "7MRKY/7MRKY_4.jpeg", + "7MRKY/7MRKY_5.jpeg", + "7MRKY/7MRKY_6.jpeg", + "7MRKY/7MRKY_7.jpeg", + "7MRKY/7MRKY_8.jpeg", + "7MRKY/7MRKY_9.jpeg", + "7MRKY/7MRKY_10.jpeg", + "7MRKY/7MRKY_11.jpeg", + "7MRKY/7MRKY_12.jpeg", + "7MRKY/7MRKY_13.jpeg", + "7MRKY/7MRKY_14.jpeg", + "7MRKY/7MRKY_15.jpeg", + "7MRKY/7MRKY_16.jpeg", + "7MRKY/7MRKY_17.jpeg", + "7MRKY/7MRKY_18.jpeg", + "7MRKY/7MRKY_19.jpeg", + "7MRKY/7MRKY_20.jpeg", + "7MRKY/7MRKY_21.jpeg", + "7MRKY/7MRKY_22.jpeg", + "7MRKY/7MRKY_23.jpeg", + "7MRKY/7MRKY_24.jpeg", + "7MRKY/7MRKY_25.jpeg", + "7MRKY/7MRKY_26.jpeg", + "7MRKY/7MRKY_27.jpeg", + "7MRKY/7MRKY_28.jpeg", + "7MRKY/7MRKY_29.jpeg", + "7MRKY/7MRKY_30.jpeg", + "7MRKY/7MRKY_31.jpeg", + "7MRKY/7MRKY_32.jpeg", + "7MRKY/7MRKY_33.jpeg", + "7MRKY/7MRKY_34.jpeg", + "7MRKY/7MRKY_35.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 27, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened before the person opened the closet/cabinet?\nChoice list: \nA. Put down the dish.\nB. Closed the box.\nC. Closed the book.\nD. Threw the shoe.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the dish", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the dish", + "closed the box", + "closed the book", + "threw the shoe" + ], + "image_quantity_level": "Medium", + "image": [ + "2PREF/2PREF_0.jpeg", + "2PREF/2PREF_1.jpeg", + "2PREF/2PREF_2.jpeg", + "2PREF/2PREF_3.jpeg", + "2PREF/2PREF_4.jpeg", + "2PREF/2PREF_5.jpeg", + "2PREF/2PREF_6.jpeg", + "2PREF/2PREF_7.jpeg", + "2PREF/2PREF_8.jpeg", + "2PREF/2PREF_9.jpeg", + "2PREF/2PREF_10.jpeg", + "2PREF/2PREF_11.jpeg", + "2PREF/2PREF_12.jpeg", + "2PREF/2PREF_13.jpeg", + "2PREF/2PREF_14.jpeg", + "2PREF/2PREF_15.jpeg", + "2PREF/2PREF_16.jpeg", + "2PREF/2PREF_17.jpeg", + "2PREF/2PREF_18.jpeg", + "2PREF/2PREF_19.jpeg", + "2PREF/2PREF_20.jpeg", + "2PREF/2PREF_21.jpeg", + "2PREF/2PREF_22.jpeg", + "2PREF/2PREF_23.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 43, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened after the person closed the door?\nChoice list: \nA. Put down the broom.\nB. Put down the laptop.\nC. Threw the shoe.\nD. Took the picture.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "threw the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the broom", + "put down the laptop", + "threw the shoe", + "took the picture" + ], + "image_quantity_level": "Many", + "image": [ + "136V6/136V6_0.jpeg", + "136V6/136V6_1.jpeg", + "136V6/136V6_2.jpeg", + "136V6/136V6_3.jpeg", + "136V6/136V6_4.jpeg", + "136V6/136V6_5.jpeg", + "136V6/136V6_6.jpeg", + "136V6/136V6_7.jpeg", + "136V6/136V6_8.jpeg", + "136V6/136V6_9.jpeg", + "136V6/136V6_10.jpeg", + "136V6/136V6_11.jpeg", + "136V6/136V6_12.jpeg", + "136V6/136V6_13.jpeg", + "136V6/136V6_14.jpeg", + "136V6/136V6_15.jpeg", + "136V6/136V6_16.jpeg", + "136V6/136V6_17.jpeg", + "136V6/136V6_18.jpeg", + "136V6/136V6_19.jpeg", + "136V6/136V6_20.jpeg", + "136V6/136V6_21.jpeg", + "136V6/136V6_22.jpeg", + "136V6/136V6_23.jpeg", + "136V6/136V6_24.jpeg", + "136V6/136V6_25.jpeg", + "136V6/136V6_26.jpeg", + "136V6/136V6_27.jpeg", + "136V6/136V6_28.jpeg", + "136V6/136V6_29.jpeg", + "136V6/136V6_30.jpeg", + "136V6/136V6_31.jpeg", + "136V6/136V6_32.jpeg", + "136V6/136V6_33.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 77, + "question": "Observe the continuum of events in the provided pictorials and reply to the pertinent question. You must choose your answer from the Choice List.\nWhat happened after the person closed the door?\nChoice list: \nA. Put down the broom.\nB. Put down the laptop.\nC. Threw the shoe.\nD. Took the picture.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "threw the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the broom", + "put down the laptop", + "threw the shoe", + "took the picture" + ], + "image_quantity_level": "Many", + "image": [ + "136V6/136V6_0.jpeg", + "136V6/136V6_1.jpeg", + "136V6/136V6_2.jpeg", + "136V6/136V6_3.jpeg", + "136V6/136V6_4.jpeg", + "136V6/136V6_5.jpeg", + "136V6/136V6_6.jpeg", + "136V6/136V6_7.jpeg", + "136V6/136V6_8.jpeg", + "136V6/136V6_9.jpeg", + "136V6/136V6_10.jpeg", + "136V6/136V6_11.jpeg", + "136V6/136V6_12.jpeg", + "136V6/136V6_13.jpeg", + "136V6/136V6_14.jpeg", + "136V6/136V6_15.jpeg", + "136V6/136V6_16.jpeg", + "136V6/136V6_17.jpeg", + "136V6/136V6_18.jpeg", + "136V6/136V6_19.jpeg", + "136V6/136V6_20.jpeg", + "136V6/136V6_21.jpeg", + "136V6/136V6_22.jpeg", + "136V6/136V6_23.jpeg", + "136V6/136V6_24.jpeg", + "136V6/136V6_25.jpeg", + "136V6/136V6_26.jpeg", + "136V6/136V6_27.jpeg", + "136V6/136V6_28.jpeg", + "136V6/136V6_29.jpeg", + "136V6/136V6_30.jpeg", + "136V6/136V6_31.jpeg", + "136V6/136V6_32.jpeg", + "136V6/136V6_33.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 65, + "question": "Inspect the flow of events in the presented pictures and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person took the towel?\nChoice list: \nA. Opened the bag.\nB. Put down the phone/camera.\nC. Threw the clothes.\nD. Opened the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "threw the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "opened the bag", + "put down the phone camera", + "threw the clothes", + "opened the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "0OE6M/0OE6M_0.jpeg", + "0OE6M/0OE6M_1.jpeg", + "0OE6M/0OE6M_2.jpeg", + "0OE6M/0OE6M_3.jpeg", + "0OE6M/0OE6M_4.jpeg", + "0OE6M/0OE6M_5.jpeg", + "0OE6M/0OE6M_6.jpeg", + "0OE6M/0OE6M_7.jpeg", + "0OE6M/0OE6M_8.jpeg", + "0OE6M/0OE6M_9.jpeg", + "0OE6M/0OE6M_10.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 157, + "question": "Review the timeline of actions in the supplied imagery and answer the associated inquiry. You must choose your answer from the Choice List.\nWhat happened before the person closed the window?\nChoice list: \nA. Put down the food.\nB. Washed the window.\nC. Sat on the sofa/couch.\nD. Opened the door.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sat on the sofa couch", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down the food", + "washed the window", + "sat on the sofa couch", + "opened the door" + ], + "image_quantity_level": "Medium", + "image": [ + "EDVAM/EDVAM_0.jpeg", + "EDVAM/EDVAM_1.jpeg", + "EDVAM/EDVAM_2.jpeg", + "EDVAM/EDVAM_3.jpeg", + "EDVAM/EDVAM_4.jpeg", + "EDVAM/EDVAM_5.jpeg", + "EDVAM/EDVAM_6.jpeg", + "EDVAM/EDVAM_7.jpeg", + "EDVAM/EDVAM_8.jpeg", + "EDVAM/EDVAM_9.jpeg", + "EDVAM/EDVAM_10.jpeg", + "EDVAM/EDVAM_11.jpeg", + "EDVAM/EDVAM_12.jpeg", + "EDVAM/EDVAM_13.jpeg", + "EDVAM/EDVAM_14.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 86, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened after the person opened the closet/cabinet?\nChoice list: \nA. Put down the towel.\nB. Threw the shoe.\nC. Closed the door.\nD. Lied on the bed.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "threw the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down the towel", + "threw the shoe", + "closed the door", + "lied on the bed" + ], + "image_quantity_level": "Medium", + "image": [ + "CH8XJ/CH8XJ_0.jpeg", + "CH8XJ/CH8XJ_1.jpeg", + "CH8XJ/CH8XJ_2.jpeg", + "CH8XJ/CH8XJ_3.jpeg", + "CH8XJ/CH8XJ_4.jpeg", + "CH8XJ/CH8XJ_5.jpeg", + "CH8XJ/CH8XJ_6.jpeg", + "CH8XJ/CH8XJ_7.jpeg", + "CH8XJ/CH8XJ_8.jpeg", + "CH8XJ/CH8XJ_9.jpeg", + "CH8XJ/CH8XJ_10.jpeg", + "CH8XJ/CH8XJ_11.jpeg", + "CH8XJ/CH8XJ_12.jpeg", + "CH8XJ/CH8XJ_13.jpeg", + "CH8XJ/CH8XJ_14.jpeg", + "CH8XJ/CH8XJ_15.jpeg", + "CH8XJ/CH8XJ_16.jpeg", + "CH8XJ/CH8XJ_17.jpeg", + "CH8XJ/CH8XJ_18.jpeg", + "CH8XJ/CH8XJ_19.jpeg", + "CH8XJ/CH8XJ_20.jpeg", + "CH8XJ/CH8XJ_21.jpeg", + "CH8XJ/CH8XJ_22.jpeg", + "CH8XJ/CH8XJ_23.jpeg", + "CH8XJ/CH8XJ_24.jpeg", + "CH8XJ/CH8XJ_25.jpeg", + "CH8XJ/CH8XJ_26.jpeg", + "CH8XJ/CH8XJ_27.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 148, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened before the person held the food?\nChoice list: \nA. Took the book.\nB. Closed the refrigerator.\nC. Opened the door.\nD. Put down the sandwich.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "closed the refrigerator", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "took the book", + "closed the refrigerator", + "opened the door", + "put down the sandwich" + ], + "image_quantity_level": "Medium", + "image": [ + "ZFQNZ/ZFQNZ_0.jpeg", + "ZFQNZ/ZFQNZ_1.jpeg", + "ZFQNZ/ZFQNZ_2.jpeg", + "ZFQNZ/ZFQNZ_3.jpeg", + "ZFQNZ/ZFQNZ_4.jpeg", + "ZFQNZ/ZFQNZ_5.jpeg", + "ZFQNZ/ZFQNZ_6.jpeg", + "ZFQNZ/ZFQNZ_7.jpeg", + "ZFQNZ/ZFQNZ_8.jpeg", + "ZFQNZ/ZFQNZ_9.jpeg", + "ZFQNZ/ZFQNZ_10.jpeg", + "ZFQNZ/ZFQNZ_11.jpeg", + "ZFQNZ/ZFQNZ_12.jpeg", + "ZFQNZ/ZFQNZ_13.jpeg", + "ZFQNZ/ZFQNZ_14.jpeg", + "ZFQNZ/ZFQNZ_15.jpeg", + "ZFQNZ/ZFQNZ_16.jpeg", + "ZFQNZ/ZFQNZ_17.jpeg", + "ZFQNZ/ZFQNZ_18.jpeg", + "ZFQNZ/ZFQNZ_19.jpeg", + "ZFQNZ/ZFQNZ_20.jpeg", + "ZFQNZ/ZFQNZ_21.jpeg", + "ZFQNZ/ZFQNZ_22.jpeg", + "ZFQNZ/ZFQNZ_23.jpeg", + "ZFQNZ/ZFQNZ_24.jpeg", + "ZFQNZ/ZFQNZ_25.jpeg", + "ZFQNZ/ZFQNZ_26.jpeg", + "ZFQNZ/ZFQNZ_27.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 80, + "question": "Evaluate the sequence of actions depicted in the images and provide an answer to the associated question. You must choose your answer from the Choice List.\nWhat happened after the person held the medicine?\nChoice list: \nA. Opened the refrigerator.\nB. Ate the sandwich.\nC. Opened the box.\nD. Sat on the sofa/couch.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "ate the sandwich", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "opened the refrigerator", + "ate the sandwich", + "opened the box", + "sat on the sofa couch" + ], + "image_quantity_level": "Medium", + "image": [ + "O7OD2/O7OD2_0.jpeg", + "O7OD2/O7OD2_1.jpeg", + "O7OD2/O7OD2_2.jpeg", + "O7OD2/O7OD2_3.jpeg", + "O7OD2/O7OD2_4.jpeg", + "O7OD2/O7OD2_5.jpeg", + "O7OD2/O7OD2_6.jpeg", + "O7OD2/O7OD2_7.jpeg", + "O7OD2/O7OD2_8.jpeg", + "O7OD2/O7OD2_9.jpeg", + "O7OD2/O7OD2_10.jpeg", + "O7OD2/O7OD2_11.jpeg", + "O7OD2/O7OD2_12.jpeg", + "O7OD2/O7OD2_13.jpeg", + "O7OD2/O7OD2_14.jpeg", + "O7OD2/O7OD2_15.jpeg", + "O7OD2/O7OD2_16.jpeg", + "O7OD2/O7OD2_17.jpeg", + "O7OD2/O7OD2_18.jpeg", + "O7OD2/O7OD2_19.jpeg", + "O7OD2/O7OD2_20.jpeg", + "O7OD2/O7OD2_21.jpeg", + "O7OD2/O7OD2_22.jpeg", + "O7OD2/O7OD2_23.jpeg", + "O7OD2/O7OD2_24.jpeg", + "O7OD2/O7OD2_25.jpeg", + "O7OD2/O7OD2_26.jpeg", + "O7OD2/O7OD2_27.jpeg", + "O7OD2/O7OD2_28.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 81, + "question": "Review the timeline of actions in the supplied imagery and answer the associated inquiry. You must choose your answer from the Choice List.\nWhat happened after the person took the phone/camera?\nChoice list: \nA. Threw the bag.\nB. Opened the bag.\nC. Put down the food.\nD. Put down the shoe.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "threw the bag", + "opened the bag", + "put down the food", + "put down the shoe" + ], + "image_quantity_level": "Medium", + "image": [ + "7TA23/7TA23_0.jpeg", + "7TA23/7TA23_1.jpeg", + "7TA23/7TA23_2.jpeg", + "7TA23/7TA23_3.jpeg", + "7TA23/7TA23_4.jpeg", + "7TA23/7TA23_5.jpeg", + "7TA23/7TA23_6.jpeg", + "7TA23/7TA23_7.jpeg", + "7TA23/7TA23_8.jpeg", + "7TA23/7TA23_9.jpeg", + "7TA23/7TA23_10.jpeg", + "7TA23/7TA23_11.jpeg", + "7TA23/7TA23_12.jpeg", + "7TA23/7TA23_13.jpeg", + "7TA23/7TA23_14.jpeg", + "7TA23/7TA23_15.jpeg", + "7TA23/7TA23_16.jpeg", + "7TA23/7TA23_17.jpeg", + "7TA23/7TA23_18.jpeg", + "7TA23/7TA23_19.jpeg", + "7TA23/7TA23_20.jpeg", + "7TA23/7TA23_21.jpeg", + "7TA23/7TA23_22.jpeg", + "7TA23/7TA23_23.jpeg", + "7TA23/7TA23_24.jpeg", + "7TA23/7TA23_25.jpeg", + "7TA23/7TA23_26.jpeg", + "7TA23/7TA23_27.jpeg", + "7TA23/7TA23_28.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 112, + "question": "Based on the provided images, answer the question related to the sequence of action You must choose your answer from the Choice List.\nWhat happened before the person opened the door?\nChoice list: \nA. Put down the bag.\nB. Closed the book.\nC. Took the shoe.\nD. Washed the table.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the bag", + "closed the book", + "took the shoe", + "washed the table" + ], + "image_quantity_level": "Medium", + "image": [ + "8BBX0/8BBX0_0.jpeg", + "8BBX0/8BBX0_1.jpeg", + "8BBX0/8BBX0_2.jpeg", + "8BBX0/8BBX0_3.jpeg", + "8BBX0/8BBX0_4.jpeg", + "8BBX0/8BBX0_5.jpeg", + "8BBX0/8BBX0_6.jpeg", + "8BBX0/8BBX0_7.jpeg", + "8BBX0/8BBX0_8.jpeg", + "8BBX0/8BBX0_9.jpeg", + "8BBX0/8BBX0_10.jpeg", + "8BBX0/8BBX0_11.jpeg", + "8BBX0/8BBX0_12.jpeg", + "8BBX0/8BBX0_13.jpeg", + "8BBX0/8BBX0_14.jpeg", + "8BBX0/8BBX0_15.jpeg", + "8BBX0/8BBX0_16.jpeg", + "8BBX0/8BBX0_17.jpeg", + "8BBX0/8BBX0_18.jpeg", + "8BBX0/8BBX0_19.jpeg", + "8BBX0/8BBX0_20.jpeg", + "8BBX0/8BBX0_21.jpeg", + "8BBX0/8BBX0_22.jpeg", + "8BBX0/8BBX0_23.jpeg", + "8BBX0/8BBX0_24.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 88, + "question": "Assess the progression of actions in the presented graphics and answer the connected query. You must choose your answer from the Choice List.\nWhat happened after the person sat on the sofa/couch?\nChoice list: \nA. Opened the box.\nB. Opened the door.\nC. Sat on the floor.\nD. Put down the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "opened the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "opened the box", + "opened the door", + "sat on the floor", + "put down the clothes" + ], + "image_quantity_level": "Medium", + "image": [ + "8XXNK/8XXNK_0.jpeg", + "8XXNK/8XXNK_1.jpeg", + "8XXNK/8XXNK_2.jpeg", + "8XXNK/8XXNK_3.jpeg", + "8XXNK/8XXNK_4.jpeg", + "8XXNK/8XXNK_5.jpeg", + "8XXNK/8XXNK_6.jpeg", + "8XXNK/8XXNK_7.jpeg", + "8XXNK/8XXNK_8.jpeg", + "8XXNK/8XXNK_9.jpeg", + "8XXNK/8XXNK_10.jpeg", + "8XXNK/8XXNK_11.jpeg", + "8XXNK/8XXNK_12.jpeg", + "8XXNK/8XXNK_13.jpeg", + "8XXNK/8XXNK_14.jpeg", + "8XXNK/8XXNK_15.jpeg", + "8XXNK/8XXNK_16.jpeg", + "8XXNK/8XXNK_17.jpeg", + "8XXNK/8XXNK_18.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 97, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened after the person closed the door?\nChoice list: \nA. Closed the box.\nB. Put down the book.\nC. Took the dish.\nD. Took the picture.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the dish", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "closed the box", + "put down the book", + "took the dish", + "took the picture" + ], + "image_quantity_level": "Many", + "image": [ + "I562C/I562C_0.jpeg", + "I562C/I562C_1.jpeg", + "I562C/I562C_2.jpeg", + "I562C/I562C_3.jpeg", + "I562C/I562C_4.jpeg", + "I562C/I562C_5.jpeg", + "I562C/I562C_6.jpeg", + "I562C/I562C_7.jpeg", + "I562C/I562C_8.jpeg", + "I562C/I562C_9.jpeg", + "I562C/I562C_10.jpeg", + "I562C/I562C_11.jpeg", + "I562C/I562C_12.jpeg", + "I562C/I562C_13.jpeg", + "I562C/I562C_14.jpeg", + "I562C/I562C_15.jpeg", + "I562C/I562C_16.jpeg", + "I562C/I562C_17.jpeg", + "I562C/I562C_18.jpeg", + "I562C/I562C_19.jpeg", + "I562C/I562C_20.jpeg", + "I562C/I562C_21.jpeg", + "I562C/I562C_22.jpeg", + "I562C/I562C_23.jpeg", + "I562C/I562C_24.jpeg", + "I562C/I562C_25.jpeg", + "I562C/I562C_26.jpeg", + "I562C/I562C_27.jpeg", + "I562C/I562C_28.jpeg", + "I562C/I562C_29.jpeg", + "I562C/I562C_30.jpeg", + "I562C/I562C_31.jpeg", + "I562C/I562C_32.jpeg", + "I562C/I562C_33.jpeg", + "I562C/I562C_34.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 125, + "question": "Assess the progression of actions in the presented graphics and answer the connected query. You must choose your answer from the Choice List.\nWhat happened after the person opened the bag?\nChoice list: \nA. Sat at the table.\nB. Tidied up the closet/cabinet.\nC. Took the laptop.\nD. Put down the cup/glass/bottle.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "sat at the table", + "tidied up the closet cabinet", + "took the laptop", + "put down the cup glass bottle" + ], + "image_quantity_level": "Many", + "image": [ + "T1CQE/T1CQE_0.jpeg", + "T1CQE/T1CQE_1.jpeg", + "T1CQE/T1CQE_2.jpeg", + "T1CQE/T1CQE_3.jpeg", + "T1CQE/T1CQE_4.jpeg", + "T1CQE/T1CQE_5.jpeg", + "T1CQE/T1CQE_6.jpeg", + "T1CQE/T1CQE_7.jpeg", + "T1CQE/T1CQE_8.jpeg", + "T1CQE/T1CQE_9.jpeg", + "T1CQE/T1CQE_10.jpeg", + "T1CQE/T1CQE_11.jpeg", + "T1CQE/T1CQE_12.jpeg", + "T1CQE/T1CQE_13.jpeg", + "T1CQE/T1CQE_14.jpeg", + "T1CQE/T1CQE_15.jpeg", + "T1CQE/T1CQE_16.jpeg", + "T1CQE/T1CQE_17.jpeg", + "T1CQE/T1CQE_18.jpeg", + "T1CQE/T1CQE_19.jpeg", + "T1CQE/T1CQE_20.jpeg", + "T1CQE/T1CQE_21.jpeg", + "T1CQE/T1CQE_22.jpeg", + "T1CQE/T1CQE_23.jpeg", + "T1CQE/T1CQE_24.jpeg", + "T1CQE/T1CQE_25.jpeg", + "T1CQE/T1CQE_26.jpeg", + "T1CQE/T1CQE_27.jpeg", + "T1CQE/T1CQE_28.jpeg", + "T1CQE/T1CQE_29.jpeg", + "T1CQE/T1CQE_30.jpeg", + "T1CQE/T1CQE_31.jpeg", + "T1CQE/T1CQE_32.jpeg", + "T1CQE/T1CQE_33.jpeg", + "T1CQE/T1CQE_34.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 180, + "question": "Review the timeline of actions in the supplied imagery and answer the associated inquiry. You must choose your answer from the Choice List.\nWhat happened after the person drank from the cup/glass/bottle?\nChoice list: \nA. Lied on the floor.\nB. Took the broom.\nC. Put down the dish.\nD. Opened the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the dish", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "lied on the floor", + "took the broom", + "put down the dish", + "opened the bag" + ], + "image_quantity_level": "Many", + "image": [ + "74DPG/74DPG_0.jpeg", + "74DPG/74DPG_1.jpeg", + "74DPG/74DPG_2.jpeg", + "74DPG/74DPG_3.jpeg", + "74DPG/74DPG_4.jpeg", + "74DPG/74DPG_5.jpeg", + "74DPG/74DPG_6.jpeg", + "74DPG/74DPG_7.jpeg", + "74DPG/74DPG_8.jpeg", + "74DPG/74DPG_9.jpeg", + "74DPG/74DPG_10.jpeg", + "74DPG/74DPG_11.jpeg", + "74DPG/74DPG_12.jpeg", + "74DPG/74DPG_13.jpeg", + "74DPG/74DPG_14.jpeg", + "74DPG/74DPG_15.jpeg", + "74DPG/74DPG_16.jpeg", + "74DPG/74DPG_17.jpeg", + "74DPG/74DPG_18.jpeg", + "74DPG/74DPG_19.jpeg", + "74DPG/74DPG_20.jpeg", + "74DPG/74DPG_21.jpeg", + "74DPG/74DPG_22.jpeg", + "74DPG/74DPG_23.jpeg", + "74DPG/74DPG_24.jpeg", + "74DPG/74DPG_25.jpeg", + "74DPG/74DPG_26.jpeg", + "74DPG/74DPG_27.jpeg", + "74DPG/74DPG_28.jpeg", + "74DPG/74DPG_29.jpeg", + "74DPG/74DPG_30.jpeg", + "74DPG/74DPG_31.jpeg", + "74DPG/74DPG_32.jpeg", + "74DPG/74DPG_33.jpeg", + "74DPG/74DPG_34.jpeg", + "74DPG/74DPG_35.jpeg", + "74DPG/74DPG_36.jpeg", + "74DPG/74DPG_37.jpeg", + "74DPG/74DPG_38.jpeg", + "74DPG/74DPG_39.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 94, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened after the person put down the blanket?\nChoice list: \nA. Took the pillow.\nB. Washed the cup/glass/bottle.\nC. Took the bag.\nD. Washed the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "washed the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "took the pillow", + "washed the cup glass bottle", + "took the bag", + "washed the clothes" + ], + "image_quantity_level": "Many", + "image": [ + "9BCZU/9BCZU_0.jpeg", + "9BCZU/9BCZU_1.jpeg", + "9BCZU/9BCZU_2.jpeg", + "9BCZU/9BCZU_3.jpeg", + "9BCZU/9BCZU_4.jpeg", + "9BCZU/9BCZU_5.jpeg", + "9BCZU/9BCZU_6.jpeg", + "9BCZU/9BCZU_7.jpeg", + "9BCZU/9BCZU_8.jpeg", + "9BCZU/9BCZU_9.jpeg", + "9BCZU/9BCZU_10.jpeg", + "9BCZU/9BCZU_11.jpeg", + "9BCZU/9BCZU_12.jpeg", + "9BCZU/9BCZU_13.jpeg", + "9BCZU/9BCZU_14.jpeg", + "9BCZU/9BCZU_15.jpeg", + "9BCZU/9BCZU_16.jpeg", + "9BCZU/9BCZU_17.jpeg", + "9BCZU/9BCZU_18.jpeg", + "9BCZU/9BCZU_19.jpeg", + "9BCZU/9BCZU_20.jpeg", + "9BCZU/9BCZU_21.jpeg", + "9BCZU/9BCZU_22.jpeg", + "9BCZU/9BCZU_23.jpeg", + "9BCZU/9BCZU_24.jpeg", + "9BCZU/9BCZU_25.jpeg", + "9BCZU/9BCZU_26.jpeg", + "9BCZU/9BCZU_27.jpeg", + "9BCZU/9BCZU_28.jpeg", + "9BCZU/9BCZU_29.jpeg", + "9BCZU/9BCZU_30.jpeg", + "9BCZU/9BCZU_31.jpeg", + "9BCZU/9BCZU_32.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 156, + "question": "Review the timeline of actions in the supplied imagery and answer the associated inquiry. You must choose your answer from the Choice List.\nWhat happened after the person opened the book?\nChoice list: \nA. Took the paper/notebook.\nB. Put down the food.\nC. Threw the clothes.\nD. Took the dish.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "took the paper notebook", + "put down the food", + "threw the clothes", + "took the dish" + ], + "image_quantity_level": "Many", + "image": [ + "NW0KT/NW0KT_0.jpeg", + "NW0KT/NW0KT_1.jpeg", + "NW0KT/NW0KT_2.jpeg", + "NW0KT/NW0KT_3.jpeg", + "NW0KT/NW0KT_4.jpeg", + "NW0KT/NW0KT_5.jpeg", + "NW0KT/NW0KT_6.jpeg", + "NW0KT/NW0KT_7.jpeg", + "NW0KT/NW0KT_8.jpeg", + "NW0KT/NW0KT_9.jpeg", + "NW0KT/NW0KT_10.jpeg", + "NW0KT/NW0KT_11.jpeg", + "NW0KT/NW0KT_12.jpeg", + "NW0KT/NW0KT_13.jpeg", + "NW0KT/NW0KT_14.jpeg", + "NW0KT/NW0KT_15.jpeg", + "NW0KT/NW0KT_16.jpeg", + "NW0KT/NW0KT_17.jpeg", + "NW0KT/NW0KT_18.jpeg", + "NW0KT/NW0KT_19.jpeg", + "NW0KT/NW0KT_20.jpeg", + "NW0KT/NW0KT_21.jpeg", + "NW0KT/NW0KT_22.jpeg", + "NW0KT/NW0KT_23.jpeg", + "NW0KT/NW0KT_24.jpeg", + "NW0KT/NW0KT_25.jpeg", + "NW0KT/NW0KT_26.jpeg", + "NW0KT/NW0KT_27.jpeg", + "NW0KT/NW0KT_28.jpeg", + "NW0KT/NW0KT_29.jpeg", + "NW0KT/NW0KT_30.jpeg", + "NW0KT/NW0KT_31.jpeg", + "NW0KT/NW0KT_32.jpeg", + "NW0KT/NW0KT_33.jpeg", + "NW0KT/NW0KT_34.jpeg", + "NW0KT/NW0KT_35.jpeg", + "NW0KT/NW0KT_36.jpeg", + "NW0KT/NW0KT_37.jpeg", + "NW0KT/NW0KT_38.jpeg", + "NW0KT/NW0KT_39.jpeg", + "NW0KT/NW0KT_40.jpeg", + "NW0KT/NW0KT_41.jpeg", + "NW0KT/NW0KT_42.jpeg", + "NW0KT/NW0KT_43.jpeg", + "NW0KT/NW0KT_44.jpeg", + "NW0KT/NW0KT_45.jpeg", + "NW0KT/NW0KT_46.jpeg", + "NW0KT/NW0KT_47.jpeg", + "NW0KT/NW0KT_48.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 146, + "question": "Observe the continuum of events in the provided pictorials and reply to the pertinent question. You must choose your answer from the Choice List.\nWhat happened before the person sat on the sofa/couch?\nChoice list: \nA. Took the box.\nB. Took the dish.\nC. Threw the pillow.\nD. Put down the shoe.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "took the box", + "took the dish", + "threw the pillow", + "put down the shoe" + ], + "image_quantity_level": "Medium", + "image": [ + "RNLTR/RNLTR_0.jpeg", + "RNLTR/RNLTR_1.jpeg", + "RNLTR/RNLTR_2.jpeg", + "RNLTR/RNLTR_3.jpeg", + "RNLTR/RNLTR_4.jpeg", + "RNLTR/RNLTR_5.jpeg", + "RNLTR/RNLTR_6.jpeg", + "RNLTR/RNLTR_7.jpeg", + "RNLTR/RNLTR_8.jpeg", + "RNLTR/RNLTR_9.jpeg", + "RNLTR/RNLTR_10.jpeg", + "RNLTR/RNLTR_11.jpeg", + "RNLTR/RNLTR_12.jpeg", + "RNLTR/RNLTR_13.jpeg", + "RNLTR/RNLTR_14.jpeg", + "RNLTR/RNLTR_15.jpeg", + "RNLTR/RNLTR_16.jpeg", + "RNLTR/RNLTR_17.jpeg", + "RNLTR/RNLTR_18.jpeg", + "RNLTR/RNLTR_19.jpeg", + "RNLTR/RNLTR_20.jpeg", + "RNLTR/RNLTR_21.jpeg", + "RNLTR/RNLTR_22.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 158, + "question": "Based on the provided images, answer the question related to the sequence of action You must choose your answer from the Choice List.\nWhat happened after the person put down the towel?\nChoice list: \nA. Opened the bag.\nB. Closed the box.\nC. Took the paper/notebook.\nD. Closed the refrigerator.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "closed the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "opened the bag", + "closed the box", + "took the paper notebook", + "closed the refrigerator" + ], + "image_quantity_level": "Medium", + "image": [ + "TOOYI/TOOYI_0.jpeg", + "TOOYI/TOOYI_1.jpeg", + "TOOYI/TOOYI_2.jpeg", + "TOOYI/TOOYI_3.jpeg", + "TOOYI/TOOYI_4.jpeg", + "TOOYI/TOOYI_5.jpeg", + "TOOYI/TOOYI_6.jpeg", + "TOOYI/TOOYI_7.jpeg", + "TOOYI/TOOYI_8.jpeg", + "TOOYI/TOOYI_9.jpeg", + "TOOYI/TOOYI_10.jpeg", + "TOOYI/TOOYI_11.jpeg", + "TOOYI/TOOYI_12.jpeg", + "TOOYI/TOOYI_13.jpeg", + "TOOYI/TOOYI_14.jpeg", + "TOOYI/TOOYI_15.jpeg", + "TOOYI/TOOYI_16.jpeg", + "TOOYI/TOOYI_17.jpeg", + "TOOYI/TOOYI_18.jpeg", + "TOOYI/TOOYI_19.jpeg", + "TOOYI/TOOYI_20.jpeg", + "TOOYI/TOOYI_21.jpeg", + "TOOYI/TOOYI_22.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 121, + "question": "Evaluate the sequence of actions depicted in the images and provide an answer to the associated question. You must choose your answer from the Choice List.\nWhat happened before the person drank from the cup/glass/bottle?\nChoice list: \nA. Ate the sandwich.\nB. Took the dish.\nC. Washed the cup/glass/bottle.\nD. Took the picture.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "ate the sandwich", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "ate the sandwich", + "took the dish", + "washed the cup glass bottle", + "took the picture" + ], + "image_quantity_level": "Many", + "image": [ + "1HAYV/1HAYV_0.jpeg", + "1HAYV/1HAYV_1.jpeg", + "1HAYV/1HAYV_2.jpeg", + "1HAYV/1HAYV_3.jpeg", + "1HAYV/1HAYV_4.jpeg", + "1HAYV/1HAYV_5.jpeg", + "1HAYV/1HAYV_6.jpeg", + "1HAYV/1HAYV_7.jpeg", + "1HAYV/1HAYV_8.jpeg", + "1HAYV/1HAYV_9.jpeg", + "1HAYV/1HAYV_10.jpeg", + "1HAYV/1HAYV_11.jpeg", + "1HAYV/1HAYV_12.jpeg", + "1HAYV/1HAYV_13.jpeg", + "1HAYV/1HAYV_14.jpeg", + "1HAYV/1HAYV_15.jpeg", + "1HAYV/1HAYV_16.jpeg", + "1HAYV/1HAYV_17.jpeg", + "1HAYV/1HAYV_18.jpeg", + "1HAYV/1HAYV_19.jpeg", + "1HAYV/1HAYV_20.jpeg", + "1HAYV/1HAYV_21.jpeg", + "1HAYV/1HAYV_22.jpeg", + "1HAYV/1HAYV_23.jpeg", + "1HAYV/1HAYV_24.jpeg", + "1HAYV/1HAYV_25.jpeg", + "1HAYV/1HAYV_26.jpeg", + "1HAYV/1HAYV_27.jpeg", + "1HAYV/1HAYV_28.jpeg", + "1HAYV/1HAYV_29.jpeg", + "1HAYV/1HAYV_30.jpeg", + "1HAYV/1HAYV_31.jpeg", + "1HAYV/1HAYV_32.jpeg", + "1HAYV/1HAYV_33.jpeg", + "1HAYV/1HAYV_34.jpeg", + "1HAYV/1HAYV_35.jpeg", + "1HAYV/1HAYV_36.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 193, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person closed the door?\nChoice list: \nA. Opened the bag.\nB. Put down the sandwich.\nC. Put down the pillow.\nD. Took the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the pillow", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "opened the bag", + "put down the sandwich", + "put down the pillow", + "took the food" + ], + "image_quantity_level": "Medium", + "image": [ + "3W6TL/3W6TL_0.jpeg", + "3W6TL/3W6TL_1.jpeg", + "3W6TL/3W6TL_2.jpeg", + "3W6TL/3W6TL_3.jpeg", + "3W6TL/3W6TL_4.jpeg", + "3W6TL/3W6TL_5.jpeg", + "3W6TL/3W6TL_6.jpeg", + "3W6TL/3W6TL_7.jpeg", + "3W6TL/3W6TL_8.jpeg", + "3W6TL/3W6TL_9.jpeg", + "3W6TL/3W6TL_10.jpeg", + "3W6TL/3W6TL_11.jpeg", + "3W6TL/3W6TL_12.jpeg", + "3W6TL/3W6TL_13.jpeg", + "3W6TL/3W6TL_14.jpeg", + "3W6TL/3W6TL_15.jpeg", + "3W6TL/3W6TL_16.jpeg", + "3W6TL/3W6TL_17.jpeg", + "3W6TL/3W6TL_18.jpeg", + "3W6TL/3W6TL_19.jpeg", + "3W6TL/3W6TL_20.jpeg", + "3W6TL/3W6TL_21.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 141, + "question": "Analyze the series of events in the provided pictures and answer the related inquiry. You must choose your answer from the Choice List.\nWhat happened before the person took the cup/glass/bottle?\nChoice list: \nA. Sat on the sofa/couch.\nB. Put down the paper/notebook.\nC. Sat on the bed.\nD. Washed the table.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sat on the bed", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "sat on the sofa couch", + "put down the paper notebook", + "sat on the bed", + "washed the table" + ], + "image_quantity_level": "Medium", + "image": [ + "JY6TC/JY6TC_0.jpeg", + "JY6TC/JY6TC_1.jpeg", + "JY6TC/JY6TC_2.jpeg", + "JY6TC/JY6TC_3.jpeg", + "JY6TC/JY6TC_4.jpeg", + "JY6TC/JY6TC_5.jpeg", + "JY6TC/JY6TC_6.jpeg", + "JY6TC/JY6TC_7.jpeg", + "JY6TC/JY6TC_8.jpeg", + "JY6TC/JY6TC_9.jpeg", + "JY6TC/JY6TC_10.jpeg", + "JY6TC/JY6TC_11.jpeg", + "JY6TC/JY6TC_12.jpeg", + "JY6TC/JY6TC_13.jpeg", + "JY6TC/JY6TC_14.jpeg", + "JY6TC/JY6TC_15.jpeg", + "JY6TC/JY6TC_16.jpeg", + "JY6TC/JY6TC_17.jpeg", + "JY6TC/JY6TC_18.jpeg", + "JY6TC/JY6TC_19.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 145, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened before the person closed the box?\nChoice list: \nA. Put down the food.\nB. Threw the towel.\nC. Took the shoe.\nD. Opened the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down the food", + "threw the towel", + "took the shoe", + "opened the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "TIPFG/TIPFG_0.jpeg", + "TIPFG/TIPFG_1.jpeg", + "TIPFG/TIPFG_2.jpeg", + "TIPFG/TIPFG_3.jpeg", + "TIPFG/TIPFG_4.jpeg", + "TIPFG/TIPFG_5.jpeg", + "TIPFG/TIPFG_6.jpeg", + "TIPFG/TIPFG_7.jpeg", + "TIPFG/TIPFG_8.jpeg", + "TIPFG/TIPFG_9.jpeg", + "TIPFG/TIPFG_10.jpeg", + "TIPFG/TIPFG_11.jpeg", + "TIPFG/TIPFG_12.jpeg", + "TIPFG/TIPFG_13.jpeg", + "TIPFG/TIPFG_14.jpeg", + "TIPFG/TIPFG_15.jpeg", + "TIPFG/TIPFG_16.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 151, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person took the food?\nChoice list: \nA. Closed the laptop.\nB. Sat on the sofa/couch.\nC. Took the blanket.\nD. Took the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "closed the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "closed the laptop", + "sat on the sofa couch", + "took the blanket", + "took the clothes" + ], + "image_quantity_level": "Medium", + "image": [ + "IU5TH/IU5TH_0.jpeg", + "IU5TH/IU5TH_1.jpeg", + "IU5TH/IU5TH_2.jpeg", + "IU5TH/IU5TH_3.jpeg", + "IU5TH/IU5TH_4.jpeg", + "IU5TH/IU5TH_5.jpeg", + "IU5TH/IU5TH_6.jpeg", + "IU5TH/IU5TH_7.jpeg", + "IU5TH/IU5TH_8.jpeg", + "IU5TH/IU5TH_9.jpeg", + "IU5TH/IU5TH_10.jpeg", + "IU5TH/IU5TH_11.jpeg", + "IU5TH/IU5TH_12.jpeg", + "IU5TH/IU5TH_13.jpeg", + "IU5TH/IU5TH_14.jpeg", + "IU5TH/IU5TH_15.jpeg", + "IU5TH/IU5TH_16.jpeg", + "IU5TH/IU5TH_17.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 143, + "question": "Evaluate the sequence of actions depicted in the images and provide an answer to the associated question. You must choose your answer from the Choice List.\nWhat happened after the person took the food?\nChoice list: \nA. Ate the medicine.\nB. Tidied up the blanket.\nC. Put down the cup/glass/bottle.\nD. Took the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "ate the medicine", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "ate the medicine", + "tidied up the blanket", + "put down the cup glass bottle", + "took the box" + ], + "image_quantity_level": "Many", + "image": [ + "ZS9XR/ZS9XR_0.jpeg", + "ZS9XR/ZS9XR_1.jpeg", + "ZS9XR/ZS9XR_2.jpeg", + "ZS9XR/ZS9XR_3.jpeg", + "ZS9XR/ZS9XR_4.jpeg", + "ZS9XR/ZS9XR_5.jpeg", + "ZS9XR/ZS9XR_6.jpeg", + "ZS9XR/ZS9XR_7.jpeg", + "ZS9XR/ZS9XR_8.jpeg", + "ZS9XR/ZS9XR_9.jpeg", + "ZS9XR/ZS9XR_10.jpeg", + "ZS9XR/ZS9XR_11.jpeg", + "ZS9XR/ZS9XR_12.jpeg", + "ZS9XR/ZS9XR_13.jpeg", + "ZS9XR/ZS9XR_14.jpeg", + "ZS9XR/ZS9XR_15.jpeg", + "ZS9XR/ZS9XR_16.jpeg", + "ZS9XR/ZS9XR_17.jpeg", + "ZS9XR/ZS9XR_18.jpeg", + "ZS9XR/ZS9XR_19.jpeg", + "ZS9XR/ZS9XR_20.jpeg", + "ZS9XR/ZS9XR_21.jpeg", + "ZS9XR/ZS9XR_22.jpeg", + "ZS9XR/ZS9XR_23.jpeg", + "ZS9XR/ZS9XR_24.jpeg", + "ZS9XR/ZS9XR_25.jpeg", + "ZS9XR/ZS9XR_26.jpeg", + "ZS9XR/ZS9XR_27.jpeg", + "ZS9XR/ZS9XR_28.jpeg", + "ZS9XR/ZS9XR_29.jpeg", + "ZS9XR/ZS9XR_30.jpeg", + "ZS9XR/ZS9XR_31.jpeg", + "ZS9XR/ZS9XR_32.jpeg", + "ZS9XR/ZS9XR_33.jpeg", + "ZS9XR/ZS9XR_34.jpeg", + "ZS9XR/ZS9XR_35.jpeg", + "ZS9XR/ZS9XR_36.jpeg", + "ZS9XR/ZS9XR_37.jpeg", + "ZS9XR/ZS9XR_38.jpeg", + "ZS9XR/ZS9XR_39.jpeg", + "ZS9XR/ZS9XR_40.jpeg", + "ZS9XR/ZS9XR_41.jpeg", + "ZS9XR/ZS9XR_42.jpeg", + "ZS9XR/ZS9XR_43.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 100, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened after the person sat on the sofa/couch?\nChoice list: \nA. Lied on the bed.\nB. Opened the refrigerator.\nC. Took the pillow.\nD. Ate the medicine.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "opened the refrigerator", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "lied on the bed", + "opened the refrigerator", + "took the pillow", + "ate the medicine" + ], + "image_quantity_level": "Medium", + "image": [ + "JOLLV/JOLLV_0.jpeg", + "JOLLV/JOLLV_1.jpeg", + "JOLLV/JOLLV_2.jpeg", + "JOLLV/JOLLV_3.jpeg", + "JOLLV/JOLLV_4.jpeg", + "JOLLV/JOLLV_5.jpeg", + "JOLLV/JOLLV_6.jpeg", + "JOLLV/JOLLV_7.jpeg", + "JOLLV/JOLLV_8.jpeg", + "JOLLV/JOLLV_9.jpeg", + "JOLLV/JOLLV_10.jpeg", + "JOLLV/JOLLV_11.jpeg", + "JOLLV/JOLLV_12.jpeg", + "JOLLV/JOLLV_13.jpeg", + "JOLLV/JOLLV_14.jpeg", + "JOLLV/JOLLV_15.jpeg", + "JOLLV/JOLLV_16.jpeg", + "JOLLV/JOLLV_17.jpeg", + "JOLLV/JOLLV_18.jpeg", + "JOLLV/JOLLV_19.jpeg", + "JOLLV/JOLLV_20.jpeg", + "JOLLV/JOLLV_21.jpeg", + "JOLLV/JOLLV_22.jpeg", + "JOLLV/JOLLV_23.jpeg", + "JOLLV/JOLLV_24.jpeg", + "JOLLV/JOLLV_25.jpeg", + "JOLLV/JOLLV_26.jpeg", + "JOLLV/JOLLV_27.jpeg", + "JOLLV/JOLLV_28.jpeg", + "JOLLV/JOLLV_29.jpeg", + "JOLLV/JOLLV_30.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 113, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened before the person opened the refrigerator?\nChoice list: \nA. Opened the bag.\nB. Put down the dish.\nC. Put down the food.\nD. Took the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "opened the bag", + "put down the dish", + "put down the food", + "took the clothes" + ], + "image_quantity_level": "Medium", + "image": [ + "S1J2Q/S1J2Q_0.jpeg", + "S1J2Q/S1J2Q_1.jpeg", + "S1J2Q/S1J2Q_2.jpeg", + "S1J2Q/S1J2Q_3.jpeg", + "S1J2Q/S1J2Q_4.jpeg", + "S1J2Q/S1J2Q_5.jpeg", + "S1J2Q/S1J2Q_6.jpeg", + "S1J2Q/S1J2Q_7.jpeg", + "S1J2Q/S1J2Q_8.jpeg", + "S1J2Q/S1J2Q_9.jpeg", + "S1J2Q/S1J2Q_10.jpeg", + "S1J2Q/S1J2Q_11.jpeg", + "S1J2Q/S1J2Q_12.jpeg", + "S1J2Q/S1J2Q_13.jpeg", + "S1J2Q/S1J2Q_14.jpeg", + "S1J2Q/S1J2Q_15.jpeg", + "S1J2Q/S1J2Q_16.jpeg", + "S1J2Q/S1J2Q_17.jpeg", + "S1J2Q/S1J2Q_18.jpeg", + "S1J2Q/S1J2Q_19.jpeg", + "S1J2Q/S1J2Q_20.jpeg", + "S1J2Q/S1J2Q_21.jpeg", + "S1J2Q/S1J2Q_22.jpeg", + "S1J2Q/S1J2Q_23.jpeg", + "S1J2Q/S1J2Q_24.jpeg", + "S1J2Q/S1J2Q_25.jpeg", + "S1J2Q/S1J2Q_26.jpeg", + "S1J2Q/S1J2Q_27.jpeg", + "S1J2Q/S1J2Q_28.jpeg", + "S1J2Q/S1J2Q_29.jpeg", + "S1J2Q/S1J2Q_30.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 115, + "question": "Based on the provided images, answer the question related to the sequence of action You must choose your answer from the Choice List.\nWhat happened after the person lied on the floor?\nChoice list: \nA. Put down the clothes.\nB. Tidied up the blanket.\nC. Opened the book.\nD. Took the cup/glass/bottle.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the cup glass bottle", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the clothes", + "tidied up the blanket", + "opened the book", + "took the cup glass bottle" + ], + "image_quantity_level": "Medium", + "image": [ + "1G90H/1G90H_0.jpeg", + "1G90H/1G90H_1.jpeg", + "1G90H/1G90H_2.jpeg", + "1G90H/1G90H_3.jpeg", + "1G90H/1G90H_4.jpeg", + "1G90H/1G90H_5.jpeg", + "1G90H/1G90H_6.jpeg", + "1G90H/1G90H_7.jpeg", + "1G90H/1G90H_8.jpeg", + "1G90H/1G90H_9.jpeg", + "1G90H/1G90H_10.jpeg", + "1G90H/1G90H_11.jpeg", + "1G90H/1G90H_12.jpeg", + "1G90H/1G90H_13.jpeg", + "1G90H/1G90H_14.jpeg", + "1G90H/1G90H_15.jpeg", + "1G90H/1G90H_16.jpeg", + "1G90H/1G90H_17.jpeg", + "1G90H/1G90H_18.jpeg", + "1G90H/1G90H_19.jpeg", + "1G90H/1G90H_20.jpeg", + "1G90H/1G90H_21.jpeg", + "1G90H/1G90H_22.jpeg", + "1G90H/1G90H_23.jpeg", + "1G90H/1G90H_24.jpeg", + "1G90H/1G90H_25.jpeg", + "1G90H/1G90H_26.jpeg", + "1G90H/1G90H_27.jpeg", + "1G90H/1G90H_28.jpeg", + "1G90H/1G90H_29.jpeg", + "1G90H/1G90H_30.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 127, + "question": "Based on the provided images, answer the question related to the sequence of action You must choose your answer from the Choice List.\nWhat happened before the person put down the cup/glass/bottle?\nChoice list: \nA. Took the food.\nB. Sat on the bed.\nC. Opened the laptop.\nD. Took the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "opened the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "took the food", + "sat on the bed", + "opened the laptop", + "took the book" + ], + "image_quantity_level": "Medium", + "image": [ + "21MI8/21MI8_0.jpeg", + "21MI8/21MI8_1.jpeg", + "21MI8/21MI8_2.jpeg", + "21MI8/21MI8_3.jpeg", + "21MI8/21MI8_4.jpeg", + "21MI8/21MI8_5.jpeg", + "21MI8/21MI8_6.jpeg", + "21MI8/21MI8_7.jpeg", + "21MI8/21MI8_8.jpeg", + "21MI8/21MI8_9.jpeg", + "21MI8/21MI8_10.jpeg", + "21MI8/21MI8_11.jpeg", + "21MI8/21MI8_12.jpeg", + "21MI8/21MI8_13.jpeg", + "21MI8/21MI8_14.jpeg", + "21MI8/21MI8_15.jpeg", + "21MI8/21MI8_16.jpeg", + "21MI8/21MI8_17.jpeg", + "21MI8/21MI8_18.jpeg", + "21MI8/21MI8_19.jpeg", + "21MI8/21MI8_20.jpeg", + "21MI8/21MI8_21.jpeg", + "21MI8/21MI8_22.jpeg", + "21MI8/21MI8_23.jpeg", + "21MI8/21MI8_24.jpeg", + "21MI8/21MI8_25.jpeg", + "21MI8/21MI8_26.jpeg", + "21MI8/21MI8_27.jpeg", + "21MI8/21MI8_28.jpeg", + "21MI8/21MI8_29.jpeg", + "21MI8/21MI8_30.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 132, + "question": "Inspect the flow of events in the presented pictures and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person washed the dish?\nChoice list: \nA. Washed the table.\nB. Took the cup/glass/bottle.\nC. Tidied up the towel.\nD. Put down the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "washed the table", + "took the cup glass bottle", + "tidied up the towel", + "put down the box" + ], + "image_quantity_level": "Medium", + "image": [ + "VO45S/VO45S_0.jpeg", + "VO45S/VO45S_1.jpeg", + "VO45S/VO45S_2.jpeg", + "VO45S/VO45S_3.jpeg", + "VO45S/VO45S_4.jpeg", + "VO45S/VO45S_5.jpeg", + "VO45S/VO45S_6.jpeg", + "VO45S/VO45S_7.jpeg", + "VO45S/VO45S_8.jpeg", + "VO45S/VO45S_9.jpeg", + "VO45S/VO45S_10.jpeg", + "VO45S/VO45S_11.jpeg", + "VO45S/VO45S_12.jpeg", + "VO45S/VO45S_13.jpeg", + "VO45S/VO45S_14.jpeg", + "VO45S/VO45S_15.jpeg", + "VO45S/VO45S_16.jpeg", + "VO45S/VO45S_17.jpeg", + "VO45S/VO45S_18.jpeg", + "VO45S/VO45S_19.jpeg", + "VO45S/VO45S_20.jpeg", + "VO45S/VO45S_21.jpeg", + "VO45S/VO45S_22.jpeg", + "VO45S/VO45S_23.jpeg", + "VO45S/VO45S_24.jpeg", + "VO45S/VO45S_25.jpeg", + "VO45S/VO45S_26.jpeg", + "VO45S/VO45S_27.jpeg", + "VO45S/VO45S_28.jpeg", + "VO45S/VO45S_29.jpeg", + "VO45S/VO45S_30.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 138, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened before the person opened the refrigerator?\nChoice list: \nA. Took the food.\nB. Took the pillow.\nC. Sat on the sofa/couch.\nD. Put down the pillow.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sat on the sofa couch", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "took the food", + "took the pillow", + "sat on the sofa couch", + "put down the pillow" + ], + "image_quantity_level": "Medium", + "image": [ + "JOLLV/JOLLV_0.jpeg", + "JOLLV/JOLLV_1.jpeg", + "JOLLV/JOLLV_2.jpeg", + "JOLLV/JOLLV_3.jpeg", + "JOLLV/JOLLV_4.jpeg", + "JOLLV/JOLLV_5.jpeg", + "JOLLV/JOLLV_6.jpeg", + "JOLLV/JOLLV_7.jpeg", + "JOLLV/JOLLV_8.jpeg", + "JOLLV/JOLLV_9.jpeg", + "JOLLV/JOLLV_10.jpeg", + "JOLLV/JOLLV_11.jpeg", + "JOLLV/JOLLV_12.jpeg", + "JOLLV/JOLLV_13.jpeg", + "JOLLV/JOLLV_14.jpeg", + "JOLLV/JOLLV_15.jpeg", + "JOLLV/JOLLV_16.jpeg", + "JOLLV/JOLLV_17.jpeg", + "JOLLV/JOLLV_18.jpeg", + "JOLLV/JOLLV_19.jpeg", + "JOLLV/JOLLV_20.jpeg", + "JOLLV/JOLLV_21.jpeg", + "JOLLV/JOLLV_22.jpeg", + "JOLLV/JOLLV_23.jpeg", + "JOLLV/JOLLV_24.jpeg", + "JOLLV/JOLLV_25.jpeg", + "JOLLV/JOLLV_26.jpeg", + "JOLLV/JOLLV_27.jpeg", + "JOLLV/JOLLV_28.jpeg", + "JOLLV/JOLLV_29.jpeg", + "JOLLV/JOLLV_30.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 118, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened before the person took the cup/glass/bottle?\nChoice list: \nA. Put down the dish.\nB. Opened the book.\nC. Put down the phone/camera.\nD. Took the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the phone camera", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the dish", + "opened the book", + "put down the phone camera", + "took the food" + ], + "image_quantity_level": "Many", + "image": [ + "J7BOV/J7BOV_0.jpeg", + "J7BOV/J7BOV_1.jpeg", + "J7BOV/J7BOV_2.jpeg", + "J7BOV/J7BOV_3.jpeg", + "J7BOV/J7BOV_4.jpeg", + "J7BOV/J7BOV_5.jpeg", + "J7BOV/J7BOV_6.jpeg", + "J7BOV/J7BOV_7.jpeg", + "J7BOV/J7BOV_8.jpeg", + "J7BOV/J7BOV_9.jpeg", + "J7BOV/J7BOV_10.jpeg", + "J7BOV/J7BOV_11.jpeg", + "J7BOV/J7BOV_12.jpeg", + "J7BOV/J7BOV_13.jpeg", + "J7BOV/J7BOV_14.jpeg", + "J7BOV/J7BOV_15.jpeg", + "J7BOV/J7BOV_16.jpeg", + "J7BOV/J7BOV_17.jpeg", + "J7BOV/J7BOV_18.jpeg", + "J7BOV/J7BOV_19.jpeg", + "J7BOV/J7BOV_20.jpeg", + "J7BOV/J7BOV_21.jpeg", + "J7BOV/J7BOV_22.jpeg", + "J7BOV/J7BOV_23.jpeg", + "J7BOV/J7BOV_24.jpeg", + "J7BOV/J7BOV_25.jpeg", + "J7BOV/J7BOV_26.jpeg", + "J7BOV/J7BOV_27.jpeg", + "J7BOV/J7BOV_28.jpeg", + "J7BOV/J7BOV_29.jpeg", + "J7BOV/J7BOV_30.jpeg", + "J7BOV/J7BOV_31.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 124, + "question": "Assess the progression of actions in the presented graphics and answer the connected query. You must choose your answer from the Choice List.\nWhat happened after the person put down the pillow?\nChoice list: \nA. Put down the laptop.\nB. Tidied up the blanket.\nC. Closed the door.\nD. Took the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "put down the laptop", + "tidied up the blanket", + "closed the door", + "took the book" + ], + "image_quantity_level": "Many", + "image": [ + "3DO95/3DO95_0.jpeg", + "3DO95/3DO95_1.jpeg", + "3DO95/3DO95_2.jpeg", + "3DO95/3DO95_3.jpeg", + "3DO95/3DO95_4.jpeg", + "3DO95/3DO95_5.jpeg", + "3DO95/3DO95_6.jpeg", + "3DO95/3DO95_7.jpeg", + "3DO95/3DO95_8.jpeg", + "3DO95/3DO95_9.jpeg", + "3DO95/3DO95_10.jpeg", + "3DO95/3DO95_11.jpeg", + "3DO95/3DO95_12.jpeg", + "3DO95/3DO95_13.jpeg", + "3DO95/3DO95_14.jpeg", + "3DO95/3DO95_15.jpeg", + "3DO95/3DO95_16.jpeg", + "3DO95/3DO95_17.jpeg", + "3DO95/3DO95_18.jpeg", + "3DO95/3DO95_19.jpeg", + "3DO95/3DO95_20.jpeg", + "3DO95/3DO95_21.jpeg", + "3DO95/3DO95_22.jpeg", + "3DO95/3DO95_23.jpeg", + "3DO95/3DO95_24.jpeg", + "3DO95/3DO95_25.jpeg", + "3DO95/3DO95_26.jpeg", + "3DO95/3DO95_27.jpeg", + "3DO95/3DO95_28.jpeg", + "3DO95/3DO95_29.jpeg", + "3DO95/3DO95_30.jpeg", + "3DO95/3DO95_31.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 130, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened after the person threw the clothes?\nChoice list: \nA. Ate the sandwich.\nB. Put down the clothes.\nC. Took the cup/glass/bottle.\nD. Took the pillow.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "ate the sandwich", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "ate the sandwich", + "put down the clothes", + "took the cup glass bottle", + "took the pillow" + ], + "image_quantity_level": "Many", + "image": [ + "17RPG/17RPG_0.jpeg", + "17RPG/17RPG_1.jpeg", + "17RPG/17RPG_2.jpeg", + "17RPG/17RPG_3.jpeg", + "17RPG/17RPG_4.jpeg", + "17RPG/17RPG_5.jpeg", + "17RPG/17RPG_6.jpeg", + "17RPG/17RPG_7.jpeg", + "17RPG/17RPG_8.jpeg", + "17RPG/17RPG_9.jpeg", + "17RPG/17RPG_10.jpeg", + "17RPG/17RPG_11.jpeg", + "17RPG/17RPG_12.jpeg", + "17RPG/17RPG_13.jpeg", + "17RPG/17RPG_14.jpeg", + "17RPG/17RPG_15.jpeg", + "17RPG/17RPG_16.jpeg", + "17RPG/17RPG_17.jpeg", + "17RPG/17RPG_18.jpeg", + "17RPG/17RPG_19.jpeg", + "17RPG/17RPG_20.jpeg", + "17RPG/17RPG_21.jpeg", + "17RPG/17RPG_22.jpeg", + "17RPG/17RPG_23.jpeg", + "17RPG/17RPG_24.jpeg", + "17RPG/17RPG_25.jpeg", + "17RPG/17RPG_26.jpeg", + "17RPG/17RPG_27.jpeg", + "17RPG/17RPG_28.jpeg", + "17RPG/17RPG_29.jpeg", + "17RPG/17RPG_30.jpeg", + "17RPG/17RPG_31.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 126, + "question": "Analyze the series of events in the provided pictures and answer the related inquiry. You must choose your answer from the Choice List.\nWhat happened after the person sat on the floor?\nChoice list: \nA. Took the laptop.\nB. Threw the blanket.\nC. Put down the pillow.\nD. Took the broom.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the broom", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "took the laptop", + "threw the blanket", + "put down the pillow", + "took the broom" + ], + "image_quantity_level": "Medium", + "image": [ + "L29HE/L29HE_0.jpeg", + "L29HE/L29HE_1.jpeg", + "L29HE/L29HE_2.jpeg", + "L29HE/L29HE_3.jpeg", + "L29HE/L29HE_4.jpeg", + "L29HE/L29HE_5.jpeg", + "L29HE/L29HE_6.jpeg", + "L29HE/L29HE_7.jpeg", + "L29HE/L29HE_8.jpeg", + "L29HE/L29HE_9.jpeg", + "L29HE/L29HE_10.jpeg", + "L29HE/L29HE_11.jpeg", + "L29HE/L29HE_12.jpeg", + "L29HE/L29HE_13.jpeg", + "L29HE/L29HE_14.jpeg", + "L29HE/L29HE_15.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 152, + "question": "Evaluate the sequence of actions depicted in the images and provide an answer to the associated question. You must choose your answer from the Choice List.\nWhat happened before the person sat on the sofa/couch?\nChoice list: \nA. Took the clothes.\nB. Put down the picture.\nC. Closed the window.\nD. Put down the phone/camera.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the phone camera", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "took the clothes", + "put down the picture", + "closed the window", + "put down the phone camera" + ], + "image_quantity_level": "Many", + "image": [ + "OY3LS/OY3LS_0.jpeg", + "OY3LS/OY3LS_1.jpeg", + "OY3LS/OY3LS_2.jpeg", + "OY3LS/OY3LS_3.jpeg", + "OY3LS/OY3LS_4.jpeg", + "OY3LS/OY3LS_5.jpeg", + "OY3LS/OY3LS_6.jpeg", + "OY3LS/OY3LS_7.jpeg", + "OY3LS/OY3LS_8.jpeg", + "OY3LS/OY3LS_9.jpeg", + "OY3LS/OY3LS_10.jpeg", + "OY3LS/OY3LS_11.jpeg", + "OY3LS/OY3LS_12.jpeg", + "OY3LS/OY3LS_13.jpeg", + "OY3LS/OY3LS_14.jpeg", + "OY3LS/OY3LS_15.jpeg", + "OY3LS/OY3LS_16.jpeg", + "OY3LS/OY3LS_17.jpeg", + "OY3LS/OY3LS_18.jpeg", + "OY3LS/OY3LS_19.jpeg", + "OY3LS/OY3LS_20.jpeg", + "OY3LS/OY3LS_21.jpeg", + "OY3LS/OY3LS_22.jpeg", + "OY3LS/OY3LS_23.jpeg", + "OY3LS/OY3LS_24.jpeg", + "OY3LS/OY3LS_25.jpeg", + "OY3LS/OY3LS_26.jpeg", + "OY3LS/OY3LS_27.jpeg", + "OY3LS/OY3LS_28.jpeg", + "OY3LS/OY3LS_29.jpeg", + "OY3LS/OY3LS_30.jpeg", + "OY3LS/OY3LS_31.jpeg", + "OY3LS/OY3LS_32.jpeg", + "OY3LS/OY3LS_33.jpeg", + "OY3LS/OY3LS_34.jpeg", + "OY3LS/OY3LS_35.jpeg", + "OY3LS/OY3LS_36.jpeg", + "OY3LS/OY3LS_37.jpeg", + "OY3LS/OY3LS_38.jpeg", + "OY3LS/OY3LS_39.jpeg", + "OY3LS/OY3LS_40.jpeg", + "OY3LS/OY3LS_41.jpeg", + "OY3LS/OY3LS_42.jpeg", + "OY3LS/OY3LS_43.jpeg", + "OY3LS/OY3LS_44.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 116, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened after the person drank from the cup/glass/bottle?\nChoice list: \nA. Took the towel.\nB. Put down the dish.\nC. Opened the book.\nD. Took the dish.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the dish", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "took the towel", + "put down the dish", + "opened the book", + "took the dish" + ], + "image_quantity_level": "Many", + "image": [ + "F0ZPW/F0ZPW_0.jpeg", + "F0ZPW/F0ZPW_1.jpeg", + "F0ZPW/F0ZPW_2.jpeg", + "F0ZPW/F0ZPW_3.jpeg", + "F0ZPW/F0ZPW_4.jpeg", + "F0ZPW/F0ZPW_5.jpeg", + "F0ZPW/F0ZPW_6.jpeg", + "F0ZPW/F0ZPW_7.jpeg", + "F0ZPW/F0ZPW_8.jpeg", + "F0ZPW/F0ZPW_9.jpeg", + "F0ZPW/F0ZPW_10.jpeg", + "F0ZPW/F0ZPW_11.jpeg", + "F0ZPW/F0ZPW_12.jpeg", + "F0ZPW/F0ZPW_13.jpeg", + "F0ZPW/F0ZPW_14.jpeg", + "F0ZPW/F0ZPW_15.jpeg", + "F0ZPW/F0ZPW_16.jpeg", + "F0ZPW/F0ZPW_17.jpeg", + "F0ZPW/F0ZPW_18.jpeg", + "F0ZPW/F0ZPW_19.jpeg", + "F0ZPW/F0ZPW_20.jpeg", + "F0ZPW/F0ZPW_21.jpeg", + "F0ZPW/F0ZPW_22.jpeg", + "F0ZPW/F0ZPW_23.jpeg", + "F0ZPW/F0ZPW_24.jpeg", + "F0ZPW/F0ZPW_25.jpeg", + "F0ZPW/F0ZPW_26.jpeg", + "F0ZPW/F0ZPW_27.jpeg", + "F0ZPW/F0ZPW_28.jpeg", + "F0ZPW/F0ZPW_29.jpeg", + "F0ZPW/F0ZPW_30.jpeg", + "F0ZPW/F0ZPW_31.jpeg", + "F0ZPW/F0ZPW_32.jpeg", + "F0ZPW/F0ZPW_33.jpeg", + "F0ZPW/F0ZPW_34.jpeg", + "F0ZPW/F0ZPW_35.jpeg", + "F0ZPW/F0ZPW_36.jpeg", + "F0ZPW/F0ZPW_37.jpeg", + "F0ZPW/F0ZPW_38.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 120, + "question": "Evaluate the sequence of actions depicted in the images and provide an answer to the associated question. You must choose your answer from the Choice List.\nWhat happened before the person held the dish?\nChoice list: \nA. Closed the door.\nB. Put down the food.\nC. Closed the refrigerator.\nD. Took the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the food", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "closed the door", + "put down the food", + "closed the refrigerator", + "took the food" + ], + "image_quantity_level": "Medium", + "image": [ + "UIERL/UIERL_0.jpeg", + "UIERL/UIERL_1.jpeg", + "UIERL/UIERL_2.jpeg", + "UIERL/UIERL_3.jpeg", + "UIERL/UIERL_4.jpeg", + "UIERL/UIERL_5.jpeg", + "UIERL/UIERL_6.jpeg", + "UIERL/UIERL_7.jpeg", + "UIERL/UIERL_8.jpeg", + "UIERL/UIERL_9.jpeg", + "UIERL/UIERL_10.jpeg", + "UIERL/UIERL_11.jpeg", + "UIERL/UIERL_12.jpeg", + "UIERL/UIERL_13.jpeg", + "UIERL/UIERL_14.jpeg", + "UIERL/UIERL_15.jpeg", + "UIERL/UIERL_16.jpeg", + "UIERL/UIERL_17.jpeg", + "UIERL/UIERL_18.jpeg", + "UIERL/UIERL_19.jpeg", + "UIERL/UIERL_20.jpeg", + "UIERL/UIERL_21.jpeg", + "UIERL/UIERL_22.jpeg", + "UIERL/UIERL_23.jpeg", + "UIERL/UIERL_24.jpeg", + "UIERL/UIERL_25.jpeg", + "UIERL/UIERL_26.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 123, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person ate the sandwich?\nChoice list: \nA. Sat on the floor.\nB. Closed the refrigerator.\nC. Closed the closet/cabinet.\nD. Took the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "closed the refrigerator", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "sat on the floor", + "closed the refrigerator", + "closed the closet cabinet", + "took the box" + ], + "image_quantity_level": "Medium", + "image": [ + "R9382/R9382_0.jpeg", + "R9382/R9382_1.jpeg", + "R9382/R9382_2.jpeg", + "R9382/R9382_3.jpeg", + "R9382/R9382_4.jpeg", + "R9382/R9382_5.jpeg", + "R9382/R9382_6.jpeg", + "R9382/R9382_7.jpeg", + "R9382/R9382_8.jpeg", + "R9382/R9382_9.jpeg", + "R9382/R9382_10.jpeg", + "R9382/R9382_11.jpeg", + "R9382/R9382_12.jpeg", + "R9382/R9382_13.jpeg", + "R9382/R9382_14.jpeg", + "R9382/R9382_15.jpeg", + "R9382/R9382_16.jpeg", + "R9382/R9382_17.jpeg", + "R9382/R9382_18.jpeg", + "R9382/R9382_19.jpeg", + "R9382/R9382_20.jpeg", + "R9382/R9382_21.jpeg", + "R9382/R9382_22.jpeg", + "R9382/R9382_23.jpeg", + "R9382/R9382_24.jpeg", + "R9382/R9382_25.jpeg", + "R9382/R9382_26.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 133, + "question": "Observe the continuum of events in the provided pictorials and reply to the pertinent question. You must choose your answer from the Choice List.\nWhat happened after the person closed the laptop?\nChoice list: \nA. Put down the sandwich.\nB. Took the cup/glass/bottle.\nC. Took the pillow.\nD. Put down the phone/camera.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the phone camera", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "put down the sandwich", + "took the cup glass bottle", + "took the pillow", + "put down the phone camera" + ], + "image_quantity_level": "Medium", + "image": [ + "C1DK7/C1DK7_0.jpeg", + "C1DK7/C1DK7_1.jpeg", + "C1DK7/C1DK7_2.jpeg", + "C1DK7/C1DK7_3.jpeg", + "C1DK7/C1DK7_4.jpeg", + "C1DK7/C1DK7_5.jpeg", + "C1DK7/C1DK7_6.jpeg", + "C1DK7/C1DK7_7.jpeg", + "C1DK7/C1DK7_8.jpeg", + "C1DK7/C1DK7_9.jpeg", + "C1DK7/C1DK7_10.jpeg", + "C1DK7/C1DK7_11.jpeg", + "C1DK7/C1DK7_12.jpeg", + "C1DK7/C1DK7_13.jpeg", + "C1DK7/C1DK7_14.jpeg", + "C1DK7/C1DK7_15.jpeg", + "C1DK7/C1DK7_16.jpeg", + "C1DK7/C1DK7_17.jpeg", + "C1DK7/C1DK7_18.jpeg", + "C1DK7/C1DK7_19.jpeg", + "C1DK7/C1DK7_20.jpeg", + "C1DK7/C1DK7_21.jpeg", + "C1DK7/C1DK7_22.jpeg", + "C1DK7/C1DK7_23.jpeg", + "C1DK7/C1DK7_24.jpeg", + "C1DK7/C1DK7_25.jpeg", + "C1DK7/C1DK7_26.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 54, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person opened the door?\nChoice list: \nA. Tidied up the clothes.\nB. Put down the bag.\nC. Put down the food.\nD. Opened the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "tidied up the clothes", + "put down the bag", + "put down the food", + "opened the book" + ], + "image_quantity_level": "Many", + "image": [ + "56XKK/56XKK_0.jpeg", + "56XKK/56XKK_1.jpeg", + "56XKK/56XKK_2.jpeg", + "56XKK/56XKK_3.jpeg", + "56XKK/56XKK_4.jpeg", + "56XKK/56XKK_5.jpeg", + "56XKK/56XKK_6.jpeg", + "56XKK/56XKK_7.jpeg", + "56XKK/56XKK_8.jpeg", + "56XKK/56XKK_9.jpeg", + "56XKK/56XKK_10.jpeg", + "56XKK/56XKK_11.jpeg", + "56XKK/56XKK_12.jpeg", + "56XKK/56XKK_13.jpeg", + "56XKK/56XKK_14.jpeg", + "56XKK/56XKK_15.jpeg", + "56XKK/56XKK_16.jpeg", + "56XKK/56XKK_17.jpeg", + "56XKK/56XKK_18.jpeg", + "56XKK/56XKK_19.jpeg", + "56XKK/56XKK_20.jpeg", + "56XKK/56XKK_21.jpeg", + "56XKK/56XKK_22.jpeg", + "56XKK/56XKK_23.jpeg", + "56XKK/56XKK_24.jpeg", + "56XKK/56XKK_25.jpeg", + "56XKK/56XKK_26.jpeg", + "56XKK/56XKK_27.jpeg", + "56XKK/56XKK_28.jpeg", + "56XKK/56XKK_29.jpeg", + "56XKK/56XKK_30.jpeg", + "56XKK/56XKK_31.jpeg", + "56XKK/56XKK_32.jpeg", + "56XKK/56XKK_33.jpeg", + "56XKK/56XKK_34.jpeg", + "56XKK/56XKK_35.jpeg", + "56XKK/56XKK_36.jpeg", + "56XKK/56XKK_37.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 70, + "question": "Evaluate the sequence of actions depicted in the images and provide an answer to the associated question. You must choose your answer from the Choice List.\nWhat happened after the person lied on the bed?\nChoice list: \nA. Washed the dish.\nB. Opened the laptop.\nC. Put down the pillow.\nD. Took the towel.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the towel", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "washed the dish", + "opened the laptop", + "put down the pillow", + "took the towel" + ], + "image_quantity_level": "Medium", + "image": [ + "KU656/KU656_0.jpeg", + "KU656/KU656_1.jpeg", + "KU656/KU656_2.jpeg", + "KU656/KU656_3.jpeg", + "KU656/KU656_4.jpeg", + "KU656/KU656_5.jpeg", + "KU656/KU656_6.jpeg", + "KU656/KU656_7.jpeg", + "KU656/KU656_8.jpeg", + "KU656/KU656_9.jpeg", + "KU656/KU656_10.jpeg", + "KU656/KU656_11.jpeg", + "KU656/KU656_12.jpeg", + "KU656/KU656_13.jpeg", + "KU656/KU656_14.jpeg", + "KU656/KU656_15.jpeg", + "KU656/KU656_16.jpeg", + "KU656/KU656_17.jpeg", + "KU656/KU656_18.jpeg", + "KU656/KU656_19.jpeg", + "KU656/KU656_20.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 110, + "question": "Review the timeline of actions in the supplied imagery and answer the associated inquiry. You must choose your answer from the Choice List.\nWhat happened after the person lied on the bed?\nChoice list: \nA. Threw the pillow.\nB. Took the book.\nC. Put down the picture.\nD. Closed the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the picture", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "threw the pillow", + "took the book", + "put down the picture", + "closed the box" + ], + "image_quantity_level": "Medium", + "image": [ + "V54TI/V54TI_0.jpeg", + "V54TI/V54TI_1.jpeg", + "V54TI/V54TI_2.jpeg", + "V54TI/V54TI_3.jpeg", + "V54TI/V54TI_4.jpeg", + "V54TI/V54TI_5.jpeg", + "V54TI/V54TI_6.jpeg", + "V54TI/V54TI_7.jpeg", + "V54TI/V54TI_8.jpeg", + "V54TI/V54TI_9.jpeg", + "V54TI/V54TI_10.jpeg", + "V54TI/V54TI_11.jpeg", + "V54TI/V54TI_12.jpeg", + "V54TI/V54TI_13.jpeg", + "V54TI/V54TI_14.jpeg", + "V54TI/V54TI_15.jpeg", + "V54TI/V54TI_16.jpeg", + "V54TI/V54TI_17.jpeg", + "V54TI/V54TI_18.jpeg", + "V54TI/V54TI_19.jpeg", + "V54TI/V54TI_20.jpeg", + "V54TI/V54TI_21.jpeg", + "V54TI/V54TI_22.jpeg", + "V54TI/V54TI_23.jpeg", + "V54TI/V54TI_24.jpeg", + "V54TI/V54TI_25.jpeg", + "V54TI/V54TI_26.jpeg", + "V54TI/V54TI_27.jpeg", + "V54TI/V54TI_28.jpeg", + "V54TI/V54TI_29.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 114, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened before the person closed the book?\nChoice list: \nA. Put down the phone/camera.\nB. Sat at the table.\nC. Put down the cup/glass/bottle.\nD. Put down the broom.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the phone camera", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "put down the phone camera", + "sat at the table", + "put down the cup glass bottle", + "put down the broom" + ], + "image_quantity_level": "Medium", + "image": [ + "UDGRS/UDGRS_0.jpeg", + "UDGRS/UDGRS_1.jpeg", + "UDGRS/UDGRS_2.jpeg", + "UDGRS/UDGRS_3.jpeg", + "UDGRS/UDGRS_4.jpeg", + "UDGRS/UDGRS_5.jpeg", + "UDGRS/UDGRS_6.jpeg", + "UDGRS/UDGRS_7.jpeg", + "UDGRS/UDGRS_8.jpeg", + "UDGRS/UDGRS_9.jpeg", + "UDGRS/UDGRS_10.jpeg", + "UDGRS/UDGRS_11.jpeg", + "UDGRS/UDGRS_12.jpeg", + "UDGRS/UDGRS_13.jpeg", + "UDGRS/UDGRS_14.jpeg", + "UDGRS/UDGRS_15.jpeg", + "UDGRS/UDGRS_16.jpeg", + "UDGRS/UDGRS_17.jpeg", + "UDGRS/UDGRS_18.jpeg", + "UDGRS/UDGRS_19.jpeg", + "UDGRS/UDGRS_20.jpeg", + "UDGRS/UDGRS_21.jpeg", + "UDGRS/UDGRS_22.jpeg", + "UDGRS/UDGRS_23.jpeg", + "UDGRS/UDGRS_24.jpeg", + "UDGRS/UDGRS_25.jpeg", + "UDGRS/UDGRS_26.jpeg", + "UDGRS/UDGRS_27.jpeg", + "UDGRS/UDGRS_28.jpeg", + "UDGRS/UDGRS_29.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 119, + "question": "Assess the progression of actions in the presented graphics and answer the connected query. You must choose your answer from the Choice List.\nWhat happened after the person put down the clothes?\nChoice list: \nA. Put down the pillow.\nB. Took the laptop.\nC. Closed the door.\nD. Closed the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the pillow", + "took the laptop", + "closed the door", + "closed the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "3T785/3T785_0.jpeg", + "3T785/3T785_1.jpeg", + "3T785/3T785_2.jpeg", + "3T785/3T785_3.jpeg", + "3T785/3T785_4.jpeg", + "3T785/3T785_5.jpeg", + "3T785/3T785_6.jpeg", + "3T785/3T785_7.jpeg", + "3T785/3T785_8.jpeg", + "3T785/3T785_9.jpeg", + "3T785/3T785_10.jpeg", + "3T785/3T785_11.jpeg", + "3T785/3T785_12.jpeg", + "3T785/3T785_13.jpeg", + "3T785/3T785_14.jpeg", + "3T785/3T785_15.jpeg", + "3T785/3T785_16.jpeg", + "3T785/3T785_17.jpeg", + "3T785/3T785_18.jpeg", + "3T785/3T785_19.jpeg", + "3T785/3T785_20.jpeg", + "3T785/3T785_21.jpeg", + "3T785/3T785_22.jpeg", + "3T785/3T785_23.jpeg", + "3T785/3T785_24.jpeg", + "3T785/3T785_25.jpeg", + "3T785/3T785_26.jpeg", + "3T785/3T785_27.jpeg", + "3T785/3T785_28.jpeg", + "3T785/3T785_29.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 122, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person held the broom?\nChoice list: \nA. Put down the phone/camera.\nB. Put down the cup/glass/bottle.\nC. Washed the table.\nD. Tidied up the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the phone camera", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the phone camera", + "put down the cup glass bottle", + "washed the table", + "tidied up the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "AEUVY/AEUVY_0.jpeg", + "AEUVY/AEUVY_1.jpeg", + "AEUVY/AEUVY_2.jpeg", + "AEUVY/AEUVY_3.jpeg", + "AEUVY/AEUVY_4.jpeg", + "AEUVY/AEUVY_5.jpeg", + "AEUVY/AEUVY_6.jpeg", + "AEUVY/AEUVY_7.jpeg", + "AEUVY/AEUVY_8.jpeg", + "AEUVY/AEUVY_9.jpeg", + "AEUVY/AEUVY_10.jpeg", + "AEUVY/AEUVY_11.jpeg", + "AEUVY/AEUVY_12.jpeg", + "AEUVY/AEUVY_13.jpeg", + "AEUVY/AEUVY_14.jpeg", + "AEUVY/AEUVY_15.jpeg", + "AEUVY/AEUVY_16.jpeg", + "AEUVY/AEUVY_17.jpeg", + "AEUVY/AEUVY_18.jpeg", + "AEUVY/AEUVY_19.jpeg", + "AEUVY/AEUVY_20.jpeg", + "AEUVY/AEUVY_21.jpeg", + "AEUVY/AEUVY_22.jpeg", + "AEUVY/AEUVY_23.jpeg", + "AEUVY/AEUVY_24.jpeg", + "AEUVY/AEUVY_25.jpeg", + "AEUVY/AEUVY_26.jpeg", + "AEUVY/AEUVY_27.jpeg", + "AEUVY/AEUVY_28.jpeg", + "AEUVY/AEUVY_29.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 128, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened after the person held the dish?\nChoice list: \nA. Put down the cup/glass/bottle.\nB. Sat on the floor.\nC. Took the cup/glass/bottle.\nD. Threw the towel.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the cup glass bottle", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the cup glass bottle", + "sat on the floor", + "took the cup glass bottle", + "threw the towel" + ], + "image_quantity_level": "Medium", + "image": [ + "DSZYT/DSZYT_0.jpeg", + "DSZYT/DSZYT_1.jpeg", + "DSZYT/DSZYT_2.jpeg", + "DSZYT/DSZYT_3.jpeg", + "DSZYT/DSZYT_4.jpeg", + "DSZYT/DSZYT_5.jpeg", + "DSZYT/DSZYT_6.jpeg", + "DSZYT/DSZYT_7.jpeg", + "DSZYT/DSZYT_8.jpeg", + "DSZYT/DSZYT_9.jpeg", + "DSZYT/DSZYT_10.jpeg", + "DSZYT/DSZYT_11.jpeg", + "DSZYT/DSZYT_12.jpeg", + "DSZYT/DSZYT_13.jpeg", + "DSZYT/DSZYT_14.jpeg", + "DSZYT/DSZYT_15.jpeg", + "DSZYT/DSZYT_16.jpeg", + "DSZYT/DSZYT_17.jpeg", + "DSZYT/DSZYT_18.jpeg", + "DSZYT/DSZYT_19.jpeg", + "DSZYT/DSZYT_20.jpeg", + "DSZYT/DSZYT_21.jpeg", + "DSZYT/DSZYT_22.jpeg", + "DSZYT/DSZYT_23.jpeg", + "DSZYT/DSZYT_24.jpeg", + "DSZYT/DSZYT_25.jpeg", + "DSZYT/DSZYT_26.jpeg", + "DSZYT/DSZYT_27.jpeg", + "DSZYT/DSZYT_28.jpeg", + "DSZYT/DSZYT_29.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 137, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened before the person opened the closet/cabinet?\nChoice list: \nA. Closed the laptop.\nB. Put down the cup/glass/bottle.\nC. Put down the food.\nD. Threw the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "closed the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "closed the laptop", + "put down the cup glass bottle", + "put down the food", + "threw the bag" + ], + "image_quantity_level": "Medium", + "image": [ + "P2HZG/P2HZG_0.jpeg", + "P2HZG/P2HZG_1.jpeg", + "P2HZG/P2HZG_2.jpeg", + "P2HZG/P2HZG_3.jpeg", + "P2HZG/P2HZG_4.jpeg", + "P2HZG/P2HZG_5.jpeg", + "P2HZG/P2HZG_6.jpeg", + "P2HZG/P2HZG_7.jpeg", + "P2HZG/P2HZG_8.jpeg", + "P2HZG/P2HZG_9.jpeg", + "P2HZG/P2HZG_10.jpeg", + "P2HZG/P2HZG_11.jpeg", + "P2HZG/P2HZG_12.jpeg", + "P2HZG/P2HZG_13.jpeg", + "P2HZG/P2HZG_14.jpeg", + "P2HZG/P2HZG_15.jpeg", + "P2HZG/P2HZG_16.jpeg", + "P2HZG/P2HZG_17.jpeg", + "P2HZG/P2HZG_18.jpeg", + "P2HZG/P2HZG_19.jpeg", + "P2HZG/P2HZG_20.jpeg", + "P2HZG/P2HZG_21.jpeg", + "P2HZG/P2HZG_22.jpeg", + "P2HZG/P2HZG_23.jpeg", + "P2HZG/P2HZG_24.jpeg", + "P2HZG/P2HZG_25.jpeg", + "P2HZG/P2HZG_26.jpeg", + "P2HZG/P2HZG_27.jpeg", + "P2HZG/P2HZG_28.jpeg", + "P2HZG/P2HZG_29.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 144, + "question": "Inspect the flow of events in the presented pictures and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person took the phone/camera?\nChoice list: \nA. Took the clothes.\nB. Put down the cup/glass/bottle.\nC. Put down the pillow.\nD. Opened the window.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the pillow", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "took the clothes", + "put down the cup glass bottle", + "put down the pillow", + "opened the window" + ], + "image_quantity_level": "Medium", + "image": [ + "1L5D3/1L5D3_0.jpeg", + "1L5D3/1L5D3_1.jpeg", + "1L5D3/1L5D3_2.jpeg", + "1L5D3/1L5D3_3.jpeg", + "1L5D3/1L5D3_4.jpeg", + "1L5D3/1L5D3_5.jpeg", + "1L5D3/1L5D3_6.jpeg", + "1L5D3/1L5D3_7.jpeg", + "1L5D3/1L5D3_8.jpeg", + "1L5D3/1L5D3_9.jpeg", + "1L5D3/1L5D3_10.jpeg", + "1L5D3/1L5D3_11.jpeg", + "1L5D3/1L5D3_12.jpeg", + "1L5D3/1L5D3_13.jpeg", + "1L5D3/1L5D3_14.jpeg", + "1L5D3/1L5D3_15.jpeg", + "1L5D3/1L5D3_16.jpeg", + "1L5D3/1L5D3_17.jpeg", + "1L5D3/1L5D3_18.jpeg", + "1L5D3/1L5D3_19.jpeg", + "1L5D3/1L5D3_20.jpeg", + "1L5D3/1L5D3_21.jpeg", + "1L5D3/1L5D3_22.jpeg", + "1L5D3/1L5D3_23.jpeg", + "1L5D3/1L5D3_24.jpeg", + "1L5D3/1L5D3_25.jpeg", + "1L5D3/1L5D3_26.jpeg", + "1L5D3/1L5D3_27.jpeg", + "1L5D3/1L5D3_28.jpeg", + "1L5D3/1L5D3_29.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 147, + "question": "Based on the provided images, answer the question related to the sequence of action You must choose your answer from the Choice List.\nWhat happened before the person held the book?\nChoice list: \nA. Sat on the bed.\nB. Took the paper/notebook.\nC. Took the picture.\nD. Threw the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "sat on the bed", + "took the paper notebook", + "took the picture", + "threw the book" + ], + "image_quantity_level": "Medium", + "image": [ + "FETME/FETME_0.jpeg", + "FETME/FETME_1.jpeg", + "FETME/FETME_2.jpeg", + "FETME/FETME_3.jpeg", + "FETME/FETME_4.jpeg", + "FETME/FETME_5.jpeg", + "FETME/FETME_6.jpeg", + "FETME/FETME_7.jpeg", + "FETME/FETME_8.jpeg", + "FETME/FETME_9.jpeg", + "FETME/FETME_10.jpeg", + "FETME/FETME_11.jpeg", + "FETME/FETME_12.jpeg", + "FETME/FETME_13.jpeg", + "FETME/FETME_14.jpeg", + "FETME/FETME_15.jpeg", + "FETME/FETME_16.jpeg", + "FETME/FETME_17.jpeg", + "FETME/FETME_18.jpeg", + "FETME/FETME_19.jpeg", + "FETME/FETME_20.jpeg", + "FETME/FETME_21.jpeg", + "FETME/FETME_22.jpeg", + "FETME/FETME_23.jpeg", + "FETME/FETME_24.jpeg", + "FETME/FETME_25.jpeg", + "FETME/FETME_26.jpeg", + "FETME/FETME_27.jpeg", + "FETME/FETME_28.jpeg", + "FETME/FETME_29.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 159, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person took the phone/camera?\nChoice list: \nA. Took the clothes.\nB. Put down the cup/glass/bottle.\nC. Put down the pillow.\nD. Opened the window.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the pillow", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "took the clothes", + "put down the cup glass bottle", + "put down the pillow", + "opened the window" + ], + "image_quantity_level": "Medium", + "image": [ + "1L5D3/1L5D3_0.jpeg", + "1L5D3/1L5D3_1.jpeg", + "1L5D3/1L5D3_2.jpeg", + "1L5D3/1L5D3_3.jpeg", + "1L5D3/1L5D3_4.jpeg", + "1L5D3/1L5D3_5.jpeg", + "1L5D3/1L5D3_6.jpeg", + "1L5D3/1L5D3_7.jpeg", + "1L5D3/1L5D3_8.jpeg", + "1L5D3/1L5D3_9.jpeg", + "1L5D3/1L5D3_10.jpeg", + "1L5D3/1L5D3_11.jpeg", + "1L5D3/1L5D3_12.jpeg", + "1L5D3/1L5D3_13.jpeg", + "1L5D3/1L5D3_14.jpeg", + "1L5D3/1L5D3_15.jpeg", + "1L5D3/1L5D3_16.jpeg", + "1L5D3/1L5D3_17.jpeg", + "1L5D3/1L5D3_18.jpeg", + "1L5D3/1L5D3_19.jpeg", + "1L5D3/1L5D3_20.jpeg", + "1L5D3/1L5D3_21.jpeg", + "1L5D3/1L5D3_22.jpeg", + "1L5D3/1L5D3_23.jpeg", + "1L5D3/1L5D3_24.jpeg", + "1L5D3/1L5D3_25.jpeg", + "1L5D3/1L5D3_26.jpeg", + "1L5D3/1L5D3_27.jpeg", + "1L5D3/1L5D3_28.jpeg", + "1L5D3/1L5D3_29.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 71, + "question": "Review the timeline of actions in the supplied imagery and answer the associated inquiry. You must choose your answer from the Choice List.\nWhat happened after the person opened the box?\nChoice list: \nA. Put down the picture.\nB. Tidied up the table.\nC. Lied on the floor.\nD. Took the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "tidied up the table", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the picture", + "tidied up the table", + "lied on the floor", + "took the clothes" + ], + "image_quantity_level": "Medium", + "image": [ + "YB67Z/YB67Z_0.jpeg", + "YB67Z/YB67Z_1.jpeg", + "YB67Z/YB67Z_2.jpeg", + "YB67Z/YB67Z_3.jpeg", + "YB67Z/YB67Z_4.jpeg", + "YB67Z/YB67Z_5.jpeg", + "YB67Z/YB67Z_6.jpeg", + "YB67Z/YB67Z_7.jpeg", + "YB67Z/YB67Z_8.jpeg", + "YB67Z/YB67Z_9.jpeg", + "YB67Z/YB67Z_10.jpeg", + "YB67Z/YB67Z_11.jpeg", + "YB67Z/YB67Z_12.jpeg", + "YB67Z/YB67Z_13.jpeg", + "YB67Z/YB67Z_14.jpeg", + "YB67Z/YB67Z_15.jpeg", + "YB67Z/YB67Z_16.jpeg", + "YB67Z/YB67Z_17.jpeg", + "YB67Z/YB67Z_18.jpeg", + "YB67Z/YB67Z_19.jpeg", + "YB67Z/YB67Z_20.jpeg", + "YB67Z/YB67Z_21.jpeg", + "YB67Z/YB67Z_22.jpeg", + "YB67Z/YB67Z_23.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 83, + "question": "Based on the provided images, answer the question related to the sequence of action You must choose your answer from the Choice List.\nWhat happened after the person held the book?\nChoice list: \nA. Closed the closet/cabinet.\nB. Took the paper/notebook.\nC. Sat on the sofa/couch.\nD. Closed the refrigerator.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sat on the sofa couch", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "closed the closet cabinet", + "took the paper notebook", + "sat on the sofa couch", + "closed the refrigerator" + ], + "image_quantity_level": "Many", + "image": [ + "HOI88/HOI88_0.jpeg", + "HOI88/HOI88_1.jpeg", + "HOI88/HOI88_2.jpeg", + "HOI88/HOI88_3.jpeg", + "HOI88/HOI88_4.jpeg", + "HOI88/HOI88_5.jpeg", + "HOI88/HOI88_6.jpeg", + "HOI88/HOI88_7.jpeg", + "HOI88/HOI88_8.jpeg", + "HOI88/HOI88_9.jpeg", + "HOI88/HOI88_10.jpeg", + "HOI88/HOI88_11.jpeg", + "HOI88/HOI88_12.jpeg", + "HOI88/HOI88_13.jpeg", + "HOI88/HOI88_14.jpeg", + "HOI88/HOI88_15.jpeg", + "HOI88/HOI88_16.jpeg", + "HOI88/HOI88_17.jpeg", + "HOI88/HOI88_18.jpeg", + "HOI88/HOI88_19.jpeg", + "HOI88/HOI88_20.jpeg", + "HOI88/HOI88_21.jpeg", + "HOI88/HOI88_22.jpeg", + "HOI88/HOI88_23.jpeg", + "HOI88/HOI88_24.jpeg", + "HOI88/HOI88_25.jpeg", + "HOI88/HOI88_26.jpeg", + "HOI88/HOI88_27.jpeg", + "HOI88/HOI88_28.jpeg", + "HOI88/HOI88_29.jpeg", + "HOI88/HOI88_30.jpeg", + "HOI88/HOI88_31.jpeg", + "HOI88/HOI88_32.jpeg", + "HOI88/HOI88_33.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 84, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened before the person put down the shoe?\nChoice list: \nA. Opened the box.\nB. Took the box.\nC. Took the cup/glass/bottle.\nD. Took the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "opened the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "opened the box", + "took the box", + "took the cup glass bottle", + "took the clothes" + ], + "image_quantity_level": "Many", + "image": [ + "AK9IB/AK9IB_0.jpeg", + "AK9IB/AK9IB_1.jpeg", + "AK9IB/AK9IB_2.jpeg", + "AK9IB/AK9IB_3.jpeg", + "AK9IB/AK9IB_4.jpeg", + "AK9IB/AK9IB_5.jpeg", + "AK9IB/AK9IB_6.jpeg", + "AK9IB/AK9IB_7.jpeg", + "AK9IB/AK9IB_8.jpeg", + "AK9IB/AK9IB_9.jpeg", + "AK9IB/AK9IB_10.jpeg", + "AK9IB/AK9IB_11.jpeg", + "AK9IB/AK9IB_12.jpeg", + "AK9IB/AK9IB_13.jpeg", + "AK9IB/AK9IB_14.jpeg", + "AK9IB/AK9IB_15.jpeg", + "AK9IB/AK9IB_16.jpeg", + "AK9IB/AK9IB_17.jpeg", + "AK9IB/AK9IB_18.jpeg", + "AK9IB/AK9IB_19.jpeg", + "AK9IB/AK9IB_20.jpeg", + "AK9IB/AK9IB_21.jpeg", + "AK9IB/AK9IB_22.jpeg", + "AK9IB/AK9IB_23.jpeg", + "AK9IB/AK9IB_24.jpeg", + "AK9IB/AK9IB_25.jpeg", + "AK9IB/AK9IB_26.jpeg", + "AK9IB/AK9IB_27.jpeg", + "AK9IB/AK9IB_28.jpeg", + "AK9IB/AK9IB_29.jpeg", + "AK9IB/AK9IB_30.jpeg", + "AK9IB/AK9IB_31.jpeg", + "AK9IB/AK9IB_32.jpeg", + "AK9IB/AK9IB_33.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 150, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened before the person closed the refrigerator?\nChoice list: \nA. Put down the box.\nB. Put down the book.\nC. Put down the dish.\nD. Threw the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the dish", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down the box", + "put down the book", + "put down the dish", + "threw the clothes" + ], + "image_quantity_level": "Medium", + "image": [ + "8DL54/8DL54_0.jpeg", + "8DL54/8DL54_1.jpeg", + "8DL54/8DL54_2.jpeg", + "8DL54/8DL54_3.jpeg", + "8DL54/8DL54_4.jpeg", + "8DL54/8DL54_5.jpeg", + "8DL54/8DL54_6.jpeg", + "8DL54/8DL54_7.jpeg", + "8DL54/8DL54_8.jpeg", + "8DL54/8DL54_9.jpeg", + "8DL54/8DL54_10.jpeg", + "8DL54/8DL54_11.jpeg", + "8DL54/8DL54_12.jpeg", + "8DL54/8DL54_13.jpeg", + "8DL54/8DL54_14.jpeg", + "8DL54/8DL54_15.jpeg", + "8DL54/8DL54_16.jpeg", + "8DL54/8DL54_17.jpeg", + "8DL54/8DL54_18.jpeg", + "8DL54/8DL54_19.jpeg", + "8DL54/8DL54_20.jpeg", + "8DL54/8DL54_21.jpeg", + "8DL54/8DL54_22.jpeg", + "8DL54/8DL54_23.jpeg", + "8DL54/8DL54_24.jpeg", + "8DL54/8DL54_25.jpeg", + "8DL54/8DL54_26.jpeg", + "8DL54/8DL54_27.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 174, + "question": "Assess the progression of actions in the presented graphics and answer the connected query. You must choose your answer from the Choice List.\nWhat happened after the person drank from the cup/glass/bottle?\nChoice list: \nA. Threw the shoe.\nB. Took the broom.\nC. Put down the shoe.\nD. Sat on the bed.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "threw the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "threw the shoe", + "took the broom", + "put down the shoe", + "sat on the bed" + ], + "image_quantity_level": "Medium", + "image": [ + "F6A4W/F6A4W_0.jpeg", + "F6A4W/F6A4W_1.jpeg", + "F6A4W/F6A4W_2.jpeg", + "F6A4W/F6A4W_3.jpeg", + "F6A4W/F6A4W_4.jpeg", + "F6A4W/F6A4W_5.jpeg", + "F6A4W/F6A4W_6.jpeg", + "F6A4W/F6A4W_7.jpeg", + "F6A4W/F6A4W_8.jpeg", + "F6A4W/F6A4W_9.jpeg", + "F6A4W/F6A4W_10.jpeg", + "F6A4W/F6A4W_11.jpeg", + "F6A4W/F6A4W_12.jpeg", + "F6A4W/F6A4W_13.jpeg", + "F6A4W/F6A4W_14.jpeg", + "F6A4W/F6A4W_15.jpeg", + "F6A4W/F6A4W_16.jpeg", + "F6A4W/F6A4W_17.jpeg", + "F6A4W/F6A4W_18.jpeg", + "F6A4W/F6A4W_19.jpeg", + "F6A4W/F6A4W_20.jpeg", + "F6A4W/F6A4W_21.jpeg", + "F6A4W/F6A4W_22.jpeg", + "F6A4W/F6A4W_23.jpeg", + "F6A4W/F6A4W_24.jpeg", + "F6A4W/F6A4W_25.jpeg", + "F6A4W/F6A4W_26.jpeg", + "F6A4W/F6A4W_27.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 85, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened after the person held the food?\nChoice list: \nA. Threw the towel.\nB. Closed the door.\nC. Opened the closet/cabinet.\nD. Put down the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "threw the towel", + "closed the door", + "opened the closet cabinet", + "put down the bag" + ], + "image_quantity_level": "Medium", + "image": [ + "DGMDT/DGMDT_0.jpeg", + "DGMDT/DGMDT_1.jpeg", + "DGMDT/DGMDT_2.jpeg", + "DGMDT/DGMDT_3.jpeg", + "DGMDT/DGMDT_4.jpeg", + "DGMDT/DGMDT_5.jpeg", + "DGMDT/DGMDT_6.jpeg", + "DGMDT/DGMDT_7.jpeg", + "DGMDT/DGMDT_8.jpeg", + "DGMDT/DGMDT_9.jpeg", + "DGMDT/DGMDT_10.jpeg", + "DGMDT/DGMDT_11.jpeg", + "DGMDT/DGMDT_12.jpeg", + "DGMDT/DGMDT_13.jpeg", + "DGMDT/DGMDT_14.jpeg", + "DGMDT/DGMDT_15.jpeg", + "DGMDT/DGMDT_16.jpeg", + "DGMDT/DGMDT_17.jpeg", + "DGMDT/DGMDT_18.jpeg", + "DGMDT/DGMDT_19.jpeg", + "DGMDT/DGMDT_20.jpeg", + "DGMDT/DGMDT_21.jpeg", + "DGMDT/DGMDT_22.jpeg", + "DGMDT/DGMDT_23.jpeg", + "DGMDT/DGMDT_24.jpeg", + "DGMDT/DGMDT_25.jpeg", + "DGMDT/DGMDT_26.jpeg", + "DGMDT/DGMDT_27.jpeg", + "DGMDT/DGMDT_28.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 101, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened after the person opened the closet/cabinet?\nChoice list: \nA. Put down the towel.\nB. Put down the clothes.\nC. Put down the food.\nD. Threw the shoe.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the towel", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the towel", + "put down the clothes", + "put down the food", + "threw the shoe" + ], + "image_quantity_level": "Medium", + "image": [ + "ANA5N/ANA5N_0.jpeg", + "ANA5N/ANA5N_1.jpeg", + "ANA5N/ANA5N_2.jpeg", + "ANA5N/ANA5N_3.jpeg", + "ANA5N/ANA5N_4.jpeg", + "ANA5N/ANA5N_5.jpeg", + "ANA5N/ANA5N_6.jpeg", + "ANA5N/ANA5N_7.jpeg", + "ANA5N/ANA5N_8.jpeg", + "ANA5N/ANA5N_9.jpeg", + "ANA5N/ANA5N_10.jpeg", + "ANA5N/ANA5N_11.jpeg", + "ANA5N/ANA5N_12.jpeg", + "ANA5N/ANA5N_13.jpeg", + "ANA5N/ANA5N_14.jpeg", + "ANA5N/ANA5N_15.jpeg", + "ANA5N/ANA5N_16.jpeg", + "ANA5N/ANA5N_17.jpeg", + "ANA5N/ANA5N_18.jpeg", + "ANA5N/ANA5N_19.jpeg", + "ANA5N/ANA5N_20.jpeg", + "ANA5N/ANA5N_21.jpeg", + "ANA5N/ANA5N_22.jpeg", + "ANA5N/ANA5N_23.jpeg", + "ANA5N/ANA5N_24.jpeg", + "ANA5N/ANA5N_25.jpeg", + "ANA5N/ANA5N_26.jpeg", + "ANA5N/ANA5N_27.jpeg", + "ANA5N/ANA5N_28.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 198, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened before the person drank from the cup/glass/bottle?\nChoice list: \nA. Put down the box.\nB. Took the phone/camera.\nC. Threw the box.\nD. Sat on the floor.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the box", + "took the phone camera", + "threw the box", + "sat on the floor" + ], + "image_quantity_level": "Medium", + "image": [ + "M3S4D/M3S4D_0.jpeg", + "M3S4D/M3S4D_1.jpeg", + "M3S4D/M3S4D_2.jpeg", + "M3S4D/M3S4D_3.jpeg", + "M3S4D/M3S4D_4.jpeg", + "M3S4D/M3S4D_5.jpeg", + "M3S4D/M3S4D_6.jpeg", + "M3S4D/M3S4D_7.jpeg", + "M3S4D/M3S4D_8.jpeg", + "M3S4D/M3S4D_9.jpeg", + "M3S4D/M3S4D_10.jpeg", + "M3S4D/M3S4D_11.jpeg", + "M3S4D/M3S4D_12.jpeg", + "M3S4D/M3S4D_13.jpeg", + "M3S4D/M3S4D_14.jpeg", + "M3S4D/M3S4D_15.jpeg", + "M3S4D/M3S4D_16.jpeg", + "M3S4D/M3S4D_17.jpeg", + "M3S4D/M3S4D_18.jpeg", + "M3S4D/M3S4D_19.jpeg", + "M3S4D/M3S4D_20.jpeg", + "M3S4D/M3S4D_21.jpeg", + "M3S4D/M3S4D_22.jpeg", + "M3S4D/M3S4D_23.jpeg", + "M3S4D/M3S4D_24.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 99, + "question": "Observe the continuum of events in the provided pictorials and reply to the pertinent question. You must choose your answer from the Choice List.\nWhat happened after the person held the blanket?\nChoice list: \nA. Took the book.\nB. Put down the paper/notebook.\nC. Tidied up the closet/cabinet.\nD. Threw the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "took the book", + "put down the paper notebook", + "tidied up the closet cabinet", + "threw the clothes" + ], + "image_quantity_level": "Medium", + "image": [ + "6PL9Z/6PL9Z_0.jpeg", + "6PL9Z/6PL9Z_1.jpeg", + "6PL9Z/6PL9Z_2.jpeg", + "6PL9Z/6PL9Z_3.jpeg", + "6PL9Z/6PL9Z_4.jpeg", + "6PL9Z/6PL9Z_5.jpeg", + "6PL9Z/6PL9Z_6.jpeg", + "6PL9Z/6PL9Z_7.jpeg", + "6PL9Z/6PL9Z_8.jpeg", + "6PL9Z/6PL9Z_9.jpeg", + "6PL9Z/6PL9Z_10.jpeg", + "6PL9Z/6PL9Z_11.jpeg", + "6PL9Z/6PL9Z_12.jpeg", + "6PL9Z/6PL9Z_13.jpeg", + "6PL9Z/6PL9Z_14.jpeg", + "6PL9Z/6PL9Z_15.jpeg", + "6PL9Z/6PL9Z_16.jpeg", + "6PL9Z/6PL9Z_17.jpeg", + "6PL9Z/6PL9Z_18.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 142, + "question": "Observe the continuum of events in the provided pictorials and reply to the pertinent question. You must choose your answer from the Choice List.\nWhat happened after the person opened the door?\nChoice list: \nA. Threw the towel.\nB. Threw the blanket.\nC. Put down the book.\nD. Closed the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "threw the towel", + "threw the blanket", + "put down the book", + "closed the book" + ], + "image_quantity_level": "Many", + "image": [ + "1FIGA/1FIGA_0.jpeg", + "1FIGA/1FIGA_1.jpeg", + "1FIGA/1FIGA_2.jpeg", + "1FIGA/1FIGA_3.jpeg", + "1FIGA/1FIGA_4.jpeg", + "1FIGA/1FIGA_5.jpeg", + "1FIGA/1FIGA_6.jpeg", + "1FIGA/1FIGA_7.jpeg", + "1FIGA/1FIGA_8.jpeg", + "1FIGA/1FIGA_9.jpeg", + "1FIGA/1FIGA_10.jpeg", + "1FIGA/1FIGA_11.jpeg", + "1FIGA/1FIGA_12.jpeg", + "1FIGA/1FIGA_13.jpeg", + "1FIGA/1FIGA_14.jpeg", + "1FIGA/1FIGA_15.jpeg", + "1FIGA/1FIGA_16.jpeg", + "1FIGA/1FIGA_17.jpeg", + "1FIGA/1FIGA_18.jpeg", + "1FIGA/1FIGA_19.jpeg", + "1FIGA/1FIGA_20.jpeg", + "1FIGA/1FIGA_21.jpeg", + "1FIGA/1FIGA_22.jpeg", + "1FIGA/1FIGA_23.jpeg", + "1FIGA/1FIGA_24.jpeg", + "1FIGA/1FIGA_25.jpeg", + "1FIGA/1FIGA_26.jpeg", + "1FIGA/1FIGA_27.jpeg", + "1FIGA/1FIGA_28.jpeg", + "1FIGA/1FIGA_29.jpeg", + "1FIGA/1FIGA_30.jpeg", + "1FIGA/1FIGA_31.jpeg", + "1FIGA/1FIGA_32.jpeg", + "1FIGA/1FIGA_33.jpeg", + "1FIGA/1FIGA_34.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 185, + "question": "Analyze the series of events in the provided pictures and answer the related inquiry. You must choose your answer from the Choice List.\nWhat happened after the person held the clothes?\nChoice list: \nA. Took the bag.\nB. Closed the door.\nC. Opened the bag.\nD. Sat on the bed.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "opened the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "took the bag", + "closed the door", + "opened the bag", + "sat on the bed" + ], + "image_quantity_level": "Many", + "image": [ + "S444Y/S444Y_0.jpeg", + "S444Y/S444Y_1.jpeg", + "S444Y/S444Y_2.jpeg", + "S444Y/S444Y_3.jpeg", + "S444Y/S444Y_4.jpeg", + "S444Y/S444Y_5.jpeg", + "S444Y/S444Y_6.jpeg", + "S444Y/S444Y_7.jpeg", + "S444Y/S444Y_8.jpeg", + "S444Y/S444Y_9.jpeg", + "S444Y/S444Y_10.jpeg", + "S444Y/S444Y_11.jpeg", + "S444Y/S444Y_12.jpeg", + "S444Y/S444Y_13.jpeg", + "S444Y/S444Y_14.jpeg", + "S444Y/S444Y_15.jpeg", + "S444Y/S444Y_16.jpeg", + "S444Y/S444Y_17.jpeg", + "S444Y/S444Y_18.jpeg", + "S444Y/S444Y_19.jpeg", + "S444Y/S444Y_20.jpeg", + "S444Y/S444Y_21.jpeg", + "S444Y/S444Y_22.jpeg", + "S444Y/S444Y_23.jpeg", + "S444Y/S444Y_24.jpeg", + "S444Y/S444Y_25.jpeg", + "S444Y/S444Y_26.jpeg", + "S444Y/S444Y_27.jpeg", + "S444Y/S444Y_28.jpeg", + "S444Y/S444Y_29.jpeg", + "S444Y/S444Y_30.jpeg", + "S444Y/S444Y_31.jpeg", + "S444Y/S444Y_32.jpeg", + "S444Y/S444Y_33.jpeg", + "S444Y/S444Y_34.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 111, + "question": "Observe the continuum of events in the provided pictorials and reply to the pertinent question. You must choose your answer from the Choice List.\nWhat happened after the person held the picture?\nChoice list: \nA. Sat at the table.\nB. Took the laptop.\nC. Put down the dish.\nD. Opened the laptop.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "opened the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "sat at the table", + "took the laptop", + "put down the dish", + "opened the laptop" + ], + "image_quantity_level": "Many", + "image": [ + "ZAJAJ/ZAJAJ_0.jpeg", + "ZAJAJ/ZAJAJ_1.jpeg", + "ZAJAJ/ZAJAJ_2.jpeg", + "ZAJAJ/ZAJAJ_3.jpeg", + "ZAJAJ/ZAJAJ_4.jpeg", + "ZAJAJ/ZAJAJ_5.jpeg", + "ZAJAJ/ZAJAJ_6.jpeg", + "ZAJAJ/ZAJAJ_7.jpeg", + "ZAJAJ/ZAJAJ_8.jpeg", + "ZAJAJ/ZAJAJ_9.jpeg", + "ZAJAJ/ZAJAJ_10.jpeg", + "ZAJAJ/ZAJAJ_11.jpeg", + "ZAJAJ/ZAJAJ_12.jpeg", + "ZAJAJ/ZAJAJ_13.jpeg", + "ZAJAJ/ZAJAJ_14.jpeg", + "ZAJAJ/ZAJAJ_15.jpeg", + "ZAJAJ/ZAJAJ_16.jpeg", + "ZAJAJ/ZAJAJ_17.jpeg", + "ZAJAJ/ZAJAJ_18.jpeg", + "ZAJAJ/ZAJAJ_19.jpeg", + "ZAJAJ/ZAJAJ_20.jpeg", + "ZAJAJ/ZAJAJ_21.jpeg", + "ZAJAJ/ZAJAJ_22.jpeg", + "ZAJAJ/ZAJAJ_23.jpeg", + "ZAJAJ/ZAJAJ_24.jpeg", + "ZAJAJ/ZAJAJ_25.jpeg", + "ZAJAJ/ZAJAJ_26.jpeg", + "ZAJAJ/ZAJAJ_27.jpeg", + "ZAJAJ/ZAJAJ_28.jpeg", + "ZAJAJ/ZAJAJ_29.jpeg", + "ZAJAJ/ZAJAJ_30.jpeg", + "ZAJAJ/ZAJAJ_31.jpeg", + "ZAJAJ/ZAJAJ_32.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 171, + "question": "Observe the continuum of events in the provided pictorials and reply to the pertinent question. You must choose your answer from the Choice List.\nWhat happened before the person put down the laptop?\nChoice list: \nA. Washed the window.\nB. Put down the shoe.\nC. Closed the door.\nD. Threw the pillow.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "threw the pillow", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "washed the window", + "put down the shoe", + "closed the door", + "threw the pillow" + ], + "image_quantity_level": "Medium", + "image": [ + "FL6DF/FL6DF_0.jpeg", + "FL6DF/FL6DF_1.jpeg", + "FL6DF/FL6DF_2.jpeg", + "FL6DF/FL6DF_3.jpeg", + "FL6DF/FL6DF_4.jpeg", + "FL6DF/FL6DF_5.jpeg", + "FL6DF/FL6DF_6.jpeg", + "FL6DF/FL6DF_7.jpeg", + "FL6DF/FL6DF_8.jpeg", + "FL6DF/FL6DF_9.jpeg", + "FL6DF/FL6DF_10.jpeg", + "FL6DF/FL6DF_11.jpeg", + "FL6DF/FL6DF_12.jpeg", + "FL6DF/FL6DF_13.jpeg", + "FL6DF/FL6DF_14.jpeg", + "FL6DF/FL6DF_15.jpeg", + "FL6DF/FL6DF_16.jpeg", + "FL6DF/FL6DF_17.jpeg", + "FL6DF/FL6DF_18.jpeg", + "FL6DF/FL6DF_19.jpeg", + "FL6DF/FL6DF_20.jpeg", + "FL6DF/FL6DF_21.jpeg", + "FL6DF/FL6DF_22.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 189, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened before the person drank from the cup/glass/bottle?\nChoice list: \nA. Took the sandwich.\nB. Put down the clothes.\nC. Put down the box.\nD. Put down the shoe.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "took the sandwich", + "put down the clothes", + "put down the box", + "put down the shoe" + ], + "image_quantity_level": "Medium", + "image": [ + "KLP7V/KLP7V_0.jpeg", + "KLP7V/KLP7V_1.jpeg", + "KLP7V/KLP7V_2.jpeg", + "KLP7V/KLP7V_3.jpeg", + "KLP7V/KLP7V_4.jpeg", + "KLP7V/KLP7V_5.jpeg", + "KLP7V/KLP7V_6.jpeg", + "KLP7V/KLP7V_7.jpeg", + "KLP7V/KLP7V_8.jpeg", + "KLP7V/KLP7V_9.jpeg", + "KLP7V/KLP7V_10.jpeg", + "KLP7V/KLP7V_11.jpeg", + "KLP7V/KLP7V_12.jpeg", + "KLP7V/KLP7V_13.jpeg", + "KLP7V/KLP7V_14.jpeg", + "KLP7V/KLP7V_15.jpeg", + "KLP7V/KLP7V_16.jpeg", + "KLP7V/KLP7V_17.jpeg", + "KLP7V/KLP7V_18.jpeg", + "KLP7V/KLP7V_19.jpeg", + "KLP7V/KLP7V_20.jpeg", + "KLP7V/KLP7V_21.jpeg", + "KLP7V/KLP7V_22.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 155, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened before the person held the clothes?\nChoice list: \nA. Opened the door.\nB. Took the towel.\nC. Threw the book.\nD. Opened the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the towel", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "opened the door", + "took the towel", + "threw the book", + "opened the bag" + ], + "image_quantity_level": "Many", + "image": [ + "M1TZR/M1TZR_0.jpeg", + "M1TZR/M1TZR_1.jpeg", + "M1TZR/M1TZR_2.jpeg", + "M1TZR/M1TZR_3.jpeg", + "M1TZR/M1TZR_4.jpeg", + "M1TZR/M1TZR_5.jpeg", + "M1TZR/M1TZR_6.jpeg", + "M1TZR/M1TZR_7.jpeg", + "M1TZR/M1TZR_8.jpeg", + "M1TZR/M1TZR_9.jpeg", + "M1TZR/M1TZR_10.jpeg", + "M1TZR/M1TZR_11.jpeg", + "M1TZR/M1TZR_12.jpeg", + "M1TZR/M1TZR_13.jpeg", + "M1TZR/M1TZR_14.jpeg", + "M1TZR/M1TZR_15.jpeg", + "M1TZR/M1TZR_16.jpeg", + "M1TZR/M1TZR_17.jpeg", + "M1TZR/M1TZR_18.jpeg", + "M1TZR/M1TZR_19.jpeg", + "M1TZR/M1TZR_20.jpeg", + "M1TZR/M1TZR_21.jpeg", + "M1TZR/M1TZR_22.jpeg", + "M1TZR/M1TZR_23.jpeg", + "M1TZR/M1TZR_24.jpeg", + "M1TZR/M1TZR_25.jpeg", + "M1TZR/M1TZR_26.jpeg", + "M1TZR/M1TZR_27.jpeg", + "M1TZR/M1TZR_28.jpeg", + "M1TZR/M1TZR_29.jpeg", + "M1TZR/M1TZR_30.jpeg", + "M1TZR/M1TZR_31.jpeg", + "M1TZR/M1TZR_32.jpeg", + "M1TZR/M1TZR_33.jpeg", + "M1TZR/M1TZR_34.jpeg", + "M1TZR/M1TZR_35.jpeg", + "M1TZR/M1TZR_36.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 164, + "question": "Review the timeline of actions in the supplied imagery and answer the associated inquiry. You must choose your answer from the Choice List.\nWhat happened before the person held the mirror?\nChoice list: \nA. Took the paper/notebook.\nB. Tidied up the table.\nC. Sat on the bed.\nD. Put down the shoe.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "took the paper notebook", + "tidied up the table", + "sat on the bed", + "put down the shoe" + ], + "image_quantity_level": "Medium", + "image": [ + "BQAUC/BQAUC_0.jpeg", + "BQAUC/BQAUC_1.jpeg", + "BQAUC/BQAUC_2.jpeg", + "BQAUC/BQAUC_3.jpeg", + "BQAUC/BQAUC_4.jpeg", + "BQAUC/BQAUC_5.jpeg", + "BQAUC/BQAUC_6.jpeg", + "BQAUC/BQAUC_7.jpeg", + "BQAUC/BQAUC_8.jpeg", + "BQAUC/BQAUC_9.jpeg", + "BQAUC/BQAUC_10.jpeg", + "BQAUC/BQAUC_11.jpeg", + "BQAUC/BQAUC_12.jpeg", + "BQAUC/BQAUC_13.jpeg", + "BQAUC/BQAUC_14.jpeg", + "BQAUC/BQAUC_15.jpeg", + "BQAUC/BQAUC_16.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 168, + "question": "Review the timeline of actions in the supplied imagery and answer the associated inquiry. You must choose your answer from the Choice List.\nWhat happened after the person took the shoe?\nChoice list: \nA. Closed the window.\nB. Sat on the sofa/couch.\nC. Closed the closet/cabinet.\nD. Opened the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sat on the sofa couch", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "closed the window", + "sat on the sofa couch", + "closed the closet cabinet", + "opened the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "86GSE/86GSE_0.jpeg", + "86GSE/86GSE_1.jpeg", + "86GSE/86GSE_2.jpeg", + "86GSE/86GSE_3.jpeg", + "86GSE/86GSE_4.jpeg", + "86GSE/86GSE_5.jpeg", + "86GSE/86GSE_6.jpeg", + "86GSE/86GSE_7.jpeg", + "86GSE/86GSE_8.jpeg", + "86GSE/86GSE_9.jpeg", + "86GSE/86GSE_10.jpeg", + "86GSE/86GSE_11.jpeg", + "86GSE/86GSE_12.jpeg", + "86GSE/86GSE_13.jpeg", + "86GSE/86GSE_14.jpeg", + "86GSE/86GSE_15.jpeg", + "86GSE/86GSE_16.jpeg", + "86GSE/86GSE_17.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 199, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened before the person washed the table?\nChoice list: \nA. Closed the refrigerator.\nB. Put down the paper/notebook.\nC. Put down the dish.\nD. Put down the sandwich.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "closed the refrigerator", + "put down the paper notebook", + "put down the dish", + "put down the sandwich" + ], + "image_quantity_level": "Many", + "image": [ + "ZS9XR/ZS9XR_0.jpeg", + "ZS9XR/ZS9XR_1.jpeg", + "ZS9XR/ZS9XR_2.jpeg", + "ZS9XR/ZS9XR_3.jpeg", + "ZS9XR/ZS9XR_4.jpeg", + "ZS9XR/ZS9XR_5.jpeg", + "ZS9XR/ZS9XR_6.jpeg", + "ZS9XR/ZS9XR_7.jpeg", + "ZS9XR/ZS9XR_8.jpeg", + "ZS9XR/ZS9XR_9.jpeg", + "ZS9XR/ZS9XR_10.jpeg", + "ZS9XR/ZS9XR_11.jpeg", + "ZS9XR/ZS9XR_12.jpeg", + "ZS9XR/ZS9XR_13.jpeg", + "ZS9XR/ZS9XR_14.jpeg", + "ZS9XR/ZS9XR_15.jpeg", + "ZS9XR/ZS9XR_16.jpeg", + "ZS9XR/ZS9XR_17.jpeg", + "ZS9XR/ZS9XR_18.jpeg", + "ZS9XR/ZS9XR_19.jpeg", + "ZS9XR/ZS9XR_20.jpeg", + "ZS9XR/ZS9XR_21.jpeg", + "ZS9XR/ZS9XR_22.jpeg", + "ZS9XR/ZS9XR_23.jpeg", + "ZS9XR/ZS9XR_24.jpeg", + "ZS9XR/ZS9XR_25.jpeg", + "ZS9XR/ZS9XR_26.jpeg", + "ZS9XR/ZS9XR_27.jpeg", + "ZS9XR/ZS9XR_28.jpeg", + "ZS9XR/ZS9XR_29.jpeg", + "ZS9XR/ZS9XR_30.jpeg", + "ZS9XR/ZS9XR_31.jpeg", + "ZS9XR/ZS9XR_32.jpeg", + "ZS9XR/ZS9XR_33.jpeg", + "ZS9XR/ZS9XR_34.jpeg", + "ZS9XR/ZS9XR_35.jpeg", + "ZS9XR/ZS9XR_36.jpeg", + "ZS9XR/ZS9XR_37.jpeg", + "ZS9XR/ZS9XR_38.jpeg", + "ZS9XR/ZS9XR_39.jpeg", + "ZS9XR/ZS9XR_40.jpeg", + "ZS9XR/ZS9XR_41.jpeg", + "ZS9XR/ZS9XR_42.jpeg", + "ZS9XR/ZS9XR_43.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 153, + "question": "Observe the continuum of events in the provided pictorials and reply to the pertinent question. You must choose your answer from the Choice List.\nWhat happened before the person opened the door?\nChoice list: \nA. Put down the cup/glass/bottle.\nB. Threw the towel.\nC. Threw the book.\nD. Took the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "threw the towel", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the cup glass bottle", + "threw the towel", + "threw the book", + "took the food" + ], + "image_quantity_level": "Medium", + "image": [ + "LSVHK/LSVHK_0.jpeg", + "LSVHK/LSVHK_1.jpeg", + "LSVHK/LSVHK_2.jpeg", + "LSVHK/LSVHK_3.jpeg", + "LSVHK/LSVHK_4.jpeg", + "LSVHK/LSVHK_5.jpeg", + "LSVHK/LSVHK_6.jpeg", + "LSVHK/LSVHK_7.jpeg", + "LSVHK/LSVHK_8.jpeg", + "LSVHK/LSVHK_9.jpeg", + "LSVHK/LSVHK_10.jpeg", + "LSVHK/LSVHK_11.jpeg", + "LSVHK/LSVHK_12.jpeg", + "LSVHK/LSVHK_13.jpeg", + "LSVHK/LSVHK_14.jpeg", + "LSVHK/LSVHK_15.jpeg", + "LSVHK/LSVHK_16.jpeg", + "LSVHK/LSVHK_17.jpeg", + "LSVHK/LSVHK_18.jpeg", + "LSVHK/LSVHK_19.jpeg", + "LSVHK/LSVHK_20.jpeg", + "LSVHK/LSVHK_21.jpeg", + "LSVHK/LSVHK_22.jpeg", + "LSVHK/LSVHK_23.jpeg", + "LSVHK/LSVHK_24.jpeg", + "LSVHK/LSVHK_25.jpeg", + "LSVHK/LSVHK_26.jpeg", + "LSVHK/LSVHK_27.jpeg", + "LSVHK/LSVHK_28.jpeg", + "LSVHK/LSVHK_29.jpeg", + "LSVHK/LSVHK_30.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 161, + "question": "Evaluate the sequence of actions depicted in the images and provide an answer to the associated question. You must choose your answer from the Choice List.\nWhat happened after the person put down the laptop?\nChoice list: \nA. Sat at the table.\nB. Took the broom.\nC. Took the book.\nD. Put down the phone/camera.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "sat at the table", + "took the broom", + "took the book", + "put down the phone camera" + ], + "image_quantity_level": "Medium", + "image": [ + "EUJK0/EUJK0_0.jpeg", + "EUJK0/EUJK0_1.jpeg", + "EUJK0/EUJK0_2.jpeg", + "EUJK0/EUJK0_3.jpeg", + "EUJK0/EUJK0_4.jpeg", + "EUJK0/EUJK0_5.jpeg", + "EUJK0/EUJK0_6.jpeg", + "EUJK0/EUJK0_7.jpeg", + "EUJK0/EUJK0_8.jpeg", + "EUJK0/EUJK0_9.jpeg", + "EUJK0/EUJK0_10.jpeg", + "EUJK0/EUJK0_11.jpeg", + "EUJK0/EUJK0_12.jpeg", + "EUJK0/EUJK0_13.jpeg", + "EUJK0/EUJK0_14.jpeg", + "EUJK0/EUJK0_15.jpeg", + "EUJK0/EUJK0_16.jpeg", + "EUJK0/EUJK0_17.jpeg", + "EUJK0/EUJK0_18.jpeg", + "EUJK0/EUJK0_19.jpeg", + "EUJK0/EUJK0_20.jpeg", + "EUJK0/EUJK0_21.jpeg", + "EUJK0/EUJK0_22.jpeg", + "EUJK0/EUJK0_23.jpeg", + "EUJK0/EUJK0_24.jpeg", + "EUJK0/EUJK0_25.jpeg", + "EUJK0/EUJK0_26.jpeg", + "EUJK0/EUJK0_27.jpeg", + "EUJK0/EUJK0_28.jpeg", + "EUJK0/EUJK0_29.jpeg", + "EUJK0/EUJK0_30.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 175, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened after the person held the dish?\nChoice list: \nA. Took the book.\nB. Took the cup/glass/bottle.\nC. Took the blanket.\nD. Closed the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "closed the closet cabinet", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "took the book", + "took the cup glass bottle", + "took the blanket", + "closed the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "1E7VU/1E7VU_0.jpeg", + "1E7VU/1E7VU_1.jpeg", + "1E7VU/1E7VU_2.jpeg", + "1E7VU/1E7VU_3.jpeg", + "1E7VU/1E7VU_4.jpeg", + "1E7VU/1E7VU_5.jpeg", + "1E7VU/1E7VU_6.jpeg", + "1E7VU/1E7VU_7.jpeg", + "1E7VU/1E7VU_8.jpeg", + "1E7VU/1E7VU_9.jpeg", + "1E7VU/1E7VU_10.jpeg", + "1E7VU/1E7VU_11.jpeg", + "1E7VU/1E7VU_12.jpeg", + "1E7VU/1E7VU_13.jpeg", + "1E7VU/1E7VU_14.jpeg", + "1E7VU/1E7VU_15.jpeg", + "1E7VU/1E7VU_16.jpeg", + "1E7VU/1E7VU_17.jpeg", + "1E7VU/1E7VU_18.jpeg", + "1E7VU/1E7VU_19.jpeg", + "1E7VU/1E7VU_20.jpeg", + "1E7VU/1E7VU_21.jpeg", + "1E7VU/1E7VU_22.jpeg", + "1E7VU/1E7VU_23.jpeg", + "1E7VU/1E7VU_24.jpeg", + "1E7VU/1E7VU_25.jpeg", + "1E7VU/1E7VU_26.jpeg", + "1E7VU/1E7VU_27.jpeg", + "1E7VU/1E7VU_28.jpeg", + "1E7VU/1E7VU_29.jpeg", + "1E7VU/1E7VU_30.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 182, + "question": "Review the timeline of actions in the supplied imagery and answer the associated inquiry. You must choose your answer from the Choice List.\nWhat happened before the person put down the book?\nChoice list: \nA. Ate the medicine.\nB. Took the bag.\nC. Tidied up the blanket.\nD. Opened the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "ate the medicine", + "took the bag", + "tidied up the blanket", + "opened the bag" + ], + "image_quantity_level": "Medium", + "image": [ + "4P13T/4P13T_0.jpeg", + "4P13T/4P13T_1.jpeg", + "4P13T/4P13T_2.jpeg", + "4P13T/4P13T_3.jpeg", + "4P13T/4P13T_4.jpeg", + "4P13T/4P13T_5.jpeg", + "4P13T/4P13T_6.jpeg", + "4P13T/4P13T_7.jpeg", + "4P13T/4P13T_8.jpeg", + "4P13T/4P13T_9.jpeg", + "4P13T/4P13T_10.jpeg", + "4P13T/4P13T_11.jpeg", + "4P13T/4P13T_12.jpeg", + "4P13T/4P13T_13.jpeg", + "4P13T/4P13T_14.jpeg", + "4P13T/4P13T_15.jpeg", + "4P13T/4P13T_16.jpeg", + "4P13T/4P13T_17.jpeg", + "4P13T/4P13T_18.jpeg", + "4P13T/4P13T_19.jpeg", + "4P13T/4P13T_20.jpeg", + "4P13T/4P13T_21.jpeg", + "4P13T/4P13T_22.jpeg", + "4P13T/4P13T_23.jpeg", + "4P13T/4P13T_24.jpeg", + "4P13T/4P13T_25.jpeg", + "4P13T/4P13T_26.jpeg", + "4P13T/4P13T_27.jpeg", + "4P13T/4P13T_28.jpeg", + "4P13T/4P13T_29.jpeg", + "4P13T/4P13T_30.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 184, + "question": "Inspect the flow of events in the presented pictures and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened after the person tidied up the table?\nChoice list: \nA. Sat at the table.\nB. Put down the dish.\nC. Threw the towel.\nD. Took the phone/camera.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the dish", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "sat at the table", + "put down the dish", + "threw the towel", + "took the phone camera" + ], + "image_quantity_level": "Medium", + "image": [ + "ZVRBL/ZVRBL_0.jpeg", + "ZVRBL/ZVRBL_1.jpeg", + "ZVRBL/ZVRBL_2.jpeg", + "ZVRBL/ZVRBL_3.jpeg", + "ZVRBL/ZVRBL_4.jpeg", + "ZVRBL/ZVRBL_5.jpeg", + "ZVRBL/ZVRBL_6.jpeg", + "ZVRBL/ZVRBL_7.jpeg", + "ZVRBL/ZVRBL_8.jpeg", + "ZVRBL/ZVRBL_9.jpeg", + "ZVRBL/ZVRBL_10.jpeg", + "ZVRBL/ZVRBL_11.jpeg", + "ZVRBL/ZVRBL_12.jpeg", + "ZVRBL/ZVRBL_13.jpeg", + "ZVRBL/ZVRBL_14.jpeg", + "ZVRBL/ZVRBL_15.jpeg", + "ZVRBL/ZVRBL_16.jpeg", + "ZVRBL/ZVRBL_17.jpeg", + "ZVRBL/ZVRBL_18.jpeg", + "ZVRBL/ZVRBL_19.jpeg", + "ZVRBL/ZVRBL_20.jpeg", + "ZVRBL/ZVRBL_21.jpeg", + "ZVRBL/ZVRBL_22.jpeg", + "ZVRBL/ZVRBL_23.jpeg", + "ZVRBL/ZVRBL_24.jpeg", + "ZVRBL/ZVRBL_25.jpeg", + "ZVRBL/ZVRBL_26.jpeg", + "ZVRBL/ZVRBL_27.jpeg", + "ZVRBL/ZVRBL_28.jpeg", + "ZVRBL/ZVRBL_29.jpeg", + "ZVRBL/ZVRBL_30.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 191, + "question": "Observe the continuum of events in the provided pictorials and reply to the pertinent question. You must choose your answer from the Choice List.\nWhat happened after the person took the book?\nChoice list: \nA. Put down the book.\nB. Put down the bag.\nC. Closed the refrigerator.\nD. Put down the paper/notebook.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the book", + "put down the bag", + "closed the refrigerator", + "put down the paper notebook" + ], + "image_quantity_level": "Medium", + "image": [ + "HONL7/HONL7_0.jpeg", + "HONL7/HONL7_1.jpeg", + "HONL7/HONL7_2.jpeg", + "HONL7/HONL7_3.jpeg", + "HONL7/HONL7_4.jpeg", + "HONL7/HONL7_5.jpeg", + "HONL7/HONL7_6.jpeg", + "HONL7/HONL7_7.jpeg", + "HONL7/HONL7_8.jpeg", + "HONL7/HONL7_9.jpeg", + "HONL7/HONL7_10.jpeg", + "HONL7/HONL7_11.jpeg", + "HONL7/HONL7_12.jpeg", + "HONL7/HONL7_13.jpeg", + "HONL7/HONL7_14.jpeg", + "HONL7/HONL7_15.jpeg", + "HONL7/HONL7_16.jpeg", + "HONL7/HONL7_17.jpeg", + "HONL7/HONL7_18.jpeg", + "HONL7/HONL7_19.jpeg", + "HONL7/HONL7_20.jpeg", + "HONL7/HONL7_21.jpeg", + "HONL7/HONL7_22.jpeg", + "HONL7/HONL7_23.jpeg", + "HONL7/HONL7_24.jpeg", + "HONL7/HONL7_25.jpeg", + "HONL7/HONL7_26.jpeg", + "HONL7/HONL7_27.jpeg", + "HONL7/HONL7_28.jpeg", + "HONL7/HONL7_29.jpeg", + "HONL7/HONL7_30.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 136, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened after the person sat on the floor?\nChoice list: \nA. Took the bag.\nB. Took the phone/camera.\nC. Took the blanket.\nD. Took the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "took the bag", + "took the phone camera", + "took the blanket", + "took the box" + ], + "image_quantity_level": "Many", + "image": [ + "CR5G1/CR5G1_0.jpeg", + "CR5G1/CR5G1_1.jpeg", + "CR5G1/CR5G1_2.jpeg", + "CR5G1/CR5G1_3.jpeg", + "CR5G1/CR5G1_4.jpeg", + "CR5G1/CR5G1_5.jpeg", + "CR5G1/CR5G1_6.jpeg", + "CR5G1/CR5G1_7.jpeg", + "CR5G1/CR5G1_8.jpeg", + "CR5G1/CR5G1_9.jpeg", + "CR5G1/CR5G1_10.jpeg", + "CR5G1/CR5G1_11.jpeg", + "CR5G1/CR5G1_12.jpeg", + "CR5G1/CR5G1_13.jpeg", + "CR5G1/CR5G1_14.jpeg", + "CR5G1/CR5G1_15.jpeg", + "CR5G1/CR5G1_16.jpeg", + "CR5G1/CR5G1_17.jpeg", + "CR5G1/CR5G1_18.jpeg", + "CR5G1/CR5G1_19.jpeg", + "CR5G1/CR5G1_20.jpeg", + "CR5G1/CR5G1_21.jpeg", + "CR5G1/CR5G1_22.jpeg", + "CR5G1/CR5G1_23.jpeg", + "CR5G1/CR5G1_24.jpeg", + "CR5G1/CR5G1_25.jpeg", + "CR5G1/CR5G1_26.jpeg", + "CR5G1/CR5G1_27.jpeg", + "CR5G1/CR5G1_28.jpeg", + "CR5G1/CR5G1_29.jpeg", + "CR5G1/CR5G1_30.jpeg", + "CR5G1/CR5G1_31.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 166, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened before the person took the dish?\nChoice list: \nA. Put down the food.\nB. Put down the box.\nC. Took the paper/notebook.\nD. Put down the blanket.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the food", + "put down the box", + "took the paper notebook", + "put down the blanket" + ], + "image_quantity_level": "Many", + "image": [ + "JBY4E/JBY4E_0.jpeg", + "JBY4E/JBY4E_1.jpeg", + "JBY4E/JBY4E_2.jpeg", + "JBY4E/JBY4E_3.jpeg", + "JBY4E/JBY4E_4.jpeg", + "JBY4E/JBY4E_5.jpeg", + "JBY4E/JBY4E_6.jpeg", + "JBY4E/JBY4E_7.jpeg", + "JBY4E/JBY4E_8.jpeg", + "JBY4E/JBY4E_9.jpeg", + "JBY4E/JBY4E_10.jpeg", + "JBY4E/JBY4E_11.jpeg", + "JBY4E/JBY4E_12.jpeg", + "JBY4E/JBY4E_13.jpeg", + "JBY4E/JBY4E_14.jpeg", + "JBY4E/JBY4E_15.jpeg", + "JBY4E/JBY4E_16.jpeg", + "JBY4E/JBY4E_17.jpeg", + "JBY4E/JBY4E_18.jpeg", + "JBY4E/JBY4E_19.jpeg", + "JBY4E/JBY4E_20.jpeg", + "JBY4E/JBY4E_21.jpeg", + "JBY4E/JBY4E_22.jpeg", + "JBY4E/JBY4E_23.jpeg", + "JBY4E/JBY4E_24.jpeg", + "JBY4E/JBY4E_25.jpeg", + "JBY4E/JBY4E_26.jpeg", + "JBY4E/JBY4E_27.jpeg", + "JBY4E/JBY4E_28.jpeg", + "JBY4E/JBY4E_29.jpeg", + "JBY4E/JBY4E_30.jpeg", + "JBY4E/JBY4E_31.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 172, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened before the person took the food?\nChoice list: \nA. Put down the cup/glass/bottle.\nB. Put down the bag.\nC. Threw the pillow.\nD. Closed the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the cup glass bottle", + "put down the bag", + "threw the pillow", + "closed the box" + ], + "image_quantity_level": "Many", + "image": [ + "MVPQF/MVPQF_0.jpeg", + "MVPQF/MVPQF_1.jpeg", + "MVPQF/MVPQF_2.jpeg", + "MVPQF/MVPQF_3.jpeg", + "MVPQF/MVPQF_4.jpeg", + "MVPQF/MVPQF_5.jpeg", + "MVPQF/MVPQF_6.jpeg", + "MVPQF/MVPQF_7.jpeg", + "MVPQF/MVPQF_8.jpeg", + "MVPQF/MVPQF_9.jpeg", + "MVPQF/MVPQF_10.jpeg", + "MVPQF/MVPQF_11.jpeg", + "MVPQF/MVPQF_12.jpeg", + "MVPQF/MVPQF_13.jpeg", + "MVPQF/MVPQF_14.jpeg", + "MVPQF/MVPQF_15.jpeg", + "MVPQF/MVPQF_16.jpeg", + "MVPQF/MVPQF_17.jpeg", + "MVPQF/MVPQF_18.jpeg", + "MVPQF/MVPQF_19.jpeg", + "MVPQF/MVPQF_20.jpeg", + "MVPQF/MVPQF_21.jpeg", + "MVPQF/MVPQF_22.jpeg", + "MVPQF/MVPQF_23.jpeg", + "MVPQF/MVPQF_24.jpeg", + "MVPQF/MVPQF_25.jpeg", + "MVPQF/MVPQF_26.jpeg", + "MVPQF/MVPQF_27.jpeg", + "MVPQF/MVPQF_28.jpeg", + "MVPQF/MVPQF_29.jpeg", + "MVPQF/MVPQF_30.jpeg", + "MVPQF/MVPQF_31.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 129, + "question": "Evaluate the sequence of actions depicted in the images and provide an answer to the associated question. You must choose your answer from the Choice List.\nWhat happened after the person put down the clothes?\nChoice list: \nA. Took the broom.\nB. Took the blanket.\nC. Opened the door.\nD. Threw the towel.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "threw the towel", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "took the broom", + "took the blanket", + "opened the door", + "threw the towel" + ], + "image_quantity_level": "Medium", + "image": [ + "43FG9/43FG9_0.jpeg", + "43FG9/43FG9_1.jpeg", + "43FG9/43FG9_2.jpeg", + "43FG9/43FG9_3.jpeg", + "43FG9/43FG9_4.jpeg", + "43FG9/43FG9_5.jpeg", + "43FG9/43FG9_6.jpeg", + "43FG9/43FG9_7.jpeg", + "43FG9/43FG9_8.jpeg", + "43FG9/43FG9_9.jpeg", + "43FG9/43FG9_10.jpeg", + "43FG9/43FG9_11.jpeg", + "43FG9/43FG9_12.jpeg", + "43FG9/43FG9_13.jpeg", + "43FG9/43FG9_14.jpeg", + "43FG9/43FG9_15.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 165, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened after the person sat on the sofa/couch?\nChoice list: \nA. Put down the laptop.\nB. Put down the bag.\nC. Took the pillow.\nD. Threw the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down the laptop", + "put down the bag", + "took the pillow", + "threw the book" + ], + "image_quantity_level": "Many", + "image": [ + "OY3LS/OY3LS_0.jpeg", + "OY3LS/OY3LS_1.jpeg", + "OY3LS/OY3LS_2.jpeg", + "OY3LS/OY3LS_3.jpeg", + "OY3LS/OY3LS_4.jpeg", + "OY3LS/OY3LS_5.jpeg", + "OY3LS/OY3LS_6.jpeg", + "OY3LS/OY3LS_7.jpeg", + "OY3LS/OY3LS_8.jpeg", + "OY3LS/OY3LS_9.jpeg", + "OY3LS/OY3LS_10.jpeg", + "OY3LS/OY3LS_11.jpeg", + "OY3LS/OY3LS_12.jpeg", + "OY3LS/OY3LS_13.jpeg", + "OY3LS/OY3LS_14.jpeg", + "OY3LS/OY3LS_15.jpeg", + "OY3LS/OY3LS_16.jpeg", + "OY3LS/OY3LS_17.jpeg", + "OY3LS/OY3LS_18.jpeg", + "OY3LS/OY3LS_19.jpeg", + "OY3LS/OY3LS_20.jpeg", + "OY3LS/OY3LS_21.jpeg", + "OY3LS/OY3LS_22.jpeg", + "OY3LS/OY3LS_23.jpeg", + "OY3LS/OY3LS_24.jpeg", + "OY3LS/OY3LS_25.jpeg", + "OY3LS/OY3LS_26.jpeg", + "OY3LS/OY3LS_27.jpeg", + "OY3LS/OY3LS_28.jpeg", + "OY3LS/OY3LS_29.jpeg", + "OY3LS/OY3LS_30.jpeg", + "OY3LS/OY3LS_31.jpeg", + "OY3LS/OY3LS_32.jpeg", + "OY3LS/OY3LS_33.jpeg", + "OY3LS/OY3LS_34.jpeg", + "OY3LS/OY3LS_35.jpeg", + "OY3LS/OY3LS_36.jpeg", + "OY3LS/OY3LS_37.jpeg", + "OY3LS/OY3LS_38.jpeg", + "OY3LS/OY3LS_39.jpeg", + "OY3LS/OY3LS_40.jpeg", + "OY3LS/OY3LS_41.jpeg", + "OY3LS/OY3LS_42.jpeg", + "OY3LS/OY3LS_43.jpeg", + "OY3LS/OY3LS_44.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 140, + "question": "Evaluate the sequence of actions depicted in the images and provide an answer to the associated question. You must choose your answer from the Choice List.\nWhat happened before the person took the phone/camera?\nChoice list: \nA. Opened the window.\nB. Put down the clothes.\nC. Took the paper/notebook.\nD. Put down the shoe.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "opened the window", + "put down the clothes", + "took the paper notebook", + "put down the shoe" + ], + "image_quantity_level": "Medium", + "image": [ + "YFI1M/YFI1M_0.jpeg", + "YFI1M/YFI1M_1.jpeg", + "YFI1M/YFI1M_2.jpeg", + "YFI1M/YFI1M_3.jpeg", + "YFI1M/YFI1M_4.jpeg", + "YFI1M/YFI1M_5.jpeg", + "YFI1M/YFI1M_6.jpeg", + "YFI1M/YFI1M_7.jpeg", + "YFI1M/YFI1M_8.jpeg", + "YFI1M/YFI1M_9.jpeg", + "YFI1M/YFI1M_10.jpeg", + "YFI1M/YFI1M_11.jpeg", + "YFI1M/YFI1M_12.jpeg", + "YFI1M/YFI1M_13.jpeg", + "YFI1M/YFI1M_14.jpeg", + "YFI1M/YFI1M_15.jpeg", + "YFI1M/YFI1M_16.jpeg", + "YFI1M/YFI1M_17.jpeg", + "YFI1M/YFI1M_18.jpeg", + "YFI1M/YFI1M_19.jpeg", + "YFI1M/YFI1M_20.jpeg", + "YFI1M/YFI1M_21.jpeg", + "YFI1M/YFI1M_22.jpeg", + "YFI1M/YFI1M_23.jpeg", + "YFI1M/YFI1M_24.jpeg", + "YFI1M/YFI1M_25.jpeg", + "YFI1M/YFI1M_26.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 154, + "question": "Analyze the series of events in the provided pictures and answer the related inquiry. You must choose your answer from the Choice List.\nWhat happened after the person closed the door?\nChoice list: \nA. Closed the refrigerator.\nB. Tidied up the blanket.\nC. Took the towel.\nD. Took the cup/glass/bottle.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the towel", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "closed the refrigerator", + "tidied up the blanket", + "took the towel", + "took the cup glass bottle" + ], + "image_quantity_level": "Medium", + "image": [ + "M704T/M704T_0.jpeg", + "M704T/M704T_1.jpeg", + "M704T/M704T_2.jpeg", + "M704T/M704T_3.jpeg", + "M704T/M704T_4.jpeg", + "M704T/M704T_5.jpeg", + "M704T/M704T_6.jpeg", + "M704T/M704T_7.jpeg", + "M704T/M704T_8.jpeg", + "M704T/M704T_9.jpeg", + "M704T/M704T_10.jpeg", + "M704T/M704T_11.jpeg", + "M704T/M704T_12.jpeg", + "M704T/M704T_13.jpeg", + "M704T/M704T_14.jpeg", + "M704T/M704T_15.jpeg", + "M704T/M704T_16.jpeg", + "M704T/M704T_17.jpeg", + "M704T/M704T_18.jpeg", + "M704T/M704T_19.jpeg", + "M704T/M704T_20.jpeg", + "M704T/M704T_21.jpeg", + "M704T/M704T_22.jpeg", + "M704T/M704T_23.jpeg", + "M704T/M704T_24.jpeg", + "M704T/M704T_25.jpeg", + "M704T/M704T_26.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 173, + "question": "Evaluate the sequence of actions depicted in the images and provide an answer to the associated question. You must choose your answer from the Choice List.\nWhat happened before the person held the sandwich?\nChoice list: \nA. Tidied up the blanket.\nB. Closed the door.\nC. Closed the refrigerator.\nD. Threw the towel.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "closed the refrigerator", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "tidied up the blanket", + "closed the door", + "closed the refrigerator", + "threw the towel" + ], + "image_quantity_level": "Medium", + "image": [ + "R9382/R9382_0.jpeg", + "R9382/R9382_1.jpeg", + "R9382/R9382_2.jpeg", + "R9382/R9382_3.jpeg", + "R9382/R9382_4.jpeg", + "R9382/R9382_5.jpeg", + "R9382/R9382_6.jpeg", + "R9382/R9382_7.jpeg", + "R9382/R9382_8.jpeg", + "R9382/R9382_9.jpeg", + "R9382/R9382_10.jpeg", + "R9382/R9382_11.jpeg", + "R9382/R9382_12.jpeg", + "R9382/R9382_13.jpeg", + "R9382/R9382_14.jpeg", + "R9382/R9382_15.jpeg", + "R9382/R9382_16.jpeg", + "R9382/R9382_17.jpeg", + "R9382/R9382_18.jpeg", + "R9382/R9382_19.jpeg", + "R9382/R9382_20.jpeg", + "R9382/R9382_21.jpeg", + "R9382/R9382_22.jpeg", + "R9382/R9382_23.jpeg", + "R9382/R9382_24.jpeg", + "R9382/R9382_25.jpeg", + "R9382/R9382_26.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 167, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened before the person sat on the sofa/couch?\nChoice list: \nA. Threw the blanket.\nB. Closed the door.\nC. Took the book.\nD. Lied on the sofa/couch.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "threw the blanket", + "closed the door", + "took the book", + "lied on the sofa couch" + ], + "image_quantity_level": "Many", + "image": [ + "RXLKF/RXLKF_0.jpeg", + "RXLKF/RXLKF_1.jpeg", + "RXLKF/RXLKF_2.jpeg", + "RXLKF/RXLKF_3.jpeg", + "RXLKF/RXLKF_4.jpeg", + "RXLKF/RXLKF_5.jpeg", + "RXLKF/RXLKF_6.jpeg", + "RXLKF/RXLKF_7.jpeg", + "RXLKF/RXLKF_8.jpeg", + "RXLKF/RXLKF_9.jpeg", + "RXLKF/RXLKF_10.jpeg", + "RXLKF/RXLKF_11.jpeg", + "RXLKF/RXLKF_12.jpeg", + "RXLKF/RXLKF_13.jpeg", + "RXLKF/RXLKF_14.jpeg", + "RXLKF/RXLKF_15.jpeg", + "RXLKF/RXLKF_16.jpeg", + "RXLKF/RXLKF_17.jpeg", + "RXLKF/RXLKF_18.jpeg", + "RXLKF/RXLKF_19.jpeg", + "RXLKF/RXLKF_20.jpeg", + "RXLKF/RXLKF_21.jpeg", + "RXLKF/RXLKF_22.jpeg", + "RXLKF/RXLKF_23.jpeg", + "RXLKF/RXLKF_24.jpeg", + "RXLKF/RXLKF_25.jpeg", + "RXLKF/RXLKF_26.jpeg", + "RXLKF/RXLKF_27.jpeg", + "RXLKF/RXLKF_28.jpeg", + "RXLKF/RXLKF_29.jpeg", + "RXLKF/RXLKF_30.jpeg", + "RXLKF/RXLKF_31.jpeg", + "RXLKF/RXLKF_32.jpeg", + "RXLKF/RXLKF_33.jpeg", + "RXLKF/RXLKF_34.jpeg", + "RXLKF/RXLKF_35.jpeg", + "RXLKF/RXLKF_36.jpeg", + "RXLKF/RXLKF_37.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 91, + "question": "Based on the provided images, answer the question related to the sequence of action You must choose your answer from the Choice List.\nWhat happened after the person opened the refrigerator?\nChoice list: \nA. Closed the window.\nB. Put down the cup/glass/bottle.\nC. Took the paper/notebook.\nD. Sat at the table.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "closed the window", + "put down the cup glass bottle", + "took the paper notebook", + "sat at the table" + ], + "image_quantity_level": "Medium", + "image": [ + "0F453/0F453_0.jpeg", + "0F453/0F453_1.jpeg", + "0F453/0F453_2.jpeg", + "0F453/0F453_3.jpeg", + "0F453/0F453_4.jpeg", + "0F453/0F453_5.jpeg", + "0F453/0F453_6.jpeg", + "0F453/0F453_7.jpeg", + "0F453/0F453_8.jpeg", + "0F453/0F453_9.jpeg", + "0F453/0F453_10.jpeg", + "0F453/0F453_11.jpeg", + "0F453/0F453_12.jpeg", + "0F453/0F453_13.jpeg", + "0F453/0F453_14.jpeg", + "0F453/0F453_15.jpeg", + "0F453/0F453_16.jpeg", + "0F453/0F453_17.jpeg", + "0F453/0F453_18.jpeg", + "0F453/0F453_19.jpeg", + "0F453/0F453_20.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 160, + "question": "Based on the provided images, answer the question related to the sequence of action You must choose your answer from the Choice List.\nWhat happened before the person opened the door?\nChoice list: \nA. Took the towel.\nB. Took the book.\nC. Opened the door.\nD. Sat at the table.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "took the towel", + "took the book", + "opened the door", + "sat at the table" + ], + "image_quantity_level": "Medium", + "image": [ + "APVAD/APVAD_0.jpeg", + "APVAD/APVAD_1.jpeg", + "APVAD/APVAD_2.jpeg", + "APVAD/APVAD_3.jpeg", + "APVAD/APVAD_4.jpeg", + "APVAD/APVAD_5.jpeg", + "APVAD/APVAD_6.jpeg", + "APVAD/APVAD_7.jpeg", + "APVAD/APVAD_8.jpeg", + "APVAD/APVAD_9.jpeg", + "APVAD/APVAD_10.jpeg", + "APVAD/APVAD_11.jpeg", + "APVAD/APVAD_12.jpeg", + "APVAD/APVAD_13.jpeg", + "APVAD/APVAD_14.jpeg", + "APVAD/APVAD_15.jpeg", + "APVAD/APVAD_16.jpeg", + "APVAD/APVAD_17.jpeg", + "APVAD/APVAD_18.jpeg", + "APVAD/APVAD_19.jpeg", + "APVAD/APVAD_20.jpeg", + "APVAD/APVAD_21.jpeg", + "APVAD/APVAD_22.jpeg", + "APVAD/APVAD_23.jpeg", + "APVAD/APVAD_24.jpeg", + "APVAD/APVAD_25.jpeg", + "APVAD/APVAD_26.jpeg", + "APVAD/APVAD_27.jpeg", + "APVAD/APVAD_28.jpeg", + "APVAD/APVAD_29.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 162, + "question": "Review the timeline of actions in the supplied imagery and answer the associated inquiry. You must choose your answer from the Choice List.\nWhat happened after the person opened the door?\nChoice list: \nA. Threw the box.\nB. Put down the phone/camera.\nC. Took the pillow.\nD. Tidied up the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the phone camera", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "threw the box", + "put down the phone camera", + "took the pillow", + "tidied up the clothes" + ], + "image_quantity_level": "Medium", + "image": [ + "N7130/N7130_0.jpeg", + "N7130/N7130_1.jpeg", + "N7130/N7130_2.jpeg", + "N7130/N7130_3.jpeg", + "N7130/N7130_4.jpeg", + "N7130/N7130_5.jpeg", + "N7130/N7130_6.jpeg", + "N7130/N7130_7.jpeg", + "N7130/N7130_8.jpeg", + "N7130/N7130_9.jpeg", + "N7130/N7130_10.jpeg", + "N7130/N7130_11.jpeg", + "N7130/N7130_12.jpeg", + "N7130/N7130_13.jpeg", + "N7130/N7130_14.jpeg", + "N7130/N7130_15.jpeg", + "N7130/N7130_16.jpeg", + "N7130/N7130_17.jpeg", + "N7130/N7130_18.jpeg", + "N7130/N7130_19.jpeg", + "N7130/N7130_20.jpeg", + "N7130/N7130_21.jpeg", + "N7130/N7130_22.jpeg", + "N7130/N7130_23.jpeg", + "N7130/N7130_24.jpeg", + "N7130/N7130_25.jpeg", + "N7130/N7130_26.jpeg", + "N7130/N7130_27.jpeg", + "N7130/N7130_28.jpeg", + "N7130/N7130_29.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 169, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened before the person washed the window?\nChoice list: \nA. Closed the door.\nB. Lied on the floor.\nC. Took the picture.\nD. Opened the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "closed the door", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "closed the door", + "lied on the floor", + "took the picture", + "opened the box" + ], + "image_quantity_level": "Medium", + "image": [ + "68YR6/68YR6_0.jpeg", + "68YR6/68YR6_1.jpeg", + "68YR6/68YR6_2.jpeg", + "68YR6/68YR6_3.jpeg", + "68YR6/68YR6_4.jpeg", + "68YR6/68YR6_5.jpeg", + "68YR6/68YR6_6.jpeg", + "68YR6/68YR6_7.jpeg", + "68YR6/68YR6_8.jpeg", + "68YR6/68YR6_9.jpeg", + "68YR6/68YR6_10.jpeg", + "68YR6/68YR6_11.jpeg", + "68YR6/68YR6_12.jpeg", + "68YR6/68YR6_13.jpeg", + "68YR6/68YR6_14.jpeg", + "68YR6/68YR6_15.jpeg", + "68YR6/68YR6_16.jpeg", + "68YR6/68YR6_17.jpeg", + "68YR6/68YR6_18.jpeg", + "68YR6/68YR6_19.jpeg", + "68YR6/68YR6_20.jpeg", + "68YR6/68YR6_21.jpeg", + "68YR6/68YR6_22.jpeg", + "68YR6/68YR6_23.jpeg", + "68YR6/68YR6_24.jpeg", + "68YR6/68YR6_25.jpeg", + "68YR6/68YR6_26.jpeg", + "68YR6/68YR6_27.jpeg", + "68YR6/68YR6_28.jpeg", + "68YR6/68YR6_29.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 170, + "question": "Evaluate the sequence of actions depicted in the images and provide an answer to the associated question. You must choose your answer from the Choice List.\nWhat happened after the person threw the clothes?\nChoice list: \nA. Took the sandwich.\nB. Put down the food.\nC. Put down the pillow.\nD. Took the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the pillow", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "took the sandwich", + "put down the food", + "put down the pillow", + "took the food" + ], + "image_quantity_level": "Medium", + "image": [ + "Q7RK6/Q7RK6_0.jpeg", + "Q7RK6/Q7RK6_1.jpeg", + "Q7RK6/Q7RK6_2.jpeg", + "Q7RK6/Q7RK6_3.jpeg", + "Q7RK6/Q7RK6_4.jpeg", + "Q7RK6/Q7RK6_5.jpeg", + "Q7RK6/Q7RK6_6.jpeg", + "Q7RK6/Q7RK6_7.jpeg", + "Q7RK6/Q7RK6_8.jpeg", + "Q7RK6/Q7RK6_9.jpeg", + "Q7RK6/Q7RK6_10.jpeg", + "Q7RK6/Q7RK6_11.jpeg", + "Q7RK6/Q7RK6_12.jpeg", + "Q7RK6/Q7RK6_13.jpeg", + "Q7RK6/Q7RK6_14.jpeg", + "Q7RK6/Q7RK6_15.jpeg", + "Q7RK6/Q7RK6_16.jpeg", + "Q7RK6/Q7RK6_17.jpeg", + "Q7RK6/Q7RK6_18.jpeg", + "Q7RK6/Q7RK6_19.jpeg", + "Q7RK6/Q7RK6_20.jpeg", + "Q7RK6/Q7RK6_21.jpeg", + "Q7RK6/Q7RK6_22.jpeg", + "Q7RK6/Q7RK6_23.jpeg", + "Q7RK6/Q7RK6_24.jpeg", + "Q7RK6/Q7RK6_25.jpeg", + "Q7RK6/Q7RK6_26.jpeg", + "Q7RK6/Q7RK6_27.jpeg", + "Q7RK6/Q7RK6_28.jpeg", + "Q7RK6/Q7RK6_29.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 176, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened after the person held the dish?\nChoice list: \nA. Put down the cup/glass/bottle.\nB. Sat on the floor.\nC. Took the cup/glass/bottle.\nD. Threw the towel.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the cup glass bottle", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the cup glass bottle", + "sat on the floor", + "took the cup glass bottle", + "threw the towel" + ], + "image_quantity_level": "Medium", + "image": [ + "DSZYT/DSZYT_0.jpeg", + "DSZYT/DSZYT_1.jpeg", + "DSZYT/DSZYT_2.jpeg", + "DSZYT/DSZYT_3.jpeg", + "DSZYT/DSZYT_4.jpeg", + "DSZYT/DSZYT_5.jpeg", + "DSZYT/DSZYT_6.jpeg", + "DSZYT/DSZYT_7.jpeg", + "DSZYT/DSZYT_8.jpeg", + "DSZYT/DSZYT_9.jpeg", + "DSZYT/DSZYT_10.jpeg", + "DSZYT/DSZYT_11.jpeg", + "DSZYT/DSZYT_12.jpeg", + "DSZYT/DSZYT_13.jpeg", + "DSZYT/DSZYT_14.jpeg", + "DSZYT/DSZYT_15.jpeg", + "DSZYT/DSZYT_16.jpeg", + "DSZYT/DSZYT_17.jpeg", + "DSZYT/DSZYT_18.jpeg", + "DSZYT/DSZYT_19.jpeg", + "DSZYT/DSZYT_20.jpeg", + "DSZYT/DSZYT_21.jpeg", + "DSZYT/DSZYT_22.jpeg", + "DSZYT/DSZYT_23.jpeg", + "DSZYT/DSZYT_24.jpeg", + "DSZYT/DSZYT_25.jpeg", + "DSZYT/DSZYT_26.jpeg", + "DSZYT/DSZYT_27.jpeg", + "DSZYT/DSZYT_28.jpeg", + "DSZYT/DSZYT_29.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 179, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened after the person held the clothes?\nChoice list: \nA. Took the broom.\nB. Threw the pillow.\nC. Sat on the sofa/couch.\nD. Put down the pillow.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "threw the pillow", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "took the broom", + "threw the pillow", + "sat on the sofa couch", + "put down the pillow" + ], + "image_quantity_level": "Medium", + "image": [ + "APVAD/APVAD_0.jpeg", + "APVAD/APVAD_1.jpeg", + "APVAD/APVAD_2.jpeg", + "APVAD/APVAD_3.jpeg", + "APVAD/APVAD_4.jpeg", + "APVAD/APVAD_5.jpeg", + "APVAD/APVAD_6.jpeg", + "APVAD/APVAD_7.jpeg", + "APVAD/APVAD_8.jpeg", + "APVAD/APVAD_9.jpeg", + "APVAD/APVAD_10.jpeg", + "APVAD/APVAD_11.jpeg", + "APVAD/APVAD_12.jpeg", + "APVAD/APVAD_13.jpeg", + "APVAD/APVAD_14.jpeg", + "APVAD/APVAD_15.jpeg", + "APVAD/APVAD_16.jpeg", + "APVAD/APVAD_17.jpeg", + "APVAD/APVAD_18.jpeg", + "APVAD/APVAD_19.jpeg", + "APVAD/APVAD_20.jpeg", + "APVAD/APVAD_21.jpeg", + "APVAD/APVAD_22.jpeg", + "APVAD/APVAD_23.jpeg", + "APVAD/APVAD_24.jpeg", + "APVAD/APVAD_25.jpeg", + "APVAD/APVAD_26.jpeg", + "APVAD/APVAD_27.jpeg", + "APVAD/APVAD_28.jpeg", + "APVAD/APVAD_29.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 187, + "question": "Evaluate the sequence of actions depicted in the images and provide an answer to the associated question. You must choose your answer from the Choice List.\nWhat happened before the person opened the book?\nChoice list: \nA. Put down the towel.\nB. Took the paper/notebook.\nC. Took the bag.\nD. Tidied up the table.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the towel", + "took the paper notebook", + "took the bag", + "tidied up the table" + ], + "image_quantity_level": "Medium", + "image": [ + "FETME/FETME_0.jpeg", + "FETME/FETME_1.jpeg", + "FETME/FETME_2.jpeg", + "FETME/FETME_3.jpeg", + "FETME/FETME_4.jpeg", + "FETME/FETME_5.jpeg", + "FETME/FETME_6.jpeg", + "FETME/FETME_7.jpeg", + "FETME/FETME_8.jpeg", + "FETME/FETME_9.jpeg", + "FETME/FETME_10.jpeg", + "FETME/FETME_11.jpeg", + "FETME/FETME_12.jpeg", + "FETME/FETME_13.jpeg", + "FETME/FETME_14.jpeg", + "FETME/FETME_15.jpeg", + "FETME/FETME_16.jpeg", + "FETME/FETME_17.jpeg", + "FETME/FETME_18.jpeg", + "FETME/FETME_19.jpeg", + "FETME/FETME_20.jpeg", + "FETME/FETME_21.jpeg", + "FETME/FETME_22.jpeg", + "FETME/FETME_23.jpeg", + "FETME/FETME_24.jpeg", + "FETME/FETME_25.jpeg", + "FETME/FETME_26.jpeg", + "FETME/FETME_27.jpeg", + "FETME/FETME_28.jpeg", + "FETME/FETME_29.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 190, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened before the person closed the closet/cabinet?\nChoice list: \nA. Took the towel.\nB. Put down the phone/camera.\nC. Took the sandwich.\nD. Took the dish.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the sandwich", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "took the towel", + "put down the phone camera", + "took the sandwich", + "took the dish" + ], + "image_quantity_level": "Medium", + "image": [ + "UEP20/UEP20_0.jpeg", + "UEP20/UEP20_1.jpeg", + "UEP20/UEP20_2.jpeg", + "UEP20/UEP20_3.jpeg", + "UEP20/UEP20_4.jpeg", + "UEP20/UEP20_5.jpeg", + "UEP20/UEP20_6.jpeg", + "UEP20/UEP20_7.jpeg", + "UEP20/UEP20_8.jpeg", + "UEP20/UEP20_9.jpeg", + "UEP20/UEP20_10.jpeg", + "UEP20/UEP20_11.jpeg", + "UEP20/UEP20_12.jpeg", + "UEP20/UEP20_13.jpeg", + "UEP20/UEP20_14.jpeg", + "UEP20/UEP20_15.jpeg", + "UEP20/UEP20_16.jpeg", + "UEP20/UEP20_17.jpeg", + "UEP20/UEP20_18.jpeg", + "UEP20/UEP20_19.jpeg", + "UEP20/UEP20_20.jpeg", + "UEP20/UEP20_21.jpeg", + "UEP20/UEP20_22.jpeg", + "UEP20/UEP20_23.jpeg", + "UEP20/UEP20_24.jpeg", + "UEP20/UEP20_25.jpeg", + "UEP20/UEP20_26.jpeg", + "UEP20/UEP20_27.jpeg", + "UEP20/UEP20_28.jpeg", + "UEP20/UEP20_29.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 192, + "question": "Review the timeline of actions in the supplied imagery and answer the associated inquiry. You must choose your answer from the Choice List.\nWhat happened before the person held the phone/camera?\nChoice list: \nA. Tidied up the blanket.\nB. Took the cup/glass/bottle.\nC. Tidied up the closet/cabinet.\nD. Put down the pillow.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the pillow", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "tidied up the blanket", + "took the cup glass bottle", + "tidied up the closet cabinet", + "put down the pillow" + ], + "image_quantity_level": "Medium", + "image": [ + "1L5D3/1L5D3_0.jpeg", + "1L5D3/1L5D3_1.jpeg", + "1L5D3/1L5D3_2.jpeg", + "1L5D3/1L5D3_3.jpeg", + "1L5D3/1L5D3_4.jpeg", + "1L5D3/1L5D3_5.jpeg", + "1L5D3/1L5D3_6.jpeg", + "1L5D3/1L5D3_7.jpeg", + "1L5D3/1L5D3_8.jpeg", + "1L5D3/1L5D3_9.jpeg", + "1L5D3/1L5D3_10.jpeg", + "1L5D3/1L5D3_11.jpeg", + "1L5D3/1L5D3_12.jpeg", + "1L5D3/1L5D3_13.jpeg", + "1L5D3/1L5D3_14.jpeg", + "1L5D3/1L5D3_15.jpeg", + "1L5D3/1L5D3_16.jpeg", + "1L5D3/1L5D3_17.jpeg", + "1L5D3/1L5D3_18.jpeg", + "1L5D3/1L5D3_19.jpeg", + "1L5D3/1L5D3_20.jpeg", + "1L5D3/1L5D3_21.jpeg", + "1L5D3/1L5D3_22.jpeg", + "1L5D3/1L5D3_23.jpeg", + "1L5D3/1L5D3_24.jpeg", + "1L5D3/1L5D3_25.jpeg", + "1L5D3/1L5D3_26.jpeg", + "1L5D3/1L5D3_27.jpeg", + "1L5D3/1L5D3_28.jpeg", + "1L5D3/1L5D3_29.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 95, + "question": "Analyze the series of events in the provided pictures and answer the related inquiry. You must choose your answer from the Choice List.\nWhat happened after the person sat at the table?\nChoice list: \nA. Threw the bag.\nB. Closed the closet/cabinet.\nC. Ate the sandwich.\nD. Opened the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "opened the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "threw the bag", + "closed the closet cabinet", + "ate the sandwich", + "opened the box" + ], + "image_quantity_level": "Medium", + "image": [ + "TVCQF/TVCQF_0.jpeg", + "TVCQF/TVCQF_1.jpeg", + "TVCQF/TVCQF_2.jpeg", + "TVCQF/TVCQF_3.jpeg", + "TVCQF/TVCQF_4.jpeg", + "TVCQF/TVCQF_5.jpeg", + "TVCQF/TVCQF_6.jpeg", + "TVCQF/TVCQF_7.jpeg", + "TVCQF/TVCQF_8.jpeg", + "TVCQF/TVCQF_9.jpeg", + "TVCQF/TVCQF_10.jpeg", + "TVCQF/TVCQF_11.jpeg", + "TVCQF/TVCQF_12.jpeg", + "TVCQF/TVCQF_13.jpeg", + "TVCQF/TVCQF_14.jpeg", + "TVCQF/TVCQF_15.jpeg", + "TVCQF/TVCQF_16.jpeg", + "TVCQF/TVCQF_17.jpeg", + "TVCQF/TVCQF_18.jpeg", + "TVCQF/TVCQF_19.jpeg", + "TVCQF/TVCQF_20.jpeg", + "TVCQF/TVCQF_21.jpeg", + "TVCQF/TVCQF_22.jpeg", + "TVCQF/TVCQF_23.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 98, + "question": "Review the timeline of actions in the supplied imagery and answer the associated inquiry. You must choose your answer from the Choice List.\nWhat happened after the person opened the box?\nChoice list: \nA. Put down the dish.\nB. Put down the shoe.\nC. Sat on the sofa/couch.\nD. Put down the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "put down the dish", + "put down the shoe", + "sat on the sofa couch", + "put down the food" + ], + "image_quantity_level": "Many", + "image": [ + "AK9IB/AK9IB_0.jpeg", + "AK9IB/AK9IB_1.jpeg", + "AK9IB/AK9IB_2.jpeg", + "AK9IB/AK9IB_3.jpeg", + "AK9IB/AK9IB_4.jpeg", + "AK9IB/AK9IB_5.jpeg", + "AK9IB/AK9IB_6.jpeg", + "AK9IB/AK9IB_7.jpeg", + "AK9IB/AK9IB_8.jpeg", + "AK9IB/AK9IB_9.jpeg", + "AK9IB/AK9IB_10.jpeg", + "AK9IB/AK9IB_11.jpeg", + "AK9IB/AK9IB_12.jpeg", + "AK9IB/AK9IB_13.jpeg", + "AK9IB/AK9IB_14.jpeg", + "AK9IB/AK9IB_15.jpeg", + "AK9IB/AK9IB_16.jpeg", + "AK9IB/AK9IB_17.jpeg", + "AK9IB/AK9IB_18.jpeg", + "AK9IB/AK9IB_19.jpeg", + "AK9IB/AK9IB_20.jpeg", + "AK9IB/AK9IB_21.jpeg", + "AK9IB/AK9IB_22.jpeg", + "AK9IB/AK9IB_23.jpeg", + "AK9IB/AK9IB_24.jpeg", + "AK9IB/AK9IB_25.jpeg", + "AK9IB/AK9IB_26.jpeg", + "AK9IB/AK9IB_27.jpeg", + "AK9IB/AK9IB_28.jpeg", + "AK9IB/AK9IB_29.jpeg", + "AK9IB/AK9IB_30.jpeg", + "AK9IB/AK9IB_31.jpeg", + "AK9IB/AK9IB_32.jpeg", + "AK9IB/AK9IB_33.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 194, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened after the person put down the shoe?\nChoice list: \nA. Took the paper/notebook.\nB. Took the broom.\nC. Opened the laptop.\nD. Closed the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "closed the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "took the paper notebook", + "took the broom", + "opened the laptop", + "closed the box" + ], + "image_quantity_level": "Many", + "image": [ + "AK9IB/AK9IB_0.jpeg", + "AK9IB/AK9IB_1.jpeg", + "AK9IB/AK9IB_2.jpeg", + "AK9IB/AK9IB_3.jpeg", + "AK9IB/AK9IB_4.jpeg", + "AK9IB/AK9IB_5.jpeg", + "AK9IB/AK9IB_6.jpeg", + "AK9IB/AK9IB_7.jpeg", + "AK9IB/AK9IB_8.jpeg", + "AK9IB/AK9IB_9.jpeg", + "AK9IB/AK9IB_10.jpeg", + "AK9IB/AK9IB_11.jpeg", + "AK9IB/AK9IB_12.jpeg", + "AK9IB/AK9IB_13.jpeg", + "AK9IB/AK9IB_14.jpeg", + "AK9IB/AK9IB_15.jpeg", + "AK9IB/AK9IB_16.jpeg", + "AK9IB/AK9IB_17.jpeg", + "AK9IB/AK9IB_18.jpeg", + "AK9IB/AK9IB_19.jpeg", + "AK9IB/AK9IB_20.jpeg", + "AK9IB/AK9IB_21.jpeg", + "AK9IB/AK9IB_22.jpeg", + "AK9IB/AK9IB_23.jpeg", + "AK9IB/AK9IB_24.jpeg", + "AK9IB/AK9IB_25.jpeg", + "AK9IB/AK9IB_26.jpeg", + "AK9IB/AK9IB_27.jpeg", + "AK9IB/AK9IB_28.jpeg", + "AK9IB/AK9IB_29.jpeg", + "AK9IB/AK9IB_30.jpeg", + "AK9IB/AK9IB_31.jpeg", + "AK9IB/AK9IB_32.jpeg", + "AK9IB/AK9IB_33.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 183, + "question": "Review the timeline of actions in the supplied imagery and answer the associated inquiry. You must choose your answer from the Choice List.\nWhat happened after the person held the laptop?\nChoice list: \nA. Put down the paper/notebook.\nB. Sat on the bed.\nC. Threw the blanket.\nD. Opened the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the paper notebook", + "sat on the bed", + "threw the blanket", + "opened the bag" + ], + "image_quantity_level": "Medium", + "image": [ + "M7BD4/M7BD4_0.jpeg", + "M7BD4/M7BD4_1.jpeg", + "M7BD4/M7BD4_2.jpeg", + "M7BD4/M7BD4_3.jpeg", + "M7BD4/M7BD4_4.jpeg", + "M7BD4/M7BD4_5.jpeg", + "M7BD4/M7BD4_6.jpeg", + "M7BD4/M7BD4_7.jpeg", + "M7BD4/M7BD4_8.jpeg", + "M7BD4/M7BD4_9.jpeg", + "M7BD4/M7BD4_10.jpeg", + "M7BD4/M7BD4_11.jpeg", + "M7BD4/M7BD4_12.jpeg", + "M7BD4/M7BD4_13.jpeg", + "M7BD4/M7BD4_14.jpeg", + "M7BD4/M7BD4_15.jpeg", + "M7BD4/M7BD4_16.jpeg", + "M7BD4/M7BD4_17.jpeg", + "M7BD4/M7BD4_18.jpeg", + "M7BD4/M7BD4_19.jpeg", + "M7BD4/M7BD4_20.jpeg", + "M7BD4/M7BD4_21.jpeg", + "M7BD4/M7BD4_22.jpeg", + "M7BD4/M7BD4_23.jpeg", + "M7BD4/M7BD4_24.jpeg", + "M7BD4/M7BD4_25.jpeg", + "M7BD4/M7BD4_26.jpeg", + "M7BD4/M7BD4_27.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 178, + "question": "Examine the succession of events in the supplied visuals and answer the relevant query. You must choose your answer from the Choice List.\nWhat happened after the person opened the closet/cabinet?\nChoice list: \nA. Put down the towel.\nB. Put down the clothes.\nC. Put down the food.\nD. Threw the shoe.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the towel", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the towel", + "put down the clothes", + "put down the food", + "threw the shoe" + ], + "image_quantity_level": "Medium", + "image": [ + "ANA5N/ANA5N_0.jpeg", + "ANA5N/ANA5N_1.jpeg", + "ANA5N/ANA5N_2.jpeg", + "ANA5N/ANA5N_3.jpeg", + "ANA5N/ANA5N_4.jpeg", + "ANA5N/ANA5N_5.jpeg", + "ANA5N/ANA5N_6.jpeg", + "ANA5N/ANA5N_7.jpeg", + "ANA5N/ANA5N_8.jpeg", + "ANA5N/ANA5N_9.jpeg", + "ANA5N/ANA5N_10.jpeg", + "ANA5N/ANA5N_11.jpeg", + "ANA5N/ANA5N_12.jpeg", + "ANA5N/ANA5N_13.jpeg", + "ANA5N/ANA5N_14.jpeg", + "ANA5N/ANA5N_15.jpeg", + "ANA5N/ANA5N_16.jpeg", + "ANA5N/ANA5N_17.jpeg", + "ANA5N/ANA5N_18.jpeg", + "ANA5N/ANA5N_19.jpeg", + "ANA5N/ANA5N_20.jpeg", + "ANA5N/ANA5N_21.jpeg", + "ANA5N/ANA5N_22.jpeg", + "ANA5N/ANA5N_23.jpeg", + "ANA5N/ANA5N_24.jpeg", + "ANA5N/ANA5N_25.jpeg", + "ANA5N/ANA5N_26.jpeg", + "ANA5N/ANA5N_27.jpeg", + "ANA5N/ANA5N_28.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 177, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened before the person opened the door?\nChoice list: \nA. Put down the paper/notebook.\nB. Tidied up the blanket.\nC. Sat on the floor.\nD. Took the shoe.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sat on the floor", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the paper notebook", + "tidied up the blanket", + "sat on the floor", + "took the shoe" + ], + "image_quantity_level": "Medium", + "image": [ + "HCM5Y/HCM5Y_0.jpeg", + "HCM5Y/HCM5Y_1.jpeg", + "HCM5Y/HCM5Y_2.jpeg", + "HCM5Y/HCM5Y_3.jpeg", + "HCM5Y/HCM5Y_4.jpeg", + "HCM5Y/HCM5Y_5.jpeg", + "HCM5Y/HCM5Y_6.jpeg", + "HCM5Y/HCM5Y_7.jpeg", + "HCM5Y/HCM5Y_8.jpeg", + "HCM5Y/HCM5Y_9.jpeg", + "HCM5Y/HCM5Y_10.jpeg", + "HCM5Y/HCM5Y_11.jpeg", + "HCM5Y/HCM5Y_12.jpeg", + "HCM5Y/HCM5Y_13.jpeg", + "HCM5Y/HCM5Y_14.jpeg", + "HCM5Y/HCM5Y_15.jpeg", + "HCM5Y/HCM5Y_16.jpeg", + "HCM5Y/HCM5Y_17.jpeg", + "HCM5Y/HCM5Y_18.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 197, + "question": "Analyze the series of events in the provided pictures and answer the related inquiry. You must choose your answer from the Choice List.\nWhat happened after the person held the clothes?\nChoice list: \nA. Took the bag.\nB. Closed the door.\nC. Opened the bag.\nD. Sat on the bed.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "opened the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "took the bag", + "closed the door", + "opened the bag", + "sat on the bed" + ], + "image_quantity_level": "Many", + "image": [ + "S444Y/S444Y_0.jpeg", + "S444Y/S444Y_1.jpeg", + "S444Y/S444Y_2.jpeg", + "S444Y/S444Y_3.jpeg", + "S444Y/S444Y_4.jpeg", + "S444Y/S444Y_5.jpeg", + "S444Y/S444Y_6.jpeg", + "S444Y/S444Y_7.jpeg", + "S444Y/S444Y_8.jpeg", + "S444Y/S444Y_9.jpeg", + "S444Y/S444Y_10.jpeg", + "S444Y/S444Y_11.jpeg", + "S444Y/S444Y_12.jpeg", + "S444Y/S444Y_13.jpeg", + "S444Y/S444Y_14.jpeg", + "S444Y/S444Y_15.jpeg", + "S444Y/S444Y_16.jpeg", + "S444Y/S444Y_17.jpeg", + "S444Y/S444Y_18.jpeg", + "S444Y/S444Y_19.jpeg", + "S444Y/S444Y_20.jpeg", + "S444Y/S444Y_21.jpeg", + "S444Y/S444Y_22.jpeg", + "S444Y/S444Y_23.jpeg", + "S444Y/S444Y_24.jpeg", + "S444Y/S444Y_25.jpeg", + "S444Y/S444Y_26.jpeg", + "S444Y/S444Y_27.jpeg", + "S444Y/S444Y_28.jpeg", + "S444Y/S444Y_29.jpeg", + "S444Y/S444Y_30.jpeg", + "S444Y/S444Y_31.jpeg", + "S444Y/S444Y_32.jpeg", + "S444Y/S444Y_33.jpeg", + "S444Y/S444Y_34.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 149, + "question": "Investigate the order of events in the given graphics and reply to the corresponding inquiry. You must choose your answer from the Choice List.\nWhat happened after the person took the food?\nChoice list: \nA. Closed the box.\nB. Tidied up the closet/cabinet.\nC. Sat on the sofa/couch.\nD. Opened the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "tidied up the closet cabinet", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "closed the box", + "tidied up the closet cabinet", + "sat on the sofa couch", + "opened the book" + ], + "image_quantity_level": "Many", + "image": [ + "M8OYC/M8OYC_0.jpeg", + "M8OYC/M8OYC_1.jpeg", + "M8OYC/M8OYC_2.jpeg", + "M8OYC/M8OYC_3.jpeg", + "M8OYC/M8OYC_4.jpeg", + "M8OYC/M8OYC_5.jpeg", + "M8OYC/M8OYC_6.jpeg", + "M8OYC/M8OYC_7.jpeg", + "M8OYC/M8OYC_8.jpeg", + "M8OYC/M8OYC_9.jpeg", + "M8OYC/M8OYC_10.jpeg", + "M8OYC/M8OYC_11.jpeg", + "M8OYC/M8OYC_12.jpeg", + "M8OYC/M8OYC_13.jpeg", + "M8OYC/M8OYC_14.jpeg", + "M8OYC/M8OYC_15.jpeg", + "M8OYC/M8OYC_16.jpeg", + "M8OYC/M8OYC_17.jpeg", + "M8OYC/M8OYC_18.jpeg", + "M8OYC/M8OYC_19.jpeg", + "M8OYC/M8OYC_20.jpeg", + "M8OYC/M8OYC_21.jpeg", + "M8OYC/M8OYC_22.jpeg", + "M8OYC/M8OYC_23.jpeg", + "M8OYC/M8OYC_24.jpeg", + "M8OYC/M8OYC_25.jpeg", + "M8OYC/M8OYC_26.jpeg", + "M8OYC/M8OYC_27.jpeg", + "M8OYC/M8OYC_28.jpeg", + "M8OYC/M8OYC_29.jpeg", + "M8OYC/M8OYC_30.jpeg", + "M8OYC/M8OYC_31.jpeg", + "M8OYC/M8OYC_32.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 195, + "question": "Evaluate the sequence of actions depicted in the images and provide an answer to the associated question. You must choose your answer from the Choice List.\nWhat happened before the person closed the door?\nChoice list: \nA. Lied on the bed.\nB. Took the cup/glass/bottle.\nC. Put down the blanket.\nD. Closed the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "lied on the bed", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "lied on the bed", + "took the cup glass bottle", + "put down the blanket", + "closed the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "E002I/E002I_0.jpeg", + "E002I/E002I_1.jpeg", + "E002I/E002I_2.jpeg", + "E002I/E002I_3.jpeg", + "E002I/E002I_4.jpeg", + "E002I/E002I_5.jpeg", + "E002I/E002I_6.jpeg", + "E002I/E002I_7.jpeg", + "E002I/E002I_8.jpeg", + "E002I/E002I_9.jpeg", + "E002I/E002I_10.jpeg", + "E002I/E002I_11.jpeg", + "E002I/E002I_12.jpeg", + "E002I/E002I_13.jpeg", + "E002I/E002I_14.jpeg", + "E002I/E002I_15.jpeg", + "E002I/E002I_16.jpeg", + "E002I/E002I_17.jpeg", + "E002I/E002I_18.jpeg", + "E002I/E002I_19.jpeg", + "E002I/E002I_20.jpeg", + "E002I/E002I_21.jpeg", + "E002I/E002I_22.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 181, + "question": "Based on the provided images, answer the question related to the sequence of action You must choose your answer from the Choice List.\nWhat happened before the person closed the book?\nChoice list: \nA. Put down the phone/camera.\nB. Ate the medicine.\nC. Put down the blanket.\nD. Tidied up the table.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down the phone camera", + "ate the medicine", + "put down the blanket", + "tidied up the table" + ], + "image_quantity_level": "Medium", + "image": [ + "GMMVC/GMMVC_0.jpeg", + "GMMVC/GMMVC_1.jpeg", + "GMMVC/GMMVC_2.jpeg", + "GMMVC/GMMVC_3.jpeg", + "GMMVC/GMMVC_4.jpeg", + "GMMVC/GMMVC_5.jpeg", + "GMMVC/GMMVC_6.jpeg", + "GMMVC/GMMVC_7.jpeg", + "GMMVC/GMMVC_8.jpeg", + "GMMVC/GMMVC_9.jpeg", + "GMMVC/GMMVC_10.jpeg", + "GMMVC/GMMVC_11.jpeg", + "GMMVC/GMMVC_12.jpeg", + "GMMVC/GMMVC_13.jpeg", + "GMMVC/GMMVC_14.jpeg", + "GMMVC/GMMVC_15.jpeg", + "GMMVC/GMMVC_16.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 186, + "question": "Scrutinize the chain of actions in the given visuals and respond to the related question. You must choose your answer from the Choice List.\nWhat happened before the person sat on the sofa/couch?\nChoice list: \nA. Closed the refrigerator.\nB. Put down the towel.\nC. Took the shoe.\nD. Sat on the table.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "took the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "closed the refrigerator", + "put down the towel", + "took the shoe", + "sat on the table" + ], + "image_quantity_level": "Medium", + "image": [ + "86GSE/86GSE_0.jpeg", + "86GSE/86GSE_1.jpeg", + "86GSE/86GSE_2.jpeg", + "86GSE/86GSE_3.jpeg", + "86GSE/86GSE_4.jpeg", + "86GSE/86GSE_5.jpeg", + "86GSE/86GSE_6.jpeg", + "86GSE/86GSE_7.jpeg", + "86GSE/86GSE_8.jpeg", + "86GSE/86GSE_9.jpeg", + "86GSE/86GSE_10.jpeg", + "86GSE/86GSE_11.jpeg", + "86GSE/86GSE_12.jpeg", + "86GSE/86GSE_13.jpeg", + "86GSE/86GSE_14.jpeg", + "86GSE/86GSE_15.jpeg", + "86GSE/86GSE_16.jpeg", + "86GSE/86GSE_17.jpeg" + ], + "extracted": "C", + "result": 1 + } +] \ No newline at end of file