diff --git "a/eval_milebench/ActionPrediction/pred_with_extracted.json" "b/eval_milebench/ActionPrediction/pred_with_extracted.json" new file mode 100644--- /dev/null +++ "b/eval_milebench/ActionPrediction/pred_with_extracted.json" @@ -0,0 +1,10699 @@ +[ + { + "sample_id": 0, + "question": "Infer the next action of the person from the presented images. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the pillow.\nB. Open the door.\nC. Take the book.\nD. Open the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the pillow", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the pillow", + "open the door", + "take the book", + "open the closet cabinet" + ], + "image_quantity_level": "Many", + "image": [ + "AJTDO/AJTDO_0.jpeg", + "AJTDO/AJTDO_1.jpeg", + "AJTDO/AJTDO_2.jpeg", + "AJTDO/AJTDO_3.jpeg", + "AJTDO/AJTDO_4.jpeg", + "AJTDO/AJTDO_5.jpeg", + "AJTDO/AJTDO_6.jpeg", + "AJTDO/AJTDO_7.jpeg", + "AJTDO/AJTDO_8.jpeg", + "AJTDO/AJTDO_9.jpeg", + "AJTDO/AJTDO_10.jpeg", + "AJTDO/AJTDO_11.jpeg", + "AJTDO/AJTDO_12.jpeg", + "AJTDO/AJTDO_13.jpeg", + "AJTDO/AJTDO_14.jpeg", + "AJTDO/AJTDO_15.jpeg", + "AJTDO/AJTDO_16.jpeg", + "AJTDO/AJTDO_17.jpeg", + "AJTDO/AJTDO_18.jpeg", + "AJTDO/AJTDO_19.jpeg", + "AJTDO/AJTDO_20.jpeg", + "AJTDO/AJTDO_21.jpeg", + "AJTDO/AJTDO_22.jpeg", + "AJTDO/AJTDO_23.jpeg", + "AJTDO/AJTDO_24.jpeg", + "AJTDO/AJTDO_25.jpeg", + "AJTDO/AJTDO_26.jpeg", + "AJTDO/AJTDO_27.jpeg", + "AJTDO/AJTDO_28.jpeg", + "AJTDO/AJTDO_29.jpeg", + "AJTDO/AJTDO_30.jpeg", + "AJTDO/AJTDO_31.jpeg", + "AJTDO/AJTDO_32.jpeg", + "AJTDO/AJTDO_33.jpeg", + "AJTDO/AJTDO_34.jpeg", + "AJTDO/AJTDO_35.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 56, + "question": "Infer the next action of the person from the presented images. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Tidy up the clothes.\nB. Eat the sandwich.\nC. Put down the phone/camera.\nD. Throw the pillow.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "throw the pillow", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "tidy up the clothes", + "eat the sandwich", + "put down the phone camera", + "throw the pillow" + ], + "image_quantity_level": "Many", + "image": [ + "AJTDO/AJTDO_0.jpeg", + "AJTDO/AJTDO_1.jpeg", + "AJTDO/AJTDO_2.jpeg", + "AJTDO/AJTDO_3.jpeg", + "AJTDO/AJTDO_4.jpeg", + "AJTDO/AJTDO_5.jpeg", + "AJTDO/AJTDO_6.jpeg", + "AJTDO/AJTDO_7.jpeg", + "AJTDO/AJTDO_8.jpeg", + "AJTDO/AJTDO_9.jpeg", + "AJTDO/AJTDO_10.jpeg", + "AJTDO/AJTDO_11.jpeg", + "AJTDO/AJTDO_12.jpeg", + "AJTDO/AJTDO_13.jpeg", + "AJTDO/AJTDO_14.jpeg", + "AJTDO/AJTDO_15.jpeg", + "AJTDO/AJTDO_16.jpeg", + "AJTDO/AJTDO_17.jpeg", + "AJTDO/AJTDO_18.jpeg", + "AJTDO/AJTDO_19.jpeg", + "AJTDO/AJTDO_20.jpeg", + "AJTDO/AJTDO_21.jpeg", + "AJTDO/AJTDO_22.jpeg", + "AJTDO/AJTDO_23.jpeg", + "AJTDO/AJTDO_24.jpeg", + "AJTDO/AJTDO_25.jpeg", + "AJTDO/AJTDO_26.jpeg", + "AJTDO/AJTDO_27.jpeg", + "AJTDO/AJTDO_28.jpeg", + "AJTDO/AJTDO_29.jpeg", + "AJTDO/AJTDO_30.jpeg", + "AJTDO/AJTDO_31.jpeg", + "AJTDO/AJTDO_32.jpeg", + "AJTDO/AJTDO_33.jpeg", + "AJTDO/AJTDO_34.jpeg", + "AJTDO/AJTDO_35.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 1, + "question": "Analyze the provided visuals and forecast the individual's subsequent move. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Sit on the floor.\nB. Put down the cup/glass/bottle.\nC. Take the book.\nD. Open the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "open the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "sit on the floor", + "put down the cup glass bottle", + "take the book", + "open the book" + ], + "image_quantity_level": "Medium", + "image": [ + "RJCNV/RJCNV_0.jpeg", + "RJCNV/RJCNV_1.jpeg", + "RJCNV/RJCNV_2.jpeg", + "RJCNV/RJCNV_3.jpeg", + "RJCNV/RJCNV_4.jpeg", + "RJCNV/RJCNV_5.jpeg", + "RJCNV/RJCNV_6.jpeg", + "RJCNV/RJCNV_7.jpeg", + "RJCNV/RJCNV_8.jpeg", + "RJCNV/RJCNV_9.jpeg", + "RJCNV/RJCNV_10.jpeg", + "RJCNV/RJCNV_11.jpeg", + "RJCNV/RJCNV_12.jpeg", + "RJCNV/RJCNV_13.jpeg", + "RJCNV/RJCNV_14.jpeg", + "RJCNV/RJCNV_15.jpeg", + "RJCNV/RJCNV_16.jpeg", + "RJCNV/RJCNV_17.jpeg", + "RJCNV/RJCNV_18.jpeg", + "RJCNV/RJCNV_19.jpeg", + "RJCNV/RJCNV_20.jpeg", + "RJCNV/RJCNV_21.jpeg", + "RJCNV/RJCNV_22.jpeg", + "RJCNV/RJCNV_23.jpeg", + "RJCNV/RJCNV_24.jpeg", + "RJCNV/RJCNV_25.jpeg", + "RJCNV/RJCNV_26.jpeg", + "RJCNV/RJCNV_27.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 16, + "question": "Evaluate the given images and infer the individual's next step. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the towel.\nB. Close the laptop.\nC. Close the door.\nD. Hold the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "close the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down the towel", + "close the laptop", + "close the door", + "hold the book" + ], + "image_quantity_level": "Medium", + "image": [ + "M7BD4/M7BD4_0.jpeg", + "M7BD4/M7BD4_1.jpeg", + "M7BD4/M7BD4_2.jpeg", + "M7BD4/M7BD4_3.jpeg", + "M7BD4/M7BD4_4.jpeg", + "M7BD4/M7BD4_5.jpeg", + "M7BD4/M7BD4_6.jpeg", + "M7BD4/M7BD4_7.jpeg", + "M7BD4/M7BD4_8.jpeg", + "M7BD4/M7BD4_9.jpeg", + "M7BD4/M7BD4_10.jpeg", + "M7BD4/M7BD4_11.jpeg", + "M7BD4/M7BD4_12.jpeg", + "M7BD4/M7BD4_13.jpeg", + "M7BD4/M7BD4_14.jpeg", + "M7BD4/M7BD4_15.jpeg", + "M7BD4/M7BD4_16.jpeg", + "M7BD4/M7BD4_17.jpeg", + "M7BD4/M7BD4_18.jpeg", + "M7BD4/M7BD4_19.jpeg", + "M7BD4/M7BD4_20.jpeg", + "M7BD4/M7BD4_21.jpeg", + "M7BD4/M7BD4_22.jpeg", + "M7BD4/M7BD4_23.jpeg", + "M7BD4/M7BD4_24.jpeg", + "M7BD4/M7BD4_25.jpeg", + "M7BD4/M7BD4_26.jpeg", + "M7BD4/M7BD4_27.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 33, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the door.\nB. Take the laptop.\nC. Take the cup/glass/bottle.\nD. Throw the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "open the door", + "take the laptop", + "take the cup glass bottle", + "throw the food" + ], + "image_quantity_level": "Medium", + "image": [ + "NNG97/NNG97_0.jpeg", + "NNG97/NNG97_1.jpeg", + "NNG97/NNG97_2.jpeg", + "NNG97/NNG97_3.jpeg", + "NNG97/NNG97_4.jpeg", + "NNG97/NNG97_5.jpeg", + "NNG97/NNG97_6.jpeg", + "NNG97/NNG97_7.jpeg", + "NNG97/NNG97_8.jpeg", + "NNG97/NNG97_9.jpeg", + "NNG97/NNG97_10.jpeg", + "NNG97/NNG97_11.jpeg", + "NNG97/NNG97_12.jpeg", + "NNG97/NNG97_13.jpeg", + "NNG97/NNG97_14.jpeg", + "NNG97/NNG97_15.jpeg", + "NNG97/NNG97_16.jpeg", + "NNG97/NNG97_17.jpeg", + "NNG97/NNG97_18.jpeg", + "NNG97/NNG97_19.jpeg", + "NNG97/NNG97_20.jpeg", + "NNG97/NNG97_21.jpeg", + "NNG97/NNG97_22.jpeg", + "NNG97/NNG97_23.jpeg", + "NNG97/NNG97_24.jpeg", + "NNG97/NNG97_25.jpeg", + "NNG97/NNG97_26.jpeg", + "NNG97/NNG97_27.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 2, + "question": "Analyze the provided visuals and forecast the individual's subsequent move. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the cup/glass/bottle.\nB. Take the book.\nC. Eat the sandwich.\nD. Put down the dish.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the cup glass bottle", + "take the book", + "eat the sandwich", + "put down the dish" + ], + "image_quantity_level": "Medium", + "image": [ + "X8JVY/X8JVY_0.jpeg", + "X8JVY/X8JVY_1.jpeg", + "X8JVY/X8JVY_2.jpeg", + "X8JVY/X8JVY_3.jpeg", + "X8JVY/X8JVY_4.jpeg", + "X8JVY/X8JVY_5.jpeg", + "X8JVY/X8JVY_6.jpeg", + "X8JVY/X8JVY_7.jpeg", + "X8JVY/X8JVY_8.jpeg", + "X8JVY/X8JVY_9.jpeg", + "X8JVY/X8JVY_10.jpeg", + "X8JVY/X8JVY_11.jpeg", + "X8JVY/X8JVY_12.jpeg", + "X8JVY/X8JVY_13.jpeg", + "X8JVY/X8JVY_14.jpeg", + "X8JVY/X8JVY_15.jpeg", + "X8JVY/X8JVY_16.jpeg", + "X8JVY/X8JVY_17.jpeg", + "X8JVY/X8JVY_18.jpeg", + "X8JVY/X8JVY_19.jpeg", + "X8JVY/X8JVY_20.jpeg", + "X8JVY/X8JVY_21.jpeg", + "X8JVY/X8JVY_22.jpeg", + "X8JVY/X8JVY_23.jpeg", + "X8JVY/X8JVY_24.jpeg", + "X8JVY/X8JVY_25.jpeg", + "X8JVY/X8JVY_26.jpeg", + "X8JVY/X8JVY_27.jpeg", + "X8JVY/X8JVY_28.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 23, + "question": "Based on the given images, prediction the person's next action You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the laptop.\nB. Lie on the sofa/couch.\nC. Close the box.\nD. Take the phone/camera.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "open the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "open the laptop", + "lie on the sofa couch", + "close the box", + "take the phone camera" + ], + "image_quantity_level": "Medium", + "image": [ + "3XKBC/3XKBC_0.jpeg", + "3XKBC/3XKBC_1.jpeg", + "3XKBC/3XKBC_2.jpeg", + "3XKBC/3XKBC_3.jpeg", + "3XKBC/3XKBC_4.jpeg", + "3XKBC/3XKBC_5.jpeg", + "3XKBC/3XKBC_6.jpeg", + "3XKBC/3XKBC_7.jpeg", + "3XKBC/3XKBC_8.jpeg", + "3XKBC/3XKBC_9.jpeg", + "3XKBC/3XKBC_10.jpeg", + "3XKBC/3XKBC_11.jpeg", + "3XKBC/3XKBC_12.jpeg", + "3XKBC/3XKBC_13.jpeg", + "3XKBC/3XKBC_14.jpeg", + "3XKBC/3XKBC_15.jpeg", + "3XKBC/3XKBC_16.jpeg", + "3XKBC/3XKBC_17.jpeg", + "3XKBC/3XKBC_18.jpeg", + "3XKBC/3XKBC_19.jpeg", + "3XKBC/3XKBC_20.jpeg", + "3XKBC/3XKBC_21.jpeg", + "3XKBC/3XKBC_22.jpeg", + "3XKBC/3XKBC_23.jpeg", + "3XKBC/3XKBC_24.jpeg", + "3XKBC/3XKBC_25.jpeg", + "3XKBC/3XKBC_26.jpeg", + "3XKBC/3XKBC_27.jpeg", + "3XKBC/3XKBC_28.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 25, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Close the box.\nB. Put down the food.\nC. Take the paper/notebook.\nD. Open the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "open the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "close the box", + "put down the food", + "take the paper notebook", + "open the bag" + ], + "image_quantity_level": "Medium", + "image": [ + "DQEC3/DQEC3_0.jpeg", + "DQEC3/DQEC3_1.jpeg", + "DQEC3/DQEC3_2.jpeg", + "DQEC3/DQEC3_3.jpeg", + "DQEC3/DQEC3_4.jpeg", + "DQEC3/DQEC3_5.jpeg", + "DQEC3/DQEC3_6.jpeg", + "DQEC3/DQEC3_7.jpeg", + "DQEC3/DQEC3_8.jpeg", + "DQEC3/DQEC3_9.jpeg", + "DQEC3/DQEC3_10.jpeg", + "DQEC3/DQEC3_11.jpeg", + "DQEC3/DQEC3_12.jpeg", + "DQEC3/DQEC3_13.jpeg", + "DQEC3/DQEC3_14.jpeg", + "DQEC3/DQEC3_15.jpeg", + "DQEC3/DQEC3_16.jpeg", + "DQEC3/DQEC3_17.jpeg", + "DQEC3/DQEC3_18.jpeg", + "DQEC3/DQEC3_19.jpeg", + "DQEC3/DQEC3_20.jpeg", + "DQEC3/DQEC3_21.jpeg", + "DQEC3/DQEC3_22.jpeg", + "DQEC3/DQEC3_23.jpeg", + "DQEC3/DQEC3_24.jpeg", + "DQEC3/DQEC3_25.jpeg", + "DQEC3/DQEC3_26.jpeg", + "DQEC3/DQEC3_27.jpeg", + "DQEC3/DQEC3_28.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 28, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next with the dish?\nChoice list: \nA. Wash.\nB. Put down.\nC. Hold.\nD. Take.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "wash", + "put down", + "hold", + "take" + ], + "image_quantity_level": "Medium", + "image": [ + "J6TVB/J6TVB_0.jpeg", + "J6TVB/J6TVB_1.jpeg", + "J6TVB/J6TVB_2.jpeg", + "J6TVB/J6TVB_3.jpeg", + "J6TVB/J6TVB_4.jpeg", + "J6TVB/J6TVB_5.jpeg", + "J6TVB/J6TVB_6.jpeg", + "J6TVB/J6TVB_7.jpeg", + "J6TVB/J6TVB_8.jpeg", + "J6TVB/J6TVB_9.jpeg", + "J6TVB/J6TVB_10.jpeg", + "J6TVB/J6TVB_11.jpeg", + "J6TVB/J6TVB_12.jpeg", + "J6TVB/J6TVB_13.jpeg", + "J6TVB/J6TVB_14.jpeg", + "J6TVB/J6TVB_15.jpeg", + "J6TVB/J6TVB_16.jpeg", + "J6TVB/J6TVB_17.jpeg", + "J6TVB/J6TVB_18.jpeg", + "J6TVB/J6TVB_19.jpeg", + "J6TVB/J6TVB_20.jpeg", + "J6TVB/J6TVB_21.jpeg", + "J6TVB/J6TVB_22.jpeg", + "J6TVB/J6TVB_23.jpeg", + "J6TVB/J6TVB_24.jpeg", + "J6TVB/J6TVB_25.jpeg", + "J6TVB/J6TVB_26.jpeg", + "J6TVB/J6TVB_27.jpeg", + "J6TVB/J6TVB_28.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 3, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Tidy up the broom.\nB. Put down the clothes.\nC. Put down the food.\nD. Throw the blanket.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "throw the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "tidy up the broom", + "put down the clothes", + "put down the food", + "throw the blanket" + ], + "image_quantity_level": "Many", + "image": [ + "3CAPI/3CAPI_0.jpeg", + "3CAPI/3CAPI_1.jpeg", + "3CAPI/3CAPI_2.jpeg", + "3CAPI/3CAPI_3.jpeg", + "3CAPI/3CAPI_4.jpeg", + "3CAPI/3CAPI_5.jpeg", + "3CAPI/3CAPI_6.jpeg", + "3CAPI/3CAPI_7.jpeg", + "3CAPI/3CAPI_8.jpeg", + "3CAPI/3CAPI_9.jpeg", + "3CAPI/3CAPI_10.jpeg", + "3CAPI/3CAPI_11.jpeg", + "3CAPI/3CAPI_12.jpeg", + "3CAPI/3CAPI_13.jpeg", + "3CAPI/3CAPI_14.jpeg", + "3CAPI/3CAPI_15.jpeg", + "3CAPI/3CAPI_16.jpeg", + "3CAPI/3CAPI_17.jpeg", + "3CAPI/3CAPI_18.jpeg", + "3CAPI/3CAPI_19.jpeg", + "3CAPI/3CAPI_20.jpeg", + "3CAPI/3CAPI_21.jpeg", + "3CAPI/3CAPI_22.jpeg", + "3CAPI/3CAPI_23.jpeg", + "3CAPI/3CAPI_24.jpeg", + "3CAPI/3CAPI_25.jpeg", + "3CAPI/3CAPI_26.jpeg", + "3CAPI/3CAPI_27.jpeg", + "3CAPI/3CAPI_28.jpeg", + "3CAPI/3CAPI_29.jpeg", + "3CAPI/3CAPI_30.jpeg", + "3CAPI/3CAPI_31.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 18, + "question": "Based on the given images, prediction the person's next action You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the door.\nB. Sit at the table.\nC. Take the food.\nD. Take the dish.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sit at the table", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "open the door", + "sit at the table", + "take the food", + "take the dish" + ], + "image_quantity_level": "Many", + "image": [ + "M6ERH/M6ERH_0.jpeg", + "M6ERH/M6ERH_1.jpeg", + "M6ERH/M6ERH_2.jpeg", + "M6ERH/M6ERH_3.jpeg", + "M6ERH/M6ERH_4.jpeg", + "M6ERH/M6ERH_5.jpeg", + "M6ERH/M6ERH_6.jpeg", + "M6ERH/M6ERH_7.jpeg", + "M6ERH/M6ERH_8.jpeg", + "M6ERH/M6ERH_9.jpeg", + "M6ERH/M6ERH_10.jpeg", + "M6ERH/M6ERH_11.jpeg", + "M6ERH/M6ERH_12.jpeg", + "M6ERH/M6ERH_13.jpeg", + "M6ERH/M6ERH_14.jpeg", + "M6ERH/M6ERH_15.jpeg", + "M6ERH/M6ERH_16.jpeg", + "M6ERH/M6ERH_17.jpeg", + "M6ERH/M6ERH_18.jpeg", + "M6ERH/M6ERH_19.jpeg", + "M6ERH/M6ERH_20.jpeg", + "M6ERH/M6ERH_21.jpeg", + "M6ERH/M6ERH_22.jpeg", + "M6ERH/M6ERH_23.jpeg", + "M6ERH/M6ERH_24.jpeg", + "M6ERH/M6ERH_25.jpeg", + "M6ERH/M6ERH_26.jpeg", + "M6ERH/M6ERH_27.jpeg", + "M6ERH/M6ERH_28.jpeg", + "M6ERH/M6ERH_29.jpeg", + "M6ERH/M6ERH_30.jpeg", + "M6ERH/M6ERH_31.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 37, + "question": "Infer the next action of the person from the presented images. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the food.\nB. Close the door.\nC. Close the window.\nD. Open the window.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "close the window", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "take the food", + "close the door", + "close the window", + "open the window" + ], + "image_quantity_level": "Many", + "image": [ + "3B81O/3B81O_0.jpeg", + "3B81O/3B81O_1.jpeg", + "3B81O/3B81O_2.jpeg", + "3B81O/3B81O_3.jpeg", + "3B81O/3B81O_4.jpeg", + "3B81O/3B81O_5.jpeg", + "3B81O/3B81O_6.jpeg", + "3B81O/3B81O_7.jpeg", + "3B81O/3B81O_8.jpeg", + "3B81O/3B81O_9.jpeg", + "3B81O/3B81O_10.jpeg", + "3B81O/3B81O_11.jpeg", + "3B81O/3B81O_12.jpeg", + "3B81O/3B81O_13.jpeg", + "3B81O/3B81O_14.jpeg", + "3B81O/3B81O_15.jpeg", + "3B81O/3B81O_16.jpeg", + "3B81O/3B81O_17.jpeg", + "3B81O/3B81O_18.jpeg", + "3B81O/3B81O_19.jpeg", + "3B81O/3B81O_20.jpeg", + "3B81O/3B81O_21.jpeg", + "3B81O/3B81O_22.jpeg", + "3B81O/3B81O_23.jpeg", + "3B81O/3B81O_24.jpeg", + "3B81O/3B81O_25.jpeg", + "3B81O/3B81O_26.jpeg", + "3B81O/3B81O_27.jpeg", + "3B81O/3B81O_28.jpeg", + "3B81O/3B81O_29.jpeg", + "3B81O/3B81O_30.jpeg", + "3B81O/3B81O_31.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 42, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the dish.\nB. Take the food.\nC. Throw the pillow.\nD. Eat the medicine.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "throw the pillow", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down the dish", + "take the food", + "throw the pillow", + "eat the medicine" + ], + "image_quantity_level": "Many", + "image": [ + "M1GW9/M1GW9_0.jpeg", + "M1GW9/M1GW9_1.jpeg", + "M1GW9/M1GW9_2.jpeg", + "M1GW9/M1GW9_3.jpeg", + "M1GW9/M1GW9_4.jpeg", + "M1GW9/M1GW9_5.jpeg", + "M1GW9/M1GW9_6.jpeg", + "M1GW9/M1GW9_7.jpeg", + "M1GW9/M1GW9_8.jpeg", + "M1GW9/M1GW9_9.jpeg", + "M1GW9/M1GW9_10.jpeg", + "M1GW9/M1GW9_11.jpeg", + "M1GW9/M1GW9_12.jpeg", + "M1GW9/M1GW9_13.jpeg", + "M1GW9/M1GW9_14.jpeg", + "M1GW9/M1GW9_15.jpeg", + "M1GW9/M1GW9_16.jpeg", + "M1GW9/M1GW9_17.jpeg", + "M1GW9/M1GW9_18.jpeg", + "M1GW9/M1GW9_19.jpeg", + "M1GW9/M1GW9_20.jpeg", + "M1GW9/M1GW9_21.jpeg", + "M1GW9/M1GW9_22.jpeg", + "M1GW9/M1GW9_23.jpeg", + "M1GW9/M1GW9_24.jpeg", + "M1GW9/M1GW9_25.jpeg", + "M1GW9/M1GW9_26.jpeg", + "M1GW9/M1GW9_27.jpeg", + "M1GW9/M1GW9_28.jpeg", + "M1GW9/M1GW9_29.jpeg", + "M1GW9/M1GW9_30.jpeg", + "M1GW9/M1GW9_31.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 4, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Tidy up the table.\nB. Open the box.\nC. Tidy up the closet/cabinet.\nD. Hold the dish.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "tidy up the closet cabinet", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "tidy up the table", + "open the box", + "tidy up the closet cabinet", + "hold the dish" + ], + "image_quantity_level": "Medium", + "image": [ + "T2J3M/T2J3M_0.jpeg", + "T2J3M/T2J3M_1.jpeg", + "T2J3M/T2J3M_2.jpeg", + "T2J3M/T2J3M_3.jpeg", + "T2J3M/T2J3M_4.jpeg", + "T2J3M/T2J3M_5.jpeg", + "T2J3M/T2J3M_6.jpeg", + "T2J3M/T2J3M_7.jpeg", + "T2J3M/T2J3M_8.jpeg", + "T2J3M/T2J3M_9.jpeg", + "T2J3M/T2J3M_10.jpeg", + "T2J3M/T2J3M_11.jpeg", + "T2J3M/T2J3M_12.jpeg", + "T2J3M/T2J3M_13.jpeg", + "T2J3M/T2J3M_14.jpeg", + "T2J3M/T2J3M_15.jpeg", + "T2J3M/T2J3M_16.jpeg", + "T2J3M/T2J3M_17.jpeg", + "T2J3M/T2J3M_18.jpeg", + "T2J3M/T2J3M_19.jpeg", + "T2J3M/T2J3M_20.jpeg", + "T2J3M/T2J3M_21.jpeg", + "T2J3M/T2J3M_22.jpeg", + "T2J3M/T2J3M_23.jpeg", + "T2J3M/T2J3M_24.jpeg", + "T2J3M/T2J3M_25.jpeg", + "T2J3M/T2J3M_26.jpeg", + "T2J3M/T2J3M_27.jpeg", + "T2J3M/T2J3M_28.jpeg", + "T2J3M/T2J3M_29.jpeg", + "T2J3M/T2J3M_30.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 14, + "question": "With the given visuals as your guide, infer the individual's impending action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Eat the sandwich.\nB. Take the phone/camera.\nC. Take the blanket.\nD. Eat the medicine.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "eat the sandwich", + "take the phone camera", + "take the blanket", + "eat the medicine" + ], + "image_quantity_level": "Medium", + "image": [ + "7SXQS/7SXQS_0.jpeg", + "7SXQS/7SXQS_1.jpeg", + "7SXQS/7SXQS_2.jpeg", + "7SXQS/7SXQS_3.jpeg", + "7SXQS/7SXQS_4.jpeg", + "7SXQS/7SXQS_5.jpeg", + "7SXQS/7SXQS_6.jpeg", + "7SXQS/7SXQS_7.jpeg", + "7SXQS/7SXQS_8.jpeg", + "7SXQS/7SXQS_9.jpeg", + "7SXQS/7SXQS_10.jpeg", + "7SXQS/7SXQS_11.jpeg", + "7SXQS/7SXQS_12.jpeg", + "7SXQS/7SXQS_13.jpeg", + "7SXQS/7SXQS_14.jpeg", + "7SXQS/7SXQS_15.jpeg", + "7SXQS/7SXQS_16.jpeg", + "7SXQS/7SXQS_17.jpeg", + "7SXQS/7SXQS_18.jpeg", + "7SXQS/7SXQS_19.jpeg", + "7SXQS/7SXQS_20.jpeg", + "7SXQS/7SXQS_21.jpeg", + "7SXQS/7SXQS_22.jpeg", + "7SXQS/7SXQS_23.jpeg", + "7SXQS/7SXQS_24.jpeg", + "7SXQS/7SXQS_25.jpeg", + "7SXQS/7SXQS_26.jpeg", + "7SXQS/7SXQS_27.jpeg", + "7SXQS/7SXQS_28.jpeg", + "7SXQS/7SXQS_29.jpeg", + "7SXQS/7SXQS_30.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 21, + "question": "Infer the next action of the person from the presented images. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Hold the laptop.\nB. Sit on the sofa/couch.\nC. Take the shoe.\nD. Take the phone/camera.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "hold the laptop", + "sit on the sofa couch", + "take the shoe", + "take the phone camera" + ], + "image_quantity_level": "Medium", + "image": [ + "ZNQVC/ZNQVC_0.jpeg", + "ZNQVC/ZNQVC_1.jpeg", + "ZNQVC/ZNQVC_2.jpeg", + "ZNQVC/ZNQVC_3.jpeg", + "ZNQVC/ZNQVC_4.jpeg", + "ZNQVC/ZNQVC_5.jpeg", + "ZNQVC/ZNQVC_6.jpeg", + "ZNQVC/ZNQVC_7.jpeg", + "ZNQVC/ZNQVC_8.jpeg", + "ZNQVC/ZNQVC_9.jpeg", + "ZNQVC/ZNQVC_10.jpeg", + "ZNQVC/ZNQVC_11.jpeg", + "ZNQVC/ZNQVC_12.jpeg", + "ZNQVC/ZNQVC_13.jpeg", + "ZNQVC/ZNQVC_14.jpeg", + "ZNQVC/ZNQVC_15.jpeg", + "ZNQVC/ZNQVC_16.jpeg", + "ZNQVC/ZNQVC_17.jpeg", + "ZNQVC/ZNQVC_18.jpeg", + "ZNQVC/ZNQVC_19.jpeg", + "ZNQVC/ZNQVC_20.jpeg", + "ZNQVC/ZNQVC_21.jpeg", + "ZNQVC/ZNQVC_22.jpeg", + "ZNQVC/ZNQVC_23.jpeg", + "ZNQVC/ZNQVC_24.jpeg", + "ZNQVC/ZNQVC_25.jpeg", + "ZNQVC/ZNQVC_26.jpeg", + "ZNQVC/ZNQVC_27.jpeg", + "ZNQVC/ZNQVC_28.jpeg", + "ZNQVC/ZNQVC_29.jpeg", + "ZNQVC/ZNQVC_30.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 38, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Throw the bag.\nB. Put down the towel.\nC. Eat the sandwich.\nD. Sit at the table.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "throw the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "throw the bag", + "put down the towel", + "eat the sandwich", + "sit at the table" + ], + "image_quantity_level": "Medium", + "image": [ + "2LCLG/2LCLG_0.jpeg", + "2LCLG/2LCLG_1.jpeg", + "2LCLG/2LCLG_2.jpeg", + "2LCLG/2LCLG_3.jpeg", + "2LCLG/2LCLG_4.jpeg", + "2LCLG/2LCLG_5.jpeg", + "2LCLG/2LCLG_6.jpeg", + "2LCLG/2LCLG_7.jpeg", + "2LCLG/2LCLG_8.jpeg", + "2LCLG/2LCLG_9.jpeg", + "2LCLG/2LCLG_10.jpeg", + "2LCLG/2LCLG_11.jpeg", + "2LCLG/2LCLG_12.jpeg", + "2LCLG/2LCLG_13.jpeg", + "2LCLG/2LCLG_14.jpeg", + "2LCLG/2LCLG_15.jpeg", + "2LCLG/2LCLG_16.jpeg", + "2LCLG/2LCLG_17.jpeg", + "2LCLG/2LCLG_18.jpeg", + "2LCLG/2LCLG_19.jpeg", + "2LCLG/2LCLG_20.jpeg", + "2LCLG/2LCLG_21.jpeg", + "2LCLG/2LCLG_22.jpeg", + "2LCLG/2LCLG_23.jpeg", + "2LCLG/2LCLG_24.jpeg", + "2LCLG/2LCLG_25.jpeg", + "2LCLG/2LCLG_26.jpeg", + "2LCLG/2LCLG_27.jpeg", + "2LCLG/2LCLG_28.jpeg", + "2LCLG/2LCLG_29.jpeg", + "2LCLG/2LCLG_30.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 50, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the picture.\nB. Put down the phone/camera.\nC. Hold the box.\nD. Take the blanket.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the picture", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "take the picture", + "put down the phone camera", + "hold the box", + "take the blanket" + ], + "image_quantity_level": "Medium", + "image": [ + "IHSRC/IHSRC_0.jpeg", + "IHSRC/IHSRC_1.jpeg", + "IHSRC/IHSRC_2.jpeg", + "IHSRC/IHSRC_3.jpeg", + "IHSRC/IHSRC_4.jpeg", + "IHSRC/IHSRC_5.jpeg", + "IHSRC/IHSRC_6.jpeg", + "IHSRC/IHSRC_7.jpeg", + "IHSRC/IHSRC_8.jpeg", + "IHSRC/IHSRC_9.jpeg", + "IHSRC/IHSRC_10.jpeg", + "IHSRC/IHSRC_11.jpeg", + "IHSRC/IHSRC_12.jpeg", + "IHSRC/IHSRC_13.jpeg", + "IHSRC/IHSRC_14.jpeg", + "IHSRC/IHSRC_15.jpeg", + "IHSRC/IHSRC_16.jpeg", + "IHSRC/IHSRC_17.jpeg", + "IHSRC/IHSRC_18.jpeg", + "IHSRC/IHSRC_19.jpeg", + "IHSRC/IHSRC_20.jpeg", + "IHSRC/IHSRC_21.jpeg", + "IHSRC/IHSRC_22.jpeg", + "IHSRC/IHSRC_23.jpeg", + "IHSRC/IHSRC_24.jpeg", + "IHSRC/IHSRC_25.jpeg", + "IHSRC/IHSRC_26.jpeg", + "IHSRC/IHSRC_27.jpeg", + "IHSRC/IHSRC_28.jpeg", + "IHSRC/IHSRC_29.jpeg", + "IHSRC/IHSRC_30.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 57, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Eat the medicine.\nB. Put down the towel.\nC. Put down the paper/notebook.\nD. Open the refrigerator.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "eat the medicine", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "eat the medicine", + "put down the towel", + "put down the paper notebook", + "open the refrigerator" + ], + "image_quantity_level": "Medium", + "image": [ + "432NL/432NL_0.jpeg", + "432NL/432NL_1.jpeg", + "432NL/432NL_2.jpeg", + "432NL/432NL_3.jpeg", + "432NL/432NL_4.jpeg", + "432NL/432NL_5.jpeg", + "432NL/432NL_6.jpeg", + "432NL/432NL_7.jpeg", + "432NL/432NL_8.jpeg", + "432NL/432NL_9.jpeg", + "432NL/432NL_10.jpeg", + "432NL/432NL_11.jpeg", + "432NL/432NL_12.jpeg", + "432NL/432NL_13.jpeg", + "432NL/432NL_14.jpeg", + "432NL/432NL_15.jpeg", + "432NL/432NL_16.jpeg", + "432NL/432NL_17.jpeg", + "432NL/432NL_18.jpeg", + "432NL/432NL_19.jpeg", + "432NL/432NL_20.jpeg", + "432NL/432NL_21.jpeg", + "432NL/432NL_22.jpeg", + "432NL/432NL_23.jpeg", + "432NL/432NL_24.jpeg", + "432NL/432NL_25.jpeg", + "432NL/432NL_26.jpeg", + "432NL/432NL_27.jpeg", + "432NL/432NL_28.jpeg", + "432NL/432NL_29.jpeg", + "432NL/432NL_30.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 75, + "question": "Infer the next action of the person from the presented images. You must choose your answer from the Choice List.\nWhat will the person do next with the shoe?\nChoice list: \nA. Sit on.\nB. Take.\nC. Throw.\nD. Put down.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "sit on", + "take", + "throw", + "put down" + ], + "image_quantity_level": "Medium", + "image": [ + "NVBBD/NVBBD_0.jpeg", + "NVBBD/NVBBD_1.jpeg", + "NVBBD/NVBBD_2.jpeg", + "NVBBD/NVBBD_3.jpeg", + "NVBBD/NVBBD_4.jpeg", + "NVBBD/NVBBD_5.jpeg", + "NVBBD/NVBBD_6.jpeg", + "NVBBD/NVBBD_7.jpeg", + "NVBBD/NVBBD_8.jpeg", + "NVBBD/NVBBD_9.jpeg", + "NVBBD/NVBBD_10.jpeg", + "NVBBD/NVBBD_11.jpeg", + "NVBBD/NVBBD_12.jpeg", + "NVBBD/NVBBD_13.jpeg", + "NVBBD/NVBBD_14.jpeg", + "NVBBD/NVBBD_15.jpeg", + "NVBBD/NVBBD_16.jpeg", + "NVBBD/NVBBD_17.jpeg", + "NVBBD/NVBBD_18.jpeg", + "NVBBD/NVBBD_19.jpeg", + "NVBBD/NVBBD_20.jpeg", + "NVBBD/NVBBD_21.jpeg", + "NVBBD/NVBBD_22.jpeg", + "NVBBD/NVBBD_23.jpeg", + "NVBBD/NVBBD_24.jpeg", + "NVBBD/NVBBD_25.jpeg", + "NVBBD/NVBBD_26.jpeg", + "NVBBD/NVBBD_27.jpeg", + "NVBBD/NVBBD_28.jpeg", + "NVBBD/NVBBD_29.jpeg", + "NVBBD/NVBBD_30.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 5, + "question": "With the given visuals as your guide, infer the individual's impending action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the shoe.\nB. Sit on the table.\nC. Take the paper/notebook.\nD. Throw the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "take the shoe", + "sit on the table", + "take the paper notebook", + "throw the clothes" + ], + "image_quantity_level": "Many", + "image": [ + "727IZ/727IZ_0.jpeg", + "727IZ/727IZ_1.jpeg", + "727IZ/727IZ_2.jpeg", + "727IZ/727IZ_3.jpeg", + "727IZ/727IZ_4.jpeg", + "727IZ/727IZ_5.jpeg", + "727IZ/727IZ_6.jpeg", + "727IZ/727IZ_7.jpeg", + "727IZ/727IZ_8.jpeg", + "727IZ/727IZ_9.jpeg", + "727IZ/727IZ_10.jpeg", + "727IZ/727IZ_11.jpeg", + "727IZ/727IZ_12.jpeg", + "727IZ/727IZ_13.jpeg", + "727IZ/727IZ_14.jpeg", + "727IZ/727IZ_15.jpeg", + "727IZ/727IZ_16.jpeg", + "727IZ/727IZ_17.jpeg", + "727IZ/727IZ_18.jpeg", + "727IZ/727IZ_19.jpeg", + "727IZ/727IZ_20.jpeg", + "727IZ/727IZ_21.jpeg", + "727IZ/727IZ_22.jpeg", + "727IZ/727IZ_23.jpeg", + "727IZ/727IZ_24.jpeg", + "727IZ/727IZ_25.jpeg", + "727IZ/727IZ_26.jpeg", + "727IZ/727IZ_27.jpeg", + "727IZ/727IZ_28.jpeg", + "727IZ/727IZ_29.jpeg", + "727IZ/727IZ_30.jpeg", + "727IZ/727IZ_31.jpeg", + "727IZ/727IZ_32.jpeg", + "727IZ/727IZ_33.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 43, + "question": "With the given visuals as your guide, infer the individual's impending action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the picture.\nB. Sit on the sofa/couch.\nC. Throw the clothes.\nD. Put down the cup/glass/bottle.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the cup glass bottle", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "take the picture", + "sit on the sofa couch", + "throw the clothes", + "put down the cup glass bottle" + ], + "image_quantity_level": "Many", + "image": [ + "106AY/106AY_0.jpeg", + "106AY/106AY_1.jpeg", + "106AY/106AY_2.jpeg", + "106AY/106AY_3.jpeg", + "106AY/106AY_4.jpeg", + "106AY/106AY_5.jpeg", + "106AY/106AY_6.jpeg", + "106AY/106AY_7.jpeg", + "106AY/106AY_8.jpeg", + "106AY/106AY_9.jpeg", + "106AY/106AY_10.jpeg", + "106AY/106AY_11.jpeg", + "106AY/106AY_12.jpeg", + "106AY/106AY_13.jpeg", + "106AY/106AY_14.jpeg", + "106AY/106AY_15.jpeg", + "106AY/106AY_16.jpeg", + "106AY/106AY_17.jpeg", + "106AY/106AY_18.jpeg", + "106AY/106AY_19.jpeg", + "106AY/106AY_20.jpeg", + "106AY/106AY_21.jpeg", + "106AY/106AY_22.jpeg", + "106AY/106AY_23.jpeg", + "106AY/106AY_24.jpeg", + "106AY/106AY_25.jpeg", + "106AY/106AY_26.jpeg", + "106AY/106AY_27.jpeg", + "106AY/106AY_28.jpeg", + "106AY/106AY_29.jpeg", + "106AY/106AY_30.jpeg", + "106AY/106AY_31.jpeg", + "106AY/106AY_32.jpeg", + "106AY/106AY_33.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 60, + "question": "Infer the next action of the person from the presented images. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the box.\nB. Put down the broom.\nC. Put down the towel.\nD. Take the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the towel", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "open the box", + "put down the broom", + "put down the towel", + "take the clothes" + ], + "image_quantity_level": "Many", + "image": [ + "6H78U/6H78U_0.jpeg", + "6H78U/6H78U_1.jpeg", + "6H78U/6H78U_2.jpeg", + "6H78U/6H78U_3.jpeg", + "6H78U/6H78U_4.jpeg", + "6H78U/6H78U_5.jpeg", + "6H78U/6H78U_6.jpeg", + "6H78U/6H78U_7.jpeg", + "6H78U/6H78U_8.jpeg", + "6H78U/6H78U_9.jpeg", + "6H78U/6H78U_10.jpeg", + "6H78U/6H78U_11.jpeg", + "6H78U/6H78U_12.jpeg", + "6H78U/6H78U_13.jpeg", + "6H78U/6H78U_14.jpeg", + "6H78U/6H78U_15.jpeg", + "6H78U/6H78U_16.jpeg", + "6H78U/6H78U_17.jpeg", + "6H78U/6H78U_18.jpeg", + "6H78U/6H78U_19.jpeg", + "6H78U/6H78U_20.jpeg", + "6H78U/6H78U_21.jpeg", + "6H78U/6H78U_22.jpeg", + "6H78U/6H78U_23.jpeg", + "6H78U/6H78U_24.jpeg", + "6H78U/6H78U_25.jpeg", + "6H78U/6H78U_26.jpeg", + "6H78U/6H78U_27.jpeg", + "6H78U/6H78U_28.jpeg", + "6H78U/6H78U_29.jpeg", + "6H78U/6H78U_30.jpeg", + "6H78U/6H78U_31.jpeg", + "6H78U/6H78U_32.jpeg", + "6H78U/6H78U_33.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 6, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next with the sofa/couch?\nChoice list: \nA. Take.\nB. Sit on.\nC. Tidy up.\nD. Lie on.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "lie on", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "take", + "sit on", + "tidy up", + "lie on" + ], + "image_quantity_level": "Many", + "image": [ + "N2GFQ/N2GFQ_0.jpeg", + "N2GFQ/N2GFQ_1.jpeg", + "N2GFQ/N2GFQ_2.jpeg", + "N2GFQ/N2GFQ_3.jpeg", + "N2GFQ/N2GFQ_4.jpeg", + "N2GFQ/N2GFQ_5.jpeg", + "N2GFQ/N2GFQ_6.jpeg", + "N2GFQ/N2GFQ_7.jpeg", + "N2GFQ/N2GFQ_8.jpeg", + "N2GFQ/N2GFQ_9.jpeg", + "N2GFQ/N2GFQ_10.jpeg", + "N2GFQ/N2GFQ_11.jpeg", + "N2GFQ/N2GFQ_12.jpeg", + "N2GFQ/N2GFQ_13.jpeg", + "N2GFQ/N2GFQ_14.jpeg", + "N2GFQ/N2GFQ_15.jpeg", + "N2GFQ/N2GFQ_16.jpeg", + "N2GFQ/N2GFQ_17.jpeg", + "N2GFQ/N2GFQ_18.jpeg", + "N2GFQ/N2GFQ_19.jpeg", + "N2GFQ/N2GFQ_20.jpeg", + "N2GFQ/N2GFQ_21.jpeg", + "N2GFQ/N2GFQ_22.jpeg", + "N2GFQ/N2GFQ_23.jpeg", + "N2GFQ/N2GFQ_24.jpeg", + "N2GFQ/N2GFQ_25.jpeg", + "N2GFQ/N2GFQ_26.jpeg", + "N2GFQ/N2GFQ_27.jpeg", + "N2GFQ/N2GFQ_28.jpeg", + "N2GFQ/N2GFQ_29.jpeg", + "N2GFQ/N2GFQ_30.jpeg", + "N2GFQ/N2GFQ_31.jpeg", + "N2GFQ/N2GFQ_32.jpeg", + "N2GFQ/N2GFQ_33.jpeg", + "N2GFQ/N2GFQ_34.jpeg", + "N2GFQ/N2GFQ_35.jpeg", + "N2GFQ/N2GFQ_36.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 29, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Close the door.\nB. Take the picture.\nC. Lie on the sofa/couch.\nD. Open the door.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "lie on the sofa couch", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "close the door", + "take the picture", + "lie on the sofa couch", + "open the door" + ], + "image_quantity_level": "Many", + "image": [ + "N2GFQ/N2GFQ_0.jpeg", + "N2GFQ/N2GFQ_1.jpeg", + "N2GFQ/N2GFQ_2.jpeg", + "N2GFQ/N2GFQ_3.jpeg", + "N2GFQ/N2GFQ_4.jpeg", + "N2GFQ/N2GFQ_5.jpeg", + "N2GFQ/N2GFQ_6.jpeg", + "N2GFQ/N2GFQ_7.jpeg", + "N2GFQ/N2GFQ_8.jpeg", + "N2GFQ/N2GFQ_9.jpeg", + "N2GFQ/N2GFQ_10.jpeg", + "N2GFQ/N2GFQ_11.jpeg", + "N2GFQ/N2GFQ_12.jpeg", + "N2GFQ/N2GFQ_13.jpeg", + "N2GFQ/N2GFQ_14.jpeg", + "N2GFQ/N2GFQ_15.jpeg", + "N2GFQ/N2GFQ_16.jpeg", + "N2GFQ/N2GFQ_17.jpeg", + "N2GFQ/N2GFQ_18.jpeg", + "N2GFQ/N2GFQ_19.jpeg", + "N2GFQ/N2GFQ_20.jpeg", + "N2GFQ/N2GFQ_21.jpeg", + "N2GFQ/N2GFQ_22.jpeg", + "N2GFQ/N2GFQ_23.jpeg", + "N2GFQ/N2GFQ_24.jpeg", + "N2GFQ/N2GFQ_25.jpeg", + "N2GFQ/N2GFQ_26.jpeg", + "N2GFQ/N2GFQ_27.jpeg", + "N2GFQ/N2GFQ_28.jpeg", + "N2GFQ/N2GFQ_29.jpeg", + "N2GFQ/N2GFQ_30.jpeg", + "N2GFQ/N2GFQ_31.jpeg", + "N2GFQ/N2GFQ_32.jpeg", + "N2GFQ/N2GFQ_33.jpeg", + "N2GFQ/N2GFQ_34.jpeg", + "N2GFQ/N2GFQ_35.jpeg", + "N2GFQ/N2GFQ_36.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 7, + "question": "Study the provided images and predict the individual's forthcoming behavior. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the clothes.\nB. Put down the bag.\nC. Sit on the table.\nD. Take the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "take the clothes", + "put down the bag", + "sit on the table", + "take the book" + ], + "image_quantity_level": "Many", + "image": [ + "5INX3/5INX3_0.jpeg", + "5INX3/5INX3_1.jpeg", + "5INX3/5INX3_2.jpeg", + "5INX3/5INX3_3.jpeg", + "5INX3/5INX3_4.jpeg", + "5INX3/5INX3_5.jpeg", + "5INX3/5INX3_6.jpeg", + "5INX3/5INX3_7.jpeg", + "5INX3/5INX3_8.jpeg", + "5INX3/5INX3_9.jpeg", + "5INX3/5INX3_10.jpeg", + "5INX3/5INX3_11.jpeg", + "5INX3/5INX3_12.jpeg", + "5INX3/5INX3_13.jpeg", + "5INX3/5INX3_14.jpeg", + "5INX3/5INX3_15.jpeg", + "5INX3/5INX3_16.jpeg", + "5INX3/5INX3_17.jpeg", + "5INX3/5INX3_18.jpeg", + "5INX3/5INX3_19.jpeg", + "5INX3/5INX3_20.jpeg", + "5INX3/5INX3_21.jpeg", + "5INX3/5INX3_22.jpeg", + "5INX3/5INX3_23.jpeg", + "5INX3/5INX3_24.jpeg", + "5INX3/5INX3_25.jpeg", + "5INX3/5INX3_26.jpeg", + "5INX3/5INX3_27.jpeg", + "5INX3/5INX3_28.jpeg", + "5INX3/5INX3_29.jpeg", + "5INX3/5INX3_30.jpeg", + "5INX3/5INX3_31.jpeg", + "5INX3/5INX3_32.jpeg", + "5INX3/5INX3_33.jpeg", + "5INX3/5INX3_34.jpeg", + "5INX3/5INX3_35.jpeg", + "5INX3/5INX3_36.jpeg", + "5INX3/5INX3_37.jpeg", + "5INX3/5INX3_38.jpeg", + "5INX3/5INX3_39.jpeg", + "5INX3/5INX3_40.jpeg", + "5INX3/5INX3_41.jpeg", + "5INX3/5INX3_42.jpeg", + "5INX3/5INX3_43.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 8, + "question": "Using the given pictures as a reference, anticipate the person's upcoming action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Throw the shoe.\nB. Put down the laptop.\nC. Put down the dish.\nD. Open the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "throw the shoe", + "put down the laptop", + "put down the dish", + "open the bag" + ], + "image_quantity_level": "Medium", + "image": [ + "FL6DF/FL6DF_0.jpeg", + "FL6DF/FL6DF_1.jpeg", + "FL6DF/FL6DF_2.jpeg", + "FL6DF/FL6DF_3.jpeg", + "FL6DF/FL6DF_4.jpeg", + "FL6DF/FL6DF_5.jpeg", + "FL6DF/FL6DF_6.jpeg", + "FL6DF/FL6DF_7.jpeg", + "FL6DF/FL6DF_8.jpeg", + "FL6DF/FL6DF_9.jpeg", + "FL6DF/FL6DF_10.jpeg", + "FL6DF/FL6DF_11.jpeg", + "FL6DF/FL6DF_12.jpeg", + "FL6DF/FL6DF_13.jpeg", + "FL6DF/FL6DF_14.jpeg", + "FL6DF/FL6DF_15.jpeg", + "FL6DF/FL6DF_16.jpeg", + "FL6DF/FL6DF_17.jpeg", + "FL6DF/FL6DF_18.jpeg", + "FL6DF/FL6DF_19.jpeg", + "FL6DF/FL6DF_20.jpeg", + "FL6DF/FL6DF_21.jpeg", + "FL6DF/FL6DF_22.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 47, + "question": "Evaluate the given images and infer the individual's next step. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the picture.\nB. Take the book.\nC. Put down the blanket.\nD. Sit on the floor.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "take the picture", + "take the book", + "put down the blanket", + "sit on the floor" + ], + "image_quantity_level": "Medium", + "image": [ + "FL6DF/FL6DF_0.jpeg", + "FL6DF/FL6DF_1.jpeg", + "FL6DF/FL6DF_2.jpeg", + "FL6DF/FL6DF_3.jpeg", + "FL6DF/FL6DF_4.jpeg", + "FL6DF/FL6DF_5.jpeg", + "FL6DF/FL6DF_6.jpeg", + "FL6DF/FL6DF_7.jpeg", + "FL6DF/FL6DF_8.jpeg", + "FL6DF/FL6DF_9.jpeg", + "FL6DF/FL6DF_10.jpeg", + "FL6DF/FL6DF_11.jpeg", + "FL6DF/FL6DF_12.jpeg", + "FL6DF/FL6DF_13.jpeg", + "FL6DF/FL6DF_14.jpeg", + "FL6DF/FL6DF_15.jpeg", + "FL6DF/FL6DF_16.jpeg", + "FL6DF/FL6DF_17.jpeg", + "FL6DF/FL6DF_18.jpeg", + "FL6DF/FL6DF_19.jpeg", + "FL6DF/FL6DF_20.jpeg", + "FL6DF/FL6DF_21.jpeg", + "FL6DF/FL6DF_22.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 9, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Close the closet/cabinet.\nB. Tidy up the broom.\nC. Throw the pillow.\nD. Take the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "close the closet cabinet", + "tidy up the broom", + "throw the pillow", + "take the bag" + ], + "image_quantity_level": "Many", + "image": [ + "M52K2/M52K2_0.jpeg", + "M52K2/M52K2_1.jpeg", + "M52K2/M52K2_2.jpeg", + "M52K2/M52K2_3.jpeg", + "M52K2/M52K2_4.jpeg", + "M52K2/M52K2_5.jpeg", + "M52K2/M52K2_6.jpeg", + "M52K2/M52K2_7.jpeg", + "M52K2/M52K2_8.jpeg", + "M52K2/M52K2_9.jpeg", + "M52K2/M52K2_10.jpeg", + "M52K2/M52K2_11.jpeg", + "M52K2/M52K2_12.jpeg", + "M52K2/M52K2_13.jpeg", + "M52K2/M52K2_14.jpeg", + "M52K2/M52K2_15.jpeg", + "M52K2/M52K2_16.jpeg", + "M52K2/M52K2_17.jpeg", + "M52K2/M52K2_18.jpeg", + "M52K2/M52K2_19.jpeg", + "M52K2/M52K2_20.jpeg", + "M52K2/M52K2_21.jpeg", + "M52K2/M52K2_22.jpeg", + "M52K2/M52K2_23.jpeg", + "M52K2/M52K2_24.jpeg", + "M52K2/M52K2_25.jpeg", + "M52K2/M52K2_26.jpeg", + "M52K2/M52K2_27.jpeg", + "M52K2/M52K2_28.jpeg", + "M52K2/M52K2_29.jpeg", + "M52K2/M52K2_30.jpeg", + "M52K2/M52K2_31.jpeg", + "M52K2/M52K2_32.jpeg", + "M52K2/M52K2_33.jpeg", + "M52K2/M52K2_34.jpeg", + "M52K2/M52K2_35.jpeg", + "M52K2/M52K2_36.jpeg", + "M52K2/M52K2_37.jpeg", + "M52K2/M52K2_38.jpeg", + "M52K2/M52K2_39.jpeg", + "M52K2/M52K2_40.jpeg", + "M52K2/M52K2_41.jpeg", + "M52K2/M52K2_42.jpeg", + "M52K2/M52K2_43.jpeg", + "M52K2/M52K2_44.jpeg", + "M52K2/M52K2_45.jpeg", + "M52K2/M52K2_46.jpeg", + "M52K2/M52K2_47.jpeg", + "M52K2/M52K2_48.jpeg", + "M52K2/M52K2_49.jpeg", + "M52K2/M52K2_50.jpeg", + "M52K2/M52K2_51.jpeg", + "M52K2/M52K2_52.jpeg", + "M52K2/M52K2_53.jpeg", + "M52K2/M52K2_54.jpeg", + "M52K2/M52K2_55.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 10, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the bag.\nB. Take the food.\nC. Take the phone/camera.\nD. Put down the shoe.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the bag", + "take the food", + "take the phone camera", + "put down the shoe" + ], + "image_quantity_level": "Medium", + "image": [ + "78R4Y/78R4Y_0.jpeg", + "78R4Y/78R4Y_1.jpeg", + "78R4Y/78R4Y_2.jpeg", + "78R4Y/78R4Y_3.jpeg", + "78R4Y/78R4Y_4.jpeg", + "78R4Y/78R4Y_5.jpeg", + "78R4Y/78R4Y_6.jpeg", + "78R4Y/78R4Y_7.jpeg", + "78R4Y/78R4Y_8.jpeg", + "78R4Y/78R4Y_9.jpeg", + "78R4Y/78R4Y_10.jpeg", + "78R4Y/78R4Y_11.jpeg", + "78R4Y/78R4Y_12.jpeg", + "78R4Y/78R4Y_13.jpeg", + "78R4Y/78R4Y_14.jpeg", + "78R4Y/78R4Y_15.jpeg", + "78R4Y/78R4Y_16.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 11, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the picture.\nB. Eat the medicine.\nC. Tidy up the table.\nD. Close the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the picture", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "take the picture", + "eat the medicine", + "tidy up the table", + "close the closet cabinet" + ], + "image_quantity_level": "Many", + "image": [ + "ZAJAJ/ZAJAJ_0.jpeg", + "ZAJAJ/ZAJAJ_1.jpeg", + "ZAJAJ/ZAJAJ_2.jpeg", + "ZAJAJ/ZAJAJ_3.jpeg", + "ZAJAJ/ZAJAJ_4.jpeg", + "ZAJAJ/ZAJAJ_5.jpeg", + "ZAJAJ/ZAJAJ_6.jpeg", + "ZAJAJ/ZAJAJ_7.jpeg", + "ZAJAJ/ZAJAJ_8.jpeg", + "ZAJAJ/ZAJAJ_9.jpeg", + "ZAJAJ/ZAJAJ_10.jpeg", + "ZAJAJ/ZAJAJ_11.jpeg", + "ZAJAJ/ZAJAJ_12.jpeg", + "ZAJAJ/ZAJAJ_13.jpeg", + "ZAJAJ/ZAJAJ_14.jpeg", + "ZAJAJ/ZAJAJ_15.jpeg", + "ZAJAJ/ZAJAJ_16.jpeg", + "ZAJAJ/ZAJAJ_17.jpeg", + "ZAJAJ/ZAJAJ_18.jpeg", + "ZAJAJ/ZAJAJ_19.jpeg", + "ZAJAJ/ZAJAJ_20.jpeg", + "ZAJAJ/ZAJAJ_21.jpeg", + "ZAJAJ/ZAJAJ_22.jpeg", + "ZAJAJ/ZAJAJ_23.jpeg", + "ZAJAJ/ZAJAJ_24.jpeg", + "ZAJAJ/ZAJAJ_25.jpeg", + "ZAJAJ/ZAJAJ_26.jpeg", + "ZAJAJ/ZAJAJ_27.jpeg", + "ZAJAJ/ZAJAJ_28.jpeg", + "ZAJAJ/ZAJAJ_29.jpeg", + "ZAJAJ/ZAJAJ_30.jpeg", + "ZAJAJ/ZAJAJ_31.jpeg", + "ZAJAJ/ZAJAJ_32.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 12, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the broom.\nB. Open the bag.\nC. Take the cup/glass/bottle.\nD. Put down the paper/notebook.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "take the broom", + "open the bag", + "take the cup glass bottle", + "put down the paper notebook" + ], + "image_quantity_level": "Many", + "image": [ + "I4N6K/I4N6K_0.jpeg", + "I4N6K/I4N6K_1.jpeg", + "I4N6K/I4N6K_2.jpeg", + "I4N6K/I4N6K_3.jpeg", + "I4N6K/I4N6K_4.jpeg", + "I4N6K/I4N6K_5.jpeg", + "I4N6K/I4N6K_6.jpeg", + "I4N6K/I4N6K_7.jpeg", + "I4N6K/I4N6K_8.jpeg", + "I4N6K/I4N6K_9.jpeg", + "I4N6K/I4N6K_10.jpeg", + "I4N6K/I4N6K_11.jpeg", + "I4N6K/I4N6K_12.jpeg", + "I4N6K/I4N6K_13.jpeg", + "I4N6K/I4N6K_14.jpeg", + "I4N6K/I4N6K_15.jpeg", + "I4N6K/I4N6K_16.jpeg", + "I4N6K/I4N6K_17.jpeg", + "I4N6K/I4N6K_18.jpeg", + "I4N6K/I4N6K_19.jpeg", + "I4N6K/I4N6K_20.jpeg", + "I4N6K/I4N6K_21.jpeg", + "I4N6K/I4N6K_22.jpeg", + "I4N6K/I4N6K_23.jpeg", + "I4N6K/I4N6K_24.jpeg", + "I4N6K/I4N6K_25.jpeg", + "I4N6K/I4N6K_26.jpeg", + "I4N6K/I4N6K_27.jpeg", + "I4N6K/I4N6K_28.jpeg", + "I4N6K/I4N6K_29.jpeg", + "I4N6K/I4N6K_30.jpeg", + "I4N6K/I4N6K_31.jpeg", + "I4N6K/I4N6K_32.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 13, + "question": "Using the given pictures as a reference, anticipate the person's upcoming action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the bag.\nB. Wash the window.\nC. Tidy up the blanket.\nD. Sit on the sofa/couch.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "wash the window", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "take the bag", + "wash the window", + "tidy up the blanket", + "sit on the sofa couch" + ], + "image_quantity_level": "Medium", + "image": [ + "B8PQL/B8PQL_0.jpeg", + "B8PQL/B8PQL_1.jpeg", + "B8PQL/B8PQL_2.jpeg", + "B8PQL/B8PQL_3.jpeg", + "B8PQL/B8PQL_4.jpeg", + "B8PQL/B8PQL_5.jpeg", + "B8PQL/B8PQL_6.jpeg", + "B8PQL/B8PQL_7.jpeg", + "B8PQL/B8PQL_8.jpeg", + "B8PQL/B8PQL_9.jpeg", + "B8PQL/B8PQL_10.jpeg", + "B8PQL/B8PQL_11.jpeg", + "B8PQL/B8PQL_12.jpeg", + "B8PQL/B8PQL_13.jpeg", + "B8PQL/B8PQL_14.jpeg", + "B8PQL/B8PQL_15.jpeg", + "B8PQL/B8PQL_16.jpeg", + "B8PQL/B8PQL_17.jpeg", + "B8PQL/B8PQL_18.jpeg", + "B8PQL/B8PQL_19.jpeg", + "B8PQL/B8PQL_20.jpeg", + "B8PQL/B8PQL_21.jpeg", + "B8PQL/B8PQL_22.jpeg", + "B8PQL/B8PQL_23.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 15, + "question": "Evaluate the given images and infer the individual's next step. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the shoe.\nB. Open the book.\nC. Take the shoe.\nD. Open the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "open the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the shoe", + "open the book", + "take the shoe", + "open the box" + ], + "image_quantity_level": "Many", + "image": [ + "QRWQ3/QRWQ3_0.jpeg", + "QRWQ3/QRWQ3_1.jpeg", + "QRWQ3/QRWQ3_2.jpeg", + "QRWQ3/QRWQ3_3.jpeg", + "QRWQ3/QRWQ3_4.jpeg", + "QRWQ3/QRWQ3_5.jpeg", + "QRWQ3/QRWQ3_6.jpeg", + "QRWQ3/QRWQ3_7.jpeg", + "QRWQ3/QRWQ3_8.jpeg", + "QRWQ3/QRWQ3_9.jpeg", + "QRWQ3/QRWQ3_10.jpeg", + "QRWQ3/QRWQ3_11.jpeg", + "QRWQ3/QRWQ3_12.jpeg", + "QRWQ3/QRWQ3_13.jpeg", + "QRWQ3/QRWQ3_14.jpeg", + "QRWQ3/QRWQ3_15.jpeg", + "QRWQ3/QRWQ3_16.jpeg", + "QRWQ3/QRWQ3_17.jpeg", + "QRWQ3/QRWQ3_18.jpeg", + "QRWQ3/QRWQ3_19.jpeg", + "QRWQ3/QRWQ3_20.jpeg", + "QRWQ3/QRWQ3_21.jpeg", + "QRWQ3/QRWQ3_22.jpeg", + "QRWQ3/QRWQ3_23.jpeg", + "QRWQ3/QRWQ3_24.jpeg", + "QRWQ3/QRWQ3_25.jpeg", + "QRWQ3/QRWQ3_26.jpeg", + "QRWQ3/QRWQ3_27.jpeg", + "QRWQ3/QRWQ3_28.jpeg", + "QRWQ3/QRWQ3_29.jpeg", + "QRWQ3/QRWQ3_30.jpeg", + "QRWQ3/QRWQ3_31.jpeg", + "QRWQ3/QRWQ3_32.jpeg", + "QRWQ3/QRWQ3_33.jpeg", + "QRWQ3/QRWQ3_34.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 85, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the book.\nB. Take the bag.\nC. Put down the paper/notebook.\nD. Open the laptop.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "open the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "take the book", + "take the bag", + "put down the paper notebook", + "open the laptop" + ], + "image_quantity_level": "Many", + "image": [ + "T1CQE/T1CQE_0.jpeg", + "T1CQE/T1CQE_1.jpeg", + "T1CQE/T1CQE_2.jpeg", + "T1CQE/T1CQE_3.jpeg", + "T1CQE/T1CQE_4.jpeg", + "T1CQE/T1CQE_5.jpeg", + "T1CQE/T1CQE_6.jpeg", + "T1CQE/T1CQE_7.jpeg", + "T1CQE/T1CQE_8.jpeg", + "T1CQE/T1CQE_9.jpeg", + "T1CQE/T1CQE_10.jpeg", + "T1CQE/T1CQE_11.jpeg", + "T1CQE/T1CQE_12.jpeg", + "T1CQE/T1CQE_13.jpeg", + "T1CQE/T1CQE_14.jpeg", + "T1CQE/T1CQE_15.jpeg", + "T1CQE/T1CQE_16.jpeg", + "T1CQE/T1CQE_17.jpeg", + "T1CQE/T1CQE_18.jpeg", + "T1CQE/T1CQE_19.jpeg", + "T1CQE/T1CQE_20.jpeg", + "T1CQE/T1CQE_21.jpeg", + "T1CQE/T1CQE_22.jpeg", + "T1CQE/T1CQE_23.jpeg", + "T1CQE/T1CQE_24.jpeg", + "T1CQE/T1CQE_25.jpeg", + "T1CQE/T1CQE_26.jpeg", + "T1CQE/T1CQE_27.jpeg", + "T1CQE/T1CQE_28.jpeg", + "T1CQE/T1CQE_29.jpeg", + "T1CQE/T1CQE_30.jpeg", + "T1CQE/T1CQE_31.jpeg", + "T1CQE/T1CQE_32.jpeg", + "T1CQE/T1CQE_33.jpeg", + "T1CQE/T1CQE_34.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 17, + "question": "Infer the next action of the person from the presented images. You must choose your answer from the Choice List.\nWhat will the person do next with the blanket?\nChoice list: \nA. Tidy up.\nB. Sit on.\nC. Put down.\nD. Take.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "tidy up", + "sit on", + "put down", + "take" + ], + "image_quantity_level": "Medium", + "image": [ + "ATV2F/ATV2F_0.jpeg", + "ATV2F/ATV2F_1.jpeg", + "ATV2F/ATV2F_2.jpeg", + "ATV2F/ATV2F_3.jpeg", + "ATV2F/ATV2F_4.jpeg", + "ATV2F/ATV2F_5.jpeg", + "ATV2F/ATV2F_6.jpeg", + "ATV2F/ATV2F_7.jpeg", + "ATV2F/ATV2F_8.jpeg", + "ATV2F/ATV2F_9.jpeg", + "ATV2F/ATV2F_10.jpeg", + "ATV2F/ATV2F_11.jpeg", + "ATV2F/ATV2F_12.jpeg", + "ATV2F/ATV2F_13.jpeg", + "ATV2F/ATV2F_14.jpeg", + "ATV2F/ATV2F_15.jpeg", + "ATV2F/ATV2F_16.jpeg", + "ATV2F/ATV2F_17.jpeg", + "ATV2F/ATV2F_18.jpeg", + "ATV2F/ATV2F_19.jpeg", + "ATV2F/ATV2F_20.jpeg", + "ATV2F/ATV2F_21.jpeg", + "ATV2F/ATV2F_22.jpeg", + "ATV2F/ATV2F_23.jpeg", + "ATV2F/ATV2F_24.jpeg", + "ATV2F/ATV2F_25.jpeg", + "ATV2F/ATV2F_26.jpeg", + "ATV2F/ATV2F_27.jpeg", + "ATV2F/ATV2F_28.jpeg", + "ATV2F/ATV2F_29.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 22, + "question": "Based on the given images, prediction the person's next action You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Close the box.\nB. Put down the towel.\nC. Take the laptop.\nD. Close the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the towel", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "close the box", + "put down the towel", + "take the laptop", + "close the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "1O1JS/1O1JS_0.jpeg", + "1O1JS/1O1JS_1.jpeg", + "1O1JS/1O1JS_2.jpeg", + "1O1JS/1O1JS_3.jpeg", + "1O1JS/1O1JS_4.jpeg", + "1O1JS/1O1JS_5.jpeg", + "1O1JS/1O1JS_6.jpeg", + "1O1JS/1O1JS_7.jpeg", + "1O1JS/1O1JS_8.jpeg", + "1O1JS/1O1JS_9.jpeg", + "1O1JS/1O1JS_10.jpeg", + "1O1JS/1O1JS_11.jpeg", + "1O1JS/1O1JS_12.jpeg", + "1O1JS/1O1JS_13.jpeg", + "1O1JS/1O1JS_14.jpeg", + "1O1JS/1O1JS_15.jpeg", + "1O1JS/1O1JS_16.jpeg", + "1O1JS/1O1JS_17.jpeg", + "1O1JS/1O1JS_18.jpeg", + "1O1JS/1O1JS_19.jpeg", + "1O1JS/1O1JS_20.jpeg", + "1O1JS/1O1JS_21.jpeg", + "1O1JS/1O1JS_22.jpeg", + "1O1JS/1O1JS_23.jpeg", + "1O1JS/1O1JS_24.jpeg", + "1O1JS/1O1JS_25.jpeg", + "1O1JS/1O1JS_26.jpeg", + "1O1JS/1O1JS_27.jpeg", + "1O1JS/1O1JS_28.jpeg", + "1O1JS/1O1JS_29.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 24, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the closet/cabinet.\nB. Put down the phone/camera.\nC. Open the book.\nD. Open the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the phone camera", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "open the closet cabinet", + "put down the phone camera", + "open the book", + "open the bag" + ], + "image_quantity_level": "Medium", + "image": [ + "9ZQ0R/9ZQ0R_0.jpeg", + "9ZQ0R/9ZQ0R_1.jpeg", + "9ZQ0R/9ZQ0R_2.jpeg", + "9ZQ0R/9ZQ0R_3.jpeg", + "9ZQ0R/9ZQ0R_4.jpeg", + "9ZQ0R/9ZQ0R_5.jpeg", + "9ZQ0R/9ZQ0R_6.jpeg", + "9ZQ0R/9ZQ0R_7.jpeg", + "9ZQ0R/9ZQ0R_8.jpeg", + "9ZQ0R/9ZQ0R_9.jpeg", + "9ZQ0R/9ZQ0R_10.jpeg", + "9ZQ0R/9ZQ0R_11.jpeg", + "9ZQ0R/9ZQ0R_12.jpeg", + "9ZQ0R/9ZQ0R_13.jpeg", + "9ZQ0R/9ZQ0R_14.jpeg", + "9ZQ0R/9ZQ0R_15.jpeg", + "9ZQ0R/9ZQ0R_16.jpeg", + "9ZQ0R/9ZQ0R_17.jpeg", + "9ZQ0R/9ZQ0R_18.jpeg", + "9ZQ0R/9ZQ0R_19.jpeg", + "9ZQ0R/9ZQ0R_20.jpeg", + "9ZQ0R/9ZQ0R_21.jpeg", + "9ZQ0R/9ZQ0R_22.jpeg", + "9ZQ0R/9ZQ0R_23.jpeg", + "9ZQ0R/9ZQ0R_24.jpeg", + "9ZQ0R/9ZQ0R_25.jpeg", + "9ZQ0R/9ZQ0R_26.jpeg", + "9ZQ0R/9ZQ0R_27.jpeg", + "9ZQ0R/9ZQ0R_28.jpeg", + "9ZQ0R/9ZQ0R_29.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 26, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the closet/cabinet.\nB. Open the door.\nC. Close the closet/cabinet.\nD. Tidy up the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "close the closet cabinet", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "open the closet cabinet", + "open the door", + "close the closet cabinet", + "tidy up the clothes" + ], + "image_quantity_level": "Medium", + "image": [ + "2U0QT/2U0QT_0.jpeg", + "2U0QT/2U0QT_1.jpeg", + "2U0QT/2U0QT_2.jpeg", + "2U0QT/2U0QT_3.jpeg", + "2U0QT/2U0QT_4.jpeg", + "2U0QT/2U0QT_5.jpeg", + "2U0QT/2U0QT_6.jpeg", + "2U0QT/2U0QT_7.jpeg", + "2U0QT/2U0QT_8.jpeg", + "2U0QT/2U0QT_9.jpeg", + "2U0QT/2U0QT_10.jpeg", + "2U0QT/2U0QT_11.jpeg", + "2U0QT/2U0QT_12.jpeg", + "2U0QT/2U0QT_13.jpeg", + "2U0QT/2U0QT_14.jpeg", + "2U0QT/2U0QT_15.jpeg", + "2U0QT/2U0QT_16.jpeg", + "2U0QT/2U0QT_17.jpeg", + "2U0QT/2U0QT_18.jpeg", + "2U0QT/2U0QT_19.jpeg", + "2U0QT/2U0QT_20.jpeg", + "2U0QT/2U0QT_21.jpeg", + "2U0QT/2U0QT_22.jpeg", + "2U0QT/2U0QT_23.jpeg", + "2U0QT/2U0QT_24.jpeg", + "2U0QT/2U0QT_25.jpeg", + "2U0QT/2U0QT_26.jpeg", + "2U0QT/2U0QT_27.jpeg", + "2U0QT/2U0QT_28.jpeg", + "2U0QT/2U0QT_29.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 31, + "question": "Evaluate the given images and infer the individual's next step. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Close the window.\nB. Close the refrigerator.\nC. Sit on the sofa/couch.\nD. Open the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sit on the sofa couch", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "close the window", + "close the refrigerator", + "sit on the sofa couch", + "open the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "9OMY1/9OMY1_0.jpeg", + "9OMY1/9OMY1_1.jpeg", + "9OMY1/9OMY1_2.jpeg", + "9OMY1/9OMY1_3.jpeg", + "9OMY1/9OMY1_4.jpeg", + "9OMY1/9OMY1_5.jpeg", + "9OMY1/9OMY1_6.jpeg", + "9OMY1/9OMY1_7.jpeg", + "9OMY1/9OMY1_8.jpeg", + "9OMY1/9OMY1_9.jpeg", + "9OMY1/9OMY1_10.jpeg", + "9OMY1/9OMY1_11.jpeg", + "9OMY1/9OMY1_12.jpeg", + "9OMY1/9OMY1_13.jpeg", + "9OMY1/9OMY1_14.jpeg", + "9OMY1/9OMY1_15.jpeg", + "9OMY1/9OMY1_16.jpeg", + "9OMY1/9OMY1_17.jpeg", + "9OMY1/9OMY1_18.jpeg", + "9OMY1/9OMY1_19.jpeg", + "9OMY1/9OMY1_20.jpeg", + "9OMY1/9OMY1_21.jpeg", + "9OMY1/9OMY1_22.jpeg", + "9OMY1/9OMY1_23.jpeg", + "9OMY1/9OMY1_24.jpeg", + "9OMY1/9OMY1_25.jpeg", + "9OMY1/9OMY1_26.jpeg", + "9OMY1/9OMY1_27.jpeg", + "9OMY1/9OMY1_28.jpeg", + "9OMY1/9OMY1_29.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 34, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the cup/glass/bottle.\nB. Put down the broom.\nC. Open the bag.\nD. Lie on the sofa/couch.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "open the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "take the cup glass bottle", + "put down the broom", + "open the bag", + "lie on the sofa couch" + ], + "image_quantity_level": "Medium", + "image": [ + "I1OLQ/I1OLQ_0.jpeg", + "I1OLQ/I1OLQ_1.jpeg", + "I1OLQ/I1OLQ_2.jpeg", + "I1OLQ/I1OLQ_3.jpeg", + "I1OLQ/I1OLQ_4.jpeg", + "I1OLQ/I1OLQ_5.jpeg", + "I1OLQ/I1OLQ_6.jpeg", + "I1OLQ/I1OLQ_7.jpeg", + "I1OLQ/I1OLQ_8.jpeg", + "I1OLQ/I1OLQ_9.jpeg", + "I1OLQ/I1OLQ_10.jpeg", + "I1OLQ/I1OLQ_11.jpeg", + "I1OLQ/I1OLQ_12.jpeg", + "I1OLQ/I1OLQ_13.jpeg", + "I1OLQ/I1OLQ_14.jpeg", + "I1OLQ/I1OLQ_15.jpeg", + "I1OLQ/I1OLQ_16.jpeg", + "I1OLQ/I1OLQ_17.jpeg", + "I1OLQ/I1OLQ_18.jpeg", + "I1OLQ/I1OLQ_19.jpeg", + "I1OLQ/I1OLQ_20.jpeg", + "I1OLQ/I1OLQ_21.jpeg", + "I1OLQ/I1OLQ_22.jpeg", + "I1OLQ/I1OLQ_23.jpeg", + "I1OLQ/I1OLQ_24.jpeg", + "I1OLQ/I1OLQ_25.jpeg", + "I1OLQ/I1OLQ_26.jpeg", + "I1OLQ/I1OLQ_27.jpeg", + "I1OLQ/I1OLQ_28.jpeg", + "I1OLQ/I1OLQ_29.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 35, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Sit on the floor.\nB. Open the bag.\nC. Lie on the bed.\nD. Tidy up the table.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sit on the floor", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "sit on the floor", + "open the bag", + "lie on the bed", + "tidy up the table" + ], + "image_quantity_level": "Medium", + "image": [ + "Z6HEA/Z6HEA_0.jpeg", + "Z6HEA/Z6HEA_1.jpeg", + "Z6HEA/Z6HEA_2.jpeg", + "Z6HEA/Z6HEA_3.jpeg", + "Z6HEA/Z6HEA_4.jpeg", + "Z6HEA/Z6HEA_5.jpeg", + "Z6HEA/Z6HEA_6.jpeg", + "Z6HEA/Z6HEA_7.jpeg", + "Z6HEA/Z6HEA_8.jpeg", + "Z6HEA/Z6HEA_9.jpeg", + "Z6HEA/Z6HEA_10.jpeg", + "Z6HEA/Z6HEA_11.jpeg", + "Z6HEA/Z6HEA_12.jpeg", + "Z6HEA/Z6HEA_13.jpeg", + "Z6HEA/Z6HEA_14.jpeg", + "Z6HEA/Z6HEA_15.jpeg", + "Z6HEA/Z6HEA_16.jpeg", + "Z6HEA/Z6HEA_17.jpeg", + "Z6HEA/Z6HEA_18.jpeg", + "Z6HEA/Z6HEA_19.jpeg", + "Z6HEA/Z6HEA_20.jpeg", + "Z6HEA/Z6HEA_21.jpeg", + "Z6HEA/Z6HEA_22.jpeg", + "Z6HEA/Z6HEA_23.jpeg", + "Z6HEA/Z6HEA_24.jpeg", + "Z6HEA/Z6HEA_25.jpeg", + "Z6HEA/Z6HEA_26.jpeg", + "Z6HEA/Z6HEA_27.jpeg", + "Z6HEA/Z6HEA_28.jpeg", + "Z6HEA/Z6HEA_29.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 51, + "question": "Evaluate the given images and infer the individual's next step. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Wash the window.\nB. Sit at the table.\nC. Throw the book.\nD. Throw the pillow.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "throw the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "wash the window", + "sit at the table", + "throw the book", + "throw the pillow" + ], + "image_quantity_level": "Medium", + "image": [ + "YHXU9/YHXU9_0.jpeg", + "YHXU9/YHXU9_1.jpeg", + "YHXU9/YHXU9_2.jpeg", + "YHXU9/YHXU9_3.jpeg", + "YHXU9/YHXU9_4.jpeg", + "YHXU9/YHXU9_5.jpeg", + "YHXU9/YHXU9_6.jpeg", + "YHXU9/YHXU9_7.jpeg", + "YHXU9/YHXU9_8.jpeg", + "YHXU9/YHXU9_9.jpeg", + "YHXU9/YHXU9_10.jpeg", + "YHXU9/YHXU9_11.jpeg", + "YHXU9/YHXU9_12.jpeg", + "YHXU9/YHXU9_13.jpeg", + "YHXU9/YHXU9_14.jpeg", + "YHXU9/YHXU9_15.jpeg", + "YHXU9/YHXU9_16.jpeg", + "YHXU9/YHXU9_17.jpeg", + "YHXU9/YHXU9_18.jpeg", + "YHXU9/YHXU9_19.jpeg", + "YHXU9/YHXU9_20.jpeg", + "YHXU9/YHXU9_21.jpeg", + "YHXU9/YHXU9_22.jpeg", + "YHXU9/YHXU9_23.jpeg", + "YHXU9/YHXU9_24.jpeg", + "YHXU9/YHXU9_25.jpeg", + "YHXU9/YHXU9_26.jpeg", + "YHXU9/YHXU9_27.jpeg", + "YHXU9/YHXU9_28.jpeg", + "YHXU9/YHXU9_29.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 58, + "question": "With the given visuals as your guide, infer the individual's impending action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Throw the blanket.\nB. Take the phone/camera.\nC. Close the door.\nD. Open the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the phone camera", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "throw the blanket", + "take the phone camera", + "close the door", + "open the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "LLOGB/LLOGB_0.jpeg", + "LLOGB/LLOGB_1.jpeg", + "LLOGB/LLOGB_2.jpeg", + "LLOGB/LLOGB_3.jpeg", + "LLOGB/LLOGB_4.jpeg", + "LLOGB/LLOGB_5.jpeg", + "LLOGB/LLOGB_6.jpeg", + "LLOGB/LLOGB_7.jpeg", + "LLOGB/LLOGB_8.jpeg", + "LLOGB/LLOGB_9.jpeg", + "LLOGB/LLOGB_10.jpeg", + "LLOGB/LLOGB_11.jpeg", + "LLOGB/LLOGB_12.jpeg", + "LLOGB/LLOGB_13.jpeg", + "LLOGB/LLOGB_14.jpeg", + "LLOGB/LLOGB_15.jpeg", + "LLOGB/LLOGB_16.jpeg", + "LLOGB/LLOGB_17.jpeg", + "LLOGB/LLOGB_18.jpeg", + "LLOGB/LLOGB_19.jpeg", + "LLOGB/LLOGB_20.jpeg", + "LLOGB/LLOGB_21.jpeg", + "LLOGB/LLOGB_22.jpeg", + "LLOGB/LLOGB_23.jpeg", + "LLOGB/LLOGB_24.jpeg", + "LLOGB/LLOGB_25.jpeg", + "LLOGB/LLOGB_26.jpeg", + "LLOGB/LLOGB_27.jpeg", + "LLOGB/LLOGB_28.jpeg", + "LLOGB/LLOGB_29.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 65, + "question": "Infer the next action of the person from the presented images. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the door.\nB. Take the shoe.\nC. Take the book.\nD. Eat the sandwich.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "eat the sandwich", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "open the door", + "take the shoe", + "take the book", + "eat the sandwich" + ], + "image_quantity_level": "Medium", + "image": [ + "27JQL/27JQL_0.jpeg", + "27JQL/27JQL_1.jpeg", + "27JQL/27JQL_2.jpeg", + "27JQL/27JQL_3.jpeg", + "27JQL/27JQL_4.jpeg", + "27JQL/27JQL_5.jpeg", + "27JQL/27JQL_6.jpeg", + "27JQL/27JQL_7.jpeg", + "27JQL/27JQL_8.jpeg", + "27JQL/27JQL_9.jpeg", + "27JQL/27JQL_10.jpeg", + "27JQL/27JQL_11.jpeg", + "27JQL/27JQL_12.jpeg", + "27JQL/27JQL_13.jpeg", + "27JQL/27JQL_14.jpeg", + "27JQL/27JQL_15.jpeg", + "27JQL/27JQL_16.jpeg", + "27JQL/27JQL_17.jpeg", + "27JQL/27JQL_18.jpeg", + "27JQL/27JQL_19.jpeg", + "27JQL/27JQL_20.jpeg", + "27JQL/27JQL_21.jpeg", + "27JQL/27JQL_22.jpeg", + "27JQL/27JQL_23.jpeg", + "27JQL/27JQL_24.jpeg", + "27JQL/27JQL_25.jpeg", + "27JQL/27JQL_26.jpeg", + "27JQL/27JQL_27.jpeg", + "27JQL/27JQL_28.jpeg", + "27JQL/27JQL_29.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 67, + "question": "Study the provided images and predict the individual's forthcoming behavior. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Wash the mirror.\nB. Close the box.\nC. Put down the towel.\nD. Open the door.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the towel", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "wash the mirror", + "close the box", + "put down the towel", + "open the door" + ], + "image_quantity_level": "Medium", + "image": [ + "MDG2T/MDG2T_0.jpeg", + "MDG2T/MDG2T_1.jpeg", + "MDG2T/MDG2T_2.jpeg", + "MDG2T/MDG2T_3.jpeg", + "MDG2T/MDG2T_4.jpeg", + "MDG2T/MDG2T_5.jpeg", + "MDG2T/MDG2T_6.jpeg", + "MDG2T/MDG2T_7.jpeg", + "MDG2T/MDG2T_8.jpeg", + "MDG2T/MDG2T_9.jpeg", + "MDG2T/MDG2T_10.jpeg", + "MDG2T/MDG2T_11.jpeg", + "MDG2T/MDG2T_12.jpeg", + "MDG2T/MDG2T_13.jpeg", + "MDG2T/MDG2T_14.jpeg", + "MDG2T/MDG2T_15.jpeg", + "MDG2T/MDG2T_16.jpeg", + "MDG2T/MDG2T_17.jpeg", + "MDG2T/MDG2T_18.jpeg", + "MDG2T/MDG2T_19.jpeg", + "MDG2T/MDG2T_20.jpeg", + "MDG2T/MDG2T_21.jpeg", + "MDG2T/MDG2T_22.jpeg", + "MDG2T/MDG2T_23.jpeg", + "MDG2T/MDG2T_24.jpeg", + "MDG2T/MDG2T_25.jpeg", + "MDG2T/MDG2T_26.jpeg", + "MDG2T/MDG2T_27.jpeg", + "MDG2T/MDG2T_28.jpeg", + "MDG2T/MDG2T_29.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 68, + "question": "Study the provided images and predict the individual's forthcoming behavior. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the closet/cabinet.\nB. Eat the medicine.\nC. Tidy up the table.\nD. Put down the shoe.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "open the closet cabinet", + "eat the medicine", + "tidy up the table", + "put down the shoe" + ], + "image_quantity_level": "Medium", + "image": [ + "D1WYU/D1WYU_0.jpeg", + "D1WYU/D1WYU_1.jpeg", + "D1WYU/D1WYU_2.jpeg", + "D1WYU/D1WYU_3.jpeg", + "D1WYU/D1WYU_4.jpeg", + "D1WYU/D1WYU_5.jpeg", + "D1WYU/D1WYU_6.jpeg", + "D1WYU/D1WYU_7.jpeg", + "D1WYU/D1WYU_8.jpeg", + "D1WYU/D1WYU_9.jpeg", + "D1WYU/D1WYU_10.jpeg", + "D1WYU/D1WYU_11.jpeg", + "D1WYU/D1WYU_12.jpeg", + "D1WYU/D1WYU_13.jpeg", + "D1WYU/D1WYU_14.jpeg", + "D1WYU/D1WYU_15.jpeg", + "D1WYU/D1WYU_16.jpeg", + "D1WYU/D1WYU_17.jpeg", + "D1WYU/D1WYU_18.jpeg", + "D1WYU/D1WYU_19.jpeg", + "D1WYU/D1WYU_20.jpeg", + "D1WYU/D1WYU_21.jpeg", + "D1WYU/D1WYU_22.jpeg", + "D1WYU/D1WYU_23.jpeg", + "D1WYU/D1WYU_24.jpeg", + "D1WYU/D1WYU_25.jpeg", + "D1WYU/D1WYU_26.jpeg", + "D1WYU/D1WYU_27.jpeg", + "D1WYU/D1WYU_28.jpeg", + "D1WYU/D1WYU_29.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 20, + "question": "Study the provided images and predict the individual's forthcoming behavior. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Wash the table.\nB. Lie on the floor.\nC. Put down the towel.\nD. Take the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "wash the table", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "wash the table", + "lie on the floor", + "put down the towel", + "take the box" + ], + "image_quantity_level": "Medium", + "image": [ + "NJANX/NJANX_0.jpeg", + "NJANX/NJANX_1.jpeg", + "NJANX/NJANX_2.jpeg", + "NJANX/NJANX_3.jpeg", + "NJANX/NJANX_4.jpeg", + "NJANX/NJANX_5.jpeg", + "NJANX/NJANX_6.jpeg", + "NJANX/NJANX_7.jpeg", + "NJANX/NJANX_8.jpeg", + "NJANX/NJANX_9.jpeg", + "NJANX/NJANX_10.jpeg", + "NJANX/NJANX_11.jpeg", + "NJANX/NJANX_12.jpeg", + "NJANX/NJANX_13.jpeg", + "NJANX/NJANX_14.jpeg", + "NJANX/NJANX_15.jpeg", + "NJANX/NJANX_16.jpeg", + "NJANX/NJANX_17.jpeg", + "NJANX/NJANX_18.jpeg", + "NJANX/NJANX_19.jpeg", + "NJANX/NJANX_20.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 27, + "question": "Evaluate the given images and infer the individual's next step. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Close the refrigerator.\nB. Open the refrigerator.\nC. Throw the clothes.\nD. Close the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "close the refrigerator", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "close the refrigerator", + "open the refrigerator", + "throw the clothes", + "close the closet cabinet" + ], + "image_quantity_level": "Many", + "image": [ + "Z6LYG/Z6LYG_0.jpeg", + "Z6LYG/Z6LYG_1.jpeg", + "Z6LYG/Z6LYG_2.jpeg", + "Z6LYG/Z6LYG_3.jpeg", + "Z6LYG/Z6LYG_4.jpeg", + "Z6LYG/Z6LYG_5.jpeg", + "Z6LYG/Z6LYG_6.jpeg", + "Z6LYG/Z6LYG_7.jpeg", + "Z6LYG/Z6LYG_8.jpeg", + "Z6LYG/Z6LYG_9.jpeg", + "Z6LYG/Z6LYG_10.jpeg", + "Z6LYG/Z6LYG_11.jpeg", + "Z6LYG/Z6LYG_12.jpeg", + "Z6LYG/Z6LYG_13.jpeg", + "Z6LYG/Z6LYG_14.jpeg", + "Z6LYG/Z6LYG_15.jpeg", + "Z6LYG/Z6LYG_16.jpeg", + "Z6LYG/Z6LYG_17.jpeg", + "Z6LYG/Z6LYG_18.jpeg", + "Z6LYG/Z6LYG_19.jpeg", + "Z6LYG/Z6LYG_20.jpeg", + "Z6LYG/Z6LYG_21.jpeg", + "Z6LYG/Z6LYG_22.jpeg", + "Z6LYG/Z6LYG_23.jpeg", + "Z6LYG/Z6LYG_24.jpeg", + "Z6LYG/Z6LYG_25.jpeg", + "Z6LYG/Z6LYG_26.jpeg", + "Z6LYG/Z6LYG_27.jpeg", + "Z6LYG/Z6LYG_28.jpeg", + "Z6LYG/Z6LYG_29.jpeg", + "Z6LYG/Z6LYG_30.jpeg", + "Z6LYG/Z6LYG_31.jpeg", + "Z6LYG/Z6LYG_32.jpeg", + "Z6LYG/Z6LYG_33.jpeg", + "Z6LYG/Z6LYG_34.jpeg", + "Z6LYG/Z6LYG_35.jpeg", + "Z6LYG/Z6LYG_36.jpeg", + "Z6LYG/Z6LYG_37.jpeg", + "Z6LYG/Z6LYG_38.jpeg", + "Z6LYG/Z6LYG_39.jpeg", + "Z6LYG/Z6LYG_40.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 32, + "question": "Evaluate the given images and infer the individual's next step. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Close the closet/cabinet.\nB. Put down the paper/notebook.\nC. Take the dish.\nD. Take the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "close the closet cabinet", + "put down the paper notebook", + "take the dish", + "take the food" + ], + "image_quantity_level": "Many", + "image": [ + "GPR89/GPR89_0.jpeg", + "GPR89/GPR89_1.jpeg", + "GPR89/GPR89_2.jpeg", + "GPR89/GPR89_3.jpeg", + "GPR89/GPR89_4.jpeg", + "GPR89/GPR89_5.jpeg", + "GPR89/GPR89_6.jpeg", + "GPR89/GPR89_7.jpeg", + "GPR89/GPR89_8.jpeg", + "GPR89/GPR89_9.jpeg", + "GPR89/GPR89_10.jpeg", + "GPR89/GPR89_11.jpeg", + "GPR89/GPR89_12.jpeg", + "GPR89/GPR89_13.jpeg", + "GPR89/GPR89_14.jpeg", + "GPR89/GPR89_15.jpeg", + "GPR89/GPR89_16.jpeg", + "GPR89/GPR89_17.jpeg", + "GPR89/GPR89_18.jpeg", + "GPR89/GPR89_19.jpeg", + "GPR89/GPR89_20.jpeg", + "GPR89/GPR89_21.jpeg", + "GPR89/GPR89_22.jpeg", + "GPR89/GPR89_23.jpeg", + "GPR89/GPR89_24.jpeg", + "GPR89/GPR89_25.jpeg", + "GPR89/GPR89_26.jpeg", + "GPR89/GPR89_27.jpeg", + "GPR89/GPR89_28.jpeg", + "GPR89/GPR89_29.jpeg", + "GPR89/GPR89_30.jpeg", + "GPR89/GPR89_31.jpeg", + "GPR89/GPR89_32.jpeg", + "GPR89/GPR89_33.jpeg", + "GPR89/GPR89_34.jpeg", + "GPR89/GPR89_35.jpeg", + "GPR89/GPR89_36.jpeg", + "GPR89/GPR89_37.jpeg", + "GPR89/GPR89_38.jpeg", + "GPR89/GPR89_39.jpeg", + "GPR89/GPR89_40.jpeg", + "GPR89/GPR89_41.jpeg", + "GPR89/GPR89_42.jpeg", + "GPR89/GPR89_43.jpeg", + "GPR89/GPR89_44.jpeg", + "GPR89/GPR89_45.jpeg", + "GPR89/GPR89_46.jpeg", + "GPR89/GPR89_47.jpeg", + "GPR89/GPR89_48.jpeg", + "GPR89/GPR89_49.jpeg", + "GPR89/GPR89_50.jpeg", + "GPR89/GPR89_51.jpeg", + "GPR89/GPR89_52.jpeg", + "GPR89/GPR89_53.jpeg", + "GPR89/GPR89_54.jpeg", + "GPR89/GPR89_55.jpeg", + "GPR89/GPR89_56.jpeg", + "GPR89/GPR89_57.jpeg", + "GPR89/GPR89_58.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 36, + "question": "Based on the given images, prediction the person's next action You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Close the laptop.\nB. Open the book.\nC. Sit on the bed.\nD. Put down the paper/notebook.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "close the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "close the laptop", + "open the book", + "sit on the bed", + "put down the paper notebook" + ], + "image_quantity_level": "Medium", + "image": [ + "9A58F/9A58F_0.jpeg", + "9A58F/9A58F_1.jpeg", + "9A58F/9A58F_2.jpeg", + "9A58F/9A58F_3.jpeg", + "9A58F/9A58F_4.jpeg", + "9A58F/9A58F_5.jpeg", + "9A58F/9A58F_6.jpeg", + "9A58F/9A58F_7.jpeg", + "9A58F/9A58F_8.jpeg", + "9A58F/9A58F_9.jpeg", + "9A58F/9A58F_10.jpeg", + "9A58F/9A58F_11.jpeg", + "9A58F/9A58F_12.jpeg", + "9A58F/9A58F_13.jpeg", + "9A58F/9A58F_14.jpeg", + "9A58F/9A58F_15.jpeg", + "9A58F/9A58F_16.jpeg", + "9A58F/9A58F_17.jpeg", + "9A58F/9A58F_18.jpeg", + "9A58F/9A58F_19.jpeg", + "9A58F/9A58F_20.jpeg", + "9A58F/9A58F_21.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 41, + "question": "Using the given pictures as a reference, anticipate the person's upcoming action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Throw the blanket.\nB. Eat the medicine.\nC. Lie on the sofa/couch.\nD. Put down the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "throw the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "throw the blanket", + "eat the medicine", + "lie on the sofa couch", + "put down the box" + ], + "image_quantity_level": "Medium", + "image": [ + "NM65M/NM65M_0.jpeg", + "NM65M/NM65M_1.jpeg", + "NM65M/NM65M_2.jpeg", + "NM65M/NM65M_3.jpeg", + "NM65M/NM65M_4.jpeg", + "NM65M/NM65M_5.jpeg", + "NM65M/NM65M_6.jpeg", + "NM65M/NM65M_7.jpeg", + "NM65M/NM65M_8.jpeg", + "NM65M/NM65M_9.jpeg", + "NM65M/NM65M_10.jpeg", + "NM65M/NM65M_11.jpeg", + "NM65M/NM65M_12.jpeg", + "NM65M/NM65M_13.jpeg", + "NM65M/NM65M_14.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 44, + "question": "Using the given pictures as a reference, anticipate the person's upcoming action. You must choose your answer from the Choice List.\nWhat will the person do next with the blanket?\nChoice list: \nA. Put down.\nB. Take.\nC. Hold.\nD. Throw.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down", + "take", + "hold", + "throw" + ], + "image_quantity_level": "Many", + "image": [ + "ACJBD/ACJBD_0.jpeg", + "ACJBD/ACJBD_1.jpeg", + "ACJBD/ACJBD_2.jpeg", + "ACJBD/ACJBD_3.jpeg", + "ACJBD/ACJBD_4.jpeg", + "ACJBD/ACJBD_5.jpeg", + "ACJBD/ACJBD_6.jpeg", + "ACJBD/ACJBD_7.jpeg", + "ACJBD/ACJBD_8.jpeg", + "ACJBD/ACJBD_9.jpeg", + "ACJBD/ACJBD_10.jpeg", + "ACJBD/ACJBD_11.jpeg", + "ACJBD/ACJBD_12.jpeg", + "ACJBD/ACJBD_13.jpeg", + "ACJBD/ACJBD_14.jpeg", + "ACJBD/ACJBD_15.jpeg", + "ACJBD/ACJBD_16.jpeg", + "ACJBD/ACJBD_17.jpeg", + "ACJBD/ACJBD_18.jpeg", + "ACJBD/ACJBD_19.jpeg", + "ACJBD/ACJBD_20.jpeg", + "ACJBD/ACJBD_21.jpeg", + "ACJBD/ACJBD_22.jpeg", + "ACJBD/ACJBD_23.jpeg", + "ACJBD/ACJBD_24.jpeg", + "ACJBD/ACJBD_25.jpeg", + "ACJBD/ACJBD_26.jpeg", + "ACJBD/ACJBD_27.jpeg", + "ACJBD/ACJBD_28.jpeg", + "ACJBD/ACJBD_29.jpeg", + "ACJBD/ACJBD_30.jpeg", + "ACJBD/ACJBD_31.jpeg", + "ACJBD/ACJBD_32.jpeg", + "ACJBD/ACJBD_33.jpeg", + "ACJBD/ACJBD_34.jpeg", + "ACJBD/ACJBD_35.jpeg", + "ACJBD/ACJBD_36.jpeg", + "ACJBD/ACJBD_37.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 46, + "question": "With the given visuals as your guide, infer the individual's impending action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the clothes.\nB. Tidy up the clothes.\nC. Take the bag.\nD. Lie on the sofa/couch.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "lie on the sofa couch", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "take the clothes", + "tidy up the clothes", + "take the bag", + "lie on the sofa couch" + ], + "image_quantity_level": "Medium", + "image": [ + "55AH5/55AH5_0.jpeg", + "55AH5/55AH5_1.jpeg", + "55AH5/55AH5_2.jpeg", + "55AH5/55AH5_3.jpeg", + "55AH5/55AH5_4.jpeg", + "55AH5/55AH5_5.jpeg", + "55AH5/55AH5_6.jpeg", + "55AH5/55AH5_7.jpeg", + "55AH5/55AH5_8.jpeg", + "55AH5/55AH5_9.jpeg", + "55AH5/55AH5_10.jpeg", + "55AH5/55AH5_11.jpeg", + "55AH5/55AH5_12.jpeg", + "55AH5/55AH5_13.jpeg", + "55AH5/55AH5_14.jpeg", + "55AH5/55AH5_15.jpeg", + "55AH5/55AH5_16.jpeg", + "55AH5/55AH5_17.jpeg", + "55AH5/55AH5_18.jpeg", + "55AH5/55AH5_19.jpeg", + "55AH5/55AH5_20.jpeg", + "55AH5/55AH5_21.jpeg", + "55AH5/55AH5_22.jpeg", + "55AH5/55AH5_23.jpeg", + "55AH5/55AH5_24.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 49, + "question": "Based on the given images, prediction the person's next action You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Lie on the sofa/couch.\nB. Close the box.\nC. Lie on the floor.\nD. Hold the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "lie on the sofa couch", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "lie on the sofa couch", + "close the box", + "lie on the floor", + "hold the book" + ], + "image_quantity_level": "Medium", + "image": [ + "RKGG5/RKGG5_0.jpeg", + "RKGG5/RKGG5_1.jpeg", + "RKGG5/RKGG5_2.jpeg", + "RKGG5/RKGG5_3.jpeg", + "RKGG5/RKGG5_4.jpeg", + "RKGG5/RKGG5_5.jpeg", + "RKGG5/RKGG5_6.jpeg", + "RKGG5/RKGG5_7.jpeg", + "RKGG5/RKGG5_8.jpeg", + "RKGG5/RKGG5_9.jpeg", + "RKGG5/RKGG5_10.jpeg", + "RKGG5/RKGG5_11.jpeg", + "RKGG5/RKGG5_12.jpeg", + "RKGG5/RKGG5_13.jpeg", + "RKGG5/RKGG5_14.jpeg", + "RKGG5/RKGG5_15.jpeg", + "RKGG5/RKGG5_16.jpeg", + "RKGG5/RKGG5_17.jpeg", + "RKGG5/RKGG5_18.jpeg", + "RKGG5/RKGG5_19.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 52, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the pillow.\nB. Lie on the bed.\nC. Close the window.\nD. Wash the cup/glass/bottle.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "close the window", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "take the pillow", + "lie on the bed", + "close the window", + "wash the cup glass bottle" + ], + "image_quantity_level": "Many", + "image": [ + "GLGQJ/GLGQJ_0.jpeg", + "GLGQJ/GLGQJ_1.jpeg", + "GLGQJ/GLGQJ_2.jpeg", + "GLGQJ/GLGQJ_3.jpeg", + "GLGQJ/GLGQJ_4.jpeg", + "GLGQJ/GLGQJ_5.jpeg", + "GLGQJ/GLGQJ_6.jpeg", + "GLGQJ/GLGQJ_7.jpeg", + "GLGQJ/GLGQJ_8.jpeg", + "GLGQJ/GLGQJ_9.jpeg", + "GLGQJ/GLGQJ_10.jpeg", + "GLGQJ/GLGQJ_11.jpeg", + "GLGQJ/GLGQJ_12.jpeg", + "GLGQJ/GLGQJ_13.jpeg", + "GLGQJ/GLGQJ_14.jpeg", + "GLGQJ/GLGQJ_15.jpeg", + "GLGQJ/GLGQJ_16.jpeg", + "GLGQJ/GLGQJ_17.jpeg", + "GLGQJ/GLGQJ_18.jpeg", + "GLGQJ/GLGQJ_19.jpeg", + "GLGQJ/GLGQJ_20.jpeg", + "GLGQJ/GLGQJ_21.jpeg", + "GLGQJ/GLGQJ_22.jpeg", + "GLGQJ/GLGQJ_23.jpeg", + "GLGQJ/GLGQJ_24.jpeg", + "GLGQJ/GLGQJ_25.jpeg", + "GLGQJ/GLGQJ_26.jpeg", + "GLGQJ/GLGQJ_27.jpeg", + "GLGQJ/GLGQJ_28.jpeg", + "GLGQJ/GLGQJ_29.jpeg", + "GLGQJ/GLGQJ_30.jpeg", + "GLGQJ/GLGQJ_31.jpeg", + "GLGQJ/GLGQJ_32.jpeg", + "GLGQJ/GLGQJ_33.jpeg", + "GLGQJ/GLGQJ_34.jpeg", + "GLGQJ/GLGQJ_35.jpeg", + "GLGQJ/GLGQJ_36.jpeg", + "GLGQJ/GLGQJ_37.jpeg", + "GLGQJ/GLGQJ_38.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 53, + "question": "Study the provided images and predict the individual's forthcoming behavior. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Throw the towel.\nB. Eat the sandwich.\nC. Put down the broom.\nD. Open the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "throw the towel", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "throw the towel", + "eat the sandwich", + "put down the broom", + "open the book" + ], + "image_quantity_level": "Medium", + "image": [ + "43FG9/43FG9_0.jpeg", + "43FG9/43FG9_1.jpeg", + "43FG9/43FG9_2.jpeg", + "43FG9/43FG9_3.jpeg", + "43FG9/43FG9_4.jpeg", + "43FG9/43FG9_5.jpeg", + "43FG9/43FG9_6.jpeg", + "43FG9/43FG9_7.jpeg", + "43FG9/43FG9_8.jpeg", + "43FG9/43FG9_9.jpeg", + "43FG9/43FG9_10.jpeg", + "43FG9/43FG9_11.jpeg", + "43FG9/43FG9_12.jpeg", + "43FG9/43FG9_13.jpeg", + "43FG9/43FG9_14.jpeg", + "43FG9/43FG9_15.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 59, + "question": "With the given visuals as your guide, infer the individual's impending action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Wash the cup/glass/bottle.\nB. Take the bag.\nC. Throw the pillow.\nD. Take the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "wash the cup glass bottle", + "take the bag", + "throw the pillow", + "take the food" + ], + "image_quantity_level": "Many", + "image": [ + "NLWBW/NLWBW_0.jpeg", + "NLWBW/NLWBW_1.jpeg", + "NLWBW/NLWBW_2.jpeg", + "NLWBW/NLWBW_3.jpeg", + "NLWBW/NLWBW_4.jpeg", + "NLWBW/NLWBW_5.jpeg", + "NLWBW/NLWBW_6.jpeg", + "NLWBW/NLWBW_7.jpeg", + "NLWBW/NLWBW_8.jpeg", + "NLWBW/NLWBW_9.jpeg", + "NLWBW/NLWBW_10.jpeg", + "NLWBW/NLWBW_11.jpeg", + "NLWBW/NLWBW_12.jpeg", + "NLWBW/NLWBW_13.jpeg", + "NLWBW/NLWBW_14.jpeg", + "NLWBW/NLWBW_15.jpeg", + "NLWBW/NLWBW_16.jpeg", + "NLWBW/NLWBW_17.jpeg", + "NLWBW/NLWBW_18.jpeg", + "NLWBW/NLWBW_19.jpeg", + "NLWBW/NLWBW_20.jpeg", + "NLWBW/NLWBW_21.jpeg", + "NLWBW/NLWBW_22.jpeg", + "NLWBW/NLWBW_23.jpeg", + "NLWBW/NLWBW_24.jpeg", + "NLWBW/NLWBW_25.jpeg", + "NLWBW/NLWBW_26.jpeg", + "NLWBW/NLWBW_27.jpeg", + "NLWBW/NLWBW_28.jpeg", + "NLWBW/NLWBW_29.jpeg", + "NLWBW/NLWBW_30.jpeg", + "NLWBW/NLWBW_31.jpeg", + "NLWBW/NLWBW_32.jpeg", + "NLWBW/NLWBW_33.jpeg", + "NLWBW/NLWBW_34.jpeg", + "NLWBW/NLWBW_35.jpeg", + "NLWBW/NLWBW_36.jpeg", + "NLWBW/NLWBW_37.jpeg", + "NLWBW/NLWBW_38.jpeg", + "NLWBW/NLWBW_39.jpeg", + "NLWBW/NLWBW_40.jpeg", + "NLWBW/NLWBW_41.jpeg", + "NLWBW/NLWBW_42.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 61, + "question": "Study the provided images and predict the individual's forthcoming behavior. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Wash the table.\nB. Eat the medicine.\nC. Throw the shoe.\nD. Take the blanket.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "wash the table", + "eat the medicine", + "throw the shoe", + "take the blanket" + ], + "image_quantity_level": "Many", + "image": [ + "OEE36/OEE36_0.jpeg", + "OEE36/OEE36_1.jpeg", + "OEE36/OEE36_2.jpeg", + "OEE36/OEE36_3.jpeg", + "OEE36/OEE36_4.jpeg", + "OEE36/OEE36_5.jpeg", + "OEE36/OEE36_6.jpeg", + "OEE36/OEE36_7.jpeg", + "OEE36/OEE36_8.jpeg", + "OEE36/OEE36_9.jpeg", + "OEE36/OEE36_10.jpeg", + "OEE36/OEE36_11.jpeg", + "OEE36/OEE36_12.jpeg", + "OEE36/OEE36_13.jpeg", + "OEE36/OEE36_14.jpeg", + "OEE36/OEE36_15.jpeg", + "OEE36/OEE36_16.jpeg", + "OEE36/OEE36_17.jpeg", + "OEE36/OEE36_18.jpeg", + "OEE36/OEE36_19.jpeg", + "OEE36/OEE36_20.jpeg", + "OEE36/OEE36_21.jpeg", + "OEE36/OEE36_22.jpeg", + "OEE36/OEE36_23.jpeg", + "OEE36/OEE36_24.jpeg", + "OEE36/OEE36_25.jpeg", + "OEE36/OEE36_26.jpeg", + "OEE36/OEE36_27.jpeg", + "OEE36/OEE36_28.jpeg", + "OEE36/OEE36_29.jpeg", + "OEE36/OEE36_30.jpeg", + "OEE36/OEE36_31.jpeg", + "OEE36/OEE36_32.jpeg", + "OEE36/OEE36_33.jpeg", + "OEE36/OEE36_34.jpeg", + "OEE36/OEE36_35.jpeg", + "OEE36/OEE36_36.jpeg", + "OEE36/OEE36_37.jpeg", + "OEE36/OEE36_38.jpeg", + "OEE36/OEE36_39.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 70, + "question": "Infer the next action of the person from the presented images. You must choose your answer from the Choice List.\nWhat will the person do next with the cup/glass/bottle?\nChoice list: \nA. Put down.\nB. Wash.\nC. Close.\nD. Take.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down", + "wash", + "close", + "take" + ], + "image_quantity_level": "Medium", + "image": [ + "E7Q3R/E7Q3R_0.jpeg", + "E7Q3R/E7Q3R_1.jpeg", + "E7Q3R/E7Q3R_2.jpeg", + "E7Q3R/E7Q3R_3.jpeg", + "E7Q3R/E7Q3R_4.jpeg", + "E7Q3R/E7Q3R_5.jpeg", + "E7Q3R/E7Q3R_6.jpeg", + "E7Q3R/E7Q3R_7.jpeg", + "E7Q3R/E7Q3R_8.jpeg", + "E7Q3R/E7Q3R_9.jpeg", + "E7Q3R/E7Q3R_10.jpeg", + "E7Q3R/E7Q3R_11.jpeg", + "E7Q3R/E7Q3R_12.jpeg", + "E7Q3R/E7Q3R_13.jpeg", + "E7Q3R/E7Q3R_14.jpeg", + "E7Q3R/E7Q3R_15.jpeg", + "E7Q3R/E7Q3R_16.jpeg", + "E7Q3R/E7Q3R_17.jpeg", + "E7Q3R/E7Q3R_18.jpeg", + "E7Q3R/E7Q3R_19.jpeg", + "E7Q3R/E7Q3R_20.jpeg", + "E7Q3R/E7Q3R_21.jpeg", + "E7Q3R/E7Q3R_22.jpeg", + "E7Q3R/E7Q3R_23.jpeg", + "E7Q3R/E7Q3R_24.jpeg", + "E7Q3R/E7Q3R_25.jpeg", + "E7Q3R/E7Q3R_26.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 141, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the clothes.\nB. Wash the mirror.\nC. Take the bag.\nD. Put down the phone/camera.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the phone camera", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "put down the clothes", + "wash the mirror", + "take the bag", + "put down the phone camera" + ], + "image_quantity_level": "Medium", + "image": [ + "NKCXF/NKCXF_0.jpeg", + "NKCXF/NKCXF_1.jpeg", + "NKCXF/NKCXF_2.jpeg", + "NKCXF/NKCXF_3.jpeg", + "NKCXF/NKCXF_4.jpeg", + "NKCXF/NKCXF_5.jpeg", + "NKCXF/NKCXF_6.jpeg", + "NKCXF/NKCXF_7.jpeg", + "NKCXF/NKCXF_8.jpeg", + "NKCXF/NKCXF_9.jpeg", + "NKCXF/NKCXF_10.jpeg", + "NKCXF/NKCXF_11.jpeg", + "NKCXF/NKCXF_12.jpeg", + "NKCXF/NKCXF_13.jpeg", + "NKCXF/NKCXF_14.jpeg", + "NKCXF/NKCXF_15.jpeg", + "NKCXF/NKCXF_16.jpeg", + "NKCXF/NKCXF_17.jpeg", + "NKCXF/NKCXF_18.jpeg", + "NKCXF/NKCXF_19.jpeg", + "NKCXF/NKCXF_20.jpeg", + "NKCXF/NKCXF_21.jpeg", + "NKCXF/NKCXF_22.jpeg", + "NKCXF/NKCXF_23.jpeg", + "NKCXF/NKCXF_24.jpeg", + "NKCXF/NKCXF_25.jpeg", + "NKCXF/NKCXF_26.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 82, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Close the refrigerator.\nB. Take the book.\nC. Open the bag.\nD. Tidy up the blanket.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "close the refrigerator", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "close the refrigerator", + "take the book", + "open the bag", + "tidy up the blanket" + ], + "image_quantity_level": "Medium", + "image": [ + "3ZC4Y/3ZC4Y_0.jpeg", + "3ZC4Y/3ZC4Y_1.jpeg", + "3ZC4Y/3ZC4Y_2.jpeg", + "3ZC4Y/3ZC4Y_3.jpeg", + "3ZC4Y/3ZC4Y_4.jpeg", + "3ZC4Y/3ZC4Y_5.jpeg", + "3ZC4Y/3ZC4Y_6.jpeg", + "3ZC4Y/3ZC4Y_7.jpeg", + "3ZC4Y/3ZC4Y_8.jpeg", + "3ZC4Y/3ZC4Y_9.jpeg", + "3ZC4Y/3ZC4Y_10.jpeg", + "3ZC4Y/3ZC4Y_11.jpeg", + "3ZC4Y/3ZC4Y_12.jpeg", + "3ZC4Y/3ZC4Y_13.jpeg", + "3ZC4Y/3ZC4Y_14.jpeg", + "3ZC4Y/3ZC4Y_15.jpeg", + "3ZC4Y/3ZC4Y_16.jpeg", + "3ZC4Y/3ZC4Y_17.jpeg", + "3ZC4Y/3ZC4Y_18.jpeg", + "3ZC4Y/3ZC4Y_19.jpeg", + "3ZC4Y/3ZC4Y_20.jpeg", + "3ZC4Y/3ZC4Y_21.jpeg", + "3ZC4Y/3ZC4Y_22.jpeg", + "3ZC4Y/3ZC4Y_23.jpeg", + "3ZC4Y/3ZC4Y_24.jpeg", + "3ZC4Y/3ZC4Y_25.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 89, + "question": "Using the given pictures as a reference, anticipate the person's upcoming action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the bag.\nB. Throw the shoe.\nC. Put down the cup/glass/bottle.\nD. Put down the towel.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the towel", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "take the bag", + "throw the shoe", + "put down the cup glass bottle", + "put down the towel" + ], + "image_quantity_level": "Many", + "image": [ + "2544C/2544C_0.jpeg", + "2544C/2544C_1.jpeg", + "2544C/2544C_2.jpeg", + "2544C/2544C_3.jpeg", + "2544C/2544C_4.jpeg", + "2544C/2544C_5.jpeg", + "2544C/2544C_6.jpeg", + "2544C/2544C_7.jpeg", + "2544C/2544C_8.jpeg", + "2544C/2544C_9.jpeg", + "2544C/2544C_10.jpeg", + "2544C/2544C_11.jpeg", + "2544C/2544C_12.jpeg", + "2544C/2544C_13.jpeg", + "2544C/2544C_14.jpeg", + "2544C/2544C_15.jpeg", + "2544C/2544C_16.jpeg", + "2544C/2544C_17.jpeg", + "2544C/2544C_18.jpeg", + "2544C/2544C_19.jpeg", + "2544C/2544C_20.jpeg", + "2544C/2544C_21.jpeg", + "2544C/2544C_22.jpeg", + "2544C/2544C_23.jpeg", + "2544C/2544C_24.jpeg", + "2544C/2544C_25.jpeg", + "2544C/2544C_26.jpeg", + "2544C/2544C_27.jpeg", + "2544C/2544C_28.jpeg", + "2544C/2544C_29.jpeg", + "2544C/2544C_30.jpeg", + "2544C/2544C_31.jpeg", + "2544C/2544C_32.jpeg", + "2544C/2544C_33.jpeg", + "2544C/2544C_34.jpeg", + "2544C/2544C_35.jpeg", + "2544C/2544C_36.jpeg", + "2544C/2544C_37.jpeg", + "2544C/2544C_38.jpeg", + "2544C/2544C_39.jpeg", + "2544C/2544C_40.jpeg", + "2544C/2544C_41.jpeg", + "2544C/2544C_42.jpeg", + "2544C/2544C_43.jpeg", + "2544C/2544C_44.jpeg", + "2544C/2544C_45.jpeg", + "2544C/2544C_46.jpeg", + "2544C/2544C_47.jpeg", + "2544C/2544C_48.jpeg", + "2544C/2544C_49.jpeg", + "2544C/2544C_50.jpeg", + "2544C/2544C_51.jpeg", + "2544C/2544C_52.jpeg", + "2544C/2544C_53.jpeg", + "2544C/2544C_54.jpeg", + "2544C/2544C_55.jpeg", + "2544C/2544C_56.jpeg", + "2544C/2544C_57.jpeg", + "2544C/2544C_58.jpeg", + "2544C/2544C_59.jpeg", + "2544C/2544C_60.jpeg", + "2544C/2544C_61.jpeg", + "2544C/2544C_62.jpeg", + "2544C/2544C_63.jpeg", + "2544C/2544C_64.jpeg", + "2544C/2544C_65.jpeg", + "2544C/2544C_66.jpeg", + "2544C/2544C_67.jpeg", + "2544C/2544C_68.jpeg", + "2544C/2544C_69.jpeg", + "2544C/2544C_70.jpeg", + "2544C/2544C_71.jpeg", + "2544C/2544C_72.jpeg", + "2544C/2544C_73.jpeg", + "2544C/2544C_74.jpeg", + "2544C/2544C_75.jpeg", + "2544C/2544C_76.jpeg", + "2544C/2544C_77.jpeg", + "2544C/2544C_78.jpeg", + "2544C/2544C_79.jpeg", + "2544C/2544C_80.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 98, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Tidy up the towel.\nB. Hold the blanket.\nC. Eat the sandwich.\nD. Take the dish.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "eat the sandwich", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "tidy up the towel", + "hold the blanket", + "eat the sandwich", + "take the dish" + ], + "image_quantity_level": "Medium", + "image": [ + "SPUPH/SPUPH_0.jpeg", + "SPUPH/SPUPH_1.jpeg", + "SPUPH/SPUPH_2.jpeg", + "SPUPH/SPUPH_3.jpeg", + "SPUPH/SPUPH_4.jpeg", + "SPUPH/SPUPH_5.jpeg", + "SPUPH/SPUPH_6.jpeg", + "SPUPH/SPUPH_7.jpeg", + "SPUPH/SPUPH_8.jpeg", + "SPUPH/SPUPH_9.jpeg", + "SPUPH/SPUPH_10.jpeg", + "SPUPH/SPUPH_11.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 111, + "question": "Analyze the provided visuals and forecast the individual's subsequent move. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the clothes.\nB. Put down the picture.\nC. Tidy up the blanket.\nD. Open the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "open the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "take the clothes", + "put down the picture", + "tidy up the blanket", + "open the bag" + ], + "image_quantity_level": "Many", + "image": [ + "YVH4J/YVH4J_0.jpeg", + "YVH4J/YVH4J_1.jpeg", + "YVH4J/YVH4J_2.jpeg", + "YVH4J/YVH4J_3.jpeg", + "YVH4J/YVH4J_4.jpeg", + "YVH4J/YVH4J_5.jpeg", + "YVH4J/YVH4J_6.jpeg", + "YVH4J/YVH4J_7.jpeg", + "YVH4J/YVH4J_8.jpeg", + "YVH4J/YVH4J_9.jpeg", + "YVH4J/YVH4J_10.jpeg", + "YVH4J/YVH4J_11.jpeg", + "YVH4J/YVH4J_12.jpeg", + "YVH4J/YVH4J_13.jpeg", + "YVH4J/YVH4J_14.jpeg", + "YVH4J/YVH4J_15.jpeg", + "YVH4J/YVH4J_16.jpeg", + "YVH4J/YVH4J_17.jpeg", + "YVH4J/YVH4J_18.jpeg", + "YVH4J/YVH4J_19.jpeg", + "YVH4J/YVH4J_20.jpeg", + "YVH4J/YVH4J_21.jpeg", + "YVH4J/YVH4J_22.jpeg", + "YVH4J/YVH4J_23.jpeg", + "YVH4J/YVH4J_24.jpeg", + "YVH4J/YVH4J_25.jpeg", + "YVH4J/YVH4J_26.jpeg", + "YVH4J/YVH4J_27.jpeg", + "YVH4J/YVH4J_28.jpeg", + "YVH4J/YVH4J_29.jpeg", + "YVH4J/YVH4J_30.jpeg", + "YVH4J/YVH4J_31.jpeg", + "YVH4J/YVH4J_32.jpeg", + "YVH4J/YVH4J_33.jpeg", + "YVH4J/YVH4J_34.jpeg", + "YVH4J/YVH4J_35.jpeg", + "YVH4J/YVH4J_36.jpeg", + "YVH4J/YVH4J_37.jpeg", + "YVH4J/YVH4J_38.jpeg", + "YVH4J/YVH4J_39.jpeg", + "YVH4J/YVH4J_40.jpeg", + "YVH4J/YVH4J_41.jpeg", + "YVH4J/YVH4J_42.jpeg", + "YVH4J/YVH4J_43.jpeg", + "YVH4J/YVH4J_44.jpeg", + "YVH4J/YVH4J_45.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 133, + "question": "Based on the given images, prediction the person's next action You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Close the refrigerator.\nB. Open the door.\nC. Tidy up the clothes.\nD. Take the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "close the refrigerator", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "close the refrigerator", + "open the door", + "tidy up the clothes", + "take the book" + ], + "image_quantity_level": "Medium", + "image": [ + "X5YL3/X5YL3_0.jpeg", + "X5YL3/X5YL3_1.jpeg", + "X5YL3/X5YL3_2.jpeg", + "X5YL3/X5YL3_3.jpeg", + "X5YL3/X5YL3_4.jpeg", + "X5YL3/X5YL3_5.jpeg", + "X5YL3/X5YL3_6.jpeg", + "X5YL3/X5YL3_7.jpeg", + "X5YL3/X5YL3_8.jpeg", + "X5YL3/X5YL3_9.jpeg", + "X5YL3/X5YL3_10.jpeg", + "X5YL3/X5YL3_11.jpeg", + "X5YL3/X5YL3_12.jpeg", + "X5YL3/X5YL3_13.jpeg", + "X5YL3/X5YL3_14.jpeg", + "X5YL3/X5YL3_15.jpeg", + "X5YL3/X5YL3_16.jpeg", + "X5YL3/X5YL3_17.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 134, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Close the book.\nB. Take the pillow.\nC. Put down the picture.\nD. Close the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the pillow", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "close the book", + "take the pillow", + "put down the picture", + "close the box" + ], + "image_quantity_level": "Many", + "image": [ + "RBC8N/RBC8N_0.jpeg", + "RBC8N/RBC8N_1.jpeg", + "RBC8N/RBC8N_2.jpeg", + "RBC8N/RBC8N_3.jpeg", + "RBC8N/RBC8N_4.jpeg", + "RBC8N/RBC8N_5.jpeg", + "RBC8N/RBC8N_6.jpeg", + "RBC8N/RBC8N_7.jpeg", + "RBC8N/RBC8N_8.jpeg", + "RBC8N/RBC8N_9.jpeg", + "RBC8N/RBC8N_10.jpeg", + "RBC8N/RBC8N_11.jpeg", + "RBC8N/RBC8N_12.jpeg", + "RBC8N/RBC8N_13.jpeg", + "RBC8N/RBC8N_14.jpeg", + "RBC8N/RBC8N_15.jpeg", + "RBC8N/RBC8N_16.jpeg", + "RBC8N/RBC8N_17.jpeg", + "RBC8N/RBC8N_18.jpeg", + "RBC8N/RBC8N_19.jpeg", + "RBC8N/RBC8N_20.jpeg", + "RBC8N/RBC8N_21.jpeg", + "RBC8N/RBC8N_22.jpeg", + "RBC8N/RBC8N_23.jpeg", + "RBC8N/RBC8N_24.jpeg", + "RBC8N/RBC8N_25.jpeg", + "RBC8N/RBC8N_26.jpeg", + "RBC8N/RBC8N_27.jpeg", + "RBC8N/RBC8N_28.jpeg", + "RBC8N/RBC8N_29.jpeg", + "RBC8N/RBC8N_30.jpeg", + "RBC8N/RBC8N_31.jpeg", + "RBC8N/RBC8N_32.jpeg", + "RBC8N/RBC8N_33.jpeg", + "RBC8N/RBC8N_34.jpeg", + "RBC8N/RBC8N_35.jpeg", + "RBC8N/RBC8N_36.jpeg", + "RBC8N/RBC8N_37.jpeg", + "RBC8N/RBC8N_38.jpeg", + "RBC8N/RBC8N_39.jpeg", + "RBC8N/RBC8N_40.jpeg", + "RBC8N/RBC8N_41.jpeg", + "RBC8N/RBC8N_42.jpeg", + "RBC8N/RBC8N_43.jpeg", + "RBC8N/RBC8N_44.jpeg", + "RBC8N/RBC8N_45.jpeg", + "RBC8N/RBC8N_46.jpeg", + "RBC8N/RBC8N_47.jpeg", + "RBC8N/RBC8N_48.jpeg", + "RBC8N/RBC8N_49.jpeg", + "RBC8N/RBC8N_50.jpeg", + "RBC8N/RBC8N_51.jpeg", + "RBC8N/RBC8N_52.jpeg", + "RBC8N/RBC8N_53.jpeg", + "RBC8N/RBC8N_54.jpeg", + "RBC8N/RBC8N_55.jpeg", + "RBC8N/RBC8N_56.jpeg", + "RBC8N/RBC8N_57.jpeg", + "RBC8N/RBC8N_58.jpeg", + "RBC8N/RBC8N_59.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 138, + "question": "Based on the given images, prediction the person's next action You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the laptop.\nB. Take the phone/camera.\nC. Take the clothes.\nD. Open the laptop.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the laptop", + "take the phone camera", + "take the clothes", + "open the laptop" + ], + "image_quantity_level": "Many", + "image": [ + "OY3LS/OY3LS_0.jpeg", + "OY3LS/OY3LS_1.jpeg", + "OY3LS/OY3LS_2.jpeg", + "OY3LS/OY3LS_3.jpeg", + "OY3LS/OY3LS_4.jpeg", + "OY3LS/OY3LS_5.jpeg", + "OY3LS/OY3LS_6.jpeg", + "OY3LS/OY3LS_7.jpeg", + "OY3LS/OY3LS_8.jpeg", + "OY3LS/OY3LS_9.jpeg", + "OY3LS/OY3LS_10.jpeg", + "OY3LS/OY3LS_11.jpeg", + "OY3LS/OY3LS_12.jpeg", + "OY3LS/OY3LS_13.jpeg", + "OY3LS/OY3LS_14.jpeg", + "OY3LS/OY3LS_15.jpeg", + "OY3LS/OY3LS_16.jpeg", + "OY3LS/OY3LS_17.jpeg", + "OY3LS/OY3LS_18.jpeg", + "OY3LS/OY3LS_19.jpeg", + "OY3LS/OY3LS_20.jpeg", + "OY3LS/OY3LS_21.jpeg", + "OY3LS/OY3LS_22.jpeg", + "OY3LS/OY3LS_23.jpeg", + "OY3LS/OY3LS_24.jpeg", + "OY3LS/OY3LS_25.jpeg", + "OY3LS/OY3LS_26.jpeg", + "OY3LS/OY3LS_27.jpeg", + "OY3LS/OY3LS_28.jpeg", + "OY3LS/OY3LS_29.jpeg", + "OY3LS/OY3LS_30.jpeg", + "OY3LS/OY3LS_31.jpeg", + "OY3LS/OY3LS_32.jpeg", + "OY3LS/OY3LS_33.jpeg", + "OY3LS/OY3LS_34.jpeg", + "OY3LS/OY3LS_35.jpeg", + "OY3LS/OY3LS_36.jpeg", + "OY3LS/OY3LS_37.jpeg", + "OY3LS/OY3LS_38.jpeg", + "OY3LS/OY3LS_39.jpeg", + "OY3LS/OY3LS_40.jpeg", + "OY3LS/OY3LS_41.jpeg", + "OY3LS/OY3LS_42.jpeg", + "OY3LS/OY3LS_43.jpeg", + "OY3LS/OY3LS_44.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 150, + "question": "Using the given pictures as a reference, anticipate the person's upcoming action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the closet/cabinet.\nB. Wash the clothes.\nC. Take the book.\nD. Open the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "open the closet cabinet", + "wash the clothes", + "take the book", + "open the book" + ], + "image_quantity_level": "Medium", + "image": [ + "5XKVP/5XKVP_0.jpeg", + "5XKVP/5XKVP_1.jpeg", + "5XKVP/5XKVP_2.jpeg", + "5XKVP/5XKVP_3.jpeg", + "5XKVP/5XKVP_4.jpeg", + "5XKVP/5XKVP_5.jpeg", + "5XKVP/5XKVP_6.jpeg", + "5XKVP/5XKVP_7.jpeg", + "5XKVP/5XKVP_8.jpeg", + "5XKVP/5XKVP_9.jpeg", + "5XKVP/5XKVP_10.jpeg", + "5XKVP/5XKVP_11.jpeg", + "5XKVP/5XKVP_12.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 158, + "question": "Evaluate the given images and infer the individual's next step. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the box.\nB. Take the dish.\nC. Put down the sandwich.\nD. Sit on the sofa/couch.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sit on the sofa couch", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "open the box", + "take the dish", + "put down the sandwich", + "sit on the sofa couch" + ], + "image_quantity_level": "Medium", + "image": [ + "IWTWJ/IWTWJ_0.jpeg", + "IWTWJ/IWTWJ_1.jpeg", + "IWTWJ/IWTWJ_2.jpeg", + "IWTWJ/IWTWJ_3.jpeg", + "IWTWJ/IWTWJ_4.jpeg", + "IWTWJ/IWTWJ_5.jpeg", + "IWTWJ/IWTWJ_6.jpeg", + "IWTWJ/IWTWJ_7.jpeg", + "IWTWJ/IWTWJ_8.jpeg", + "IWTWJ/IWTWJ_9.jpeg", + "IWTWJ/IWTWJ_10.jpeg", + "IWTWJ/IWTWJ_11.jpeg", + "IWTWJ/IWTWJ_12.jpeg", + "IWTWJ/IWTWJ_13.jpeg", + "IWTWJ/IWTWJ_14.jpeg", + "IWTWJ/IWTWJ_15.jpeg", + "IWTWJ/IWTWJ_16.jpeg", + "IWTWJ/IWTWJ_17.jpeg", + "IWTWJ/IWTWJ_18.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 164, + "question": "Study the provided images and predict the individual's forthcoming behavior. You must choose your answer from the Choice List.\nWhat will the person do next with the bed?\nChoice list: \nA. Eat.\nB. Take.\nC. Lie on.\nD. Sit on.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "lie on", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "eat", + "take", + "lie on", + "sit on" + ], + "image_quantity_level": "Medium", + "image": [ + "31LW5/31LW5_0.jpeg", + "31LW5/31LW5_1.jpeg", + "31LW5/31LW5_2.jpeg", + "31LW5/31LW5_3.jpeg", + "31LW5/31LW5_4.jpeg", + "31LW5/31LW5_5.jpeg", + "31LW5/31LW5_6.jpeg", + "31LW5/31LW5_7.jpeg", + "31LW5/31LW5_8.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 171, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Eat the medicine.\nB. Put down the food.\nC. Close the book.\nD. Put down the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "close the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "eat the medicine", + "put down the food", + "close the book", + "put down the book" + ], + "image_quantity_level": "Many", + "image": [ + "NW0KT/NW0KT_0.jpeg", + "NW0KT/NW0KT_1.jpeg", + "NW0KT/NW0KT_2.jpeg", + "NW0KT/NW0KT_3.jpeg", + "NW0KT/NW0KT_4.jpeg", + "NW0KT/NW0KT_5.jpeg", + "NW0KT/NW0KT_6.jpeg", + "NW0KT/NW0KT_7.jpeg", + "NW0KT/NW0KT_8.jpeg", + "NW0KT/NW0KT_9.jpeg", + "NW0KT/NW0KT_10.jpeg", + "NW0KT/NW0KT_11.jpeg", + "NW0KT/NW0KT_12.jpeg", + "NW0KT/NW0KT_13.jpeg", + "NW0KT/NW0KT_14.jpeg", + "NW0KT/NW0KT_15.jpeg", + "NW0KT/NW0KT_16.jpeg", + "NW0KT/NW0KT_17.jpeg", + "NW0KT/NW0KT_18.jpeg", + "NW0KT/NW0KT_19.jpeg", + "NW0KT/NW0KT_20.jpeg", + "NW0KT/NW0KT_21.jpeg", + "NW0KT/NW0KT_22.jpeg", + "NW0KT/NW0KT_23.jpeg", + "NW0KT/NW0KT_24.jpeg", + "NW0KT/NW0KT_25.jpeg", + "NW0KT/NW0KT_26.jpeg", + "NW0KT/NW0KT_27.jpeg", + "NW0KT/NW0KT_28.jpeg", + "NW0KT/NW0KT_29.jpeg", + "NW0KT/NW0KT_30.jpeg", + "NW0KT/NW0KT_31.jpeg", + "NW0KT/NW0KT_32.jpeg", + "NW0KT/NW0KT_33.jpeg", + "NW0KT/NW0KT_34.jpeg", + "NW0KT/NW0KT_35.jpeg", + "NW0KT/NW0KT_36.jpeg", + "NW0KT/NW0KT_37.jpeg", + "NW0KT/NW0KT_38.jpeg", + "NW0KT/NW0KT_39.jpeg", + "NW0KT/NW0KT_40.jpeg", + "NW0KT/NW0KT_41.jpeg", + "NW0KT/NW0KT_42.jpeg", + "NW0KT/NW0KT_43.jpeg", + "NW0KT/NW0KT_44.jpeg", + "NW0KT/NW0KT_45.jpeg", + "NW0KT/NW0KT_46.jpeg", + "NW0KT/NW0KT_47.jpeg", + "NW0KT/NW0KT_48.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 174, + "question": "With the given visuals as your guide, infer the individual's impending action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Lie on the floor.\nB. Close the refrigerator.\nC. Open the box.\nD. Take the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "close the refrigerator", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "lie on the floor", + "close the refrigerator", + "open the box", + "take the book" + ], + "image_quantity_level": "Many", + "image": [ + "L8Y8D/L8Y8D_0.jpeg", + "L8Y8D/L8Y8D_1.jpeg", + "L8Y8D/L8Y8D_2.jpeg", + "L8Y8D/L8Y8D_3.jpeg", + "L8Y8D/L8Y8D_4.jpeg", + "L8Y8D/L8Y8D_5.jpeg", + "L8Y8D/L8Y8D_6.jpeg", + "L8Y8D/L8Y8D_7.jpeg", + "L8Y8D/L8Y8D_8.jpeg", + "L8Y8D/L8Y8D_9.jpeg", + "L8Y8D/L8Y8D_10.jpeg", + "L8Y8D/L8Y8D_11.jpeg", + "L8Y8D/L8Y8D_12.jpeg", + "L8Y8D/L8Y8D_13.jpeg", + "L8Y8D/L8Y8D_14.jpeg", + "L8Y8D/L8Y8D_15.jpeg", + "L8Y8D/L8Y8D_16.jpeg", + "L8Y8D/L8Y8D_17.jpeg", + "L8Y8D/L8Y8D_18.jpeg", + "L8Y8D/L8Y8D_19.jpeg", + "L8Y8D/L8Y8D_20.jpeg", + "L8Y8D/L8Y8D_21.jpeg", + "L8Y8D/L8Y8D_22.jpeg", + "L8Y8D/L8Y8D_23.jpeg", + "L8Y8D/L8Y8D_24.jpeg", + "L8Y8D/L8Y8D_25.jpeg", + "L8Y8D/L8Y8D_26.jpeg", + "L8Y8D/L8Y8D_27.jpeg", + "L8Y8D/L8Y8D_28.jpeg", + "L8Y8D/L8Y8D_29.jpeg", + "L8Y8D/L8Y8D_30.jpeg", + "L8Y8D/L8Y8D_31.jpeg", + "L8Y8D/L8Y8D_32.jpeg", + "L8Y8D/L8Y8D_33.jpeg", + "L8Y8D/L8Y8D_34.jpeg", + "L8Y8D/L8Y8D_35.jpeg", + "L8Y8D/L8Y8D_36.jpeg", + "L8Y8D/L8Y8D_37.jpeg", + "L8Y8D/L8Y8D_38.jpeg", + "L8Y8D/L8Y8D_39.jpeg", + "L8Y8D/L8Y8D_40.jpeg", + "L8Y8D/L8Y8D_41.jpeg", + "L8Y8D/L8Y8D_42.jpeg", + "L8Y8D/L8Y8D_43.jpeg", + "L8Y8D/L8Y8D_44.jpeg", + "L8Y8D/L8Y8D_45.jpeg", + "L8Y8D/L8Y8D_46.jpeg", + "L8Y8D/L8Y8D_47.jpeg", + "L8Y8D/L8Y8D_48.jpeg", + "L8Y8D/L8Y8D_49.jpeg", + "L8Y8D/L8Y8D_50.jpeg", + "L8Y8D/L8Y8D_51.jpeg", + "L8Y8D/L8Y8D_52.jpeg", + "L8Y8D/L8Y8D_53.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 194, + "question": "Using the given pictures as a reference, anticipate the person's upcoming action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the dish.\nB. Eat the medicine.\nC. Put down the clothes.\nD. Take the paper/notebook.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the dish", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "take the dish", + "eat the medicine", + "put down the clothes", + "take the paper notebook" + ], + "image_quantity_level": "Medium", + "image": [ + "5T0NX/5T0NX_0.jpeg", + "5T0NX/5T0NX_1.jpeg", + "5T0NX/5T0NX_2.jpeg", + "5T0NX/5T0NX_3.jpeg", + "5T0NX/5T0NX_4.jpeg", + "5T0NX/5T0NX_5.jpeg", + "5T0NX/5T0NX_6.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 62, + "question": "With the given visuals as your guide, infer the individual's impending action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Eat the medicine.\nB. Take the box.\nC. Take the book.\nD. Take the dish.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "eat the medicine", + "take the box", + "take the book", + "take the dish" + ], + "image_quantity_level": "Many", + "image": [ + "VWAEL/VWAEL_0.jpeg", + "VWAEL/VWAEL_1.jpeg", + "VWAEL/VWAEL_2.jpeg", + "VWAEL/VWAEL_3.jpeg", + "VWAEL/VWAEL_4.jpeg", + "VWAEL/VWAEL_5.jpeg", + "VWAEL/VWAEL_6.jpeg", + "VWAEL/VWAEL_7.jpeg", + "VWAEL/VWAEL_8.jpeg", + "VWAEL/VWAEL_9.jpeg", + "VWAEL/VWAEL_10.jpeg", + "VWAEL/VWAEL_11.jpeg", + "VWAEL/VWAEL_12.jpeg", + "VWAEL/VWAEL_13.jpeg", + "VWAEL/VWAEL_14.jpeg", + "VWAEL/VWAEL_15.jpeg", + "VWAEL/VWAEL_16.jpeg", + "VWAEL/VWAEL_17.jpeg", + "VWAEL/VWAEL_18.jpeg", + "VWAEL/VWAEL_19.jpeg", + "VWAEL/VWAEL_20.jpeg", + "VWAEL/VWAEL_21.jpeg", + "VWAEL/VWAEL_22.jpeg", + "VWAEL/VWAEL_23.jpeg", + "VWAEL/VWAEL_24.jpeg", + "VWAEL/VWAEL_25.jpeg", + "VWAEL/VWAEL_26.jpeg", + "VWAEL/VWAEL_27.jpeg", + "VWAEL/VWAEL_28.jpeg", + "VWAEL/VWAEL_29.jpeg", + "VWAEL/VWAEL_30.jpeg", + "VWAEL/VWAEL_31.jpeg", + "VWAEL/VWAEL_32.jpeg", + "VWAEL/VWAEL_33.jpeg", + "VWAEL/VWAEL_34.jpeg", + "VWAEL/VWAEL_35.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 40, + "question": "Study the provided images and predict the individual's forthcoming behavior. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Eat the medicine.\nB. Take the shoe.\nC. Take the clothes.\nD. Throw the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "eat the medicine", + "take the shoe", + "take the clothes", + "throw the food" + ], + "image_quantity_level": "Medium", + "image": [ + "8LAK1/8LAK1_0.jpeg", + "8LAK1/8LAK1_1.jpeg", + "8LAK1/8LAK1_2.jpeg", + "8LAK1/8LAK1_3.jpeg", + "8LAK1/8LAK1_4.jpeg", + "8LAK1/8LAK1_5.jpeg", + "8LAK1/8LAK1_6.jpeg", + "8LAK1/8LAK1_7.jpeg", + "8LAK1/8LAK1_8.jpeg", + "8LAK1/8LAK1_9.jpeg", + "8LAK1/8LAK1_10.jpeg", + "8LAK1/8LAK1_11.jpeg", + "8LAK1/8LAK1_12.jpeg", + "8LAK1/8LAK1_13.jpeg", + "8LAK1/8LAK1_14.jpeg", + "8LAK1/8LAK1_15.jpeg", + "8LAK1/8LAK1_16.jpeg", + "8LAK1/8LAK1_17.jpeg", + "8LAK1/8LAK1_18.jpeg", + "8LAK1/8LAK1_19.jpeg", + "8LAK1/8LAK1_20.jpeg", + "8LAK1/8LAK1_21.jpeg", + "8LAK1/8LAK1_22.jpeg", + "8LAK1/8LAK1_23.jpeg", + "8LAK1/8LAK1_24.jpeg", + "8LAK1/8LAK1_25.jpeg", + "8LAK1/8LAK1_26.jpeg", + "8LAK1/8LAK1_27.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 64, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the phone/camera.\nB. Close the book.\nC. Take the towel.\nD. Throw the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the towel", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "take the phone camera", + "close the book", + "take the towel", + "throw the food" + ], + "image_quantity_level": "Medium", + "image": [ + "2WGSN/2WGSN_0.jpeg", + "2WGSN/2WGSN_1.jpeg", + "2WGSN/2WGSN_2.jpeg", + "2WGSN/2WGSN_3.jpeg", + "2WGSN/2WGSN_4.jpeg", + "2WGSN/2WGSN_5.jpeg", + "2WGSN/2WGSN_6.jpeg", + "2WGSN/2WGSN_7.jpeg", + "2WGSN/2WGSN_8.jpeg", + "2WGSN/2WGSN_9.jpeg", + "2WGSN/2WGSN_10.jpeg", + "2WGSN/2WGSN_11.jpeg", + "2WGSN/2WGSN_12.jpeg", + "2WGSN/2WGSN_13.jpeg", + "2WGSN/2WGSN_14.jpeg", + "2WGSN/2WGSN_15.jpeg", + "2WGSN/2WGSN_16.jpeg", + "2WGSN/2WGSN_17.jpeg", + "2WGSN/2WGSN_18.jpeg", + "2WGSN/2WGSN_19.jpeg", + "2WGSN/2WGSN_20.jpeg", + "2WGSN/2WGSN_21.jpeg", + "2WGSN/2WGSN_22.jpeg", + "2WGSN/2WGSN_23.jpeg", + "2WGSN/2WGSN_24.jpeg", + "2WGSN/2WGSN_25.jpeg", + "2WGSN/2WGSN_26.jpeg", + "2WGSN/2WGSN_27.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 30, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the bag.\nB. Put down the shoe.\nC. Wash the dish.\nD. Sit on the sofa/couch.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the bag", + "put down the shoe", + "wash the dish", + "sit on the sofa couch" + ], + "image_quantity_level": "Medium", + "image": [ + "7TA23/7TA23_0.jpeg", + "7TA23/7TA23_1.jpeg", + "7TA23/7TA23_2.jpeg", + "7TA23/7TA23_3.jpeg", + "7TA23/7TA23_4.jpeg", + "7TA23/7TA23_5.jpeg", + "7TA23/7TA23_6.jpeg", + "7TA23/7TA23_7.jpeg", + "7TA23/7TA23_8.jpeg", + "7TA23/7TA23_9.jpeg", + "7TA23/7TA23_10.jpeg", + "7TA23/7TA23_11.jpeg", + "7TA23/7TA23_12.jpeg", + "7TA23/7TA23_13.jpeg", + "7TA23/7TA23_14.jpeg", + "7TA23/7TA23_15.jpeg", + "7TA23/7TA23_16.jpeg", + "7TA23/7TA23_17.jpeg", + "7TA23/7TA23_18.jpeg", + "7TA23/7TA23_19.jpeg", + "7TA23/7TA23_20.jpeg", + "7TA23/7TA23_21.jpeg", + "7TA23/7TA23_22.jpeg", + "7TA23/7TA23_23.jpeg", + "7TA23/7TA23_24.jpeg", + "7TA23/7TA23_25.jpeg", + "7TA23/7TA23_26.jpeg", + "7TA23/7TA23_27.jpeg", + "7TA23/7TA23_28.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 63, + "question": "With the given visuals as your guide, infer the individual's impending action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the book.\nB. Take the sandwich.\nC. Open the closet/cabinet.\nD. Hold the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the book", + "take the sandwich", + "open the closet cabinet", + "hold the food" + ], + "image_quantity_level": "Medium", + "image": [ + "KV99H/KV99H_0.jpeg", + "KV99H/KV99H_1.jpeg", + "KV99H/KV99H_2.jpeg", + "KV99H/KV99H_3.jpeg", + "KV99H/KV99H_4.jpeg", + "KV99H/KV99H_5.jpeg", + "KV99H/KV99H_6.jpeg", + "KV99H/KV99H_7.jpeg", + "KV99H/KV99H_8.jpeg", + "KV99H/KV99H_9.jpeg", + "KV99H/KV99H_10.jpeg", + "KV99H/KV99H_11.jpeg", + "KV99H/KV99H_12.jpeg", + "KV99H/KV99H_13.jpeg", + "KV99H/KV99H_14.jpeg", + "KV99H/KV99H_15.jpeg", + "KV99H/KV99H_16.jpeg", + "KV99H/KV99H_17.jpeg", + "KV99H/KV99H_18.jpeg", + "KV99H/KV99H_19.jpeg", + "KV99H/KV99H_20.jpeg", + "KV99H/KV99H_21.jpeg", + "KV99H/KV99H_22.jpeg", + "KV99H/KV99H_23.jpeg", + "KV99H/KV99H_24.jpeg", + "KV99H/KV99H_25.jpeg", + "KV99H/KV99H_26.jpeg", + "KV99H/KV99H_27.jpeg", + "KV99H/KV99H_28.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 99, + "question": "Infer the next action of the person from the presented images. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Close the laptop.\nB. Put down the book.\nC. Throw the clothes.\nD. Lie on the sofa/couch.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "close the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "close the laptop", + "put down the book", + "throw the clothes", + "lie on the sofa couch" + ], + "image_quantity_level": "Medium", + "image": [ + "JF36Q/JF36Q_0.jpeg", + "JF36Q/JF36Q_1.jpeg", + "JF36Q/JF36Q_2.jpeg", + "JF36Q/JF36Q_3.jpeg", + "JF36Q/JF36Q_4.jpeg", + "JF36Q/JF36Q_5.jpeg", + "JF36Q/JF36Q_6.jpeg", + "JF36Q/JF36Q_7.jpeg", + "JF36Q/JF36Q_8.jpeg", + "JF36Q/JF36Q_9.jpeg", + "JF36Q/JF36Q_10.jpeg", + "JF36Q/JF36Q_11.jpeg", + "JF36Q/JF36Q_12.jpeg", + "JF36Q/JF36Q_13.jpeg", + "JF36Q/JF36Q_14.jpeg", + "JF36Q/JF36Q_15.jpeg", + "JF36Q/JF36Q_16.jpeg", + "JF36Q/JF36Q_17.jpeg", + "JF36Q/JF36Q_18.jpeg", + "JF36Q/JF36Q_19.jpeg", + "JF36Q/JF36Q_20.jpeg", + "JF36Q/JF36Q_21.jpeg", + "JF36Q/JF36Q_22.jpeg", + "JF36Q/JF36Q_23.jpeg", + "JF36Q/JF36Q_24.jpeg", + "JF36Q/JF36Q_25.jpeg", + "JF36Q/JF36Q_26.jpeg", + "JF36Q/JF36Q_27.jpeg", + "JF36Q/JF36Q_28.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 45, + "question": "Analyze the provided visuals and forecast the individual's subsequent move. You must choose your answer from the Choice List.\nWhat will the person do next with the blanket?\nChoice list: \nA. Put down.\nB. Tidy up.\nC. Hold.\nD. Take.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "tidy up", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down", + "tidy up", + "hold", + "take" + ], + "image_quantity_level": "Many", + "image": [ + "TCN06/TCN06_0.jpeg", + "TCN06/TCN06_1.jpeg", + "TCN06/TCN06_2.jpeg", + "TCN06/TCN06_3.jpeg", + "TCN06/TCN06_4.jpeg", + "TCN06/TCN06_5.jpeg", + "TCN06/TCN06_6.jpeg", + "TCN06/TCN06_7.jpeg", + "TCN06/TCN06_8.jpeg", + "TCN06/TCN06_9.jpeg", + "TCN06/TCN06_10.jpeg", + "TCN06/TCN06_11.jpeg", + "TCN06/TCN06_12.jpeg", + "TCN06/TCN06_13.jpeg", + "TCN06/TCN06_14.jpeg", + "TCN06/TCN06_15.jpeg", + "TCN06/TCN06_16.jpeg", + "TCN06/TCN06_17.jpeg", + "TCN06/TCN06_18.jpeg", + "TCN06/TCN06_19.jpeg", + "TCN06/TCN06_20.jpeg", + "TCN06/TCN06_21.jpeg", + "TCN06/TCN06_22.jpeg", + "TCN06/TCN06_23.jpeg", + "TCN06/TCN06_24.jpeg", + "TCN06/TCN06_25.jpeg", + "TCN06/TCN06_26.jpeg", + "TCN06/TCN06_27.jpeg", + "TCN06/TCN06_28.jpeg", + "TCN06/TCN06_29.jpeg", + "TCN06/TCN06_30.jpeg", + "TCN06/TCN06_31.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 55, + "question": "Based on the given images, prediction the person's next action You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the box.\nB. Throw the food.\nC. Tidy up the towel.\nD. Sit on the sofa/couch.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "take the box", + "throw the food", + "tidy up the towel", + "sit on the sofa couch" + ], + "image_quantity_level": "Many", + "image": [ + "D1NT7/D1NT7_0.jpeg", + "D1NT7/D1NT7_1.jpeg", + "D1NT7/D1NT7_2.jpeg", + "D1NT7/D1NT7_3.jpeg", + "D1NT7/D1NT7_4.jpeg", + "D1NT7/D1NT7_5.jpeg", + "D1NT7/D1NT7_6.jpeg", + "D1NT7/D1NT7_7.jpeg", + "D1NT7/D1NT7_8.jpeg", + "D1NT7/D1NT7_9.jpeg", + "D1NT7/D1NT7_10.jpeg", + "D1NT7/D1NT7_11.jpeg", + "D1NT7/D1NT7_12.jpeg", + "D1NT7/D1NT7_13.jpeg", + "D1NT7/D1NT7_14.jpeg", + "D1NT7/D1NT7_15.jpeg", + "D1NT7/D1NT7_16.jpeg", + "D1NT7/D1NT7_17.jpeg", + "D1NT7/D1NT7_18.jpeg", + "D1NT7/D1NT7_19.jpeg", + "D1NT7/D1NT7_20.jpeg", + "D1NT7/D1NT7_21.jpeg", + "D1NT7/D1NT7_22.jpeg", + "D1NT7/D1NT7_23.jpeg", + "D1NT7/D1NT7_24.jpeg", + "D1NT7/D1NT7_25.jpeg", + "D1NT7/D1NT7_26.jpeg", + "D1NT7/D1NT7_27.jpeg", + "D1NT7/D1NT7_28.jpeg", + "D1NT7/D1NT7_29.jpeg", + "D1NT7/D1NT7_30.jpeg", + "D1NT7/D1NT7_31.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 81, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the book.\nB. Sit on the table.\nC. Take the food.\nD. Put down the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "take the book", + "sit on the table", + "take the food", + "put down the box" + ], + "image_quantity_level": "Many", + "image": [ + "VNVRA/VNVRA_0.jpeg", + "VNVRA/VNVRA_1.jpeg", + "VNVRA/VNVRA_2.jpeg", + "VNVRA/VNVRA_3.jpeg", + "VNVRA/VNVRA_4.jpeg", + "VNVRA/VNVRA_5.jpeg", + "VNVRA/VNVRA_6.jpeg", + "VNVRA/VNVRA_7.jpeg", + "VNVRA/VNVRA_8.jpeg", + "VNVRA/VNVRA_9.jpeg", + "VNVRA/VNVRA_10.jpeg", + "VNVRA/VNVRA_11.jpeg", + "VNVRA/VNVRA_12.jpeg", + "VNVRA/VNVRA_13.jpeg", + "VNVRA/VNVRA_14.jpeg", + "VNVRA/VNVRA_15.jpeg", + "VNVRA/VNVRA_16.jpeg", + "VNVRA/VNVRA_17.jpeg", + "VNVRA/VNVRA_18.jpeg", + "VNVRA/VNVRA_19.jpeg", + "VNVRA/VNVRA_20.jpeg", + "VNVRA/VNVRA_21.jpeg", + "VNVRA/VNVRA_22.jpeg", + "VNVRA/VNVRA_23.jpeg", + "VNVRA/VNVRA_24.jpeg", + "VNVRA/VNVRA_25.jpeg", + "VNVRA/VNVRA_26.jpeg", + "VNVRA/VNVRA_27.jpeg", + "VNVRA/VNVRA_28.jpeg", + "VNVRA/VNVRA_29.jpeg", + "VNVRA/VNVRA_30.jpeg", + "VNVRA/VNVRA_31.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 94, + "question": "Using the given pictures as a reference, anticipate the person's upcoming action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the sandwich.\nB. Open the bag.\nC. Take the towel.\nD. Take the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "open the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "take the sandwich", + "open the bag", + "take the towel", + "take the food" + ], + "image_quantity_level": "Many", + "image": [ + "NEM29/NEM29_0.jpeg", + "NEM29/NEM29_1.jpeg", + "NEM29/NEM29_2.jpeg", + "NEM29/NEM29_3.jpeg", + "NEM29/NEM29_4.jpeg", + "NEM29/NEM29_5.jpeg", + "NEM29/NEM29_6.jpeg", + "NEM29/NEM29_7.jpeg", + "NEM29/NEM29_8.jpeg", + "NEM29/NEM29_9.jpeg", + "NEM29/NEM29_10.jpeg", + "NEM29/NEM29_11.jpeg", + "NEM29/NEM29_12.jpeg", + "NEM29/NEM29_13.jpeg", + "NEM29/NEM29_14.jpeg", + "NEM29/NEM29_15.jpeg", + "NEM29/NEM29_16.jpeg", + "NEM29/NEM29_17.jpeg", + "NEM29/NEM29_18.jpeg", + "NEM29/NEM29_19.jpeg", + "NEM29/NEM29_20.jpeg", + "NEM29/NEM29_21.jpeg", + "NEM29/NEM29_22.jpeg", + "NEM29/NEM29_23.jpeg", + "NEM29/NEM29_24.jpeg", + "NEM29/NEM29_25.jpeg", + "NEM29/NEM29_26.jpeg", + "NEM29/NEM29_27.jpeg", + "NEM29/NEM29_28.jpeg", + "NEM29/NEM29_29.jpeg", + "NEM29/NEM29_30.jpeg", + "NEM29/NEM29_31.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 79, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the towel.\nB. Open the laptop.\nC. Wash the clothes.\nD. Sit on the bed.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sit on the bed", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "take the towel", + "open the laptop", + "wash the clothes", + "sit on the bed" + ], + "image_quantity_level": "Medium", + "image": [ + "BE51K/BE51K_0.jpeg", + "BE51K/BE51K_1.jpeg", + "BE51K/BE51K_2.jpeg", + "BE51K/BE51K_3.jpeg", + "BE51K/BE51K_4.jpeg", + "BE51K/BE51K_5.jpeg", + "BE51K/BE51K_6.jpeg", + "BE51K/BE51K_7.jpeg", + "BE51K/BE51K_8.jpeg", + "BE51K/BE51K_9.jpeg", + "BE51K/BE51K_10.jpeg", + "BE51K/BE51K_11.jpeg", + "BE51K/BE51K_12.jpeg", + "BE51K/BE51K_13.jpeg", + "BE51K/BE51K_14.jpeg", + "BE51K/BE51K_15.jpeg", + "BE51K/BE51K_16.jpeg", + "BE51K/BE51K_17.jpeg", + "BE51K/BE51K_18.jpeg", + "BE51K/BE51K_19.jpeg", + "BE51K/BE51K_20.jpeg", + "BE51K/BE51K_21.jpeg", + "BE51K/BE51K_22.jpeg", + "BE51K/BE51K_23.jpeg", + "BE51K/BE51K_24.jpeg", + "BE51K/BE51K_25.jpeg", + "BE51K/BE51K_26.jpeg", + "BE51K/BE51K_27.jpeg", + "BE51K/BE51K_28.jpeg", + "BE51K/BE51K_29.jpeg", + "BE51K/BE51K_30.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 95, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the pillow.\nB. Take the book.\nC. Open the book.\nD. Open the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the pillow", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "take the pillow", + "take the book", + "open the book", + "open the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "3N1I2/3N1I2_0.jpeg", + "3N1I2/3N1I2_1.jpeg", + "3N1I2/3N1I2_2.jpeg", + "3N1I2/3N1I2_3.jpeg", + "3N1I2/3N1I2_4.jpeg", + "3N1I2/3N1I2_5.jpeg", + "3N1I2/3N1I2_6.jpeg", + "3N1I2/3N1I2_7.jpeg", + "3N1I2/3N1I2_8.jpeg", + "3N1I2/3N1I2_9.jpeg", + "3N1I2/3N1I2_10.jpeg", + "3N1I2/3N1I2_11.jpeg", + "3N1I2/3N1I2_12.jpeg", + "3N1I2/3N1I2_13.jpeg", + "3N1I2/3N1I2_14.jpeg", + "3N1I2/3N1I2_15.jpeg", + "3N1I2/3N1I2_16.jpeg", + "3N1I2/3N1I2_17.jpeg", + "3N1I2/3N1I2_18.jpeg", + "3N1I2/3N1I2_19.jpeg", + "3N1I2/3N1I2_20.jpeg", + "3N1I2/3N1I2_21.jpeg", + "3N1I2/3N1I2_22.jpeg", + "3N1I2/3N1I2_23.jpeg", + "3N1I2/3N1I2_24.jpeg", + "3N1I2/3N1I2_25.jpeg", + "3N1I2/3N1I2_26.jpeg", + "3N1I2/3N1I2_27.jpeg", + "3N1I2/3N1I2_28.jpeg", + "3N1I2/3N1I2_29.jpeg", + "3N1I2/3N1I2_30.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 104, + "question": "Evaluate the given images and infer the individual's next step. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Sit on the sofa/couch.\nB. Take the phone/camera.\nC. Put down the box.\nD. Tidy up the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "sit on the sofa couch", + "take the phone camera", + "put down the box", + "tidy up the clothes" + ], + "image_quantity_level": "Medium", + "image": [ + "65UVU/65UVU_0.jpeg", + "65UVU/65UVU_1.jpeg", + "65UVU/65UVU_2.jpeg", + "65UVU/65UVU_3.jpeg", + "65UVU/65UVU_4.jpeg", + "65UVU/65UVU_5.jpeg", + "65UVU/65UVU_6.jpeg", + "65UVU/65UVU_7.jpeg", + "65UVU/65UVU_8.jpeg", + "65UVU/65UVU_9.jpeg", + "65UVU/65UVU_10.jpeg", + "65UVU/65UVU_11.jpeg", + "65UVU/65UVU_12.jpeg", + "65UVU/65UVU_13.jpeg", + "65UVU/65UVU_14.jpeg", + "65UVU/65UVU_15.jpeg", + "65UVU/65UVU_16.jpeg", + "65UVU/65UVU_17.jpeg", + "65UVU/65UVU_18.jpeg", + "65UVU/65UVU_19.jpeg", + "65UVU/65UVU_20.jpeg", + "65UVU/65UVU_21.jpeg", + "65UVU/65UVU_22.jpeg", + "65UVU/65UVU_23.jpeg", + "65UVU/65UVU_24.jpeg", + "65UVU/65UVU_25.jpeg", + "65UVU/65UVU_26.jpeg", + "65UVU/65UVU_27.jpeg", + "65UVU/65UVU_28.jpeg", + "65UVU/65UVU_29.jpeg", + "65UVU/65UVU_30.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 106, + "question": "Infer the next action of the person from the presented images. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the sandwich.\nB. Open the refrigerator.\nC. Put down the phone/camera.\nD. Wash the window.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the sandwich", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "take the sandwich", + "open the refrigerator", + "put down the phone camera", + "wash the window" + ], + "image_quantity_level": "Medium", + "image": [ + "W0XPU/W0XPU_0.jpeg", + "W0XPU/W0XPU_1.jpeg", + "W0XPU/W0XPU_2.jpeg", + "W0XPU/W0XPU_3.jpeg", + "W0XPU/W0XPU_4.jpeg", + "W0XPU/W0XPU_5.jpeg", + "W0XPU/W0XPU_6.jpeg", + "W0XPU/W0XPU_7.jpeg", + "W0XPU/W0XPU_8.jpeg", + "W0XPU/W0XPU_9.jpeg", + "W0XPU/W0XPU_10.jpeg", + "W0XPU/W0XPU_11.jpeg", + "W0XPU/W0XPU_12.jpeg", + "W0XPU/W0XPU_13.jpeg", + "W0XPU/W0XPU_14.jpeg", + "W0XPU/W0XPU_15.jpeg", + "W0XPU/W0XPU_16.jpeg", + "W0XPU/W0XPU_17.jpeg", + "W0XPU/W0XPU_18.jpeg", + "W0XPU/W0XPU_19.jpeg", + "W0XPU/W0XPU_20.jpeg", + "W0XPU/W0XPU_21.jpeg", + "W0XPU/W0XPU_22.jpeg", + "W0XPU/W0XPU_23.jpeg", + "W0XPU/W0XPU_24.jpeg", + "W0XPU/W0XPU_25.jpeg", + "W0XPU/W0XPU_26.jpeg", + "W0XPU/W0XPU_27.jpeg", + "W0XPU/W0XPU_28.jpeg", + "W0XPU/W0XPU_29.jpeg", + "W0XPU/W0XPU_30.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 108, + "question": "Study the provided images and predict the individual's forthcoming behavior. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the shoe.\nB. Take the food.\nC. Close the door.\nD. Put down the cup/glass/bottle.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the food", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the shoe", + "take the food", + "close the door", + "put down the cup glass bottle" + ], + "image_quantity_level": "Medium", + "image": [ + "0BZAD/0BZAD_0.jpeg", + "0BZAD/0BZAD_1.jpeg", + "0BZAD/0BZAD_2.jpeg", + "0BZAD/0BZAD_3.jpeg", + "0BZAD/0BZAD_4.jpeg", + "0BZAD/0BZAD_5.jpeg", + "0BZAD/0BZAD_6.jpeg", + "0BZAD/0BZAD_7.jpeg", + "0BZAD/0BZAD_8.jpeg", + "0BZAD/0BZAD_9.jpeg", + "0BZAD/0BZAD_10.jpeg", + "0BZAD/0BZAD_11.jpeg", + "0BZAD/0BZAD_12.jpeg", + "0BZAD/0BZAD_13.jpeg", + "0BZAD/0BZAD_14.jpeg", + "0BZAD/0BZAD_15.jpeg", + "0BZAD/0BZAD_16.jpeg", + "0BZAD/0BZAD_17.jpeg", + "0BZAD/0BZAD_18.jpeg", + "0BZAD/0BZAD_19.jpeg", + "0BZAD/0BZAD_20.jpeg", + "0BZAD/0BZAD_21.jpeg", + "0BZAD/0BZAD_22.jpeg", + "0BZAD/0BZAD_23.jpeg", + "0BZAD/0BZAD_24.jpeg", + "0BZAD/0BZAD_25.jpeg", + "0BZAD/0BZAD_26.jpeg", + "0BZAD/0BZAD_27.jpeg", + "0BZAD/0BZAD_28.jpeg", + "0BZAD/0BZAD_29.jpeg", + "0BZAD/0BZAD_30.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 116, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the bag.\nB. Put down the phone/camera.\nC. Put down the food.\nD. Close the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the phone camera", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the bag", + "put down the phone camera", + "put down the food", + "close the book" + ], + "image_quantity_level": "Medium", + "image": [ + "C6LW1/C6LW1_0.jpeg", + "C6LW1/C6LW1_1.jpeg", + "C6LW1/C6LW1_2.jpeg", + "C6LW1/C6LW1_3.jpeg", + "C6LW1/C6LW1_4.jpeg", + "C6LW1/C6LW1_5.jpeg", + "C6LW1/C6LW1_6.jpeg", + "C6LW1/C6LW1_7.jpeg", + "C6LW1/C6LW1_8.jpeg", + "C6LW1/C6LW1_9.jpeg", + "C6LW1/C6LW1_10.jpeg", + "C6LW1/C6LW1_11.jpeg", + "C6LW1/C6LW1_12.jpeg", + "C6LW1/C6LW1_13.jpeg", + "C6LW1/C6LW1_14.jpeg", + "C6LW1/C6LW1_15.jpeg", + "C6LW1/C6LW1_16.jpeg", + "C6LW1/C6LW1_17.jpeg", + "C6LW1/C6LW1_18.jpeg", + "C6LW1/C6LW1_19.jpeg", + "C6LW1/C6LW1_20.jpeg", + "C6LW1/C6LW1_21.jpeg", + "C6LW1/C6LW1_22.jpeg", + "C6LW1/C6LW1_23.jpeg", + "C6LW1/C6LW1_24.jpeg", + "C6LW1/C6LW1_25.jpeg", + "C6LW1/C6LW1_26.jpeg", + "C6LW1/C6LW1_27.jpeg", + "C6LW1/C6LW1_28.jpeg", + "C6LW1/C6LW1_29.jpeg", + "C6LW1/C6LW1_30.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 122, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Sit on the bed.\nB. Sit on the sofa/couch.\nC. Throw the blanket.\nD. Take the paper/notebook.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "sit on the bed", + "sit on the sofa couch", + "throw the blanket", + "take the paper notebook" + ], + "image_quantity_level": "Medium", + "image": [ + "NV4FC/NV4FC_0.jpeg", + "NV4FC/NV4FC_1.jpeg", + "NV4FC/NV4FC_2.jpeg", + "NV4FC/NV4FC_3.jpeg", + "NV4FC/NV4FC_4.jpeg", + "NV4FC/NV4FC_5.jpeg", + "NV4FC/NV4FC_6.jpeg", + "NV4FC/NV4FC_7.jpeg", + "NV4FC/NV4FC_8.jpeg", + "NV4FC/NV4FC_9.jpeg", + "NV4FC/NV4FC_10.jpeg", + "NV4FC/NV4FC_11.jpeg", + "NV4FC/NV4FC_12.jpeg", + "NV4FC/NV4FC_13.jpeg", + "NV4FC/NV4FC_14.jpeg", + "NV4FC/NV4FC_15.jpeg", + "NV4FC/NV4FC_16.jpeg", + "NV4FC/NV4FC_17.jpeg", + "NV4FC/NV4FC_18.jpeg", + "NV4FC/NV4FC_19.jpeg", + "NV4FC/NV4FC_20.jpeg", + "NV4FC/NV4FC_21.jpeg", + "NV4FC/NV4FC_22.jpeg", + "NV4FC/NV4FC_23.jpeg", + "NV4FC/NV4FC_24.jpeg", + "NV4FC/NV4FC_25.jpeg", + "NV4FC/NV4FC_26.jpeg", + "NV4FC/NV4FC_27.jpeg", + "NV4FC/NV4FC_28.jpeg", + "NV4FC/NV4FC_29.jpeg", + "NV4FC/NV4FC_30.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 76, + "question": "Study the provided images and predict the individual's forthcoming behavior. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the broom.\nB. Put down the blanket.\nC. Close the refrigerator.\nD. Put down the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "close the refrigerator", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down the broom", + "put down the blanket", + "close the refrigerator", + "put down the bag" + ], + "image_quantity_level": "Many", + "image": [ + "ZZ89F/ZZ89F_0.jpeg", + "ZZ89F/ZZ89F_1.jpeg", + "ZZ89F/ZZ89F_2.jpeg", + "ZZ89F/ZZ89F_3.jpeg", + "ZZ89F/ZZ89F_4.jpeg", + "ZZ89F/ZZ89F_5.jpeg", + "ZZ89F/ZZ89F_6.jpeg", + "ZZ89F/ZZ89F_7.jpeg", + "ZZ89F/ZZ89F_8.jpeg", + "ZZ89F/ZZ89F_9.jpeg", + "ZZ89F/ZZ89F_10.jpeg", + "ZZ89F/ZZ89F_11.jpeg", + "ZZ89F/ZZ89F_12.jpeg", + "ZZ89F/ZZ89F_13.jpeg", + "ZZ89F/ZZ89F_14.jpeg", + "ZZ89F/ZZ89F_15.jpeg", + "ZZ89F/ZZ89F_16.jpeg", + "ZZ89F/ZZ89F_17.jpeg", + "ZZ89F/ZZ89F_18.jpeg", + "ZZ89F/ZZ89F_19.jpeg", + "ZZ89F/ZZ89F_20.jpeg", + "ZZ89F/ZZ89F_21.jpeg", + "ZZ89F/ZZ89F_22.jpeg", + "ZZ89F/ZZ89F_23.jpeg", + "ZZ89F/ZZ89F_24.jpeg", + "ZZ89F/ZZ89F_25.jpeg", + "ZZ89F/ZZ89F_26.jpeg", + "ZZ89F/ZZ89F_27.jpeg", + "ZZ89F/ZZ89F_28.jpeg", + "ZZ89F/ZZ89F_29.jpeg", + "ZZ89F/ZZ89F_30.jpeg", + "ZZ89F/ZZ89F_31.jpeg", + "ZZ89F/ZZ89F_32.jpeg", + "ZZ89F/ZZ89F_33.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 84, + "question": "Infer the next action of the person from the presented images. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the cup/glass/bottle.\nB. Open the closet/cabinet.\nC. Put down the paper/notebook.\nD. Put down the phone/camera.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down the cup glass bottle", + "open the closet cabinet", + "put down the paper notebook", + "put down the phone camera" + ], + "image_quantity_level": "Many", + "image": [ + "727IZ/727IZ_0.jpeg", + "727IZ/727IZ_1.jpeg", + "727IZ/727IZ_2.jpeg", + "727IZ/727IZ_3.jpeg", + "727IZ/727IZ_4.jpeg", + "727IZ/727IZ_5.jpeg", + "727IZ/727IZ_6.jpeg", + "727IZ/727IZ_7.jpeg", + "727IZ/727IZ_8.jpeg", + "727IZ/727IZ_9.jpeg", + "727IZ/727IZ_10.jpeg", + "727IZ/727IZ_11.jpeg", + "727IZ/727IZ_12.jpeg", + "727IZ/727IZ_13.jpeg", + "727IZ/727IZ_14.jpeg", + "727IZ/727IZ_15.jpeg", + "727IZ/727IZ_16.jpeg", + "727IZ/727IZ_17.jpeg", + "727IZ/727IZ_18.jpeg", + "727IZ/727IZ_19.jpeg", + "727IZ/727IZ_20.jpeg", + "727IZ/727IZ_21.jpeg", + "727IZ/727IZ_22.jpeg", + "727IZ/727IZ_23.jpeg", + "727IZ/727IZ_24.jpeg", + "727IZ/727IZ_25.jpeg", + "727IZ/727IZ_26.jpeg", + "727IZ/727IZ_27.jpeg", + "727IZ/727IZ_28.jpeg", + "727IZ/727IZ_29.jpeg", + "727IZ/727IZ_30.jpeg", + "727IZ/727IZ_31.jpeg", + "727IZ/727IZ_32.jpeg", + "727IZ/727IZ_33.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 113, + "question": "Using the given pictures as a reference, anticipate the person's upcoming action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Throw the book.\nB. Wash the table.\nC. Throw the pillow.\nD. Take the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "throw the pillow", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "throw the book", + "wash the table", + "throw the pillow", + "take the food" + ], + "image_quantity_level": "Many", + "image": [ + "OUAIS/OUAIS_0.jpeg", + "OUAIS/OUAIS_1.jpeg", + "OUAIS/OUAIS_2.jpeg", + "OUAIS/OUAIS_3.jpeg", + "OUAIS/OUAIS_4.jpeg", + "OUAIS/OUAIS_5.jpeg", + "OUAIS/OUAIS_6.jpeg", + "OUAIS/OUAIS_7.jpeg", + "OUAIS/OUAIS_8.jpeg", + "OUAIS/OUAIS_9.jpeg", + "OUAIS/OUAIS_10.jpeg", + "OUAIS/OUAIS_11.jpeg", + "OUAIS/OUAIS_12.jpeg", + "OUAIS/OUAIS_13.jpeg", + "OUAIS/OUAIS_14.jpeg", + "OUAIS/OUAIS_15.jpeg", + "OUAIS/OUAIS_16.jpeg", + "OUAIS/OUAIS_17.jpeg", + "OUAIS/OUAIS_18.jpeg", + "OUAIS/OUAIS_19.jpeg", + "OUAIS/OUAIS_20.jpeg", + "OUAIS/OUAIS_21.jpeg", + "OUAIS/OUAIS_22.jpeg", + "OUAIS/OUAIS_23.jpeg", + "OUAIS/OUAIS_24.jpeg", + "OUAIS/OUAIS_25.jpeg", + "OUAIS/OUAIS_26.jpeg", + "OUAIS/OUAIS_27.jpeg", + "OUAIS/OUAIS_28.jpeg", + "OUAIS/OUAIS_29.jpeg", + "OUAIS/OUAIS_30.jpeg", + "OUAIS/OUAIS_31.jpeg", + "OUAIS/OUAIS_32.jpeg", + "OUAIS/OUAIS_33.jpeg", + "OUAIS/OUAIS_34.jpeg", + "OUAIS/OUAIS_35.jpeg", + "OUAIS/OUAIS_36.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 127, + "question": "Infer the next action of the person from the presented images. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the book.\nB. Take the clothes.\nC. Eat the medicine.\nD. Close the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the book", + "take the clothes", + "eat the medicine", + "close the closet cabinet" + ], + "image_quantity_level": "Many", + "image": [ + "V0ZD9/V0ZD9_0.jpeg", + "V0ZD9/V0ZD9_1.jpeg", + "V0ZD9/V0ZD9_2.jpeg", + "V0ZD9/V0ZD9_3.jpeg", + "V0ZD9/V0ZD9_4.jpeg", + "V0ZD9/V0ZD9_5.jpeg", + "V0ZD9/V0ZD9_6.jpeg", + "V0ZD9/V0ZD9_7.jpeg", + "V0ZD9/V0ZD9_8.jpeg", + "V0ZD9/V0ZD9_9.jpeg", + "V0ZD9/V0ZD9_10.jpeg", + "V0ZD9/V0ZD9_11.jpeg", + "V0ZD9/V0ZD9_12.jpeg", + "V0ZD9/V0ZD9_13.jpeg", + "V0ZD9/V0ZD9_14.jpeg", + "V0ZD9/V0ZD9_15.jpeg", + "V0ZD9/V0ZD9_16.jpeg", + "V0ZD9/V0ZD9_17.jpeg", + "V0ZD9/V0ZD9_18.jpeg", + "V0ZD9/V0ZD9_19.jpeg", + "V0ZD9/V0ZD9_20.jpeg", + "V0ZD9/V0ZD9_21.jpeg", + "V0ZD9/V0ZD9_22.jpeg", + "V0ZD9/V0ZD9_23.jpeg", + "V0ZD9/V0ZD9_24.jpeg", + "V0ZD9/V0ZD9_25.jpeg", + "V0ZD9/V0ZD9_26.jpeg", + "V0ZD9/V0ZD9_27.jpeg", + "V0ZD9/V0ZD9_28.jpeg", + "V0ZD9/V0ZD9_29.jpeg", + "V0ZD9/V0ZD9_30.jpeg", + "V0ZD9/V0ZD9_31.jpeg", + "V0ZD9/V0ZD9_32.jpeg", + "V0ZD9/V0ZD9_33.jpeg", + "V0ZD9/V0ZD9_34.jpeg", + "V0ZD9/V0ZD9_35.jpeg", + "V0ZD9/V0ZD9_36.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 72, + "question": "Analyze the provided visuals and forecast the individual's subsequent move. You must choose your answer from the Choice List.\nWhat will the person do next with the food?\nChoice list: \nA. Open.\nB. Put down.\nC. Throw.\nD. Take.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "open", + "put down", + "throw", + "take" + ], + "image_quantity_level": "Many", + "image": [ + "BGQIF/BGQIF_0.jpeg", + "BGQIF/BGQIF_1.jpeg", + "BGQIF/BGQIF_2.jpeg", + "BGQIF/BGQIF_3.jpeg", + "BGQIF/BGQIF_4.jpeg", + "BGQIF/BGQIF_5.jpeg", + "BGQIF/BGQIF_6.jpeg", + "BGQIF/BGQIF_7.jpeg", + "BGQIF/BGQIF_8.jpeg", + "BGQIF/BGQIF_9.jpeg", + "BGQIF/BGQIF_10.jpeg", + "BGQIF/BGQIF_11.jpeg", + "BGQIF/BGQIF_12.jpeg", + "BGQIF/BGQIF_13.jpeg", + "BGQIF/BGQIF_14.jpeg", + "BGQIF/BGQIF_15.jpeg", + "BGQIF/BGQIF_16.jpeg", + "BGQIF/BGQIF_17.jpeg", + "BGQIF/BGQIF_18.jpeg", + "BGQIF/BGQIF_19.jpeg", + "BGQIF/BGQIF_20.jpeg", + "BGQIF/BGQIF_21.jpeg", + "BGQIF/BGQIF_22.jpeg", + "BGQIF/BGQIF_23.jpeg", + "BGQIF/BGQIF_24.jpeg", + "BGQIF/BGQIF_25.jpeg", + "BGQIF/BGQIF_26.jpeg", + "BGQIF/BGQIF_27.jpeg", + "BGQIF/BGQIF_28.jpeg", + "BGQIF/BGQIF_29.jpeg", + "BGQIF/BGQIF_30.jpeg", + "BGQIF/BGQIF_31.jpeg", + "BGQIF/BGQIF_32.jpeg", + "BGQIF/BGQIF_33.jpeg", + "BGQIF/BGQIF_34.jpeg", + "BGQIF/BGQIF_35.jpeg", + "BGQIF/BGQIF_36.jpeg", + "BGQIF/BGQIF_37.jpeg", + "BGQIF/BGQIF_38.jpeg", + "BGQIF/BGQIF_39.jpeg", + "BGQIF/BGQIF_40.jpeg", + "BGQIF/BGQIF_41.jpeg", + "BGQIF/BGQIF_42.jpeg", + "BGQIF/BGQIF_43.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 48, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Close the window.\nB. Take the towel.\nC. Take the shoe.\nD. Take the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "close the window", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "close the window", + "take the towel", + "take the shoe", + "take the food" + ], + "image_quantity_level": "Medium", + "image": [ + "B69CF/B69CF_0.jpeg", + "B69CF/B69CF_1.jpeg", + "B69CF/B69CF_2.jpeg", + "B69CF/B69CF_3.jpeg", + "B69CF/B69CF_4.jpeg", + "B69CF/B69CF_5.jpeg", + "B69CF/B69CF_6.jpeg", + "B69CF/B69CF_7.jpeg", + "B69CF/B69CF_8.jpeg", + "B69CF/B69CF_9.jpeg", + "B69CF/B69CF_10.jpeg", + "B69CF/B69CF_11.jpeg", + "B69CF/B69CF_12.jpeg", + "B69CF/B69CF_13.jpeg", + "B69CF/B69CF_14.jpeg", + "B69CF/B69CF_15.jpeg", + "B69CF/B69CF_16.jpeg", + "B69CF/B69CF_17.jpeg", + "B69CF/B69CF_18.jpeg", + "B69CF/B69CF_19.jpeg", + "B69CF/B69CF_20.jpeg", + "B69CF/B69CF_21.jpeg", + "B69CF/B69CF_22.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 73, + "question": "Study the provided images and predict the individual's forthcoming behavior. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the pillow.\nB. Tidy up the clothes.\nC. Take the box.\nD. Put down the dish.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the pillow", + "tidy up the clothes", + "take the box", + "put down the dish" + ], + "image_quantity_level": "Medium", + "image": [ + "5ZDNU/5ZDNU_0.jpeg", + "5ZDNU/5ZDNU_1.jpeg", + "5ZDNU/5ZDNU_2.jpeg", + "5ZDNU/5ZDNU_3.jpeg", + "5ZDNU/5ZDNU_4.jpeg", + "5ZDNU/5ZDNU_5.jpeg", + "5ZDNU/5ZDNU_6.jpeg", + "5ZDNU/5ZDNU_7.jpeg", + "5ZDNU/5ZDNU_8.jpeg", + "5ZDNU/5ZDNU_9.jpeg", + "5ZDNU/5ZDNU_10.jpeg", + "5ZDNU/5ZDNU_11.jpeg", + "5ZDNU/5ZDNU_12.jpeg", + "5ZDNU/5ZDNU_13.jpeg", + "5ZDNU/5ZDNU_14.jpeg", + "5ZDNU/5ZDNU_15.jpeg", + "5ZDNU/5ZDNU_16.jpeg", + "5ZDNU/5ZDNU_17.jpeg", + "5ZDNU/5ZDNU_18.jpeg", + "5ZDNU/5ZDNU_19.jpeg", + "5ZDNU/5ZDNU_20.jpeg", + "5ZDNU/5ZDNU_21.jpeg", + "5ZDNU/5ZDNU_22.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 120, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next with the bed?\nChoice list: \nA. Put down.\nB. Throw.\nC. Sit on.\nD. Lie on.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sit on", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down", + "throw", + "sit on", + "lie on" + ], + "image_quantity_level": "Medium", + "image": [ + "BI31D/BI31D_0.jpeg", + "BI31D/BI31D_1.jpeg", + "BI31D/BI31D_2.jpeg", + "BI31D/BI31D_3.jpeg", + "BI31D/BI31D_4.jpeg", + "BI31D/BI31D_5.jpeg", + "BI31D/BI31D_6.jpeg", + "BI31D/BI31D_7.jpeg", + "BI31D/BI31D_8.jpeg", + "BI31D/BI31D_9.jpeg", + "BI31D/BI31D_10.jpeg", + "BI31D/BI31D_11.jpeg", + "BI31D/BI31D_12.jpeg", + "BI31D/BI31D_13.jpeg", + "BI31D/BI31D_14.jpeg", + "BI31D/BI31D_15.jpeg", + "BI31D/BI31D_16.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 54, + "question": "Using the given pictures as a reference, anticipate the person's upcoming action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the laptop.\nB. Put down the dish.\nC. Sit on the floor.\nD. Take the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the laptop", + "put down the dish", + "sit on the floor", + "take the box" + ], + "image_quantity_level": "Many", + "image": [ + "P4HXN/P4HXN_0.jpeg", + "P4HXN/P4HXN_1.jpeg", + "P4HXN/P4HXN_2.jpeg", + "P4HXN/P4HXN_3.jpeg", + "P4HXN/P4HXN_4.jpeg", + "P4HXN/P4HXN_5.jpeg", + "P4HXN/P4HXN_6.jpeg", + "P4HXN/P4HXN_7.jpeg", + "P4HXN/P4HXN_8.jpeg", + "P4HXN/P4HXN_9.jpeg", + "P4HXN/P4HXN_10.jpeg", + "P4HXN/P4HXN_11.jpeg", + "P4HXN/P4HXN_12.jpeg", + "P4HXN/P4HXN_13.jpeg", + "P4HXN/P4HXN_14.jpeg", + "P4HXN/P4HXN_15.jpeg", + "P4HXN/P4HXN_16.jpeg", + "P4HXN/P4HXN_17.jpeg", + "P4HXN/P4HXN_18.jpeg", + "P4HXN/P4HXN_19.jpeg", + "P4HXN/P4HXN_20.jpeg", + "P4HXN/P4HXN_21.jpeg", + "P4HXN/P4HXN_22.jpeg", + "P4HXN/P4HXN_23.jpeg", + "P4HXN/P4HXN_24.jpeg", + "P4HXN/P4HXN_25.jpeg", + "P4HXN/P4HXN_26.jpeg", + "P4HXN/P4HXN_27.jpeg", + "P4HXN/P4HXN_28.jpeg", + "P4HXN/P4HXN_29.jpeg", + "P4HXN/P4HXN_30.jpeg", + "P4HXN/P4HXN_31.jpeg", + "P4HXN/P4HXN_32.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 135, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the blanket.\nB. Take the blanket.\nC. Wash the clothes.\nD. Put down the sandwich.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the sandwich", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "put down the blanket", + "take the blanket", + "wash the clothes", + "put down the sandwich" + ], + "image_quantity_level": "Many", + "image": [ + "R0207/R0207_0.jpeg", + "R0207/R0207_1.jpeg", + "R0207/R0207_2.jpeg", + "R0207/R0207_3.jpeg", + "R0207/R0207_4.jpeg", + "R0207/R0207_5.jpeg", + "R0207/R0207_6.jpeg", + "R0207/R0207_7.jpeg", + "R0207/R0207_8.jpeg", + "R0207/R0207_9.jpeg", + "R0207/R0207_10.jpeg", + "R0207/R0207_11.jpeg", + "R0207/R0207_12.jpeg", + "R0207/R0207_13.jpeg", + "R0207/R0207_14.jpeg", + "R0207/R0207_15.jpeg", + "R0207/R0207_16.jpeg", + "R0207/R0207_17.jpeg", + "R0207/R0207_18.jpeg", + "R0207/R0207_19.jpeg", + "R0207/R0207_20.jpeg", + "R0207/R0207_21.jpeg", + "R0207/R0207_22.jpeg", + "R0207/R0207_23.jpeg", + "R0207/R0207_24.jpeg", + "R0207/R0207_25.jpeg", + "R0207/R0207_26.jpeg", + "R0207/R0207_27.jpeg", + "R0207/R0207_28.jpeg", + "R0207/R0207_29.jpeg", + "R0207/R0207_30.jpeg", + "R0207/R0207_31.jpeg", + "R0207/R0207_32.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 19, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Wash the window.\nB. Throw the clothes.\nC. Open the box.\nD. Put down the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "open the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "wash the window", + "throw the clothes", + "open the box", + "put down the book" + ], + "image_quantity_level": "Medium", + "image": [ + "TVCQF/TVCQF_0.jpeg", + "TVCQF/TVCQF_1.jpeg", + "TVCQF/TVCQF_2.jpeg", + "TVCQF/TVCQF_3.jpeg", + "TVCQF/TVCQF_4.jpeg", + "TVCQF/TVCQF_5.jpeg", + "TVCQF/TVCQF_6.jpeg", + "TVCQF/TVCQF_7.jpeg", + "TVCQF/TVCQF_8.jpeg", + "TVCQF/TVCQF_9.jpeg", + "TVCQF/TVCQF_10.jpeg", + "TVCQF/TVCQF_11.jpeg", + "TVCQF/TVCQF_12.jpeg", + "TVCQF/TVCQF_13.jpeg", + "TVCQF/TVCQF_14.jpeg", + "TVCQF/TVCQF_15.jpeg", + "TVCQF/TVCQF_16.jpeg", + "TVCQF/TVCQF_17.jpeg", + "TVCQF/TVCQF_18.jpeg", + "TVCQF/TVCQF_19.jpeg", + "TVCQF/TVCQF_20.jpeg", + "TVCQF/TVCQF_21.jpeg", + "TVCQF/TVCQF_22.jpeg", + "TVCQF/TVCQF_23.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 121, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the dish.\nB. Take the clothes.\nC. Lie on the sofa/couch.\nD. Put down the pillow.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the dish", + "take the clothes", + "lie on the sofa couch", + "put down the pillow" + ], + "image_quantity_level": "Many", + "image": [ + "R971Z/R971Z_0.jpeg", + "R971Z/R971Z_1.jpeg", + "R971Z/R971Z_2.jpeg", + "R971Z/R971Z_3.jpeg", + "R971Z/R971Z_4.jpeg", + "R971Z/R971Z_5.jpeg", + "R971Z/R971Z_6.jpeg", + "R971Z/R971Z_7.jpeg", + "R971Z/R971Z_8.jpeg", + "R971Z/R971Z_9.jpeg", + "R971Z/R971Z_10.jpeg", + "R971Z/R971Z_11.jpeg", + "R971Z/R971Z_12.jpeg", + "R971Z/R971Z_13.jpeg", + "R971Z/R971Z_14.jpeg", + "R971Z/R971Z_15.jpeg", + "R971Z/R971Z_16.jpeg", + "R971Z/R971Z_17.jpeg", + "R971Z/R971Z_18.jpeg", + "R971Z/R971Z_19.jpeg", + "R971Z/R971Z_20.jpeg", + "R971Z/R971Z_21.jpeg", + "R971Z/R971Z_22.jpeg", + "R971Z/R971Z_23.jpeg", + "R971Z/R971Z_24.jpeg", + "R971Z/R971Z_25.jpeg", + "R971Z/R971Z_26.jpeg", + "R971Z/R971Z_27.jpeg", + "R971Z/R971Z_28.jpeg", + "R971Z/R971Z_29.jpeg", + "R971Z/R971Z_30.jpeg", + "R971Z/R971Z_31.jpeg", + "R971Z/R971Z_32.jpeg", + "R971Z/R971Z_33.jpeg", + "R971Z/R971Z_34.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 74, + "question": "Analyze the provided visuals and forecast the individual's subsequent move. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Eat the sandwich.\nB. Take the box.\nC. Put down the blanket.\nD. Put down the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "eat the sandwich", + "take the box", + "put down the blanket", + "put down the bag" + ], + "image_quantity_level": "Medium", + "image": [ + "3ND23/3ND23_0.jpeg", + "3ND23/3ND23_1.jpeg", + "3ND23/3ND23_2.jpeg", + "3ND23/3ND23_3.jpeg", + "3ND23/3ND23_4.jpeg", + "3ND23/3ND23_5.jpeg", + "3ND23/3ND23_6.jpeg", + "3ND23/3ND23_7.jpeg", + "3ND23/3ND23_8.jpeg", + "3ND23/3ND23_9.jpeg", + "3ND23/3ND23_10.jpeg", + "3ND23/3ND23_11.jpeg", + "3ND23/3ND23_12.jpeg", + "3ND23/3ND23_13.jpeg", + "3ND23/3ND23_14.jpeg", + "3ND23/3ND23_15.jpeg", + "3ND23/3ND23_16.jpeg", + "3ND23/3ND23_17.jpeg", + "3ND23/3ND23_18.jpeg", + "3ND23/3ND23_19.jpeg", + "3ND23/3ND23_20.jpeg", + "3ND23/3ND23_21.jpeg", + "3ND23/3ND23_22.jpeg", + "3ND23/3ND23_23.jpeg", + "3ND23/3ND23_24.jpeg", + "3ND23/3ND23_25.jpeg", + "3ND23/3ND23_26.jpeg", + "3ND23/3ND23_27.jpeg", + "3ND23/3ND23_28.jpeg", + "3ND23/3ND23_29.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 77, + "question": "With the given visuals as your guide, infer the individual's impending action. You must choose your answer from the Choice List.\nWhat will the person do next with the table?\nChoice list: \nA. Take.\nB. Tidy up.\nC. Sit at.\nD. Wash.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sit at", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "take", + "tidy up", + "sit at", + "wash" + ], + "image_quantity_level": "Medium", + "image": [ + "XLR4B/XLR4B_0.jpeg", + "XLR4B/XLR4B_1.jpeg", + "XLR4B/XLR4B_2.jpeg", + "XLR4B/XLR4B_3.jpeg", + "XLR4B/XLR4B_4.jpeg", + "XLR4B/XLR4B_5.jpeg", + "XLR4B/XLR4B_6.jpeg", + "XLR4B/XLR4B_7.jpeg", + "XLR4B/XLR4B_8.jpeg", + "XLR4B/XLR4B_9.jpeg", + "XLR4B/XLR4B_10.jpeg", + "XLR4B/XLR4B_11.jpeg", + "XLR4B/XLR4B_12.jpeg", + "XLR4B/XLR4B_13.jpeg", + "XLR4B/XLR4B_14.jpeg", + "XLR4B/XLR4B_15.jpeg", + "XLR4B/XLR4B_16.jpeg", + "XLR4B/XLR4B_17.jpeg", + "XLR4B/XLR4B_18.jpeg", + "XLR4B/XLR4B_19.jpeg", + "XLR4B/XLR4B_20.jpeg", + "XLR4B/XLR4B_21.jpeg", + "XLR4B/XLR4B_22.jpeg", + "XLR4B/XLR4B_23.jpeg", + "XLR4B/XLR4B_24.jpeg", + "XLR4B/XLR4B_25.jpeg", + "XLR4B/XLR4B_26.jpeg", + "XLR4B/XLR4B_27.jpeg", + "XLR4B/XLR4B_28.jpeg", + "XLR4B/XLR4B_29.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 80, + "question": "Analyze the provided visuals and forecast the individual's subsequent move. You must choose your answer from the Choice List.\nWhat will the person do next with the sofa/couch?\nChoice list: \nA. Put down.\nB. Lie on.\nC. Throw.\nD. Sit on.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sit on", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "put down", + "lie on", + "throw", + "sit on" + ], + "image_quantity_level": "Medium", + "image": [ + "9OMY1/9OMY1_0.jpeg", + "9OMY1/9OMY1_1.jpeg", + "9OMY1/9OMY1_2.jpeg", + "9OMY1/9OMY1_3.jpeg", + "9OMY1/9OMY1_4.jpeg", + "9OMY1/9OMY1_5.jpeg", + "9OMY1/9OMY1_6.jpeg", + "9OMY1/9OMY1_7.jpeg", + "9OMY1/9OMY1_8.jpeg", + "9OMY1/9OMY1_9.jpeg", + "9OMY1/9OMY1_10.jpeg", + "9OMY1/9OMY1_11.jpeg", + "9OMY1/9OMY1_12.jpeg", + "9OMY1/9OMY1_13.jpeg", + "9OMY1/9OMY1_14.jpeg", + "9OMY1/9OMY1_15.jpeg", + "9OMY1/9OMY1_16.jpeg", + "9OMY1/9OMY1_17.jpeg", + "9OMY1/9OMY1_18.jpeg", + "9OMY1/9OMY1_19.jpeg", + "9OMY1/9OMY1_20.jpeg", + "9OMY1/9OMY1_21.jpeg", + "9OMY1/9OMY1_22.jpeg", + "9OMY1/9OMY1_23.jpeg", + "9OMY1/9OMY1_24.jpeg", + "9OMY1/9OMY1_25.jpeg", + "9OMY1/9OMY1_26.jpeg", + "9OMY1/9OMY1_27.jpeg", + "9OMY1/9OMY1_28.jpeg", + "9OMY1/9OMY1_29.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 83, + "question": "Based on the given images, prediction the person's next action You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the book.\nB. Open the closet/cabinet.\nC. Take the sandwich.\nD. Take the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "open the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "open the book", + "open the closet cabinet", + "take the sandwich", + "take the food" + ], + "image_quantity_level": "Medium", + "image": [ + "BFH78/BFH78_0.jpeg", + "BFH78/BFH78_1.jpeg", + "BFH78/BFH78_2.jpeg", + "BFH78/BFH78_3.jpeg", + "BFH78/BFH78_4.jpeg", + "BFH78/BFH78_5.jpeg", + "BFH78/BFH78_6.jpeg", + "BFH78/BFH78_7.jpeg", + "BFH78/BFH78_8.jpeg", + "BFH78/BFH78_9.jpeg", + "BFH78/BFH78_10.jpeg", + "BFH78/BFH78_11.jpeg", + "BFH78/BFH78_12.jpeg", + "BFH78/BFH78_13.jpeg", + "BFH78/BFH78_14.jpeg", + "BFH78/BFH78_15.jpeg", + "BFH78/BFH78_16.jpeg", + "BFH78/BFH78_17.jpeg", + "BFH78/BFH78_18.jpeg", + "BFH78/BFH78_19.jpeg", + "BFH78/BFH78_20.jpeg", + "BFH78/BFH78_21.jpeg", + "BFH78/BFH78_22.jpeg", + "BFH78/BFH78_23.jpeg", + "BFH78/BFH78_24.jpeg", + "BFH78/BFH78_25.jpeg", + "BFH78/BFH78_26.jpeg", + "BFH78/BFH78_27.jpeg", + "BFH78/BFH78_28.jpeg", + "BFH78/BFH78_29.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 88, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the shoe.\nB. Lie on the floor.\nC. Close the box.\nD. Tidy up the blanket.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "tidy up the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "take the shoe", + "lie on the floor", + "close the box", + "tidy up the blanket" + ], + "image_quantity_level": "Medium", + "image": [ + "84893/84893_0.jpeg", + "84893/84893_1.jpeg", + "84893/84893_2.jpeg", + "84893/84893_3.jpeg", + "84893/84893_4.jpeg", + "84893/84893_5.jpeg", + "84893/84893_6.jpeg", + "84893/84893_7.jpeg", + "84893/84893_8.jpeg", + "84893/84893_9.jpeg", + "84893/84893_10.jpeg", + "84893/84893_11.jpeg", + "84893/84893_12.jpeg", + "84893/84893_13.jpeg", + "84893/84893_14.jpeg", + "84893/84893_15.jpeg", + "84893/84893_16.jpeg", + "84893/84893_17.jpeg", + "84893/84893_18.jpeg", + "84893/84893_19.jpeg", + "84893/84893_20.jpeg", + "84893/84893_21.jpeg", + "84893/84893_22.jpeg", + "84893/84893_23.jpeg", + "84893/84893_24.jpeg", + "84893/84893_25.jpeg", + "84893/84893_26.jpeg", + "84893/84893_27.jpeg", + "84893/84893_28.jpeg", + "84893/84893_29.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 90, + "question": "Analyze the provided visuals and forecast the individual's subsequent move. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Tidy up the clothes.\nB. Take the pillow.\nC. Take the laptop.\nD. Take the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "tidy up the clothes", + "take the pillow", + "take the laptop", + "take the box" + ], + "image_quantity_level": "Medium", + "image": [ + "3T785/3T785_0.jpeg", + "3T785/3T785_1.jpeg", + "3T785/3T785_2.jpeg", + "3T785/3T785_3.jpeg", + "3T785/3T785_4.jpeg", + "3T785/3T785_5.jpeg", + "3T785/3T785_6.jpeg", + "3T785/3T785_7.jpeg", + "3T785/3T785_8.jpeg", + "3T785/3T785_9.jpeg", + "3T785/3T785_10.jpeg", + "3T785/3T785_11.jpeg", + "3T785/3T785_12.jpeg", + "3T785/3T785_13.jpeg", + "3T785/3T785_14.jpeg", + "3T785/3T785_15.jpeg", + "3T785/3T785_16.jpeg", + "3T785/3T785_17.jpeg", + "3T785/3T785_18.jpeg", + "3T785/3T785_19.jpeg", + "3T785/3T785_20.jpeg", + "3T785/3T785_21.jpeg", + "3T785/3T785_22.jpeg", + "3T785/3T785_23.jpeg", + "3T785/3T785_24.jpeg", + "3T785/3T785_25.jpeg", + "3T785/3T785_26.jpeg", + "3T785/3T785_27.jpeg", + "3T785/3T785_28.jpeg", + "3T785/3T785_29.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 91, + "question": "With the given visuals as your guide, infer the individual's impending action. You must choose your answer from the Choice List.\nWhat will the person do next with the cup/glass/bottle?\nChoice list: \nA. Take.\nB. Eat.\nC. Put down.\nD. Wash.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "take", + "eat", + "put down", + "wash" + ], + "image_quantity_level": "Medium", + "image": [ + "3MLVF/3MLVF_0.jpeg", + "3MLVF/3MLVF_1.jpeg", + "3MLVF/3MLVF_2.jpeg", + "3MLVF/3MLVF_3.jpeg", + "3MLVF/3MLVF_4.jpeg", + "3MLVF/3MLVF_5.jpeg", + "3MLVF/3MLVF_6.jpeg", + "3MLVF/3MLVF_7.jpeg", + "3MLVF/3MLVF_8.jpeg", + "3MLVF/3MLVF_9.jpeg", + "3MLVF/3MLVF_10.jpeg", + "3MLVF/3MLVF_11.jpeg", + "3MLVF/3MLVF_12.jpeg", + "3MLVF/3MLVF_13.jpeg", + "3MLVF/3MLVF_14.jpeg", + "3MLVF/3MLVF_15.jpeg", + "3MLVF/3MLVF_16.jpeg", + "3MLVF/3MLVF_17.jpeg", + "3MLVF/3MLVF_18.jpeg", + "3MLVF/3MLVF_19.jpeg", + "3MLVF/3MLVF_20.jpeg", + "3MLVF/3MLVF_21.jpeg", + "3MLVF/3MLVF_22.jpeg", + "3MLVF/3MLVF_23.jpeg", + "3MLVF/3MLVF_24.jpeg", + "3MLVF/3MLVF_25.jpeg", + "3MLVF/3MLVF_26.jpeg", + "3MLVF/3MLVF_27.jpeg", + "3MLVF/3MLVF_28.jpeg", + "3MLVF/3MLVF_29.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 93, + "question": "Analyze the provided visuals and forecast the individual's subsequent move. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the phone/camera.\nB. Open the bag.\nC. Put down the blanket.\nD. Throw the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "take the phone camera", + "open the bag", + "put down the blanket", + "throw the food" + ], + "image_quantity_level": "Medium", + "image": [ + "BDWIX/BDWIX_0.jpeg", + "BDWIX/BDWIX_1.jpeg", + "BDWIX/BDWIX_2.jpeg", + "BDWIX/BDWIX_3.jpeg", + "BDWIX/BDWIX_4.jpeg", + "BDWIX/BDWIX_5.jpeg", + "BDWIX/BDWIX_6.jpeg", + "BDWIX/BDWIX_7.jpeg", + "BDWIX/BDWIX_8.jpeg", + "BDWIX/BDWIX_9.jpeg", + "BDWIX/BDWIX_10.jpeg", + "BDWIX/BDWIX_11.jpeg", + "BDWIX/BDWIX_12.jpeg", + "BDWIX/BDWIX_13.jpeg", + "BDWIX/BDWIX_14.jpeg", + "BDWIX/BDWIX_15.jpeg", + "BDWIX/BDWIX_16.jpeg", + "BDWIX/BDWIX_17.jpeg", + "BDWIX/BDWIX_18.jpeg", + "BDWIX/BDWIX_19.jpeg", + "BDWIX/BDWIX_20.jpeg", + "BDWIX/BDWIX_21.jpeg", + "BDWIX/BDWIX_22.jpeg", + "BDWIX/BDWIX_23.jpeg", + "BDWIX/BDWIX_24.jpeg", + "BDWIX/BDWIX_25.jpeg", + "BDWIX/BDWIX_26.jpeg", + "BDWIX/BDWIX_27.jpeg", + "BDWIX/BDWIX_28.jpeg", + "BDWIX/BDWIX_29.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 97, + "question": "Infer the next action of the person from the presented images. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the sandwich.\nB. Put down the blanket.\nC. Take the cup/glass/bottle.\nD. Eat the sandwich.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the sandwich", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "take the sandwich", + "put down the blanket", + "take the cup glass bottle", + "eat the sandwich" + ], + "image_quantity_level": "Medium", + "image": [ + "UEP20/UEP20_0.jpeg", + "UEP20/UEP20_1.jpeg", + "UEP20/UEP20_2.jpeg", + "UEP20/UEP20_3.jpeg", + "UEP20/UEP20_4.jpeg", + "UEP20/UEP20_5.jpeg", + "UEP20/UEP20_6.jpeg", + "UEP20/UEP20_7.jpeg", + "UEP20/UEP20_8.jpeg", + "UEP20/UEP20_9.jpeg", + "UEP20/UEP20_10.jpeg", + "UEP20/UEP20_11.jpeg", + "UEP20/UEP20_12.jpeg", + "UEP20/UEP20_13.jpeg", + "UEP20/UEP20_14.jpeg", + "UEP20/UEP20_15.jpeg", + "UEP20/UEP20_16.jpeg", + "UEP20/UEP20_17.jpeg", + "UEP20/UEP20_18.jpeg", + "UEP20/UEP20_19.jpeg", + "UEP20/UEP20_20.jpeg", + "UEP20/UEP20_21.jpeg", + "UEP20/UEP20_22.jpeg", + "UEP20/UEP20_23.jpeg", + "UEP20/UEP20_24.jpeg", + "UEP20/UEP20_25.jpeg", + "UEP20/UEP20_26.jpeg", + "UEP20/UEP20_27.jpeg", + "UEP20/UEP20_28.jpeg", + "UEP20/UEP20_29.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 100, + "question": "Based on the given images, prediction the person's next action You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the book.\nB. Close the closet/cabinet.\nC. Put down the bag.\nD. Put down the paper/notebook.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "open the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "open the book", + "close the closet cabinet", + "put down the bag", + "put down the paper notebook" + ], + "image_quantity_level": "Medium", + "image": [ + "HOZ76/HOZ76_0.jpeg", + "HOZ76/HOZ76_1.jpeg", + "HOZ76/HOZ76_2.jpeg", + "HOZ76/HOZ76_3.jpeg", + "HOZ76/HOZ76_4.jpeg", + "HOZ76/HOZ76_5.jpeg", + "HOZ76/HOZ76_6.jpeg", + "HOZ76/HOZ76_7.jpeg", + "HOZ76/HOZ76_8.jpeg", + "HOZ76/HOZ76_9.jpeg", + "HOZ76/HOZ76_10.jpeg", + "HOZ76/HOZ76_11.jpeg", + "HOZ76/HOZ76_12.jpeg", + "HOZ76/HOZ76_13.jpeg", + "HOZ76/HOZ76_14.jpeg", + "HOZ76/HOZ76_15.jpeg", + "HOZ76/HOZ76_16.jpeg", + "HOZ76/HOZ76_17.jpeg", + "HOZ76/HOZ76_18.jpeg", + "HOZ76/HOZ76_19.jpeg", + "HOZ76/HOZ76_20.jpeg", + "HOZ76/HOZ76_21.jpeg", + "HOZ76/HOZ76_22.jpeg", + "HOZ76/HOZ76_23.jpeg", + "HOZ76/HOZ76_24.jpeg", + "HOZ76/HOZ76_25.jpeg", + "HOZ76/HOZ76_26.jpeg", + "HOZ76/HOZ76_27.jpeg", + "HOZ76/HOZ76_28.jpeg", + "HOZ76/HOZ76_29.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 103, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the box.\nB. Tidy up the blanket.\nC. Close the laptop.\nD. Put down the paper/notebook.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "close the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down the box", + "tidy up the blanket", + "close the laptop", + "put down the paper notebook" + ], + "image_quantity_level": "Medium", + "image": [ + "O2OUX/O2OUX_0.jpeg", + "O2OUX/O2OUX_1.jpeg", + "O2OUX/O2OUX_2.jpeg", + "O2OUX/O2OUX_3.jpeg", + "O2OUX/O2OUX_4.jpeg", + "O2OUX/O2OUX_5.jpeg", + "O2OUX/O2OUX_6.jpeg", + "O2OUX/O2OUX_7.jpeg", + "O2OUX/O2OUX_8.jpeg", + "O2OUX/O2OUX_9.jpeg", + "O2OUX/O2OUX_10.jpeg", + "O2OUX/O2OUX_11.jpeg", + "O2OUX/O2OUX_12.jpeg", + "O2OUX/O2OUX_13.jpeg", + "O2OUX/O2OUX_14.jpeg", + "O2OUX/O2OUX_15.jpeg", + "O2OUX/O2OUX_16.jpeg", + "O2OUX/O2OUX_17.jpeg", + "O2OUX/O2OUX_18.jpeg", + "O2OUX/O2OUX_19.jpeg", + "O2OUX/O2OUX_20.jpeg", + "O2OUX/O2OUX_21.jpeg", + "O2OUX/O2OUX_22.jpeg", + "O2OUX/O2OUX_23.jpeg", + "O2OUX/O2OUX_24.jpeg", + "O2OUX/O2OUX_25.jpeg", + "O2OUX/O2OUX_26.jpeg", + "O2OUX/O2OUX_27.jpeg", + "O2OUX/O2OUX_28.jpeg", + "O2OUX/O2OUX_29.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 107, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Tidy up the blanket.\nB. Open the refrigerator.\nC. Open the closet/cabinet.\nD. Throw the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "open the refrigerator", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "tidy up the blanket", + "open the refrigerator", + "open the closet cabinet", + "throw the box" + ], + "image_quantity_level": "Medium", + "image": [ + "I713Z/I713Z_0.jpeg", + "I713Z/I713Z_1.jpeg", + "I713Z/I713Z_2.jpeg", + "I713Z/I713Z_3.jpeg", + "I713Z/I713Z_4.jpeg", + "I713Z/I713Z_5.jpeg", + "I713Z/I713Z_6.jpeg", + "I713Z/I713Z_7.jpeg", + "I713Z/I713Z_8.jpeg", + "I713Z/I713Z_9.jpeg", + "I713Z/I713Z_10.jpeg", + "I713Z/I713Z_11.jpeg", + "I713Z/I713Z_12.jpeg", + "I713Z/I713Z_13.jpeg", + "I713Z/I713Z_14.jpeg", + "I713Z/I713Z_15.jpeg", + "I713Z/I713Z_16.jpeg", + "I713Z/I713Z_17.jpeg", + "I713Z/I713Z_18.jpeg", + "I713Z/I713Z_19.jpeg", + "I713Z/I713Z_20.jpeg", + "I713Z/I713Z_21.jpeg", + "I713Z/I713Z_22.jpeg", + "I713Z/I713Z_23.jpeg", + "I713Z/I713Z_24.jpeg", + "I713Z/I713Z_25.jpeg", + "I713Z/I713Z_26.jpeg", + "I713Z/I713Z_27.jpeg", + "I713Z/I713Z_28.jpeg", + "I713Z/I713Z_29.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 39, + "question": "Using the given pictures as a reference, anticipate the person's upcoming action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the cup/glass/bottle.\nB. Tidy up the blanket.\nC. Take the pillow.\nD. Put down the dish.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the pillow", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down the cup glass bottle", + "tidy up the blanket", + "take the pillow", + "put down the dish" + ], + "image_quantity_level": "Medium", + "image": [ + "75RPN/75RPN_0.jpeg", + "75RPN/75RPN_1.jpeg", + "75RPN/75RPN_2.jpeg", + "75RPN/75RPN_3.jpeg", + "75RPN/75RPN_4.jpeg", + "75RPN/75RPN_5.jpeg", + "75RPN/75RPN_6.jpeg", + "75RPN/75RPN_7.jpeg", + "75RPN/75RPN_8.jpeg", + "75RPN/75RPN_9.jpeg", + "75RPN/75RPN_10.jpeg", + "75RPN/75RPN_11.jpeg", + "75RPN/75RPN_12.jpeg", + "75RPN/75RPN_13.jpeg", + "75RPN/75RPN_14.jpeg", + "75RPN/75RPN_15.jpeg", + "75RPN/75RPN_16.jpeg", + "75RPN/75RPN_17.jpeg", + "75RPN/75RPN_18.jpeg", + "75RPN/75RPN_19.jpeg", + "75RPN/75RPN_20.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 180, + "question": "Evaluate the given images and infer the individual's next step. You must choose your answer from the Choice List.\nWhat will the person do next with the cup/glass/bottle?\nChoice list: \nA. Take.\nB. Put down.\nC. Wash.\nD. Lie on.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "take", + "put down", + "wash", + "lie on" + ], + "image_quantity_level": "Medium", + "image": [ + "KX8VW/KX8VW_0.jpeg", + "KX8VW/KX8VW_1.jpeg", + "KX8VW/KX8VW_2.jpeg", + "KX8VW/KX8VW_3.jpeg", + "KX8VW/KX8VW_4.jpeg", + "KX8VW/KX8VW_5.jpeg", + "KX8VW/KX8VW_6.jpeg", + "KX8VW/KX8VW_7.jpeg", + "KX8VW/KX8VW_8.jpeg", + "KX8VW/KX8VW_9.jpeg", + "KX8VW/KX8VW_10.jpeg", + "KX8VW/KX8VW_11.jpeg", + "KX8VW/KX8VW_12.jpeg", + "KX8VW/KX8VW_13.jpeg", + "KX8VW/KX8VW_14.jpeg", + "KX8VW/KX8VW_15.jpeg", + "KX8VW/KX8VW_16.jpeg", + "KX8VW/KX8VW_17.jpeg", + "KX8VW/KX8VW_18.jpeg", + "KX8VW/KX8VW_19.jpeg", + "KX8VW/KX8VW_20.jpeg", + "KX8VW/KX8VW_21.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 71, + "question": "Infer the next action of the person from the presented images. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the clothes.\nB. Lie on the floor.\nC. Sit on the table.\nD. Put down the towel.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the towel", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "take the clothes", + "lie on the floor", + "sit on the table", + "put down the towel" + ], + "image_quantity_level": "Medium", + "image": [ + "TK76G/TK76G_0.jpeg", + "TK76G/TK76G_1.jpeg", + "TK76G/TK76G_2.jpeg", + "TK76G/TK76G_3.jpeg", + "TK76G/TK76G_4.jpeg", + "TK76G/TK76G_5.jpeg", + "TK76G/TK76G_6.jpeg", + "TK76G/TK76G_7.jpeg", + "TK76G/TK76G_8.jpeg", + "TK76G/TK76G_9.jpeg", + "TK76G/TK76G_10.jpeg", + "TK76G/TK76G_11.jpeg", + "TK76G/TK76G_12.jpeg", + "TK76G/TK76G_13.jpeg", + "TK76G/TK76G_14.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 145, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Throw the clothes.\nB. Close the window.\nC. Put down the food.\nD. Wash the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "throw the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "throw the clothes", + "close the window", + "put down the food", + "wash the clothes" + ], + "image_quantity_level": "Medium", + "image": [ + "337O2/337O2_0.jpeg", + "337O2/337O2_1.jpeg", + "337O2/337O2_2.jpeg", + "337O2/337O2_3.jpeg", + "337O2/337O2_4.jpeg", + "337O2/337O2_5.jpeg", + "337O2/337O2_6.jpeg", + "337O2/337O2_7.jpeg", + "337O2/337O2_8.jpeg", + "337O2/337O2_9.jpeg", + "337O2/337O2_10.jpeg", + "337O2/337O2_11.jpeg", + "337O2/337O2_12.jpeg", + "337O2/337O2_13.jpeg", + "337O2/337O2_14.jpeg", + "337O2/337O2_15.jpeg", + "337O2/337O2_16.jpeg", + "337O2/337O2_17.jpeg", + "337O2/337O2_18.jpeg", + "337O2/337O2_19.jpeg", + "337O2/337O2_20.jpeg", + "337O2/337O2_21.jpeg", + "337O2/337O2_22.jpeg", + "337O2/337O2_23.jpeg", + "337O2/337O2_24.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 117, + "question": "Study the provided images and predict the individual's forthcoming behavior. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the cup/glass/bottle.\nB. Wash the mirror.\nC. Open the box.\nD. Open the door.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "open the door", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "put down the cup glass bottle", + "wash the mirror", + "open the box", + "open the door" + ], + "image_quantity_level": "Medium", + "image": [ + "DGPAW/DGPAW_0.jpeg", + "DGPAW/DGPAW_1.jpeg", + "DGPAW/DGPAW_2.jpeg", + "DGPAW/DGPAW_3.jpeg", + "DGPAW/DGPAW_4.jpeg", + "DGPAW/DGPAW_5.jpeg", + "DGPAW/DGPAW_6.jpeg", + "DGPAW/DGPAW_7.jpeg", + "DGPAW/DGPAW_8.jpeg", + "DGPAW/DGPAW_9.jpeg", + "DGPAW/DGPAW_10.jpeg", + "DGPAW/DGPAW_11.jpeg", + "DGPAW/DGPAW_12.jpeg", + "DGPAW/DGPAW_13.jpeg", + "DGPAW/DGPAW_14.jpeg", + "DGPAW/DGPAW_15.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 195, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Throw the book.\nB. Take the towel.\nC. Close the door.\nD. Put down the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "throw the book", + "take the towel", + "close the door", + "put down the book" + ], + "image_quantity_level": "Many", + "image": [ + "U3ANG/U3ANG_0.jpeg", + "U3ANG/U3ANG_1.jpeg", + "U3ANG/U3ANG_2.jpeg", + "U3ANG/U3ANG_3.jpeg", + "U3ANG/U3ANG_4.jpeg", + "U3ANG/U3ANG_5.jpeg", + "U3ANG/U3ANG_6.jpeg", + "U3ANG/U3ANG_7.jpeg", + "U3ANG/U3ANG_8.jpeg", + "U3ANG/U3ANG_9.jpeg", + "U3ANG/U3ANG_10.jpeg", + "U3ANG/U3ANG_11.jpeg", + "U3ANG/U3ANG_12.jpeg", + "U3ANG/U3ANG_13.jpeg", + "U3ANG/U3ANG_14.jpeg", + "U3ANG/U3ANG_15.jpeg", + "U3ANG/U3ANG_16.jpeg", + "U3ANG/U3ANG_17.jpeg", + "U3ANG/U3ANG_18.jpeg", + "U3ANG/U3ANG_19.jpeg", + "U3ANG/U3ANG_20.jpeg", + "U3ANG/U3ANG_21.jpeg", + "U3ANG/U3ANG_22.jpeg", + "U3ANG/U3ANG_23.jpeg", + "U3ANG/U3ANG_24.jpeg", + "U3ANG/U3ANG_25.jpeg", + "U3ANG/U3ANG_26.jpeg", + "U3ANG/U3ANG_27.jpeg", + "U3ANG/U3ANG_28.jpeg", + "U3ANG/U3ANG_29.jpeg", + "U3ANG/U3ANG_30.jpeg", + "U3ANG/U3ANG_31.jpeg", + "U3ANG/U3ANG_32.jpeg", + "U3ANG/U3ANG_33.jpeg", + "U3ANG/U3ANG_34.jpeg", + "U3ANG/U3ANG_35.jpeg", + "U3ANG/U3ANG_36.jpeg", + "U3ANG/U3ANG_37.jpeg", + "U3ANG/U3ANG_38.jpeg", + "U3ANG/U3ANG_39.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 149, + "question": "Infer the next action of the person from the presented images. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the sandwich.\nB. Take the dish.\nC. Wash the dish.\nD. Close the laptop.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the sandwich", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the sandwich", + "take the dish", + "wash the dish", + "close the laptop" + ], + "image_quantity_level": "Medium", + "image": [ + "3064K/3064K_0.jpeg", + "3064K/3064K_1.jpeg", + "3064K/3064K_2.jpeg", + "3064K/3064K_3.jpeg", + "3064K/3064K_4.jpeg", + "3064K/3064K_5.jpeg", + "3064K/3064K_6.jpeg", + "3064K/3064K_7.jpeg", + "3064K/3064K_8.jpeg", + "3064K/3064K_9.jpeg", + "3064K/3064K_10.jpeg", + "3064K/3064K_11.jpeg", + "3064K/3064K_12.jpeg", + "3064K/3064K_13.jpeg", + "3064K/3064K_14.jpeg", + "3064K/3064K_15.jpeg", + "3064K/3064K_16.jpeg", + "3064K/3064K_17.jpeg", + "3064K/3064K_18.jpeg", + "3064K/3064K_19.jpeg", + "3064K/3064K_20.jpeg", + "3064K/3064K_21.jpeg", + "3064K/3064K_22.jpeg", + "3064K/3064K_23.jpeg", + "3064K/3064K_24.jpeg", + "3064K/3064K_25.jpeg", + "3064K/3064K_26.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 163, + "question": "Based on the given images, prediction the person's next action You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Throw the food.\nB. Take the box.\nC. Put down the broom.\nD. Close the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the broom", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "throw the food", + "take the box", + "put down the broom", + "close the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "D8OSC/D8OSC_0.jpeg", + "D8OSC/D8OSC_1.jpeg", + "D8OSC/D8OSC_2.jpeg", + "D8OSC/D8OSC_3.jpeg", + "D8OSC/D8OSC_4.jpeg", + "D8OSC/D8OSC_5.jpeg", + "D8OSC/D8OSC_6.jpeg", + "D8OSC/D8OSC_7.jpeg", + "D8OSC/D8OSC_8.jpeg", + "D8OSC/D8OSC_9.jpeg", + "D8OSC/D8OSC_10.jpeg", + "D8OSC/D8OSC_11.jpeg", + "D8OSC/D8OSC_12.jpeg", + "D8OSC/D8OSC_13.jpeg", + "D8OSC/D8OSC_14.jpeg", + "D8OSC/D8OSC_15.jpeg", + "D8OSC/D8OSC_16.jpeg", + "D8OSC/D8OSC_17.jpeg", + "D8OSC/D8OSC_18.jpeg", + "D8OSC/D8OSC_19.jpeg", + "D8OSC/D8OSC_20.jpeg", + "D8OSC/D8OSC_21.jpeg", + "D8OSC/D8OSC_22.jpeg", + "D8OSC/D8OSC_23.jpeg", + "D8OSC/D8OSC_24.jpeg", + "D8OSC/D8OSC_25.jpeg", + "D8OSC/D8OSC_26.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 137, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next with the food?\nChoice list: \nA. Hold.\nB. Throw.\nC. Take.\nD. Put down.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "hold", + "throw", + "take", + "put down" + ], + "image_quantity_level": "Medium", + "image": [ + "QPX3S/QPX3S_0.jpeg", + "QPX3S/QPX3S_1.jpeg", + "QPX3S/QPX3S_2.jpeg", + "QPX3S/QPX3S_3.jpeg", + "QPX3S/QPX3S_4.jpeg", + "QPX3S/QPX3S_5.jpeg", + "QPX3S/QPX3S_6.jpeg", + "QPX3S/QPX3S_7.jpeg", + "QPX3S/QPX3S_8.jpeg", + "QPX3S/QPX3S_9.jpeg", + "QPX3S/QPX3S_10.jpeg", + "QPX3S/QPX3S_11.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 185, + "question": "With the given visuals as your guide, infer the individual's impending action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the clothes.\nB. Put down the clothes.\nC. Put down the broom.\nD. Put down the picture.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the broom", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "take the clothes", + "put down the clothes", + "put down the broom", + "put down the picture" + ], + "image_quantity_level": "Many", + "image": [ + "EO6OI/EO6OI_0.jpeg", + "EO6OI/EO6OI_1.jpeg", + "EO6OI/EO6OI_2.jpeg", + "EO6OI/EO6OI_3.jpeg", + "EO6OI/EO6OI_4.jpeg", + "EO6OI/EO6OI_5.jpeg", + "EO6OI/EO6OI_6.jpeg", + "EO6OI/EO6OI_7.jpeg", + "EO6OI/EO6OI_8.jpeg", + "EO6OI/EO6OI_9.jpeg", + "EO6OI/EO6OI_10.jpeg", + "EO6OI/EO6OI_11.jpeg", + "EO6OI/EO6OI_12.jpeg", + "EO6OI/EO6OI_13.jpeg", + "EO6OI/EO6OI_14.jpeg", + "EO6OI/EO6OI_15.jpeg", + "EO6OI/EO6OI_16.jpeg", + "EO6OI/EO6OI_17.jpeg", + "EO6OI/EO6OI_18.jpeg", + "EO6OI/EO6OI_19.jpeg", + "EO6OI/EO6OI_20.jpeg", + "EO6OI/EO6OI_21.jpeg", + "EO6OI/EO6OI_22.jpeg", + "EO6OI/EO6OI_23.jpeg", + "EO6OI/EO6OI_24.jpeg", + "EO6OI/EO6OI_25.jpeg", + "EO6OI/EO6OI_26.jpeg", + "EO6OI/EO6OI_27.jpeg", + "EO6OI/EO6OI_28.jpeg", + "EO6OI/EO6OI_29.jpeg", + "EO6OI/EO6OI_30.jpeg", + "EO6OI/EO6OI_31.jpeg", + "EO6OI/EO6OI_32.jpeg", + "EO6OI/EO6OI_33.jpeg", + "EO6OI/EO6OI_34.jpeg", + "EO6OI/EO6OI_35.jpeg", + "EO6OI/EO6OI_36.jpeg", + "EO6OI/EO6OI_37.jpeg", + "EO6OI/EO6OI_38.jpeg", + "EO6OI/EO6OI_39.jpeg", + "EO6OI/EO6OI_40.jpeg", + "EO6OI/EO6OI_41.jpeg", + "EO6OI/EO6OI_42.jpeg", + "EO6OI/EO6OI_43.jpeg", + "EO6OI/EO6OI_44.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 129, + "question": "Study the provided images and predict the individual's forthcoming behavior. You must choose your answer from the Choice List.\nWhat will the person do next with the food?\nChoice list: \nA. Put down.\nB. Hold.\nC. Take.\nD. Throw.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down", + "hold", + "take", + "throw" + ], + "image_quantity_level": "Many", + "image": [ + "9O0HH/9O0HH_0.jpeg", + "9O0HH/9O0HH_1.jpeg", + "9O0HH/9O0HH_2.jpeg", + "9O0HH/9O0HH_3.jpeg", + "9O0HH/9O0HH_4.jpeg", + "9O0HH/9O0HH_5.jpeg", + "9O0HH/9O0HH_6.jpeg", + "9O0HH/9O0HH_7.jpeg", + "9O0HH/9O0HH_8.jpeg", + "9O0HH/9O0HH_9.jpeg", + "9O0HH/9O0HH_10.jpeg", + "9O0HH/9O0HH_11.jpeg", + "9O0HH/9O0HH_12.jpeg", + "9O0HH/9O0HH_13.jpeg", + "9O0HH/9O0HH_14.jpeg", + "9O0HH/9O0HH_15.jpeg", + "9O0HH/9O0HH_16.jpeg", + "9O0HH/9O0HH_17.jpeg", + "9O0HH/9O0HH_18.jpeg", + "9O0HH/9O0HH_19.jpeg", + "9O0HH/9O0HH_20.jpeg", + "9O0HH/9O0HH_21.jpeg", + "9O0HH/9O0HH_22.jpeg", + "9O0HH/9O0HH_23.jpeg", + "9O0HH/9O0HH_24.jpeg", + "9O0HH/9O0HH_25.jpeg", + "9O0HH/9O0HH_26.jpeg", + "9O0HH/9O0HH_27.jpeg", + "9O0HH/9O0HH_28.jpeg", + "9O0HH/9O0HH_29.jpeg", + "9O0HH/9O0HH_30.jpeg", + "9O0HH/9O0HH_31.jpeg", + "9O0HH/9O0HH_32.jpeg", + "9O0HH/9O0HH_33.jpeg", + "9O0HH/9O0HH_34.jpeg", + "9O0HH/9O0HH_35.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 69, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the clothes.\nB. Take the dish.\nC. Hold the picture.\nD. Take the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "take the clothes", + "take the dish", + "hold the picture", + "take the bag" + ], + "image_quantity_level": "Medium", + "image": [ + "5X5DT/5X5DT_0.jpeg", + "5X5DT/5X5DT_1.jpeg", + "5X5DT/5X5DT_2.jpeg", + "5X5DT/5X5DT_3.jpeg", + "5X5DT/5X5DT_4.jpeg", + "5X5DT/5X5DT_5.jpeg", + "5X5DT/5X5DT_6.jpeg", + "5X5DT/5X5DT_7.jpeg", + "5X5DT/5X5DT_8.jpeg", + "5X5DT/5X5DT_9.jpeg", + "5X5DT/5X5DT_10.jpeg", + "5X5DT/5X5DT_11.jpeg", + "5X5DT/5X5DT_12.jpeg", + "5X5DT/5X5DT_13.jpeg", + "5X5DT/5X5DT_14.jpeg", + "5X5DT/5X5DT_15.jpeg", + "5X5DT/5X5DT_16.jpeg", + "5X5DT/5X5DT_17.jpeg", + "5X5DT/5X5DT_18.jpeg", + "5X5DT/5X5DT_19.jpeg", + "5X5DT/5X5DT_20.jpeg", + "5X5DT/5X5DT_21.jpeg", + "5X5DT/5X5DT_22.jpeg", + "5X5DT/5X5DT_23.jpeg", + "5X5DT/5X5DT_24.jpeg", + "5X5DT/5X5DT_25.jpeg", + "5X5DT/5X5DT_26.jpeg", + "5X5DT/5X5DT_27.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 87, + "question": "Evaluate the given images and infer the individual's next step. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Sit at the table.\nB. Close the window.\nC. Take the cup/glass/bottle.\nD. Eat the sandwich.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "eat the sandwich", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "sit at the table", + "close the window", + "take the cup glass bottle", + "eat the sandwich" + ], + "image_quantity_level": "Medium", + "image": [ + "V3RAX/V3RAX_0.jpeg", + "V3RAX/V3RAX_1.jpeg", + "V3RAX/V3RAX_2.jpeg", + "V3RAX/V3RAX_3.jpeg", + "V3RAX/V3RAX_4.jpeg", + "V3RAX/V3RAX_5.jpeg", + "V3RAX/V3RAX_6.jpeg", + "V3RAX/V3RAX_7.jpeg", + "V3RAX/V3RAX_8.jpeg", + "V3RAX/V3RAX_9.jpeg", + "V3RAX/V3RAX_10.jpeg", + "V3RAX/V3RAX_11.jpeg", + "V3RAX/V3RAX_12.jpeg", + "V3RAX/V3RAX_13.jpeg", + "V3RAX/V3RAX_14.jpeg", + "V3RAX/V3RAX_15.jpeg", + "V3RAX/V3RAX_16.jpeg", + "V3RAX/V3RAX_17.jpeg", + "V3RAX/V3RAX_18.jpeg", + "V3RAX/V3RAX_19.jpeg", + "V3RAX/V3RAX_20.jpeg", + "V3RAX/V3RAX_21.jpeg", + "V3RAX/V3RAX_22.jpeg", + "V3RAX/V3RAX_23.jpeg", + "V3RAX/V3RAX_24.jpeg", + "V3RAX/V3RAX_25.jpeg", + "V3RAX/V3RAX_26.jpeg", + "V3RAX/V3RAX_27.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 131, + "question": "Based on the given images, prediction the person's next action You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the food.\nB. Take the blanket.\nC. Eat the medicine.\nD. Open the refrigerator.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the food", + "take the blanket", + "eat the medicine", + "open the refrigerator" + ], + "image_quantity_level": "Medium", + "image": [ + "K71S1/K71S1_0.jpeg", + "K71S1/K71S1_1.jpeg", + "K71S1/K71S1_2.jpeg", + "K71S1/K71S1_3.jpeg", + "K71S1/K71S1_4.jpeg", + "K71S1/K71S1_5.jpeg", + "K71S1/K71S1_6.jpeg", + "K71S1/K71S1_7.jpeg", + "K71S1/K71S1_8.jpeg", + "K71S1/K71S1_9.jpeg", + "K71S1/K71S1_10.jpeg", + "K71S1/K71S1_11.jpeg", + "K71S1/K71S1_12.jpeg", + "K71S1/K71S1_13.jpeg", + "K71S1/K71S1_14.jpeg", + "K71S1/K71S1_15.jpeg", + "K71S1/K71S1_16.jpeg", + "K71S1/K71S1_17.jpeg", + "K71S1/K71S1_18.jpeg", + "K71S1/K71S1_19.jpeg", + "K71S1/K71S1_20.jpeg", + "K71S1/K71S1_21.jpeg", + "K71S1/K71S1_22.jpeg", + "K71S1/K71S1_23.jpeg", + "K71S1/K71S1_24.jpeg", + "K71S1/K71S1_25.jpeg", + "K71S1/K71S1_26.jpeg", + "K71S1/K71S1_27.jpeg", + "K71S1/K71S1_28.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 132, + "question": "Based on the given images, prediction the person's next action You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Wash the cup/glass/bottle.\nB. Put down the laptop.\nC. Put down the cup/glass/bottle.\nD. Put down the shoe.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "wash the cup glass bottle", + "put down the laptop", + "put down the cup glass bottle", + "put down the shoe" + ], + "image_quantity_level": "Medium", + "image": [ + "KV99H/KV99H_0.jpeg", + "KV99H/KV99H_1.jpeg", + "KV99H/KV99H_2.jpeg", + "KV99H/KV99H_3.jpeg", + "KV99H/KV99H_4.jpeg", + "KV99H/KV99H_5.jpeg", + "KV99H/KV99H_6.jpeg", + "KV99H/KV99H_7.jpeg", + "KV99H/KV99H_8.jpeg", + "KV99H/KV99H_9.jpeg", + "KV99H/KV99H_10.jpeg", + "KV99H/KV99H_11.jpeg", + "KV99H/KV99H_12.jpeg", + "KV99H/KV99H_13.jpeg", + "KV99H/KV99H_14.jpeg", + "KV99H/KV99H_15.jpeg", + "KV99H/KV99H_16.jpeg", + "KV99H/KV99H_17.jpeg", + "KV99H/KV99H_18.jpeg", + "KV99H/KV99H_19.jpeg", + "KV99H/KV99H_20.jpeg", + "KV99H/KV99H_21.jpeg", + "KV99H/KV99H_22.jpeg", + "KV99H/KV99H_23.jpeg", + "KV99H/KV99H_24.jpeg", + "KV99H/KV99H_25.jpeg", + "KV99H/KV99H_26.jpeg", + "KV99H/KV99H_27.jpeg", + "KV99H/KV99H_28.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 155, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the shoe.\nB. Take the book.\nC. Sit on the table.\nD. Take the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "take the shoe", + "take the book", + "sit on the table", + "take the box" + ], + "image_quantity_level": "Medium", + "image": [ + "YACA8/YACA8_0.jpeg", + "YACA8/YACA8_1.jpeg", + "YACA8/YACA8_2.jpeg", + "YACA8/YACA8_3.jpeg", + "YACA8/YACA8_4.jpeg", + "YACA8/YACA8_5.jpeg", + "YACA8/YACA8_6.jpeg", + "YACA8/YACA8_7.jpeg", + "YACA8/YACA8_8.jpeg", + "YACA8/YACA8_9.jpeg", + "YACA8/YACA8_10.jpeg", + "YACA8/YACA8_11.jpeg", + "YACA8/YACA8_12.jpeg", + "YACA8/YACA8_13.jpeg", + "YACA8/YACA8_14.jpeg", + "YACA8/YACA8_15.jpeg", + "YACA8/YACA8_16.jpeg", + "YACA8/YACA8_17.jpeg", + "YACA8/YACA8_18.jpeg", + "YACA8/YACA8_19.jpeg", + "YACA8/YACA8_20.jpeg", + "YACA8/YACA8_21.jpeg", + "YACA8/YACA8_22.jpeg", + "YACA8/YACA8_23.jpeg", + "YACA8/YACA8_24.jpeg", + "YACA8/YACA8_25.jpeg", + "YACA8/YACA8_26.jpeg", + "YACA8/YACA8_27.jpeg", + "YACA8/YACA8_28.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 101, + "question": "Using the given pictures as a reference, anticipate the person's upcoming action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the door.\nB. Open the refrigerator.\nC. Tidy up the towel.\nD. Lie on the bed.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "open the refrigerator", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "open the door", + "open the refrigerator", + "tidy up the towel", + "lie on the bed" + ], + "image_quantity_level": "Many", + "image": [ + "FVITB/FVITB_0.jpeg", + "FVITB/FVITB_1.jpeg", + "FVITB/FVITB_2.jpeg", + "FVITB/FVITB_3.jpeg", + "FVITB/FVITB_4.jpeg", + "FVITB/FVITB_5.jpeg", + "FVITB/FVITB_6.jpeg", + "FVITB/FVITB_7.jpeg", + "FVITB/FVITB_8.jpeg", + "FVITB/FVITB_9.jpeg", + "FVITB/FVITB_10.jpeg", + "FVITB/FVITB_11.jpeg", + "FVITB/FVITB_12.jpeg", + "FVITB/FVITB_13.jpeg", + "FVITB/FVITB_14.jpeg", + "FVITB/FVITB_15.jpeg", + "FVITB/FVITB_16.jpeg", + "FVITB/FVITB_17.jpeg", + "FVITB/FVITB_18.jpeg", + "FVITB/FVITB_19.jpeg", + "FVITB/FVITB_20.jpeg", + "FVITB/FVITB_21.jpeg", + "FVITB/FVITB_22.jpeg", + "FVITB/FVITB_23.jpeg", + "FVITB/FVITB_24.jpeg", + "FVITB/FVITB_25.jpeg", + "FVITB/FVITB_26.jpeg", + "FVITB/FVITB_27.jpeg", + "FVITB/FVITB_28.jpeg", + "FVITB/FVITB_29.jpeg", + "FVITB/FVITB_30.jpeg", + "FVITB/FVITB_31.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 102, + "question": "Study the provided images and predict the individual's forthcoming behavior. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the cup/glass/bottle.\nB. Close the box.\nC. Put down the cup/glass/bottle.\nD. Open the laptop.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "open the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "take the cup glass bottle", + "close the box", + "put down the cup glass bottle", + "open the laptop" + ], + "image_quantity_level": "Many", + "image": [ + "A0ZW3/A0ZW3_0.jpeg", + "A0ZW3/A0ZW3_1.jpeg", + "A0ZW3/A0ZW3_2.jpeg", + "A0ZW3/A0ZW3_3.jpeg", + "A0ZW3/A0ZW3_4.jpeg", + "A0ZW3/A0ZW3_5.jpeg", + "A0ZW3/A0ZW3_6.jpeg", + "A0ZW3/A0ZW3_7.jpeg", + "A0ZW3/A0ZW3_8.jpeg", + "A0ZW3/A0ZW3_9.jpeg", + "A0ZW3/A0ZW3_10.jpeg", + "A0ZW3/A0ZW3_11.jpeg", + "A0ZW3/A0ZW3_12.jpeg", + "A0ZW3/A0ZW3_13.jpeg", + "A0ZW3/A0ZW3_14.jpeg", + "A0ZW3/A0ZW3_15.jpeg", + "A0ZW3/A0ZW3_16.jpeg", + "A0ZW3/A0ZW3_17.jpeg", + "A0ZW3/A0ZW3_18.jpeg", + "A0ZW3/A0ZW3_19.jpeg", + "A0ZW3/A0ZW3_20.jpeg", + "A0ZW3/A0ZW3_21.jpeg", + "A0ZW3/A0ZW3_22.jpeg", + "A0ZW3/A0ZW3_23.jpeg", + "A0ZW3/A0ZW3_24.jpeg", + "A0ZW3/A0ZW3_25.jpeg", + "A0ZW3/A0ZW3_26.jpeg", + "A0ZW3/A0ZW3_27.jpeg", + "A0ZW3/A0ZW3_28.jpeg", + "A0ZW3/A0ZW3_29.jpeg", + "A0ZW3/A0ZW3_30.jpeg", + "A0ZW3/A0ZW3_31.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 112, + "question": "Evaluate the given images and infer the individual's next step. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Lie on the bed.\nB. Put down the dish.\nC. Throw the clothes.\nD. Sit on the table.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "throw the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "lie on the bed", + "put down the dish", + "throw the clothes", + "sit on the table" + ], + "image_quantity_level": "Many", + "image": [ + "U0X4P/U0X4P_0.jpeg", + "U0X4P/U0X4P_1.jpeg", + "U0X4P/U0X4P_2.jpeg", + "U0X4P/U0X4P_3.jpeg", + "U0X4P/U0X4P_4.jpeg", + "U0X4P/U0X4P_5.jpeg", + "U0X4P/U0X4P_6.jpeg", + "U0X4P/U0X4P_7.jpeg", + "U0X4P/U0X4P_8.jpeg", + "U0X4P/U0X4P_9.jpeg", + "U0X4P/U0X4P_10.jpeg", + "U0X4P/U0X4P_11.jpeg", + "U0X4P/U0X4P_12.jpeg", + "U0X4P/U0X4P_13.jpeg", + "U0X4P/U0X4P_14.jpeg", + "U0X4P/U0X4P_15.jpeg", + "U0X4P/U0X4P_16.jpeg", + "U0X4P/U0X4P_17.jpeg", + "U0X4P/U0X4P_18.jpeg", + "U0X4P/U0X4P_19.jpeg", + "U0X4P/U0X4P_20.jpeg", + "U0X4P/U0X4P_21.jpeg", + "U0X4P/U0X4P_22.jpeg", + "U0X4P/U0X4P_23.jpeg", + "U0X4P/U0X4P_24.jpeg", + "U0X4P/U0X4P_25.jpeg", + "U0X4P/U0X4P_26.jpeg", + "U0X4P/U0X4P_27.jpeg", + "U0X4P/U0X4P_28.jpeg", + "U0X4P/U0X4P_29.jpeg", + "U0X4P/U0X4P_30.jpeg", + "U0X4P/U0X4P_31.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 118, + "question": "Study the provided images and predict the individual's forthcoming behavior. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the book.\nB. Eat the sandwich.\nC. Open the door.\nD. Throw the blanket.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "take the book", + "eat the sandwich", + "open the door", + "throw the blanket" + ], + "image_quantity_level": "Many", + "image": [ + "3DO95/3DO95_0.jpeg", + "3DO95/3DO95_1.jpeg", + "3DO95/3DO95_2.jpeg", + "3DO95/3DO95_3.jpeg", + "3DO95/3DO95_4.jpeg", + "3DO95/3DO95_5.jpeg", + "3DO95/3DO95_6.jpeg", + "3DO95/3DO95_7.jpeg", + "3DO95/3DO95_8.jpeg", + "3DO95/3DO95_9.jpeg", + "3DO95/3DO95_10.jpeg", + "3DO95/3DO95_11.jpeg", + "3DO95/3DO95_12.jpeg", + "3DO95/3DO95_13.jpeg", + "3DO95/3DO95_14.jpeg", + "3DO95/3DO95_15.jpeg", + "3DO95/3DO95_16.jpeg", + "3DO95/3DO95_17.jpeg", + "3DO95/3DO95_18.jpeg", + "3DO95/3DO95_19.jpeg", + "3DO95/3DO95_20.jpeg", + "3DO95/3DO95_21.jpeg", + "3DO95/3DO95_22.jpeg", + "3DO95/3DO95_23.jpeg", + "3DO95/3DO95_24.jpeg", + "3DO95/3DO95_25.jpeg", + "3DO95/3DO95_26.jpeg", + "3DO95/3DO95_27.jpeg", + "3DO95/3DO95_28.jpeg", + "3DO95/3DO95_29.jpeg", + "3DO95/3DO95_30.jpeg", + "3DO95/3DO95_31.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 126, + "question": "Analyze the provided visuals and forecast the individual's subsequent move. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Close the book.\nB. Put down the pillow.\nC. Open the bag.\nD. Tidy up the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the pillow", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "close the book", + "put down the pillow", + "open the bag", + "tidy up the clothes" + ], + "image_quantity_level": "Medium", + "image": [ + "AZMVM/AZMVM_0.jpeg", + "AZMVM/AZMVM_1.jpeg", + "AZMVM/AZMVM_2.jpeg", + "AZMVM/AZMVM_3.jpeg", + "AZMVM/AZMVM_4.jpeg", + "AZMVM/AZMVM_5.jpeg", + "AZMVM/AZMVM_6.jpeg", + "AZMVM/AZMVM_7.jpeg", + "AZMVM/AZMVM_8.jpeg", + "AZMVM/AZMVM_9.jpeg", + "AZMVM/AZMVM_10.jpeg", + "AZMVM/AZMVM_11.jpeg", + "AZMVM/AZMVM_12.jpeg", + "AZMVM/AZMVM_13.jpeg", + "AZMVM/AZMVM_14.jpeg", + "AZMVM/AZMVM_15.jpeg", + "AZMVM/AZMVM_16.jpeg", + "AZMVM/AZMVM_17.jpeg", + "AZMVM/AZMVM_18.jpeg", + "AZMVM/AZMVM_19.jpeg", + "AZMVM/AZMVM_20.jpeg", + "AZMVM/AZMVM_21.jpeg", + "AZMVM/AZMVM_22.jpeg", + "AZMVM/AZMVM_23.jpeg", + "AZMVM/AZMVM_24.jpeg", + "AZMVM/AZMVM_25.jpeg", + "AZMVM/AZMVM_26.jpeg", + "AZMVM/AZMVM_27.jpeg", + "AZMVM/AZMVM_28.jpeg", + "AZMVM/AZMVM_29.jpeg", + "AZMVM/AZMVM_30.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 128, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next with the food?\nChoice list: \nA. Hold.\nB. Put down.\nC. Take.\nD. Throw.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "hold", + "put down", + "take", + "throw" + ], + "image_quantity_level": "Medium", + "image": [ + "UDGP2/UDGP2_0.jpeg", + "UDGP2/UDGP2_1.jpeg", + "UDGP2/UDGP2_2.jpeg", + "UDGP2/UDGP2_3.jpeg", + "UDGP2/UDGP2_4.jpeg", + "UDGP2/UDGP2_5.jpeg", + "UDGP2/UDGP2_6.jpeg", + "UDGP2/UDGP2_7.jpeg", + "UDGP2/UDGP2_8.jpeg", + "UDGP2/UDGP2_9.jpeg", + "UDGP2/UDGP2_10.jpeg", + "UDGP2/UDGP2_11.jpeg", + "UDGP2/UDGP2_12.jpeg", + "UDGP2/UDGP2_13.jpeg", + "UDGP2/UDGP2_14.jpeg", + "UDGP2/UDGP2_15.jpeg", + "UDGP2/UDGP2_16.jpeg", + "UDGP2/UDGP2_17.jpeg", + "UDGP2/UDGP2_18.jpeg", + "UDGP2/UDGP2_19.jpeg", + "UDGP2/UDGP2_20.jpeg", + "UDGP2/UDGP2_21.jpeg", + "UDGP2/UDGP2_22.jpeg", + "UDGP2/UDGP2_23.jpeg", + "UDGP2/UDGP2_24.jpeg", + "UDGP2/UDGP2_25.jpeg", + "UDGP2/UDGP2_26.jpeg", + "UDGP2/UDGP2_27.jpeg", + "UDGP2/UDGP2_28.jpeg", + "UDGP2/UDGP2_29.jpeg", + "UDGP2/UDGP2_30.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 130, + "question": "Using the given pictures as a reference, anticipate the person's upcoming action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Tidy up the closet/cabinet.\nB. Take the food.\nC. Close the refrigerator.\nD. Close the laptop.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "close the refrigerator", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "tidy up the closet cabinet", + "take the food", + "close the refrigerator", + "close the laptop" + ], + "image_quantity_level": "Medium", + "image": [ + "3N1I2/3N1I2_0.jpeg", + "3N1I2/3N1I2_1.jpeg", + "3N1I2/3N1I2_2.jpeg", + "3N1I2/3N1I2_3.jpeg", + "3N1I2/3N1I2_4.jpeg", + "3N1I2/3N1I2_5.jpeg", + "3N1I2/3N1I2_6.jpeg", + "3N1I2/3N1I2_7.jpeg", + "3N1I2/3N1I2_8.jpeg", + "3N1I2/3N1I2_9.jpeg", + "3N1I2/3N1I2_10.jpeg", + "3N1I2/3N1I2_11.jpeg", + "3N1I2/3N1I2_12.jpeg", + "3N1I2/3N1I2_13.jpeg", + "3N1I2/3N1I2_14.jpeg", + "3N1I2/3N1I2_15.jpeg", + "3N1I2/3N1I2_16.jpeg", + "3N1I2/3N1I2_17.jpeg", + "3N1I2/3N1I2_18.jpeg", + "3N1I2/3N1I2_19.jpeg", + "3N1I2/3N1I2_20.jpeg", + "3N1I2/3N1I2_21.jpeg", + "3N1I2/3N1I2_22.jpeg", + "3N1I2/3N1I2_23.jpeg", + "3N1I2/3N1I2_24.jpeg", + "3N1I2/3N1I2_25.jpeg", + "3N1I2/3N1I2_26.jpeg", + "3N1I2/3N1I2_27.jpeg", + "3N1I2/3N1I2_28.jpeg", + "3N1I2/3N1I2_29.jpeg", + "3N1I2/3N1I2_30.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 148, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the box.\nB. Tidy up the clothes.\nC. Take the phone/camera.\nD. Open the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "tidy up the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "put down the box", + "tidy up the clothes", + "take the phone camera", + "open the bag" + ], + "image_quantity_level": "Medium", + "image": [ + "TPSUY/TPSUY_0.jpeg", + "TPSUY/TPSUY_1.jpeg", + "TPSUY/TPSUY_2.jpeg", + "TPSUY/TPSUY_3.jpeg", + "TPSUY/TPSUY_4.jpeg", + "TPSUY/TPSUY_5.jpeg", + "TPSUY/TPSUY_6.jpeg", + "TPSUY/TPSUY_7.jpeg", + "TPSUY/TPSUY_8.jpeg", + "TPSUY/TPSUY_9.jpeg", + "TPSUY/TPSUY_10.jpeg", + "TPSUY/TPSUY_11.jpeg", + "TPSUY/TPSUY_12.jpeg", + "TPSUY/TPSUY_13.jpeg", + "TPSUY/TPSUY_14.jpeg", + "TPSUY/TPSUY_15.jpeg", + "TPSUY/TPSUY_16.jpeg", + "TPSUY/TPSUY_17.jpeg", + "TPSUY/TPSUY_18.jpeg", + "TPSUY/TPSUY_19.jpeg", + "TPSUY/TPSUY_20.jpeg", + "TPSUY/TPSUY_21.jpeg", + "TPSUY/TPSUY_22.jpeg", + "TPSUY/TPSUY_23.jpeg", + "TPSUY/TPSUY_24.jpeg", + "TPSUY/TPSUY_25.jpeg", + "TPSUY/TPSUY_26.jpeg", + "TPSUY/TPSUY_27.jpeg", + "TPSUY/TPSUY_28.jpeg", + "TPSUY/TPSUY_29.jpeg", + "TPSUY/TPSUY_30.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 152, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Lie on the floor.\nB. Put down the shoe.\nC. Take the paper/notebook.\nD. Take the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "lie on the floor", + "put down the shoe", + "take the paper notebook", + "take the book" + ], + "image_quantity_level": "Medium", + "image": [ + "NVBBD/NVBBD_0.jpeg", + "NVBBD/NVBBD_1.jpeg", + "NVBBD/NVBBD_2.jpeg", + "NVBBD/NVBBD_3.jpeg", + "NVBBD/NVBBD_4.jpeg", + "NVBBD/NVBBD_5.jpeg", + "NVBBD/NVBBD_6.jpeg", + "NVBBD/NVBBD_7.jpeg", + "NVBBD/NVBBD_8.jpeg", + "NVBBD/NVBBD_9.jpeg", + "NVBBD/NVBBD_10.jpeg", + "NVBBD/NVBBD_11.jpeg", + "NVBBD/NVBBD_12.jpeg", + "NVBBD/NVBBD_13.jpeg", + "NVBBD/NVBBD_14.jpeg", + "NVBBD/NVBBD_15.jpeg", + "NVBBD/NVBBD_16.jpeg", + "NVBBD/NVBBD_17.jpeg", + "NVBBD/NVBBD_18.jpeg", + "NVBBD/NVBBD_19.jpeg", + "NVBBD/NVBBD_20.jpeg", + "NVBBD/NVBBD_21.jpeg", + "NVBBD/NVBBD_22.jpeg", + "NVBBD/NVBBD_23.jpeg", + "NVBBD/NVBBD_24.jpeg", + "NVBBD/NVBBD_25.jpeg", + "NVBBD/NVBBD_26.jpeg", + "NVBBD/NVBBD_27.jpeg", + "NVBBD/NVBBD_28.jpeg", + "NVBBD/NVBBD_29.jpeg", + "NVBBD/NVBBD_30.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 153, + "question": "Study the provided images and predict the individual's forthcoming behavior. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the clothes.\nB. Take the food.\nC. Close the book.\nD. Throw the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "close the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "take the clothes", + "take the food", + "close the book", + "throw the box" + ], + "image_quantity_level": "Medium", + "image": [ + "8IPWO/8IPWO_0.jpeg", + "8IPWO/8IPWO_1.jpeg", + "8IPWO/8IPWO_2.jpeg", + "8IPWO/8IPWO_3.jpeg", + "8IPWO/8IPWO_4.jpeg", + "8IPWO/8IPWO_5.jpeg", + "8IPWO/8IPWO_6.jpeg", + "8IPWO/8IPWO_7.jpeg", + "8IPWO/8IPWO_8.jpeg", + "8IPWO/8IPWO_9.jpeg", + "8IPWO/8IPWO_10.jpeg", + "8IPWO/8IPWO_11.jpeg", + "8IPWO/8IPWO_12.jpeg", + "8IPWO/8IPWO_13.jpeg", + "8IPWO/8IPWO_14.jpeg", + "8IPWO/8IPWO_15.jpeg", + "8IPWO/8IPWO_16.jpeg", + "8IPWO/8IPWO_17.jpeg", + "8IPWO/8IPWO_18.jpeg", + "8IPWO/8IPWO_19.jpeg", + "8IPWO/8IPWO_20.jpeg", + "8IPWO/8IPWO_21.jpeg", + "8IPWO/8IPWO_22.jpeg", + "8IPWO/8IPWO_23.jpeg", + "8IPWO/8IPWO_24.jpeg", + "8IPWO/8IPWO_25.jpeg", + "8IPWO/8IPWO_26.jpeg", + "8IPWO/8IPWO_27.jpeg", + "8IPWO/8IPWO_28.jpeg", + "8IPWO/8IPWO_29.jpeg", + "8IPWO/8IPWO_30.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 86, + "question": "Study the provided images and predict the individual's forthcoming behavior. You must choose your answer from the Choice List.\nWhat will the person do next with the table?\nChoice list: \nA. Close.\nB. Sit at.\nC. Tidy up.\nD. Wash.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sit at", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "close", + "sit at", + "tidy up", + "wash" + ], + "image_quantity_level": "Many", + "image": [ + "727IZ/727IZ_0.jpeg", + "727IZ/727IZ_1.jpeg", + "727IZ/727IZ_2.jpeg", + "727IZ/727IZ_3.jpeg", + "727IZ/727IZ_4.jpeg", + "727IZ/727IZ_5.jpeg", + "727IZ/727IZ_6.jpeg", + "727IZ/727IZ_7.jpeg", + "727IZ/727IZ_8.jpeg", + "727IZ/727IZ_9.jpeg", + "727IZ/727IZ_10.jpeg", + "727IZ/727IZ_11.jpeg", + "727IZ/727IZ_12.jpeg", + "727IZ/727IZ_13.jpeg", + "727IZ/727IZ_14.jpeg", + "727IZ/727IZ_15.jpeg", + "727IZ/727IZ_16.jpeg", + "727IZ/727IZ_17.jpeg", + "727IZ/727IZ_18.jpeg", + "727IZ/727IZ_19.jpeg", + "727IZ/727IZ_20.jpeg", + "727IZ/727IZ_21.jpeg", + "727IZ/727IZ_22.jpeg", + "727IZ/727IZ_23.jpeg", + "727IZ/727IZ_24.jpeg", + "727IZ/727IZ_25.jpeg", + "727IZ/727IZ_26.jpeg", + "727IZ/727IZ_27.jpeg", + "727IZ/727IZ_28.jpeg", + "727IZ/727IZ_29.jpeg", + "727IZ/727IZ_30.jpeg", + "727IZ/727IZ_31.jpeg", + "727IZ/727IZ_32.jpeg", + "727IZ/727IZ_33.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 96, + "question": "Analyze the provided visuals and forecast the individual's subsequent move. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the book.\nB. Eat the sandwich.\nC. Lie on the bed.\nD. Take the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "eat the sandwich", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the book", + "eat the sandwich", + "lie on the bed", + "take the box" + ], + "image_quantity_level": "Many", + "image": [ + "S673O/S673O_0.jpeg", + "S673O/S673O_1.jpeg", + "S673O/S673O_2.jpeg", + "S673O/S673O_3.jpeg", + "S673O/S673O_4.jpeg", + "S673O/S673O_5.jpeg", + "S673O/S673O_6.jpeg", + "S673O/S673O_7.jpeg", + "S673O/S673O_8.jpeg", + "S673O/S673O_9.jpeg", + "S673O/S673O_10.jpeg", + "S673O/S673O_11.jpeg", + "S673O/S673O_12.jpeg", + "S673O/S673O_13.jpeg", + "S673O/S673O_14.jpeg", + "S673O/S673O_15.jpeg", + "S673O/S673O_16.jpeg", + "S673O/S673O_17.jpeg", + "S673O/S673O_18.jpeg", + "S673O/S673O_19.jpeg", + "S673O/S673O_20.jpeg", + "S673O/S673O_21.jpeg", + "S673O/S673O_22.jpeg", + "S673O/S673O_23.jpeg", + "S673O/S673O_24.jpeg", + "S673O/S673O_25.jpeg", + "S673O/S673O_26.jpeg", + "S673O/S673O_27.jpeg", + "S673O/S673O_28.jpeg", + "S673O/S673O_29.jpeg", + "S673O/S673O_30.jpeg", + "S673O/S673O_31.jpeg", + "S673O/S673O_32.jpeg", + "S673O/S673O_33.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 146, + "question": "Evaluate the given images and infer the individual's next step. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the laptop.\nB. Put down the paper/notebook.\nC. Put down the picture.\nD. Hold the blanket.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the picture", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down the laptop", + "put down the paper notebook", + "put down the picture", + "hold the blanket" + ], + "image_quantity_level": "Many", + "image": [ + "2BO2G/2BO2G_0.jpeg", + "2BO2G/2BO2G_1.jpeg", + "2BO2G/2BO2G_2.jpeg", + "2BO2G/2BO2G_3.jpeg", + "2BO2G/2BO2G_4.jpeg", + "2BO2G/2BO2G_5.jpeg", + "2BO2G/2BO2G_6.jpeg", + "2BO2G/2BO2G_7.jpeg", + "2BO2G/2BO2G_8.jpeg", + "2BO2G/2BO2G_9.jpeg", + "2BO2G/2BO2G_10.jpeg", + "2BO2G/2BO2G_11.jpeg", + "2BO2G/2BO2G_12.jpeg", + "2BO2G/2BO2G_13.jpeg", + "2BO2G/2BO2G_14.jpeg", + "2BO2G/2BO2G_15.jpeg", + "2BO2G/2BO2G_16.jpeg", + "2BO2G/2BO2G_17.jpeg", + "2BO2G/2BO2G_18.jpeg", + "2BO2G/2BO2G_19.jpeg", + "2BO2G/2BO2G_20.jpeg", + "2BO2G/2BO2G_21.jpeg", + "2BO2G/2BO2G_22.jpeg", + "2BO2G/2BO2G_23.jpeg", + "2BO2G/2BO2G_24.jpeg", + "2BO2G/2BO2G_25.jpeg", + "2BO2G/2BO2G_26.jpeg", + "2BO2G/2BO2G_27.jpeg", + "2BO2G/2BO2G_28.jpeg", + "2BO2G/2BO2G_29.jpeg", + "2BO2G/2BO2G_30.jpeg", + "2BO2G/2BO2G_31.jpeg", + "2BO2G/2BO2G_32.jpeg", + "2BO2G/2BO2G_33.jpeg", + "2BO2G/2BO2G_34.jpeg", + "2BO2G/2BO2G_35.jpeg", + "2BO2G/2BO2G_36.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 78, + "question": "Analyze the provided visuals and forecast the individual's subsequent move. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Close the box.\nB. Take the phone/camera.\nC. Put down the book.\nD. Throw the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "close the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "close the box", + "take the phone camera", + "put down the book", + "throw the clothes" + ], + "image_quantity_level": "Medium", + "image": [ + "TOOYI/TOOYI_0.jpeg", + "TOOYI/TOOYI_1.jpeg", + "TOOYI/TOOYI_2.jpeg", + "TOOYI/TOOYI_3.jpeg", + "TOOYI/TOOYI_4.jpeg", + "TOOYI/TOOYI_5.jpeg", + "TOOYI/TOOYI_6.jpeg", + "TOOYI/TOOYI_7.jpeg", + "TOOYI/TOOYI_8.jpeg", + "TOOYI/TOOYI_9.jpeg", + "TOOYI/TOOYI_10.jpeg", + "TOOYI/TOOYI_11.jpeg", + "TOOYI/TOOYI_12.jpeg", + "TOOYI/TOOYI_13.jpeg", + "TOOYI/TOOYI_14.jpeg", + "TOOYI/TOOYI_15.jpeg", + "TOOYI/TOOYI_16.jpeg", + "TOOYI/TOOYI_17.jpeg", + "TOOYI/TOOYI_18.jpeg", + "TOOYI/TOOYI_19.jpeg", + "TOOYI/TOOYI_20.jpeg", + "TOOYI/TOOYI_21.jpeg", + "TOOYI/TOOYI_22.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 144, + "question": "With the given visuals as your guide, infer the individual's impending action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Lie on the sofa/couch.\nB. Open the door.\nC. Sit on the bed.\nD. Close the laptop.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sit on the bed", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "lie on the sofa couch", + "open the door", + "sit on the bed", + "close the laptop" + ], + "image_quantity_level": "Medium", + "image": [ + "BI31D/BI31D_0.jpeg", + "BI31D/BI31D_1.jpeg", + "BI31D/BI31D_2.jpeg", + "BI31D/BI31D_3.jpeg", + "BI31D/BI31D_4.jpeg", + "BI31D/BI31D_5.jpeg", + "BI31D/BI31D_6.jpeg", + "BI31D/BI31D_7.jpeg", + "BI31D/BI31D_8.jpeg", + "BI31D/BI31D_9.jpeg", + "BI31D/BI31D_10.jpeg", + "BI31D/BI31D_11.jpeg", + "BI31D/BI31D_12.jpeg", + "BI31D/BI31D_13.jpeg", + "BI31D/BI31D_14.jpeg", + "BI31D/BI31D_15.jpeg", + "BI31D/BI31D_16.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 151, + "question": "Based on the given images, prediction the person's next action You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Lie on the sofa/couch.\nB. Put down the bag.\nC. Take the shoe.\nD. Take the towel.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "lie on the sofa couch", + "put down the bag", + "take the shoe", + "take the towel" + ], + "image_quantity_level": "Many", + "image": [ + "P2UBC/P2UBC_0.jpeg", + "P2UBC/P2UBC_1.jpeg", + "P2UBC/P2UBC_2.jpeg", + "P2UBC/P2UBC_3.jpeg", + "P2UBC/P2UBC_4.jpeg", + "P2UBC/P2UBC_5.jpeg", + "P2UBC/P2UBC_6.jpeg", + "P2UBC/P2UBC_7.jpeg", + "P2UBC/P2UBC_8.jpeg", + "P2UBC/P2UBC_9.jpeg", + "P2UBC/P2UBC_10.jpeg", + "P2UBC/P2UBC_11.jpeg", + "P2UBC/P2UBC_12.jpeg", + "P2UBC/P2UBC_13.jpeg", + "P2UBC/P2UBC_14.jpeg", + "P2UBC/P2UBC_15.jpeg", + "P2UBC/P2UBC_16.jpeg", + "P2UBC/P2UBC_17.jpeg", + "P2UBC/P2UBC_18.jpeg", + "P2UBC/P2UBC_19.jpeg", + "P2UBC/P2UBC_20.jpeg", + "P2UBC/P2UBC_21.jpeg", + "P2UBC/P2UBC_22.jpeg", + "P2UBC/P2UBC_23.jpeg", + "P2UBC/P2UBC_24.jpeg", + "P2UBC/P2UBC_25.jpeg", + "P2UBC/P2UBC_26.jpeg", + "P2UBC/P2UBC_27.jpeg", + "P2UBC/P2UBC_28.jpeg", + "P2UBC/P2UBC_29.jpeg", + "P2UBC/P2UBC_30.jpeg", + "P2UBC/P2UBC_31.jpeg", + "P2UBC/P2UBC_32.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 161, + "question": "Based on the given images, prediction the person's next action You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the phone/camera.\nB. Take the clothes.\nC. Sit on the sofa/couch.\nD. Take the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "take the phone camera", + "take the clothes", + "sit on the sofa couch", + "take the box" + ], + "image_quantity_level": "Many", + "image": [ + "9BCZU/9BCZU_0.jpeg", + "9BCZU/9BCZU_1.jpeg", + "9BCZU/9BCZU_2.jpeg", + "9BCZU/9BCZU_3.jpeg", + "9BCZU/9BCZU_4.jpeg", + "9BCZU/9BCZU_5.jpeg", + "9BCZU/9BCZU_6.jpeg", + "9BCZU/9BCZU_7.jpeg", + "9BCZU/9BCZU_8.jpeg", + "9BCZU/9BCZU_9.jpeg", + "9BCZU/9BCZU_10.jpeg", + "9BCZU/9BCZU_11.jpeg", + "9BCZU/9BCZU_12.jpeg", + "9BCZU/9BCZU_13.jpeg", + "9BCZU/9BCZU_14.jpeg", + "9BCZU/9BCZU_15.jpeg", + "9BCZU/9BCZU_16.jpeg", + "9BCZU/9BCZU_17.jpeg", + "9BCZU/9BCZU_18.jpeg", + "9BCZU/9BCZU_19.jpeg", + "9BCZU/9BCZU_20.jpeg", + "9BCZU/9BCZU_21.jpeg", + "9BCZU/9BCZU_22.jpeg", + "9BCZU/9BCZU_23.jpeg", + "9BCZU/9BCZU_24.jpeg", + "9BCZU/9BCZU_25.jpeg", + "9BCZU/9BCZU_26.jpeg", + "9BCZU/9BCZU_27.jpeg", + "9BCZU/9BCZU_28.jpeg", + "9BCZU/9BCZU_29.jpeg", + "9BCZU/9BCZU_30.jpeg", + "9BCZU/9BCZU_31.jpeg", + "9BCZU/9BCZU_32.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 66, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the cup/glass/bottle.\nB. Eat the sandwich.\nC. Lie on the floor.\nD. Open the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "open the closet cabinet", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "put down the cup glass bottle", + "eat the sandwich", + "lie on the floor", + "open the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "2PREF/2PREF_0.jpeg", + "2PREF/2PREF_1.jpeg", + "2PREF/2PREF_2.jpeg", + "2PREF/2PREF_3.jpeg", + "2PREF/2PREF_4.jpeg", + "2PREF/2PREF_5.jpeg", + "2PREF/2PREF_6.jpeg", + "2PREF/2PREF_7.jpeg", + "2PREF/2PREF_8.jpeg", + "2PREF/2PREF_9.jpeg", + "2PREF/2PREF_10.jpeg", + "2PREF/2PREF_11.jpeg", + "2PREF/2PREF_12.jpeg", + "2PREF/2PREF_13.jpeg", + "2PREF/2PREF_14.jpeg", + "2PREF/2PREF_15.jpeg", + "2PREF/2PREF_16.jpeg", + "2PREF/2PREF_17.jpeg", + "2PREF/2PREF_18.jpeg", + "2PREF/2PREF_19.jpeg", + "2PREF/2PREF_20.jpeg", + "2PREF/2PREF_21.jpeg", + "2PREF/2PREF_22.jpeg", + "2PREF/2PREF_23.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 140, + "question": "Analyze the provided visuals and forecast the individual's subsequent move. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Close the door.\nB. Close the closet/cabinet.\nC. Tidy up the table.\nD. Take the dish.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the dish", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "close the door", + "close the closet cabinet", + "tidy up the table", + "take the dish" + ], + "image_quantity_level": "Many", + "image": [ + "BJXRT/BJXRT_0.jpeg", + "BJXRT/BJXRT_1.jpeg", + "BJXRT/BJXRT_2.jpeg", + "BJXRT/BJXRT_3.jpeg", + "BJXRT/BJXRT_4.jpeg", + "BJXRT/BJXRT_5.jpeg", + "BJXRT/BJXRT_6.jpeg", + "BJXRT/BJXRT_7.jpeg", + "BJXRT/BJXRT_8.jpeg", + "BJXRT/BJXRT_9.jpeg", + "BJXRT/BJXRT_10.jpeg", + "BJXRT/BJXRT_11.jpeg", + "BJXRT/BJXRT_12.jpeg", + "BJXRT/BJXRT_13.jpeg", + "BJXRT/BJXRT_14.jpeg", + "BJXRT/BJXRT_15.jpeg", + "BJXRT/BJXRT_16.jpeg", + "BJXRT/BJXRT_17.jpeg", + "BJXRT/BJXRT_18.jpeg", + "BJXRT/BJXRT_19.jpeg", + "BJXRT/BJXRT_20.jpeg", + "BJXRT/BJXRT_21.jpeg", + "BJXRT/BJXRT_22.jpeg", + "BJXRT/BJXRT_23.jpeg", + "BJXRT/BJXRT_24.jpeg", + "BJXRT/BJXRT_25.jpeg", + "BJXRT/BJXRT_26.jpeg", + "BJXRT/BJXRT_27.jpeg", + "BJXRT/BJXRT_28.jpeg", + "BJXRT/BJXRT_29.jpeg", + "BJXRT/BJXRT_30.jpeg", + "BJXRT/BJXRT_31.jpeg", + "BJXRT/BJXRT_32.jpeg", + "BJXRT/BJXRT_33.jpeg", + "BJXRT/BJXRT_34.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 109, + "question": "Evaluate the given images and infer the individual's next step. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the box.\nB. Sit at the table.\nC. Take the sandwich.\nD. Lie on the sofa/couch.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "open the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "open the box", + "sit at the table", + "take the sandwich", + "lie on the sofa couch" + ], + "image_quantity_level": "Medium", + "image": [ + "9PXC0/9PXC0_0.jpeg", + "9PXC0/9PXC0_1.jpeg", + "9PXC0/9PXC0_2.jpeg", + "9PXC0/9PXC0_3.jpeg", + "9PXC0/9PXC0_4.jpeg", + "9PXC0/9PXC0_5.jpeg", + "9PXC0/9PXC0_6.jpeg", + "9PXC0/9PXC0_7.jpeg", + "9PXC0/9PXC0_8.jpeg", + "9PXC0/9PXC0_9.jpeg", + "9PXC0/9PXC0_10.jpeg", + "9PXC0/9PXC0_11.jpeg", + "9PXC0/9PXC0_12.jpeg", + "9PXC0/9PXC0_13.jpeg", + "9PXC0/9PXC0_14.jpeg", + "9PXC0/9PXC0_15.jpeg", + "9PXC0/9PXC0_16.jpeg", + "9PXC0/9PXC0_17.jpeg", + "9PXC0/9PXC0_18.jpeg", + "9PXC0/9PXC0_19.jpeg", + "9PXC0/9PXC0_20.jpeg", + "9PXC0/9PXC0_21.jpeg", + "9PXC0/9PXC0_22.jpeg", + "9PXC0/9PXC0_23.jpeg", + "9PXC0/9PXC0_24.jpeg", + "9PXC0/9PXC0_25.jpeg", + "9PXC0/9PXC0_26.jpeg", + "9PXC0/9PXC0_27.jpeg", + "9PXC0/9PXC0_28.jpeg", + "9PXC0/9PXC0_29.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 110, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Throw the book.\nB. Throw the clothes.\nC. Wash the table.\nD. Open the door.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "throw the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "throw the book", + "throw the clothes", + "wash the table", + "open the door" + ], + "image_quantity_level": "Medium", + "image": [ + "WOD0G/WOD0G_0.jpeg", + "WOD0G/WOD0G_1.jpeg", + "WOD0G/WOD0G_2.jpeg", + "WOD0G/WOD0G_3.jpeg", + "WOD0G/WOD0G_4.jpeg", + "WOD0G/WOD0G_5.jpeg", + "WOD0G/WOD0G_6.jpeg", + "WOD0G/WOD0G_7.jpeg", + "WOD0G/WOD0G_8.jpeg", + "WOD0G/WOD0G_9.jpeg", + "WOD0G/WOD0G_10.jpeg", + "WOD0G/WOD0G_11.jpeg", + "WOD0G/WOD0G_12.jpeg", + "WOD0G/WOD0G_13.jpeg", + "WOD0G/WOD0G_14.jpeg", + "WOD0G/WOD0G_15.jpeg", + "WOD0G/WOD0G_16.jpeg", + "WOD0G/WOD0G_17.jpeg", + "WOD0G/WOD0G_18.jpeg", + "WOD0G/WOD0G_19.jpeg", + "WOD0G/WOD0G_20.jpeg", + "WOD0G/WOD0G_21.jpeg", + "WOD0G/WOD0G_22.jpeg", + "WOD0G/WOD0G_23.jpeg", + "WOD0G/WOD0G_24.jpeg", + "WOD0G/WOD0G_25.jpeg", + "WOD0G/WOD0G_26.jpeg", + "WOD0G/WOD0G_27.jpeg", + "WOD0G/WOD0G_28.jpeg", + "WOD0G/WOD0G_29.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 114, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the bag.\nB. Hold the box.\nC. Throw the blanket.\nD. Sit on the sofa/couch.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "throw the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "take the bag", + "hold the box", + "throw the blanket", + "sit on the sofa couch" + ], + "image_quantity_level": "Medium", + "image": [ + "7ZCXJ/7ZCXJ_0.jpeg", + "7ZCXJ/7ZCXJ_1.jpeg", + "7ZCXJ/7ZCXJ_2.jpeg", + "7ZCXJ/7ZCXJ_3.jpeg", + "7ZCXJ/7ZCXJ_4.jpeg", + "7ZCXJ/7ZCXJ_5.jpeg", + "7ZCXJ/7ZCXJ_6.jpeg", + "7ZCXJ/7ZCXJ_7.jpeg", + "7ZCXJ/7ZCXJ_8.jpeg", + "7ZCXJ/7ZCXJ_9.jpeg", + "7ZCXJ/7ZCXJ_10.jpeg", + "7ZCXJ/7ZCXJ_11.jpeg", + "7ZCXJ/7ZCXJ_12.jpeg", + "7ZCXJ/7ZCXJ_13.jpeg", + "7ZCXJ/7ZCXJ_14.jpeg", + "7ZCXJ/7ZCXJ_15.jpeg", + "7ZCXJ/7ZCXJ_16.jpeg", + "7ZCXJ/7ZCXJ_17.jpeg", + "7ZCXJ/7ZCXJ_18.jpeg", + "7ZCXJ/7ZCXJ_19.jpeg", + "7ZCXJ/7ZCXJ_20.jpeg", + "7ZCXJ/7ZCXJ_21.jpeg", + "7ZCXJ/7ZCXJ_22.jpeg", + "7ZCXJ/7ZCXJ_23.jpeg", + "7ZCXJ/7ZCXJ_24.jpeg", + "7ZCXJ/7ZCXJ_25.jpeg", + "7ZCXJ/7ZCXJ_26.jpeg", + "7ZCXJ/7ZCXJ_27.jpeg", + "7ZCXJ/7ZCXJ_28.jpeg", + "7ZCXJ/7ZCXJ_29.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 119, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the shoe.\nB. Throw the clothes.\nC. Open the book.\nD. Throw the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the shoe", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the shoe", + "throw the clothes", + "open the book", + "throw the bag" + ], + "image_quantity_level": "Medium", + "image": [ + "6ZWSU/6ZWSU_0.jpeg", + "6ZWSU/6ZWSU_1.jpeg", + "6ZWSU/6ZWSU_2.jpeg", + "6ZWSU/6ZWSU_3.jpeg", + "6ZWSU/6ZWSU_4.jpeg", + "6ZWSU/6ZWSU_5.jpeg", + "6ZWSU/6ZWSU_6.jpeg", + "6ZWSU/6ZWSU_7.jpeg", + "6ZWSU/6ZWSU_8.jpeg", + "6ZWSU/6ZWSU_9.jpeg", + "6ZWSU/6ZWSU_10.jpeg", + "6ZWSU/6ZWSU_11.jpeg", + "6ZWSU/6ZWSU_12.jpeg", + "6ZWSU/6ZWSU_13.jpeg", + "6ZWSU/6ZWSU_14.jpeg", + "6ZWSU/6ZWSU_15.jpeg", + "6ZWSU/6ZWSU_16.jpeg", + "6ZWSU/6ZWSU_17.jpeg", + "6ZWSU/6ZWSU_18.jpeg", + "6ZWSU/6ZWSU_19.jpeg", + "6ZWSU/6ZWSU_20.jpeg", + "6ZWSU/6ZWSU_21.jpeg", + "6ZWSU/6ZWSU_22.jpeg", + "6ZWSU/6ZWSU_23.jpeg", + "6ZWSU/6ZWSU_24.jpeg", + "6ZWSU/6ZWSU_25.jpeg", + "6ZWSU/6ZWSU_26.jpeg", + "6ZWSU/6ZWSU_27.jpeg", + "6ZWSU/6ZWSU_28.jpeg", + "6ZWSU/6ZWSU_29.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 123, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Wash the dish.\nB. Open the closet/cabinet.\nC. Take the food.\nD. Throw the towel.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "throw the towel", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "wash the dish", + "open the closet cabinet", + "take the food", + "throw the towel" + ], + "image_quantity_level": "Medium", + "image": [ + "3CLVI/3CLVI_0.jpeg", + "3CLVI/3CLVI_1.jpeg", + "3CLVI/3CLVI_2.jpeg", + "3CLVI/3CLVI_3.jpeg", + "3CLVI/3CLVI_4.jpeg", + "3CLVI/3CLVI_5.jpeg", + "3CLVI/3CLVI_6.jpeg", + "3CLVI/3CLVI_7.jpeg", + "3CLVI/3CLVI_8.jpeg", + "3CLVI/3CLVI_9.jpeg", + "3CLVI/3CLVI_10.jpeg", + "3CLVI/3CLVI_11.jpeg", + "3CLVI/3CLVI_12.jpeg", + "3CLVI/3CLVI_13.jpeg", + "3CLVI/3CLVI_14.jpeg", + "3CLVI/3CLVI_15.jpeg", + "3CLVI/3CLVI_16.jpeg", + "3CLVI/3CLVI_17.jpeg", + "3CLVI/3CLVI_18.jpeg", + "3CLVI/3CLVI_19.jpeg", + "3CLVI/3CLVI_20.jpeg", + "3CLVI/3CLVI_21.jpeg", + "3CLVI/3CLVI_22.jpeg", + "3CLVI/3CLVI_23.jpeg", + "3CLVI/3CLVI_24.jpeg", + "3CLVI/3CLVI_25.jpeg", + "3CLVI/3CLVI_26.jpeg", + "3CLVI/3CLVI_27.jpeg", + "3CLVI/3CLVI_28.jpeg", + "3CLVI/3CLVI_29.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 124, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next with the pillow?\nChoice list: \nA. Tidy up.\nB. Throw.\nC. Put down.\nD. Take.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "tidy up", + "throw", + "put down", + "take" + ], + "image_quantity_level": "Medium", + "image": [ + "92UB5/92UB5_0.jpeg", + "92UB5/92UB5_1.jpeg", + "92UB5/92UB5_2.jpeg", + "92UB5/92UB5_3.jpeg", + "92UB5/92UB5_4.jpeg", + "92UB5/92UB5_5.jpeg", + "92UB5/92UB5_6.jpeg", + "92UB5/92UB5_7.jpeg", + "92UB5/92UB5_8.jpeg", + "92UB5/92UB5_9.jpeg", + "92UB5/92UB5_10.jpeg", + "92UB5/92UB5_11.jpeg", + "92UB5/92UB5_12.jpeg", + "92UB5/92UB5_13.jpeg", + "92UB5/92UB5_14.jpeg", + "92UB5/92UB5_15.jpeg", + "92UB5/92UB5_16.jpeg", + "92UB5/92UB5_17.jpeg", + "92UB5/92UB5_18.jpeg", + "92UB5/92UB5_19.jpeg", + "92UB5/92UB5_20.jpeg", + "92UB5/92UB5_21.jpeg", + "92UB5/92UB5_22.jpeg", + "92UB5/92UB5_23.jpeg", + "92UB5/92UB5_24.jpeg", + "92UB5/92UB5_25.jpeg", + "92UB5/92UB5_26.jpeg", + "92UB5/92UB5_27.jpeg", + "92UB5/92UB5_28.jpeg", + "92UB5/92UB5_29.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 125, + "question": "Analyze the provided visuals and forecast the individual's subsequent move. You must choose your answer from the Choice List.\nWhat will the person do next with the blanket?\nChoice list: \nA. Put down.\nB. Throw.\nC. Lie on.\nD. Tidy up.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "tidy up", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down", + "throw", + "lie on", + "tidy up" + ], + "image_quantity_level": "Medium", + "image": [ + "84893/84893_0.jpeg", + "84893/84893_1.jpeg", + "84893/84893_2.jpeg", + "84893/84893_3.jpeg", + "84893/84893_4.jpeg", + "84893/84893_5.jpeg", + "84893/84893_6.jpeg", + "84893/84893_7.jpeg", + "84893/84893_8.jpeg", + "84893/84893_9.jpeg", + "84893/84893_10.jpeg", + "84893/84893_11.jpeg", + "84893/84893_12.jpeg", + "84893/84893_13.jpeg", + "84893/84893_14.jpeg", + "84893/84893_15.jpeg", + "84893/84893_16.jpeg", + "84893/84893_17.jpeg", + "84893/84893_18.jpeg", + "84893/84893_19.jpeg", + "84893/84893_20.jpeg", + "84893/84893_21.jpeg", + "84893/84893_22.jpeg", + "84893/84893_23.jpeg", + "84893/84893_24.jpeg", + "84893/84893_25.jpeg", + "84893/84893_26.jpeg", + "84893/84893_27.jpeg", + "84893/84893_28.jpeg", + "84893/84893_29.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 136, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the door.\nB. Sit at the table.\nC. Lie on the sofa/couch.\nD. Throw the food.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sit at the table", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "open the door", + "sit at the table", + "lie on the sofa couch", + "throw the food" + ], + "image_quantity_level": "Medium", + "image": [ + "XLR4B/XLR4B_0.jpeg", + "XLR4B/XLR4B_1.jpeg", + "XLR4B/XLR4B_2.jpeg", + "XLR4B/XLR4B_3.jpeg", + "XLR4B/XLR4B_4.jpeg", + "XLR4B/XLR4B_5.jpeg", + "XLR4B/XLR4B_6.jpeg", + "XLR4B/XLR4B_7.jpeg", + "XLR4B/XLR4B_8.jpeg", + "XLR4B/XLR4B_9.jpeg", + "XLR4B/XLR4B_10.jpeg", + "XLR4B/XLR4B_11.jpeg", + "XLR4B/XLR4B_12.jpeg", + "XLR4B/XLR4B_13.jpeg", + "XLR4B/XLR4B_14.jpeg", + "XLR4B/XLR4B_15.jpeg", + "XLR4B/XLR4B_16.jpeg", + "XLR4B/XLR4B_17.jpeg", + "XLR4B/XLR4B_18.jpeg", + "XLR4B/XLR4B_19.jpeg", + "XLR4B/XLR4B_20.jpeg", + "XLR4B/XLR4B_21.jpeg", + "XLR4B/XLR4B_22.jpeg", + "XLR4B/XLR4B_23.jpeg", + "XLR4B/XLR4B_24.jpeg", + "XLR4B/XLR4B_25.jpeg", + "XLR4B/XLR4B_26.jpeg", + "XLR4B/XLR4B_27.jpeg", + "XLR4B/XLR4B_28.jpeg", + "XLR4B/XLR4B_29.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 142, + "question": "Analyze the provided visuals and forecast the individual's subsequent move. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the window.\nB. Open the book.\nC. Wash the mirror.\nD. Close the door.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "wash the mirror", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "open the window", + "open the book", + "wash the mirror", + "close the door" + ], + "image_quantity_level": "Medium", + "image": [ + "TZL6H/TZL6H_0.jpeg", + "TZL6H/TZL6H_1.jpeg", + "TZL6H/TZL6H_2.jpeg", + "TZL6H/TZL6H_3.jpeg", + "TZL6H/TZL6H_4.jpeg", + "TZL6H/TZL6H_5.jpeg", + "TZL6H/TZL6H_6.jpeg", + "TZL6H/TZL6H_7.jpeg", + "TZL6H/TZL6H_8.jpeg", + "TZL6H/TZL6H_9.jpeg", + "TZL6H/TZL6H_10.jpeg", + "TZL6H/TZL6H_11.jpeg", + "TZL6H/TZL6H_12.jpeg", + "TZL6H/TZL6H_13.jpeg", + "TZL6H/TZL6H_14.jpeg", + "TZL6H/TZL6H_15.jpeg", + "TZL6H/TZL6H_16.jpeg", + "TZL6H/TZL6H_17.jpeg", + "TZL6H/TZL6H_18.jpeg", + "TZL6H/TZL6H_19.jpeg", + "TZL6H/TZL6H_20.jpeg", + "TZL6H/TZL6H_21.jpeg", + "TZL6H/TZL6H_22.jpeg", + "TZL6H/TZL6H_23.jpeg", + "TZL6H/TZL6H_24.jpeg", + "TZL6H/TZL6H_25.jpeg", + "TZL6H/TZL6H_26.jpeg", + "TZL6H/TZL6H_27.jpeg", + "TZL6H/TZL6H_28.jpeg", + "TZL6H/TZL6H_29.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 143, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the box.\nB. Take the book.\nC. Tidy up the broom.\nD. Put down the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "open the box", + "take the book", + "tidy up the broom", + "put down the bag" + ], + "image_quantity_level": "Medium", + "image": [ + "CJ58B/CJ58B_0.jpeg", + "CJ58B/CJ58B_1.jpeg", + "CJ58B/CJ58B_2.jpeg", + "CJ58B/CJ58B_3.jpeg", + "CJ58B/CJ58B_4.jpeg", + "CJ58B/CJ58B_5.jpeg", + "CJ58B/CJ58B_6.jpeg", + "CJ58B/CJ58B_7.jpeg", + "CJ58B/CJ58B_8.jpeg", + "CJ58B/CJ58B_9.jpeg", + "CJ58B/CJ58B_10.jpeg", + "CJ58B/CJ58B_11.jpeg", + "CJ58B/CJ58B_12.jpeg", + "CJ58B/CJ58B_13.jpeg", + "CJ58B/CJ58B_14.jpeg", + "CJ58B/CJ58B_15.jpeg", + "CJ58B/CJ58B_16.jpeg", + "CJ58B/CJ58B_17.jpeg", + "CJ58B/CJ58B_18.jpeg", + "CJ58B/CJ58B_19.jpeg", + "CJ58B/CJ58B_20.jpeg", + "CJ58B/CJ58B_21.jpeg", + "CJ58B/CJ58B_22.jpeg", + "CJ58B/CJ58B_23.jpeg", + "CJ58B/CJ58B_24.jpeg", + "CJ58B/CJ58B_25.jpeg", + "CJ58B/CJ58B_26.jpeg", + "CJ58B/CJ58B_27.jpeg", + "CJ58B/CJ58B_28.jpeg", + "CJ58B/CJ58B_29.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 154, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Throw the box.\nB. Take the bag.\nC. Lie on the bed.\nD. Sit on the floor.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "lie on the bed", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "throw the box", + "take the bag", + "lie on the bed", + "sit on the floor" + ], + "image_quantity_level": "Medium", + "image": [ + "UEP20/UEP20_0.jpeg", + "UEP20/UEP20_1.jpeg", + "UEP20/UEP20_2.jpeg", + "UEP20/UEP20_3.jpeg", + "UEP20/UEP20_4.jpeg", + "UEP20/UEP20_5.jpeg", + "UEP20/UEP20_6.jpeg", + "UEP20/UEP20_7.jpeg", + "UEP20/UEP20_8.jpeg", + "UEP20/UEP20_9.jpeg", + "UEP20/UEP20_10.jpeg", + "UEP20/UEP20_11.jpeg", + "UEP20/UEP20_12.jpeg", + "UEP20/UEP20_13.jpeg", + "UEP20/UEP20_14.jpeg", + "UEP20/UEP20_15.jpeg", + "UEP20/UEP20_16.jpeg", + "UEP20/UEP20_17.jpeg", + "UEP20/UEP20_18.jpeg", + "UEP20/UEP20_19.jpeg", + "UEP20/UEP20_20.jpeg", + "UEP20/UEP20_21.jpeg", + "UEP20/UEP20_22.jpeg", + "UEP20/UEP20_23.jpeg", + "UEP20/UEP20_24.jpeg", + "UEP20/UEP20_25.jpeg", + "UEP20/UEP20_26.jpeg", + "UEP20/UEP20_27.jpeg", + "UEP20/UEP20_28.jpeg", + "UEP20/UEP20_29.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 182, + "question": "Based on the given images, prediction the person's next action You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the paper/notebook.\nB. Hold the phone/camera.\nC. Throw the blanket.\nD. Close the window.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "close the window", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "put down the paper notebook", + "hold the phone camera", + "throw the blanket", + "close the window" + ], + "image_quantity_level": "Medium", + "image": [ + "EDVAM/EDVAM_0.jpeg", + "EDVAM/EDVAM_1.jpeg", + "EDVAM/EDVAM_2.jpeg", + "EDVAM/EDVAM_3.jpeg", + "EDVAM/EDVAM_4.jpeg", + "EDVAM/EDVAM_5.jpeg", + "EDVAM/EDVAM_6.jpeg", + "EDVAM/EDVAM_7.jpeg", + "EDVAM/EDVAM_8.jpeg", + "EDVAM/EDVAM_9.jpeg", + "EDVAM/EDVAM_10.jpeg", + "EDVAM/EDVAM_11.jpeg", + "EDVAM/EDVAM_12.jpeg", + "EDVAM/EDVAM_13.jpeg", + "EDVAM/EDVAM_14.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 139, + "question": "Study the provided images and predict the individual's forthcoming behavior. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the bag.\nB. Throw the bag.\nC. Open the door.\nD. Put down the pillow.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the pillow", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "put down the bag", + "throw the bag", + "open the door", + "put down the pillow" + ], + "image_quantity_level": "Medium", + "image": [ + "2ZXJ5/2ZXJ5_0.jpeg", + "2ZXJ5/2ZXJ5_1.jpeg", + "2ZXJ5/2ZXJ5_2.jpeg", + "2ZXJ5/2ZXJ5_3.jpeg", + "2ZXJ5/2ZXJ5_4.jpeg", + "2ZXJ5/2ZXJ5_5.jpeg", + "2ZXJ5/2ZXJ5_6.jpeg", + "2ZXJ5/2ZXJ5_7.jpeg", + "2ZXJ5/2ZXJ5_8.jpeg", + "2ZXJ5/2ZXJ5_9.jpeg", + "2ZXJ5/2ZXJ5_10.jpeg", + "2ZXJ5/2ZXJ5_11.jpeg", + "2ZXJ5/2ZXJ5_12.jpeg", + "2ZXJ5/2ZXJ5_13.jpeg", + "2ZXJ5/2ZXJ5_14.jpeg", + "2ZXJ5/2ZXJ5_15.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 179, + "question": "Using the given pictures as a reference, anticipate the person's upcoming action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the bag.\nB. Take the shoe.\nC. Put down the phone/camera.\nD. Throw the towel.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the phone camera", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "open the bag", + "take the shoe", + "put down the phone camera", + "throw the towel" + ], + "image_quantity_level": "Medium", + "image": [ + "C1DK7/C1DK7_0.jpeg", + "C1DK7/C1DK7_1.jpeg", + "C1DK7/C1DK7_2.jpeg", + "C1DK7/C1DK7_3.jpeg", + "C1DK7/C1DK7_4.jpeg", + "C1DK7/C1DK7_5.jpeg", + "C1DK7/C1DK7_6.jpeg", + "C1DK7/C1DK7_7.jpeg", + "C1DK7/C1DK7_8.jpeg", + "C1DK7/C1DK7_9.jpeg", + "C1DK7/C1DK7_10.jpeg", + "C1DK7/C1DK7_11.jpeg", + "C1DK7/C1DK7_12.jpeg", + "C1DK7/C1DK7_13.jpeg", + "C1DK7/C1DK7_14.jpeg", + "C1DK7/C1DK7_15.jpeg", + "C1DK7/C1DK7_16.jpeg", + "C1DK7/C1DK7_17.jpeg", + "C1DK7/C1DK7_18.jpeg", + "C1DK7/C1DK7_19.jpeg", + "C1DK7/C1DK7_20.jpeg", + "C1DK7/C1DK7_21.jpeg", + "C1DK7/C1DK7_22.jpeg", + "C1DK7/C1DK7_23.jpeg", + "C1DK7/C1DK7_24.jpeg", + "C1DK7/C1DK7_25.jpeg", + "C1DK7/C1DK7_26.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 186, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the box.\nB. Sit on the table.\nC. Close the refrigerator.\nD. Wash the cup/glass/bottle.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "open the box", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "open the box", + "sit on the table", + "close the refrigerator", + "wash the cup glass bottle" + ], + "image_quantity_level": "Medium", + "image": [ + "JCNHL/JCNHL_0.jpeg", + "JCNHL/JCNHL_1.jpeg", + "JCNHL/JCNHL_2.jpeg", + "JCNHL/JCNHL_3.jpeg", + "JCNHL/JCNHL_4.jpeg", + "JCNHL/JCNHL_5.jpeg", + "JCNHL/JCNHL_6.jpeg", + "JCNHL/JCNHL_7.jpeg", + "JCNHL/JCNHL_8.jpeg", + "JCNHL/JCNHL_9.jpeg", + "JCNHL/JCNHL_10.jpeg", + "JCNHL/JCNHL_11.jpeg", + "JCNHL/JCNHL_12.jpeg", + "JCNHL/JCNHL_13.jpeg", + "JCNHL/JCNHL_14.jpeg", + "JCNHL/JCNHL_15.jpeg", + "JCNHL/JCNHL_16.jpeg", + "JCNHL/JCNHL_17.jpeg", + "JCNHL/JCNHL_18.jpeg", + "JCNHL/JCNHL_19.jpeg", + "JCNHL/JCNHL_20.jpeg", + "JCNHL/JCNHL_21.jpeg", + "JCNHL/JCNHL_22.jpeg", + "JCNHL/JCNHL_23.jpeg", + "JCNHL/JCNHL_24.jpeg", + "JCNHL/JCNHL_25.jpeg", + "JCNHL/JCNHL_26.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 160, + "question": "Study the provided images and predict the individual's forthcoming behavior. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the food.\nB. Throw the blanket.\nC. Open the door.\nD. Tidy up the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the food", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the food", + "throw the blanket", + "open the door", + "tidy up the closet cabinet" + ], + "image_quantity_level": "Many", + "image": [ + "35ZZP/35ZZP_0.jpeg", + "35ZZP/35ZZP_1.jpeg", + "35ZZP/35ZZP_2.jpeg", + "35ZZP/35ZZP_3.jpeg", + "35ZZP/35ZZP_4.jpeg", + "35ZZP/35ZZP_5.jpeg", + "35ZZP/35ZZP_6.jpeg", + "35ZZP/35ZZP_7.jpeg", + "35ZZP/35ZZP_8.jpeg", + "35ZZP/35ZZP_9.jpeg", + "35ZZP/35ZZP_10.jpeg", + "35ZZP/35ZZP_11.jpeg", + "35ZZP/35ZZP_12.jpeg", + "35ZZP/35ZZP_13.jpeg", + "35ZZP/35ZZP_14.jpeg", + "35ZZP/35ZZP_15.jpeg", + "35ZZP/35ZZP_16.jpeg", + "35ZZP/35ZZP_17.jpeg", + "35ZZP/35ZZP_18.jpeg", + "35ZZP/35ZZP_19.jpeg", + "35ZZP/35ZZP_20.jpeg", + "35ZZP/35ZZP_21.jpeg", + "35ZZP/35ZZP_22.jpeg", + "35ZZP/35ZZP_23.jpeg", + "35ZZP/35ZZP_24.jpeg", + "35ZZP/35ZZP_25.jpeg", + "35ZZP/35ZZP_26.jpeg", + "35ZZP/35ZZP_27.jpeg", + "35ZZP/35ZZP_28.jpeg", + "35ZZP/35ZZP_29.jpeg", + "35ZZP/35ZZP_30.jpeg", + "35ZZP/35ZZP_31.jpeg", + "35ZZP/35ZZP_32.jpeg", + "35ZZP/35ZZP_33.jpeg", + "35ZZP/35ZZP_34.jpeg", + "35ZZP/35ZZP_35.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 92, + "question": "Based on the given images, prediction the person's next action You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the phone/camera.\nB. Put down the bag.\nC. Throw the clothes.\nD. Open the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "throw the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "take the phone camera", + "put down the bag", + "throw the clothes", + "open the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "Y79PC/Y79PC_0.jpeg", + "Y79PC/Y79PC_1.jpeg", + "Y79PC/Y79PC_2.jpeg", + "Y79PC/Y79PC_3.jpeg", + "Y79PC/Y79PC_4.jpeg", + "Y79PC/Y79PC_5.jpeg", + "Y79PC/Y79PC_6.jpeg", + "Y79PC/Y79PC_7.jpeg", + "Y79PC/Y79PC_8.jpeg", + "Y79PC/Y79PC_9.jpeg", + "Y79PC/Y79PC_10.jpeg", + "Y79PC/Y79PC_11.jpeg", + "Y79PC/Y79PC_12.jpeg", + "Y79PC/Y79PC_13.jpeg", + "Y79PC/Y79PC_14.jpeg", + "Y79PC/Y79PC_15.jpeg", + "Y79PC/Y79PC_16.jpeg", + "Y79PC/Y79PC_17.jpeg", + "Y79PC/Y79PC_18.jpeg", + "Y79PC/Y79PC_19.jpeg", + "Y79PC/Y79PC_20.jpeg", + "Y79PC/Y79PC_21.jpeg", + "Y79PC/Y79PC_22.jpeg", + "Y79PC/Y79PC_23.jpeg", + "Y79PC/Y79PC_24.jpeg", + "Y79PC/Y79PC_25.jpeg", + "Y79PC/Y79PC_26.jpeg", + "Y79PC/Y79PC_27.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 199, + "question": "Evaluate the given images and infer the individual's next step. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Tidy up the clothes.\nB. Take the blanket.\nC. Open the door.\nD. Sit on the bed.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "tidy up the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "tidy up the clothes", + "take the blanket", + "open the door", + "sit on the bed" + ], + "image_quantity_level": "Medium", + "image": [ + "V3RAX/V3RAX_0.jpeg", + "V3RAX/V3RAX_1.jpeg", + "V3RAX/V3RAX_2.jpeg", + "V3RAX/V3RAX_3.jpeg", + "V3RAX/V3RAX_4.jpeg", + "V3RAX/V3RAX_5.jpeg", + "V3RAX/V3RAX_6.jpeg", + "V3RAX/V3RAX_7.jpeg", + "V3RAX/V3RAX_8.jpeg", + "V3RAX/V3RAX_9.jpeg", + "V3RAX/V3RAX_10.jpeg", + "V3RAX/V3RAX_11.jpeg", + "V3RAX/V3RAX_12.jpeg", + "V3RAX/V3RAX_13.jpeg", + "V3RAX/V3RAX_14.jpeg", + "V3RAX/V3RAX_15.jpeg", + "V3RAX/V3RAX_16.jpeg", + "V3RAX/V3RAX_17.jpeg", + "V3RAX/V3RAX_18.jpeg", + "V3RAX/V3RAX_19.jpeg", + "V3RAX/V3RAX_20.jpeg", + "V3RAX/V3RAX_21.jpeg", + "V3RAX/V3RAX_22.jpeg", + "V3RAX/V3RAX_23.jpeg", + "V3RAX/V3RAX_24.jpeg", + "V3RAX/V3RAX_25.jpeg", + "V3RAX/V3RAX_26.jpeg", + "V3RAX/V3RAX_27.jpeg" + ], + "extracted": "C", + "result": 0 + }, + { + "sample_id": 157, + "question": "With the images at your disposal, foresee the person's impending maneuver. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Wash the clothes.\nB. Put down the cup/glass/bottle.\nC. Take the food.\nD. Take the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "wash the clothes", + "put down the cup glass bottle", + "take the food", + "take the book" + ], + "image_quantity_level": "Medium", + "image": [ + "OO4FE/OO4FE_0.jpeg", + "OO4FE/OO4FE_1.jpeg", + "OO4FE/OO4FE_2.jpeg", + "OO4FE/OO4FE_3.jpeg", + "OO4FE/OO4FE_4.jpeg", + "OO4FE/OO4FE_5.jpeg", + "OO4FE/OO4FE_6.jpeg", + "OO4FE/OO4FE_7.jpeg", + "OO4FE/OO4FE_8.jpeg", + "OO4FE/OO4FE_9.jpeg", + "OO4FE/OO4FE_10.jpeg", + "OO4FE/OO4FE_11.jpeg", + "OO4FE/OO4FE_12.jpeg", + "OO4FE/OO4FE_13.jpeg", + "OO4FE/OO4FE_14.jpeg", + "OO4FE/OO4FE_15.jpeg", + "OO4FE/OO4FE_16.jpeg", + "OO4FE/OO4FE_17.jpeg", + "OO4FE/OO4FE_18.jpeg", + "OO4FE/OO4FE_19.jpeg", + "OO4FE/OO4FE_20.jpeg", + "OO4FE/OO4FE_21.jpeg", + "OO4FE/OO4FE_22.jpeg", + "OO4FE/OO4FE_23.jpeg", + "OO4FE/OO4FE_24.jpeg", + "OO4FE/OO4FE_25.jpeg", + "OO4FE/OO4FE_26.jpeg", + "OO4FE/OO4FE_27.jpeg", + "OO4FE/OO4FE_28.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 183, + "question": "Evaluate the given images and infer the individual's next step. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Wash the window.\nB. Take the laptop.\nC. Close the box.\nD. Eat the medicine.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "wash the window", + "take the laptop", + "close the box", + "eat the medicine" + ], + "image_quantity_level": "Medium", + "image": [ + "XZ9C0/XZ9C0_0.jpeg", + "XZ9C0/XZ9C0_1.jpeg", + "XZ9C0/XZ9C0_2.jpeg", + "XZ9C0/XZ9C0_3.jpeg", + "XZ9C0/XZ9C0_4.jpeg", + "XZ9C0/XZ9C0_5.jpeg", + "XZ9C0/XZ9C0_6.jpeg", + "XZ9C0/XZ9C0_7.jpeg", + "XZ9C0/XZ9C0_8.jpeg", + "XZ9C0/XZ9C0_9.jpeg", + "XZ9C0/XZ9C0_10.jpeg", + "XZ9C0/XZ9C0_11.jpeg", + "XZ9C0/XZ9C0_12.jpeg", + "XZ9C0/XZ9C0_13.jpeg", + "XZ9C0/XZ9C0_14.jpeg", + "XZ9C0/XZ9C0_15.jpeg", + "XZ9C0/XZ9C0_16.jpeg", + "XZ9C0/XZ9C0_17.jpeg", + "XZ9C0/XZ9C0_18.jpeg", + "XZ9C0/XZ9C0_19.jpeg", + "XZ9C0/XZ9C0_20.jpeg", + "XZ9C0/XZ9C0_21.jpeg", + "XZ9C0/XZ9C0_22.jpeg", + "XZ9C0/XZ9C0_23.jpeg", + "XZ9C0/XZ9C0_24.jpeg", + "XZ9C0/XZ9C0_25.jpeg", + "XZ9C0/XZ9C0_26.jpeg", + "XZ9C0/XZ9C0_27.jpeg", + "XZ9C0/XZ9C0_28.jpeg" + ], + "extracted": "D", + "result": 0 + }, + { + "sample_id": 191, + "question": "Study the provided images and predict the individual's forthcoming behavior. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Wash the cup/glass/bottle.\nB. Put down the sandwich.\nC. Put down the shoe.\nD. Throw the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "throw the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "wash the cup glass bottle", + "put down the sandwich", + "put down the shoe", + "throw the book" + ], + "image_quantity_level": "Medium", + "image": [ + "VS7VS/VS7VS_0.jpeg", + "VS7VS/VS7VS_1.jpeg", + "VS7VS/VS7VS_2.jpeg", + "VS7VS/VS7VS_3.jpeg", + "VS7VS/VS7VS_4.jpeg", + "VS7VS/VS7VS_5.jpeg", + "VS7VS/VS7VS_6.jpeg", + "VS7VS/VS7VS_7.jpeg", + "VS7VS/VS7VS_8.jpeg", + "VS7VS/VS7VS_9.jpeg", + "VS7VS/VS7VS_10.jpeg", + "VS7VS/VS7VS_11.jpeg", + "VS7VS/VS7VS_12.jpeg", + "VS7VS/VS7VS_13.jpeg", + "VS7VS/VS7VS_14.jpeg", + "VS7VS/VS7VS_15.jpeg", + "VS7VS/VS7VS_16.jpeg", + "VS7VS/VS7VS_17.jpeg", + "VS7VS/VS7VS_18.jpeg", + "VS7VS/VS7VS_19.jpeg", + "VS7VS/VS7VS_20.jpeg", + "VS7VS/VS7VS_21.jpeg", + "VS7VS/VS7VS_22.jpeg", + "VS7VS/VS7VS_23.jpeg", + "VS7VS/VS7VS_24.jpeg", + "VS7VS/VS7VS_25.jpeg", + "VS7VS/VS7VS_26.jpeg", + "VS7VS/VS7VS_27.jpeg", + "VS7VS/VS7VS_28.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 147, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the box.\nB. Open the bag.\nC. Close the door.\nD. Throw the towel.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "open the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "take the box", + "open the bag", + "close the door", + "throw the towel" + ], + "image_quantity_level": "Many", + "image": [ + "DH66I/DH66I_0.jpeg", + "DH66I/DH66I_1.jpeg", + "DH66I/DH66I_2.jpeg", + "DH66I/DH66I_3.jpeg", + "DH66I/DH66I_4.jpeg", + "DH66I/DH66I_5.jpeg", + "DH66I/DH66I_6.jpeg", + "DH66I/DH66I_7.jpeg", + "DH66I/DH66I_8.jpeg", + "DH66I/DH66I_9.jpeg", + "DH66I/DH66I_10.jpeg", + "DH66I/DH66I_11.jpeg", + "DH66I/DH66I_12.jpeg", + "DH66I/DH66I_13.jpeg", + "DH66I/DH66I_14.jpeg", + "DH66I/DH66I_15.jpeg", + "DH66I/DH66I_16.jpeg", + "DH66I/DH66I_17.jpeg", + "DH66I/DH66I_18.jpeg", + "DH66I/DH66I_19.jpeg", + "DH66I/DH66I_20.jpeg", + "DH66I/DH66I_21.jpeg", + "DH66I/DH66I_22.jpeg", + "DH66I/DH66I_23.jpeg", + "DH66I/DH66I_24.jpeg", + "DH66I/DH66I_25.jpeg", + "DH66I/DH66I_26.jpeg", + "DH66I/DH66I_27.jpeg", + "DH66I/DH66I_28.jpeg", + "DH66I/DH66I_29.jpeg", + "DH66I/DH66I_30.jpeg", + "DH66I/DH66I_31.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 167, + "question": "Using the given pictures as a reference, anticipate the person's upcoming action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the phone/camera.\nB. Close the door.\nC. Close the refrigerator.\nD. Throw the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the phone camera", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "take the phone camera", + "close the door", + "close the refrigerator", + "throw the bag" + ], + "image_quantity_level": "Many", + "image": [ + "P4DL9/P4DL9_0.jpeg", + "P4DL9/P4DL9_1.jpeg", + "P4DL9/P4DL9_2.jpeg", + "P4DL9/P4DL9_3.jpeg", + "P4DL9/P4DL9_4.jpeg", + "P4DL9/P4DL9_5.jpeg", + "P4DL9/P4DL9_6.jpeg", + "P4DL9/P4DL9_7.jpeg", + "P4DL9/P4DL9_8.jpeg", + "P4DL9/P4DL9_9.jpeg", + "P4DL9/P4DL9_10.jpeg", + "P4DL9/P4DL9_11.jpeg", + "P4DL9/P4DL9_12.jpeg", + "P4DL9/P4DL9_13.jpeg", + "P4DL9/P4DL9_14.jpeg", + "P4DL9/P4DL9_15.jpeg", + "P4DL9/P4DL9_16.jpeg", + "P4DL9/P4DL9_17.jpeg", + "P4DL9/P4DL9_18.jpeg", + "P4DL9/P4DL9_19.jpeg", + "P4DL9/P4DL9_20.jpeg", + "P4DL9/P4DL9_21.jpeg", + "P4DL9/P4DL9_22.jpeg", + "P4DL9/P4DL9_23.jpeg", + "P4DL9/P4DL9_24.jpeg", + "P4DL9/P4DL9_25.jpeg", + "P4DL9/P4DL9_26.jpeg", + "P4DL9/P4DL9_27.jpeg", + "P4DL9/P4DL9_28.jpeg", + "P4DL9/P4DL9_29.jpeg", + "P4DL9/P4DL9_30.jpeg", + "P4DL9/P4DL9_31.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 187, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the towel.\nB. Open the door.\nC. Put down the dish.\nD. Throw the broom.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the towel", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the towel", + "open the door", + "put down the dish", + "throw the broom" + ], + "image_quantity_level": "Many", + "image": [ + "BQKW5/BQKW5_0.jpeg", + "BQKW5/BQKW5_1.jpeg", + "BQKW5/BQKW5_2.jpeg", + "BQKW5/BQKW5_3.jpeg", + "BQKW5/BQKW5_4.jpeg", + "BQKW5/BQKW5_5.jpeg", + "BQKW5/BQKW5_6.jpeg", + "BQKW5/BQKW5_7.jpeg", + "BQKW5/BQKW5_8.jpeg", + "BQKW5/BQKW5_9.jpeg", + "BQKW5/BQKW5_10.jpeg", + "BQKW5/BQKW5_11.jpeg", + "BQKW5/BQKW5_12.jpeg", + "BQKW5/BQKW5_13.jpeg", + "BQKW5/BQKW5_14.jpeg", + "BQKW5/BQKW5_15.jpeg", + "BQKW5/BQKW5_16.jpeg", + "BQKW5/BQKW5_17.jpeg", + "BQKW5/BQKW5_18.jpeg", + "BQKW5/BQKW5_19.jpeg", + "BQKW5/BQKW5_20.jpeg", + "BQKW5/BQKW5_21.jpeg", + "BQKW5/BQKW5_22.jpeg", + "BQKW5/BQKW5_23.jpeg", + "BQKW5/BQKW5_24.jpeg", + "BQKW5/BQKW5_25.jpeg", + "BQKW5/BQKW5_26.jpeg", + "BQKW5/BQKW5_27.jpeg", + "BQKW5/BQKW5_28.jpeg", + "BQKW5/BQKW5_29.jpeg", + "BQKW5/BQKW5_30.jpeg", + "BQKW5/BQKW5_31.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 168, + "question": "Evaluate the given images and infer the individual's next step. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the door.\nB. Close the door.\nC. Sit on the table.\nD. Put down the sandwich.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the sandwich", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "open the door", + "close the door", + "sit on the table", + "put down the sandwich" + ], + "image_quantity_level": "Medium", + "image": [ + "EBTD6/EBTD6_0.jpeg", + "EBTD6/EBTD6_1.jpeg", + "EBTD6/EBTD6_2.jpeg", + "EBTD6/EBTD6_3.jpeg", + "EBTD6/EBTD6_4.jpeg", + "EBTD6/EBTD6_5.jpeg", + "EBTD6/EBTD6_6.jpeg", + "EBTD6/EBTD6_7.jpeg", + "EBTD6/EBTD6_8.jpeg", + "EBTD6/EBTD6_9.jpeg", + "EBTD6/EBTD6_10.jpeg", + "EBTD6/EBTD6_11.jpeg", + "EBTD6/EBTD6_12.jpeg", + "EBTD6/EBTD6_13.jpeg", + "EBTD6/EBTD6_14.jpeg", + "EBTD6/EBTD6_15.jpeg", + "EBTD6/EBTD6_16.jpeg", + "EBTD6/EBTD6_17.jpeg", + "EBTD6/EBTD6_18.jpeg", + "EBTD6/EBTD6_19.jpeg", + "EBTD6/EBTD6_20.jpeg", + "EBTD6/EBTD6_21.jpeg", + "EBTD6/EBTD6_22.jpeg", + "EBTD6/EBTD6_23.jpeg", + "EBTD6/EBTD6_24.jpeg", + "EBTD6/EBTD6_25.jpeg", + "EBTD6/EBTD6_26.jpeg", + "EBTD6/EBTD6_27.jpeg", + "EBTD6/EBTD6_28.jpeg", + "EBTD6/EBTD6_29.jpeg", + "EBTD6/EBTD6_30.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 170, + "question": "Study the provided images and predict the individual's forthcoming behavior. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the phone/camera.\nB. Throw the bag.\nC. Put down the food.\nD. Open the laptop.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "open the laptop", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "put down the phone camera", + "throw the bag", + "put down the food", + "open the laptop" + ], + "image_quantity_level": "Medium", + "image": [ + "DBT6E/DBT6E_0.jpeg", + "DBT6E/DBT6E_1.jpeg", + "DBT6E/DBT6E_2.jpeg", + "DBT6E/DBT6E_3.jpeg", + "DBT6E/DBT6E_4.jpeg", + "DBT6E/DBT6E_5.jpeg", + "DBT6E/DBT6E_6.jpeg", + "DBT6E/DBT6E_7.jpeg", + "DBT6E/DBT6E_8.jpeg", + "DBT6E/DBT6E_9.jpeg", + "DBT6E/DBT6E_10.jpeg", + "DBT6E/DBT6E_11.jpeg", + "DBT6E/DBT6E_12.jpeg", + "DBT6E/DBT6E_13.jpeg", + "DBT6E/DBT6E_14.jpeg", + "DBT6E/DBT6E_15.jpeg", + "DBT6E/DBT6E_16.jpeg", + "DBT6E/DBT6E_17.jpeg", + "DBT6E/DBT6E_18.jpeg", + "DBT6E/DBT6E_19.jpeg", + "DBT6E/DBT6E_20.jpeg", + "DBT6E/DBT6E_21.jpeg", + "DBT6E/DBT6E_22.jpeg", + "DBT6E/DBT6E_23.jpeg", + "DBT6E/DBT6E_24.jpeg", + "DBT6E/DBT6E_25.jpeg", + "DBT6E/DBT6E_26.jpeg", + "DBT6E/DBT6E_27.jpeg", + "DBT6E/DBT6E_28.jpeg", + "DBT6E/DBT6E_29.jpeg", + "DBT6E/DBT6E_30.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 173, + "question": "Based on the given images, prediction the person's next action You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Close the laptop.\nB. Sit on the table.\nC. Throw the clothes.\nD. Take the dish.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "throw the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "close the laptop", + "sit on the table", + "throw the clothes", + "take the dish" + ], + "image_quantity_level": "Medium", + "image": [ + "ZOL0R/ZOL0R_0.jpeg", + "ZOL0R/ZOL0R_1.jpeg", + "ZOL0R/ZOL0R_2.jpeg", + "ZOL0R/ZOL0R_3.jpeg", + "ZOL0R/ZOL0R_4.jpeg", + "ZOL0R/ZOL0R_5.jpeg", + "ZOL0R/ZOL0R_6.jpeg", + "ZOL0R/ZOL0R_7.jpeg", + "ZOL0R/ZOL0R_8.jpeg", + "ZOL0R/ZOL0R_9.jpeg", + "ZOL0R/ZOL0R_10.jpeg", + "ZOL0R/ZOL0R_11.jpeg", + "ZOL0R/ZOL0R_12.jpeg", + "ZOL0R/ZOL0R_13.jpeg", + "ZOL0R/ZOL0R_14.jpeg", + "ZOL0R/ZOL0R_15.jpeg", + "ZOL0R/ZOL0R_16.jpeg", + "ZOL0R/ZOL0R_17.jpeg", + "ZOL0R/ZOL0R_18.jpeg", + "ZOL0R/ZOL0R_19.jpeg", + "ZOL0R/ZOL0R_20.jpeg", + "ZOL0R/ZOL0R_21.jpeg", + "ZOL0R/ZOL0R_22.jpeg", + "ZOL0R/ZOL0R_23.jpeg", + "ZOL0R/ZOL0R_24.jpeg", + "ZOL0R/ZOL0R_25.jpeg", + "ZOL0R/ZOL0R_26.jpeg", + "ZOL0R/ZOL0R_27.jpeg", + "ZOL0R/ZOL0R_28.jpeg", + "ZOL0R/ZOL0R_29.jpeg", + "ZOL0R/ZOL0R_30.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 176, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Lie on the sofa/couch.\nB. Throw the clothes.\nC. Take the clothes.\nD. Close the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "lie on the sofa couch", + "throw the clothes", + "take the clothes", + "close the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "VS5IY/VS5IY_0.jpeg", + "VS5IY/VS5IY_1.jpeg", + "VS5IY/VS5IY_2.jpeg", + "VS5IY/VS5IY_3.jpeg", + "VS5IY/VS5IY_4.jpeg", + "VS5IY/VS5IY_5.jpeg", + "VS5IY/VS5IY_6.jpeg", + "VS5IY/VS5IY_7.jpeg", + "VS5IY/VS5IY_8.jpeg", + "VS5IY/VS5IY_9.jpeg", + "VS5IY/VS5IY_10.jpeg", + "VS5IY/VS5IY_11.jpeg", + "VS5IY/VS5IY_12.jpeg", + "VS5IY/VS5IY_13.jpeg", + "VS5IY/VS5IY_14.jpeg", + "VS5IY/VS5IY_15.jpeg", + "VS5IY/VS5IY_16.jpeg", + "VS5IY/VS5IY_17.jpeg", + "VS5IY/VS5IY_18.jpeg", + "VS5IY/VS5IY_19.jpeg", + "VS5IY/VS5IY_20.jpeg", + "VS5IY/VS5IY_21.jpeg", + "VS5IY/VS5IY_22.jpeg", + "VS5IY/VS5IY_23.jpeg", + "VS5IY/VS5IY_24.jpeg", + "VS5IY/VS5IY_25.jpeg", + "VS5IY/VS5IY_26.jpeg", + "VS5IY/VS5IY_27.jpeg", + "VS5IY/VS5IY_28.jpeg", + "VS5IY/VS5IY_29.jpeg", + "VS5IY/VS5IY_30.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 177, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Wash the mirror.\nB. Close the box.\nC. Throw the clothes.\nD. Take the sandwich.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the sandwich", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "wash the mirror", + "close the box", + "throw the clothes", + "take the sandwich" + ], + "image_quantity_level": "Medium", + "image": [ + "6CAZU/6CAZU_0.jpeg", + "6CAZU/6CAZU_1.jpeg", + "6CAZU/6CAZU_2.jpeg", + "6CAZU/6CAZU_3.jpeg", + "6CAZU/6CAZU_4.jpeg", + "6CAZU/6CAZU_5.jpeg", + "6CAZU/6CAZU_6.jpeg", + "6CAZU/6CAZU_7.jpeg", + "6CAZU/6CAZU_8.jpeg", + "6CAZU/6CAZU_9.jpeg", + "6CAZU/6CAZU_10.jpeg", + "6CAZU/6CAZU_11.jpeg", + "6CAZU/6CAZU_12.jpeg", + "6CAZU/6CAZU_13.jpeg", + "6CAZU/6CAZU_14.jpeg", + "6CAZU/6CAZU_15.jpeg", + "6CAZU/6CAZU_16.jpeg", + "6CAZU/6CAZU_17.jpeg", + "6CAZU/6CAZU_18.jpeg", + "6CAZU/6CAZU_19.jpeg", + "6CAZU/6CAZU_20.jpeg", + "6CAZU/6CAZU_21.jpeg", + "6CAZU/6CAZU_22.jpeg", + "6CAZU/6CAZU_23.jpeg", + "6CAZU/6CAZU_24.jpeg", + "6CAZU/6CAZU_25.jpeg", + "6CAZU/6CAZU_26.jpeg", + "6CAZU/6CAZU_27.jpeg", + "6CAZU/6CAZU_28.jpeg", + "6CAZU/6CAZU_29.jpeg", + "6CAZU/6CAZU_30.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 190, + "question": "With the given visuals as your guide, infer the individual's impending action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the book.\nB. Put down the cup/glass/bottle.\nC. Open the door.\nD. Close the door.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the book", + "put down the cup glass bottle", + "open the door", + "close the door" + ], + "image_quantity_level": "Medium", + "image": [ + "G6ZOB/G6ZOB_0.jpeg", + "G6ZOB/G6ZOB_1.jpeg", + "G6ZOB/G6ZOB_2.jpeg", + "G6ZOB/G6ZOB_3.jpeg", + "G6ZOB/G6ZOB_4.jpeg", + "G6ZOB/G6ZOB_5.jpeg", + "G6ZOB/G6ZOB_6.jpeg", + "G6ZOB/G6ZOB_7.jpeg", + "G6ZOB/G6ZOB_8.jpeg", + "G6ZOB/G6ZOB_9.jpeg", + "G6ZOB/G6ZOB_10.jpeg", + "G6ZOB/G6ZOB_11.jpeg", + "G6ZOB/G6ZOB_12.jpeg", + "G6ZOB/G6ZOB_13.jpeg", + "G6ZOB/G6ZOB_14.jpeg", + "G6ZOB/G6ZOB_15.jpeg", + "G6ZOB/G6ZOB_16.jpeg", + "G6ZOB/G6ZOB_17.jpeg", + "G6ZOB/G6ZOB_18.jpeg", + "G6ZOB/G6ZOB_19.jpeg", + "G6ZOB/G6ZOB_20.jpeg", + "G6ZOB/G6ZOB_21.jpeg", + "G6ZOB/G6ZOB_22.jpeg", + "G6ZOB/G6ZOB_23.jpeg", + "G6ZOB/G6ZOB_24.jpeg", + "G6ZOB/G6ZOB_25.jpeg", + "G6ZOB/G6ZOB_26.jpeg", + "G6ZOB/G6ZOB_27.jpeg", + "G6ZOB/G6ZOB_28.jpeg", + "G6ZOB/G6ZOB_29.jpeg", + "G6ZOB/G6ZOB_30.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 115, + "question": "Using the given pictures as a reference, anticipate the person's upcoming action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the sandwich.\nB. Put down the book.\nC. Lie on the floor.\nD. Take the dish.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the sandwich", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the sandwich", + "put down the book", + "lie on the floor", + "take the dish" + ], + "image_quantity_level": "Many", + "image": [ + "OA9W5/OA9W5_0.jpeg", + "OA9W5/OA9W5_1.jpeg", + "OA9W5/OA9W5_2.jpeg", + "OA9W5/OA9W5_3.jpeg", + "OA9W5/OA9W5_4.jpeg", + "OA9W5/OA9W5_5.jpeg", + "OA9W5/OA9W5_6.jpeg", + "OA9W5/OA9W5_7.jpeg", + "OA9W5/OA9W5_8.jpeg", + "OA9W5/OA9W5_9.jpeg", + "OA9W5/OA9W5_10.jpeg", + "OA9W5/OA9W5_11.jpeg", + "OA9W5/OA9W5_12.jpeg", + "OA9W5/OA9W5_13.jpeg", + "OA9W5/OA9W5_14.jpeg", + "OA9W5/OA9W5_15.jpeg", + "OA9W5/OA9W5_16.jpeg", + "OA9W5/OA9W5_17.jpeg", + "OA9W5/OA9W5_18.jpeg", + "OA9W5/OA9W5_19.jpeg", + "OA9W5/OA9W5_20.jpeg", + "OA9W5/OA9W5_21.jpeg", + "OA9W5/OA9W5_22.jpeg", + "OA9W5/OA9W5_23.jpeg", + "OA9W5/OA9W5_24.jpeg", + "OA9W5/OA9W5_25.jpeg", + "OA9W5/OA9W5_26.jpeg", + "OA9W5/OA9W5_27.jpeg", + "OA9W5/OA9W5_28.jpeg", + "OA9W5/OA9W5_29.jpeg", + "OA9W5/OA9W5_30.jpeg", + "OA9W5/OA9W5_31.jpeg", + "OA9W5/OA9W5_32.jpeg", + "OA9W5/OA9W5_33.jpeg" + ], + "extracted": "B", + "result": 0 + }, + { + "sample_id": 169, + "question": "Infer the next action of the person from the presented images. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the window.\nB. Take the food.\nC. Put down the clothes.\nD. Open the book.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "open the window", + "take the food", + "put down the clothes", + "open the book" + ], + "image_quantity_level": "Many", + "image": [ + "6VF2L/6VF2L_0.jpeg", + "6VF2L/6VF2L_1.jpeg", + "6VF2L/6VF2L_2.jpeg", + "6VF2L/6VF2L_3.jpeg", + "6VF2L/6VF2L_4.jpeg", + "6VF2L/6VF2L_5.jpeg", + "6VF2L/6VF2L_6.jpeg", + "6VF2L/6VF2L_7.jpeg", + "6VF2L/6VF2L_8.jpeg", + "6VF2L/6VF2L_9.jpeg", + "6VF2L/6VF2L_10.jpeg", + "6VF2L/6VF2L_11.jpeg", + "6VF2L/6VF2L_12.jpeg", + "6VF2L/6VF2L_13.jpeg", + "6VF2L/6VF2L_14.jpeg", + "6VF2L/6VF2L_15.jpeg", + "6VF2L/6VF2L_16.jpeg", + "6VF2L/6VF2L_17.jpeg", + "6VF2L/6VF2L_18.jpeg", + "6VF2L/6VF2L_19.jpeg", + "6VF2L/6VF2L_20.jpeg", + "6VF2L/6VF2L_21.jpeg", + "6VF2L/6VF2L_22.jpeg", + "6VF2L/6VF2L_23.jpeg", + "6VF2L/6VF2L_24.jpeg", + "6VF2L/6VF2L_25.jpeg", + "6VF2L/6VF2L_26.jpeg", + "6VF2L/6VF2L_27.jpeg", + "6VF2L/6VF2L_28.jpeg", + "6VF2L/6VF2L_29.jpeg", + "6VF2L/6VF2L_30.jpeg", + "6VF2L/6VF2L_31.jpeg", + "6VF2L/6VF2L_32.jpeg", + "6VF2L/6VF2L_33.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 192, + "question": "Analyze the provided visuals and forecast the individual's subsequent move. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Open the box.\nB. Take the bag.\nC. Wash the table.\nD. Put down the dish.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "open the box", + "take the bag", + "wash the table", + "put down the dish" + ], + "image_quantity_level": "Many", + "image": [ + "GTK8W/GTK8W_0.jpeg", + "GTK8W/GTK8W_1.jpeg", + "GTK8W/GTK8W_2.jpeg", + "GTK8W/GTK8W_3.jpeg", + "GTK8W/GTK8W_4.jpeg", + "GTK8W/GTK8W_5.jpeg", + "GTK8W/GTK8W_6.jpeg", + "GTK8W/GTK8W_7.jpeg", + "GTK8W/GTK8W_8.jpeg", + "GTK8W/GTK8W_9.jpeg", + "GTK8W/GTK8W_10.jpeg", + "GTK8W/GTK8W_11.jpeg", + "GTK8W/GTK8W_12.jpeg", + "GTK8W/GTK8W_13.jpeg", + "GTK8W/GTK8W_14.jpeg", + "GTK8W/GTK8W_15.jpeg", + "GTK8W/GTK8W_16.jpeg", + "GTK8W/GTK8W_17.jpeg", + "GTK8W/GTK8W_18.jpeg", + "GTK8W/GTK8W_19.jpeg", + "GTK8W/GTK8W_20.jpeg", + "GTK8W/GTK8W_21.jpeg", + "GTK8W/GTK8W_22.jpeg", + "GTK8W/GTK8W_23.jpeg", + "GTK8W/GTK8W_24.jpeg", + "GTK8W/GTK8W_25.jpeg", + "GTK8W/GTK8W_26.jpeg", + "GTK8W/GTK8W_27.jpeg", + "GTK8W/GTK8W_28.jpeg", + "GTK8W/GTK8W_29.jpeg", + "GTK8W/GTK8W_30.jpeg", + "GTK8W/GTK8W_31.jpeg", + "GTK8W/GTK8W_32.jpeg", + "GTK8W/GTK8W_33.jpeg", + "GTK8W/GTK8W_34.jpeg", + "GTK8W/GTK8W_35.jpeg", + "GTK8W/GTK8W_36.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 105, + "question": "Using the given pictures as a reference, anticipate the person's upcoming action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the dish.\nB. Take the book.\nC. Eat the sandwich.\nD. Throw the box.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the book", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the dish", + "take the book", + "eat the sandwich", + "throw the box" + ], + "image_quantity_level": "Medium", + "image": [ + "KQYR0/KQYR0_0.jpeg", + "KQYR0/KQYR0_1.jpeg", + "KQYR0/KQYR0_2.jpeg", + "KQYR0/KQYR0_3.jpeg", + "KQYR0/KQYR0_4.jpeg", + "KQYR0/KQYR0_5.jpeg", + "KQYR0/KQYR0_6.jpeg", + "KQYR0/KQYR0_7.jpeg", + "KQYR0/KQYR0_8.jpeg", + "KQYR0/KQYR0_9.jpeg", + "KQYR0/KQYR0_10.jpeg", + "KQYR0/KQYR0_11.jpeg", + "KQYR0/KQYR0_12.jpeg", + "KQYR0/KQYR0_13.jpeg", + "KQYR0/KQYR0_14.jpeg", + "KQYR0/KQYR0_15.jpeg", + "KQYR0/KQYR0_16.jpeg", + "KQYR0/KQYR0_17.jpeg", + "KQYR0/KQYR0_18.jpeg", + "KQYR0/KQYR0_19.jpeg", + "KQYR0/KQYR0_20.jpeg", + "KQYR0/KQYR0_21.jpeg", + "KQYR0/KQYR0_22.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 165, + "question": "Evaluate the given images and infer the individual's next step. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Throw the bag.\nB. Take the book.\nC. Put down the phone/camera.\nD. Open the refrigerator.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "open the refrigerator", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "throw the bag", + "take the book", + "put down the phone camera", + "open the refrigerator" + ], + "image_quantity_level": "Many", + "image": [ + "24XHS/24XHS_0.jpeg", + "24XHS/24XHS_1.jpeg", + "24XHS/24XHS_2.jpeg", + "24XHS/24XHS_3.jpeg", + "24XHS/24XHS_4.jpeg", + "24XHS/24XHS_5.jpeg", + "24XHS/24XHS_6.jpeg", + "24XHS/24XHS_7.jpeg", + "24XHS/24XHS_8.jpeg", + "24XHS/24XHS_9.jpeg", + "24XHS/24XHS_10.jpeg", + "24XHS/24XHS_11.jpeg", + "24XHS/24XHS_12.jpeg", + "24XHS/24XHS_13.jpeg", + "24XHS/24XHS_14.jpeg", + "24XHS/24XHS_15.jpeg", + "24XHS/24XHS_16.jpeg", + "24XHS/24XHS_17.jpeg", + "24XHS/24XHS_18.jpeg", + "24XHS/24XHS_19.jpeg", + "24XHS/24XHS_20.jpeg", + "24XHS/24XHS_21.jpeg", + "24XHS/24XHS_22.jpeg", + "24XHS/24XHS_23.jpeg", + "24XHS/24XHS_24.jpeg", + "24XHS/24XHS_25.jpeg", + "24XHS/24XHS_26.jpeg", + "24XHS/24XHS_27.jpeg", + "24XHS/24XHS_28.jpeg", + "24XHS/24XHS_29.jpeg", + "24XHS/24XHS_30.jpeg", + "24XHS/24XHS_31.jpeg", + "24XHS/24XHS_32.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 198, + "question": "Using the given pictures as a reference, anticipate the person's upcoming action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the clothes.\nB. Close the door.\nC. Tidy up the table.\nD. Wash the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the clothes", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "take the clothes", + "close the door", + "tidy up the table", + "wash the clothes" + ], + "image_quantity_level": "Many", + "image": [ + "MC6J7/MC6J7_0.jpeg", + "MC6J7/MC6J7_1.jpeg", + "MC6J7/MC6J7_2.jpeg", + "MC6J7/MC6J7_3.jpeg", + "MC6J7/MC6J7_4.jpeg", + "MC6J7/MC6J7_5.jpeg", + "MC6J7/MC6J7_6.jpeg", + "MC6J7/MC6J7_7.jpeg", + "MC6J7/MC6J7_8.jpeg", + "MC6J7/MC6J7_9.jpeg", + "MC6J7/MC6J7_10.jpeg", + "MC6J7/MC6J7_11.jpeg", + "MC6J7/MC6J7_12.jpeg", + "MC6J7/MC6J7_13.jpeg", + "MC6J7/MC6J7_14.jpeg", + "MC6J7/MC6J7_15.jpeg", + "MC6J7/MC6J7_16.jpeg", + "MC6J7/MC6J7_17.jpeg", + "MC6J7/MC6J7_18.jpeg", + "MC6J7/MC6J7_19.jpeg", + "MC6J7/MC6J7_20.jpeg", + "MC6J7/MC6J7_21.jpeg", + "MC6J7/MC6J7_22.jpeg", + "MC6J7/MC6J7_23.jpeg", + "MC6J7/MC6J7_24.jpeg", + "MC6J7/MC6J7_25.jpeg", + "MC6J7/MC6J7_26.jpeg", + "MC6J7/MC6J7_27.jpeg", + "MC6J7/MC6J7_28.jpeg", + "MC6J7/MC6J7_29.jpeg", + "MC6J7/MC6J7_30.jpeg", + "MC6J7/MC6J7_31.jpeg", + "MC6J7/MC6J7_32.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 197, + "question": "Evaluate the given images and infer the individual's next step. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the cup/glass/bottle.\nB. Close the box.\nC. Put down the blanket.\nD. Take the pillow.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down the cup glass bottle", + "close the box", + "put down the blanket", + "take the pillow" + ], + "image_quantity_level": "Many", + "image": [ + "WLA52/WLA52_0.jpeg", + "WLA52/WLA52_1.jpeg", + "WLA52/WLA52_2.jpeg", + "WLA52/WLA52_3.jpeg", + "WLA52/WLA52_4.jpeg", + "WLA52/WLA52_5.jpeg", + "WLA52/WLA52_6.jpeg", + "WLA52/WLA52_7.jpeg", + "WLA52/WLA52_8.jpeg", + "WLA52/WLA52_9.jpeg", + "WLA52/WLA52_10.jpeg", + "WLA52/WLA52_11.jpeg", + "WLA52/WLA52_12.jpeg", + "WLA52/WLA52_13.jpeg", + "WLA52/WLA52_14.jpeg", + "WLA52/WLA52_15.jpeg", + "WLA52/WLA52_16.jpeg", + "WLA52/WLA52_17.jpeg", + "WLA52/WLA52_18.jpeg", + "WLA52/WLA52_19.jpeg", + "WLA52/WLA52_20.jpeg", + "WLA52/WLA52_21.jpeg", + "WLA52/WLA52_22.jpeg", + "WLA52/WLA52_23.jpeg", + "WLA52/WLA52_24.jpeg", + "WLA52/WLA52_25.jpeg", + "WLA52/WLA52_26.jpeg", + "WLA52/WLA52_27.jpeg", + "WLA52/WLA52_28.jpeg", + "WLA52/WLA52_29.jpeg", + "WLA52/WLA52_30.jpeg", + "WLA52/WLA52_31.jpeg", + "WLA52/WLA52_32.jpeg", + "WLA52/WLA52_33.jpeg", + "WLA52/WLA52_34.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 156, + "question": "Using the given pictures as a reference, anticipate the person's upcoming action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Eat the sandwich.\nB. Close the refrigerator.\nC. Take the bag.\nD. Sit on the sofa/couch.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "eat the sandwich", + "close the refrigerator", + "take the bag", + "sit on the sofa couch" + ], + "image_quantity_level": "Medium", + "image": [ + "Q3HZI/Q3HZI_0.jpeg", + "Q3HZI/Q3HZI_1.jpeg", + "Q3HZI/Q3HZI_2.jpeg", + "Q3HZI/Q3HZI_3.jpeg", + "Q3HZI/Q3HZI_4.jpeg", + "Q3HZI/Q3HZI_5.jpeg", + "Q3HZI/Q3HZI_6.jpeg", + "Q3HZI/Q3HZI_7.jpeg", + "Q3HZI/Q3HZI_8.jpeg", + "Q3HZI/Q3HZI_9.jpeg", + "Q3HZI/Q3HZI_10.jpeg", + "Q3HZI/Q3HZI_11.jpeg", + "Q3HZI/Q3HZI_12.jpeg", + "Q3HZI/Q3HZI_13.jpeg", + "Q3HZI/Q3HZI_14.jpeg", + "Q3HZI/Q3HZI_15.jpeg", + "Q3HZI/Q3HZI_16.jpeg", + "Q3HZI/Q3HZI_17.jpeg", + "Q3HZI/Q3HZI_18.jpeg", + "Q3HZI/Q3HZI_19.jpeg", + "Q3HZI/Q3HZI_20.jpeg", + "Q3HZI/Q3HZI_21.jpeg", + "Q3HZI/Q3HZI_22.jpeg", + "Q3HZI/Q3HZI_23.jpeg", + "Q3HZI/Q3HZI_24.jpeg", + "Q3HZI/Q3HZI_25.jpeg", + "Q3HZI/Q3HZI_26.jpeg", + "Q3HZI/Q3HZI_27.jpeg", + "Q3HZI/Q3HZI_28.jpeg", + "Q3HZI/Q3HZI_29.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 159, + "question": "Evaluate the given images and infer the individual's next step. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the cup/glass/bottle.\nB. Put down the bag.\nC. Put down the cup/glass/bottle.\nD. Take the clothes.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the bag", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "take the cup glass bottle", + "put down the bag", + "put down the cup glass bottle", + "take the clothes" + ], + "image_quantity_level": "Medium", + "image": [ + "0Z36L/0Z36L_0.jpeg", + "0Z36L/0Z36L_1.jpeg", + "0Z36L/0Z36L_2.jpeg", + "0Z36L/0Z36L_3.jpeg", + "0Z36L/0Z36L_4.jpeg", + "0Z36L/0Z36L_5.jpeg", + "0Z36L/0Z36L_6.jpeg", + "0Z36L/0Z36L_7.jpeg", + "0Z36L/0Z36L_8.jpeg", + "0Z36L/0Z36L_9.jpeg", + "0Z36L/0Z36L_10.jpeg", + "0Z36L/0Z36L_11.jpeg", + "0Z36L/0Z36L_12.jpeg", + "0Z36L/0Z36L_13.jpeg", + "0Z36L/0Z36L_14.jpeg", + "0Z36L/0Z36L_15.jpeg", + "0Z36L/0Z36L_16.jpeg", + "0Z36L/0Z36L_17.jpeg", + "0Z36L/0Z36L_18.jpeg", + "0Z36L/0Z36L_19.jpeg", + "0Z36L/0Z36L_20.jpeg", + "0Z36L/0Z36L_21.jpeg", + "0Z36L/0Z36L_22.jpeg", + "0Z36L/0Z36L_23.jpeg", + "0Z36L/0Z36L_24.jpeg", + "0Z36L/0Z36L_25.jpeg", + "0Z36L/0Z36L_26.jpeg", + "0Z36L/0Z36L_27.jpeg", + "0Z36L/0Z36L_28.jpeg", + "0Z36L/0Z36L_29.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 162, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next with the floor?\nChoice list: \nA. Take.\nB. Sit on.\nC. Lie on.\nD. Close.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sit on", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "take", + "sit on", + "lie on", + "close" + ], + "image_quantity_level": "Medium", + "image": [ + "Z6HEA/Z6HEA_0.jpeg", + "Z6HEA/Z6HEA_1.jpeg", + "Z6HEA/Z6HEA_2.jpeg", + "Z6HEA/Z6HEA_3.jpeg", + "Z6HEA/Z6HEA_4.jpeg", + "Z6HEA/Z6HEA_5.jpeg", + "Z6HEA/Z6HEA_6.jpeg", + "Z6HEA/Z6HEA_7.jpeg", + "Z6HEA/Z6HEA_8.jpeg", + "Z6HEA/Z6HEA_9.jpeg", + "Z6HEA/Z6HEA_10.jpeg", + "Z6HEA/Z6HEA_11.jpeg", + "Z6HEA/Z6HEA_12.jpeg", + "Z6HEA/Z6HEA_13.jpeg", + "Z6HEA/Z6HEA_14.jpeg", + "Z6HEA/Z6HEA_15.jpeg", + "Z6HEA/Z6HEA_16.jpeg", + "Z6HEA/Z6HEA_17.jpeg", + "Z6HEA/Z6HEA_18.jpeg", + "Z6HEA/Z6HEA_19.jpeg", + "Z6HEA/Z6HEA_20.jpeg", + "Z6HEA/Z6HEA_21.jpeg", + "Z6HEA/Z6HEA_22.jpeg", + "Z6HEA/Z6HEA_23.jpeg", + "Z6HEA/Z6HEA_24.jpeg", + "Z6HEA/Z6HEA_25.jpeg", + "Z6HEA/Z6HEA_26.jpeg", + "Z6HEA/Z6HEA_27.jpeg", + "Z6HEA/Z6HEA_28.jpeg", + "Z6HEA/Z6HEA_29.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 166, + "question": "With the given visuals as your guide, infer the individual's impending action. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Tidy up the table.\nB. Take the sandwich.\nC. Throw the blanket.\nD. Open the bag.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "throw the blanket", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "tidy up the table", + "take the sandwich", + "throw the blanket", + "open the bag" + ], + "image_quantity_level": "Medium", + "image": [ + "UYZKX/UYZKX_0.jpeg", + "UYZKX/UYZKX_1.jpeg", + "UYZKX/UYZKX_2.jpeg", + "UYZKX/UYZKX_3.jpeg", + "UYZKX/UYZKX_4.jpeg", + "UYZKX/UYZKX_5.jpeg", + "UYZKX/UYZKX_6.jpeg", + "UYZKX/UYZKX_7.jpeg", + "UYZKX/UYZKX_8.jpeg", + "UYZKX/UYZKX_9.jpeg", + "UYZKX/UYZKX_10.jpeg", + "UYZKX/UYZKX_11.jpeg", + "UYZKX/UYZKX_12.jpeg", + "UYZKX/UYZKX_13.jpeg", + "UYZKX/UYZKX_14.jpeg", + "UYZKX/UYZKX_15.jpeg", + "UYZKX/UYZKX_16.jpeg", + "UYZKX/UYZKX_17.jpeg", + "UYZKX/UYZKX_18.jpeg", + "UYZKX/UYZKX_19.jpeg", + "UYZKX/UYZKX_20.jpeg", + "UYZKX/UYZKX_21.jpeg", + "UYZKX/UYZKX_22.jpeg", + "UYZKX/UYZKX_23.jpeg", + "UYZKX/UYZKX_24.jpeg", + "UYZKX/UYZKX_25.jpeg", + "UYZKX/UYZKX_26.jpeg", + "UYZKX/UYZKX_27.jpeg", + "UYZKX/UYZKX_28.jpeg", + "UYZKX/UYZKX_29.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 172, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the food.\nB. Take the paper/notebook.\nC. Lie on the bed.\nD. Sit on the sofa/couch.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "b", + "choice_list": [ + "put down the food", + "take the paper notebook", + "lie on the bed", + "sit on the sofa couch" + ], + "image_quantity_level": "Medium", + "image": [ + "AZFKK/AZFKK_0.jpeg", + "AZFKK/AZFKK_1.jpeg", + "AZFKK/AZFKK_2.jpeg", + "AZFKK/AZFKK_3.jpeg", + "AZFKK/AZFKK_4.jpeg", + "AZFKK/AZFKK_5.jpeg", + "AZFKK/AZFKK_6.jpeg", + "AZFKK/AZFKK_7.jpeg", + "AZFKK/AZFKK_8.jpeg", + "AZFKK/AZFKK_9.jpeg", + "AZFKK/AZFKK_10.jpeg", + "AZFKK/AZFKK_11.jpeg", + "AZFKK/AZFKK_12.jpeg", + "AZFKK/AZFKK_13.jpeg", + "AZFKK/AZFKK_14.jpeg", + "AZFKK/AZFKK_15.jpeg", + "AZFKK/AZFKK_16.jpeg", + "AZFKK/AZFKK_17.jpeg", + "AZFKK/AZFKK_18.jpeg", + "AZFKK/AZFKK_19.jpeg", + "AZFKK/AZFKK_20.jpeg", + "AZFKK/AZFKK_21.jpeg", + "AZFKK/AZFKK_22.jpeg", + "AZFKK/AZFKK_23.jpeg", + "AZFKK/AZFKK_24.jpeg", + "AZFKK/AZFKK_25.jpeg", + "AZFKK/AZFKK_26.jpeg", + "AZFKK/AZFKK_27.jpeg", + "AZFKK/AZFKK_28.jpeg", + "AZFKK/AZFKK_29.jpeg" + ], + "extracted": "B", + "result": 1 + }, + { + "sample_id": 175, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the paper/notebook.\nB. Take the cup/glass/bottle.\nC. Eat the sandwich.\nD. Open the window.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "take the paper notebook", + "take the cup glass bottle", + "eat the sandwich", + "open the window" + ], + "image_quantity_level": "Medium", + "image": [ + "F9YMU/F9YMU_0.jpeg", + "F9YMU/F9YMU_1.jpeg", + "F9YMU/F9YMU_2.jpeg", + "F9YMU/F9YMU_3.jpeg", + "F9YMU/F9YMU_4.jpeg", + "F9YMU/F9YMU_5.jpeg", + "F9YMU/F9YMU_6.jpeg", + "F9YMU/F9YMU_7.jpeg", + "F9YMU/F9YMU_8.jpeg", + "F9YMU/F9YMU_9.jpeg", + "F9YMU/F9YMU_10.jpeg", + "F9YMU/F9YMU_11.jpeg", + "F9YMU/F9YMU_12.jpeg", + "F9YMU/F9YMU_13.jpeg", + "F9YMU/F9YMU_14.jpeg", + "F9YMU/F9YMU_15.jpeg", + "F9YMU/F9YMU_16.jpeg", + "F9YMU/F9YMU_17.jpeg", + "F9YMU/F9YMU_18.jpeg", + "F9YMU/F9YMU_19.jpeg", + "F9YMU/F9YMU_20.jpeg", + "F9YMU/F9YMU_21.jpeg", + "F9YMU/F9YMU_22.jpeg", + "F9YMU/F9YMU_23.jpeg", + "F9YMU/F9YMU_24.jpeg", + "F9YMU/F9YMU_25.jpeg", + "F9YMU/F9YMU_26.jpeg", + "F9YMU/F9YMU_27.jpeg", + "F9YMU/F9YMU_28.jpeg", + "F9YMU/F9YMU_29.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 178, + "question": "Analyze the provided visuals and forecast the individual's subsequent move. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the box.\nB. Open the window.\nC. Open the refrigerator.\nD. Sit at the table.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sit at the table", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down the box", + "open the window", + "open the refrigerator", + "sit at the table" + ], + "image_quantity_level": "Medium", + "image": [ + "YCGJS/YCGJS_0.jpeg", + "YCGJS/YCGJS_1.jpeg", + "YCGJS/YCGJS_2.jpeg", + "YCGJS/YCGJS_3.jpeg", + "YCGJS/YCGJS_4.jpeg", + "YCGJS/YCGJS_5.jpeg", + "YCGJS/YCGJS_6.jpeg", + "YCGJS/YCGJS_7.jpeg", + "YCGJS/YCGJS_8.jpeg", + "YCGJS/YCGJS_9.jpeg", + "YCGJS/YCGJS_10.jpeg", + "YCGJS/YCGJS_11.jpeg", + "YCGJS/YCGJS_12.jpeg", + "YCGJS/YCGJS_13.jpeg", + "YCGJS/YCGJS_14.jpeg", + "YCGJS/YCGJS_15.jpeg", + "YCGJS/YCGJS_16.jpeg", + "YCGJS/YCGJS_17.jpeg", + "YCGJS/YCGJS_18.jpeg", + "YCGJS/YCGJS_19.jpeg", + "YCGJS/YCGJS_20.jpeg", + "YCGJS/YCGJS_21.jpeg", + "YCGJS/YCGJS_22.jpeg", + "YCGJS/YCGJS_23.jpeg", + "YCGJS/YCGJS_24.jpeg", + "YCGJS/YCGJS_25.jpeg", + "YCGJS/YCGJS_26.jpeg", + "YCGJS/YCGJS_27.jpeg", + "YCGJS/YCGJS_28.jpeg", + "YCGJS/YCGJS_29.jpeg" + ], + "extracted": "A", + "result": 0 + }, + { + "sample_id": 181, + "question": "Infer the next action of the person from the presented images. You must choose your answer from the Choice List.\nWhat will the person do next with the blanket?\nChoice list: \nA. Sit at.\nB. Throw.\nC. Tidy up.\nD. Put down.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "sit at", + "throw", + "tidy up", + "put down" + ], + "image_quantity_level": "Medium", + "image": [ + "3ND23/3ND23_0.jpeg", + "3ND23/3ND23_1.jpeg", + "3ND23/3ND23_2.jpeg", + "3ND23/3ND23_3.jpeg", + "3ND23/3ND23_4.jpeg", + "3ND23/3ND23_5.jpeg", + "3ND23/3ND23_6.jpeg", + "3ND23/3ND23_7.jpeg", + "3ND23/3ND23_8.jpeg", + "3ND23/3ND23_9.jpeg", + "3ND23/3ND23_10.jpeg", + "3ND23/3ND23_11.jpeg", + "3ND23/3ND23_12.jpeg", + "3ND23/3ND23_13.jpeg", + "3ND23/3ND23_14.jpeg", + "3ND23/3ND23_15.jpeg", + "3ND23/3ND23_16.jpeg", + "3ND23/3ND23_17.jpeg", + "3ND23/3ND23_18.jpeg", + "3ND23/3ND23_19.jpeg", + "3ND23/3ND23_20.jpeg", + "3ND23/3ND23_21.jpeg", + "3ND23/3ND23_22.jpeg", + "3ND23/3ND23_23.jpeg", + "3ND23/3ND23_24.jpeg", + "3ND23/3ND23_25.jpeg", + "3ND23/3ND23_26.jpeg", + "3ND23/3ND23_27.jpeg", + "3ND23/3ND23_28.jpeg", + "3ND23/3ND23_29.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 184, + "question": "Infer the next action of the person from the presented images. You must choose your answer from the Choice List.\nWhat will the person do next with the clothes?\nChoice list: \nA. Put down.\nB. Take.\nC. Throw.\nD. Wash.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "put down", + "take", + "throw", + "wash" + ], + "image_quantity_level": "Medium", + "image": [ + "15PMU/15PMU_0.jpeg", + "15PMU/15PMU_1.jpeg", + "15PMU/15PMU_2.jpeg", + "15PMU/15PMU_3.jpeg", + "15PMU/15PMU_4.jpeg", + "15PMU/15PMU_5.jpeg", + "15PMU/15PMU_6.jpeg", + "15PMU/15PMU_7.jpeg", + "15PMU/15PMU_8.jpeg", + "15PMU/15PMU_9.jpeg", + "15PMU/15PMU_10.jpeg", + "15PMU/15PMU_11.jpeg", + "15PMU/15PMU_12.jpeg", + "15PMU/15PMU_13.jpeg", + "15PMU/15PMU_14.jpeg", + "15PMU/15PMU_15.jpeg", + "15PMU/15PMU_16.jpeg", + "15PMU/15PMU_17.jpeg", + "15PMU/15PMU_18.jpeg", + "15PMU/15PMU_19.jpeg", + "15PMU/15PMU_20.jpeg", + "15PMU/15PMU_21.jpeg", + "15PMU/15PMU_22.jpeg", + "15PMU/15PMU_23.jpeg", + "15PMU/15PMU_24.jpeg", + "15PMU/15PMU_25.jpeg", + "15PMU/15PMU_26.jpeg", + "15PMU/15PMU_27.jpeg", + "15PMU/15PMU_28.jpeg", + "15PMU/15PMU_29.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 188, + "question": "Based on the given images, prediction the person's next action You must choose your answer from the Choice List.\nWhat will the person do next with the floor?\nChoice list: \nA. Put down.\nB. Lie on.\nC. Sit on.\nD. Eat.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "sit on", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down", + "lie on", + "sit on", + "eat" + ], + "image_quantity_level": "Medium", + "image": [ + "ATNBT/ATNBT_0.jpeg", + "ATNBT/ATNBT_1.jpeg", + "ATNBT/ATNBT_2.jpeg", + "ATNBT/ATNBT_3.jpeg", + "ATNBT/ATNBT_4.jpeg", + "ATNBT/ATNBT_5.jpeg", + "ATNBT/ATNBT_6.jpeg", + "ATNBT/ATNBT_7.jpeg", + "ATNBT/ATNBT_8.jpeg", + "ATNBT/ATNBT_9.jpeg", + "ATNBT/ATNBT_10.jpeg", + "ATNBT/ATNBT_11.jpeg", + "ATNBT/ATNBT_12.jpeg", + "ATNBT/ATNBT_13.jpeg", + "ATNBT/ATNBT_14.jpeg", + "ATNBT/ATNBT_15.jpeg", + "ATNBT/ATNBT_16.jpeg", + "ATNBT/ATNBT_17.jpeg", + "ATNBT/ATNBT_18.jpeg", + "ATNBT/ATNBT_19.jpeg", + "ATNBT/ATNBT_20.jpeg", + "ATNBT/ATNBT_21.jpeg", + "ATNBT/ATNBT_22.jpeg", + "ATNBT/ATNBT_23.jpeg", + "ATNBT/ATNBT_24.jpeg", + "ATNBT/ATNBT_25.jpeg", + "ATNBT/ATNBT_26.jpeg", + "ATNBT/ATNBT_27.jpeg", + "ATNBT/ATNBT_28.jpeg", + "ATNBT/ATNBT_29.jpeg" + ], + "extracted": "C", + "result": 1 + }, + { + "sample_id": 193, + "question": "Examine the images at hand and forecast the person's imminent activity. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Take the paper/notebook.\nB. Wash the mirror.\nC. Take the bag.\nD. Open the closet/cabinet.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "take the paper notebook", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "a", + "choice_list": [ + "take the paper notebook", + "wash the mirror", + "take the bag", + "open the closet cabinet" + ], + "image_quantity_level": "Medium", + "image": [ + "ZPRJH/ZPRJH_0.jpeg", + "ZPRJH/ZPRJH_1.jpeg", + "ZPRJH/ZPRJH_2.jpeg", + "ZPRJH/ZPRJH_3.jpeg", + "ZPRJH/ZPRJH_4.jpeg", + "ZPRJH/ZPRJH_5.jpeg", + "ZPRJH/ZPRJH_6.jpeg", + "ZPRJH/ZPRJH_7.jpeg", + "ZPRJH/ZPRJH_8.jpeg", + "ZPRJH/ZPRJH_9.jpeg", + "ZPRJH/ZPRJH_10.jpeg", + "ZPRJH/ZPRJH_11.jpeg", + "ZPRJH/ZPRJH_12.jpeg", + "ZPRJH/ZPRJH_13.jpeg", + "ZPRJH/ZPRJH_14.jpeg", + "ZPRJH/ZPRJH_15.jpeg", + "ZPRJH/ZPRJH_16.jpeg", + "ZPRJH/ZPRJH_17.jpeg", + "ZPRJH/ZPRJH_18.jpeg", + "ZPRJH/ZPRJH_19.jpeg", + "ZPRJH/ZPRJH_20.jpeg", + "ZPRJH/ZPRJH_21.jpeg", + "ZPRJH/ZPRJH_22.jpeg", + "ZPRJH/ZPRJH_23.jpeg", + "ZPRJH/ZPRJH_24.jpeg", + "ZPRJH/ZPRJH_25.jpeg", + "ZPRJH/ZPRJH_26.jpeg", + "ZPRJH/ZPRJH_27.jpeg", + "ZPRJH/ZPRJH_28.jpeg", + "ZPRJH/ZPRJH_29.jpeg" + ], + "extracted": "A", + "result": 1 + }, + { + "sample_id": 189, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next with the blanket?\nChoice list: \nA. Throw.\nB. Hold.\nC. Tidy up.\nD. Put down.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "d", + "choice_list": [ + "throw", + "hold", + "tidy up", + "put down" + ], + "image_quantity_level": "Medium", + "image": [ + "IFQS1/IFQS1_0.jpeg", + "IFQS1/IFQS1_1.jpeg", + "IFQS1/IFQS1_2.jpeg", + "IFQS1/IFQS1_3.jpeg", + "IFQS1/IFQS1_4.jpeg", + "IFQS1/IFQS1_5.jpeg", + "IFQS1/IFQS1_6.jpeg", + "IFQS1/IFQS1_7.jpeg", + "IFQS1/IFQS1_8.jpeg", + "IFQS1/IFQS1_9.jpeg", + "IFQS1/IFQS1_10.jpeg", + "IFQS1/IFQS1_11.jpeg", + "IFQS1/IFQS1_12.jpeg", + "IFQS1/IFQS1_13.jpeg", + "IFQS1/IFQS1_14.jpeg", + "IFQS1/IFQS1_15.jpeg" + ], + "extracted": "D", + "result": 1 + }, + { + "sample_id": 196, + "question": "From the images supplied, deduce the likely next step of the person. You must choose your answer from the Choice List.\nWhat will the person do next?\nChoice list: \nA. Put down the pillow.\nB. Lie on the floor.\nC. Put down the broom.\nD. Take the phone/camera.\nAnswer with the option's letter from the given choices directly.", + "gt_response": "put down the broom", + "gen_kwargs": { + "do_sample": false, + "num_beams": 1, + "max_new_tokens": 32, + "eos_token_id": 92542 + }, + "pred_response": "c", + "choice_list": [ + "put down the pillow", + "lie on the floor", + "put down the broom", + "take the phone camera" + ], + "image_quantity_level": "Medium", + "image": [ + "PZD7Z/PZD7Z_0.jpeg", + "PZD7Z/PZD7Z_1.jpeg", + "PZD7Z/PZD7Z_2.jpeg", + "PZD7Z/PZD7Z_3.jpeg", + "PZD7Z/PZD7Z_4.jpeg", + "PZD7Z/PZD7Z_5.jpeg", + "PZD7Z/PZD7Z_6.jpeg", + "PZD7Z/PZD7Z_7.jpeg", + "PZD7Z/PZD7Z_8.jpeg", + "PZD7Z/PZD7Z_9.jpeg", + "PZD7Z/PZD7Z_10.jpeg", + "PZD7Z/PZD7Z_11.jpeg", + "PZD7Z/PZD7Z_12.jpeg", + "PZD7Z/PZD7Z_13.jpeg", + "PZD7Z/PZD7Z_14.jpeg", + "PZD7Z/PZD7Z_15.jpeg", + "PZD7Z/PZD7Z_16.jpeg", + "PZD7Z/PZD7Z_17.jpeg", + "PZD7Z/PZD7Z_18.jpeg", + "PZD7Z/PZD7Z_19.jpeg", + "PZD7Z/PZD7Z_20.jpeg", + "PZD7Z/PZD7Z_21.jpeg", + "PZD7Z/PZD7Z_22.jpeg", + "PZD7Z/PZD7Z_23.jpeg", + "PZD7Z/PZD7Z_24.jpeg", + "PZD7Z/PZD7Z_25.jpeg", + "PZD7Z/PZD7Z_26.jpeg" + ], + "extracted": "C", + "result": 1 + } +] \ No newline at end of file