{"gen_modes": [["visual"], ["tactile"], ["audio"], ["visual", "tactile"], ["visual", "audio"], ["tactile", "audio"], ["visual", "tactile", "audio"]], "remove_tokenizers": [], "text_processor": {"module": "octo.data.utils.text_processing", "name": "HFTokenizer", "args": [], "kwargs": {"tokenizer_name": "t5-base", "encode_with_model": false, "tokenizer_kwargs": {"max_length": 24, "padding": "max_length", "truncation": true, "return_tensors": "np"}}}, "multi_head": false, "unified_lang": true, "pretrained_loaders": [{"module": "octo.utils.train_utils", "name": "tvl_loader", "args": [], "kwargs": {"restore_path": "gs://oier-europe-bucket/ported_weights/tvl/tvl_vitbgs_params_jax.npz", "verbose": true}}], "modalities": ["cam_primary", "cam_wrist", "vit_left", "vit_right"], "pretrained_path": "hf://rail-berkeley/octo-small-1.5", "batch_size": 32, "shuffle_buffer_size": 10000, "num_steps": 50000, "log_interval": 100, "eval_interval": 500000, "save_interval": 5000, "lang_interval": 10000, "save_dir": "gs://oier-europe-bucket", "seed": 42, "wandb": {"project": "octo", "group": null, "entity": null}, "dataset_kwargs": {"name": "digit_dataset:53.0.0", "data_dir": "gs://oier-europe-bucket", "image_obs_keys": {"primary": "image_0", "wrist": "image_1", "digit_left": "digit_0", "digit_left_background": "digit_0_background", "digit_right": "digit_1", "digit_right_background": "digit_1_background"}, "proprio_obs_key": null, "sensor_obs_keys": {"mel_spectro": "mel_spectro", "mic_mask": "has_mic"}, "language_key": "rephrase_batch_full", "annotation_manager_kwargs": {"force_uniform_overall": true, "reconstruction_loss_keys": ["visual", "tactile", "audio", "visual,tactile", "visual,audio", "tactile,audio", "visual,tactile,audio"], "num_gpt_gen": 20, "rephrase_prefixes": ["rephrased_2", "rephrased_3", "rephrased_4", "rephrased_5", "rephrased_6", "rephrased_7", "rephrased_8", "rephrased_0"], "all_lang_prefixes": ["all_lang_2", "all_lang_3", "all_lang_4", "all_lang_5", "all_lang_6", "all_lang_7", "all_lang_8", "all_lang_0"], "lang_info_str": "simple||visual|tactile|audio|visual,tactile|visual,audio|tactile,audio|visual,tactile,audio"}, "action_normalization_mask": [true, true, true, true, true, true, false], "standardize_fn": {"module": "octo.data.oxe.oxe_standardization_transforms", "name": "bridge_dataset_transform", "args": [], "kwargs": {}}, "num_gpt_gen_arg": 20}, "modality": "language_conditioned", "finetuning_mode": "full", "window_size": 2, "optimizer": {"learning_rate": {"name": "cosine", "init_value": 0.0, "peak_value": 0.0003, "warmup_steps": 2000, "decay_steps": 50000, "end_value": 0.0}, "weight_decay": 0.01, "clip_gradient": 1.0, "frozen_keys": ["*hf_model*"], "grad_accumulation_steps": null}, "val_kwargs": {"val_shuffle_buffer_size": 1000, "num_val_batches": 16}, "gen_kwargs": {"val_shuffle_buffer_size": 1000, "num_val_batches": 16}, "viz_kwargs": {"eval_batch_size": 64, "trajs_for_metrics": 100, "trajs_for_viz": 8, "samples_per_state": 8}, "gradcam_kwargs": {"eval_batch_size": 4, "shuffle_buffer_size": 1000, "train": false, "gradcam_kwargs_list": [["obs_primary", {"psuedo_loss_type": "loss"}], ["obs_wrist", {"psuedo_loss_type": "loss"}]]}, "frame_transform_threads": 16, "traj_transform_kwargs": {"window_size": 2, "action_horizon": 4, "goal_relabeling_strategy": null, "task_augment_strategy": "delete_task_conditioning", "task_augment_kwargs": {"keep_image_prob": 0.0}}, "frame_transform_kwargs": {"resize_size": {"primary": [256, 256], "wrist": [128, 128], "digit_left": [224, 224], "digit_right": [224, 224], "digit_left_background": [224, 224], "digit_right_background": [224, 224]}, "image_augment_kwargs": {"primary": {"random_resized_crop": {"scale": [0.8, 1.0], "ratio": [0.9, 1.1]}, "random_brightness": [0.1], "random_contrast": [0.9, 1.1], "random_saturation": [0.9, 1.1], "random_hue": [0.05], "augment_order": ["random_resized_crop", "random_brightness", "random_contrast", "random_saturation", "random_hue"]}, "wrist": {"random_brightness": [0.1], "random_contrast": [0.9, 1.1], "random_saturation": [0.9, 1.1], "random_hue": [0.05], "augment_order": ["random_brightness", "random_contrast", "random_saturation", "random_hue"]}}, "background_subtraction_map": {"image_digit_left": "image_digit_left_background", "image_digit_right": "image_digit_right_background"}}, "new_obs_tokenizers": {"mel_spectro": {"module": "octo.model.components.tokenizers", "name": "ImageTokenizer", "args": [], "kwargs": {"obs_stack_keys": ["mel_spectro"], "task_stack_keys": [], "encoder": {"module": "octo.model.components.vit_encoders", "name": "ResNet26FILM", "args": [], "kwargs": {"use_film": false}}, "add_channel_dim": true}}, "tvl": {"module": "octo.model.components.tokenizers", "name": "ImageTokenizerConcatTokens", "args": [], "kwargs": {"obs_stack_keys": ["image_digit_left", "image_digit_right"], "task_stack_keys": [], "encoder": {"module": "octo.model.components.tvl_vit", "name": "tvlViT", "args": [], "kwargs": {"img_size": [224, 224]}}}}}, "update_config": {"model": {"repeat_task_tokens": true, "heads": {"action": {"module": "octo.model.components.action_heads", "name": "MSEActionHead", "args": [], "kwargs": {"readout_key": "readout_action", "use_map": true, "action_horizon": 4, "action_dim": 7}}}, "observation_tokenizers": {"mel_spectro": {"module": "octo.model.components.tokenizers", "name": "ImageTokenizer", "args": [], "kwargs": {"obs_stack_keys": ["mel_spectro"], "task_stack_keys": [], "encoder": {"module": "octo.model.components.vit_encoders", "name": "ResNet26FILM", "args": [], "kwargs": {"use_film": false}}, "add_channel_dim": true}}, "tvl": {"module": "octo.model.components.tokenizers", "name": "ImageTokenizerConcatTokens", "args": [], "kwargs": {"obs_stack_keys": ["image_digit_left", "image_digit_right"], "task_stack_keys": [], "encoder": {"module": "octo.model.components.tvl_vit", "name": "tvlViT", "args": [], "kwargs": {"img_size": [224, 224]}}}}}, "readouts": {"language": 24}}}, "reconstruction_loss_weight": 1.0, "lang_head": {"module": "octo.model.components.language_reconstruction_heads", "name": "CLIPContrastiveHead", "args": [], "kwargs": {"readout_key": "readout_language", "use_map": true}}, "gen_head": {"module": "octo.model.components.language_reconstruction_heads", "name": "SingleHeadContinuousGenerationHead", "args": [], "kwargs": {"n_lang_tokens": 24}}, "pop_keys": [["model", "heads", "action", "kwargs", "n_diffusion_samples"], ["model", "heads", "action", "kwargs", "dropout_rate"]]}