{
  "vla_name": "VITRA_Paligemma",
  "task_name": "pretrain",
  "model": "vitra_paligemma2",
  "fwd_pred_next_n": 16,
  "seed": 42,
  "batch_size": 64,
  "output_root": "/data/vla_checkpoint/vitra_vla_3b/checkpoints",
  "log_root": "/data/vla_checkpoint/vitra_vla_3b/logs",
  "cache_root": "/data/vla_checkpoint/vitra_vla_3b/cache/vitra_paligemma2",
  "model_load_path": null,
  "resume": true,
  "wandb_project": "vitra_paligemma2_humanpretrain",
  "wandb_entity": "",
  "save_steps": 5000,
  "total_batch_size": 512,
  "use_bf16": true,
  "use_fov": true,
  "untied_cognition_token": true,
  "use_state": "DiT",
  "loss_type": "human",
  "train_setup": {
    "freeze_option": "freeze_vision_encoder"
  },
  "state_encoder": {
    "state_dim": 212
  },
  "action_model": {
    "model_type": "DiT-B",
    "token_size": 2304,
    "action_dim": 192,
    "hidden_size": 1024
  },
  "vlm": {
    "type": "PaliGemmaForConditionalGeneration",
    "name": "paligemma",
    "pretrained_model_name_or_path": "google/paligemma2-3b-mix-224"
  },
  "trainer": {
    "sharding_strategy": "shard-grad-op",
    "strategy": "fsdp_paligemma_with_checkpointing",
    "lr_scheduler_type": "backbone-freeze-warmup",
    "gradient_clip_val": 1.0,
    "learning_rate": 1e-05,
    "weight_decay": 0.1,
    "max_epochs": 100000,
    "max_steps": 2000000,
    "reduce_in_full_precision": true,
    "enable_mixed_precision_training": false,
    "enable_gradient_checkpointing": true,
    "action_model_learning_rate": 0.0001,
    "llm_freeze_step": 5000,
    "warmup_ratio": null
  },
  "train_dataset": {
    "data_root_dir": "/data/VITRA_1M",
    "augmentation": true,
    "set_none_ratio": 0.0,
    "data_mix": "magic_mix",
    "num_workers": 18,
    "prefetch_factor": null,
    "flip_augmentation": 1.0,
    "action_type": "angle",
    "use_rel": false,
    "clip_len": 2000,
    "normalization": true,
    "state_mask_prob": 0.1
  },
  "repeated_diffusion_steps": 8,
  "config": "vitra/configs/human_pretrain.json",
  "data_mix": null,
  "debug": false,
  "num_workers": null,
  "prefetch_factor": null
}