VITRA-VLA-3B / config.json
{
  "vla_name": "VITRA_Paligemma",
  "task_name": "pretrain",
  "model": "vitra_paligemma2",
  "fwd_pred_next_n": 16,
  "seed": 42,
  "batch_size": 64,
  "output_root": "/data/vla_checkpoint/vitra_vla_3b/checkpoints",
  "log_root": "/data/vla_checkpoint/vitra_vla_3b/logs",
  "cache_root": "/data/vla_checkpoint/vitra_vla_3b/cache/vitra_paligemma2",
  "model_load_path": null,
  "resume": true,
  "wandb_project": "vitra_paligemma2_humanpretrain",
  "wandb_entity": "",
  "save_steps": 5000,
  "total_batch_size": 512,
  "use_bf16": true,
  "use_fov": true,
  "untied_cognition_token": true,
  "use_state": "DiT",
  "loss_type": "human",
  "train_setup": {
    "freeze_option": "freeze_vision_encoder"
  },
  "state_encoder": {
    "state_dim": 212
  },
  "action_model": {
    "model_type": "DiT-B",
    "token_size": 2304,
    "action_dim": 192,
    "hidden_size": 1024
  },
  "vlm": {
    "type": "PaliGemmaForConditionalGeneration",
    "name": "paligemma",
    "pretrained_model_name_or_path": "google/paligemma2-3b-mix-224"
  },
  "trainer": {
    "sharding_strategy": "shard-grad-op",
    "strategy": "fsdp_paligemma_with_checkpointing",
    "lr_scheduler_type": "backbone-freeze-warmup",
    "gradient_clip_val": 1.0,
    "learning_rate": 1e-05,
    "weight_decay": 0.1,
    "max_epochs": 100000,
    "max_steps": 2000000,
    "reduce_in_full_precision": true,
    "enable_mixed_precision_training": false,
    "enable_gradient_checkpointing": true,
    "action_model_learning_rate": 0.0001,
    "llm_freeze_step": 5000,
    "warmup_ratio": null
  },
  "train_dataset": {
    "data_root_dir": "/data/VITRA_1M",
    "augmentation": true,
    "set_none_ratio": 0.0,
    "data_mix": "magic_mix",
    "num_workers": 18,
    "prefetch_factor": null,
    "flip_augmentation": 1.0,
    "action_type": "angle",
    "use_rel": false,
    "clip_len": 2000,
    "normalization": true,
    "state_mask_prob": 0.1
  },
  "repeated_diffusion_steps": 8,
  "config": "vitra/configs/human_pretrain.json",
  "data_mix": null,
  "debug": false,
  "num_workers": null,
  "prefetch_factor": null
}
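
The VITRA training code that consumes this file is not shown here, but the "vlm" block maps directly onto a Hugging Face transformers call: "type" names the model class and "pretrained_model_name_or_path" the checkpoint. A minimal sketch of how such a config could be consumed, assuming the file sits at ./config.json and that "use_bf16" selects the backbone dtype (both assumptions, not confirmed by this repo):

import json

import torch
from transformers import PaliGemmaForConditionalGeneration

# Load the training config shown above (local path is an assumption).
with open("config.json") as f:
    cfg = json.load(f)

# "vlm.pretrained_model_name_or_path" is google/paligemma2-3b-mix-224;
# PaliGemma2 checkpoints load through PaliGemmaForConditionalGeneration.
vlm = PaliGemmaForConditionalGeneration.from_pretrained(
    cfg["vlm"]["pretrained_model_name_or_path"],
    torch_dtype=torch.bfloat16 if cfg["use_bf16"] else torch.float32,
)

# With batch_size=64 per device and total_batch_size=512, the implied
# gradient-accumulation factor on N devices is 512 / (64 * N).
world_size = max(torch.cuda.device_count(), 1)
accum_steps = cfg["total_batch_size"] // (cfg["batch_size"] * world_size)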