# Foundry-Qwen3VLA-2.1B / config.yaml
# Commit 7f083f6 (jmercat): "release: initial squashed history"
# Dataset, preprocessing and dataloader settings.
# Nesting restored from the sorted-key layout of the flattened dump; every value is unchanged.
data:
  # NOTE(review): 20 presumably = 2 arms x (xyz 3 + rot_6d 6 + gripper 1) — confirm.
  action_dim: 20
  action_fields:
  - robot__action__poses__left::panda__xyz_relative
  - robot__action__poses__right::panda__xyz_relative
  - robot__action__poses__left::panda__rot_6d_relative
  - robot__action__poses__right::panda__rot_6d_relative
  - robot__action__grippers__left::panda_hand
  - robot__action__grippers__right::panda_hand
  allow_multiple_epochs: true
  augmentation:
    enabled: true
    image:
      color_jitter:
        brightness: 0.2
        contrast: 0.4
        enabled: true
        hue:
        - -0.05
        - 0.05
        saturation: 0.2
      crop:
        enabled: true
        mode: random
        # Same 224 as image_size below.
        shape:
        - 224
        - 224
    point_cloud:
      # Disabled; use_point_cloud below is also false.
      color_jitter:
        brightness: 0.2
        contrast: 0.4
        enabled: false
        hue:
        - -0.05
        - 0.05
        saturation: 0.2
  camera_names:
  - scene_right_0
  - scene_left_0
  - wrist_left_plus
  - wrist_right_minus
  # NOTE(review): manifest/statistics entries are null in this dump — presumably
  # scrubbed paths that must be supplied before the config can be reused; confirm.
  dataset_manifest:
  - null
  dataset_modality:
  - robotics
  dataset_statistics:
  - null
  dataset_weighting:
  - 1.0
  # NOTE(review): six cameras are listed here (and in intrinsics_fields) but
  # camera_names has only four; wrist_left_minus and wrist_right_plus have no
  # matching image stream — confirm this is intended.
  extrinsics_fields:
  - extrinsics.scene_right_0
  - extrinsics.scene_left_0
  - extrinsics.wrist_left_minus
  - extrinsics.wrist_left_plus
  - extrinsics.wrist_right_minus
  - extrinsics.wrist_right_plus
  image_indices:
  - -1
  - 0
  # One entry per camera_names x image_indices pair (4 x 2 = 8), suffixed _t-1 / _t0.
  image_names:
  - scene_right_0_t-1
  - scene_left_0_t-1
  - wrist_left_plus_t-1
  - wrist_right_minus_t-1
  - scene_right_0_t0
  - scene_left_0_t0
  - wrist_left_plus_t0
  - wrist_right_minus_t0
  image_size: 224
  img_num_tokens: 256
  intrinsics_fields:
  - intrinsics.scene_right_0
  - intrinsics.scene_left_0
  - intrinsics.wrist_left_minus
  - intrinsics.wrist_left_plus
  - intrinsics.wrist_right_minus
  - intrinsics.wrist_right_plus
  language_instruction_types:
  - original
  - randomized
  - verbose
  - alternative
  lowdim_future_timesteps: 8
  lowdim_past_timesteps: 1
  mask_padded_images: true
  max_text_seq_len: null
  normalization:
    centered_norm: true
    enabled: true
    epsilon: 0.01
    # Every per-field entry repeats the section-level epsilon/method/scope defaults below.
    field_configs:
      robot__action__grippers__left::panda_hand:
        enabled: true
        epsilon: 0.01
        method: percentile_1_99
        scope: per_timestep
      robot__action__grippers__right::panda_hand:
        enabled: true
        epsilon: 0.01
        method: percentile_1_99
        scope: per_timestep
      robot__action__poses__left::panda__rot_6d_relative:
        enabled: true
        epsilon: 0.01
        method: percentile_1_99
        scope: per_timestep
      robot__action__poses__left::panda__xyz_relative:
        enabled: true
        epsilon: 0.01
        method: percentile_1_99
        scope: per_timestep
      robot__action__poses__right::panda__rot_6d_relative:
        enabled: true
        epsilon: 0.01
        method: percentile_1_99
        scope: per_timestep
      robot__action__poses__right::panda__xyz_relative:
        enabled: true
        epsilon: 0.01
        method: percentile_1_99
        scope: per_timestep
    include_fields:
    - robot__action__poses__left::panda__xyz_relative
    - robot__action__poses__right::panda__xyz_relative
    - robot__action__poses__left::panda__rot_6d_relative
    - robot__action__poses__right::panda__rot_6d_relative
    - robot__action__grippers__left::panda_hand
    - robot__action__grippers__right::panda_hand
    # NOTE(review): 19 / 5 here differ from the 8 / 1 used by the data section
    # itself — presumably the window the statistics were computed over; confirm.
    lowdim_future_timesteps: 19
    lowdim_past_timesteps: 5
    method: percentile_1_99
    scope: per_timestep
  num_workers: 24
  pad_missing_images: true
  point_cloud_num_points: 4096
  # NOTE(review): these keys lack the `_relative` suffix used in action_fields —
  # presumably they name the absolute poses the relative actions derive from; confirm.
  pose_groups:
  - name: left_panda_action
    position_key: robot__action__poses__left::panda__xyz
    rotation_key: robot__action__poses__left::panda__rot_6d
  - name: right_panda_action
    position_key: robot__action__poses__right::panda__xyz
    rotation_key: robot__action__poses__right::panda__rot_6d
  prefetch_factor: 2
  processor: Qwen/Qwen3-VL-2B-Thinking
  processor_kwargs:
    do_resize: false
  proprioception_dim: 0
  proprioception_fields: []
  seed: 42
  seq_len: 2048
  shuffle: true
  shuffle_buffer_size: 2000
  shuffle_initial: 500
  type: robotics
  use_point_cloud: false
  # No validation datasets are configured.
  val_dataset_manifest: []
  val_dataset_statistics: []
  val_dataset_weighting: []
# NOTE(review): presumably enables logging of run metadata to a database — confirm against the trainer.
db_logging: true
# Distributed-training settings.
# Nesting restored from the flattened dump; every value is unchanged.
distributed:
  ddp_static_graph: false
  # NOTE(review): device / local_rank / rank look like a snapshot taken on the
  # rank-0 process rather than user-set values — confirm before reusing.
  device: cuda:0
  dist_backend: nccl
  dist_url: env://
  fsdp: true
  fsdp_cpu_offload: false
  fsdp_reshard_after_forward: false
  local_rank: 0
  rank: 0
  use_distributed: true
  # The same value (32) is also recorded as world_size under hparams.
  world_size: 32
# Exponential-moving-average settings; `enabled: false`, so the section is switched off.
# Nesting restored from the flattened dump; every value is unchanged.
ema:
  alpha: 0.999
  enabled: false
  inv_gamma: 1.0
  max_value: 0.9999
  min_value: 0.0
  power: 0.75
  type: ema
  update_after_step: 0
# Optimizer and schedule hyperparameters.
# Nesting restored from the flattened dump; every value is unchanged.
hparams:
  beta1: 0.9
  beta2: 0.95
  # NOTE(review): quoted, so this loads as the string '0.3' rather than a float —
  # presumably the consumer parses it itself; confirm before unquoting.
  decay: '0.3'
  eps: 1.0e-08
  force_min_lr: 0.0
  # NOTE(review): per_gpu_batch_size 16 x world_size 32 = 512, so 1024 implies a
  # further factor of 2 (presumably gradient accumulation) — confirm.
  global_batch_size: 1024
  grad_checkpointing: false
  grad_clip_norm: 1.0
  loss_function: mse
  lr: 5.0e-05
  lr_cooldown_end: 0.0
  lr_scheduler: cosine
  optimizer: adamw
  per_gpu_batch_size: 16
  precision: amp_bf16
  seed: 42
  torchcompile: false
  # NOTE(review): quoted, so this loads as the string '1000' rather than an int — see `decay`.
  warmup: '1000'
  wd: 1.0e-08
  world_size: 32
  z_loss_coefficient: 0.0
# Top-level logging and checkpoint-retention settings.
log_every_n_steps: 20
log_level: INFO
# NOTE(review): null presumably means no cap on retained checkpoints — confirm against the trainer.
max_checkpoint_limit: null
# Policy architecture: a `diffusion_policy` made of a noise scheduler, a
# transformer and a vision-language backbone.
# Nesting restored from the sorted-key layout of the flattened dump; every value is unchanged.
model:
  # Same value as action_dim in the data section (20).
  action_dim: 20
  diffusion_step_conditioning: add
  freeze: false
  input_noise_std: 0.05
  noise_scheduler:
    # NOTE(review): beta_start / beta_end are set while use_flow_matching_scheduler
    # below is true — presumably unused in that mode; confirm.
    beta_end: 0.02
    beta_start: 0.0001
    clamp_range:
    - -1.5
    - 1.5
    freeze: false
    num_timesteps: 1000
    resume_from_checkpoint: null
    resume_weights_only: false
    type: noise_scheduler
  num_action_head_repeats: 8
  # Same value as proprioception_dim in the data section (0).
  proprioception_dim: 0
  resume_from_checkpoint: null
  resume_weights_only: false
  transformer:
    attn_name: torch_attn
    cast_output_to_float32: false
    ffn_type: swiglu
    freeze: false
    hidden_dim: 1024
    is_causal: true
    # Same value as seq_len in the data section (2048).
    max_seq_len: 2048
    n_heads: 16
    n_layers: 24
    norm_eps: 1.0e-05
    norm_type: default_layer_norm
    positional_embedding_type: rotary
    post_embed_norm: false
    qk_norm: false
    resume_from_checkpoint: null
    resume_weights_only: false
    type: transformer
    vocab_size: 50432
    weight_tying: false
  type: diffusion_policy
  use_diffusers_scheduler: false
  use_flow_matching_scheduler: true
  vision_language_backbone:
    freeze: false
    # Same checkpoint id as `processor` in the data section.
    hf_pretrained: Qwen/Qwen3-VL-2B-Thinking
    num_vlm_layers_to_use: 1
    resume_from_checkpoint: null
    resume_weights_only: false
    type: vlm_backbone
# Top-level run identity, schedule length, output location and experiment tracking.
# The name embeds what looks like a timestamp, followed by the model type
# (diffusion_policy), learning rate (5e-05) and batch size (1024).
name: 2026_03_24-01_40_07-model_diffusion_policy-lr_5e-05-bsz_1024
num_checkpoints: 10
num_epochs: null
remote_sync: null
remote_sync_fixed_path: null
resolve_configs: false
resolve_configs_path: null
# NOTE(review): with remote_sync null, outputs presumably land only under /tmp —
# confirm this is a placeholder and not the intended checkpoint location.
save_path: /tmp
total_train_samples: 100000000
total_val_samples: null
val_every_n_checkpoints: 1
wandb: true
wandb_entity: tri
wandb_project_name: vla_foundry
wandb_tags: []