# Top-level Hydra config: composes the config groups listed under `defaults`
# and sets run-wide options for data loading, sampling, training, evaluation,
# optimization, the Lightning trainer, wandb logging and checkpointing.
#
# The `${div_up:...}`, `${eval:...}`, `${device_count:}` and `${cwd:}`
# interpolations are not defined in this file; presumably they are custom
# resolvers registered by the application before the config is loaded.

defaults:
  - _self_
  - /callbacks: [checkpoint_every_n_steps, checkpoint_monitor, learning_rate_monitor]
  - /data: Korean_dataset
  - /model: tiny-ar
  - /strategy: ddp
  - /noise: loglinear
  - /lr_scheduler: constant_warmup

mode: sample_eval  # train / ppl_eval / sample_eval
diffusion: absorbing_state
backbone: ar  # dit / dimamba / ar
parameterization: ar  # subs / d3pm / sedd
time_conditioning: false
T: 0  # 0 (continuous time) / 1000
subs_masking: false

seed: 1

loader:
  global_batch_size: 32
  eval_global_batch_size: ${.global_batch_size}
  # Note: batch_size and eval_batch_size are **per machine**
  batch_size: ${div_up:${.global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
  # Pinned to 1; the derived formula is kept here for reference:
  # ${div_up:${.eval_global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
  eval_batch_size: 1
  num_workers: ${eval:"len(__import__('os').sched_getaffinity(0))"}
  pin_memory: true

sampling:
  predictor: ddpm_cache  # analytic, ddpm, ddpm_cache
  steps: 128
  noise_removal: true
  # TODO(yair): @subham, why aren't these params under `eval`?
  num_sample_batches: 1  # Total samples: `num_gpus` * `loader.eval_batch_size` * num_sample_batches
  num_sample_log: 1
  semi_ar: false
  stride_length: 1
  num_strides: 1

training:
  ema: 0.9999
  antithetic_sampling: true
  importance_sampling: false
  sampling_eps: 1e-3
  change_of_variables: false

eval:
  # Used to evaluate a checkpoint after training.
  # NOTE(review): absolute, machine-specific path; it also points at a
  # different run directory than `checkpointing.resume_ckpt_path` below.
  # Confirm both are intended before reusing this config elsewhere.
  checkpoint_path: /home/elicer/lhb01/mdlm/outputs/parkseongjun/psjkodata/2025.04.05/051927/checkpoints/best.ckpt
  disable_ema: false
  compute_generative_perplexity: true
  perplexity_batch_size: 8
  compute_perplexity_on_sanity: false
  gen_ppl_eval_model_name_or_path: gpt2-large  # gpt2-large, meta-llama/Llama-2-7b-hf
  generate_samples: true

optim:
  weight_decay: 0.01
  lr: 5e-5
  beta1: 0.9
  beta2: 0.999
  eps: 1e-8

trainer:
  _target_: lightning.Trainer
  accelerator: cuda
  num_nodes: 1
  devices: ${device_count:}
  accumulate_grad_batches: ${div_up:${loader.global_batch_size}, ${eval:${trainer.devices} * ${loader.batch_size} * ${trainer.num_nodes}}}
  gradient_clip_val: 1.0
  precision: 'bf16'
  num_sanity_val_steps: 0
  max_steps: 50000
  log_every_n_steps: 10
  limit_train_batches: 1.0  # train on full dataset, can be used to toggle quick run
  limit_val_batches: 1.0  # validate on full dataset, can be used to toggle quick run
  val_check_interval: 0.5

wandb:
  project: test-ar
  mode: online
  notes: Mulan for text
  # NOTE(review): `resume: must` combined with a fixed `id` presumably
  # requires that run to already exist in wandb -- confirm before launching
  # a fresh run with this config.
  resume: must
  group: null
  job_type: null
  name: ar
  id: f12b7c5e-07c9-48ae-96fa-4798823b8492
  tags:
    - ${noise.type}
    - ${data.train}
    - ${data.valid}

hydra:
  run:
    dir: ./outputs/${data.train}/${now:%Y.%m.%d}/${now:%H%M%S}
  job:
    chdir: true

checkpointing:
  # Use custom `save_dir` if, e.g., saving to S3 bucket, otherwise leave this parameter as is
  save_dir: ${cwd:}
  # Note: `checkpoints` path should correspond to `checkpoint_every_n_steps.dirpath`
  resume_from_ckpt: true
  # NOTE(review): absolute, machine-specific path rather than one derived
  # from `save_dir`; confirm it exists on the target machine.
  resume_ckpt_path: /home/elicer/lhb01/mdlm/outputs/parkseongjun/psjkodata/2025.04.05/045928/checkpoints/last.ckpt