# my-mdlm-ar-model / config.yaml
defaults:
  - _self_
  - /callbacks: [checkpoint_every_n_steps, checkpoint_monitor, learning_rate_monitor]
  - /data: Korean_dataset
  - /model: tiny-ar
  - /strategy: ddp
  - /noise: loglinear
  - /lr_scheduler: constant_warmup
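# Hydra composes this file with the config groups listed above; each entry is assumed to
# map to a file in the corresponding group directory (e.g. a hypothetical
# configs/data/Korean_dataset.yaml and configs/model/tiny-ar.yaml), following the layout
# of the upstream MDLM repository.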
mode: sample_eval # train / ppl_eval / sample_eval
diffusion: absorbing_state
backbone: ar # dit / dimamba / ar
parameterization: ar # subs / d3pm / sedd
time_conditioning: False
T: 0 # 0 (continuous time) / 1000
subs_masking: False
seed: 1
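# Note (assumption): with backbone and parameterization both set to `ar`, this is the
# autoregressive baseline of the MDLM codebase; the diffusion-specific fields above
# (diffusion, T, subs_masking) and the noise schedule are likely unused in this mode.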

loader:
  global_batch_size: 32
  eval_global_batch_size: ${.global_batch_size}
  # Note: batch_size and eval_batch_size are **per machine**
  batch_size: ${div_up:${.global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
  eval_batch_size: 1
  # ${div_up:${.eval_global_batch_size}, ${eval:${trainer.devices} * ${trainer.num_nodes}}}
  num_workers: ${eval:"len(__import__('os').sched_getaffinity(0))"}
  pin_memory: True
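# Worked example (assuming `div_up` is the ceiling-division resolver registered by the
# training code, as in the upstream MDLM repo): with global_batch_size=32 on a single
# node with 4 GPUs, batch_size resolves to div_up(32, 4 * 1) = 8 per device; with 1 GPU
# it resolves to 32. num_workers resolves to the number of CPU cores available to this
# process via os.sched_getaffinity(0).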

sampling:
  predictor: ddpm_cache  # analytic, ddpm, ddpm_cache
  steps: 128
  noise_removal: True
  # TODO(yair): @subham, why aren't these params under `eval`?
  num_sample_batches: 1  # Total samples: `num_gpus` * `loader.eval_batch_size` * num_sample_batches
  num_sample_log: 1
  semi_ar: False
  stride_length: 1
  num_strides: 1
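# With the values above (num_sample_batches=1, loader.eval_batch_size=1), a single-GPU
# sample_eval run generates 1 * 1 * 1 = 1 sample in total; raise num_sample_batches or
# loader.eval_batch_size to generate more.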

training:
  ema: 0.9999
  antithetic_sampling: True
  importance_sampling: False
  sampling_eps: 1e-3
  change_of_variables: False

eval:
  checkpoint_path: /home/elicer/lhb01/mdlm/outputs/parkseongjun/psjkodata/2025.04.05/051927/checkpoints/best.ckpt  # Used to evaluate a checkpoint after training.
  disable_ema: False
  compute_generative_perplexity: True
  perplexity_batch_size: 8
  compute_perplexity_on_sanity: False
  gen_ppl_eval_model_name_or_path: gpt2-large  # gpt2-large, meta-llama/Llama-2-7b-hf
  generate_samples: True
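# Generative perplexity is presumably computed by scoring the generated samples with the
# Hugging Face model named above (gpt2-large), in chunks of perplexity_batch_size. Since
# gpt2-large is an English model and the data group is Korean_dataset, these scores on
# Korean text should be read with care.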

optim:
  weight_decay: 0.01
  lr: 5e-5
  beta1: 0.9
  beta2: 0.999
  eps: 1e-8
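# Assumption: these fields map onto a standard AdamW optimizer, roughly
# torch.optim.AdamW(params, lr=5e-5, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.01),
# as constructed by the training code in the upstream MDLM repo.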

trainer:
  _target_: lightning.Trainer
  accelerator: cuda
  num_nodes: 1
  devices: ${device_count:}
  accumulate_grad_batches: ${div_up:${loader.global_batch_size}, ${eval:${trainer.devices} * ${loader.batch_size} * ${trainer.num_nodes}}}
  gradient_clip_val: 1.0
  precision: 'bf16'
  num_sanity_val_steps: 0
  max_steps: 50000
  log_every_n_steps: 10
  limit_train_batches: 1.0  # train on full dataset, can be used to toggle quick run
  limit_val_batches: 1.0  # validate on full dataset, can be used to toggle quick run
  val_check_interval: 0.5
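# Worked example (same ceiling-division assumption for `div_up` as above): on 1 node with
# 4 GPUs, loader.batch_size resolves to 8, so accumulate_grad_batches =
# div_up(32, 4 * 8 * 1) = 1; the per-device batch size already realizes the global batch
# size, so no gradient accumulation is needed. `${device_count:}` is assumed to be a
# custom resolver returning the number of visible GPUs.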

wandb:
  project: test-ar
  mode: online
  notes: Mulan for text
  resume: must
  group: null
  job_type: null
  name: ar
  id: f12b7c5e-07c9-48ae-96fa-4798823b8492
  tags:
    - ${noise.type}
    - ${data.train}
    - ${data.valid}
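# `resume: must` makes wandb.init fail unless a run with the id above already exists in
# the project; for a fresh run, change the id or set resume to `allow` or `never`.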

hydra:
  run:
    dir: ./outputs/${data.train}/${now:%Y.%m.%d}/${now:%H%M%S}
  job:
    chdir: true

checkpointing:
  # Use custom `save_dir` if, e.g., saving to S3 bucket, otherwise leave this parameter as is
  save_dir: ${cwd:}
  # Note: `checkpoints` path should correspond to `checkpoint_every_n_steps.dirpath`
  resume_from_ckpt: true
  resume_ckpt_path: /home/elicer/lhb01/mdlm/outputs/parkseongjun/psjkodata/2025.04.05/045928/checkpoints/last.ckpt
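# Example invocation (assumption: the entry point is main.py, as in the upstream MDLM
# repo, with Hydra overrides passed on the command line; the checkpoint path below is a
# placeholder):
#   python main.py mode=sample_eval eval.checkpoint_path=/path/to/best.ckpt sampling.num_sample_batches=4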