diff --git "a/debug.log" "b/debug.log" new file mode 100644--- /dev/null +++ "b/debug.log" @@ -0,0 +1,754 @@ +[2026-01-11 04:24:19,378] [WARNING] [axolotl.utils.trainer.prepare_optim_env:644] [PID:4935] P2P support not detected, setting `NCCL_P2P_DISABLE=1` +[2026-01-11 04:24:19,378] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:4935] bf16 support detected, enabling for this configuration. +[2026-01-11 04:24:19,702] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:4935] baseline 0.000GB () +[2026-01-11 04:24:19,703] [INFO] [axolotl.cli.config.load_cfg:263] [PID:4935] config: +{ + "activation_offloading": false, + "adapter": "qlora", + "axolotl_config_path": "olmo-stage1.yaml", + "base_model": "allenai/Olmo-3.1-32B-Instruct", + "base_model_config": "allenai/Olmo-3.1-32B-Instruct", + "batch_size": 4, + "bf16": true, + "capabilities": { + "bf16": true, + "compute_capability": "sm_86", + "fp8": false, + "n_gpu": 2, + "n_node": 1 + }, + "context_parallel_size": 2, + "cut_cross_entropy": true, + "dataloader_num_workers": 2, + "dataloader_pin_memory": true, + "dataloader_prefetch_factor": 256, + "dataset_num_proc": 24, + "dataset_prepared_path": "last_run_prepared", + "datasets": [ + { + "message_property_mappings": { + "content": "content", + "role": "role" + }, + "path": "../marvin_no_anthologies.json", + "trust_remote_code": false, + "type": "completion" + } + ], + "ddp": true, + "device": "cuda:0", + "device_map": { + "": 0 + }, + "dion_rank_fraction": 1.0, + "dion_rank_multiple_of": 1, + "env_capabilities": { + "torch_version": "2.9.1" + }, + "eval_batch_size": 1, + "eval_causal_lm_metrics": [ + "sacrebleu", + "comet", + "ter", + "chrf" + ], + "eval_max_new_tokens": 128, + "eval_sample_packing": true, + "eval_steps": 0.2, + "eval_table_size": 0, + "evals_per_epoch": 5, + "experimental_skip_move_to_device": true, + "flash_attention": true, + "fp16": false, + "fsdp": [ + "full_shard", + "auto_wrap" + ], + "fsdp_config": { + "activation_checkpointing": true, + "auto_wrap_policy": "TRANSFORMER_BASED_WRAP", + "cpu_ram_efficient_loading": true, + "offload_params": true, + "state_dict_type": "FULL_STATE_DICT", + "sync_module_states": true, + "transformer_layer_cls_to_wrap": "Olmo3DecoderLayer", + "use_orig_params": false + }, + "gc_steps": 10, + "gradient_accumulation_steps": 2, + "gradient_checkpointing": false, + "group_by_length": false, + "heads_k_stride": 1, + "include_tkps": true, + "learning_rate": 1e-05, + "liger_glu_activation": true, + "liger_layer_norm": true, + "liger_rms_norm": true, + "liger_rope": true, + "lisa_layers_attribute": "model.layers", + "load_best_model_at_end": false, + "load_in_4bit": true, + "load_in_8bit": false, + "local_rank": 0, + "logging_steps": 1, + "lora_alpha": 8, + "lora_dropout": 0.05, + "lora_r": 32, + "lora_target_linear": true, + "loraplus_lr_embedding": 1e-06, + "lr_scheduler": "cosine", + "max_grad_norm": 1.0, + "mean_resizing_embeddings": false, + "micro_batch_size": 1, + "model_config_type": "olmo3", + "num_epochs": 1.0, + "optimizer": "adamw_torch_fused", + "otel_metrics_host": "localhost", + "otel_metrics_port": 8000, + "output_dir": "ckpts-olmo-qlora/marvin-full", + "pad_to_sequence_len": true, + "peft_use_rslora": true, + "plugins": [ + "axolotl.integrations.liger.LigerPlugin", + "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin" + ], + "pretrain_multipack_attn": true, + "profiler_steps_start": 0, + "qlora_sharded_model_loading": false, + "ray_num_workers": 1, + "resources_per_worker": { + "GPU": 1 + }, + "ring_attn_func": "varlen_llama3", + "sample_packing": true, + "sample_packing_bin_size": 200, + "sample_packing_group_size": 100000, + "save_only_model": false, + "save_safetensors": true, + "save_total_limit": 2, + "saves_per_epoch": 1, + "seed": 69, + "sequence_len": 32768, + "sequence_parallel_degree": 2, + "shuffle_before_merging_datasets": false, + "shuffle_merged_datasets": true, + "skip_prepare_dataset": false, + "streaming_multipack_buffer_size": 10000, + "strict": false, + "tensor_parallel_size": 1, + "tiled_mlp_use_original_mlp": true, + "tokenizer_config": "allenai/Olmo-3.1-32B-Instruct", + "tokenizer_save_jinja_files": true, + "torch_dtype": "torch.bfloat16", + "train_on_inputs": false, + "trl": { + "log_completions": false, + "mask_truncated_completions": false, + "ref_model_mixup_alpha": 0.9, + "ref_model_sync_steps": 64, + "scale_rewards": true, + "sync_ref_model": false, + "use_vllm": false, + "vllm_server_host": "0.0.0.0", + "vllm_server_port": 8000 + }, + "use_otel_metrics": false, + "use_ray": false, + "use_wandb": true, + "val_set_size": 0.025, + "vllm": { + "device": "auto", + "dtype": "auto", + "gpu_memory_utilization": 0.9, + "host": "0.0.0.0", + "port": 8000 + }, + "wandb_name": "marvin-qlora-full", + "wandb_project": "Olmo3", + "warmup_ratio": 0.025, + "weight_decay": 0.01, + "world_size": 2 +} +[2026-01-11 04:24:20,543] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:4935] EOS: 100257 / <|endoftext|> +[2026-01-11 04:24:20,543] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:4935] BOS: 100257 / <|endoftext|> +[2026-01-11 04:24:20,543] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:282] [PID:4935] PAD: 100277 / <|pad|> +[2026-01-11 04:24:20,543] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:283] [PID:4935] UNK: 100257 / <|endoftext|> +[2026-01-11 04:24:38,731] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:475] [PID:4935] Loading prepared dataset from disk at last_run_prepared/8d6cb9376a109abc6ac61340266a9d2f... +[2026-01-11 04:24:38,747] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:4935] total_num_tokens: 607_040 +[2026-01-11 04:24:38,770] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:4935] `total_supervised_tokens: 607_040` +[2026-01-11 04:24:38,851] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4935] Using single process for pack_parallel, running sequentially. +[2026-01-11 04:24:39,552] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4935] Using single process for pack_parallel, running sequentially. +[2026-01-11 04:24:39,778] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4935] generate_batches time: 0.22662973403930664 +[2026-01-11 04:24:39,779] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4935] Using single process for pack_parallel, running sequentially. +[2026-01-11 04:24:40,005] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4935] generate_batches time: 0.22648382186889648 +[2026-01-11 04:24:40,005] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4935] Using single process for pack_parallel, running sequentially. +[2026-01-11 04:24:40,232] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4935] generate_batches time: 0.22647571563720703 +[2026-01-11 04:24:40,232] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4935] Using single process for pack_parallel, running sequentially. +[2026-01-11 04:24:40,458] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4935] generate_batches time: 0.22629952430725098 +[2026-01-11 04:24:41,068] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:4935] gather_len_batches: [19, 19] +[2026-01-11 04:24:41,131] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:4935] data_loader_len: 4 +[2026-01-11 04:24:41,144] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:4935] sample_packing_eff_est across ranks: [0.9750205874443054, 0.9750205874443054] +[2026-01-11 04:24:41,144] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:4935] sample_packing_eff_est: None +[2026-01-11 04:24:41,144] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:4935] total_num_steps: 8 +[2026-01-11 04:24:41,207] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:4935] total_num_tokens: 21_945_632 +[2026-01-11 04:24:41,350] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:4935] `total_supervised_tokens: 21_945_632` +[2026-01-11 04:24:41,359] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4935] Using single process for pack_parallel, running sequentially. +[2026-01-11 04:24:41,585] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4935] Using single process for pack_parallel, running sequentially. +[2026-01-11 04:24:41,811] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4935] generate_batches time: 0.2260892391204834 +[2026-01-11 04:24:41,811] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4935] Using single process for pack_parallel, running sequentially. +[2026-01-11 04:24:42,037] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4935] generate_batches time: 0.22601652145385742 +[2026-01-11 04:24:42,037] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4935] Using single process for pack_parallel, running sequentially. +[2026-01-11 04:24:42,263] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4935] generate_batches time: 0.2259969711303711 +[2026-01-11 04:24:42,264] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:4935] Using single process for pack_parallel, running sequentially. +[2026-01-11 04:24:42,490] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:4935] generate_batches time: 0.22606921195983887 +[2026-01-11 04:24:42,490] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:4935] gather_len_batches: [677, 678] +[2026-01-11 04:24:42,491] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:4935] data_loader_len: 169 +[2026-01-11 04:24:42,491] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:4935] sample_packing_eff_est across ranks: [0.9877987504005432, 0.9877987504005432] +[2026-01-11 04:24:42,492] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:4935] sample_packing_eff_est: 0.99 +[2026-01-11 04:24:42,492] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:4935] total_num_steps: 338 +[2026-01-11 04:24:42,492] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:4935] Maximum number of steps set at 338 +[2026-01-11 04:24:42,505] [DEBUG] [axolotl.train.setup_model_and_tokenizer:70] [PID:4935] loading tokenizer... allenai/Olmo-3.1-32B-Instruct +[2026-01-11 04:24:43,213] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:4935] EOS: 100257 / <|endoftext|> +[2026-01-11 04:24:43,213] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:4935] BOS: 100257 / <|endoftext|> +[2026-01-11 04:24:43,213] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:282] [PID:4935] PAD: 100277 / <|pad|> +[2026-01-11 04:24:43,213] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:283] [PID:4935] UNK: 100257 / <|endoftext|> +[2026-01-11 04:24:43,213] [DEBUG] [axolotl.train.setup_model_and_tokenizer:82] [PID:4935] Loading model +[2026-01-11 04:24:43,408] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:4935] Patched Trainer.evaluation_loop with nanmean loss calculation +[2026-01-11 04:24:43,409] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:4935] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation +[2026-01-11 04:24:43,411] [DEBUG] [axolotl.monkeypatch.transformers.trainer_context_parallel.patch_prepare_context_parallel_inputs:64] [PID:4935] Patched Trainer._prepare_context_parallel_inputs for FlashAttention + CP +[2026-01-11 04:24:43,413] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:4935] Applying multipack dataloader patch for sample packing... +[2026-01-11 04:24:43,508] [INFO] [axolotl.integrations.liger.plugin.pre_model_load:98] [PID:4935] Applying LIGER to olmo3 with kwargs: {'rope': True, 'cross_entropy': None, 'fused_linear_cross_entropy': None, 'rms_norm': True, 'swiglu': True} +[2026-01-11 04:24:43,601] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:94] [PID:4935] Applying Cut Cross Entropy to model type: olmo3 + Loading checkpoint shards: 0%| | 0/14 [00:00