Buckets:
| import{s as Kn,o as ei,n as pl}from"../chunks/scheduler.56725da7.js";import{S as li,i as ti,e as s,s as i,c as p,h as ni,a as r,d as t,b as a,f as Dn,g as y,j as M,k as qn,l as ii,m as n,n as c,t as U,o as T,p as u}from"../chunks/index.18a26576.js";import{T as ol}from"../chunks/Tip.5b941656.js";import{C as ai}from"../chunks/CopyLLMTxtMenu.4513c8ed.js";import{C as B}from"../chunks/CodeBlock.58e3e98b.js";import{H as J}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.049405bf.js";function si($){let m,b="All the training examples in the optimum-neuron repo use these parallelism features via the <code>NeuronTrainer</code>.";return{c(){m=s("p"),m.innerHTML=b},l(d){m=r(d,"P",{"data-svelte-h":!0}),M(m)!=="svelte-1s96mwn"&&(m.innerHTML=b)},m(d,w){n(d,m,w)},p:pl,d(d){d&&t(m)}}}function ri($){let m,b="Since the example scripts use the <code>NeuronTrainer</code>, you can enable ZeRO-1 when using them by adding the <code>--zero_1</code> flag to your command line.",d,w,g="For example:",f,h,C;return h=new B({props:{code:"dG9yY2hydW4lMjAtLW5wcm9jX3Blcl9ub2RlJTNEMiUyMGV4YW1wbGVzJTJGdHJhaW5pbmclMkZxd2VuMyUyRmZpbmV0dW5lX3F3ZW4zLnB5JTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1tb2RlbF9uYW1lX29yX3BhdGglMjBRd2VuJTJGUXdlbjIuNS0wLjVCJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1kYXRhc2V0X25hbWUlMjB3aWtpdGV4dCUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tZGF0YXNldF9jb25maWdfbmFtZSUyMHdpa2l0ZXh0LTItcmF3LXYxJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1kb190cmFpbiUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tcGVyX2RldmljZV90cmFpbl9iYXRjaF9zaXplJTIwMSUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tYmxvY2tfc2l6ZSUyMDEwMjQlMjAlNUMlMEElMjAlMjAlMjAlMjAtLWJmMTYlMjAlNUMlMEElMjAlMjAlMjAlMjAtLXplcm9fMSUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tdGVuc29yX3BhcmFsbGVsX3NpemUlMjAyJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1vdXRwdXRfZGlyJTIwbXlfdHJhaW5pbmclMkY=",highlighted:`torchrun --nproc_per_node=2 examples/training/qwen3/finetune_qwen3.py \\ | |
| --model_name_or_path Qwen/Qwen2.5-0.5B \\ | |
| --dataset_name wikitext \\ | |
| --dataset_config_name wikitext-2-raw-v1 \\ | |
| --do_train \\ | |
| --per_device_train_batch_size 1 \\ | |
| --block_size 1024 \\ | |
| --bf16 \\ | |
| --zero_1 \\ | |
| --tensor_parallel_size 2 \\ | |
| --output_dir my_training/`,wrap:!1}}),{c(){m=s("p"),m.innerHTML=b,d=i(),w=s("p"),w.textContent=g,f=i(),p(h.$$.fragment)},l(o){m=r(o,"P",{"data-svelte-h":!0}),M(m)!=="svelte-nmzr1t"&&(m.innerHTML=b),d=a(o),w=r(o,"P",{"data-svelte-h":!0}),M(w)!=="svelte-1gkqha7"&&(w.textContent=g),f=a(o),y(h.$$.fragment,o)},m(o,j){n(o,m,j),n(o,d,j),n(o,w,j),n(o,f,j),c(h,o,j),C=!0},p:pl,i(o){C||(U(h.$$.fragment,o),C=!0)},o(o){T(h.$$.fragment,o),C=!1},d(o){o&&(t(m),t(d),t(w),t(f)),u(h,o)}}}function Mi($){let m,b="Since the example scripts use the <code>NeuronTrainer</code>, you can enable Tensor Parallelism when using them by specifying the <code>--tensor_parallel_size</code> argument.",d,w,g="For example:",f,h,C;return h=new B({props:{code:"dG9yY2hydW4lMjAtLW5wcm9jX3Blcl9ub2RlJTNEOCUyMGV4YW1wbGVzJTJGdHJhaW5pbmclMkZxd2VuMyUyRmZpbmV0dW5lX3F3ZW4zLnB5JTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1tb2RlbF9uYW1lX29yX3BhdGglMjBRd2VuJTJGUXdlbjIuNS0wLjVCJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1kYXRhc2V0X25hbWUlMjB3aWtpdGV4dCUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tZGF0YXNldF9jb25maWdfbmFtZSUyMHdpa2l0ZXh0LTItcmF3LXYxJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1kb190cmFpbiUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tcGVyX2RldmljZV90cmFpbl9iYXRjaF9zaXplJTIwMSUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tYmxvY2tfc2l6ZSUyMDEwMjQlMjAlNUMlMEElMjAlMjAlMjAlMjAtLWJmMTYlMjAlNUMlMEElMjAlMjAlMjAlMjAtLXRlbnNvcl9wYXJhbGxlbF9zaXplJTIwOCUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tb3V0cHV0X2RpciUyMG15X3RyYWluaW5nJTJG",highlighted:`torchrun --nproc_per_node=8 examples/training/qwen3/finetune_qwen3.py \\ | |
| --model_name_or_path Qwen/Qwen2.5-0.5B \\ | |
| --dataset_name wikitext \\ | |
| --dataset_config_name wikitext-2-raw-v1 \\ | |
| --do_train \\ | |
| --per_device_train_batch_size 1 \\ | |
| --block_size 1024 \\ | |
| --bf16 \\ | |
| --tensor_parallel_size 8 \\ | |
| --output_dir my_training/`,wrap:!1}}),{c(){m=s("p"),m.innerHTML=b,d=i(),w=s("p"),w.textContent=g,f=i(),p(h.$$.fragment)},l(o){m=r(o,"P",{"data-svelte-h":!0}),M(m)!=="svelte-7fhmhn"&&(m.innerHTML=b),d=a(o),w=r(o,"P",{"data-svelte-h":!0}),M(w)!=="svelte-1gkqha7"&&(w.textContent=g),f=a(o),y(h.$$.fragment,o)},m(o,j){n(o,m,j),n(o,d,j),n(o,w,j),n(o,f,j),c(h,o,j),C=!0},p:pl,i(o){C||(U(h.$$.fragment,o),C=!0)},o(o){T(h.$$.fragment,o),C=!1},d(o){o&&(t(m),t(d),t(w),t(f)),u(h,o)}}}function mi($){let m,b="When using pipeline parallelism, the total number of processes should be at least <code>tensor_parallel_size * pipeline_parallel_size</code>. For example, with <code>tensor_parallel_size=2</code> and <code>pipeline_parallel_size=4</code>, you need 8 processes total.";return{c(){m=s("p"),m.innerHTML=b},l(d){m=r(d,"P",{"data-svelte-h":!0}),M(m)!=="svelte-1ypidsl"&&(m.innerHTML=b)},m(d,w){n(d,m,w)},p:pl,d(d){d&&t(m)}}}function oi($){let m,b="The sharded checkpoints are saved under a directory called <code>shards</code>. The <code>optimum-cli neuron consolidate</code> command accepts as input both a directory that contains a <code>shards</code> directory, or the <code>shards</code> directory itself.";return{c(){m=s("p"),m.innerHTML=b},l(d){m=r(d,"P",{"data-svelte-h":!0}),M(m)!=="svelte-1sydsko"&&(m.innerHTML=b)},m(d,w){n(d,m,w)},p:pl,d(d){d&&t(m)}}}function pi($){let m,b,d,w,g,f,h,C,o,j="AWS Trainium instances provide powerful infrastructure for training large language models at scale. 
A <code>trn1.32xlarge</code> instance contains 16 Neuron devices with 32 cores total, offering 512GB of memory (16GB per core).",yl,V,Pt="However, training large models presents a fundamental challenge: by default, each Neuron core operates as an independent data-parallel worker, requiring the entire model, gradients, and optimizer state (approximately 4× the model size) to fit within a single core’s 16GB memory limit, with additional space needed for activations.",cl,G,Ot="For models that exceed these memory constraints, <code>optimum-neuron</code> provides sophisticated parallelism strategies that distribute computation and memory across multiple devices, enabling you to train models that would be impossible to fit on individual cores:",Ul,F,Tl,k,ul,R,Dt='<a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/tutorials/training/zero1_gpt2.html" rel="nofollow">ZeRO-1</a> is an optimizer-level optimization that reduces memory usage without changing your model architecture.',dl,X,qt="<strong>How it works</strong>: Shards the optimizer state (gradients, momentum, variance) across data-parallel ranks instead of replicating it on each device.",wl,N,Kt="<strong>Memory savings</strong>: Reduces optimizer memory usage by <code>1/data_parellel_size</code>.",Jl,H,en="<strong>When to use</strong>: Always beneficial when training with multiple devices, regardless of model size.",hl,A,bl,E,ln='<a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/tensor_parallelism_overview.html" rel="nofollow">Tensor Parallelism</a> splits individual model layers across multiple devices.',fl,Q,tn="<strong>How it works</strong>: Shards matrix multiplications (linear layers, attention) along rows or columns across devices. 
Each device computes part of each layer, requiring communication between devices for each forward/backward pass.",Cl,z,nn="<strong>Memory savings</strong>: Reduces model parameter memory by <code>1/tensor_parallel_size</code>.",jl,Y,an="<strong>When to use</strong>: When your model is too large to fit on a single device, even after applying ZeRO-1.",gl,x,sn="<strong>Typical deployment</strong>: Usually applied within a single node (intra-node) due to high communication requirements.",$l,S,rn="<strong>Trade-offs</strong>: Increases communication overhead between devices, which can slow down training if overused.",Bl,L,Il,P,Mn='<a href="https://arxiv.org/pdf/2205.05198.pdf" rel="nofollow">Sequence parallelism</a> is an optimization that works alongside Tensor Parallelism to further reduce memory usage.',vl,O,mn="<strong>How it works</strong>: Shards activations along the sequence dimension in regions where tensors are not already sharded by tensor parallelism.",_l,D,on="<strong>Memory savings</strong>: Reduces activation memory proportional to sequence length, especially beneficial for long sequences.",Wl,q,pn="<strong>When to use</strong>: Always enable when using tensor parallelism - it provides additional memory savings with minimal overhead.",Zl,K,yn="<strong>Requirement</strong>: Only works in combination with tensor parallelism.",Vl,ee,Gl,le,cn='<a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/pipeline_parallelism_overview.html" rel="nofollow">Pipeline Parallelism</a> splits model layers across different devices.',Fl,te,Un="<strong>How it works</strong>: Divides your model into stages, with each stage containing consecutive layers running on different devices. 
Uses microbatching to keep all devices busy.",kl,ne,Tn="<strong>Memory savings</strong>: Reduces model parameter memory by <code>1/pipeline_parallel_size</code>.",Rl,ie,un="<strong>When to use</strong>: For very large models that don’t fit even with tensor parallelism, or when you want to scale across many devices with less communication overhead than tensor parallelism.",Xl,ae,dn="<strong>Typical deployment</strong>: Usually applied across multiple nodes (inter-node) to scale to larger numbers of devices while minimizing high-bandwidth communication requirements.",Nl,se,wn="<strong>Trade-offs</strong>: Introduces pipeline bubbles (idle time) and requires careful tuning of microbatch sizes.",Hl,re,Jn="The good news is that it is possible to combine those techniques, and <code>optimum-neuron</code> makes it very easy!",Al,I,El,Me,Ql,me,hn="ZeRO-1 can be enabled either through the <code>NeuronTrainer</code> or directly with the <code>NeuronAccelerator</code>.",zl,oe,Yl,pe,xl,v,Sl,ye,Ll,ce,bn="When using the <code>NeuronAccelerator</code> directly, you need to create a <code>TrainingNeuronConfig</code> and enable ZeRO-1 separately:",Pl,Ue,Ol,Te,Dl,ue,fn="Tensor Parallelism can be used with either the <code>NeuronTrainer</code> or <code>NeuronAccelerator</code>.",ql,de,Cn="<strong>Important</strong>: Tensor parallelism requires models that have a custom modeling implementation in <code>optimum.neuron.models.training</code>.",Kl,we,jn="When doing Tensor Parallelism, you have several important settings:",et,Je,gn='<li>The <code>tensor_parallel_size</code>: Ideally it should be the smallest value for which the model fits in memory.</li> <li>Whether sequence parallelism should be enabled: <a href="https://arxiv.org/pdf/2205.05198.pdf" rel="nofollow">Sequence parallelism</a> shards the activations on the sequence axis outside of the tensor parallel regions, saving memory by sharding the activations.</li>',lt,he,$n="When using distributed training, the training script is 
called by <code>torchrun</code>, which will dispatch it to workers, one worker per core. Each worker will load the sharded model and dispatch the parameters automatically across the cores. The <code>tensor_parallel_size</code> is the number of workers to shard the model parameters on.",tt,be,nt,fe,it,_,at,Ce,st,je,Bn="When using the <code>NeuronAccelerator</code> directly, you configure tensor parallelism through the <code>TrainingNeuronConfig</code>:",rt,ge,Mt,$e,mt,Be,In="Pipeline Parallelism allows you to split your model layers across multiple devices, enabling training of very large models that wouldn’t fit on a single device, or even a signle node.",ot,Ie,vn="<strong>Important</strong>: Pipeline parallelism requires models that have a custom modeling implementation in <code>optimum.neuron.models.training</code> and declare <code>SUPPORTS_PIPELINE_PARALLELISM = True</code>.",pt,ve,yt,_e,_n="Pipeline parallelism has several configuration parameters:",ct,We,Wn="<li><code>pipeline_parallel_size</code>: Number of pipeline stages (devices to split layers across)</li> <li><code>pipeline_parallel_num_microbatches</code>: Number of microbatches for pipeline scheduling</li> <li>When pipeline parallelism is enabled, ZeRO-1 can be automatically applied to the pipeline parallel optimizer</li>",Ut,Ze,Tt,Ve,ut,Ge,dt,Fe,wt,W,Jt,ke,ht,Re,Zn="You can combine multiple parallelism strategies for maximum memory efficiency and performance. 
Here’s an example with all strategies combined:",bt,Xe,ft,Ne,Ct,He,Vn="This configuration uses 4 * 2 = 8 total processes:",jt,Ae,Gn="<li>Each tensor parallel group has 4 processes</li> <li>Each pipeline stage runs on one tensor parallel group</li>",gt,Ee,Fn="We can then run the training script on the <code>trn1.32xlarge</code> instance with 32 Neuron cores, resulting in the following configuration: <code>dp=4, tp=4, pp=2</code>, which means 4 data-parallel groups, each with 4 tensor-parallel devices, and 2 pipeline stages.",$t,Qe,Bt,ze,kn="Since distributed training uses sharded checkpoints across different workers, you need to consolidate them to create a standard model checkpoint that can be shared and used outside of the specific training configuration.",It,Ye,Rn="The Optimum CLI provides a way of doing that very easily via the <code>optimum neuron consolidate</code> command:",vt,xe,_t,Se,Xn=`All you need to do is specify the sharded checkpoints directory and the output directory that will contain the consolidated checkpoints, and the command takes care of the rest. | |
| It is also possible to specify the output format of the consolidated checkpoints. By default it will export them to the <code>safetensors</code> format, which is the recommended format to use.`,Wt,Le,Nn="Example:",Zt,Pe,Hn="Training with distributed parallelism just completed and the output dir is called <code>my_training</code>. The directory looks like the following:",Vt,Oe,Gt,De,An="You can consolidate the sharded checkpoints in <code>my_training/shards</code>, which correspond to the sharded checkpoints saved at the end of training, by running the following command:",Ft,qe,kt,Z,Rt,Ke,Xt,el,Nt,ll,En="<li><strong>Start with Tensor Parallelism</strong>: Use the smallest <code>tensor_parallel_size</code> that fits your model in memory</li> <li><strong>Add Pipeline Parallelism</strong>: For very large models, combine with pipeline parallelism</li> <li><strong>Enable Sequence Parallelism</strong>: Always enable when using tensor parallelism for memory savings (set <code>disable_sequence_parallel=False</code>)</li> <li><strong>Use ZeRO-1</strong>: Combine with any parallelism strategy for optimizer memory savings</li>",Ht,tl,At,nl,Qn="<li>Enable <code>gradient_checkpointing</code> for large models</li> <li>Set appropriate <code>pipeline_parallel_num_microbatches</code> for pipeline parallelism</li>",Et,il,Qt,al,zt,sl,zn="<li><strong>Out of Memory</strong>: Reduce batch size, increase parallelism, or enable gradient checkpointing</li> <li><strong>Model Not Supported</strong>: Ensure you’re using a model from <code>optimum.neuron.models.training</code></li> <li><strong>Pipeline Parallelism Fails</strong>: Check that the model supports pipeline parallelism</li> <li><strong>Incorrect Process Count</strong>: Ensure <code>nproc_per_node</code> matches your parallelism configuration</li>",Yt,rl,xt,Ml,Yn="<li>Start with smaller models and parallelism sizes</li> <li>Check that all processes can communicate properly</li> <li>Verify checkpoint directories and permissions</li> 
<li>Monitor Neuron device utilization</li>",St,ml,Lt;return g=new ai({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),h=new J({props:{title:"Distributed Training with optimum-neuron",local:"distributed-training-with-optimum-neuron",headingTag:"h1"}}),F=new J({props:{title:"Parallelism Strategies Overview",local:"parallelism-strategies-overview",headingTag:"h2"}}),k=new J({props:{title:"1. ZeRO-1 (Optimizer State Sharding)",local:"1-zero-1-optimizer-state-sharding",headingTag:"h3"}}),A=new J({props:{title:"2. Tensor Parallelism (Intra-layer Model Parallelism)",local:"2-tensor-parallelism-intra-layer-model-parallelism",headingTag:"h3"}}),L=new J({props:{title:"3. Sequence Parallelism (Activation Sharding)",local:"3-sequence-parallelism-activation-sharding",headingTag:"h3"}}),ee=new J({props:{title:"4. Pipeline Parallelism (Inter-layer Model Parallelism)",local:"4-pipeline-parallelism-inter-layer-model-parallelism",headingTag:"h3"}}),I=new ol({props:{$$slots:{default:[si]},$$scope:{ctx:$}}}),Me=new J({props:{title:"How to enable ZeRO-1?",local:"how-to-enable-zero-1",headingTag:"h2"}}),oe=new J({props:{title:"Via the NeuronTrainer",local:"via-the-neurontrainer",headingTag:"h3"}}),pe=new 
B({props:{code:"ZnJvbSUyMG9wdGltdW0ubmV1cm9uJTIwaW1wb3J0JTIwTmV1cm9uVHJhaW5pbmdBcmd1bWVudHMlMkMlMjBOZXVyb25UcmFpbmVyJTBBJTBBJTIzJTIwRW5hYmxlJTIwWmVSTy0xJTIwaW4lMjB0aGUlMjB0cmFpbmluZyUyMGFyZ3VtZW50cyUwQXRyYWluaW5nX2FyZ3MlMjAlM0QlMjBOZXVyb25UcmFpbmluZ0FyZ3VtZW50cyglMEElMjAlMjAlMjAlMjBvdXRwdXRfZGlyJTNEJTIyLiUyRm91dHB1dCUyMiUyQyUwQSUyMCUyMCUyMCUyMHBlcl9kZXZpY2VfdHJhaW5fYmF0Y2hfc2l6ZSUzRDElMkMlMEElMjAlMjAlMjAlMjB6ZXJvXzElM0RUcnVlJTJDJTIwJTIwJTIzJTIwRW5hYmxlJTIwWmVSTy0xJTBBJTIwJTIwJTIwJTIwYmYxNiUzRFRydWUlMkMlMEElMjAlMjAlMjAlMjAlMjMlMjAuLi4lMjBvdGhlciUyMHRyYWluaW5nJTIwYXJndW1lbnRzJTBBKSUwQSUwQXRyYWluZXIlMjAlM0QlMjBOZXVyb25UcmFpbmVyKCUwQSUyMCUyMCUyMCUyMG1vZGVsJTNEbW9kZWwlMkMlMEElMjAlMjAlMjAlMjBhcmdzJTNEdHJhaW5pbmdfYXJncyUyQyUwQSUyMCUyMCUyMCUyMHRyYWluX2RhdGFzZXQlM0R0cmFpbl9kYXRhc2V0JTJDJTBBJTIwJTIwJTIwJTIwZXZhbF9kYXRhc2V0JTNEZXZhbF9kYXRhc2V0JTJDJTBBKSUwQSUwQXRyYWluZXIudHJhaW4oKQ==",highlighted:`<span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronTrainingArguments, NeuronTrainer | |
| <span class="hljs-comment"># Enable ZeRO-1 in the training arguments</span> | |
| training_args = NeuronTrainingArguments( | |
| output_dir=<span class="hljs-string">"./output"</span>, | |
| per_device_train_batch_size=<span class="hljs-number">1</span>, | |
| zero_1=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Enable ZeRO-1</span> | |
| bf16=<span class="hljs-literal">True</span>, | |
| <span class="hljs-comment"># ... other training arguments</span> | |
| ) | |
| trainer = NeuronTrainer( | |
| model=model, | |
| args=training_args, | |
| train_dataset=train_dataset, | |
| eval_dataset=eval_dataset, | |
| ) | |
| trainer.train()`,wrap:!1}}),v=new ol({props:{$$slots:{default:[ri]},$$scope:{ctx:$}}}),ye=new J({props:{title:"Via the NeuronAccelerator",local:"via-the-neuronaccelerator",headingTag:"h3"}}),Ue=new B({props:{code:"ZnJvbSUyMHRvcmNoLm9wdGltJTIwaW1wb3J0JTIwQWRhbVclMEFmcm9tJTIwb3B0aW11bS5uZXVyb24lMjBpbXBvcnQlMjBOZXVyb25BY2NlbGVyYXRvciUwQWZyb20lMjBvcHRpbXVtLm5ldXJvbi5tb2RlbHMudHJhaW5pbmcuY29uZmlnJTIwaW1wb3J0JTIwVHJhaW5pbmdOZXVyb25Db25maWclMEElMEElMjMlMjBDcmVhdGUlMjB0aGUlMjB0cmFpbmluZyUyMGNvbmZpZ3VyYXRpb24lMEF0cm5fY29uZmlnJTIwJTNEJTIwVHJhaW5pbmdOZXVyb25Db25maWcoKSUwQSUwQSUyMyUyMENyZWF0ZSUyMGFjY2VsZXJhdG9yJTIwd2l0aCUyMFplUk8tMSUyMGVuYWJsZWQlMEFhY2NlbGVyYXRvciUyMCUzRCUyME5ldXJvbkFjY2VsZXJhdG9yKCUwQSUyMCUyMCUyMCUyMHRybl9jb25maWclM0R0cm5fY29uZmlnJTJDJTBBJTIwJTIwJTIwJTIwemVyb18xJTNEVHJ1ZSUyQyUyMCUyMCUyMyUyMEVuYWJsZSUyMFplUk8tMSUwQSUyMCUyMCUyMCUyMG1peGVkX3ByZWNpc2lvbiUzRCUyMmJmMTYlMjIlMkMlMEEpJTBBJTBBbW9kZWwlMjAlM0QlMjAuLi4lMjAlMjAlMjMlMjBZb3VyJTIwbW9kZWwlMjBpbnN0YW5jZSUwQW9wdGltaXplciUyMCUzRCUyMEFkYW1XKG1vZGVsLnBhcmFtZXRlcnMoKSUyQyUyMGxyJTNENWUtNSklMEElMEElMjMlMjBQcmVwYXJlJTIwbW9kZWwlMjBhbmQlMjBvcHRpbWl6ZXIlMEFtb2RlbCUyQyUyMG9wdGltaXplciUyMCUzRCUyMGFjY2VsZXJhdG9yLnByZXBhcmUobW9kZWwlMkMlMjBvcHRpbWl6ZXIp",highlighted:`<span class="hljs-keyword">from</span> torch.optim <span class="hljs-keyword">import</span> AdamW | |
| <span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronAccelerator | |
| <span class="hljs-keyword">from</span> optimum.neuron.models.training.config <span class="hljs-keyword">import</span> TrainingNeuronConfig | |
| <span class="hljs-comment"># Create the training configuration</span> | |
| trn_config = TrainingNeuronConfig() | |
| <span class="hljs-comment"># Create accelerator with ZeRO-1 enabled</span> | |
| accelerator = NeuronAccelerator( | |
| trn_config=trn_config, | |
| zero_1=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Enable ZeRO-1</span> | |
| mixed_precision=<span class="hljs-string">"bf16"</span>, | |
| ) | |
| model = ... <span class="hljs-comment"># Your model instance</span> | |
| optimizer = AdamW(model.parameters(), lr=<span class="hljs-number">5e-5</span>) | |
| <span class="hljs-comment"># Prepare model and optimizer</span> | |
| model, optimizer = accelerator.prepare(model, optimizer)`,wrap:!1}}),Te=new J({props:{title:"How to enable Tensor Parallelism?",local:"how-to-enable-tensor-parallelism",headingTag:"h2"}}),be=new J({props:{title:"Via the NeuronTrainer",local:"via-the-neurontrainer",headingTag:"h3"}}),fe=new B({props:{code:"ZnJvbSUyMG9wdGltdW0ubmV1cm9uJTIwaW1wb3J0JTIwTmV1cm9uVHJhaW5pbmdBcmd1bWVudHMlMkMlMjBOZXVyb25UcmFpbmVyJTBBJTBBJTIzJTIwQ29uZmlndXJlJTIwdGVuc29yJTIwcGFyYWxsZWxpc20lMjBpbiUyMHRyYWluaW5nJTIwYXJndW1lbnRzJTBBdHJhaW5pbmdfYXJncyUyMCUzRCUyME5ldXJvblRyYWluaW5nQXJndW1lbnRzKCUwQSUyMCUyMCUyMCUyMG91dHB1dF9kaXIlM0QlMjIuJTJGb3V0cHV0JTIyJTJDJTBBJTIwJTIwJTIwJTIwcGVyX2RldmljZV90cmFpbl9iYXRjaF9zaXplJTNEMSUyQyUwQSUyMCUyMCUyMCUyMGJmMTYlM0RUcnVlJTJDJTBBJTIwJTIwJTIwJTIwdGVuc29yX3BhcmFsbGVsX3NpemUlM0Q4JTJDJTBBJTIwJTIwJTIwJTIwJTIzJTIwLi4uJTIwb3RoZXIlMjB0cmFpbmluZyUyMGFyZ3VtZW50cyUwQSklMEElMEF0cmFpbmVyJTIwJTNEJTIwTmV1cm9uVHJhaW5lciglMEElMjAlMjAlMjAlMjBtb2RlbCUzRG1vZGVsJTJDJTBBJTIwJTIwJTIwJTIwYXJncyUzRHRyYWluaW5nX2FyZ3MlMkMlMEElMjAlMjAlMjAlMjB0cmFpbl9kYXRhc2V0JTNEdHJhaW5fZGF0YXNldCUyQyUwQSUyMCUyMCUyMCUyMGV2YWxfZGF0YXNldCUzRGV2YWxfZGF0YXNldCUyQyUwQSklMEElMEF0cmFpbmVyLnRyYWluKCk=",highlighted:`<span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronTrainingArguments, NeuronTrainer | |
| <span class="hljs-comment"># Configure tensor parallelism in training arguments</span> | |
| training_args = NeuronTrainingArguments( | |
| output_dir=<span class="hljs-string">"./output"</span>, | |
| per_device_train_batch_size=<span class="hljs-number">1</span>, | |
| bf16=<span class="hljs-literal">True</span>, | |
| tensor_parallel_size=<span class="hljs-number">8</span>, | |
| <span class="hljs-comment"># ... other training arguments</span> | |
| ) | |
| trainer = NeuronTrainer( | |
| model=model, | |
| args=training_args, | |
| train_dataset=train_dataset, | |
| eval_dataset=eval_dataset, | |
| ) | |
| trainer.train()`,wrap:!1}}),_=new ol({props:{$$slots:{default:[Mi]},$$scope:{ctx:$}}}),Ce=new J({props:{title:"Via the NeuronAccelerator",local:"via-the-neuronaccelerator",headingTag:"h3"}}),ge=new B({props:{code:"ZnJvbSUyMHRvcmNoLm9wdGltJTIwaW1wb3J0JTIwQWRhbVclMEFmcm9tJTIwb3B0aW11bS5uZXVyb24lMjBpbXBvcnQlMjBOZXVyb25BY2NlbGVyYXRvciUwQWZyb20lMjBvcHRpbXVtLm5ldXJvbi5tb2RlbHMudHJhaW5pbmcuY29uZmlnJTIwaW1wb3J0JTIwVHJhaW5pbmdOZXVyb25Db25maWclMEElMEElMjMlMjBDb25maWd1cmUlMjB0ZW5zb3IlMjBwYXJhbGxlbGlzbSUwQXRybl9jb25maWclMjAlM0QlMjBUcmFpbmluZ05ldXJvbkNvbmZpZyglMEElMjAlMjAlMjAlMjB0ZW5zb3JfcGFyYWxsZWxfc2l6ZSUzRDglMkMlMEElMjAlMjAlMjAlMjBzZXF1ZW5jZV9wYXJhbGxlbF9lbmFibGVkJTNEVHJ1ZSUyQyUwQSUyMCUyMCUyMCUyMGNoZWNrcG9pbnRfZGlyJTNETm9uZSUyQyUyMCUyMCUyMyUyMENhbiUyMGJlJTIwc3BlY2lmaWVkJTIwd2hlbiUyMHJlc3VtaW5nJTIwZnJvbSUyMGNoZWNrcG9pbnQlMEEpJTBBJTBBYWNjZWxlcmF0b3IlMjAlM0QlMjBOZXVyb25BY2NlbGVyYXRvciglMEElMjAlMjAlMjAlMjB0cm5fY29uZmlnJTNEdHJuX2NvbmZpZyUyQyUwQSUyMCUyMCUyMCUyMG1peGVkX3ByZWNpc2lvbiUzRCUyMmJmMTYlMjIlMkMlMEEpJTBBJTBBbW9kZWwlMjAlM0QlMjAuLi4lMjAlMjAlMjMlMjBZb3VyJTIwbW9kZWwlMjBpbnN0YW5jZSUwQW9wdGltaXplciUyMCUzRCUyMEFkYW1XKG1vZGVsLnBhcmFtZXRlcnMoKSUyQyUyMGxyJTNENWUtNSklMEElMEFtb2RlbCUyQyUyMG9wdGltaXplciUyMCUzRCUyMGFjY2VsZXJhdG9yLnByZXBhcmUobW9kZWwlMkMlMjBvcHRpbWl6ZXIp",highlighted:`<span class="hljs-keyword">from</span> torch.optim <span class="hljs-keyword">import</span> AdamW | |
| <span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronAccelerator | |
| <span class="hljs-keyword">from</span> optimum.neuron.models.training.config <span class="hljs-keyword">import</span> TrainingNeuronConfig | |
| <span class="hljs-comment"># Configure tensor parallelism</span> | |
| trn_config = TrainingNeuronConfig( | |
| tensor_parallel_size=<span class="hljs-number">8</span>, | |
| sequence_parallel_enabled=<span class="hljs-literal">True</span>, | |
| checkpoint_dir=<span class="hljs-literal">None</span>, <span class="hljs-comment"># Can be specified when resuming from checkpoint</span> | |
| ) | |
| accelerator = NeuronAccelerator( | |
| trn_config=trn_config, | |
| mixed_precision=<span class="hljs-string">"bf16"</span>, | |
| ) | |
| model = ... <span class="hljs-comment"># Your model instance</span> | |
| optimizer = AdamW(model.parameters(), lr=<span class="hljs-number">5e-5</span>) | |
| model, optimizer = accelerator.prepare(model, optimizer)`,wrap:!1}}),$e=new J({props:{title:"How to enable Pipeline Parallelism?",local:"how-to-enable-pipeline-parallelism",headingTag:"h2"}}),ve=new J({props:{title:"Configuration Options",local:"configuration-options",headingTag:"h3"}}),Ze=new J({props:{title:"Via the NeuronTrainer",local:"via-the-neurontrainer",headingTag:"h3"}}),Ve=new B({props:{code:"ZnJvbSUyMG9wdGltdW0ubmV1cm9uJTIwaW1wb3J0JTIwTmV1cm9uVHJhaW5pbmdBcmd1bWVudHMlMkMlMjBOZXVyb25UcmFpbmVyJTBBZnJvbSUyMG9wdGltdW0ubmV1cm9uLm1vZGVscy50cmFpbmluZyUyMGltcG9ydCUyMExsYW1hRm9yQ2F1c2FsTE0lMjAlMjAlMjMlMjBDdXN0b20lMjBtb2RlbCUyMGltcGxlbWVudGF0aW9uJTBBJTBBJTIzJTIwQ29uZmlndXJlJTIwcGlwZWxpbmUlMjBwYXJhbGxlbGlzbSUyMGluJTIwdHJhaW5pbmclMjBhcmd1bWVudHMlMEF0cmFpbmluZ19hcmdzJTIwJTNEJTIwTmV1cm9uVHJhaW5pbmdBcmd1bWVudHMoJTBBJTIwJTIwJTIwJTIwb3V0cHV0X2RpciUzRCUyMi4lMkZvdXRwdXQlMjIlMkMlMEElMjAlMjAlMjAlMjBwZXJfZGV2aWNlX3RyYWluX2JhdGNoX3NpemUlM0Q0JTJDJTIwJTIwJTIzJTIwV2lsbCUyMGJlJTIwc3BsaXQlMjBpbnRvJTIwbWljcm9iYXRjaGVzJTBBJTIwJTIwJTIwJTIwYmYxNiUzRFRydWUlMkMlMEElMjAlMjAlMjAlMjB0ZW5zb3JfcGFyYWxsZWxfc2l6ZSUzRDIlMkMlMEElMjAlMjAlMjAlMjBwaXBlbGluZV9wYXJhbGxlbF9zaXplJTNENCUyQyUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMyUyMFNwbGl0JTIwbW9kZWwlMjBhY3Jvc3MlMjA0JTIwcGlwZWxpbmUlMjBzdGFnZXMlMEElMjAlMjAlMjAlMjBwaXBlbGluZV9wYXJhbGxlbF9udW1fbWljcm9iYXRjaGVzJTNENCUyQyUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMyUyME51bWJlciUyMG9mJTIwbWljcm9iYXRjaGVzJTBBJTIwJTIwJTIwJTIwemVyb18xJTNEVHJ1ZSUyQyUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMyUyMEVuYWJsZSUyMFplUk8tMSUyMHdpdGglMjBwaXBlbGluZSUyMHBhcmFsbGVsaXNtJTBBJTIwJTIwJTIwJTIwJTIzJTIwLi4uJTIwb3RoZXIlMjB0cmFpbmluZyUyMGFyZ3VtZW50cyUwQSklMEElMEElMjMlMjBMb2FkJTIwbW9kZWwlMjB1c2luZyUyMGN1c3RvbSUyMGltcGxlbWVudGF0aW9uJTIwLSUyMG11c3QlMjBiZSUyMGRvbmUlMjB3aXRoJTIwdGhlJTIwbW9kZWwlMjBjbGFzcyUyMGRpcmVjdGx5JTBBbW9kZWwlMjAlM0QlMjBMbGFtYUZvckNhdXNhbExNLmZ
yb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjJtZXRhLWxsYW1hJTJGTGxhbWEtMy4yLTNCJTIyJTJDJTBBJTIwJTIwJTIwJTIwdHJuX2NvbmZpZyUzRHRyYWluaW5nX2FyZ3MudHJuX2NvbmZpZyUyMCUyMCUyMyUyMFBhc3MlMjB0aGUlMjBhdXRvLWdlbmVyYXRlZCUyMHRybl9jb25maWclMEEpJTBBJTBBdHJhaW5lciUyMCUzRCUyME5ldXJvblRyYWluZXIoJTBBJTIwJTIwJTIwJTIwbW9kZWwlM0Rtb2RlbCUyQyUwQSUyMCUyMCUyMCUyMGFyZ3MlM0R0cmFpbmluZ19hcmdzJTJDJTBBJTIwJTIwJTIwJTIwdHJhaW5fZGF0YXNldCUzRHRyYWluX2RhdGFzZXQlMkMlMEElMjAlMjAlMjAlMjBldmFsX2RhdGFzZXQlM0RldmFsX2RhdGFzZXQlMkMlMEEpJTBBJTBBdHJhaW5lci50cmFpbigp",highlighted:`<span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronTrainingArguments, NeuronTrainer | |
| <span class="hljs-keyword">from</span> optimum.neuron.models.training <span class="hljs-keyword">import</span> LlamaForCausalLM <span class="hljs-comment"># Custom model implementation</span> | |
| <span class="hljs-comment"># Configure pipeline parallelism in training arguments</span> | |
| training_args = NeuronTrainingArguments( | |
| output_dir=<span class="hljs-string">"./output"</span>, | |
| per_device_train_batch_size=<span class="hljs-number">4</span>, <span class="hljs-comment"># Will be split into microbatches</span> | |
| bf16=<span class="hljs-literal">True</span>, | |
| tensor_parallel_size=<span class="hljs-number">2</span>, | |
| pipeline_parallel_size=<span class="hljs-number">4</span>, <span class="hljs-comment"># Split model across 4 pipeline stages</span> | |
| pipeline_parallel_num_microbatches=<span class="hljs-number">4</span>, <span class="hljs-comment"># Number of microbatches</span> | |
| zero_1=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Enable ZeRO-1 with pipeline parallelism</span> | |
| <span class="hljs-comment"># ... other training arguments</span> | |
| ) | |
| <span class="hljs-comment"># Load model using custom implementation - must be done with the model class directly</span> | |
| model = LlamaForCausalLM.from_pretrained( | |
| <span class="hljs-string">"meta-llama/Llama-3.2-3B"</span>, | |
| trn_config=training_args.trn_config <span class="hljs-comment"># Pass the auto-generated trn_config</span> | |
| ) | |
| trainer = NeuronTrainer( | |
| model=model, | |
| args=training_args, | |
| train_dataset=train_dataset, | |
| eval_dataset=eval_dataset, | |
| ) | |
| trainer.train()`,wrap:!1}}),Ge=new J({props:{title:"Via the NeuronAccelerator",local:"via-the-neuronaccelerator",headingTag:"h3"}}),Fe=new B({props:{code:"ZnJvbSUyMG9wdGltdW0ubmV1cm9uJTIwaW1wb3J0JTIwTmV1cm9uQWNjZWxlcmF0b3IlMEFmcm9tJTIwb3B0aW11bS5uZXVyb24ubW9kZWxzLnRyYWluaW5nLmNvbmZpZyUyMGltcG9ydCUyMFRyYWluaW5nTmV1cm9uQ29uZmlnJTBBZnJvbSUyMG9wdGltdW0ubmV1cm9uLm1vZGVscy50cmFpbmluZyUyMGltcG9ydCUyMExsYW1hRm9yQ2F1c2FsTE0lMEFmcm9tJTIwdG9yY2gub3B0aW0lMjBpbXBvcnQlMjBBZGFtVyUwQSUwQSUyMyUyMENvbmZpZ3VyZSUyMGNvbWJpbmVkJTIwcGFyYWxsZWxpc20lMjBzdHJhdGVnaWVzJTBBdHJuX2NvbmZpZyUyMCUzRCUyMFRyYWluaW5nTmV1cm9uQ29uZmlnKCUwQSUyMCUyMCUyMCUyMHRlbnNvcl9wYXJhbGxlbF9zaXplJTNEMiUyQyUwQSUyMCUyMCUyMCUyMHBpcGVsaW5lX3BhcmFsbGVsX3NpemUlM0Q0JTJDJTBBJTIwJTIwJTIwJTIwcGlwZWxpbmVfcGFyYWxsZWxfbnVtX21pY3JvYmF0Y2hlcyUzRDQlMkMlMEElMjAlMjAlMjAlMjBzZXF1ZW5jZV9wYXJhbGxlbF9lbmFibGVkJTNEVHJ1ZSUyQyUwQSklMEElMEFhY2NlbGVyYXRvciUyMCUzRCUyME5ldXJvbkFjY2VsZXJhdG9yKCUwQSUyMCUyMCUyMCUyMHRybl9jb25maWclM0R0cm5fY29uZmlnJTJDJTBBJTIwJTIwJTIwJTIwemVyb18xJTNEVHJ1ZSUyQyUyMCUyMCUyMyUyMENhbiUyMGNvbWJpbmUlMjB3aXRoJTIwWmVSTy0xJTBBJTIwJTIwJTIwJTIwbWl4ZWRfcHJlY2lzaW9uJTNEJTIyYmYxNiUyMiUyQyUwQSklMEElMEElMjMlMjBMb2FkJTIwbW9kZWwlMjB3aXRoJTIwY3VzdG9tJTIwaW1wbGVtZW50YXRpb24lMEFtb2RlbCUyMCUzRCUyMExsYW1hRm9yQ2F1c2FsTE0uZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMCUyMm1ldGEtbGxhbWElMkZMbGFtYS0zLjItM0IlMjIlMkMlMEElMjAlMjAlMjAlMjB0cm5fY29uZmlnJTNEdHJuX2NvbmZpZyUwQSklMEElMEFvcHRpbWl6ZXIlMjAlM0QlMjBBZGFtVyhtb2RlbC5wYXJhbWV0ZXJzKCklMkMlMjBsciUzRDVlLTUpJTBBbW9kZWwlMkMlMjBvcHRpbWl6ZXIlMjAlM0QlMjBhY2NlbGVyYXRvci5wcmVwYXJlKG1vZGVsJTJDJTIwb3B0aW1pemVyKQ==",highlighted:`<span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronAccelerator | |
| <span class="hljs-keyword">from</span> optimum.neuron.models.training.config <span class="hljs-keyword">import</span> TrainingNeuronConfig | |
| <span class="hljs-keyword">from</span> optimum.neuron.models.training <span class="hljs-keyword">import</span> LlamaForCausalLM | |
| <span class="hljs-keyword">from</span> torch.optim <span class="hljs-keyword">import</span> AdamW | |
| <span class="hljs-comment"># Configure combined parallelism strategies</span> | |
| trn_config = TrainingNeuronConfig( | |
| tensor_parallel_size=<span class="hljs-number">2</span>, | |
| pipeline_parallel_size=<span class="hljs-number">4</span>, | |
| pipeline_parallel_num_microbatches=<span class="hljs-number">4</span>, | |
| sequence_parallel_enabled=<span class="hljs-literal">True</span>, | |
| ) | |
| accelerator = NeuronAccelerator( | |
| trn_config=trn_config, | |
| zero_1=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Can combine with ZeRO-1</span> | |
| mixed_precision=<span class="hljs-string">"bf16"</span>, | |
| ) | |
| <span class="hljs-comment"># Load model with custom implementation</span> | |
| model = LlamaForCausalLM.from_pretrained( | |
| <span class="hljs-string">"meta-llama/Llama-3.2-3B"</span>, | |
| trn_config=trn_config | |
| ) | |
| optimizer = AdamW(model.parameters(), lr=<span class="hljs-number">5e-5</span>) | |
| model, optimizer = accelerator.prepare(model, optimizer)`,wrap:!1}}),W=new ol({props:{$$slots:{default:[mi]},$$scope:{ctx:$}}}),ke=new J({props:{title:"Combining Parallelism Strategies",local:"combining-parallelism-strategies",headingTag:"h2"}}),Xe=new J({props:{title:"Via the NeuronTrainer",local:"via-the-neurontrainer",headingTag:"h3"}}),Ne=new B({props:{code:"ZnJvbSUyMG9wdGltdW0ubmV1cm9uJTIwaW1wb3J0JTIwTmV1cm9uVHJhaW5pbmdBcmd1bWVudHMlMkMlMjBOZXVyb25UcmFpbmVyJTBBZnJvbSUyMG9wdGltdW0ubmV1cm9uLm1vZGVscy50cmFpbmluZyUyMGltcG9ydCUyMExsYW1hRm9yQ2F1c2FsTE0lMEElMEElMjMlMjBFeGFtcGxlJTNBJTIwQ29tYmluZSUyMGFsbCUyMHBhcmFsbGVsaXNtJTIwc3RyYXRlZ2llcyUwQXRyYWluaW5nX2FyZ3MlMjAlM0QlMjBOZXVyb25UcmFpbmluZ0FyZ3VtZW50cyglMEElMjAlMjAlMjAlMjBvdXRwdXRfZGlyJTNEJTIyLiUyRm91dHB1dCUyMiUyQyUwQSUyMCUyMCUyMCUyMHBlcl9kZXZpY2VfdHJhaW5fYmF0Y2hfc2l6ZSUzRDMyJTJDJTBBJTIwJTIwJTIwJTIwYmYxNiUzRFRydWUlMkMlMEElMjAlMjAlMjAlMjBncmFkaWVudF9jaGVja3BvaW50aW5nJTNEVHJ1ZSUyQyUwQSUwQSUyMCUyMCUyMCUyMCUyMyUyMFplUk8tMSUwQSUyMCUyMCUyMCUyMHplcm9fMSUzRFRydWUlMkMlMEElMEElMjAlMjAlMjAlMjAlMjMlMjBUZW5zb3IlMjBwYXJhbGxlbGlzbSUwQSUyMCUyMCUyMCUyMHRlbnNvcl9wYXJhbGxlbF9zaXplJTNENCUyQyUwQSUyMCUyMCUyMCUyMGRpc2FibGVfc2VxdWVuY2VfcGFyYWxsZWwlM0RGYWxzZSUyQyUyMCUyMCUyMCUyMCUyMCUyMyUyMEVuYWJsZSUyMHNlcXVlbmNlJTIwcGFyYWxsZWxpc20lMEElMEElMjAlMjAlMjAlMjAlMjMlMjBQaXBlbGluZSUyMHBhcmFsbGVsaXNtJTBBJTIwJTIwJTIwJTIwcGlwZWxpbmVfcGFyYWxsZWxfc2l6ZSUzRDIlMkMlMEElMjAlMjAlMjAlMjBwaXBlbGluZV9wYXJhbGxlbF9udW1fbWljcm9iYXRjaGVzJTNEOCUyQyUwQSUwQSUyMCUyMCUyMCUyMCUyMyUyMEFkZGl0aW9uYWwlMjBvcHRpbWl6YXRpb25zJTBBJTIwJTIwJTIwJTIwZnVzZV9xa3YlM0RUcnVlJTJDJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIzJTIwRnVzZSUyMFFLViUyMHByb2plY3Rpb25zJTIwZm9yJTIwZWZmaWNpZW5jeSUwQSUyMCUyMCUyMCUyMGt2X3NpemVfbXVsdGlwbGllciUzRE5vbmUlMkMlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjMlMjBBdXRvLWNhbGN1bGF0ZSUyMG9wdGltYWwlMjBLViUyMG11bHRpcGxpZXIlMEEpJTBBJTBBJTIzJTIwTG9hZCUyMG1vZGVsJTIwdXNpbmclMjBjdXN0b20lMjBpbXBsZW1lbnRhdGlvbiUwQW1vZGVsJTIwJ
TNEJTIwTGxhbWFGb3JDYXVzYWxMTS5mcm9tX3ByZXRyYWluZWQoJTBBJTIwJTIwJTIwJTIwJTIybWV0YS1sbGFtYSUyRkxsYW1hLTMuMi0zQiUyMiUyQyUwQSUyMCUyMCUyMCUyMHRybl9jb25maWclM0R0cmFpbmluZ19hcmdzLnRybl9jb25maWclMEEpJTBBJTBBdHJhaW5lciUyMCUzRCUyME5ldXJvblRyYWluZXIoJTBBJTIwJTIwJTIwJTIwbW9kZWwlM0Rtb2RlbCUyQyUwQSUyMCUyMCUyMCUyMGFyZ3MlM0R0cmFpbmluZ19hcmdzJTJDJTBBJTIwJTIwJTIwJTIwdHJhaW5fZGF0YXNldCUzRHRyYWluX2RhdGFzZXQlMkMlMEElMjAlMjAlMjAlMjBldmFsX2RhdGFzZXQlM0RldmFsX2RhdGFzZXQlMkMlMEEpJTBBJTBBdHJhaW5lci50cmFpbigp",highlighted:`<span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronTrainingArguments, NeuronTrainer | |
| <span class="hljs-keyword">from</span> optimum.neuron.models.training <span class="hljs-keyword">import</span> LlamaForCausalLM | |
| <span class="hljs-comment"># Example: Combine all parallelism strategies</span> | |
| training_args = NeuronTrainingArguments( | |
| output_dir=<span class="hljs-string">"./output"</span>, | |
| per_device_train_batch_size=<span class="hljs-number">32</span>, | |
| bf16=<span class="hljs-literal">True</span>, | |
| gradient_checkpointing=<span class="hljs-literal">True</span>, | |
| <span class="hljs-comment"># ZeRO-1</span> | |
| zero_1=<span class="hljs-literal">True</span>, | |
| <span class="hljs-comment"># Tensor parallelism</span> | |
| tensor_parallel_size=<span class="hljs-number">4</span>, | |
| disable_sequence_parallel=<span class="hljs-literal">False</span>, <span class="hljs-comment"># Enable sequence parallelism</span> | |
| <span class="hljs-comment"># Pipeline parallelism</span> | |
| pipeline_parallel_size=<span class="hljs-number">2</span>, | |
| pipeline_parallel_num_microbatches=<span class="hljs-number">8</span>, | |
| <span class="hljs-comment"># Additional optimizations</span> | |
| fuse_qkv=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Fuse QKV projections for efficiency</span> | |
| kv_size_multiplier=<span class="hljs-literal">None</span>, <span class="hljs-comment"># Auto-calculate optimal KV multiplier</span> | |
| ) | |
| <span class="hljs-comment"># Load model using custom implementation</span> | |
| model = LlamaForCausalLM.from_pretrained( | |
| <span class="hljs-string">"meta-llama/Llama-3.2-3B"</span>, | |
| trn_config=training_args.trn_config | |
| ) | |
| trainer = NeuronTrainer( | |
| model=model, | |
| args=training_args, | |
| train_dataset=train_dataset, | |
| eval_dataset=eval_dataset, | |
| ) | |
| trainer.train()`,wrap:!1}}),Qe=new J({props:{title:"Checkpoint consolidation",local:"checkpoint-consolidation",headingTag:"h2"}}),xe=new B({props:{code:"b3B0aW11bS1jbGklMjBuZXVyb24lMjBjb25zb2xpZGF0ZSUyMC0taGVscCUwQSUwQXVzYWdlJTNBJTIwb3B0aW11bS1jbGklMjBuZXVyb24lMjBjb25zb2xpZGF0ZSUyMCU1Qi1oJTVEJTIwJTVCLWYlMjAlN0JweXRvcmNoJTJDc2FmZXRlbnNvcnMlN0QlNUQlMjBjaGVja3BvaW50X2RpciUyMG91dHB1dF9kaXIlMEElMEFwb3NpdGlvbmFsJTIwYXJndW1lbnRzJTNBJTBBJTIwJTIwY2hlY2twb2ludF9kaXIlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBUaGUlMjBwYXRoJTIwdG8lMjB0aGUlMjBkaXJlY3RvcnklMjBjb250YWluaW5nJTIwdGhlJTIwY2hlY2twb2ludHMuJTBBJTIwJTIwb3V0cHV0X2RpciUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMFRoZSUyMHBhdGglMjB0byUyMHRoZSUyMG91dHB1dCUyMGRpcmVjdG9yeSUyMGNvbnRhaW5pbmclMjB0aGUlMjBjb25zb2xpZGF0ZWQlMjBjaGVja3BvaW50LiUwQSUwQW9wdGlvbmFsJTIwYXJndW1lbnRzJTNBJTBBJTIwJTIwLWglMkMlMjAtLWhlbHAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzaG93JTIwdGhpcyUyMGhlbHAlMjBtZXNzYWdlJTIwYW5kJTIwZXhpdCUwQSUyMCUyMC1mJTIwJTdCcHl0b3JjaCUyQ3NhZmV0ZW5zb3JzJTdEJTJDJTIwLS1mb3JtYXQlMjAlN0JweXRvcmNoJTJDc2FmZXRlbnNvcnMlN0QlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBUaGUlMjBmb3JtYXQlMjB1c2VkJTIwdG8lMjBzYXZlJTIwdGhlJTIwY29uc29saWRhdGVkJTIwY2hlY2twb2ludC4=",highlighted:`optimum-cli neuron consolidate --<span class="hljs-built_in">help</span> | |
| usage: optimum-cli neuron consolidate [-h] [-f {pytorch,safetensors}] checkpoint_dir output_dir | |
| positional arguments: | |
| checkpoint_dir The path to the directory containing the checkpoints. | |
| output_dir The path to the output directory containing the consolidated checkpoint. | |
| optional arguments: | |
| -h, --<span class="hljs-built_in">help</span> show this <span class="hljs-built_in">help</span> message and <span class="hljs-built_in">exit</span> | |
| -f {pytorch,safetensors}, --format {pytorch,safetensors} | |
| The format used to save the consolidated checkpoint.`,wrap:!1}}),Oe=new B({props:{code:"bXlfdHJhaW5pbmclMkYlMEElRTIlOTQlOUMlRTIlOTQlODAlRTIlOTQlODAlMjBSRUFETUUubWQlMEElRTIlOTQlOUMlRTIlOTQlODAlRTIlOTQlODAlMjBhbGxfcmVzdWx0cy5qc29uJTBBJUUyJTk0JTlDJUUyJTk0JTgwJUUyJTk0JTgwJTIwY2hlY2twb2ludC0xMCUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMGNvbmZpZy5qc29uJTBBJUUyJTk0JTgyJTIwJTIwJTIwJUUyJTk0JTlDJUUyJTk0JTgwJUUyJTk0JTgwJTIwc2NoZWR1bGVyLnB0JTBBJUUyJTk0JTgyJTIwJTIwJTIwJUUyJTk0JTlDJUUyJTk0JTgwJUUyJTk0JTgwJTIwc3BlY2lhbF90b2tlbnNfbWFwLmpzb24lMEElRTIlOTQlODIlMjAlMjAlMjAlRTIlOTQlOUMlRTIlOTQlODAlRTIlOTQlODAlMjBzaGFyZHMlMkYlMEElRTIlOTQlODIlMjAlMjAlMjAlRTIlOTQlOUMlRTIlOTQlODAlRTIlOTQlODAlMjB0b2tlbml6ZXIuanNvbiUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRva2VuaXplci5tb2RlbCUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRva2VuaXplcl9jb25maWcuanNvbiUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRyYWluZXJfc3RhdGUuanNvbiUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5NCVFMiU5NCU4MCVFMiU5NCU4MCUyMHRyYWluaW5nX2FyZ3MuYmluJTBBJUUyJTk0JTlDJUUyJTk0JTgwJUUyJTk0JTgwJTIwY29uZmlnLmpzb24lMEElRTIlOTQlOUMlRTIlOTQlODAlRTIlOTQlODAlMjBzcGVjaWFsX3Rva2Vuc19tYXAuanNvbiUwQSVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHNoYXJkcyUyRiUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRwX3JhbmtfMDBfcHBfcmFua18wMCUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRwX3JhbmtfMDFfcHBfcmFua18wMCUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRwX3JhbmtfMDJfcHBfcmFua18wMCUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRwX3JhbmtfMDNfcHBfcmFua18wMCUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRwX3JhbmtfMDBfcHBfcmFua18wMSUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRwX3JhbmtfMDFfcHBfcmFua18wMSUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRwX3JhbmtfMDJfcHBfcmFua18wMSUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5NCVFMiU5NC
U4MCVFMiU5NCU4MCUyMHRwX3JhbmtfMDNfcHBfcmFua18wMSUwQSVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRva2VuaXplci5qc29uJTBBJUUyJTk0JTlDJUUyJTk0JTgwJUUyJTk0JTgwJTIwdG9rZW5pemVyLm1vZGVsJTBBJUUyJTk0JTlDJUUyJTk0JTgwJUUyJTk0JTgwJTIwdG9rZW5pemVyX2NvbmZpZy5qc29uJTBBJUUyJTk0JTlDJUUyJTk0JTgwJUUyJTk0JTgwJTIwdHJhaW5fcmVzdWx0cy5qc29uJTBBJUUyJTk0JTlDJUUyJTk0JTgwJUUyJTk0JTgwJTIwdHJhaW5lcl9zdGF0ZS5qc29uJTBBJUUyJTk0JTlDJUUyJTk0JTgwJUUyJTk0JTgwJTIwdHJhaW5pbmdfYXJncy5iaW4lMEElRTIlOTQlOTQlRTIlOTQlODAlRTIlOTQlODAlMjB0cm5fY29uZmlnLmpzb24=",highlighted:`my_training/ | |
| ├── README.md | |
| ├── all_results.json | |
| ├── checkpoint-10 | |
| │ ├── config.json | |
| │ ├── scheduler.pt | |
| │ ├── special_tokens_map.json | |
| │ ├── shards/ | |
| │ ├── tokenizer.json | |
| │ ├── tokenizer.model | |
| │ ├── tokenizer_config.json | |
| │ ├── trainer_state.json | |
| │ └── training_args.bin | |
| ├── config.json | |
| ├── special_tokens_map.json | |
| ├── shards/ | |
| │ ├── tp_rank_00_pp_rank_00 | |
| │ ├── tp_rank_01_pp_rank_00 | |
| │ ├── tp_rank_02_pp_rank_00 | |
| │ ├── tp_rank_03_pp_rank_00 | |
| │ ├── tp_rank_00_pp_rank_01 | |
| │ ├── tp_rank_01_pp_rank_01 | |
| │ ├── tp_rank_02_pp_rank_01 | |
| │ └── tp_rank_03_pp_rank_01 | |
| ├── tokenizer.json | |
| ├── tokenizer.model | |
| ├── tokenizer_config.json | |
| ├── train_results.json | |
| ├── trainer_state.json | |
| ├── training_args.bin | |
| └── trn_config.json`,wrap:!1}}),qe=new B({props:{code:"b3B0aW11bS1jbGklMjBuZXVyb24lMjBjb25zb2xpZGF0ZSUyMG15X3RyYWluaW5nJTIwbXlfdHJhaW5pbmdfY29uc29saWRhdGVkX2NoZWNrcG9pbnQ=",highlighted:"optimum-cli neuron consolidate my_training my_training_consolidated_checkpoint",wrap:!1}}),Z=new ol({props:{$$slots:{default:[oi]},$$scope:{ctx:$}}}),Ke=new J({props:{title:"Best Practices",local:"best-practices",headingTag:"h2"}}),el=new J({props:{title:"Choosing Parallelism Strategy",local:"choosing-parallelism-strategy",headingTag:"h3"}}),tl=new J({props:{title:"Memory Optimization",local:"memory-optimization",headingTag:"h3"}}),il=new J({props:{title:"Troubleshooting",local:"troubleshooting",headingTag:"h2"}}),al=new J({props:{title:"Common Issues",local:"common-issues",headingTag:"h3"}}),rl=new J({props:{title:"Debugging Tips",local:"debugging-tips",headingTag:"h3"}}),{c(){m=s("meta"),b=i(),d=s("p"),w=i(),p(g.$$.fragment),f=i(),p(h.$$.fragment),C=i(),o=s("p"),o.innerHTML=j,yl=i(),V=s("p"),V.textContent=Pt,cl=i(),G=s("p"),G.innerHTML=Ot,Ul=i(),p(F.$$.fragment),Tl=i(),p(k.$$.fragment),ul=i(),R=s("p"),R.innerHTML=Dt,dl=i(),X=s("p"),X.innerHTML=qt,wl=i(),N=s("p"),N.innerHTML=Kt,Jl=i(),H=s("p"),H.innerHTML=en,hl=i(),p(A.$$.fragment),bl=i(),E=s("p"),E.innerHTML=ln,fl=i(),Q=s("p"),Q.innerHTML=tn,Cl=i(),z=s("p"),z.innerHTML=nn,jl=i(),Y=s("p"),Y.innerHTML=an,gl=i(),x=s("p"),x.innerHTML=sn,$l=i(),S=s("p"),S.innerHTML=rn,Bl=i(),p(L.$$.fragment),Il=i(),P=s("p"),P.innerHTML=Mn,vl=i(),O=s("p"),O.innerHTML=mn,_l=i(),D=s("p"),D.innerHTML=on,Wl=i(),q=s("p"),q.innerHTML=pn,Zl=i(),K=s("p"),K.innerHTML=yn,Vl=i(),p(ee.$$.fragment),Gl=i(),le=s("p"),le.innerHTML=cn,Fl=i(),te=s("p"),te.innerHTML=Un,kl=i(),ne=s("p"),ne.innerHTML=Tn,Rl=i(),ie=s("p"),ie.innerHTML=un,Xl=i(),ae=s("p"),ae.innerHTML=dn,Nl=i(),se=s("p"),se.innerHTML=wn,Hl=i(),re=s("p"),re.innerHTML=Jn,Al=i(),p(I.$$.fragment),El=i(),p(Me.$$.fragment),Ql=i(),me=s("p"),me.innerHTML=hn,zl=i(),p(oe.$$.fragment),Yl=i(),p(pe.$$.fragment),xl=i(),p(v
.$$.fragment),Sl=i(),p(ye.$$.fragment),Ll=i(),ce=s("p"),ce.innerHTML=bn,Pl=i(),p(Ue.$$.fragment),Ol=i(),p(Te.$$.fragment),Dl=i(),ue=s("p"),ue.innerHTML=fn,ql=i(),de=s("p"),de.innerHTML=Cn,Kl=i(),we=s("p"),we.textContent=jn,et=i(),Je=s("ol"),Je.innerHTML=gn,lt=i(),he=s("p"),he.innerHTML=$n,tt=i(),p(be.$$.fragment),nt=i(),p(fe.$$.fragment),it=i(),p(_.$$.fragment),at=i(),p(Ce.$$.fragment),st=i(),je=s("p"),je.innerHTML=Bn,rt=i(),p(ge.$$.fragment),Mt=i(),p($e.$$.fragment),mt=i(),Be=s("p"),Be.textContent=In,ot=i(),Ie=s("p"),Ie.innerHTML=vn,pt=i(),p(ve.$$.fragment),yt=i(),_e=s("p"),_e.textContent=_n,ct=i(),We=s("ul"),We.innerHTML=Wn,Ut=i(),p(Ze.$$.fragment),Tt=i(),p(Ve.$$.fragment),ut=i(),p(Ge.$$.fragment),dt=i(),p(Fe.$$.fragment),wt=i(),p(W.$$.fragment),Jt=i(),p(ke.$$.fragment),ht=i(),Re=s("p"),Re.textContent=Zn,bt=i(),p(Xe.$$.fragment),ft=i(),p(Ne.$$.fragment),Ct=i(),He=s("p"),He.textContent=Vn,jt=i(),Ae=s("ul"),Ae.innerHTML=Gn,gt=i(),Ee=s("p"),Ee.innerHTML=Fn,$t=i(),p(Qe.$$.fragment),Bt=i(),ze=s("p"),ze.textContent=kn,It=i(),Ye=s("p"),Ye.innerHTML=Rn,vt=i(),p(xe.$$.fragment),_t=i(),Se=s("p"),Se.innerHTML=Xn,Wt=i(),Le=s("p"),Le.textContent=Nn,Zt=i(),Pe=s("p"),Pe.innerHTML=Hn,Vt=i(),p(Oe.$$.fragment),Gt=i(),De=s("p"),De.innerHTML=An,Ft=i(),p(qe.$$.fragment),kt=i(),p(Z.$$.fragment),Rt=i(),p(Ke.$$.fragment),Xt=i(),p(el.$$.fragment),Nt=i(),ll=s("ol"),ll.innerHTML=En,Ht=i(),p(tl.$$.fragment),At=i(),nl=s("ul"),nl.innerHTML=Qn,Et=i(),p(il.$$.fragment),Qt=i(),p(al.$$.fragment),zt=i(),sl=s("ol"),sl.innerHTML=zn,Yt=i(),p(rl.$$.fragment),xt=i(),Ml=s("ul"),Ml.innerHTML=Yn,St=i(),ml=s("p"),this.h()},l(e){const 
l=ni("svelte-u9bgzb",document.head);m=r(l,"META",{name:!0,content:!0}),l.forEach(t),b=a(e),d=r(e,"P",{}),Dn(d).forEach(t),w=a(e),y(g.$$.fragment,e),f=a(e),y(h.$$.fragment,e),C=a(e),o=r(e,"P",{"data-svelte-h":!0}),M(o)!=="svelte-1hnco7m"&&(o.innerHTML=j),yl=a(e),V=r(e,"P",{"data-svelte-h":!0}),M(V)!=="svelte-d2kadp"&&(V.textContent=Pt),cl=a(e),G=r(e,"P",{"data-svelte-h":!0}),M(G)!=="svelte-1ndq4u8"&&(G.innerHTML=Ot),Ul=a(e),y(F.$$.fragment,e),Tl=a(e),y(k.$$.fragment,e),ul=a(e),R=r(e,"P",{"data-svelte-h":!0}),M(R)!=="svelte-1mjmifd"&&(R.innerHTML=Dt),dl=a(e),X=r(e,"P",{"data-svelte-h":!0}),M(X)!=="svelte-1f0jrrz"&&(X.innerHTML=qt),wl=a(e),N=r(e,"P",{"data-svelte-h":!0}),M(N)!=="svelte-18396cf"&&(N.innerHTML=Kt),Jl=a(e),H=r(e,"P",{"data-svelte-h":!0}),M(H)!=="svelte-190zbi2"&&(H.innerHTML=en),hl=a(e),y(A.$$.fragment,e),bl=a(e),E=r(e,"P",{"data-svelte-h":!0}),M(E)!=="svelte-1r1vc5"&&(E.innerHTML=ln),fl=a(e),Q=r(e,"P",{"data-svelte-h":!0}),M(Q)!=="svelte-wsb08h"&&(Q.innerHTML=tn),Cl=a(e),z=r(e,"P",{"data-svelte-h":!0}),M(z)!=="svelte-1rz7p8m"&&(z.innerHTML=nn),jl=a(e),Y=r(e,"P",{"data-svelte-h":!0}),M(Y)!=="svelte-kpqjki"&&(Y.innerHTML=an),gl=a(e),x=r(e,"P",{"data-svelte-h":!0}),M(x)!=="svelte-fc78y5"&&(x.innerHTML=sn),$l=a(e),S=r(e,"P",{"data-svelte-h":!0}),M(S)!=="svelte-otaiec"&&(S.innerHTML=rn),Bl=a(e),y(L.$$.fragment,e),Il=a(e),P=r(e,"P",{"data-svelte-h":!0}),M(P)!=="svelte-4fh0sm"&&(P.innerHTML=Mn),vl=a(e),O=r(e,"P",{"data-svelte-h":!0}),M(O)!=="svelte-1cma7oi"&&(O.innerHTML=mn),_l=a(e),D=r(e,"P",{"data-svelte-h":!0}),M(D)!=="svelte-hrmmz0"&&(D.innerHTML=on),Wl=a(e),q=r(e,"P",{"data-svelte-h":!0}),M(q)!=="svelte-1obmcw0"&&(q.innerHTML=pn),Zl=a(e),K=r(e,"P",{"data-svelte-h":!0}),M(K)!=="svelte-41lgu1"&&(K.innerHTML=yn),Vl=a(e),y(ee.$$.fragment,e),Gl=a(e),le=r(e,"P",{"data-svelte-h":!0}),M(le)!=="svelte-1lhkl2x"&&(le.innerHTML=cn),Fl=a(e),te=r(e,"P",{"data-svelte-h":!0}),M(te)!=="svelte-a3vdn"&&(te.innerHTML=Un),kl=a(e),ne=r(e,"P",{"data-svelte-h":!0}),M(ne)!=="svelt
e-1jq8cu1"&&(ne.innerHTML=Tn),Rl=a(e),ie=r(e,"P",{"data-svelte-h":!0}),M(ie)!=="svelte-1vm5hdt"&&(ie.innerHTML=un),Xl=a(e),ae=r(e,"P",{"data-svelte-h":!0}),M(ae)!=="svelte-4tw4cj"&&(ae.innerHTML=dn),Nl=a(e),se=r(e,"P",{"data-svelte-h":!0}),M(se)!=="svelte-l89uy8"&&(se.innerHTML=wn),Hl=a(e),re=r(e,"P",{"data-svelte-h":!0}),M(re)!=="svelte-1p0ihsg"&&(re.innerHTML=Jn),Al=a(e),y(I.$$.fragment,e),El=a(e),y(Me.$$.fragment,e),Ql=a(e),me=r(e,"P",{"data-svelte-h":!0}),M(me)!=="svelte-1xpk0lv"&&(me.innerHTML=hn),zl=a(e),y(oe.$$.fragment,e),Yl=a(e),y(pe.$$.fragment,e),xl=a(e),y(v.$$.fragment,e),Sl=a(e),y(ye.$$.fragment,e),Ll=a(e),ce=r(e,"P",{"data-svelte-h":!0}),M(ce)!=="svelte-106kvj9"&&(ce.innerHTML=bn),Pl=a(e),y(Ue.$$.fragment,e),Ol=a(e),y(Te.$$.fragment,e),Dl=a(e),ue=r(e,"P",{"data-svelte-h":!0}),M(ue)!=="svelte-1r4hhew"&&(ue.innerHTML=fn),ql=a(e),de=r(e,"P",{"data-svelte-h":!0}),M(de)!=="svelte-v1qtdm"&&(de.innerHTML=Cn),Kl=a(e),we=r(e,"P",{"data-svelte-h":!0}),M(we)!=="svelte-n127re"&&(we.textContent=jn),et=a(e),Je=r(e,"OL",{"data-svelte-h":!0}),M(Je)!=="svelte-1hoskl8"&&(Je.innerHTML=gn),lt=a(e),he=r(e,"P",{"data-svelte-h":!0}),M(he)!=="svelte-11wpmlp"&&(he.innerHTML=$n),tt=a(e),y(be.$$.fragment,e),nt=a(e),y(fe.$$.fragment,e),it=a(e),y(_.$$.fragment,e),at=a(e),y(Ce.$$.fragment,e),st=a(e),je=r(e,"P",{"data-svelte-h":!0}),M(je)!=="svelte-1ncu8vs"&&(je.innerHTML=Bn),rt=a(e),y(ge.$$.fragment,e),Mt=a(e),y($e.$$.fragment,e),mt=a(e),Be=r(e,"P",{"data-svelte-h":!0}),M(Be)!=="svelte-1vp0c4m"&&(Be.textContent=In),ot=a(e),Ie=r(e,"P",{"data-svelte-h":!0}),M(Ie)!=="svelte-1ytrjb2"&&(Ie.innerHTML=vn),pt=a(e),y(ve.$$.fragment,e),yt=a(e),_e=r(e,"P",{"data-svelte-h":!0}),M(_e)!=="svelte-wwttlo"&&(_e.textContent=_n),ct=a(e),We=r(e,"UL",{"data-svelte-h":!0}),M(We)!=="svelte-9fwfrb"&&(We.innerHTML=Wn),Ut=a(e),y(Ze.$$.fragment,e),Tt=a(e),y(Ve.$$.fragment,e),ut=a(e),y(Ge.$$.fragment,e),dt=a(e),y(Fe.$$.fragment,e),wt=a(e),y(W.$$.fragment,e),Jt=a(e),y(ke.$$.fragment,e),ht=a(e),Re=r(e,"P",{"dat
a-svelte-h":!0}),M(Re)!=="svelte-ktf0yf"&&(Re.textContent=Zn),bt=a(e),y(Xe.$$.fragment,e),ft=a(e),y(Ne.$$.fragment,e),Ct=a(e),He=r(e,"P",{"data-svelte-h":!0}),M(He)!=="svelte-e9hr70"&&(He.textContent=Vn),jt=a(e),Ae=r(e,"UL",{"data-svelte-h":!0}),M(Ae)!=="svelte-138kta0"&&(Ae.innerHTML=Gn),gt=a(e),Ee=r(e,"P",{"data-svelte-h":!0}),M(Ee)!=="svelte-1oatqej"&&(Ee.innerHTML=Fn),$t=a(e),y(Qe.$$.fragment,e),Bt=a(e),ze=r(e,"P",{"data-svelte-h":!0}),M(ze)!=="svelte-10z9rkn"&&(ze.textContent=kn),It=a(e),Ye=r(e,"P",{"data-svelte-h":!0}),M(Ye)!=="svelte-24042q"&&(Ye.innerHTML=Rn),vt=a(e),y(xe.$$.fragment,e),_t=a(e),Se=r(e,"P",{"data-svelte-h":!0}),M(Se)!=="svelte-7l8i2j"&&(Se.innerHTML=Xn),Wt=a(e),Le=r(e,"P",{"data-svelte-h":!0}),M(Le)!=="svelte-11lpom8"&&(Le.textContent=Nn),Zt=a(e),Pe=r(e,"P",{"data-svelte-h":!0}),M(Pe)!=="svelte-15ioqnc"&&(Pe.innerHTML=Hn),Vt=a(e),y(Oe.$$.fragment,e),Gt=a(e),De=r(e,"P",{"data-svelte-h":!0}),M(De)!=="svelte-1e8fv74"&&(De.innerHTML=An),Ft=a(e),y(qe.$$.fragment,e),kt=a(e),y(Z.$$.fragment,e),Rt=a(e),y(Ke.$$.fragment,e),Xt=a(e),y(el.$$.fragment,e),Nt=a(e),ll=r(e,"OL",{"data-svelte-h":!0}),M(ll)!=="svelte-1j56tw2"&&(ll.innerHTML=En),Ht=a(e),y(tl.$$.fragment,e),At=a(e),nl=r(e,"UL",{"data-svelte-h":!0}),M(nl)!=="svelte-pjt7c2"&&(nl.innerHTML=Qn),Et=a(e),y(il.$$.fragment,e),Qt=a(e),y(al.$$.fragment,e),zt=a(e),sl=r(e,"OL",{"data-svelte-h":!0}),M(sl)!=="svelte-j5qw5u"&&(sl.innerHTML=zn),Yt=a(e),y(rl.$$.fragment,e),xt=a(e),Ml=r(e,"UL",{"data-svelte-h":!0}),M(Ml)!=="svelte-jjqy14"&&(Ml.innerHTML=Yn),St=a(e),ml=r(e,"P",{}),Dn(ml).forEach(t),this.h()},h(){qn(m,"name","hf:doc:metadata"),qn(m,"content",yi)},m(e,l){ii(document.head,m),n(e,b,l),n(e,d,l),n(e,w,l),c(g,e,l),n(e,f,l),c(h,e,l),n(e,C,l),n(e,o,l),n(e,yl,l),n(e,V,l),n(e,cl,l),n(e,G,l),n(e,Ul,l),c(F,e,l),n(e,Tl,l),c(k,e,l),n(e,ul,l),n(e,R,l),n(e,dl,l),n(e,X,l),n(e,wl,l),n(e,N,l),n(e,Jl,l),n(e,H,l),n(e,hl,l),c(A,e,l),n(e,bl,l),n(e,E,l),n(e,fl,l),n(e,Q,l),n(e,Cl,l),n(e,z,l),n(e,jl,l),n(e,Y,l),n(e,gl,l),n(e
,x,l),n(e,$l,l),n(e,S,l),n(e,Bl,l),c(L,e,l),n(e,Il,l),n(e,P,l),n(e,vl,l),n(e,O,l),n(e,_l,l),n(e,D,l),n(e,Wl,l),n(e,q,l),n(e,Zl,l),n(e,K,l),n(e,Vl,l),c(ee,e,l),n(e,Gl,l),n(e,le,l),n(e,Fl,l),n(e,te,l),n(e,kl,l),n(e,ne,l),n(e,Rl,l),n(e,ie,l),n(e,Xl,l),n(e,ae,l),n(e,Nl,l),n(e,se,l),n(e,Hl,l),n(e,re,l),n(e,Al,l),c(I,e,l),n(e,El,l),c(Me,e,l),n(e,Ql,l),n(e,me,l),n(e,zl,l),c(oe,e,l),n(e,Yl,l),c(pe,e,l),n(e,xl,l),c(v,e,l),n(e,Sl,l),c(ye,e,l),n(e,Ll,l),n(e,ce,l),n(e,Pl,l),c(Ue,e,l),n(e,Ol,l),c(Te,e,l),n(e,Dl,l),n(e,ue,l),n(e,ql,l),n(e,de,l),n(e,Kl,l),n(e,we,l),n(e,et,l),n(e,Je,l),n(e,lt,l),n(e,he,l),n(e,tt,l),c(be,e,l),n(e,nt,l),c(fe,e,l),n(e,it,l),c(_,e,l),n(e,at,l),c(Ce,e,l),n(e,st,l),n(e,je,l),n(e,rt,l),c(ge,e,l),n(e,Mt,l),c($e,e,l),n(e,mt,l),n(e,Be,l),n(e,ot,l),n(e,Ie,l),n(e,pt,l),c(ve,e,l),n(e,yt,l),n(e,_e,l),n(e,ct,l),n(e,We,l),n(e,Ut,l),c(Ze,e,l),n(e,Tt,l),c(Ve,e,l),n(e,ut,l),c(Ge,e,l),n(e,dt,l),c(Fe,e,l),n(e,wt,l),c(W,e,l),n(e,Jt,l),c(ke,e,l),n(e,ht,l),n(e,Re,l),n(e,bt,l),c(Xe,e,l),n(e,ft,l),c(Ne,e,l),n(e,Ct,l),n(e,He,l),n(e,jt,l),n(e,Ae,l),n(e,gt,l),n(e,Ee,l),n(e,$t,l),c(Qe,e,l),n(e,Bt,l),n(e,ze,l),n(e,It,l),n(e,Ye,l),n(e,vt,l),c(xe,e,l),n(e,_t,l),n(e,Se,l),n(e,Wt,l),n(e,Le,l),n(e,Zt,l),n(e,Pe,l),n(e,Vt,l),c(Oe,e,l),n(e,Gt,l),n(e,De,l),n(e,Ft,l),c(qe,e,l),n(e,kt,l),c(Z,e,l),n(e,Rt,l),c(Ke,e,l),n(e,Xt,l),c(el,e,l),n(e,Nt,l),n(e,ll,l),n(e,Ht,l),c(tl,e,l),n(e,At,l),n(e,nl,l),n(e,Et,l),c(il,e,l),n(e,Qt,l),c(al,e,l),n(e,zt,l),n(e,sl,l),n(e,Yt,l),c(rl,e,l),n(e,xt,l),n(e,Ml,l),n(e,St,l),n(e,ml,l),Lt=!0},p(e,[l]){const xn={};l&2&&(xn.$$scope={dirty:l,ctx:e}),I.$set(xn);const Sn={};l&2&&(Sn.$$scope={dirty:l,ctx:e}),v.$set(Sn);const Ln={};l&2&&(Ln.$$scope={dirty:l,ctx:e}),_.$set(Ln);const Pn={};l&2&&(Pn.$$scope={dirty:l,ctx:e}),W.$set(Pn);const 
On={};l&2&&(On.$$scope={dirty:l,ctx:e}),Z.$set(On)},i(e){Lt||(U(g.$$.fragment,e),U(h.$$.fragment,e),U(F.$$.fragment,e),U(k.$$.fragment,e),U(A.$$.fragment,e),U(L.$$.fragment,e),U(ee.$$.fragment,e),U(I.$$.fragment,e),U(Me.$$.fragment,e),U(oe.$$.fragment,e),U(pe.$$.fragment,e),U(v.$$.fragment,e),U(ye.$$.fragment,e),U(Ue.$$.fragment,e),U(Te.$$.fragment,e),U(be.$$.fragment,e),U(fe.$$.fragment,e),U(_.$$.fragment,e),U(Ce.$$.fragment,e),U(ge.$$.fragment,e),U($e.$$.fragment,e),U(ve.$$.fragment,e),U(Ze.$$.fragment,e),U(Ve.$$.fragment,e),U(Ge.$$.fragment,e),U(Fe.$$.fragment,e),U(W.$$.fragment,e),U(ke.$$.fragment,e),U(Xe.$$.fragment,e),U(Ne.$$.fragment,e),U(Qe.$$.fragment,e),U(xe.$$.fragment,e),U(Oe.$$.fragment,e),U(qe.$$.fragment,e),U(Z.$$.fragment,e),U(Ke.$$.fragment,e),U(el.$$.fragment,e),U(tl.$$.fragment,e),U(il.$$.fragment,e),U(al.$$.fragment,e),U(rl.$$.fragment,e),Lt=!0)},o(e){T(g.$$.fragment,e),T(h.$$.fragment,e),T(F.$$.fragment,e),T(k.$$.fragment,e),T(A.$$.fragment,e),T(L.$$.fragment,e),T(ee.$$.fragment,e),T(I.$$.fragment,e),T(Me.$$.fragment,e),T(oe.$$.fragment,e),T(pe.$$.fragment,e),T(v.$$.fragment,e),T(ye.$$.fragment,e),T(Ue.$$.fragment,e),T(Te.$$.fragment,e),T(be.$$.fragment,e),T(fe.$$.fragment,e),T(_.$$.fragment,e),T(Ce.$$.fragment,e),T(ge.$$.fragment,e),T($e.$$.fragment,e),T(ve.$$.fragment,e),T(Ze.$$.fragment,e),T(Ve.$$.fragment,e),T(Ge.$$.fragment,e),T(Fe.$$.fragment,e),T(W.$$.fragment,e),T(ke.$$.fragment,e),T(Xe.$$.fragment,e),T(Ne.$$.fragment,e),T(Qe.$$.fragment,e),T(xe.$$.fragment,e),T(Oe.$$.fragment,e),T(qe.$$.fragment,e),T(Z.$$.fragment,e),T(Ke.$$.fragment,e),T(el.$$.fragment,e),T(tl.$$.fragment,e),T(il.$$.fragment,e),T(al.$$.fragment,e),T(rl.$$.fragment,e),Lt=!1},d(e){e&&(t(b),t(d),t(w),t(f),t(C),t(o),t(yl),t(V),t(cl),t(G),t(Ul),t(Tl),t(ul),t(R),t(dl),t(X),t(wl),t(N),t(Jl),t(H),t(hl),t(bl),t(E),t(fl),t(Q),t(Cl),t(z),t(jl),t(Y),t(gl),t(x),t($l),t(S),t(Bl),t(Il),t(P),t(vl),t(O),t(_l),t(D),t(Wl),t(q),t(Zl),t(K),t(Vl),t(Gl),t(le),t(Fl),t(te),t(kl),t(ne),t(Rl),t(
ie),t(Xl),t(ae),t(Nl),t(se),t(Hl),t(re),t(Al),t(El),t(Ql),t(me),t(zl),t(Yl),t(xl),t(Sl),t(Ll),t(ce),t(Pl),t(Ol),t(Dl),t(ue),t(ql),t(de),t(Kl),t(we),t(et),t(Je),t(lt),t(he),t(tt),t(nt),t(it),t(at),t(st),t(je),t(rt),t(Mt),t(mt),t(Be),t(ot),t(Ie),t(pt),t(yt),t(_e),t(ct),t(We),t(Ut),t(Tt),t(ut),t(dt),t(wt),t(Jt),t(ht),t(Re),t(bt),t(ft),t(Ct),t(He),t(jt),t(Ae),t(gt),t(Ee),t($t),t(Bt),t(ze),t(It),t(Ye),t(vt),t(_t),t(Se),t(Wt),t(Le),t(Zt),t(Pe),t(Vt),t(Gt),t(De),t(Ft),t(kt),t(Rt),t(Xt),t(Nt),t(ll),t(Ht),t(At),t(nl),t(Et),t(Qt),t(zt),t(sl),t(Yt),t(xt),t(Ml),t(St),t(ml)),t(m),u(g,e),u(h,e),u(F,e),u(k,e),u(A,e),u(L,e),u(ee,e),u(I,e),u(Me,e),u(oe,e),u(pe,e),u(v,e),u(ye,e),u(Ue,e),u(Te,e),u(be,e),u(fe,e),u(_,e),u(Ce,e),u(ge,e),u($e,e),u(ve,e),u(Ze,e),u(Ve,e),u(Ge,e),u(Fe,e),u(W,e),u(ke,e),u(Xe,e),u(Ne,e),u(Qe,e),u(xe,e),u(Oe,e),u(qe,e),u(Z,e),u(Ke,e),u(el,e),u(tl,e),u(il,e),u(al,e),u(rl,e)}}}const yi='{"title":"Distributed Training with optimum-neuron","local":"distributed-training-with-optimum-neuron","sections":[{"title":"Parallelism Strategies Overview","local":"parallelism-strategies-overview","sections":[{"title":"1. ZeRO-1 (Optimizer State Sharding)","local":"1-zero-1-optimizer-state-sharding","sections":[],"depth":3},{"title":"2. Tensor Parallelism (Intra-layer Model Parallelism)","local":"2-tensor-parallelism-intra-layer-model-parallelism","sections":[],"depth":3},{"title":"3. Sequence Parallelism (Activation Sharding)","local":"3-sequence-parallelism-activation-sharding","sections":[],"depth":3},{"title":"4. 
Pipeline Parallelism (Inter-layer Model Parallelism)","local":"4-pipeline-parallelism-inter-layer-model-parallelism","sections":[],"depth":3}],"depth":2},{"title":"How to enable ZeRO-1?","local":"how-to-enable-zero-1","sections":[{"title":"Via the NeuronTrainer","local":"via-the-neurontrainer","sections":[],"depth":3},{"title":"Via the NeuronAccelerator","local":"via-the-neuronaccelerator","sections":[],"depth":3}],"depth":2},{"title":"How to enable Tensor Parallelism?","local":"how-to-enable-tensor-parallelism","sections":[{"title":"Via the NeuronTrainer","local":"via-the-neurontrainer","sections":[],"depth":3},{"title":"Via the NeuronAccelerator","local":"via-the-neuronaccelerator","sections":[],"depth":3}],"depth":2},{"title":"How to enable Pipeline Parallelism?","local":"how-to-enable-pipeline-parallelism","sections":[{"title":"Configuration Options","local":"configuration-options","sections":[],"depth":3},{"title":"Via the NeuronTrainer","local":"via-the-neurontrainer","sections":[],"depth":3},{"title":"Via the NeuronAccelerator","local":"via-the-neuronaccelerator","sections":[],"depth":3}],"depth":2},{"title":"Combining Parallelism Strategies","local":"combining-parallelism-strategies","sections":[{"title":"Via the NeuronTrainer","local":"via-the-neurontrainer","sections":[],"depth":3}],"depth":2},{"title":"Checkpoint consolidation","local":"checkpoint-consolidation","sections":[],"depth":2},{"title":"Best Practices","local":"best-practices","sections":[{"title":"Choosing Parallelism Strategy","local":"choosing-parallelism-strategy","sections":[],"depth":3},{"title":"Memory Optimization","local":"memory-optimization","sections":[],"depth":3}],"depth":2},{"title":"Troubleshooting","local":"troubleshooting","sections":[{"title":"Common Issues","local":"common-issues","sections":[],"depth":3},{"title":"Debugging Tips","local":"debugging-tips","sections":[],"depth":3}],"depth":2}],"depth":1}';function ci($){return ei(()=>{new 
URLSearchParams(window.location.search).get("fw")}),[]}class hi extends li{constructor(m){super(),ti(this,m,ci,pi,Kn,{})}}export{hi as component}; | |
Xet Storage Details
- Size: 58.9 kB
- Xet hash: 91b8bb84fa9cc41918d17f0dceea5826d876d95a9f55598b5afd78a2a2d3410e
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.