Buckets:

download
raw
58.9 kB
import{s as Kn,o as ei,n as pl}from"../chunks/scheduler.56725da7.js";import{S as li,i as ti,e as s,s as i,c as p,h as ni,a as r,d as t,b as a,f as Dn,g as y,j as M,k as qn,l as ii,m as n,n as c,t as U,o as T,p as u}from"../chunks/index.18a26576.js";import{T as ol}from"../chunks/Tip.5b941656.js";import{C as ai}from"../chunks/CopyLLMTxtMenu.4513c8ed.js";import{C as B}from"../chunks/CodeBlock.58e3e98b.js";import{H as J}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.049405bf.js";function si($){let m,b="All the training examples in the optimum-neuron repo use these parallelism features via the <code>NeuronTrainer</code>.";return{c(){m=s("p"),m.innerHTML=b},l(d){m=r(d,"P",{"data-svelte-h":!0}),M(m)!=="svelte-1s96mwn"&&(m.innerHTML=b)},m(d,w){n(d,m,w)},p:pl,d(d){d&&t(m)}}}function ri($){let m,b="Since the example scripts use the <code>NeuronTrainer</code>, you can enable ZeRO-1 when using them by adding the <code>--zero_1</code> flag to your command line.",d,w,g="For example:",f,h,C;return h=new B({props:{code:"dG9yY2hydW4lMjAtLW5wcm9jX3Blcl9ub2RlJTNEMiUyMGV4YW1wbGVzJTJGdHJhaW5pbmclMkZxd2VuMyUyRmZpbmV0dW5lX3F3ZW4zLnB5JTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1tb2RlbF9uYW1lX29yX3BhdGglMjBRd2VuJTJGUXdlbjIuNS0wLjVCJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1kYXRhc2V0X25hbWUlMjB3aWtpdGV4dCUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tZGF0YXNldF9jb25maWdfbmFtZSUyMHdpa2l0ZXh0LTItcmF3LXYxJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1kb190cmFpbiUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tcGVyX2RldmljZV90cmFpbl9iYXRjaF9zaXplJTIwMSUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tYmxvY2tfc2l6ZSUyMDEwMjQlMjAlNUMlMEElMjAlMjAlMjAlMjAtLWJmMTYlMjAlNUMlMEElMjAlMjAlMjAlMjAtLXplcm9fMSUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tdGVuc29yX3BhcmFsbGVsX3NpemUlMjAyJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1vdXRwdXRfZGlyJTIwbXlfdHJhaW5pbmclMkY=",highlighted:`torchrun --nproc_per_node=2 examples/training/qwen3/finetune_qwen3.py \\
--model_name_or_path Qwen/Qwen2.5-0.5B \\
--dataset_name wikitext \\
--dataset_config_name wikitext-2-raw-v1 \\
--do_train \\
--per_device_train_batch_size 1 \\
--block_size 1024 \\
--bf16 \\
--zero_1 \\
--tensor_parallel_size 2 \\
--output_dir my_training/`,wrap:!1}}),{c(){m=s("p"),m.innerHTML=b,d=i(),w=s("p"),w.textContent=g,f=i(),p(h.$$.fragment)},l(o){m=r(o,"P",{"data-svelte-h":!0}),M(m)!=="svelte-nmzr1t"&&(m.innerHTML=b),d=a(o),w=r(o,"P",{"data-svelte-h":!0}),M(w)!=="svelte-1gkqha7"&&(w.textContent=g),f=a(o),y(h.$$.fragment,o)},m(o,j){n(o,m,j),n(o,d,j),n(o,w,j),n(o,f,j),c(h,o,j),C=!0},p:pl,i(o){C||(U(h.$$.fragment,o),C=!0)},o(o){T(h.$$.fragment,o),C=!1},d(o){o&&(t(m),t(d),t(w),t(f)),u(h,o)}}}function Mi($){let m,b="Since the example scripts use the <code>NeuronTrainer</code>, you can enable Tensor Parallelism when using them by specifying the <code>--tensor_parallel_size</code> argument.",d,w,g="For example:",f,h,C;return h=new B({props:{code:"dG9yY2hydW4lMjAtLW5wcm9jX3Blcl9ub2RlJTNEOCUyMGV4YW1wbGVzJTJGdHJhaW5pbmclMkZxd2VuMyUyRmZpbmV0dW5lX3F3ZW4zLnB5JTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1tb2RlbF9uYW1lX29yX3BhdGglMjBRd2VuJTJGUXdlbjIuNS0wLjVCJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1kYXRhc2V0X25hbWUlMjB3aWtpdGV4dCUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tZGF0YXNldF9jb25maWdfbmFtZSUyMHdpa2l0ZXh0LTItcmF3LXYxJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1kb190cmFpbiUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tcGVyX2RldmljZV90cmFpbl9iYXRjaF9zaXplJTIwMSUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tYmxvY2tfc2l6ZSUyMDEwMjQlMjAlNUMlMEElMjAlMjAlMjAlMjAtLWJmMTYlMjAlNUMlMEElMjAlMjAlMjAlMjAtLXRlbnNvcl9wYXJhbGxlbF9zaXplJTIwOCUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tb3V0cHV0X2RpciUyMG15X3RyYWluaW5nJTJG",highlighted:`torchrun --nproc_per_node=8 examples/training/qwen3/finetune_qwen3.py \\
--model_name_or_path Qwen/Qwen2.5-0.5B \\
--dataset_name wikitext \\
--dataset_config_name wikitext-2-raw-v1 \\
--do_train \\
--per_device_train_batch_size 1 \\
--block_size 1024 \\
--bf16 \\
--tensor_parallel_size 8 \\
--output_dir my_training/`,wrap:!1}}),{c(){m=s("p"),m.innerHTML=b,d=i(),w=s("p"),w.textContent=g,f=i(),p(h.$$.fragment)},l(o){m=r(o,"P",{"data-svelte-h":!0}),M(m)!=="svelte-7fhmhn"&&(m.innerHTML=b),d=a(o),w=r(o,"P",{"data-svelte-h":!0}),M(w)!=="svelte-1gkqha7"&&(w.textContent=g),f=a(o),y(h.$$.fragment,o)},m(o,j){n(o,m,j),n(o,d,j),n(o,w,j),n(o,f,j),c(h,o,j),C=!0},p:pl,i(o){C||(U(h.$$.fragment,o),C=!0)},o(o){T(h.$$.fragment,o),C=!1},d(o){o&&(t(m),t(d),t(w),t(f)),u(h,o)}}}function mi($){let m,b="When using pipeline parallelism, the total number of processes should be at least <code>tensor_parallel_size * pipeline_parallel_size</code>. For example, with <code>tensor_parallel_size=2</code> and <code>pipeline_parallel_size=4</code>, you need 8 processes total.";return{c(){m=s("p"),m.innerHTML=b},l(d){m=r(d,"P",{"data-svelte-h":!0}),M(m)!=="svelte-1ypidsl"&&(m.innerHTML=b)},m(d,w){n(d,m,w)},p:pl,d(d){d&&t(m)}}}function oi($){let m,b="The sharded checkpoints are saved under a directory called <code>shards</code>. The <code>optimum-cli neuron consolidate</code> command accepts as input both a directory that contains a <code>shards</code> directory, or the <code>shards</code> directory itself.";return{c(){m=s("p"),m.innerHTML=b},l(d){m=r(d,"P",{"data-svelte-h":!0}),M(m)!=="svelte-1sydsko"&&(m.innerHTML=b)},m(d,w){n(d,m,w)},p:pl,d(d){d&&t(m)}}}function pi($){let m,b,d,w,g,f,h,C,o,j="AWS Trainium instances provide powerful infrastructure for training large language models at scale. A <code>trn1.32xlarge</code> instance contains 16 Neuron devices with 32 cores total, offering 512GB of memory (16GB per core).",yl,V,Pt="However, training large models presents a fundamental challenge: by default, each Neuron core operates as an independent data-parallel worker, requiring the entire model, gradients, and optimizer state (approximately 4× the model size) to fit within a single core’s 16GB memory limit, with additional space needed for activations.",cl,G,Ot="For models that exceed these memory constraints, <code>optimum-neuron</code> provides sophisticated parallelism strategies that distribute computation and memory across multiple devices, enabling you to train models that would be impossible to fit on individual cores:",Ul,F,Tl,k,ul,R,Dt='<a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/tutorials/training/zero1_gpt2.html" rel="nofollow">ZeRO-1</a> is an optimizer-level optimization that reduces memory usage without changing your model architecture.',dl,X,qt="<strong>How it works</strong>: Shards the optimizer state (gradients, momentum, variance) across data-parallel ranks instead of replicating it on each device.",wl,N,Kt="<strong>Memory savings</strong>: Reduces optimizer memory usage by <code>1/data_parellel_size</code>.",Jl,H,en="<strong>When to use</strong>: Always beneficial when training with multiple devices, regardless of model size.",hl,A,bl,E,ln='<a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/tensor_parallelism_overview.html" rel="nofollow">Tensor Parallelism</a> splits individual model layers across multiple devices.',fl,Q,tn="<strong>How it works</strong>: Shards matrix multiplications (linear layers, attention) along rows or columns across devices. Each device computes part of each layer, requiring communication between devices for each forward/backward pass.",Cl,z,nn="<strong>Memory savings</strong>: Reduces model parameter memory by <code>1/tensor_parallel_size</code>.",jl,Y,an="<strong>When to use</strong>: When your model is too large to fit on a single device, even after applying ZeRO-1.",gl,x,sn="<strong>Typical deployment</strong>: Usually applied within a single node (intra-node) due to high communication requirements.",$l,S,rn="<strong>Trade-offs</strong>: Increases communication overhead between devices, which can slow down training if overused.",Bl,L,Il,P,Mn='<a href="https://arxiv.org/pdf/2205.05198.pdf" rel="nofollow">Sequence parallelism</a> is an optimization that works alongside Tensor Parallelism to further reduce memory usage.',vl,O,mn="<strong>How it works</strong>: Shards activations along the sequence dimension in regions where tensors are not already sharded by tensor parallelism.",_l,D,on="<strong>Memory savings</strong>: Reduces activation memory proportional to sequence length, especially beneficial for long sequences.",Wl,q,pn="<strong>When to use</strong>: Always enable when using tensor parallelism - it provides additional memory savings with minimal overhead.",Zl,K,yn="<strong>Requirement</strong>: Only works in combination with tensor parallelism.",Vl,ee,Gl,le,cn='<a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/pipeline_parallelism_overview.html" rel="nofollow">Pipeline Parallelism</a> splits model layers across different devices.',Fl,te,Un="<strong>How it works</strong>: Divides your model into stages, with each stage containing consecutive layers running on different devices. Uses microbatching to keep all devices busy.",kl,ne,Tn="<strong>Memory savings</strong>: Reduces model parameter memory by <code>1/pipeline_parallel_size</code>.",Rl,ie,un="<strong>When to use</strong>: For very large models that don’t fit even with tensor parallelism, or when you want to scale across many devices with less communication overhead than tensor parallelism.",Xl,ae,dn="<strong>Typical deployment</strong>: Usually applied across multiple nodes (inter-node) to scale to larger numbers of devices while minimizing high-bandwidth communication requirements.",Nl,se,wn="<strong>Trade-offs</strong>: Introduces pipeline bubbles (idle time) and requires careful tuning of microbatch sizes.",Hl,re,Jn="The good news is that it is possible to combine those techniques, and <code>optimum-neuron</code> makes it very easy!",Al,I,El,Me,Ql,me,hn="ZeRO-1 can be enabled either through the <code>NeuronTrainer</code> or directly with the <code>NeuronAccelerator</code>.",zl,oe,Yl,pe,xl,v,Sl,ye,Ll,ce,bn="When using the <code>NeuronAccelerator</code> directly, you need to create a <code>TrainingNeuronConfig</code> and enable ZeRO-1 separately:",Pl,Ue,Ol,Te,Dl,ue,fn="Tensor Parallelism can be used with either the <code>NeuronTrainer</code> or <code>NeuronAccelerator</code>.",ql,de,Cn="<strong>Important</strong>: Tensor parallelism requires models that have a custom modeling implementation in <code>optimum.neuron.models.training</code>.",Kl,we,jn="When doing Tensor Parallelism, you have several important settings:",et,Je,gn='<li>The <code>tensor_parallel_size</code>: Ideally it should be the smallest value for which the model fits in memory.</li> <li>Whether sequence parallelism should be enabled: <a href="https://arxiv.org/pdf/2205.05198.pdf" rel="nofollow">Sequence parallelism</a> shards the activations on the sequence axis outside of the tensor parallel regions, saving memory by sharding the activations.</li>',lt,he,$n="When using distributed training, the training script is called by <code>torchrun</code>, which will dispatch it to workers, one worker per core. Each worker will load the sharded model and dispatch the parameters automatically across the cores. The <code>tensor_parallel_size</code> is the number of workers to shard the model parameters on.",tt,be,nt,fe,it,_,at,Ce,st,je,Bn="When using the <code>NeuronAccelerator</code> directly, you configure tensor parallelism through the <code>TrainingNeuronConfig</code>:",rt,ge,Mt,$e,mt,Be,In="Pipeline Parallelism allows you to split your model layers across multiple devices, enabling training of very large models that wouldn’t fit on a single device, or even a signle node.",ot,Ie,vn="<strong>Important</strong>: Pipeline parallelism requires models that have a custom modeling implementation in <code>optimum.neuron.models.training</code> and declare <code>SUPPORTS_PIPELINE_PARALLELISM = True</code>.",pt,ve,yt,_e,_n="Pipeline parallelism has several configuration parameters:",ct,We,Wn="<li><code>pipeline_parallel_size</code>: Number of pipeline stages (devices to split layers across)</li> <li><code>pipeline_parallel_num_microbatches</code>: Number of microbatches for pipeline scheduling</li> <li>When pipeline parallelism is enabled, ZeRO-1 can be automatically applied to the pipeline parallel optimizer</li>",Ut,Ze,Tt,Ve,ut,Ge,dt,Fe,wt,W,Jt,ke,ht,Re,Zn="You can combine multiple parallelism strategies for maximum memory efficiency and performance. Here’s an example with all strategies combined:",bt,Xe,ft,Ne,Ct,He,Vn="This configuration uses 4 * 2 = 8 total processes:",jt,Ae,Gn="<li>Each tensor parallel group has 4 processes</li> <li>Each pipeline stage runs on one tensor parallel group</li>",gt,Ee,Fn="We can then run the training script on the <code>trn1.32xlarge</code> instance with 32 Neuron cores, resulting in the following configuration: <code>dp=4, tp=4, pp=2</code>, which means 4 data-parallel groups, each with 4 tensor-parallel devices, and 2 pipeline stages.",$t,Qe,Bt,ze,kn="Since distributed training uses sharded checkpoints across different workers, you need to consolidate them to create a standard model checkpoint that can be shared and used outside of the specific training configuration.",It,Ye,Rn="The Optimum CLI provides a way of doing that very easily via the <code>optimum neuron consolidate</code> command:",vt,xe,_t,Se,Xn=`All you need to do is specify the sharded checkpoints directory and the output directory that will contain the consolidated checkpoints, and the command takes care of the rest.
It is also possible to specify the output format of the consolidated checkpoints. By default it will export them to the <code>safetensors</code> format, which is the recommended format to use.`,Wt,Le,Nn="Example:",Zt,Pe,Hn="Training with distributed parallelism just completed and the output dir is called <code>my_training</code>. The directory looks like the following:",Vt,Oe,Gt,De,An="You can consolidate the sharded checkpoints in <code>my_training/shards</code>, which correspond to the sharded checkpoints saved at the end of training, by running the following command:",Ft,qe,kt,Z,Rt,Ke,Xt,el,Nt,ll,En="<li><strong>Start with Tensor Parallelism</strong>: Use the smallest <code>tensor_parallel_size</code> that fits your model in memory</li> <li><strong>Add Pipeline Parallelism</strong>: For very large models, combine with pipeline parallelism</li> <li><strong>Enable Sequence Parallelism</strong>: Always enable when using tensor parallelism for memory savings (set <code>disable_sequence_parallel=False</code>)</li> <li><strong>Use ZeRO-1</strong>: Combine with any parallelism strategy for optimizer memory savings</li>",Ht,tl,At,nl,Qn="<li>Enable <code>gradient_checkpointing</code> for large models</li> <li>Set appropriate <code>pipeline_parallel_num_microbatches</code> for pipeline parallelism</li>",Et,il,Qt,al,zt,sl,zn="<li><strong>Out of Memory</strong>: Reduce batch size, increase parallelism, or enable gradient checkpointing</li> <li><strong>Model Not Supported</strong>: Ensure you’re using a model from <code>optimum.neuron.models.training</code></li> <li><strong>Pipeline Parallelism Fails</strong>: Check that the model supports pipeline parallelism</li> <li><strong>Incorrect Process Count</strong>: Ensure <code>nproc_per_node</code> matches your parallelism configuration</li>",Yt,rl,xt,Ml,Yn="<li>Start with smaller models and parallelism sizes</li> <li>Check that all processes can communicate properly</li> <li>Verify checkpoint directories and permissions</li> <li>Monitor Neuron device utilization</li>",St,ml,Lt;return g=new ai({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),h=new J({props:{title:"Distributed Training with optimum-neuron",local:"distributed-training-with-optimum-neuron",headingTag:"h1"}}),F=new J({props:{title:"Parallelism Strategies Overview",local:"parallelism-strategies-overview",headingTag:"h2"}}),k=new J({props:{title:"1. ZeRO-1 (Optimizer State Sharding)",local:"1-zero-1-optimizer-state-sharding",headingTag:"h3"}}),A=new J({props:{title:"2. Tensor Parallelism (Intra-layer Model Parallelism)",local:"2-tensor-parallelism-intra-layer-model-parallelism",headingTag:"h3"}}),L=new J({props:{title:"3. Sequence Parallelism (Activation Sharding)",local:"3-sequence-parallelism-activation-sharding",headingTag:"h3"}}),ee=new J({props:{title:"4. Pipeline Parallelism (Inter-layer Model Parallelism)",local:"4-pipeline-parallelism-inter-layer-model-parallelism",headingTag:"h3"}}),I=new ol({props:{$$slots:{default:[si]},$$scope:{ctx:$}}}),Me=new J({props:{title:"How to enable ZeRO-1?",local:"how-to-enable-zero-1",headingTag:"h2"}}),oe=new J({props:{title:"Via the NeuronTrainer",local:"via-the-neurontrainer",headingTag:"h3"}}),pe=new B({props:{code:"ZnJvbSUyMG9wdGltdW0ubmV1cm9uJTIwaW1wb3J0JTIwTmV1cm9uVHJhaW5pbmdBcmd1bWVudHMlMkMlMjBOZXVyb25UcmFpbmVyJTBBJTBBJTIzJTIwRW5hYmxlJTIwWmVSTy0xJTIwaW4lMjB0aGUlMjB0cmFpbmluZyUyMGFyZ3VtZW50cyUwQXRyYWluaW5nX2FyZ3MlMjAlM0QlMjBOZXVyb25UcmFpbmluZ0FyZ3VtZW50cyglMEElMjAlMjAlMjAlMjBvdXRwdXRfZGlyJTNEJTIyLiUyRm91dHB1dCUyMiUyQyUwQSUyMCUyMCUyMCUyMHBlcl9kZXZpY2VfdHJhaW5fYmF0Y2hfc2l6ZSUzRDElMkMlMEElMjAlMjAlMjAlMjB6ZXJvXzElM0RUcnVlJTJDJTIwJTIwJTIzJTIwRW5hYmxlJTIwWmVSTy0xJTBBJTIwJTIwJTIwJTIwYmYxNiUzRFRydWUlMkMlMEElMjAlMjAlMjAlMjAlMjMlMjAuLi4lMjBvdGhlciUyMHRyYWluaW5nJTIwYXJndW1lbnRzJTBBKSUwQSUwQXRyYWluZXIlMjAlM0QlMjBOZXVyb25UcmFpbmVyKCUwQSUyMCUyMCUyMCUyMG1vZGVsJTNEbW9kZWwlMkMlMEElMjAlMjAlMjAlMjBhcmdzJTNEdHJhaW5pbmdfYXJncyUyQyUwQSUyMCUyMCUyMCUyMHRyYWluX2RhdGFzZXQlM0R0cmFpbl9kYXRhc2V0JTJDJTBBJTIwJTIwJTIwJTIwZXZhbF9kYXRhc2V0JTNEZXZhbF9kYXRhc2V0JTJDJTBBKSUwQSUwQXRyYWluZXIudHJhaW4oKQ==",highlighted:`<span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronTrainingArguments, NeuronTrainer
<span class="hljs-comment"># Enable ZeRO-1 in the training arguments</span>
training_args = NeuronTrainingArguments(
output_dir=<span class="hljs-string">&quot;./output&quot;</span>,
per_device_train_batch_size=<span class="hljs-number">1</span>,
zero_1=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Enable ZeRO-1</span>
bf16=<span class="hljs-literal">True</span>,
<span class="hljs-comment"># ... other training arguments</span>
)
trainer = NeuronTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
)
trainer.train()`,wrap:!1}}),v=new ol({props:{$$slots:{default:[ri]},$$scope:{ctx:$}}}),ye=new J({props:{title:"Via the NeuronAccelerator",local:"via-the-neuronaccelerator",headingTag:"h3"}}),Ue=new B({props:{code:"ZnJvbSUyMHRvcmNoLm9wdGltJTIwaW1wb3J0JTIwQWRhbVclMEFmcm9tJTIwb3B0aW11bS5uZXVyb24lMjBpbXBvcnQlMjBOZXVyb25BY2NlbGVyYXRvciUwQWZyb20lMjBvcHRpbXVtLm5ldXJvbi5tb2RlbHMudHJhaW5pbmcuY29uZmlnJTIwaW1wb3J0JTIwVHJhaW5pbmdOZXVyb25Db25maWclMEElMEElMjMlMjBDcmVhdGUlMjB0aGUlMjB0cmFpbmluZyUyMGNvbmZpZ3VyYXRpb24lMEF0cm5fY29uZmlnJTIwJTNEJTIwVHJhaW5pbmdOZXVyb25Db25maWcoKSUwQSUwQSUyMyUyMENyZWF0ZSUyMGFjY2VsZXJhdG9yJTIwd2l0aCUyMFplUk8tMSUyMGVuYWJsZWQlMEFhY2NlbGVyYXRvciUyMCUzRCUyME5ldXJvbkFjY2VsZXJhdG9yKCUwQSUyMCUyMCUyMCUyMHRybl9jb25maWclM0R0cm5fY29uZmlnJTJDJTBBJTIwJTIwJTIwJTIwemVyb18xJTNEVHJ1ZSUyQyUyMCUyMCUyMyUyMEVuYWJsZSUyMFplUk8tMSUwQSUyMCUyMCUyMCUyMG1peGVkX3ByZWNpc2lvbiUzRCUyMmJmMTYlMjIlMkMlMEEpJTBBJTBBbW9kZWwlMjAlM0QlMjAuLi4lMjAlMjAlMjMlMjBZb3VyJTIwbW9kZWwlMjBpbnN0YW5jZSUwQW9wdGltaXplciUyMCUzRCUyMEFkYW1XKG1vZGVsLnBhcmFtZXRlcnMoKSUyQyUyMGxyJTNENWUtNSklMEElMEElMjMlMjBQcmVwYXJlJTIwbW9kZWwlMjBhbmQlMjBvcHRpbWl6ZXIlMEFtb2RlbCUyQyUyMG9wdGltaXplciUyMCUzRCUyMGFjY2VsZXJhdG9yLnByZXBhcmUobW9kZWwlMkMlMjBvcHRpbWl6ZXIp",highlighted:`<span class="hljs-keyword">from</span> torch.optim <span class="hljs-keyword">import</span> AdamW
<span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronAccelerator
<span class="hljs-keyword">from</span> optimum.neuron.models.training.config <span class="hljs-keyword">import</span> TrainingNeuronConfig
<span class="hljs-comment"># Create the training configuration</span>
trn_config = TrainingNeuronConfig()
<span class="hljs-comment"># Create accelerator with ZeRO-1 enabled</span>
accelerator = NeuronAccelerator(
trn_config=trn_config,
zero_1=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Enable ZeRO-1</span>
mixed_precision=<span class="hljs-string">&quot;bf16&quot;</span>,
)
model = ... <span class="hljs-comment"># Your model instance</span>
optimizer = AdamW(model.parameters(), lr=<span class="hljs-number">5e-5</span>)
<span class="hljs-comment"># Prepare model and optimizer</span>
model, optimizer = accelerator.prepare(model, optimizer)`,wrap:!1}}),Te=new J({props:{title:"How to enable Tensor Parallelism?",local:"how-to-enable-tensor-parallelism",headingTag:"h2"}}),be=new J({props:{title:"Via the NeuronTrainer",local:"via-the-neurontrainer",headingTag:"h3"}}),fe=new B({props:{code:"ZnJvbSUyMG9wdGltdW0ubmV1cm9uJTIwaW1wb3J0JTIwTmV1cm9uVHJhaW5pbmdBcmd1bWVudHMlMkMlMjBOZXVyb25UcmFpbmVyJTBBJTBBJTIzJTIwQ29uZmlndXJlJTIwdGVuc29yJTIwcGFyYWxsZWxpc20lMjBpbiUyMHRyYWluaW5nJTIwYXJndW1lbnRzJTBBdHJhaW5pbmdfYXJncyUyMCUzRCUyME5ldXJvblRyYWluaW5nQXJndW1lbnRzKCUwQSUyMCUyMCUyMCUyMG91dHB1dF9kaXIlM0QlMjIuJTJGb3V0cHV0JTIyJTJDJTBBJTIwJTIwJTIwJTIwcGVyX2RldmljZV90cmFpbl9iYXRjaF9zaXplJTNEMSUyQyUwQSUyMCUyMCUyMCUyMGJmMTYlM0RUcnVlJTJDJTBBJTIwJTIwJTIwJTIwdGVuc29yX3BhcmFsbGVsX3NpemUlM0Q4JTJDJTBBJTIwJTIwJTIwJTIwJTIzJTIwLi4uJTIwb3RoZXIlMjB0cmFpbmluZyUyMGFyZ3VtZW50cyUwQSklMEElMEF0cmFpbmVyJTIwJTNEJTIwTmV1cm9uVHJhaW5lciglMEElMjAlMjAlMjAlMjBtb2RlbCUzRG1vZGVsJTJDJTBBJTIwJTIwJTIwJTIwYXJncyUzRHRyYWluaW5nX2FyZ3MlMkMlMEElMjAlMjAlMjAlMjB0cmFpbl9kYXRhc2V0JTNEdHJhaW5fZGF0YXNldCUyQyUwQSUyMCUyMCUyMCUyMGV2YWxfZGF0YXNldCUzRGV2YWxfZGF0YXNldCUyQyUwQSklMEElMEF0cmFpbmVyLnRyYWluKCk=",highlighted:`<span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronTrainingArguments, NeuronTrainer
<span class="hljs-comment"># Configure tensor parallelism in training arguments</span>
training_args = NeuronTrainingArguments(
output_dir=<span class="hljs-string">&quot;./output&quot;</span>,
per_device_train_batch_size=<span class="hljs-number">1</span>,
bf16=<span class="hljs-literal">True</span>,
tensor_parallel_size=<span class="hljs-number">8</span>,
<span class="hljs-comment"># ... other training arguments</span>
)
trainer = NeuronTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
)
trainer.train()`,wrap:!1}}),_=new ol({props:{$$slots:{default:[Mi]},$$scope:{ctx:$}}}),Ce=new J({props:{title:"Via the NeuronAccelerator",local:"via-the-neuronaccelerator",headingTag:"h3"}}),ge=new B({props:{code:"ZnJvbSUyMHRvcmNoLm9wdGltJTIwaW1wb3J0JTIwQWRhbVclMEFmcm9tJTIwb3B0aW11bS5uZXVyb24lMjBpbXBvcnQlMjBOZXVyb25BY2NlbGVyYXRvciUwQWZyb20lMjBvcHRpbXVtLm5ldXJvbi5tb2RlbHMudHJhaW5pbmcuY29uZmlnJTIwaW1wb3J0JTIwVHJhaW5pbmdOZXVyb25Db25maWclMEElMEElMjMlMjBDb25maWd1cmUlMjB0ZW5zb3IlMjBwYXJhbGxlbGlzbSUwQXRybl9jb25maWclMjAlM0QlMjBUcmFpbmluZ05ldXJvbkNvbmZpZyglMEElMjAlMjAlMjAlMjB0ZW5zb3JfcGFyYWxsZWxfc2l6ZSUzRDglMkMlMEElMjAlMjAlMjAlMjBzZXF1ZW5jZV9wYXJhbGxlbF9lbmFibGVkJTNEVHJ1ZSUyQyUwQSUyMCUyMCUyMCUyMGNoZWNrcG9pbnRfZGlyJTNETm9uZSUyQyUyMCUyMCUyMyUyMENhbiUyMGJlJTIwc3BlY2lmaWVkJTIwd2hlbiUyMHJlc3VtaW5nJTIwZnJvbSUyMGNoZWNrcG9pbnQlMEEpJTBBJTBBYWNjZWxlcmF0b3IlMjAlM0QlMjBOZXVyb25BY2NlbGVyYXRvciglMEElMjAlMjAlMjAlMjB0cm5fY29uZmlnJTNEdHJuX2NvbmZpZyUyQyUwQSUyMCUyMCUyMCUyMG1peGVkX3ByZWNpc2lvbiUzRCUyMmJmMTYlMjIlMkMlMEEpJTBBJTBBbW9kZWwlMjAlM0QlMjAuLi4lMjAlMjAlMjMlMjBZb3VyJTIwbW9kZWwlMjBpbnN0YW5jZSUwQW9wdGltaXplciUyMCUzRCUyMEFkYW1XKG1vZGVsLnBhcmFtZXRlcnMoKSUyQyUyMGxyJTNENWUtNSklMEElMEFtb2RlbCUyQyUyMG9wdGltaXplciUyMCUzRCUyMGFjY2VsZXJhdG9yLnByZXBhcmUobW9kZWwlMkMlMjBvcHRpbWl6ZXIp",highlighted:`<span class="hljs-keyword">from</span> torch.optim <span class="hljs-keyword">import</span> AdamW
<span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronAccelerator
<span class="hljs-keyword">from</span> optimum.neuron.models.training.config <span class="hljs-keyword">import</span> TrainingNeuronConfig
<span class="hljs-comment"># Configure tensor parallelism</span>
trn_config = TrainingNeuronConfig(
tensor_parallel_size=<span class="hljs-number">8</span>,
sequence_parallel_enabled=<span class="hljs-literal">True</span>,
checkpoint_dir=<span class="hljs-literal">None</span>, <span class="hljs-comment"># Can be specified when resuming from checkpoint</span>
)
accelerator = NeuronAccelerator(
trn_config=trn_config,
mixed_precision=<span class="hljs-string">&quot;bf16&quot;</span>,
)
model = ... <span class="hljs-comment"># Your model instance</span>
optimizer = AdamW(model.parameters(), lr=<span class="hljs-number">5e-5</span>)
model, optimizer = accelerator.prepare(model, optimizer)`,wrap:!1}}),$e=new J({props:{title:"How to enable Pipeline Parallelism?",local:"how-to-enable-pipeline-parallelism",headingTag:"h2"}}),ve=new J({props:{title:"Configuration Options",local:"configuration-options",headingTag:"h3"}}),Ze=new J({props:{title:"Via the NeuronTrainer",local:"via-the-neurontrainer",headingTag:"h3"}}),Ve=new B({props:{code:"ZnJvbSUyMG9wdGltdW0ubmV1cm9uJTIwaW1wb3J0JTIwTmV1cm9uVHJhaW5pbmdBcmd1bWVudHMlMkMlMjBOZXVyb25UcmFpbmVyJTBBZnJvbSUyMG9wdGltdW0ubmV1cm9uLm1vZGVscy50cmFpbmluZyUyMGltcG9ydCUyMExsYW1hRm9yQ2F1c2FsTE0lMjAlMjAlMjMlMjBDdXN0b20lMjBtb2RlbCUyMGltcGxlbWVudGF0aW9uJTBBJTBBJTIzJTIwQ29uZmlndXJlJTIwcGlwZWxpbmUlMjBwYXJhbGxlbGlzbSUyMGluJTIwdHJhaW5pbmclMjBhcmd1bWVudHMlMEF0cmFpbmluZ19hcmdzJTIwJTNEJTIwTmV1cm9uVHJhaW5pbmdBcmd1bWVudHMoJTBBJTIwJTIwJTIwJTIwb3V0cHV0X2RpciUzRCUyMi4lMkZvdXRwdXQlMjIlMkMlMEElMjAlMjAlMjAlMjBwZXJfZGV2aWNlX3RyYWluX2JhdGNoX3NpemUlM0Q0JTJDJTIwJTIwJTIzJTIwV2lsbCUyMGJlJTIwc3BsaXQlMjBpbnRvJTIwbWljcm9iYXRjaGVzJTBBJTIwJTIwJTIwJTIwYmYxNiUzRFRydWUlMkMlMEElMjAlMjAlMjAlMjB0ZW5zb3JfcGFyYWxsZWxfc2l6ZSUzRDIlMkMlMEElMjAlMjAlMjAlMjBwaXBlbGluZV9wYXJhbGxlbF9zaXplJTNENCUyQyUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMyUyMFNwbGl0JTIwbW9kZWwlMjBhY3Jvc3MlMjA0JTIwcGlwZWxpbmUlMjBzdGFnZXMlMEElMjAlMjAlMjAlMjBwaXBlbGluZV9wYXJhbGxlbF9udW1fbWljcm9iYXRjaGVzJTNENCUyQyUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMyUyME51bWJlciUyMG9mJTIwbWljcm9iYXRjaGVzJTBBJTIwJTIwJTIwJTIwemVyb18xJTNEVHJ1ZSUyQyUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMyUyMEVuYWJsZSUyMFplUk8tMSUyMHdpdGglMjBwaXBlbGluZSUyMHBhcmFsbGVsaXNtJTBBJTIwJTIwJTIwJTIwJTIzJTIwLi4uJTIwb3RoZXIlMjB0cmFpbmluZyUyMGFyZ3VtZW50cyUwQSklMEElMEElMjMlMjBMb2FkJTIwbW9kZWwlMjB1c2luZyUyMGN1c3RvbSUyMGltcGxlbWVudGF0aW9uJTIwLSUyMG11c3QlMjBiZSUyMGRvbmUlMjB3aXRoJTIwdGhlJTIwbW9kZWwlMjBjbGFzcyUyMGRpcmVjdGx5JTBBbW9kZWwlMjAlM0QlMjBMbGFtYUZvckNhdXNhbExNLmZyb21fcHJldHJhaW5lZCglMEElMjAlMjAlMjAlMjAlMjJtZXRhLWxsYW1hJTJGTGxhbWEtMy4yLTNCJTIyJTJDJTBBJTIwJTIwJTIwJTIwdHJuX2NvbmZpZyUzRHRyYWluaW5nX2FyZ3MudHJuX2NvbmZpZyUyMCUyMCUyMyUyMFBhc3MlMjB0aGUlMjBhdXRvLWdlbmVyYXRlZCUyMHRybl9jb25maWclMEEpJTBBJTBBdHJhaW5lciUyMCUzRCUyME5ldXJvblRyYWluZXIoJTBBJTIwJTIwJTIwJTIwbW9kZWwlM0Rtb2RlbCUyQyUwQSUyMCUyMCUyMCUyMGFyZ3MlM0R0cmFpbmluZ19hcmdzJTJDJTBBJTIwJTIwJTIwJTIwdHJhaW5fZGF0YXNldCUzRHRyYWluX2RhdGFzZXQlMkMlMEElMjAlMjAlMjAlMjBldmFsX2RhdGFzZXQlM0RldmFsX2RhdGFzZXQlMkMlMEEpJTBBJTBBdHJhaW5lci50cmFpbigp",highlighted:`<span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronTrainingArguments, NeuronTrainer
<span class="hljs-keyword">from</span> optimum.neuron.models.training <span class="hljs-keyword">import</span> LlamaForCausalLM <span class="hljs-comment"># Custom model implementation</span>
<span class="hljs-comment"># Configure pipeline parallelism in training arguments</span>
training_args = NeuronTrainingArguments(
output_dir=<span class="hljs-string">&quot;./output&quot;</span>,
per_device_train_batch_size=<span class="hljs-number">4</span>, <span class="hljs-comment"># Will be split into microbatches</span>
bf16=<span class="hljs-literal">True</span>,
tensor_parallel_size=<span class="hljs-number">2</span>,
pipeline_parallel_size=<span class="hljs-number">4</span>, <span class="hljs-comment"># Split model across 4 pipeline stages</span>
pipeline_parallel_num_microbatches=<span class="hljs-number">4</span>, <span class="hljs-comment"># Number of microbatches</span>
zero_1=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Enable ZeRO-1 with pipeline parallelism</span>
<span class="hljs-comment"># ... other training arguments</span>
)
<span class="hljs-comment"># Load model using custom implementation - must be done with the model class directly</span>
model = LlamaForCausalLM.from_pretrained(
<span class="hljs-string">&quot;meta-llama/Llama-3.2-3B&quot;</span>,
trn_config=training_args.trn_config <span class="hljs-comment"># Pass the auto-generated trn_config</span>
)
trainer = NeuronTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
)
trainer.train()`,wrap:!1}}),Ge=new J({props:{title:"Via the NeuronAccelerator",local:"via-the-neuronaccelerator",headingTag:"h3"}}),Fe=new B({props:{code:"ZnJvbSUyMG9wdGltdW0ubmV1cm9uJTIwaW1wb3J0JTIwTmV1cm9uQWNjZWxlcmF0b3IlMEFmcm9tJTIwb3B0aW11bS5uZXVyb24ubW9kZWxzLnRyYWluaW5nLmNvbmZpZyUyMGltcG9ydCUyMFRyYWluaW5nTmV1cm9uQ29uZmlnJTBBZnJvbSUyMG9wdGltdW0ubmV1cm9uLm1vZGVscy50cmFpbmluZyUyMGltcG9ydCUyMExsYW1hRm9yQ2F1c2FsTE0lMEFmcm9tJTIwdG9yY2gub3B0aW0lMjBpbXBvcnQlMjBBZGFtVyUwQSUwQSUyMyUyMENvbmZpZ3VyZSUyMGNvbWJpbmVkJTIwcGFyYWxsZWxpc20lMjBzdHJhdGVnaWVzJTBBdHJuX2NvbmZpZyUyMCUzRCUyMFRyYWluaW5nTmV1cm9uQ29uZmlnKCUwQSUyMCUyMCUyMCUyMHRlbnNvcl9wYXJhbGxlbF9zaXplJTNEMiUyQyUwQSUyMCUyMCUyMCUyMHBpcGVsaW5lX3BhcmFsbGVsX3NpemUlM0Q0JTJDJTBBJTIwJTIwJTIwJTIwcGlwZWxpbmVfcGFyYWxsZWxfbnVtX21pY3JvYmF0Y2hlcyUzRDQlMkMlMEElMjAlMjAlMjAlMjBzZXF1ZW5jZV9wYXJhbGxlbF9lbmFibGVkJTNEVHJ1ZSUyQyUwQSklMEElMEFhY2NlbGVyYXRvciUyMCUzRCUyME5ldXJvbkFjY2VsZXJhdG9yKCUwQSUyMCUyMCUyMCUyMHRybl9jb25maWclM0R0cm5fY29uZmlnJTJDJTBBJTIwJTIwJTIwJTIwemVyb18xJTNEVHJ1ZSUyQyUyMCUyMCUyMyUyMENhbiUyMGNvbWJpbmUlMjB3aXRoJTIwWmVSTy0xJTBBJTIwJTIwJTIwJTIwbWl4ZWRfcHJlY2lzaW9uJTNEJTIyYmYxNiUyMiUyQyUwQSklMEElMEElMjMlMjBMb2FkJTIwbW9kZWwlMjB3aXRoJTIwY3VzdG9tJTIwaW1wbGVtZW50YXRpb24lMEFtb2RlbCUyMCUzRCUyMExsYW1hRm9yQ2F1c2FsTE0uZnJvbV9wcmV0cmFpbmVkKCUwQSUyMCUyMCUyMCUyMCUyMm1ldGEtbGxhbWElMkZMbGFtYS0zLjItM0IlMjIlMkMlMEElMjAlMjAlMjAlMjB0cm5fY29uZmlnJTNEdHJuX2NvbmZpZyUwQSklMEElMEFvcHRpbWl6ZXIlMjAlM0QlMjBBZGFtVyhtb2RlbC5wYXJhbWV0ZXJzKCklMkMlMjBsciUzRDVlLTUpJTBBbW9kZWwlMkMlMjBvcHRpbWl6ZXIlMjAlM0QlMjBhY2NlbGVyYXRvci5wcmVwYXJlKG1vZGVsJTJDJTIwb3B0aW1pemVyKQ==",highlighted:`<span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronAccelerator
<span class="hljs-keyword">from</span> optimum.neuron.models.training.config <span class="hljs-keyword">import</span> TrainingNeuronConfig
<span class="hljs-keyword">from</span> optimum.neuron.models.training <span class="hljs-keyword">import</span> LlamaForCausalLM
<span class="hljs-keyword">from</span> torch.optim <span class="hljs-keyword">import</span> AdamW
<span class="hljs-comment"># Configure combined parallelism strategies</span>
trn_config = TrainingNeuronConfig(
tensor_parallel_size=<span class="hljs-number">2</span>,
pipeline_parallel_size=<span class="hljs-number">4</span>,
pipeline_parallel_num_microbatches=<span class="hljs-number">4</span>,
sequence_parallel_enabled=<span class="hljs-literal">True</span>,
)
accelerator = NeuronAccelerator(
trn_config=trn_config,
zero_1=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Can combine with ZeRO-1</span>
mixed_precision=<span class="hljs-string">&quot;bf16&quot;</span>,
)
<span class="hljs-comment"># Load model with custom implementation</span>
model = LlamaForCausalLM.from_pretrained(
<span class="hljs-string">&quot;meta-llama/Llama-3.2-3B&quot;</span>,
trn_config=trn_config
)
optimizer = AdamW(model.parameters(), lr=<span class="hljs-number">5e-5</span>)
model, optimizer = accelerator.prepare(model, optimizer)`,wrap:!1}}),W=new ol({props:{$$slots:{default:[mi]},$$scope:{ctx:$}}}),ke=new J({props:{title:"Combining Parallelism Strategies",local:"combining-parallelism-strategies",headingTag:"h2"}}),Xe=new J({props:{title:"Via the NeuronTrainer",local:"via-the-neurontrainer",headingTag:"h3"}}),Ne=new B({props:{code:"ZnJvbSUyMG9wdGltdW0ubmV1cm9uJTIwaW1wb3J0JTIwTmV1cm9uVHJhaW5pbmdBcmd1bWVudHMlMkMlMjBOZXVyb25UcmFpbmVyJTBBZnJvbSUyMG9wdGltdW0ubmV1cm9uLm1vZGVscy50cmFpbmluZyUyMGltcG9ydCUyMExsYW1hRm9yQ2F1c2FsTE0lMEElMEElMjMlMjBFeGFtcGxlJTNBJTIwQ29tYmluZSUyMGFsbCUyMHBhcmFsbGVsaXNtJTIwc3RyYXRlZ2llcyUwQXRyYWluaW5nX2FyZ3MlMjAlM0QlMjBOZXVyb25UcmFpbmluZ0FyZ3VtZW50cyglMEElMjAlMjAlMjAlMjBvdXRwdXRfZGlyJTNEJTIyLiUyRm91dHB1dCUyMiUyQyUwQSUyMCUyMCUyMCUyMHBlcl9kZXZpY2VfdHJhaW5fYmF0Y2hfc2l6ZSUzRDMyJTJDJTBBJTIwJTIwJTIwJTIwYmYxNiUzRFRydWUlMkMlMEElMjAlMjAlMjAlMjBncmFkaWVudF9jaGVja3BvaW50aW5nJTNEVHJ1ZSUyQyUwQSUwQSUyMCUyMCUyMCUyMCUyMyUyMFplUk8tMSUwQSUyMCUyMCUyMCUyMHplcm9fMSUzRFRydWUlMkMlMEElMEElMjAlMjAlMjAlMjAlMjMlMjBUZW5zb3IlMjBwYXJhbGxlbGlzbSUwQSUyMCUyMCUyMCUyMHRlbnNvcl9wYXJhbGxlbF9zaXplJTNENCUyQyUwQSUyMCUyMCUyMCUyMGRpc2FibGVfc2VxdWVuY2VfcGFyYWxsZWwlM0RGYWxzZSUyQyUyMCUyMCUyMCUyMCUyMCUyMyUyMEVuYWJsZSUyMHNlcXVlbmNlJTIwcGFyYWxsZWxpc20lMEElMEElMjAlMjAlMjAlMjAlMjMlMjBQaXBlbGluZSUyMHBhcmFsbGVsaXNtJTBBJTIwJTIwJTIwJTIwcGlwZWxpbmVfcGFyYWxsZWxfc2l6ZSUzRDIlMkMlMEElMjAlMjAlMjAlMjBwaXBlbGluZV9wYXJhbGxlbF9udW1fbWljcm9iYXRjaGVzJTNEOCUyQyUwQSUwQSUyMCUyMCUyMCUyMCUyMyUyMEFkZGl0aW9uYWwlMjBvcHRpbWl6YXRpb25zJTBBJTIwJTIwJTIwJTIwZnVzZV9xa3YlM0RUcnVlJTJDJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIzJTIwRnVzZSUyMFFLViUyMHByb2plY3Rpb25zJTIwZm9yJTIwZWZmaWNpZW5jeSUwQSUyMCUyMCUyMCUyMGt2X3NpemVfbXVsdGlwbGllciUzRE5vbmUlMkMlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjMlMjBBdXRvLWNhbGN1bGF0ZSUyMG9wdGltYWwlMjBLViUyMG11bHRpcGxpZXIlMEEpJTBBJTBBJTIzJTIwTG9hZCUyMG1vZGVsJTIwdXNpbmclMjBjdXN0b20lMjBpbXBsZW1lbnRhdGlvbiUwQW1vZGVsJTIwJTNEJTIwTGxhbWFGb3JDYXVzYWxMTS5mcm9tX3ByZXRyYWluZWQoJTBBJTIwJTIwJTIwJTIwJTIybWV0YS1sbGFtYSUyRkxsYW1hLTMuMi0zQiUyMiUyQyUwQSUyMCUyMCUyMCUyMHRybl9jb25maWclM0R0cmFpbmluZ19hcmdzLnRybl9jb25maWclMEEpJTBBJTBBdHJhaW5lciUyMCUzRCUyME5ldXJvblRyYWluZXIoJTBBJTIwJTIwJTIwJTIwbW9kZWwlM0Rtb2RlbCUyQyUwQSUyMCUyMCUyMCUyMGFyZ3MlM0R0cmFpbmluZ19hcmdzJTJDJTBBJTIwJTIwJTIwJTIwdHJhaW5fZGF0YXNldCUzRHRyYWluX2RhdGFzZXQlMkMlMEElMjAlMjAlMjAlMjBldmFsX2RhdGFzZXQlM0RldmFsX2RhdGFzZXQlMkMlMEEpJTBBJTBBdHJhaW5lci50cmFpbigp",highlighted:`<span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronTrainingArguments, NeuronTrainer
<span class="hljs-keyword">from</span> optimum.neuron.models.training <span class="hljs-keyword">import</span> LlamaForCausalLM
<span class="hljs-comment"># Example: Combine all parallelism strategies</span>
training_args = NeuronTrainingArguments(
output_dir=<span class="hljs-string">&quot;./output&quot;</span>,
per_device_train_batch_size=<span class="hljs-number">32</span>,
bf16=<span class="hljs-literal">True</span>,
gradient_checkpointing=<span class="hljs-literal">True</span>,
<span class="hljs-comment"># ZeRO-1</span>
zero_1=<span class="hljs-literal">True</span>,
<span class="hljs-comment"># Tensor parallelism</span>
tensor_parallel_size=<span class="hljs-number">4</span>,
disable_sequence_parallel=<span class="hljs-literal">False</span>, <span class="hljs-comment"># Enable sequence parallelism</span>
<span class="hljs-comment"># Pipeline parallelism</span>
pipeline_parallel_size=<span class="hljs-number">2</span>,
pipeline_parallel_num_microbatches=<span class="hljs-number">8</span>,
<span class="hljs-comment"># Additional optimizations</span>
fuse_qkv=<span class="hljs-literal">True</span>, <span class="hljs-comment"># Fuse QKV projections for efficiency</span>
kv_size_multiplier=<span class="hljs-literal">None</span>, <span class="hljs-comment"># Auto-calculate optimal KV multiplier</span>
)
<span class="hljs-comment"># Load model using custom implementation</span>
model = LlamaForCausalLM.from_pretrained(
<span class="hljs-string">&quot;meta-llama/Llama-3.2-3B&quot;</span>,
trn_config=training_args.trn_config
)
trainer = NeuronTrainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
)
trainer.train()`,wrap:!1}}),Qe=new J({props:{title:"Checkpoint consolidation",local:"checkpoint-consolidation",headingTag:"h2"}}),xe=new B({props:{code:"b3B0aW11bS1jbGklMjBuZXVyb24lMjBjb25zb2xpZGF0ZSUyMC0taGVscCUwQSUwQXVzYWdlJTNBJTIwb3B0aW11bS1jbGklMjBuZXVyb24lMjBjb25zb2xpZGF0ZSUyMCU1Qi1oJTVEJTIwJTVCLWYlMjAlN0JweXRvcmNoJTJDc2FmZXRlbnNvcnMlN0QlNUQlMjBjaGVja3BvaW50X2RpciUyMG91dHB1dF9kaXIlMEElMEFwb3NpdGlvbmFsJTIwYXJndW1lbnRzJTNBJTBBJTIwJTIwY2hlY2twb2ludF9kaXIlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBUaGUlMjBwYXRoJTIwdG8lMjB0aGUlMjBkaXJlY3RvcnklMjBjb250YWluaW5nJTIwdGhlJTIwY2hlY2twb2ludHMuJTBBJTIwJTIwb3V0cHV0X2RpciUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMFRoZSUyMHBhdGglMjB0byUyMHRoZSUyMG91dHB1dCUyMGRpcmVjdG9yeSUyMGNvbnRhaW5pbmclMjB0aGUlMjBjb25zb2xpZGF0ZWQlMjBjaGVja3BvaW50LiUwQSUwQW9wdGlvbmFsJTIwYXJndW1lbnRzJTNBJTBBJTIwJTIwLWglMkMlMjAtLWhlbHAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzaG93JTIwdGhpcyUyMGhlbHAlMjBtZXNzYWdlJTIwYW5kJTIwZXhpdCUwQSUyMCUyMC1mJTIwJTdCcHl0b3JjaCUyQ3NhZmV0ZW5zb3JzJTdEJTJDJTIwLS1mb3JtYXQlMjAlN0JweXRvcmNoJTJDc2FmZXRlbnNvcnMlN0QlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBUaGUlMjBmb3JtYXQlMjB1c2VkJTIwdG8lMjBzYXZlJTIwdGhlJTIwY29uc29saWRhdGVkJTIwY2hlY2twb2ludC4=",highlighted:`optimum-cli neuron consolidate --<span class="hljs-built_in">help</span>
usage: optimum-cli neuron consolidate [-h] [-f {pytorch,safetensors}] checkpoint_dir output_dir
positional arguments:
checkpoint_dir The path to the directory containing the checkpoints.
output_dir The path to the output directory containing the consolidated checkpoint.
optional arguments:
-h, --<span class="hljs-built_in">help</span> show this <span class="hljs-built_in">help</span> message and <span class="hljs-built_in">exit</span>
-f {pytorch,safetensors}, --format {pytorch,safetensors}
The format used to save the consolidated checkpoint.`,wrap:!1}}),Oe=new B({props:{code:"bXlfdHJhaW5pbmclMkYlMEElRTIlOTQlOUMlRTIlOTQlODAlRTIlOTQlODAlMjBSRUFETUUubWQlMEElRTIlOTQlOUMlRTIlOTQlODAlRTIlOTQlODAlMjBhbGxfcmVzdWx0cy5qc29uJTBBJUUyJTk0JTlDJUUyJTk0JTgwJUUyJTk0JTgwJTIwY2hlY2twb2ludC0xMCUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMGNvbmZpZy5qc29uJTBBJUUyJTk0JTgyJTIwJTIwJTIwJUUyJTk0JTlDJUUyJTk0JTgwJUUyJTk0JTgwJTIwc2NoZWR1bGVyLnB0JTBBJUUyJTk0JTgyJTIwJTIwJTIwJUUyJTk0JTlDJUUyJTk0JTgwJUUyJTk0JTgwJTIwc3BlY2lhbF90b2tlbnNfbWFwLmpzb24lMEElRTIlOTQlODIlMjAlMjAlMjAlRTIlOTQlOUMlRTIlOTQlODAlRTIlOTQlODAlMjBzaGFyZHMlMkYlMEElRTIlOTQlODIlMjAlMjAlMjAlRTIlOTQlOUMlRTIlOTQlODAlRTIlOTQlODAlMjB0b2tlbml6ZXIuanNvbiUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRva2VuaXplci5tb2RlbCUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRva2VuaXplcl9jb25maWcuanNvbiUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRyYWluZXJfc3RhdGUuanNvbiUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5NCVFMiU5NCU4MCVFMiU5NCU4MCUyMHRyYWluaW5nX2FyZ3MuYmluJTBBJUUyJTk0JTlDJUUyJTk0JTgwJUUyJTk0JTgwJTIwY29uZmlnLmpzb24lMEElRTIlOTQlOUMlRTIlOTQlODAlRTIlOTQlODAlMjBzcGVjaWFsX3Rva2Vuc19tYXAuanNvbiUwQSVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHNoYXJkcyUyRiUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRwX3JhbmtfMDBfcHBfcmFua18wMCUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRwX3JhbmtfMDFfcHBfcmFua18wMCUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRwX3JhbmtfMDJfcHBfcmFua18wMCUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRwX3JhbmtfMDNfcHBfcmFua18wMCUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRwX3JhbmtfMDBfcHBfcmFua18wMSUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRwX3JhbmtfMDFfcHBfcmFua18wMSUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRwX3JhbmtfMDJfcHBfcmFua18wMSUwQSVFMiU5NCU4MiUyMCUyMCUyMCVFMiU5NCU5NCVFMiU5NCU4MCVFMiU5NCU4MCUyMHRwX3JhbmtfMDNfcHBfcmFua18wMSUwQSVFMiU5NCU5QyVFMiU5NCU4MCVFMiU5NCU4MCUyMHRva2VuaXplci5qc29uJTBBJUUyJTk0JTlDJUUyJTk0JTgwJUUyJTk0JTgwJTIwdG9rZW5pemVyLm1vZGVsJTBBJUUyJTk0JTlDJUUyJTk0JTgwJUUyJTk0JTgwJTIwdG9rZW5pemVyX2NvbmZpZy5qc29uJTBBJUUyJTk0JTlDJUUyJTk0JTgwJUUyJTk0JTgwJTIwdHJhaW5fcmVzdWx0cy5qc29uJTBBJUUyJTk0JTlDJUUyJTk0JTgwJUUyJTk0JTgwJTIwdHJhaW5lcl9zdGF0ZS5qc29uJTBBJUUyJTk0JTlDJUUyJTk0JTgwJUUyJTk0JTgwJTIwdHJhaW5pbmdfYXJncy5iaW4lMEElRTIlOTQlOTQlRTIlOTQlODAlRTIlOTQlODAlMjB0cm5fY29uZmlnLmpzb24=",highlighted:`my_training/
├── README.md
├── all_results.json
├── checkpoint-10
│ ├── config.json
│ ├── scheduler.pt
│ ├── special_tokens_map.json
│ ├── shards/
│ ├── tokenizer.json
│ ├── tokenizer.model
│ ├── tokenizer_config.json
│ ├── trainer_state.json
│ └── training_args.bin
├── config.json
├── special_tokens_map.json
├── shards/
│ ├── tp_rank_00_pp_rank_00
│ ├── tp_rank_01_pp_rank_00
│ ├── tp_rank_02_pp_rank_00
│ ├── tp_rank_03_pp_rank_00
│ ├── tp_rank_00_pp_rank_01
│ ├── tp_rank_01_pp_rank_01
│ ├── tp_rank_02_pp_rank_01
│ └── tp_rank_03_pp_rank_01
├── tokenizer.json
├── tokenizer.model
├── tokenizer_config.json
├── train_results.json
├── trainer_state.json
├── training_args.bin
└── trn_config.json`,wrap:!1}}),qe=new B({props:{code:"b3B0aW11bS1jbGklMjBuZXVyb24lMjBjb25zb2xpZGF0ZSUyMG15X3RyYWluaW5nJTIwbXlfdHJhaW5pbmdfY29uc29saWRhdGVkX2NoZWNrcG9pbnQ=",highlighted:"optimum-cli neuron consolidate my_training my_training_consolidated_checkpoint",wrap:!1}}),Z=new ol({props:{$$slots:{default:[oi]},$$scope:{ctx:$}}}),Ke=new J({props:{title:"Best Practices",local:"best-practices",headingTag:"h2"}}),el=new J({props:{title:"Choosing Parallelism Strategy",local:"choosing-parallelism-strategy",headingTag:"h3"}}),tl=new J({props:{title:"Memory Optimization",local:"memory-optimization",headingTag:"h3"}}),il=new J({props:{title:"Troubleshooting",local:"troubleshooting",headingTag:"h2"}}),al=new J({props:{title:"Common Issues",local:"common-issues",headingTag:"h3"}}),rl=new J({props:{title:"Debugging Tips",local:"debugging-tips",headingTag:"h3"}}),{c(){m=s("meta"),b=i(),d=s("p"),w=i(),p(g.$$.fragment),f=i(),p(h.$$.fragment),C=i(),o=s("p"),o.innerHTML=j,yl=i(),V=s("p"),V.textContent=Pt,cl=i(),G=s("p"),G.innerHTML=Ot,Ul=i(),p(F.$$.fragment),Tl=i(),p(k.$$.fragment),ul=i(),R=s("p"),R.innerHTML=Dt,dl=i(),X=s("p"),X.innerHTML=qt,wl=i(),N=s("p"),N.innerHTML=Kt,Jl=i(),H=s("p"),H.innerHTML=en,hl=i(),p(A.$$.fragment),bl=i(),E=s("p"),E.innerHTML=ln,fl=i(),Q=s("p"),Q.innerHTML=tn,Cl=i(),z=s("p"),z.innerHTML=nn,jl=i(),Y=s("p"),Y.innerHTML=an,gl=i(),x=s("p"),x.innerHTML=sn,$l=i(),S=s("p"),S.innerHTML=rn,Bl=i(),p(L.$$.fragment),Il=i(),P=s("p"),P.innerHTML=Mn,vl=i(),O=s("p"),O.innerHTML=mn,_l=i(),D=s("p"),D.innerHTML=on,Wl=i(),q=s("p"),q.innerHTML=pn,Zl=i(),K=s("p"),K.innerHTML=yn,Vl=i(),p(ee.$$.fragment),Gl=i(),le=s("p"),le.innerHTML=cn,Fl=i(),te=s("p"),te.innerHTML=Un,kl=i(),ne=s("p"),ne.innerHTML=Tn,Rl=i(),ie=s("p"),ie.innerHTML=un,Xl=i(),ae=s("p"),ae.innerHTML=dn,Nl=i(),se=s("p"),se.innerHTML=wn,Hl=i(),re=s("p"),re.innerHTML=Jn,Al=i(),p(I.$$.fragment),El=i(),p(Me.$$.fragment),Ql=i(),me=s("p"),me.innerHTML=hn,zl=i(),p(oe.$$.fragment),Yl=i(),p(pe.$$.fragment),xl=i(),p(v.$$.fragment),Sl=i(),p(ye.$$.fragment),Ll=i(),ce=s("p"),ce.innerHTML=bn,Pl=i(),p(Ue.$$.fragment),Ol=i(),p(Te.$$.fragment),Dl=i(),ue=s("p"),ue.innerHTML=fn,ql=i(),de=s("p"),de.innerHTML=Cn,Kl=i(),we=s("p"),we.textContent=jn,et=i(),Je=s("ol"),Je.innerHTML=gn,lt=i(),he=s("p"),he.innerHTML=$n,tt=i(),p(be.$$.fragment),nt=i(),p(fe.$$.fragment),it=i(),p(_.$$.fragment),at=i(),p(Ce.$$.fragment),st=i(),je=s("p"),je.innerHTML=Bn,rt=i(),p(ge.$$.fragment),Mt=i(),p($e.$$.fragment),mt=i(),Be=s("p"),Be.textContent=In,ot=i(),Ie=s("p"),Ie.innerHTML=vn,pt=i(),p(ve.$$.fragment),yt=i(),_e=s("p"),_e.textContent=_n,ct=i(),We=s("ul"),We.innerHTML=Wn,Ut=i(),p(Ze.$$.fragment),Tt=i(),p(Ve.$$.fragment),ut=i(),p(Ge.$$.fragment),dt=i(),p(Fe.$$.fragment),wt=i(),p(W.$$.fragment),Jt=i(),p(ke.$$.fragment),ht=i(),Re=s("p"),Re.textContent=Zn,bt=i(),p(Xe.$$.fragment),ft=i(),p(Ne.$$.fragment),Ct=i(),He=s("p"),He.textContent=Vn,jt=i(),Ae=s("ul"),Ae.innerHTML=Gn,gt=i(),Ee=s("p"),Ee.innerHTML=Fn,$t=i(),p(Qe.$$.fragment),Bt=i(),ze=s("p"),ze.textContent=kn,It=i(),Ye=s("p"),Ye.innerHTML=Rn,vt=i(),p(xe.$$.fragment),_t=i(),Se=s("p"),Se.innerHTML=Xn,Wt=i(),Le=s("p"),Le.textContent=Nn,Zt=i(),Pe=s("p"),Pe.innerHTML=Hn,Vt=i(),p(Oe.$$.fragment),Gt=i(),De=s("p"),De.innerHTML=An,Ft=i(),p(qe.$$.fragment),kt=i(),p(Z.$$.fragment),Rt=i(),p(Ke.$$.fragment),Xt=i(),p(el.$$.fragment),Nt=i(),ll=s("ol"),ll.innerHTML=En,Ht=i(),p(tl.$$.fragment),At=i(),nl=s("ul"),nl.innerHTML=Qn,Et=i(),p(il.$$.fragment),Qt=i(),p(al.$$.fragment),zt=i(),sl=s("ol"),sl.innerHTML=zn,Yt=i(),p(rl.$$.fragment),xt=i(),Ml=s("ul"),Ml.innerHTML=Yn,St=i(),ml=s("p"),this.h()},l(e){const l=ni("svelte-u9bgzb",document.head);m=r(l,"META",{name:!0,content:!0}),l.forEach(t),b=a(e),d=r(e,"P",{}),Dn(d).forEach(t),w=a(e),y(g.$$.fragment,e),f=a(e),y(h.$$.fragment,e),C=a(e),o=r(e,"P",{"data-svelte-h":!0}),M(o)!=="svelte-1hnco7m"&&(o.innerHTML=j),yl=a(e),V=r(e,"P",{"data-svelte-h":!0}),M(V)!=="svelte-d2kadp"&&(V.textContent=Pt),cl=a(e),G=r(e,"P",{"data-svelte-h":!0}),M(G)!=="svelte-1ndq4u8"&&(G.innerHTML=Ot),Ul=a(e),y(F.$$.fragment,e),Tl=a(e),y(k.$$.fragment,e),ul=a(e),R=r(e,"P",{"data-svelte-h":!0}),M(R)!=="svelte-1mjmifd"&&(R.innerHTML=Dt),dl=a(e),X=r(e,"P",{"data-svelte-h":!0}),M(X)!=="svelte-1f0jrrz"&&(X.innerHTML=qt),wl=a(e),N=r(e,"P",{"data-svelte-h":!0}),M(N)!=="svelte-18396cf"&&(N.innerHTML=Kt),Jl=a(e),H=r(e,"P",{"data-svelte-h":!0}),M(H)!=="svelte-190zbi2"&&(H.innerHTML=en),hl=a(e),y(A.$$.fragment,e),bl=a(e),E=r(e,"P",{"data-svelte-h":!0}),M(E)!=="svelte-1r1vc5"&&(E.innerHTML=ln),fl=a(e),Q=r(e,"P",{"data-svelte-h":!0}),M(Q)!=="svelte-wsb08h"&&(Q.innerHTML=tn),Cl=a(e),z=r(e,"P",{"data-svelte-h":!0}),M(z)!=="svelte-1rz7p8m"&&(z.innerHTML=nn),jl=a(e),Y=r(e,"P",{"data-svelte-h":!0}),M(Y)!=="svelte-kpqjki"&&(Y.innerHTML=an),gl=a(e),x=r(e,"P",{"data-svelte-h":!0}),M(x)!=="svelte-fc78y5"&&(x.innerHTML=sn),$l=a(e),S=r(e,"P",{"data-svelte-h":!0}),M(S)!=="svelte-otaiec"&&(S.innerHTML=rn),Bl=a(e),y(L.$$.fragment,e),Il=a(e),P=r(e,"P",{"data-svelte-h":!0}),M(P)!=="svelte-4fh0sm"&&(P.innerHTML=Mn),vl=a(e),O=r(e,"P",{"data-svelte-h":!0}),M(O)!=="svelte-1cma7oi"&&(O.innerHTML=mn),_l=a(e),D=r(e,"P",{"data-svelte-h":!0}),M(D)!=="svelte-hrmmz0"&&(D.innerHTML=on),Wl=a(e),q=r(e,"P",{"data-svelte-h":!0}),M(q)!=="svelte-1obmcw0"&&(q.innerHTML=pn),Zl=a(e),K=r(e,"P",{"data-svelte-h":!0}),M(K)!=="svelte-41lgu1"&&(K.innerHTML=yn),Vl=a(e),y(ee.$$.fragment,e),Gl=a(e),le=r(e,"P",{"data-svelte-h":!0}),M(le)!=="svelte-1lhkl2x"&&(le.innerHTML=cn),Fl=a(e),te=r(e,"P",{"data-svelte-h":!0}),M(te)!=="svelte-a3vdn"&&(te.innerHTML=Un),kl=a(e),ne=r(e,"P",{"data-svelte-h":!0}),M(ne)!=="svelte-1jq8cu1"&&(ne.innerHTML=Tn),Rl=a(e),ie=r(e,"P",{"data-svelte-h":!0}),M(ie)!=="svelte-1vm5hdt"&&(ie.innerHTML=un),Xl=a(e),ae=r(e,"P",{"data-svelte-h":!0}),M(ae)!=="svelte-4tw4cj"&&(ae.innerHTML=dn),Nl=a(e),se=r(e,"P",{"data-svelte-h":!0}),M(se)!=="svelte-l89uy8"&&(se.innerHTML=wn),Hl=a(e),re=r(e,"P",{"data-svelte-h":!0}),M(re)!=="svelte-1p0ihsg"&&(re.innerHTML=Jn),Al=a(e),y(I.$$.fragment,e),El=a(e),y(Me.$$.fragment,e),Ql=a(e),me=r(e,"P",{"data-svelte-h":!0}),M(me)!=="svelte-1xpk0lv"&&(me.innerHTML=hn),zl=a(e),y(oe.$$.fragment,e),Yl=a(e),y(pe.$$.fragment,e),xl=a(e),y(v.$$.fragment,e),Sl=a(e),y(ye.$$.fragment,e),Ll=a(e),ce=r(e,"P",{"data-svelte-h":!0}),M(ce)!=="svelte-106kvj9"&&(ce.innerHTML=bn),Pl=a(e),y(Ue.$$.fragment,e),Ol=a(e),y(Te.$$.fragment,e),Dl=a(e),ue=r(e,"P",{"data-svelte-h":!0}),M(ue)!=="svelte-1r4hhew"&&(ue.innerHTML=fn),ql=a(e),de=r(e,"P",{"data-svelte-h":!0}),M(de)!=="svelte-v1qtdm"&&(de.innerHTML=Cn),Kl=a(e),we=r(e,"P",{"data-svelte-h":!0}),M(we)!=="svelte-n127re"&&(we.textContent=jn),et=a(e),Je=r(e,"OL",{"data-svelte-h":!0}),M(Je)!=="svelte-1hoskl8"&&(Je.innerHTML=gn),lt=a(e),he=r(e,"P",{"data-svelte-h":!0}),M(he)!=="svelte-11wpmlp"&&(he.innerHTML=$n),tt=a(e),y(be.$$.fragment,e),nt=a(e),y(fe.$$.fragment,e),it=a(e),y(_.$$.fragment,e),at=a(e),y(Ce.$$.fragment,e),st=a(e),je=r(e,"P",{"data-svelte-h":!0}),M(je)!=="svelte-1ncu8vs"&&(je.innerHTML=Bn),rt=a(e),y(ge.$$.fragment,e),Mt=a(e),y($e.$$.fragment,e),mt=a(e),Be=r(e,"P",{"data-svelte-h":!0}),M(Be)!=="svelte-1vp0c4m"&&(Be.textContent=In),ot=a(e),Ie=r(e,"P",{"data-svelte-h":!0}),M(Ie)!=="svelte-1ytrjb2"&&(Ie.innerHTML=vn),pt=a(e),y(ve.$$.fragment,e),yt=a(e),_e=r(e,"P",{"data-svelte-h":!0}),M(_e)!=="svelte-wwttlo"&&(_e.textContent=_n),ct=a(e),We=r(e,"UL",{"data-svelte-h":!0}),M(We)!=="svelte-9fwfrb"&&(We.innerHTML=Wn),Ut=a(e),y(Ze.$$.fragment,e),Tt=a(e),y(Ve.$$.fragment,e),ut=a(e),y(Ge.$$.fragment,e),dt=a(e),y(Fe.$$.fragment,e),wt=a(e),y(W.$$.fragment,e),Jt=a(e),y(ke.$$.fragment,e),ht=a(e),Re=r(e,"P",{"data-svelte-h":!0}),M(Re)!=="svelte-ktf0yf"&&(Re.textContent=Zn),bt=a(e),y(Xe.$$.fragment,e),ft=a(e),y(Ne.$$.fragment,e),Ct=a(e),He=r(e,"P",{"data-svelte-h":!0}),M(He)!=="svelte-e9hr70"&&(He.textContent=Vn),jt=a(e),Ae=r(e,"UL",{"data-svelte-h":!0}),M(Ae)!=="svelte-138kta0"&&(Ae.innerHTML=Gn),gt=a(e),Ee=r(e,"P",{"data-svelte-h":!0}),M(Ee)!=="svelte-1oatqej"&&(Ee.innerHTML=Fn),$t=a(e),y(Qe.$$.fragment,e),Bt=a(e),ze=r(e,"P",{"data-svelte-h":!0}),M(ze)!=="svelte-10z9rkn"&&(ze.textContent=kn),It=a(e),Ye=r(e,"P",{"data-svelte-h":!0}),M(Ye)!=="svelte-24042q"&&(Ye.innerHTML=Rn),vt=a(e),y(xe.$$.fragment,e),_t=a(e),Se=r(e,"P",{"data-svelte-h":!0}),M(Se)!=="svelte-7l8i2j"&&(Se.innerHTML=Xn),Wt=a(e),Le=r(e,"P",{"data-svelte-h":!0}),M(Le)!=="svelte-11lpom8"&&(Le.textContent=Nn),Zt=a(e),Pe=r(e,"P",{"data-svelte-h":!0}),M(Pe)!=="svelte-15ioqnc"&&(Pe.innerHTML=Hn),Vt=a(e),y(Oe.$$.fragment,e),Gt=a(e),De=r(e,"P",{"data-svelte-h":!0}),M(De)!=="svelte-1e8fv74"&&(De.innerHTML=An),Ft=a(e),y(qe.$$.fragment,e),kt=a(e),y(Z.$$.fragment,e),Rt=a(e),y(Ke.$$.fragment,e),Xt=a(e),y(el.$$.fragment,e),Nt=a(e),ll=r(e,"OL",{"data-svelte-h":!0}),M(ll)!=="svelte-1j56tw2"&&(ll.innerHTML=En),Ht=a(e),y(tl.$$.fragment,e),At=a(e),nl=r(e,"UL",{"data-svelte-h":!0}),M(nl)!=="svelte-pjt7c2"&&(nl.innerHTML=Qn),Et=a(e),y(il.$$.fragment,e),Qt=a(e),y(al.$$.fragment,e),zt=a(e),sl=r(e,"OL",{"data-svelte-h":!0}),M(sl)!=="svelte-j5qw5u"&&(sl.innerHTML=zn),Yt=a(e),y(rl.$$.fragment,e),xt=a(e),Ml=r(e,"UL",{"data-svelte-h":!0}),M(Ml)!=="svelte-jjqy14"&&(Ml.innerHTML=Yn),St=a(e),ml=r(e,"P",{}),Dn(ml).forEach(t),this.h()},h(){qn(m,"name","hf:doc:metadata"),qn(m,"content",yi)},m(e,l){ii(document.head,m),n(e,b,l),n(e,d,l),n(e,w,l),c(g,e,l),n(e,f,l),c(h,e,l),n(e,C,l),n(e,o,l),n(e,yl,l),n(e,V,l),n(e,cl,l),n(e,G,l),n(e,Ul,l),c(F,e,l),n(e,Tl,l),c(k,e,l),n(e,ul,l),n(e,R,l),n(e,dl,l),n(e,X,l),n(e,wl,l),n(e,N,l),n(e,Jl,l),n(e,H,l),n(e,hl,l),c(A,e,l),n(e,bl,l),n(e,E,l),n(e,fl,l),n(e,Q,l),n(e,Cl,l),n(e,z,l),n(e,jl,l),n(e,Y,l),n(e,gl,l),n(e,x,l),n(e,$l,l),n(e,S,l),n(e,Bl,l),c(L,e,l),n(e,Il,l),n(e,P,l),n(e,vl,l),n(e,O,l),n(e,_l,l),n(e,D,l),n(e,Wl,l),n(e,q,l),n(e,Zl,l),n(e,K,l),n(e,Vl,l),c(ee,e,l),n(e,Gl,l),n(e,le,l),n(e,Fl,l),n(e,te,l),n(e,kl,l),n(e,ne,l),n(e,Rl,l),n(e,ie,l),n(e,Xl,l),n(e,ae,l),n(e,Nl,l),n(e,se,l),n(e,Hl,l),n(e,re,l),n(e,Al,l),c(I,e,l),n(e,El,l),c(Me,e,l),n(e,Ql,l),n(e,me,l),n(e,zl,l),c(oe,e,l),n(e,Yl,l),c(pe,e,l),n(e,xl,l),c(v,e,l),n(e,Sl,l),c(ye,e,l),n(e,Ll,l),n(e,ce,l),n(e,Pl,l),c(Ue,e,l),n(e,Ol,l),c(Te,e,l),n(e,Dl,l),n(e,ue,l),n(e,ql,l),n(e,de,l),n(e,Kl,l),n(e,we,l),n(e,et,l),n(e,Je,l),n(e,lt,l),n(e,he,l),n(e,tt,l),c(be,e,l),n(e,nt,l),c(fe,e,l),n(e,it,l),c(_,e,l),n(e,at,l),c(Ce,e,l),n(e,st,l),n(e,je,l),n(e,rt,l),c(ge,e,l),n(e,Mt,l),c($e,e,l),n(e,mt,l),n(e,Be,l),n(e,ot,l),n(e,Ie,l),n(e,pt,l),c(ve,e,l),n(e,yt,l),n(e,_e,l),n(e,ct,l),n(e,We,l),n(e,Ut,l),c(Ze,e,l),n(e,Tt,l),c(Ve,e,l),n(e,ut,l),c(Ge,e,l),n(e,dt,l),c(Fe,e,l),n(e,wt,l),c(W,e,l),n(e,Jt,l),c(ke,e,l),n(e,ht,l),n(e,Re,l),n(e,bt,l),c(Xe,e,l),n(e,ft,l),c(Ne,e,l),n(e,Ct,l),n(e,He,l),n(e,jt,l),n(e,Ae,l),n(e,gt,l),n(e,Ee,l),n(e,$t,l),c(Qe,e,l),n(e,Bt,l),n(e,ze,l),n(e,It,l),n(e,Ye,l),n(e,vt,l),c(xe,e,l),n(e,_t,l),n(e,Se,l),n(e,Wt,l),n(e,Le,l),n(e,Zt,l),n(e,Pe,l),n(e,Vt,l),c(Oe,e,l),n(e,Gt,l),n(e,De,l),n(e,Ft,l),c(qe,e,l),n(e,kt,l),c(Z,e,l),n(e,Rt,l),c(Ke,e,l),n(e,Xt,l),c(el,e,l),n(e,Nt,l),n(e,ll,l),n(e,Ht,l),c(tl,e,l),n(e,At,l),n(e,nl,l),n(e,Et,l),c(il,e,l),n(e,Qt,l),c(al,e,l),n(e,zt,l),n(e,sl,l),n(e,Yt,l),c(rl,e,l),n(e,xt,l),n(e,Ml,l),n(e,St,l),n(e,ml,l),Lt=!0},p(e,[l]){const xn={};l&2&&(xn.$$scope={dirty:l,ctx:e}),I.$set(xn);const Sn={};l&2&&(Sn.$$scope={dirty:l,ctx:e}),v.$set(Sn);const Ln={};l&2&&(Ln.$$scope={dirty:l,ctx:e}),_.$set(Ln);const Pn={};l&2&&(Pn.$$scope={dirty:l,ctx:e}),W.$set(Pn);const On={};l&2&&(On.$$scope={dirty:l,ctx:e}),Z.$set(On)},i(e){Lt||(U(g.$$.fragment,e),U(h.$$.fragment,e),U(F.$$.fragment,e),U(k.$$.fragment,e),U(A.$$.fragment,e),U(L.$$.fragment,e),U(ee.$$.fragment,e),U(I.$$.fragment,e),U(Me.$$.fragment,e),U(oe.$$.fragment,e),U(pe.$$.fragment,e),U(v.$$.fragment,e),U(ye.$$.fragment,e),U(Ue.$$.fragment,e),U(Te.$$.fragment,e),U(be.$$.fragment,e),U(fe.$$.fragment,e),U(_.$$.fragment,e),U(Ce.$$.fragment,e),U(ge.$$.fragment,e),U($e.$$.fragment,e),U(ve.$$.fragment,e),U(Ze.$$.fragment,e),U(Ve.$$.fragment,e),U(Ge.$$.fragment,e),U(Fe.$$.fragment,e),U(W.$$.fragment,e),U(ke.$$.fragment,e),U(Xe.$$.fragment,e),U(Ne.$$.fragment,e),U(Qe.$$.fragment,e),U(xe.$$.fragment,e),U(Oe.$$.fragment,e),U(qe.$$.fragment,e),U(Z.$$.fragment,e),U(Ke.$$.fragment,e),U(el.$$.fragment,e),U(tl.$$.fragment,e),U(il.$$.fragment,e),U(al.$$.fragment,e),U(rl.$$.fragment,e),Lt=!0)},o(e){T(g.$$.fragment,e),T(h.$$.fragment,e),T(F.$$.fragment,e),T(k.$$.fragment,e),T(A.$$.fragment,e),T(L.$$.fragment,e),T(ee.$$.fragment,e),T(I.$$.fragment,e),T(Me.$$.fragment,e),T(oe.$$.fragment,e),T(pe.$$.fragment,e),T(v.$$.fragment,e),T(ye.$$.fragment,e),T(Ue.$$.fragment,e),T(Te.$$.fragment,e),T(be.$$.fragment,e),T(fe.$$.fragment,e),T(_.$$.fragment,e),T(Ce.$$.fragment,e),T(ge.$$.fragment,e),T($e.$$.fragment,e),T(ve.$$.fragment,e),T(Ze.$$.fragment,e),T(Ve.$$.fragment,e),T(Ge.$$.fragment,e),T(Fe.$$.fragment,e),T(W.$$.fragment,e),T(ke.$$.fragment,e),T(Xe.$$.fragment,e),T(Ne.$$.fragment,e),T(Qe.$$.fragment,e),T(xe.$$.fragment,e),T(Oe.$$.fragment,e),T(qe.$$.fragment,e),T(Z.$$.fragment,e),T(Ke.$$.fragment,e),T(el.$$.fragment,e),T(tl.$$.fragment,e),T(il.$$.fragment,e),T(al.$$.fragment,e),T(rl.$$.fragment,e),Lt=!1},d(e){e&&(t(b),t(d),t(w),t(f),t(C),t(o),t(yl),t(V),t(cl),t(G),t(Ul),t(Tl),t(ul),t(R),t(dl),t(X),t(wl),t(N),t(Jl),t(H),t(hl),t(bl),t(E),t(fl),t(Q),t(Cl),t(z),t(jl),t(Y),t(gl),t(x),t($l),t(S),t(Bl),t(Il),t(P),t(vl),t(O),t(_l),t(D),t(Wl),t(q),t(Zl),t(K),t(Vl),t(Gl),t(le),t(Fl),t(te),t(kl),t(ne),t(Rl),t(ie),t(Xl),t(ae),t(Nl),t(se),t(Hl),t(re),t(Al),t(El),t(Ql),t(me),t(zl),t(Yl),t(xl),t(Sl),t(Ll),t(ce),t(Pl),t(Ol),t(Dl),t(ue),t(ql),t(de),t(Kl),t(we),t(et),t(Je),t(lt),t(he),t(tt),t(nt),t(it),t(at),t(st),t(je),t(rt),t(Mt),t(mt),t(Be),t(ot),t(Ie),t(pt),t(yt),t(_e),t(ct),t(We),t(Ut),t(Tt),t(ut),t(dt),t(wt),t(Jt),t(ht),t(Re),t(bt),t(ft),t(Ct),t(He),t(jt),t(Ae),t(gt),t(Ee),t($t),t(Bt),t(ze),t(It),t(Ye),t(vt),t(_t),t(Se),t(Wt),t(Le),t(Zt),t(Pe),t(Vt),t(Gt),t(De),t(Ft),t(kt),t(Rt),t(Xt),t(Nt),t(ll),t(Ht),t(At),t(nl),t(Et),t(Qt),t(zt),t(sl),t(Yt),t(xt),t(Ml),t(St),t(ml)),t(m),u(g,e),u(h,e),u(F,e),u(k,e),u(A,e),u(L,e),u(ee,e),u(I,e),u(Me,e),u(oe,e),u(pe,e),u(v,e),u(ye,e),u(Ue,e),u(Te,e),u(be,e),u(fe,e),u(_,e),u(Ce,e),u(ge,e),u($e,e),u(ve,e),u(Ze,e),u(Ve,e),u(Ge,e),u(Fe,e),u(W,e),u(ke,e),u(Xe,e),u(Ne,e),u(Qe,e),u(xe,e),u(Oe,e),u(qe,e),u(Z,e),u(Ke,e),u(el,e),u(tl,e),u(il,e),u(al,e),u(rl,e)}}}const yi='{"title":"Distributed Training with optimum-neuron","local":"distributed-training-with-optimum-neuron","sections":[{"title":"Parallelism Strategies Overview","local":"parallelism-strategies-overview","sections":[{"title":"1. ZeRO-1 (Optimizer State Sharding)","local":"1-zero-1-optimizer-state-sharding","sections":[],"depth":3},{"title":"2. Tensor Parallelism (Intra-layer Model Parallelism)","local":"2-tensor-parallelism-intra-layer-model-parallelism","sections":[],"depth":3},{"title":"3. Sequence Parallelism (Activation Sharding)","local":"3-sequence-parallelism-activation-sharding","sections":[],"depth":3},{"title":"4. Pipeline Parallelism (Inter-layer Model Parallelism)","local":"4-pipeline-parallelism-inter-layer-model-parallelism","sections":[],"depth":3}],"depth":2},{"title":"How to enable ZeRO-1?","local":"how-to-enable-zero-1","sections":[{"title":"Via the NeuronTrainer","local":"via-the-neurontrainer","sections":[],"depth":3},{"title":"Via the NeuronAccelerator","local":"via-the-neuronaccelerator","sections":[],"depth":3}],"depth":2},{"title":"How to enable Tensor Parallelism?","local":"how-to-enable-tensor-parallelism","sections":[{"title":"Via the NeuronTrainer","local":"via-the-neurontrainer","sections":[],"depth":3},{"title":"Via the NeuronAccelerator","local":"via-the-neuronaccelerator","sections":[],"depth":3}],"depth":2},{"title":"How to enable Pipeline Parallelism?","local":"how-to-enable-pipeline-parallelism","sections":[{"title":"Configuration Options","local":"configuration-options","sections":[],"depth":3},{"title":"Via the NeuronTrainer","local":"via-the-neurontrainer","sections":[],"depth":3},{"title":"Via the NeuronAccelerator","local":"via-the-neuronaccelerator","sections":[],"depth":3}],"depth":2},{"title":"Combining Parallelism Strategies","local":"combining-parallelism-strategies","sections":[{"title":"Via the NeuronTrainer","local":"via-the-neurontrainer","sections":[],"depth":3}],"depth":2},{"title":"Checkpoint consolidation","local":"checkpoint-consolidation","sections":[],"depth":2},{"title":"Best Practices","local":"best-practices","sections":[{"title":"Choosing Parallelism Strategy","local":"choosing-parallelism-strategy","sections":[],"depth":3},{"title":"Memory Optimization","local":"memory-optimization","sections":[],"depth":3}],"depth":2},{"title":"Troubleshooting","local":"troubleshooting","sections":[{"title":"Common Issues","local":"common-issues","sections":[],"depth":3},{"title":"Debugging Tips","local":"debugging-tips","sections":[],"depth":3}],"depth":2}],"depth":1}';function ci($){return ei(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class hi extends li{constructor(m){super(),ti(this,m,ci,pi,Kn,{})}}export{hi as component};

Xet Storage Details

Size:
58.9 kB
·
Xet hash:
91b8bb84fa9cc41918d17f0dceea5826d876d95a9f55598b5afd78a2a2d3410e

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.