Buckets:

hf-doc-build
/

doc

Files

xet

hf-doc-build/doc / optimum-neuron /v0.4.4 /en /training_tutorials /finetune_qwen3.html

rtrm

15 days ago

download

raw

42.9 kB

	<meta charset="utf-8" /><meta name="hf:doc:metadata" content='{"title":"🚀 Fine-Tune Qwen3 8B with LoRA","local":"-fine-tune-qwen3-8b-with-lora","sections":[{"title":"1. 🛠️ Setup AWS Environment","local":"1--setup-aws-environment","sections":[],"depth":2},{"title":"2. 📊 Load and Prepare the Dataset","local":"2--load-and-prepare-the-dataset","sections":[],"depth":2},{"title":"3. 🎯 Fine-tune Qwen3 with NeuronSFTTrainer and PEFT","local":"3--fine-tune-qwen3-with-neuronsfttrainer-and-peft","sections":[],"depth":2},{"title":"4. 🔄 Consolidate and Test the Fine-Tuned Model","local":"4--consolidate-and-test-the-fine-tuned-model","sections":[],"depth":2},{"title":"5. 🤗 Push to Hugging Face Hub","local":"5--push-to-hugging-face-hub","sections":[],"depth":2}],"depth":1}'>
	<link href="/docs/optimum.neuron/v0.4.4/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.4.4/en/_app/immutable/entry/start.40e9a376.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.4.4/en/_app/immutable/chunks/scheduler.56725da7.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.4.4/en/_app/immutable/chunks/singletons.803151f8.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.4.4/en/_app/immutable/chunks/paths.316b9bfe.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.4.4/en/_app/immutable/entry/app.0fb5ce66.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.4.4/en/_app/immutable/chunks/preload-helper.b93cc304.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.4.4/en/_app/immutable/chunks/index.18a26576.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.4.4/en/_app/immutable/nodes/0.3130ff89.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.4.4/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.4.4/en/_app/immutable/nodes/48.ccb23d56.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.4.4/en/_app/immutable/chunks/CopyLLMTxtMenu.4513c8ed.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.4.4/en/_app/immutable/chunks/globals.7f7f1b26.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.4.4/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.049405bf.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/v0.4.4/en/_app/immutable/chunks/CodeBlock.58e3e98b.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content='{"title":"🚀 Fine-Tune Qwen3 8B with LoRA","local":"-fine-tune-qwen3-8b-with-lora","sections":[{"title":"1. 🛠️ Setup AWS Environment","local":"1--setup-aws-environment","sections":[],"depth":2},{"title":"2. 📊 Load and Prepare the Dataset","local":"2--load-and-prepare-the-dataset","sections":[],"depth":2},{"title":"3. 🎯 Fine-tune Qwen3 with NeuronSFTTrainer and PEFT","local":"3--fine-tune-qwen3-with-neuronsfttrainer-and-peft","sections":[],"depth":2},{"title":"4. 🔄 Consolidate and Test the Fine-Tuned Model","local":"4--consolidate-and-test-the-fine-tuned-model","sections":[],"depth":2},{"title":"5. 🤗 Push to Hugging Face Hub","local":"5--push-to-hugging-face-hub","sections":[],"depth":2}],"depth":1}'><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 max-sm:gap-0.5 h-6 max-sm:h-5 px-2 max-sm:px-1.5 text-[11px] max-sm:text-[9px] font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0"><svg class="w-3 h-3 max-sm:w-2.5 max-sm:h-2.5" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path 
d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-6 max-sm:h-5 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible w-3 h-3 max-sm:w-2.5 max-sm:h-2.5 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="-fine-tune-qwen3-8b-with-lora" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-fine-tune-qwen3-8b-with-lora"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>🚀 Fine-Tune Qwen3 8B with LoRA</span></h1> <p 
data-svelte-h="svelte-61i5q2">This tutorial shows how to fine-tune the Qwen3 model on AWS Trainium accelerators using optimum-neuron.</p> <p data-svelte-h="svelte-1vtxnz9"><strong>This is based on the <a href="https://github.com/huggingface/optimum-neuron/tree/main/examples/training/qwen3" rel="nofollow">Qwen3 fine-tuning example script</a>.</strong></p> <h2 class="relative group"><a id="1--setup-aws-environment" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#1--setup-aws-environment"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1. 
🛠️ Setup AWS Environment</span></h2> <p data-svelte-h="svelte-43r1bv">We’ll use a <code>trn1.32xlarge</code> instance with 16 Trainium Accelerators (32 Neuron Cores) and the Hugging Face Neuron Deep Learning AMI.</p> <p data-svelte-h="svelte-1ktungo">The Hugging Face AMI includes all required libraries pre-installed:</p> <ul data-svelte-h="svelte-1efvabb"><li><code>datasets</code>, <code>transformers</code>, <code>optimum-neuron</code></li> <li>Neuron SDK packages</li> <li>No additional environment setup needed</li></ul> <p data-svelte-h="svelte-1gchww4">To create your instance, follow the guide <a href="https://huggingface.co/docs/optimum-neuron/ec2-setup" rel="nofollow">here</a>.</p> <h2 class="relative group"><a id="2--load-and-prepare-the-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#2--load-and-prepare-the-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>2. 
📊 Load and Prepare the Dataset</span></h2> <p data-svelte-h="svelte-1uki1hp">We’ll use the <a href="https://huggingface.co/datasets/tengomucho/simple_recipes" rel="nofollow">simple recipes dataset</a> to fine-tune our model for recipe generation.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{
	<span class="hljs-string">'recipes'</span>: <span class="hljs-comment">"- Preheat oven to 350 degrees\n- Butter two 9x5' loaf pans\n- Cream the sugar and the butter until light and whipped\n- Add the bananas, eggs, lemon juice, orange rind\n- Beat until blended uniformly\n- Be patient, and beat until the banana lumps are gone\n- Sift the dry ingredients together\n- Fold lightly and thoroughly into the banana mixture\n- Pour the batter into prepared loaf pans\n- Bake for 45 to 55 minutes, until the loaves are firm in the middle and the edges begin to pull away from the pans\n- Cool the loaves on racks for 30 minutes before removing from the pans\n- Freezes well"</span>,
	<span class="hljs-string">'names'</span>: <span class="hljs-string">'Beat this banana bread'</span>
	}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-25b86y">To load the dataset we use the <code>load_dataset()</code> method from the <code>datasets</code> library.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> random <span class="hljs-keyword">import</span> randrange

	<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset


	<span class="hljs-comment"># Load dataset from the hub</span>
	dataset_id = <span class="hljs-string">"tengomucho/simple_recipes"</span>
	recipes = load_dataset(dataset_id, split=<span class="hljs-string">"train"</span>)

	dataset_size = <span class="hljs-built_in">len</span>(recipes)
	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"dataset size: <span class="hljs-subst">{dataset_size}</span>"</span>)
	<span class="hljs-built_in">print</span>(recipes[randrange(dataset_size)])
	<span class="hljs-comment"># dataset size: 20000</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1e4z25x">To tune our model we need to convert our structured examples into chat-style conversations, so we define a preprocessing function that we can map over the dataset.</p> <p data-svelte-h="svelte-13ur3kw">The dataset should be structured with input-output pairs, where each input is a prompt and the output is the expected response from the model.
	We will make use of the model’s tokenizer chat template and preprocess the dataset to be fed to the trainer.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Preprocesses the dataset</span>
	<span class="hljs-keyword">def</span> <span class="hljs-title function_">preprocess_dataset_with_eos</span>(<span class="hljs-params">eos_token</span>):
	    <span class="hljs-keyword">def</span> <span class="hljs-title function_">preprocess_function</span>(<span class="hljs-params">examples</span>):
	        recipes = examples[<span class="hljs-string">"recipes"</span>]
	        names = examples[<span class="hljs-string">"names"</span>]

	        chats = []
	        <span class="hljs-keyword">for</span> recipe, name <span class="hljs-keyword">in</span> <span class="hljs-built_in">zip</span>(recipes, names):
	            <span class="hljs-comment"># Append the EOS token to the response</span>
	            recipe += eos_token

	            chat = [
	                {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">f"How can I make <span class="hljs-subst">{name}</span>?"</span>},
	                {<span class="hljs-string">"role"</span>: <span class="hljs-string">"assistant"</span>, <span class="hljs-string">"content"</span>: recipe},
	            ]

	            chats.append(chat)
	        <span class="hljs-keyword">return</span> {<span class="hljs-string">"messages"</span>: chats}

	    dataset = recipes.<span class="hljs-built_in">map</span>(preprocess_function, batched=<span class="hljs-literal">True</span>, remove_columns=recipes.column_names)
	    <span class="hljs-keyword">return</span> dataset

	<span class="hljs-comment"># Structures the dataset into prompt-expected output pairs.</span>
	<span class="hljs-keyword">def</span> <span class="hljs-title function_">formatting_function</span>(<span class="hljs-params">examples</span>):
	    <span class="hljs-keyword">return</span> tokenizer.apply_chat_template(examples[<span class="hljs-string">"messages"</span>], tokenize=<span class="hljs-literal">False</span>, add_generation_prompt=<span class="hljs-literal">False</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-rgxks7">Note: these functions reference <code>eos_token</code> and <code>tokenizer</code>, which are defined in the <a href="https://github.com/huggingface/optimum-neuron/blob/main/examples/training/qwen3/finetune_qwen3.py" rel="nofollow">Python script</a> used to run this tutorial.</p> <h2 class="relative group"><a id="3--fine-tune-qwen3-with-neuronsfttrainer-and-peft" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#3--fine-tune-qwen3-with-neuronsfttrainer-and-peft"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>3. 
🎯 Fine-tune Qwen3 with NeuronSFTTrainer and PEFT</span></h2> <p data-svelte-h="svelte-1iw1mwz">For standard PyTorch fine-tuning, you’d typically use <a href="https://github.com/huggingface/peft" rel="nofollow">PEFT</a> with LoRA adapters and the <a href="https://huggingface.co/docs/trl/en/sft_trainer" rel="nofollow"><code>SFTTrainer</code></a>.</p> <p data-svelte-h="svelte-dh3p5f">On AWS Trainium, <code>optimum-neuron</code> provides <code>NeuronSFTTrainer</code> as a drop-in replacement.</p> <p data-svelte-h="svelte-1l7hprp"><strong>Distributed Training on Trainium:</strong>
	Since Qwen3 doesn’t fit on a single accelerator, we use distributed training techniques:</p> <ul data-svelte-h="svelte-1iqb34b"><li>Data Parallel (DDP)</li> <li>Tensor Parallelism</li></ul> <p data-svelte-h="svelte-1xxdciy">Model loading and LoRA configuration work similarly to other accelerators.</p> <p data-svelte-h="svelte-1v29c1g">Combining all the pieces together, and assuming the dataset has already been loaded, we can write the following code to fine-tune Qwen3 on AWS Trainium:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->model_id = <span class="hljs-string">"Qwen/Qwen3-8B"</span>

	<span class="hljs-comment"># Define the training arguments</span>
	output_dir = <span class="hljs-string">"qwen3-finetuned-recipes"</span>
	training_args = NeuronTrainingArguments(
	output_dir=output_dir,
	num_train_epochs=<span class="hljs-number">3</span>,
	do_train=<span class="hljs-literal">True</span>,
	max_steps=-<span class="hljs-number">1</span>, <span class="hljs-comment"># -1 means train until the end of the dataset</span>
	per_device_train_batch_size=<span class="hljs-number">1</span>,
	gradient_accumulation_steps=<span class="hljs-number">8</span>,
	learning_rate=<span class="hljs-number">5e-4</span>,
	bf16=<span class="hljs-literal">True</span>,
	tensor_parallel_size=<span class="hljs-number">8</span>,
	logging_steps=<span class="hljs-number">2</span>,
	lr_scheduler_type=<span class="hljs-string">"cosine"</span>,
	overwrite_output_dir=<span class="hljs-literal">True</span>,
	)

	<span class="hljs-comment"># Load the model with the NeuronModelForCausalLM class.</span>
	<span class="hljs-comment"># It will load the model with a custom modeling implementation specifically designed for AWS Trainium.</span>
	trn_config = training_args.trn_config
	dtype = torch.bfloat16 <span class="hljs-keyword">if</span> training_args.bf16 <span class="hljs-keyword">else</span> torch.float32
	model = NeuronModelForCausalLM.from_pretrained(
	model_id,
	trn_config,
	dtype=dtype,
	<span class="hljs-comment"># Use FlashAttention2 for better performance and to be able to use larger sequence lengths.</span>
	attn_implementation=<span class="hljs-string">"flash_attention_2"</span>,
	)

	lora_config = LoraConfig(
	r=<span class="hljs-number">64</span>,
	lora_alpha=<span class="hljs-number">128</span>,
	lora_dropout=<span class="hljs-number">0.05</span>,
	target_modules=[
	<span class="hljs-string">"embed_tokens"</span>,
	<span class="hljs-string">"q_proj"</span>,
	<span class="hljs-string">"v_proj"</span>,
	<span class="hljs-string">"o_proj"</span>,
	<span class="hljs-string">"k_proj"</span>,
	<span class="hljs-string">"up_proj"</span>,
	<span class="hljs-string">"down_proj"</span>,
	<span class="hljs-string">"gate_proj"</span>,
	],
	bias=<span class="hljs-string">"none"</span>,
	task_type=<span class="hljs-string">"CAUSAL_LM"</span>,
	)

	<span class="hljs-comment"># Converting the NeuronTrainingArguments to a dictionary to feed them to the NeuronSFTConfig.</span>
	args = training_args.to_dict()

	sft_config = NeuronSFTConfig(
	max_length=<span class="hljs-number">4096</span>,
	packing=<span class="hljs-literal">True</span>,
	**args,
	)

	tokenizer = AutoTokenizer.from_pretrained(model_id)
	dataset = preprocess_dataset_with_eos(tokenizer.eos_token)

	<span class="hljs-keyword">def</span> <span class="hljs-title function_">formatting_function</span>(<span class="hljs-params">examples</span>):
	    <span class="hljs-keyword">return</span> tokenizer.apply_chat_template(examples[<span class="hljs-string">"messages"</span>], tokenize=<span class="hljs-literal">False</span>, add_generation_prompt=<span class="hljs-literal">False</span>)

	<span class="hljs-comment"># The NeuronSFTTrainer will use `formatting_function` to format the dataset and `lora_config` to apply LoRA on the</span>
	<span class="hljs-comment"># model.</span>
	trainer = NeuronSFTTrainer(
	args=sft_config,
	model=model,
	peft_config=lora_config,
	processing_class=tokenizer,
	train_dataset=dataset,
	formatting_func=formatting_function,
	)
	trainer.train()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1loxsby">📝 <strong>Complete script available:</strong> All steps above are combined in a ready-to-use script <a href="https://github.com/huggingface/optimum-neuron/blob/main/examples/training/qwen3/finetune_qwen3.py" rel="nofollow">finetune_qwen3.py</a>.</p> <p data-svelte-h="svelte-pvqso5">To launch training, just run the following command in your AWS Trainium instance:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Flags for Neuron compilation</span>
	<span class="hljs-built_in">export</span> NEURON_CC_FLAGS=<span class="hljs-string">"--model-type transformer --retry_failed_compilation"</span>
	<span class="hljs-built_in">export</span> NEURON_FUSE_SOFTMAX=1
	<span class="hljs-built_in">export</span> NEURON_RT_ASYNC_EXEC_MAX_INFLIGHT_REQUESTS=3 <span class="hljs-comment"># Async Runtime</span>
	<span class="hljs-built_in">export</span> MALLOC_ARENA_MAX=64 <span class="hljs-comment"># Host OOM mitigation</span>

	<span class="hljs-comment"># Variables for training</span>
	PROCESSES_PER_NODE=32
	NUM_EPOCHS=3
	TP_DEGREE=8
	BS=1
	GRADIENT_ACCUMULATION_STEPS=8
	LOGGING_STEPS=2
	MODEL_NAME=<span class="hljs-string">"Qwen/Qwen3-8B"</span> <span class="hljs-comment"># Change this to the desired model name</span>
	OUTPUT_DIR=<span class="hljs-string">"<span class="hljs-subst">$(echo $MODEL_NAME | cut -d'/' -f2)</span>-finetuned"</span>
	DISTRIBUTED_ARGS=<span class="hljs-string">"--nproc_per_node <span class="hljs-variable">$PROCESSES_PER_NODE</span>"</span>
	SCRIPT_DIR=$( <span class="hljs-built_in">cd</span> -- <span class="hljs-string">"<span class="hljs-subst">$( dirname -- <span class="hljs-string">"<span class="hljs-variable">${BASH_SOURCE[0]}</span>"</span> )</span>"</span> &> /dev/null && <span class="hljs-built_in">pwd</span> )

	<span class="hljs-keyword">if</span> [ <span class="hljs-string">"<span class="hljs-variable">$NEURON_EXTRACT_GRAPHS_ONLY</span>"</span> = <span class="hljs-string">"1"</span> ]; <span class="hljs-keyword">then</span>
	MAX_STEPS=5
	<span class="hljs-keyword">else</span>
	MAX_STEPS=-1
	<span class="hljs-keyword">fi</span>

	torchrun --nproc_per_node <span class="hljs-variable">$PROCESSES_PER_NODE</span> finetune_qwen3.py \
	--model_id <span class="hljs-variable">$MODEL_NAME</span> \
	--num_train_epochs <span class="hljs-variable">$NUM_EPOCHS</span> \
	--do_train \
	--max_steps <span class="hljs-variable">$MAX_STEPS</span> \
	--per_device_train_batch_size <span class="hljs-variable">$BS</span> \
	--gradient_accumulation_steps <span class="hljs-variable">$GRADIENT_ACCUMULATION_STEPS</span> \
	--learning_rate 8e-4 \
	--bf16 \
	--tensor_parallel_size <span class="hljs-variable">$TP_DEGREE</span> \
	--zero_1 \
	--async_save \
	--logging_steps <span class="hljs-variable">$LOGGING_STEPS</span> \
	--output_dir <span class="hljs-variable">$OUTPUT_DIR</span> \
	--lr_scheduler_type <span class="hljs-string">"cosine"</span> \
	--overwrite_output_dir<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-2cag06">🔧 <strong>Single command execution:</strong> The complete bash training script <a href="https://github.com/huggingface/optimum-neuron/blob/main/examples/training/qwen3/finetune_qwen3.sh" rel="nofollow">finetune_qwen3.sh</a> is available:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->./finetune_qwen3.sh<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="4--consolidate-and-test-the-fine-tuned-model" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#4--consolidate-and-test-the-fine-tuned-model"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" 
aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>4. 🔄 Consolidate and Test the Fine-Tuned Model</span></h2> <p data-svelte-h="svelte-46jexq">Optimum Neuron saves model shards separately during distributed training. These need to be consolidated before use.</p> <p data-svelte-h="svelte-9a9g75">Use the Optimum CLI to consolidate:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 
border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->optimum-cli neuron consolidate Qwen3-8B-finetuned Qwen3-8B-finetuned/adapter_default<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1cigzjn">This will create an <code>adapter_model.safetensors</code> file, the LoRA adapter weights that we trained in the previous step. We can now reload the model and merge it, so it can be loaded for evaluation:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, AutoTokenizer
	<span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> PeftModel, PeftConfig


	MODEL_NAME = <span class="hljs-string">"Qwen/Qwen3-8B"</span>
	ADAPTER_PATH = <span class="hljs-string">"Qwen3-8B-finetuned/adapter_default"</span>
	MERGED_MODEL_PATH = <span class="hljs-string">"Qwen3-8B-recipes"</span>

	<span class="hljs-comment"># Load base model</span>
	model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
	tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

	<span class="hljs-comment"># Load adapter configuration and model</span>
	adapter_config = PeftConfig.from_pretrained(ADAPTER_PATH)
	finetuned_model = PeftModel.from_pretrained(model, ADAPTER_PATH, config=adapter_config)

	<span class="hljs-built_in">print</span>(<span class="hljs-string">"Saving tokenizer"</span>)
	tokenizer.save_pretrained(MERGED_MODEL_PATH)
	<span class="hljs-built_in">print</span>(<span class="hljs-string">"Saving model"</span>)
	finetuned_model = finetuned_model.merge_and_unload()
	finetuned_model.save_pretrained(MERGED_MODEL_PATH)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-vght15">Once this step is done, it is possible to test the model with a new prompt.</p> <p data-svelte-h="svelte-1sz4goc">You have successfully created a fine-tuned model from Qwen3!</p> <h2 class="relative group"><a id="5--push-to-hugging-face-hub" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#5--push-to-hugging-face-hub"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>5. 
🤗 Push to Hugging Face Hub</span></h2> <p data-svelte-h="svelte-a60mly">Share your fine-tuned model with the community by uploading it to the Hugging Face Hub.</p> <p data-svelte-h="svelte-z9yml0"><strong>Step 1: Authentication</strong></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->huggingface-cli login<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-h2brza"><strong>Step 2: Upload your model</strong></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" 
height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, AutoTokenizer

	MERGED_MODEL_PATH = <span class="hljs-string">"Qwen3-8B-recipes"</span>
	HUB_MODEL_NAME = <span class="hljs-string">"your-username/qwen3-8b-recipes"</span>

	<span class="hljs-comment"># Load and push tokenizer</span>
	tokenizer = AutoTokenizer.from_pretrained(MERGED_MODEL_PATH)
	tokenizer.push_to_hub(HUB_MODEL_NAME)

	<span class="hljs-comment"># Load and push model</span>
	model = AutoModelForCausalLM.from_pretrained(MERGED_MODEL_PATH)
	model.push_to_hub(HUB_MODEL_NAME)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-fnz57k">🎉 <strong>Your fine-tuned Qwen3 model is now available on the Hub for others to use!</strong></p> <p></p>

	<script>
	// Page bootstrap: publishes the path configuration object, then loads the
	// `start` and `app` entry modules and calls `kit.start` against the element
	// that contains this <script> tag.
	{
	// Assigned without a declaration, so the binding is not confined to this
	// block. NOTE(review): presumably the imported entry modules read this
	// object by its exact name -- confirm before renaming or declaring it.
	__sveltekit_wi4mep = {
	assets: "/docs/optimum.neuron/v0.4.4/en",
	base: "/docs/optimum.neuron/v0.4.4/en",
	env: {}
	};

	// Captured synchronously, before the dynamic imports below resolve:
	// `document.currentScript` is only set while this script is executing, so
	// it could not be read from inside the `.then` callback.
	const element = document.currentScript.parentElement;

	// Passed through to `kit.start` as `data`.
	// NOTE(review): presumably one entry per id in `node_ids` (both have two
	// entries) -- confirm against the SvelteKit version in use.
	const data = [null,null];

	// Both entry modules are requested in parallel; `kit.start` runs only once
	// both have loaded. No rejection handler is attached to this chain.
	Promise.all([
	import("/docs/optimum.neuron/v0.4.4/en/_app/immutable/entry/start.40e9a376.js"),
	import("/docs/optimum.neuron/v0.4.4/en/_app/immutable/entry/app.0fb5ce66.js")
	]).then(([kit, app]) => {
	kit.start(app, element, {
	node_ids: [0, 48],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size: 42.9 kB
Xet hash: 7d4498de0ed634238d6ed3b891aa1958bd7028ea13a2948d6596e8c237e6ae72

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.