Buckets:
/**
 * Generated, minified component module for the documentation page
 * "optimum-neuron plugin for vLLM" (title as given in the `Ke` metadata string).
 *
 * NOTE(review): this looks like Svelte compiler output (`data-svelte-h`
 * markers, `$$.fragment` lifecycle calls) and is presumably not meant to be
 * edited by hand; text fixes likely belong in the page's source document.
 * NOTE(review): the `| ... | |` delimiters around lines and the line breaks in
 * the middle of tokens look like extraction artifacts rather than part of the
 * module; confirm against the real file before relying on exact string bytes.
 *
 * Oe($e): returns the fragment object for the page. Its methods:
 *   c()     creates every node with `o(tag)` / `a()`, assigns the static
 *           text or HTML strings declared at the top of the function, and
 *           calls `p(...)` on each child component fragment (one `De` menu,
 *           the `q` headings and the `z` code blocks), then calls `this.h()`.
 *   l(e)    builds the same node list from `e` through `i(...)` / `s(...)` /
 *           `u(...)`; a node's content is re-assigned only when `m(node)`
 *           differs from the expected "svelte-..." string. Ends with
 *           `this.h()`.
 *   h()     sets the `name` ("hf:doc:metadata") and `content` (`Ke`)
 *           attributes on the meta element `d`.
 *   m(e,t)  appends the meta element to `document.head`, inserts all other
 *           nodes and child fragments in document order, and sets the `ge`
 *           flag to true.
 *   p       is the imported `Ee` (the `n` export of the scheduler chunk).
 *   i(e)    calls `M(...)` on every child fragment unless `ge` is already
 *           true, then sets `ge` to true.
 *   o(e)    calls `y(...)` on every child fragment and sets `ge` to false.
 *   d(e)    when `e` is truthy, calls `l(node)` on each inserted node; it
 *           always calls `l(d)` on the meta element and `c(child, e)` on each
 *           child component.
 *
 * Every `z` code block receives the snippet twice: `code` (an encoded string)
 * and `highlighted` (HTML markup). NOTE(review): `code` is presumably the
 * base64 form of the URL-encoded snippet; verify, because any change to a
 * `highlighted` string has to be mirrored in the matching `code` string.
 *
 * Ke: JSON string holding the page title, anchor and section list.
 * et($e): passes `Re` a callback that reads the `fw` query parameter from
 *   `window.location.search` and discards the value; returns an empty array.
 * ot: class extending `qe`; its constructor calls
 *   `Ye(this, d, et, Oe, ze, {})`. Exported under the name `component`.
 *
 * NOTE(review): the multi-line `optimum-cli neuron serve` example ends with a
 * trailing backslash after `--tensor_parallel_size=2`, which leaves a dangling
 * shell line continuation when copied; confirm and fix at the source.
 * NOTE(review): that same example uses "unsloth/Llama-3.1-1B-Instruct" while
 * the other examples use "unsloth/Llama-3.2-1B-Instruct"; confirm which model
 * id is intended.
 */
| import{s as ze,n as Ee,o as Re}from"../chunks/scheduler.56725da7.js";import{S as qe,i as Ye,e as o,s as a,c as p,h as Fe,a as i,d as l,b as s,f as ke,g as u,j as m,k as We,l as Pe,m as n,n as r,t as M,o as y,p as c}from"../chunks/index.18a26576.js";import{C as De}from"../chunks/CopyLLMTxtMenu.4513c8ed.js";import{C as z}from"../chunks/CodeBlock.58e3e98b.js";import{H as q}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.049405bf.js";function Oe($e){let d,Y,E,F,f,P,T,D,w,ve=`The <code>optimum-neuron</code> package includes a <a href="https://docs.vllm.ai/en/latest/" rel="nofollow">vLLM</a> plugin | |
| that registers an ‘optimum-neuron’ vLLM platform specifically designed to ease the deployment | |
| of models hosted on the Hugging Face hub to AWS Trainium and Inferentia.`,O,h,je="This platform supports two modes of operation:",K,J,Ie='<li>it can be used for the inference of pre-exported Neuron models directly from the hub,</li> <li>but it allows also the simplified deployment of vanilla models directly without recompilation using <a href="#hugging-face-neuron-cache">cached artifacts</a>.</li>',ee,U,xe="Notes",te,b,Be=`<li>only a relevant subset of all possible configurations for a given model are cached,</li> <li>you can use the <code>optimum-cli</code> to get all <a href="https://huggingface.co/docs/optimum-neuron/guides/cache_system#neuron-model-cache-lookup-inferentia-only" rel="nofollow">cached configurations</a> for each model.</li> <li>to deploy models that are not cached on the Hugging Face hub, you need to <a href="https://huggingface.co/docs/optimum-neuron/main/en/guides/export_model" rel="nofollow">export</a> | |
| them beforehand.</li>`,le,C,ne,g,Le=`The easiest way to use the <code>optimum-neuron</code> vLLM platform is to launch an Amazon ec2 instance using | |
| the <a href="https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2" rel="nofollow">Hugging Face Neuron Deep Learning AMI</a>. If you decide NOT to make your life easier by using <a href="https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2" rel="nofollow">Hugging Face Neuron Deep Learning AMI</a>, you can install this functionality into your Neuron environment with <code>pip install optimum-neuron[neuronx,vllm]</code>.`,ae,$,_e="Note: Trn2 instances are not supported by the <code>optimum-neuron</code> platform yet.",se,v,Ge='<li>After launching the instance, follow the instructions in <a href="https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html" rel="nofollow">Connect to your instance</a> to connect to the instance</li> <li>Once inside your instance, activate the pre-installed <code>optimum-neuron</code> virtual environment by running</li>',oe,j,ie,I,me,x,Xe="The easiest way to test a model is to use the python API:",pe,B,ue,L,re,_,Ae="The easiest way to serve a model is to use the <code>optimum-cli</code>:",Me,G,ye,X,Ne="The model can be a pre-exported neuron model or a standard hub model.",ce,A,He="When deploying a standard hub model, you can customize the way it will be exported:",de,N,fe,H,Se=`Note: by default <code>optimum-cli</code> will only <code>serve</code> standard models for which a cached configuration exists. | |
| This behaviour can be overridden using the <code>--allow_non_cached_model</code> argument.`,Te,S,Ze=`If you omit one parameter, <code>optimum-neuron</code> will select a default value for you based | |
| on the deployment target and prioritizing cached configurations.`,we,Z,Qe="Use the following command to test the model:",he,Q,Je,V,Ue,k,Ve="You can also launch an Open AI compatible inference server directly using vLLM entry points:",be,W,Ce,R,ge;return f=new De({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),T=new q({props:{title:"optimum-neuron plugin for vLLM",local:"optimum-neuron-plugin-for-vllm",headingTag:"h1"}}),C=new q({props:{title:"Setup",local:"setup",headingTag:"h2"}}),j=new z({props:{code:"c291cmNlJTIwJTJGb3B0JTJGYXdzX25ldXJvbnhfdmVudl9weXRvcmNoXzJfNyUyRmJpbiUyRmFjdGl2YXRl",highlighted:"source /opt/aws_neuronx_venv_pytorch_2_7/bin/activate",wrap:!1}}),I=new q({props:{title:"Generating content programmatically",local:"generating-content-programmatically",headingTag:"h2"}}),B=new z({props:{code:"ZnJvbSUyMHZsbG0lMjBpbXBvcnQlMjBMTE0lMkMlMjBTYW1wbGluZ1BhcmFtcyUwQSUwQXByb21wdHMlMjAlM0QlMjAlNUIlMEElMjAlMjAlMjAlMjAlMjJIZWxsbyUyQyUyMG15JTIwbmFtZSUyMGlzJTIyJTJDJTBBJTIwJTIwJTIwJTIwJTIyVGhlJTIwcHJlc2lkZW50JTIwb2YlMjB0aGUlMjBVbml0ZWQlMjBTdGF0ZXMlMjBpcyUyMiUyQyUwQSUyMCUyMCUyMCUyMCUyMlRoZSUyMGNhcGl0YWwlMjBvZiUyMEZyYW5jZSUyMGlzJTIyJTJDJTBBJTIwJTIwJTIwJTIwJTIyVGhlJTIwZnV0dXJlJTIwb2YlMjBBSSUyMGlzJTIyJTJDJTBBJTVEJTBBc2FtcGxpbmdfcGFyYW1zJTIwJTNEJTIwU2FtcGxpbmdQYXJhbXModGVtcGVyYXR1cmUlM0QwLjglMkMlMjB0b3BfcCUzRDAuOTUpJTBBJTBBbGxtJTIwJTNEJTIwTExNKG1vZGVsJTNEJTIydW5zbG90aCUyRkxsYW1hLTMuMi0xQi1JbnN0cnVjdCUyMiUyQyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMG1heF9udW1fc2VxcyUzRDQlMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBtYXhfbW9kZWxfbGVuJTNENDA5NiUyQyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHRlbnNvcl9wYXJhbGxlbF9zaXplJTNEMiklMEElMEFvdXRwdXRzJTIwJTNEJTIwbGxtLmdlbmVyYXRlKHByb21wdHMlMkMlMjBzYW1wbGluZ19wYXJhbXMpJTBBJTBBZm9yJTIwb3V0cHV0JTIwaW4lMjBvdXRwdXRzJTNBJTBBJTIwJTIwJTIwJTIwcHJvbXB0JTIwJTNEJTIwb3V0cHV0LnByb21wdCUwQSUyMCUyMCUyMCUyMGdlbmVyYXRlZF90ZXh0JTIwJTNEJTIwb3V0cHV0Lm91dHB1dHMlNUIwJTVELnRleHQlMEElMjA
lMjAlMjAlMjBwcmludChmJTIyUHJvbXB0JTNBJTIwJTdCcHJvbXB0IXIlN0QlMkMlMjBHZW5lcmF0ZWQlMjB0ZXh0JTNBJTIwJTdCZ2VuZXJhdGVkX3RleHQhciU3RCUyMik=",highlighted:`<span class="hljs-keyword">from</span> vllm <span class="hljs-keyword">import</span> LLM, SamplingParams | |
| prompts = [ | |
| <span class="hljs-string">"Hello, my name is"</span>, | |
| <span class="hljs-string">"The president of the United States is"</span>, | |
| <span class="hljs-string">"The capital of France is"</span>, | |
| <span class="hljs-string">"The future of AI is"</span>, | |
| ] | |
| sampling_params = SamplingParams(temperature=<span class="hljs-number">0.8</span>, top_p=<span class="hljs-number">0.95</span>) | |
| llm = LLM(model=<span class="hljs-string">"unsloth/Llama-3.2-1B-Instruct"</span>, | |
| max_num_seqs=<span class="hljs-number">4</span>, | |
| max_model_len=<span class="hljs-number">4096</span>, | |
| tensor_parallel_size=<span class="hljs-number">2</span>) | |
| outputs = llm.generate(prompts, sampling_params) | |
| <span class="hljs-keyword">for</span> output <span class="hljs-keyword">in</span> outputs: | |
| prompt = output.prompt | |
| generated_text = output.outputs[<span class="hljs-number">0</span>].text | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"Prompt: <span class="hljs-subst">{prompt!r}</span>, Generated text: <span class="hljs-subst">{generated_text!r}</span>"</span>)`,wrap:!1}}),L=new q({props:{title:"Serving a model",local:"serving-a-model",headingTag:"h2"}}),G=new z({props:{code:"b3B0aW11bS1jbGklMjBuZXVyb24lMjBzZXJ2ZSUyMC0tbW9kZWwlM0QlM0Ntb2RlbF9uYW1lX29yX3BhdGglM0U=",highlighted:"optimum-cli neuron serve --model=<model_name_or_path>",wrap:!1}}),N=new z({props:{code:"b3B0aW11bS1jbGklMjBuZXVyb24lMjBzZXJ2ZSUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tbW9kZWwlM0QlMjJ1bnNsb3RoJTJGTGxhbWEtMy4xLTFCLUluc3RydWN0JTIyJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1iYXRjaF9zaXplJTNENCUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tc2VxdWVuY2VfbGVuZ3RoJTNENDA5NiUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tdGVuc29yX3BhcmFsbGVsX3NpemUlM0QyJTIwJTVD",highlighted:`optimum-cli neuron serve \\ | |
| --model="unsloth/Llama-3.1-1B-Instruct" \\ | |
| --batch_size=4 \\ | |
| --sequence_length=4096 \\ | |
| --tensor_parallel_size=2 \\`,wrap:!1}}),Q=new z({props:{code:"Y3VybCUyMDEyNy4wLjAuMSUzQTgwODAlMkZ2MSUyRmNvbXBsZXRpb25zJTIwJTVDJTBBJTIwJTIwJTIwJTIwLUglMjAnQ29udGVudC1UeXBlJTNBJTIwYXBwbGljYXRpb24lMkZqc29uJyUyMCU1QyUwQSUyMCUyMCUyMCUyMC1YJTIwUE9TVCUyMCU1QyUwQSUyMCUyMCUyMCUyMC1kJTIwJyU3QiUyMnByb21wdCUyMiUzQSUyMk9uZSUyMG9mJTIwbXklMjBmb25kZXN0JTIwbWVtb3J5JTIwaXMlMjIlMkMlMjAlMjJ0ZW1wZXJhdHVyZSUyMiUzQSUyMDAuOCUyQyUyMCUyMm1heF90b2tlbnMlMjIlM0ExMjglN0Qn",highlighted:`curl 127.0.0.1:8080/v1/completions \\ | |
| -H 'Content-Type: application/json' \\ | |
| -X POST \\ | |
| -d '{"prompt":"One of my fondest memory is", "temperature": 0.8, "max_tokens":128}'`,wrap:!1}}),V=new q({props:{title:"Custom deployment for advanced users",local:"custom-deployment-for-advanced-users",headingTag:"h2"}}),W=new z({props:{code:"cHl0aG9uJTIwLW0lMjB2bGxtLmVudHJ5cG9pbnRzLm9wZW5haS5hcGlfc2VydmVyJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1tb2RlbCUzRCUyMnVuc2xvdGglMkZMbGFtYS0zLjItMUItSW5zdHJ1Y3QlMjIlMjAlNUMlMEElMjAlMjAlMjAlMjAtLW1heC1udW0tc2VxcyUzRDQlMjAlNUMlMEElMjAlMjAlMjAlMjAtLW1heC1tb2RlbC1sZW4lM0Q0MDk2JTIwJTVDJTBBJTIwJTIwJTIwJTIwLS10ZW5zb3ItcGFyYWxsZWwtc2l6ZSUzRDIlMjAlNUMlMEElMjAlMjAlMjAlMjAtLXBvcnQlM0Q4MDgw",highlighted:`python -m vllm.entrypoints.openai.api_server \\ | |
| --model="unsloth/Llama-3.2-1B-Instruct" \\ | |
| --max-num-seqs=4 \\ | |
| --max-model-len=4096 \\ | |
| --tensor-parallel-size=2 \\ | |
| --port=8080`,wrap:!1}}),{c(){d=o("meta"),Y=a(),E=o("p"),F=a(),p(f.$$.fragment),P=a(),p(T.$$.fragment),D=a(),w=o("p"),w.innerHTML=ve,O=a(),h=o("p"),h.textContent=je,K=a(),J=o("ul"),J.innerHTML=Ie,ee=a(),U=o("p"),U.textContent=xe,te=a(),b=o("ul"),b.innerHTML=Be,le=a(),p(C.$$.fragment),ne=a(),g=o("p"),g.innerHTML=Le,ae=a(),$=o("p"),$.innerHTML=_e,se=a(),v=o("ul"),v.innerHTML=Ge,oe=a(),p(j.$$.fragment),ie=a(),p(I.$$.fragment),me=a(),x=o("p"),x.textContent=Xe,pe=a(),p(B.$$.fragment),ue=a(),p(L.$$.fragment),re=a(),_=o("p"),_.innerHTML=Ae,Me=a(),p(G.$$.fragment),ye=a(),X=o("p"),X.textContent=Ne,ce=a(),A=o("p"),A.textContent=He,de=a(),p(N.$$.fragment),fe=a(),H=o("p"),H.innerHTML=Se,Te=a(),S=o("p"),S.innerHTML=Ze,we=a(),Z=o("p"),Z.textContent=Qe,he=a(),p(Q.$$.fragment),Je=a(),p(V.$$.fragment),Ue=a(),k=o("p"),k.textContent=Ve,be=a(),p(W.$$.fragment),Ce=a(),R=o("p"),this.h()},l(e){const t=Fe("svelte-u9bgzb",document.head);d=i(t,"META",{name:!0,content:!0}),t.forEach(l),Y=s(e),E=i(e,"P",{}),ke(E).forEach(l),F=s(e),u(f.$$.fragment,e),P=s(e),u(T.$$.fragment,e),D=s(e),w=i(e,"P",{"data-svelte-h":!0}),m(w)!=="svelte-28yqn4"&&(w.innerHTML=ve),O=s(e),h=i(e,"P",{"data-svelte-h":!0}),m(h)!=="svelte-z6gb1b"&&(h.textContent=je),K=s(e),J=i(e,"UL",{"data-svelte-h":!0}),m(J)!=="svelte-1y9py5l"&&(J.innerHTML=Ie),ee=s(e),U=i(e,"P",{"data-svelte-h":!0}),m(U)!=="svelte-1y2vhyh"&&(U.textContent=xe),te=s(e),b=i(e,"UL",{"data-svelte-h":!0}),m(b)!=="svelte-1isgnu3"&&(b.innerHTML=Be),le=s(e),u(C.$$.fragment,e),ne=s(e),g=i(e,"P",{"data-svelte-h":!0}),m(g)!=="svelte-valqmb"&&(g.innerHTML=Le),ae=s(e),$=i(e,"P",{"data-svelte-h":!0}),m($)!=="svelte-18hhclg"&&($.innerHTML=_e),se=s(e),v=i(e,"UL",{"data-svelte-h":!0}),m(v)!=="svelte-140kn6a"&&(v.innerHTML=Ge),oe=s(e),u(j.$$.fragment,e),ie=s(e),u(I.$$.fragment,e),me=s(e),x=i(e,"P",{"data-svelte-h":!0}),m(x)!=="svelte-12fxvlq"&&(x.textContent=Xe),pe=s(e),u(B.$$.fragment,e),ue=s(e),u(L.$$.fragment,e),re=s(e),_=i(e,"P",{"data-svelte-h":!0}),m(_)!=="svelte-3wro
4u"&&(_.innerHTML=Ae),Me=s(e),u(G.$$.fragment,e),ye=s(e),X=i(e,"P",{"data-svelte-h":!0}),m(X)!=="svelte-7r6j2m"&&(X.textContent=Ne),ce=s(e),A=i(e,"P",{"data-svelte-h":!0}),m(A)!=="svelte-li5n6k"&&(A.textContent=He),de=s(e),u(N.$$.fragment,e),fe=s(e),H=i(e,"P",{"data-svelte-h":!0}),m(H)!=="svelte-1cus4pe"&&(H.innerHTML=Se),Te=s(e),S=i(e,"P",{"data-svelte-h":!0}),m(S)!=="svelte-55sgvw"&&(S.innerHTML=Ze),we=s(e),Z=i(e,"P",{"data-svelte-h":!0}),m(Z)!=="svelte-1fua9k5"&&(Z.textContent=Qe),he=s(e),u(Q.$$.fragment,e),Je=s(e),u(V.$$.fragment,e),Ue=s(e),k=i(e,"P",{"data-svelte-h":!0}),m(k)!=="svelte-16sg5pw"&&(k.textContent=Ve),be=s(e),u(W.$$.fragment,e),Ce=s(e),R=i(e,"P",{}),ke(R).forEach(l),this.h()},h(){We(d,"name","hf:doc:metadata"),We(d,"content",Ke)},m(e,t){Pe(document.head,d),n(e,Y,t),n(e,E,t),n(e,F,t),r(f,e,t),n(e,P,t),r(T,e,t),n(e,D,t),n(e,w,t),n(e,O,t),n(e,h,t),n(e,K,t),n(e,J,t),n(e,ee,t),n(e,U,t),n(e,te,t),n(e,b,t),n(e,le,t),r(C,e,t),n(e,ne,t),n(e,g,t),n(e,ae,t),n(e,$,t),n(e,se,t),n(e,v,t),n(e,oe,t),r(j,e,t),n(e,ie,t),r(I,e,t),n(e,me,t),n(e,x,t),n(e,pe,t),r(B,e,t),n(e,ue,t),r(L,e,t),n(e,re,t),n(e,_,t),n(e,Me,t),r(G,e,t),n(e,ye,t),n(e,X,t),n(e,ce,t),n(e,A,t),n(e,de,t),r(N,e,t),n(e,fe,t),n(e,H,t),n(e,Te,t),n(e,S,t),n(e,we,t),n(e,Z,t),n(e,he,t),r(Q,e,t),n(e,Je,t),r(V,e,t),n(e,Ue,t),n(e,k,t),n(e,be,t),r(W,e,t),n(e,Ce,t),n(e,R,t),ge=!0},p:Ee,i(e){ge||(M(f.$$.fragment,e),M(T.$$.fragment,e),M(C.$$.fragment,e),M(j.$$.fragment,e),M(I.$$.fragment,e),M(B.$$.fragment,e),M(L.$$.fragment,e),M(G.$$.fragment,e),M(N.$$.fragment,e),M(Q.$$.fragment,e),M(V.$$.fragment,e),M(W.$$.fragment,e),ge=!0)},o(e){y(f.$$.fragment,e),y(T.$$.fragment,e),y(C.$$.fragment,e),y(j.$$.fragment,e),y(I.$$.fragment,e),y(B.$$.fragment,e),y(L.$$.fragment,e),y(G.$$.fragment,e),y(N.$$.fragment,e),y(Q.$$.fragment,e),y(V.$$.fragment,e),y(W.$$.fragment,e),ge=!1},d(e){e&&(l(Y),l(E),l(F),l(P),l(D),l(w),l(O),l(h),l(K),l(J),l(ee),l(U),l(te),l(b),l(le),l(ne),l(g),l(ae),l($),l(se),l(v),l(oe),l(ie),l(me),l(x),l(pe),l(ue
),l(re),l(_),l(Me),l(ye),l(X),l(ce),l(A),l(de),l(fe),l(H),l(Te),l(S),l(we),l(Z),l(he),l(Je),l(Ue),l(k),l(be),l(Ce),l(R)),l(d),c(f,e),c(T,e),c(C,e),c(j,e),c(I,e),c(B,e),c(L,e),c(G,e),c(N,e),c(Q,e),c(V,e),c(W,e)}}}const Ke='{"title":"optimum-neuron plugin for vLLM","local":"optimum-neuron-plugin-for-vllm","sections":[{"title":"Setup","local":"setup","sections":[],"depth":2},{"title":"Generating content programmatically","local":"generating-content-programmatically","sections":[],"depth":2},{"title":"Serving a model","local":"serving-a-model","sections":[],"depth":2},{"title":"Custom deployment for advanced users","local":"custom-deployment-for-advanced-users","sections":[],"depth":2}],"depth":1}';function et($e){return Re(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class ot extends qe{constructor(d){super(),Ye(this,d,et,Oe,ze,{})}}export{ot as component}; | |
Xet Storage Details
- Size:
- 13.9 kB
- Xet hash:
- 9f770bb902d6c0b7e61133cc67374f983dad5e7a563b9fb69b76350d545aa105
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.