"""Audio Flamingo 3 — Interactive Demo.

HuggingFace Space with ZeroGPU: the GPU is allocated dynamically per
request, so there is no idle cost.

Model: nvidia/audio-flamingo-3-hf
License: NVIDIA OneWay Noncommercial License
"""

import gradio as gr
import torch
import spaces
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

# ── Model loading ──────────────────────────────────────────────────────────────
# With ZeroGPU, the model lives in CPU RAM between requests.
# GPU is only allocated inside functions decorated with @spaces.GPU.
MODEL_ID = "nvidia/audio-flamingo-3-hf"

print(f"Loading {MODEL_ID}...")
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
model.eval()
print("Model ready.")

# Prompt suffix that switches AF3 into its "AF-Think" chain-of-thought mode.
# NOTE(review): wording taken from the AF3 model card — confirm it matches the
# exact suffix the released checkpoint was trained with.
THINK_SUFFIX = "\nPlease think and reason about the input audio before you respond."


def _parse_af_think(raw):
    """Split AF-Think output of the form ``<think>...</think> Answer: ...``.

    Returns ``(reasoning, answer)``. When no think block is present, the
    reasoning is empty and the full raw text is returned as the answer.
    """
    reasoning, answer = "", raw
    if "<think>" in raw and "</think>" in raw:
        start = raw.index("<think>") + len("<think>")
        end = raw.index("</think>")
        reasoning = raw[start:end].strip()
        after = raw[end + len("</think>"):].strip()
        if "Answer:" in after:
            answer = after.split("Answer:", 1)[1].strip()
        else:
            # No explicit "Answer:" marker — fall back to whatever follows
            # the think block, or the whole output if nothing does.
            answer = after or raw
    return reasoning, answer


# ── Inference ──────────────────────────────────────────────────────────────────
@spaces.GPU(duration=120)
def run_inference(audio_path, question, use_thinking, max_new_tokens):
    """Answer *question* about the audio file at *audio_path* with AF3.

    Parameters
    ----------
    audio_path : str | None
        Filesystem path to the uploaded/recorded audio clip.
    question : str
        The user's question about the audio.
    use_thinking : bool
        When True, append the AF-Think suffix so the model emits
        ``<think>...</think>`` reasoning before its answer.
    max_new_tokens : int | float
        Generation budget (coerced to ``int``).

    Returns
    -------
    tuple[str, str]
        ``(reasoning, answer)`` — reasoning is empty without a think block.
    """
    if audio_path is None:
        return "", "Please upload an audio file first."
    if not question.strip():
        return "", "Please enter a question."

    # Fix: the AF-Think toggle was previously accepted but never used.
    prompt = question + THINK_SUFFIX if use_thinking else question

    # Move model to GPU (allocated by ZeroGPU at this point).
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    try:
        # Build conversation in AF3 chat format.
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "audio", "path": audio_path},
                ],
            }
        ]

        # Tokenize (the processor also loads/encodes the audio file).
        inputs = processor.apply_chat_template(
            conversation,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
        ).to(device)

        # Greedy generation — deterministic output for a demo.
        with torch.inference_mode():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=int(max_new_tokens),
                do_sample=False,
            )

        # Decode new tokens only (strip the input prefix).
        new_tokens = output_ids[:, inputs["input_ids"].shape[1]:]
        raw = processor.batch_decode(
            new_tokens, skip_special_tokens=True, strip_prefix=True
        )[0].strip()
    finally:
        # Always release the GPU back to the ZeroGPU pool, even if
        # generation raised — otherwise the model stays pinned on-device.
        model.to("cpu")
        torch.cuda.empty_cache()

    # Parse AF-Think format: <think>...</think> Answer: ...
    return _parse_af_think(raw)


# ── Example audio ──────────────────────────────────────────────────────────────
AUDIO_BASE = "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets"

EXAMPLES = [
    [
        f"{AUDIO_BASE}/WhDJDIviAOg_120_10.mp3",
        "What is happening in this audio? Describe in detail.",
        True,
        300,
    ],
    [
        f"{AUDIO_BASE}/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav",
        "Transcribe the speech in this recording.",
        False,
        200,
    ],
    [
        f"{AUDIO_BASE}/WhDJDIviAOg_120_10.mp3",
        "Is this music or environmental sound? Reason step by step.",
        True,
        400,
    ],
]

# ── UI ─────────────────────────────────────────────────────────────────────────
CSS = """
.gradio-container { max-width: 900px; margin: auto; }
.think-box { border-left: 3px solid #FFE66D; padding: 10px 14px; background: #111; border-radius: 6px; color: #ccc; font-size: 0.9rem; }
.answer-box { border-left: 3px solid #4ECDC4; padding: 10px 14px; border-radius: 6px; }
.subtitle { color: #888; font-size: 0.92rem; margin-top: -6px; margin-bottom: 20px; }
footer { display: none !important; }
"""

with gr.Blocks(css=CSS, title="Audio Flamingo 3") as demo:
    gr.Markdown("# 🦩 Audio Flamingo 3")
    # NOTE(review): the subtitle's HTML wrapper was lost in transit; the
    # orphaned `.subtitle` CSS rule above implies it was a tagged paragraph.
    gr.Markdown(
        '<p class="subtitle">'
        "NVIDIA's audio language model — NeurIPS 2025 Spotlight. "
        "Upload audio, ask a question, get a reasoned answer."
        "</p>"
    )

    with gr.Row(equal_height=False):
        # Left column: inputs
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                label="Audio",
                type="filepath",
                sources=["upload", "microphone"],
            )
            question_input = gr.Textbox(
                label="Question",
                placeholder="What do you hear? What instrument? Transcribe the speech.",
                lines=2,
            )
            with gr.Row():
                think_toggle = gr.Checkbox(
                    label="🧠 AF-Think",
                    value=True,
                    info="Step-by-step reasoning before the answer",
                )
                max_tokens = gr.Slider(
                    label="Max tokens",
                    minimum=50,
                    maximum=600,
                    value=300,
                    step=50,
                )
            submit_btn = gr.Button("Ask AF3 →", variant="primary", size="lg")

        # Right column: outputs
        with gr.Column(scale=1):
            reasoning_output = gr.Textbox(
                label="🧠 Reasoning (AF-Think)",
                lines=7,
                interactive=False,
                placeholder="Chain-of-thought reasoning will appear here...",
                elem_classes=["think-box"],
            )
            answer_output = gr.Textbox(
                label="✅ Answer",
                lines=4,
                interactive=False,
                placeholder="Answer will appear here...",
                elem_classes=["answer-box"],
            )

    # Submit on button click or Enter key.
    submit_btn.click(
        fn=run_inference,
        inputs=[audio_input, question_input, think_toggle, max_tokens],
        outputs=[reasoning_output, answer_output],
    )
    question_input.submit(
        fn=run_inference,
        inputs=[audio_input, question_input, think_toggle, max_tokens],
        outputs=[reasoning_output, answer_output],
    )

    gr.Examples(
        examples=EXAMPLES,
        inputs=[audio_input, question_input, think_toggle, max_tokens],
        outputs=[reasoning_output, answer_output],
        fn=run_inference,
        cache_examples=False,  # ZeroGPU does not support example caching
        label="Preset examples",
    )

    gr.Markdown(
        "---\n"
        "**Model:** [nvidia/audio-flamingo-3-hf](https://huggingface.co/nvidia/audio-flamingo-3-hf) • "
        "**Paper:** [NeurIPS 2025 Spotlight](https://research.nvidia.com/labs/adlr/AF3/) • "
        "**License:** NVIDIA OneWay Noncommercial\n\n"
        "*AF3 processes audio in 30-second windows, up to 10 minutes total.*"
    )

demo.launch()