"""
Audio Flamingo 3 — Interactive Demo
HuggingFace Space with ZeroGPU.
GPU is allocated dynamically per request — no idle cost.
Model: nvidia/audio-flamingo-3-hf
License: NVIDIA OneWay Noncommercial License
"""
import re

import gradio as gr
import spaces
import torch
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
# ── Model loading ──────────────────────────────────────────────────────────────
# With ZeroGPU, the model lives in CPU RAM between requests.
# GPU is only allocated inside functions decorated with @spaces.GPU.
# NOTE: this runs once at import time (Space startup), so the download /
# deserialization cost is paid before the first request arrives.
MODEL_ID = "nvidia/audio-flamingo-3-hf"
print(f"Loading {MODEL_ID}...")
# Processor bundles the tokenizer + audio feature extractor for AF3.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(
MODEL_ID,
# fp16 halves CPU-RAM footprint and matches the GPU dtype used at inference.
torch_dtype=torch.float16,
# Stream weights into the model instead of materializing a full fp32 copy.
low_cpu_mem_usage=True,
)
# Inference-only: disable dropout etc.; gradients are never needed here.
model.eval()
print("Model ready.")
# ── Inference ──────────────────────────────────────────────────────────────────
@spaces.GPU(duration=120)
def run_inference(audio_path, question, use_thinking, max_new_tokens):
    """Answer *question* about the audio clip at *audio_path* with AF3.

    Parameters
    ----------
    audio_path : str | None
        Filesystem path of the uploaded/recorded clip (Gradio ``type="filepath"``).
    question : str
        User's question about the audio.
    use_thinking : bool
        When True, request AF-Think step-by-step reasoning and split the
        ``<think>...</think>`` block out of the final answer.
    max_new_tokens : int | float
        Generation budget; Gradio sliders deliver floats, hence ``int(...)``.

    Returns
    -------
    tuple[str, str]
        ``(reasoning, answer)`` — reasoning is ``""`` when thinking is off,
        on validation errors, or when the model emitted no think block.
    """
    # Validation failures surface as user-facing text in the answer slot.
    if audio_path is None:
        return "", "Please upload an audio file first."
    if not question.strip():
        return "", "Please enter a question."
    # Move model to GPU (allocated by ZeroGPU at this point).
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    # AF-Think: append the reasoning request so the model wraps its
    # chain-of-thought in <think>...</think> before the answer.
    # NOTE(review): exact suffix wording reconstructed — confirm against the
    # AF3 model card's recommended thinking prompt.
    prompt = question
    if use_thinking:
        prompt += "\nPlease think and reason about the input audio before you respond."
    # Build conversation in AF3 chat format.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "audio", "path": audio_path},
            ],
        }
    ]
    try:
        # Tokenize text + extract audio features in one call.
        inputs = processor.apply_chat_template(
            conversation,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
        ).to(device)
        # Greedy decoding; no sampling for reproducible answers.
        with torch.inference_mode():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=int(max_new_tokens),
                do_sample=False,
            )
    finally:
        # Always release the GPU back to the ZeroGPU pool, even on error —
        # otherwise a failed request would strand the model on a dead device.
        model.to("cpu")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    # Decode new tokens only (strip the input prefix).
    # (The previous `strip_prefix=True` kwarg is not a batch_decode parameter
    # and was silently ignored at best — removed.)
    new_tokens = output_ids[:, inputs["input_ids"].shape[1]:]
    raw = processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
    # Parse AF-Think format: optional <think>...</think> followed by the answer.
    match = re.search(r"<think>(.*?)</think>", raw, flags=re.DOTALL)
    if match:
        reasoning = match.group(1).strip()
        answer = raw[match.end():].strip()
    else:
        reasoning, answer = "", raw
    return reasoning, answer
# ── UI ─────────────────────────────────────────────────────────────────────────
# NOTE(review): this section arrived whitespace-mangled; the `gr.Blocks`
# header and the first lines of the title Markdown were truncated and have
# been reconstructed — confirm the title text against the original Space.
with gr.Blocks(title="Audio Flamingo 3") as demo:
    gr.Markdown(
        "# 🎧 Audio Flamingo 3\n"
        "NVIDIA's audio language model — NeurIPS 2025 Spotlight. "
        "Upload audio, ask a question, get a reasoned answer."
    )
    with gr.Row(equal_height=False):
        # Left column: inputs
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                label="Audio",
                type="filepath",
                sources=["upload", "microphone"],
            )
            question_input = gr.Textbox(
                label="Question",
                placeholder="What do you hear? What instrument? Transcribe the speech.",
                lines=2,
            )
            with gr.Row():
                think_toggle = gr.Checkbox(
                    label="🧠 AF-Think",
                    value=True,
                    info="Step-by-step reasoning before the answer",
                )
                max_tokens = gr.Slider(
                    label="Max tokens",
                    minimum=50,
                    maximum=600,
                    value=300,
                    step=50,
                )
            submit_btn = gr.Button("Ask AF3 →", variant="primary", size="lg")
        # Right column: outputs
        with gr.Column(scale=1):
            reasoning_output = gr.Textbox(
                label="🧠 Reasoning (AF-Think)",
                lines=7,
                interactive=False,
                placeholder="Chain-of-thought reasoning will appear here...",
                elem_classes=["think-box"],
            )
            answer_output = gr.Textbox(
                label="✅ Answer",
                lines=4,
                interactive=False,
                placeholder="Answer will appear here...",
                elem_classes=["answer-box"],
            )
    # Submit on button click or Enter key
    submit_btn.click(
        fn=run_inference,
        inputs=[audio_input, question_input, think_toggle, max_tokens],
        outputs=[reasoning_output, answer_output],
    )
    question_input.submit(
        fn=run_inference,
        inputs=[audio_input, question_input, think_toggle, max_tokens],
        outputs=[reasoning_output, answer_output],
    )
    gr.Examples(
        # NOTE(review): EXAMPLES is defined in a truncated part of the file.
        examples=EXAMPLES,
        inputs=[audio_input, question_input, think_toggle, max_tokens],
        outputs=[reasoning_output, answer_output],
        fn=run_inference,
        cache_examples=False,  # ZeroGPU does not support example caching
        label="Preset examples",
    )
    gr.Markdown(
        "---\n"
        "**Model:** [nvidia/audio-flamingo-3-hf](https://huggingface.co/nvidia/audio-flamingo-3-hf) • "
        "**Paper:** [NeurIPS 2025 Spotlight](https://research.nvidia.com/labs/adlr/AF3/) • "
        "**License:** NVIDIA OneWay Noncommercial\n\n"
        "*AF3 processes audio in 30-second windows, up to 10 minutes total.*"
    )

demo.launch()