# AudioFlamingo3 / app.py
# Last commit: "fix language" (0889cf4, verified) by sonicase
"""
Audio Flamingo 3 β€” Interactive Demo
HuggingFace Space with ZeroGPU.
GPU is allocated dynamically per request β€” no idle cost.
Model: nvidia/audio-flamingo-3-hf
License: NVIDIA OneWay Noncommercial License
"""
import gradio as gr
import torch
import spaces
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
# ── Model loading ──────────────────────────────────────────────────────────────
# With ZeroGPU, the model lives in CPU RAM between requests.
# GPU is only allocated inside functions decorated with @spaces.GPU.
MODEL_ID = "nvidia/audio-flamingo-3-hf"

print(f"Loading {MODEL_ID}...")
# Processor bundles the tokenizer and audio feature extraction for AF3;
# it is used below via apply_chat_template / batch_decode.
processor = AutoProcessor.from_pretrained(MODEL_ID)
# fp16 weights halve the CPU-RAM footprint between requests;
# low_cpu_mem_usage avoids materializing a throwaway full-precision copy
# while the checkpoint streams in.
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
model.eval()  # inference only — disables dropout and other train-mode behavior
print("Model ready.")
# ── Inference ──────────────────────────────────────────────────────────────────
@spaces.GPU(duration=120)
def run_inference(audio_path, question, use_thinking, max_new_tokens):
    """Answer a question about an audio file with Audio Flamingo 3.

    Runs inside a ZeroGPU allocation (up to 120 s); the model is moved to
    the GPU for the call and parked back on the CPU afterwards.

    Args:
        audio_path: Filesystem path to the uploaded/recorded audio, or None.
        question: Natural-language question about the audio.
        use_thinking: If True, request AF-Think step-by-step reasoning
            (emitted as a ``<think>...</think>`` block before the answer).
        max_new_tokens: Generation budget; cast to int since the slider
            delivers a float.

    Returns:
        ``(reasoning, answer)`` strings. ``reasoning`` is "" when the model
        produced no ``<think>`` block (or on input-validation errors).
    """
    if audio_path is None:
        return "", "Please upload an audio file first."
    if not question.strip():
        return "", "Please enter a question."

    # Fix: `use_thinking` was accepted but never used, so the UI's AF-Think
    # toggle had no effect. Append the AF-Think instruction so the model
    # actually produces <think>...</think> reasoning when requested.
    # NOTE(review): trigger phrase taken from the AF3 model-card examples —
    # confirm exact wording against nvidia/audio-flamingo-3-hf docs.
    prompt = question.strip()
    if use_thinking:
        prompt += "\nPlease think and reason about the input audio before you respond."

    # GPU has been allocated by ZeroGPU at this point.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Build the conversation in the AF3 chat format (text + audio parts).
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "audio", "path": audio_path},
            ],
        }
    ]

    try:
        # Tokenize text and extract audio features in one pass.
        inputs = processor.apply_chat_template(
            conversation,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
        ).to(device)

        # Greedy decode within the requested token budget.
        with torch.inference_mode():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=int(max_new_tokens),
                do_sample=False,
            )

        # Decode only the newly generated tokens (strip the prompt prefix).
        # NOTE(review): `strip_prefix` is not a standard batch_decode kwarg;
        # tokenizers silently ignore unknown kwargs, so it is kept for
        # compatibility — verify whether the AF3 processor defines it.
        new_tokens = output_ids[:, inputs["input_ids"].shape[1]:]
        raw = processor.batch_decode(
            new_tokens, skip_special_tokens=True, strip_prefix=True
        )[0].strip()
    finally:
        # Fix: previously the model stayed on the GPU if generation raised;
        # always release the allocation back to the ZeroGPU pool.
        model.to("cpu")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Parse the AF-Think format: "<think>...</think> Answer: ...".
    reasoning, answer = "", raw
    if "<think>" in raw and "</think>" in raw:
        start = raw.index("<think>") + len("<think>")
        end = raw.index("</think>")
        reasoning = raw[start:end].strip()
        after = raw[end + len("</think>"):].strip()
        if "Answer:" in after:
            answer = after.split("Answer:", 1)[1].strip()
        else:
            answer = after or raw
    return reasoning, answer
# ── Example audio ──────────────────────────────────────────────────────────────
AUDIO_BASE = "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets"

# Each preset row: (asset filename, question, AF-Think enabled, max new tokens).
_PRESETS = [
    ("WhDJDIviAOg_120_10.mp3",
     "What is happening in this audio? Describe in detail.", True, 300),
    ("t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav",
     "Transcribe the speech in this recording.", False, 200),
    ("WhDJDIviAOg_120_10.mp3",
     "Is this music or environmental sound? Reason step by step.", True, 400),
]

# Expand into the [audio_url, question, use_thinking, max_tokens] rows
# that gr.Examples feeds straight into run_inference.
EXAMPLES = [
    [f"{AUDIO_BASE}/{filename}", prompt, thinking, budget]
    for filename, prompt, thinking, budget in _PRESETS
]
# ── UI ─────────────────────────────────────────────────────────────────────────
# Custom styling for the Blocks UI. The class names (.think-box, .answer-box,
# .subtitle) are attached via elem_classes / inline HTML below; the `footer`
# rule hides Gradio's default footer.
CSS = """
.gradio-container { max-width: 900px; margin: auto; }
.think-box { border-left: 3px solid #FFE66D; padding: 10px 14px;
background: #111; border-radius: 6px; color: #ccc; font-size: 0.9rem; }
.answer-box { border-left: 3px solid #4ECDC4; padding: 10px 14px; border-radius: 6px; }
.subtitle { color: #888; font-size: 0.92rem; margin-top: -6px; margin-bottom: 20px; }
footer { display: none !important; }
"""
# Build the UI: inputs (audio, question, generation controls) on the left,
# model outputs (reasoning + answer) on the right.
# Fix: several user-visible strings contained mojibake from a bad encoding
# round-trip ("β€”", "β€’", "β†’", "βœ…"); restored the intended
# "—", "•", "→", "✅" characters.
with gr.Blocks(css=CSS, title="Audio Flamingo 3") as demo:
    gr.Markdown("# 🦩 Audio Flamingo 3")
    gr.Markdown(
        "<p class='subtitle'>NVIDIA's audio language model — NeurIPS 2025 Spotlight. "
        "Upload audio, ask a question, get a reasoned answer.</p>"
    )
    with gr.Row(equal_height=False):
        # Left column: inputs
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                label="Audio",
                type="filepath",  # run_inference expects a path on disk
                sources=["upload", "microphone"],
            )
            question_input = gr.Textbox(
                label="Question",
                placeholder="What do you hear? What instrument? Transcribe the speech.",
                lines=2,
            )
            with gr.Row():
                think_toggle = gr.Checkbox(
                    label="🧠 AF-Think",
                    value=True,
                    info="Step-by-step reasoning before the answer",
                )
                max_tokens = gr.Slider(
                    label="Max tokens", minimum=50, maximum=600, value=300, step=50,
                )
            submit_btn = gr.Button("Ask AF3 →", variant="primary", size="lg")
        # Right column: outputs
        with gr.Column(scale=1):
            reasoning_output = gr.Textbox(
                label="🧠 Reasoning (AF-Think)",
                lines=7, interactive=False,
                placeholder="Chain-of-thought reasoning will appear here...",
                elem_classes=["think-box"],
            )
            answer_output = gr.Textbox(
                label="✅ Answer",
                lines=4, interactive=False,
                placeholder="Answer will appear here...",
                elem_classes=["answer-box"],
            )
    # Submit on button click or Enter key in the question box.
    submit_btn.click(
        fn=run_inference,
        inputs=[audio_input, question_input, think_toggle, max_tokens],
        outputs=[reasoning_output, answer_output],
    )
    question_input.submit(
        fn=run_inference,
        inputs=[audio_input, question_input, think_toggle, max_tokens],
        outputs=[reasoning_output, answer_output],
    )
    gr.Examples(
        examples=EXAMPLES,
        inputs=[audio_input, question_input, think_toggle, max_tokens],
        outputs=[reasoning_output, answer_output],
        fn=run_inference,
        cache_examples=False,  # ZeroGPU does not support example caching
        label="Preset examples",
    )
    gr.Markdown(
        "---\n"
        "**Model:** [nvidia/audio-flamingo-3-hf](https://huggingface.co/nvidia/audio-flamingo-3-hf) • "
        "**Paper:** [NeurIPS 2025 Spotlight](https://research.nvidia.com/labs/adlr/AF3/) • "
        "**License:** NVIDIA OneWay Noncommercial\n\n"
        "*AF3 processes audio in 30-second windows, up to 10 minutes total.*"
    )

demo.launch()