# AudioFlamingo3 / app.py
# Last commit: "fix language" (0889cf4, verified) by sonicase
"""
Audio Flamingo 3 β€” Interactive Demo
HuggingFace Space with ZeroGPU.
GPU is allocated dynamically per request β€” no idle cost.
Model: nvidia/audio-flamingo-3-hf
License: NVIDIA OneWay Noncommercial License
"""
import gradio as gr
import torch
import spaces
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
# ── Model loading ──────────────────────────────────────────────────────────────
# With ZeroGPU, the model lives in CPU RAM between requests.
# GPU is only allocated inside functions decorated with @spaces.GPU.
MODEL_ID = "nvidia/audio-flamingo-3-hf"

print(f"Loading {MODEL_ID}...")
# Processor bundles the tokenizer and audio feature extraction for AF3;
# it is used below via apply_chat_template / batch_decode.
processor = AutoProcessor.from_pretrained(MODEL_ID)
# fp16 weights halve the CPU-RAM footprint between requests;
# low_cpu_mem_usage avoids materializing a throwaway full-precision copy
# while the checkpoint streams in.
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
model.eval()  # inference only — disables dropout and other train-mode behavior
print("Model ready.")
# ── Inference ──────────────────────────────────────────────────────────────────
@spaces.GPU(duration=120)
def run_inference(audio_path, question, use_thinking, max_new_tokens):
    """Answer a question about an audio file with Audio Flamingo 3.

    Runs inside a ZeroGPU allocation (up to 120 s); the model is moved to
    the GPU for the call and parked back on the CPU afterwards.

    Args:
        audio_path: Filesystem path to the uploaded/recorded audio, or None.
        question: Natural-language question about the audio.
        use_thinking: If True, request AF-Think step-by-step reasoning
            (emitted as a ``<think>...</think>`` block before the answer).
        max_new_tokens: Generation budget; cast to int since the slider
            delivers a float.

    Returns:
        ``(reasoning, answer)`` strings. ``reasoning`` is "" when the model
        produced no ``<think>`` block (or on input-validation errors).
    """
    if audio_path is None:
        return "", "Please upload an audio file first."
    if not question.strip():
        return "", "Please enter a question."

    # Fix: `use_thinking` was accepted but never used, so the UI's AF-Think
    # toggle had no effect. Append the AF-Think instruction so the model
    # actually produces <think>...</think> reasoning when requested.
    # NOTE(review): trigger phrase taken from the AF3 model-card examples —
    # confirm exact wording against nvidia/audio-flamingo-3-hf docs.
    prompt = question.strip()
    if use_thinking:
        prompt += "\nPlease think and reason about the input audio before you respond."

    # GPU has been allocated by ZeroGPU at this point.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Build the conversation in the AF3 chat format (text + audio parts).
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "audio", "path": audio_path},
            ],
        }
    ]

    try:
        # Tokenize text and extract audio features in one pass.
        inputs = processor.apply_chat_template(
            conversation,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
        ).to(device)

        # Greedy decode within the requested token budget.
        with torch.inference_mode():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=int(max_new_tokens),
                do_sample=False,
            )

        # Decode only the newly generated tokens (strip the prompt prefix).
        # NOTE(review): `strip_prefix` is not a standard batch_decode kwarg;
        # tokenizers silently ignore unknown kwargs, so it is kept for
        # compatibility — verify whether the AF3 processor defines it.
        new_tokens = output_ids[:, inputs["input_ids"].shape[1]:]
        raw = processor.batch_decode(
            new_tokens, skip_special_tokens=True, strip_prefix=True
        )[0].strip()
    finally:
        # Fix: previously the model stayed on the GPU if generation raised;
        # always release the allocation back to the ZeroGPU pool.
        model.to("cpu")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Parse the AF-Think format: "<think>...</think> Answer: ...".
    reasoning, answer = "", raw
    if "<think>" in raw and "</think>" in raw:
        start = raw.index("<think>") + len("<think>")
        end = raw.index("</think>")
        reasoning = raw[start:end].strip()
        after = raw[end + len("</think>"):].strip()
        if "Answer:" in after:
            answer = after.split("Answer:", 1)[1].strip()
        else:
            answer = after or raw
    return reasoning, answer
# ── Example audio ──────────────────────────────────────────────────────────────
AUDIO_BASE = "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets"

# Each preset row: (asset filename, question, AF-Think enabled, max new tokens).
_PRESETS = [
    ("WhDJDIviAOg_120_10.mp3",
     "What is happening in this audio? Describe in detail.", True, 300),
    ("t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav",
     "Transcribe the speech in this recording.", False, 200),
    ("WhDJDIviAOg_120_10.mp3",
     "Is this music or environmental sound? Reason step by step.", True, 400),
]

# Expand into the [audio_url, question, use_thinking, max_tokens] rows
# that gr.Examples feeds straight into run_inference.
EXAMPLES = [
    [f"{AUDIO_BASE}/{filename}", prompt, thinking, budget]
    for filename, prompt, thinking, budget in _PRESETS
]
# ── UI ─────────────────────────────────────────────────────────────────────────
# Custom styling for the Blocks UI. The class names (.think-box, .answer-box,
# .subtitle) are attached via elem_classes / inline HTML below; the `footer`
# rule hides Gradio's default footer.
CSS = """
.gradio-container { max-width: 900px; margin: auto; }
.think-box { border-left: 3px solid #FFE66D; padding: 10px 14px;
background: #111; border-radius: 6px; color: #ccc; font-size: 0.9rem; }
.answer-box { border-left: 3px solid #4ECDC4; padding: 10px 14px; border-radius: 6px; }
.subtitle { color: #888; font-size: 0.92rem; margin-top: -6px; margin-bottom: 20px; }
footer { display: none !important; }
"""
# Build the UI: inputs (audio, question, generation controls) on the left,
# model outputs (reasoning + answer) on the right.
# Fix: several user-visible strings contained mojibake from a bad encoding
# round-trip ("β€”", "β€’", "β†’", "βœ…"); restored the intended
# "—", "•", "→", "✅" characters.
with gr.Blocks(css=CSS, title="Audio Flamingo 3") as demo:
    gr.Markdown("# 🦩 Audio Flamingo 3")
    gr.Markdown(
        "<p class='subtitle'>NVIDIA's audio language model — NeurIPS 2025 Spotlight. "
        "Upload audio, ask a question, get a reasoned answer.</p>"
    )
    with gr.Row(equal_height=False):
        # Left column: inputs
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                label="Audio",
                type="filepath",  # run_inference expects a path on disk
                sources=["upload", "microphone"],
            )
            question_input = gr.Textbox(
                label="Question",
                placeholder="What do you hear? What instrument? Transcribe the speech.",
                lines=2,
            )
            with gr.Row():
                think_toggle = gr.Checkbox(
                    label="🧠 AF-Think",
                    value=True,
                    info="Step-by-step reasoning before the answer",
                )
                max_tokens = gr.Slider(
                    label="Max tokens", minimum=50, maximum=600, value=300, step=50,
                )
            submit_btn = gr.Button("Ask AF3 →", variant="primary", size="lg")
        # Right column: outputs
        with gr.Column(scale=1):
            reasoning_output = gr.Textbox(
                label="🧠 Reasoning (AF-Think)",
                lines=7, interactive=False,
                placeholder="Chain-of-thought reasoning will appear here...",
                elem_classes=["think-box"],
            )
            answer_output = gr.Textbox(
                label="✅ Answer",
                lines=4, interactive=False,
                placeholder="Answer will appear here...",
                elem_classes=["answer-box"],
            )
    # Submit on button click or Enter key in the question box.
    submit_btn.click(
        fn=run_inference,
        inputs=[audio_input, question_input, think_toggle, max_tokens],
        outputs=[reasoning_output, answer_output],
    )
    question_input.submit(
        fn=run_inference,
        inputs=[audio_input, question_input, think_toggle, max_tokens],
        outputs=[reasoning_output, answer_output],
    )
    gr.Examples(
        examples=EXAMPLES,
        inputs=[audio_input, question_input, think_toggle, max_tokens],
        outputs=[reasoning_output, answer_output],
        fn=run_inference,
        cache_examples=False,  # ZeroGPU does not support example caching
        label="Preset examples",
    )
    gr.Markdown(
        "---\n"
        "**Model:** [nvidia/audio-flamingo-3-hf](https://huggingface.co/nvidia/audio-flamingo-3-hf) • "
        "**Paper:** [NeurIPS 2025 Spotlight](https://research.nvidia.com/labs/adlr/AF3/) • "
        "**License:** NVIDIA OneWay Noncommercial\n\n"
        "*AF3 processes audio in 30-second windows, up to 10 minutes total.*"
    )

demo.launch()