# HuggingFace Space app.py
# (Page-scrape artifacts "Spaces: / Runtime error" removed — they were status
#  banners from the Space's web page, not part of the source.)
"""
Audio Flamingo 3 — Interactive Demo
HuggingFace Space with ZeroGPU.
GPU is allocated dynamically per request — no idle cost.
Model: nvidia/audio-flamingo-3-hf
License: NVIDIA OneWay Noncommercial License
"""
import gradio as gr
import torch
import spaces
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

# ── Model loading ──────────────────────────────────────────────────────────────
# With ZeroGPU, the model lives in CPU RAM between requests.
# GPU is only allocated inside functions decorated with @spaces.GPU.
MODEL_ID = "nvidia/audio-flamingo-3-hf"

print(f"Loading {MODEL_ID}...")
processor = AutoProcessor.from_pretrained(MODEL_ID)
# float16 halves host RAM vs. float32; weights stay on CPU until a request
# grabs a GPU inside the @spaces.GPU-decorated handler.
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
model.eval()
print("Model ready.")
# ── Inference ──────────────────────────────────────────────────────────────────
@spaces.GPU  # ZeroGPU: a GPU is attached only for the duration of this call.
def run_inference(audio_path, question, use_thinking, max_new_tokens):
    """Ask AF3 a question about an audio file.

    Args:
        audio_path: Filesystem path (or URL) to the audio clip, from gr.Audio.
        question: Free-form user question about the audio.
        use_thinking: If True, request AF-Think chain-of-thought reasoning.
        max_new_tokens: Generation budget (slider value, may arrive as float).

    Returns:
        (reasoning, answer) pair of strings; reasoning is "" when the model
        emitted no <think> block.
    """
    # Guard clauses for missing inputs — return a user-facing hint, not an error.
    if audio_path is None:
        return "", "Please upload an audio file first."
    if not question.strip():
        return "", "Please enter a question."

    # AF-Think is requested through an explicit instruction in the prompt.
    # NOTE(review): phrasing follows the AF3 demo prompt — confirm against the
    # nvidia/audio-flamingo-3-hf model card.
    prompt = question
    if use_thinking:
        prompt += "\nPlease think and reason about the input audio before you respond."

    # Move model to GPU (allocated by ZeroGPU while this function runs).
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Build conversation in the AF3 chat format (text + audio in one user turn).
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "audio", "path": audio_path},
            ],
        }
    ]

    # Tokenize text and extract audio features in one pass.
    inputs = processor.apply_chat_template(
        conversation,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
    ).to(device)

    try:
        # Greedy decoding keeps answers deterministic across requests.
        with torch.inference_mode():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=int(max_new_tokens),
                do_sample=False,
            )
        # Decode only the newly generated tokens (strip the prompt prefix).
        new_tokens = output_ids[:, inputs["input_ids"].shape[1]:]
        raw = processor.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
    finally:
        # Always hand the GPU back to the ZeroGPU pool, even if generation fails.
        model.to("cpu")
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    # Parse the AF-Think format: <think>...</think> Answer: ...
    reasoning, answer = "", raw
    if "<think>" in raw and "</think>" in raw:
        s = raw.index("<think>") + len("<think>")
        e = raw.index("</think>")
        reasoning = raw[s:e].strip()
        after = raw[e + len("</think>"):].strip()
        answer = after.split("Answer:", 1)[1].strip() if "Answer:" in after else after or raw
    return reasoning, answer
# ── Example audio ──────────────────────────────────────────────────────────────
AUDIO_BASE = "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets"

# Each row matches the inputs of run_inference:
# [audio_path, question, use_thinking, max_new_tokens]
EXAMPLES = [
    [f"{AUDIO_BASE}/WhDJDIviAOg_120_10.mp3",
     "What is happening in this audio? Describe in detail.", True, 300],
    [f"{AUDIO_BASE}/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav",
     "Transcribe the speech in this recording.", False, 200],
    [f"{AUDIO_BASE}/WhDJDIviAOg_120_10.mp3",
     "Is this music or environmental sound? Reason step by step.", True, 400],
]
# ── UI ─────────────────────────────────────────────────────────────────────────
# Custom styling: centered layout, accent-bordered output boxes, hidden footer.
CSS = """
.gradio-container { max-width: 900px; margin: auto; }
.think-box { border-left: 3px solid #FFE66D; padding: 10px 14px;
  background: #111; border-radius: 6px; color: #ccc; font-size: 0.9rem; }
.answer-box { border-left: 3px solid #4ECDC4; padding: 10px 14px; border-radius: 6px; }
.subtitle { color: #888; font-size: 0.92rem; margin-top: -6px; margin-bottom: 20px; }
footer { display: none !important; }
"""
# Layout: inputs on the left (audio, question, options), outputs on the right
# (reasoning + answer). Mojibake'd emoji from the page scrape restored below —
# NOTE(review): exact glyphs (🦩/🧠/✅/→) reconstructed from UTF-8 damage; confirm.
with gr.Blocks(css=CSS, title="Audio Flamingo 3") as demo:
    gr.Markdown("# 🦩 Audio Flamingo 3")
    gr.Markdown(
        "<p class='subtitle'>NVIDIA's audio language model — NeurIPS 2025 Spotlight. "
        "Upload audio, ask a question, get a reasoned answer.</p>"
    )

    with gr.Row(equal_height=False):
        # Left column: inputs
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                label="Audio",
                type="filepath",  # run_inference expects a path, not a waveform
                sources=["upload", "microphone"],
            )
            question_input = gr.Textbox(
                label="Question",
                placeholder="What do you hear? What instrument? Transcribe the speech.",
                lines=2,
            )
            with gr.Row():
                think_toggle = gr.Checkbox(
                    label="🧠 AF-Think",
                    value=True,
                    info="Step-by-step reasoning before the answer",
                )
                max_tokens = gr.Slider(
                    label="Max tokens", minimum=50, maximum=600, value=300, step=50,
                )
            submit_btn = gr.Button("Ask AF3 →", variant="primary", size="lg")

        # Right column: outputs
        with gr.Column(scale=1):
            reasoning_output = gr.Textbox(
                label="🧠 Reasoning (AF-Think)",
                lines=7, interactive=False,
                placeholder="Chain-of-thought reasoning will appear here...",
                elem_classes=["think-box"],
            )
            answer_output = gr.Textbox(
                label="✅ Answer",
                lines=4, interactive=False,
                placeholder="Answer will appear here...",
                elem_classes=["answer-box"],
            )

    # Submit on button click or Enter key — both wire the same handler.
    submit_btn.click(
        fn=run_inference,
        inputs=[audio_input, question_input, think_toggle, max_tokens],
        outputs=[reasoning_output, answer_output],
    )
    question_input.submit(
        fn=run_inference,
        inputs=[audio_input, question_input, think_toggle, max_tokens],
        outputs=[reasoning_output, answer_output],
    )

    gr.Examples(
        examples=EXAMPLES,
        inputs=[audio_input, question_input, think_toggle, max_tokens],
        outputs=[reasoning_output, answer_output],
        fn=run_inference,
        cache_examples=False,  # ZeroGPU does not support example caching
        label="Preset examples",
    )

    gr.Markdown(
        "---\n"
        "**Model:** [nvidia/audio-flamingo-3-hf](https://huggingface.co/nvidia/audio-flamingo-3-hf) • "
        "**Paper:** [NeurIPS 2025 Spotlight](https://research.nvidia.com/labs/adlr/AF3/) • "
        "**License:** NVIDIA OneWay Noncommercial\n\n"
        "*AF3 processes audio in 30-second windows, up to 10 minutes total.*"
    )

# Spaces imports this module and serves `demo`; the guard keeps imports side-effect-light.
if __name__ == "__main__":
    demo.launch()