"""
Audio Flamingo 3 — Interactive Demo
HuggingFace Space with ZeroGPU.
GPU is allocated dynamically per request — no idle cost.

Model: nvidia/audio-flamingo-3-hf
License: NVIDIA OneWay Noncommercial License
"""

import gradio as gr
import torch
import spaces
from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor

# ── Model loading ──────────────────────────────────────────────────────────────
# With ZeroGPU, the model lives in CPU RAM between requests.
# GPU is only allocated inside functions decorated with @spaces.GPU.

MODEL_ID = "nvidia/audio-flamingo-3-hf"

print(f"Loading {MODEL_ID}...")
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AudioFlamingo3ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
model.eval()
print("Model ready.")


# ── Inference ──────────────────────────────────────────────────────────────────
@spaces.GPU(duration=120)
def run_inference(audio_path, question, use_thinking, max_new_tokens):
    if audio_path is None:
        return "", "Please upload an audio file first."
    if not question.strip():
        return "", "Please enter a question."

    # Move model to GPU (allocated by ZeroGPU at this point)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Build conversation in AF3 chat format
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "audio", "path": audio_path},
            ],
        }
    ]

    # Tokenize
    inputs = processor.apply_chat_template(
        conversation,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
    ).to(device)

    # Generate
    with torch.inference_mode():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            do_sample=False,
        )

    # Decode new tokens only (strip the input prefix)
    new_tokens = output_ids[:, inputs["input_ids"].shape[1]:]
    raw = processor.batch_decode(
        new_tokens, skip_special_tokens=True, strip_prefix=True
    )[0].strip()

    # Release GPU back to ZeroGPU pool
    model.to("cpu")
    torch.cuda.empty_cache()

    # Parse AF-Think format: <think>...</think> Answer: ...
    reasoning, answer = "", raw
    if "<think>" in raw and "</think>" in raw:
        s = raw.index("<think>") + len("<think>")
        e = raw.index("</think>")
        reasoning = raw[s:e].strip()
        after = raw[e + len("</think>"):].strip()
        answer = after.split("Answer:", 1)[1].strip() if "Answer:" in after else after or raw

    return reasoning, answer


# ── Example audio ──────────────────────────────────────────────────────────────
AUDIO_BASE = "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets"
EXAMPLES = [
    [f"{AUDIO_BASE}/WhDJDIviAOg_120_10.mp3",
     "What is happening in this audio? Describe in detail.", True, 300],
    [f"{AUDIO_BASE}/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav",
     "Transcribe the speech in this recording.", False, 200],
    [f"{AUDIO_BASE}/WhDJDIviAOg_120_10.mp3",
     "Is this music or environmental sound? Reason step by step.", True, 400],
]


# ── UI ─────────────────────────────────────────────────────────────────────────
CSS = """
.gradio-container { max-width: 900px; margin: auto; }
.think-box { border-left: 3px solid #FFE66D; padding: 10px 14px;
             background: #111; border-radius: 6px; color: #ccc; font-size: 0.9rem; }
.answer-box { border-left: 3px solid #4ECDC4; padding: 10px 14px; border-radius: 6px; }
.subtitle { color: #888; font-size: 0.92rem; margin-top: -6px; margin-bottom: 20px; }
footer { display: none !important; }
"""

with gr.Blocks(css=CSS, title="Audio Flamingo 3") as demo:

    gr.Markdown("# 🦩 Audio Flamingo 3")
    gr.Markdown(
        "<p class='subtitle'>NVIDIA's audio language model — NeurIPS 2025 Spotlight. "
        "Upload audio, ask a question, get a reasoned answer.</p>"
    )

    with gr.Row(equal_height=False):

        # Left column: inputs
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                label="Audio",
                type="filepath",
                sources=["upload", "microphone"],
            )
            question_input = gr.Textbox(
                label="Question",
                placeholder="What do you hear? What instrument? Transcribe the speech.",
                lines=2,
            )
            with gr.Row():
                think_toggle = gr.Checkbox(
                    label="🧠 AF-Think",
                    value=True,
                    info="Step-by-step reasoning before the answer",
                )
                max_tokens = gr.Slider(
                    label="Max tokens", minimum=50, maximum=600, value=300, step=50,
                )
            submit_btn = gr.Button("Ask AF3 →", variant="primary", size="lg")

        # Right column: outputs
        with gr.Column(scale=1):
            reasoning_output = gr.Textbox(
                label="🧠 Reasoning (AF-Think)",
                lines=7, interactive=False,
                placeholder="Chain-of-thought reasoning will appear here...",
                elem_classes=["think-box"],
            )
            answer_output = gr.Textbox(
                label="✅ Answer",
                lines=4, interactive=False,
                placeholder="Answer will appear here...",
                elem_classes=["answer-box"],
            )

    # Submit on button click or Enter key
    submit_btn.click(
        fn=run_inference,
        inputs=[audio_input, question_input, think_toggle, max_tokens],
        outputs=[reasoning_output, answer_output],
    )
    question_input.submit(
        fn=run_inference,
        inputs=[audio_input, question_input, think_toggle, max_tokens],
        outputs=[reasoning_output, answer_output],
    )

    gr.Examples(
        examples=EXAMPLES,
        inputs=[audio_input, question_input, think_toggle, max_tokens],
        outputs=[reasoning_output, answer_output],
        fn=run_inference,
        cache_examples=False,  # ZeroGPU does not support example caching
        label="Preset examples",
    )

    gr.Markdown(
        "---\n"
        "**Model:** [nvidia/audio-flamingo-3-hf](https://huggingface.co/nvidia/audio-flamingo-3-hf) • "
        "**Paper:** [NeurIPS 2025 Spotlight](https://research.nvidia.com/labs/adlr/AF3/) • "
        "**License:** NVIDIA OneWay Noncommercial\n\n"
        "*AF3 processes audio in 30-second windows, up to 10 minutes total.*"
    )

demo.launch()