#include "models.h"

// Nanochat: ReLU^2 FFN, QK-norm after RoPE, logit softcap 15,
// per-layer residual scalars (x = rl*x + xl*x0), value embeddings
// on alternating layers, backout (subtract mid-layer residual).
// All norms are unweighted RMSNorm (pass NULL weight).
// Scalar params read as float from model struct (not ggml tensors,
// because ggml_mul with {1} tensors causes precision issues on CPU).
llm_build_nanochat::llm_build_nanochat(const llama_model & model, const llm_graph_params & params) :
        llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v();

    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    ggml_tensor * cur;
    ggml_tensor * inpL;

    inpL = build_inp_embd(model.tok_embd);

    // token ids, reused below to fetch the per-layer value embeddings
    ggml_tensor * inp_tokens = res->t_inp_tokens;

    // embedding norm (unweighted)
    inpL = build_norm(inpL, NULL, NULL, LLM_NORM_RMS, -1);
    cb(inpL, "inp_norm", -1);

    // x0 for residual scaling (explicit copy for lifetime tracking)
    ggml_tensor * x0 = ggml_cont(ctx0, inpL);
    ggml_set_name(x0, "x0");
    ggml_build_forward_expand(gf, x0);

    ggml_tensor * inp_pos = build_inp_pos();

    auto * inp_attn = build_attn_inp_kv();

    const float kq_scale = 1.0f / sqrtf(float(n_embd_head));

    // layer whose post-attention residual is subtracted again at the end
    const int backout_layer = n_layer / 2;

    ggml_tensor * x_backout = nullptr;

    for (int il = 0; il < n_layer; ++il) {
        auto & layer = model.layers[il];

        // per-layer residual scaling: x = resid_lambda * x + x0_lambda * x0
        {
            float rl = model.nanochat_resid_lambda[il];
            float xl = model.nanochat_x0_lambda[il];

            inpL = ggml_add(ctx0, ggml_scale(ctx0, inpL, rl), ggml_scale(ctx0, x0, xl));
        }

        ggml_tensor * inpSA = inpL;

        // pre-attention norm (unweighted)
        cur = build_norm(inpL, NULL, NULL, LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        // Q, K, V
        ggml_tensor * Qcur = build_lora_mm(layer.wq, cur);
        ggml_tensor * Kcur = build_lora_mm(layer.wk, cur);
        ggml_tensor * Vcur = build_lora_mm(layer.wv, cur);

        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);

        // value embeddings on alternating layers: V += sigmoid(3*gate) * ve
        if (layer.value_embd && layer.wqkv_gate) {
            ggml_tensor * ve = ggml_get_rows(ctx0, layer.value_embd, inp_tokens);
            ve = ggml_reshape_3d(ctx0, ve, n_embd_head, n_head_kv, n_tokens);

            // the gate reads only the first wqkv_gate->ne[0] features of the
            // normed input — the matmul below requires gate_in->ne[0] to match
            // the weight's input dim, so derive it from the tensor instead of
            // hard-coding 12 (which only matched the d24 checkpoint)
            ggml_tensor * gate_in = ggml_view_2d(ctx0, cur, layer.wqkv_gate->ne[0], n_tokens, cur->nb[1], 0);

            ggml_tensor * gate = build_lora_mm(layer.wqkv_gate, gate_in);
            gate = ggml_sigmoid(ctx0, ggml_scale(ctx0, gate, 3.0f));
            gate = ggml_reshape_3d(ctx0, gate, 1, n_head_kv, n_tokens);

            Vcur = ggml_add(ctx0, Vcur, ggml_mul(ctx0, ve, gate));
        }

        // RoPE (before QK-norm, nanochat order)
        ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

        Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors,
                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow);

        Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors,
                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow);

        // QK-norm (after RoPE) + 1.15 sharpening
        Qcur = ggml_scale(ctx0, build_norm(Qcur, NULL, NULL, LLM_NORM_RMS, il), 1.15f);
        Kcur = ggml_scale(ctx0, build_norm(Kcur, NULL, NULL, LLM_NORM_RMS, il), 1.15f);

        cb(Qcur, "Qcur", il);
        cb(Kcur, "Kcur", il);
        cb(Vcur, "Vcur", il);

        // attention + output proj
        cur = build_attn(inp_attn,
                layer.wo, NULL,
                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);

        // attention residual
        cur = ggml_add(ctx0, cur, inpSA);

        if (il == backout_layer) {
            x_backout = cur;
        }

        ggml_tensor * ffn_inp = cur;

        // pre-FFN norm (unweighted)
        cur = build_norm(cur, NULL, NULL, LLM_NORM_RMS, il);
        cb(cur, "ffn_norm", il);

        // FFN: ReLU^2 (no gate branch, sequential up -> act -> down)
        cur = build_ffn(cur,
                layer.ffn_up,   NULL, NULL,
                NULL,           NULL, NULL,
                layer.ffn_down, NULL, NULL,
                NULL,
                LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);

        // FFN residual
        cur = ggml_add(ctx0, cur, ffn_inp);
        cb(cur, "l_out", il);

        inpL = cur;
    }

    cur = inpL;

    // backout: subtract mid-layer residual
    if (x_backout && model.nanochat_backout != 0.0f) {
        cur = ggml_sub(ctx0, cur, ggml_scale(ctx0, x_backout, model.nanochat_backout));
    }

    // final norm (unweighted)
    cur = build_norm(cur, NULL, NULL, LLM_NORM_RMS, -1);
    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // lm_head
    cur = build_lora_mm(model.output, cur);

    // logit softcap: logits = cap * tanh(logits / cap)
    if (hparams.f_final_logit_softcapping) {
        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
        cur = ggml_tanh(ctx0, cur);
        cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
    }

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}