// Nanochat d24: ReLU^2, QK-norm after RoPE, logit softcap 15,
// per-layer residual scalars (x = rl*x + xl*x0), value embeddings
// on alternating layers, backout (subtract mid-layer residual).
// All norms are unweighted RMSNorm (pass NULL weight).
// Scalar params read as float from model struct (not ggml tensors,
// because ggml_mul with {1} tensors causes precision issues on CPU).
llm_build_nanochat::llm_build_nanochat(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    const int64_t n_embd_head = hparams.n_embd_head_v();
    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    ggml_tensor * cur;
    ggml_tensor * inpL;

    inpL = build_inp_embd(model.tok_embd);
    ggml_tensor * inp_tokens = res->t_inp_tokens;

    // Embedding norm (unweighted)
    inpL = build_norm(inpL, NULL, NULL, LLM_NORM_RMS, -1);
    cb(inpL, "inp_norm", -1);

    // x0 for residual scaling (explicit copy for lifetime tracking)
    ggml_tensor * x0 = ggml_cont(ctx0, inpL);
    ggml_set_name(x0, "x0");
    ggml_build_forward_expand(gf, x0);

    ggml_tensor * inp_pos = build_inp_pos();
    auto * inp_attn = build_attn_inp_kv();

    const float kq_scale = 1.0f / sqrtf(float(n_embd_head));

    const int backout_layer = n_layer / 2;
    ggml_tensor * x_backout = nullptr;
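    // x_backout captures the residual stream right after the mid layer's
    // attention residual; a scaled copy of it is subtracted again after
    // the final layer (see the backout step below the loop).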
    for (int il = 0; il < n_layer; ++il) {
        const auto & layer = model.layers[il];

        // Per-layer residual scaling: x = resid_lambda * x + x0_lambda * x0
        {
            const float rl = model.nanochat_resid_lambda[il];
            const float xl = model.nanochat_x0_lambda[il];
            inpL = ggml_add(ctx0, ggml_scale(ctx0, inpL, rl), ggml_scale(ctx0, x0, xl));
        }
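        // note: rl = 1, xl = 0 makes this a no-op, recovering the plain residual stream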
        ggml_tensor * inpSA = inpL;

        // Pre-attention norm (unweighted)
        cur = build_norm(inpL, NULL, NULL, LLM_NORM_RMS, il);
        cb(cur, "attn_norm", il);

        // Q, K, V
        ggml_tensor * Qcur = build_lora_mm(layer.wq, cur);
        ggml_tensor * Kcur = build_lora_mm(layer.wk, cur);
        ggml_tensor * Vcur = build_lora_mm(layer.wv, cur);

        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
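        // shapes: Q is [n_embd_head, n_head, n_tokens]; K/V use n_head_kv
        // (grouped-query attention whenever n_head_kv < n_head)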
        // Value embeddings on alternating layers
        if (layer.value_embd && layer.wqkv_gate) {
            ggml_tensor * ve = ggml_get_rows(ctx0, layer.value_embd, inp_tokens);
            ve = ggml_reshape_3d(ctx0, ve, n_embd_head, n_head_kv, n_tokens);
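            // gate = sigmoid(3 * wqkv_gate · x[0:12]): one scalar in (0,1) per
            // KV head, computed from the first 12 channels of the normed hidden
            // state; its leading dim of 1 broadcasts over n_embd_head in the mul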
            ggml_tensor * gate_in = ggml_view_2d(ctx0, cur, 12, n_tokens, cur->nb[1], 0);
            ggml_tensor * gate = build_lora_mm(layer.wqkv_gate, gate_in);
            gate = ggml_sigmoid(ctx0, ggml_scale(ctx0, gate, 3.0f));
            gate = ggml_reshape_3d(ctx0, gate, 1, n_head_kv, n_tokens);

            Vcur = ggml_add(ctx0, Vcur, ggml_mul(ctx0, ve, gate));
        }

        // RoPE (before QK-norm, nanochat order)
        ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);

        Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors,
                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow);

        Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors,
                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                ext_factor, attn_factor, beta_fast, beta_slow);

        // QK-norm (after RoPE) + 1.15 sharpening
        Qcur = ggml_scale(ctx0, build_norm(Qcur, NULL, NULL, LLM_NORM_RMS, il), 1.15f);
        Kcur = ggml_scale(ctx0, build_norm(Kcur, NULL, NULL, LLM_NORM_RMS, il), 1.15f);
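        // scaling both Q and K by 1.15 multiplies the pre-softmax logits
        // by 1.15^2 ≈ 1.32, i.e. a mildly sharpened attention distribution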
        cb(Qcur, "Qcur", il);
        cb(Kcur, "Kcur", il);
        cb(Vcur, "Vcur", il);

        // Attention + output proj
        cur = build_attn(inp_attn,
                layer.wo, NULL,
                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);

        // Attention residual
        cur = ggml_add(ctx0, cur, inpSA);

        if (il == backout_layer) {
            x_backout = cur;
        }

        ggml_tensor * ffn_inp = cur;

        // Pre-FFN norm (unweighted)
        cur = build_norm(cur, NULL, NULL, LLM_NORM_RMS, il);
        cb(cur, "ffn_norm", il);
        // FFN: ReLU^2, i.e. down(relu(up(x))^2), with no gate projection
        cur = build_ffn(cur,
                layer.ffn_up,   NULL, NULL,
                NULL,           NULL, NULL,
                layer.ffn_down, NULL, NULL,
                NULL,
                LLM_FFN_RELU_SQR, LLM_FFN_SEQ, il);
        // FFN residual
        cur = ggml_add(ctx0, cur, ffn_inp);
        cb(cur, "l_out", il);

        inpL = cur;
    }

    cur = inpL;

    // Backout: subtract mid-layer residual
    if (x_backout && model.nanochat_backout != 0.0f) {
        cur = ggml_sub(ctx0, cur, ggml_scale(ctx0, x_backout, model.nanochat_backout));
    }

    // Final norm (unweighted)
    cur = build_norm(cur, NULL, NULL, LLM_NORM_RMS, -1);
    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    // lm_head
    cur = build_lora_mm(model.output, cur);

    // Logit softcap
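    // logits = cap * tanh(logits / cap) keeps every logit in (-cap, cap);
    // with cap = 15, a raw logit of 30 maps to 15 * tanh(2) ≈ 14.5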
    if (hparams.f_final_logit_softcapping != 0.0f) {
        cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_final_logit_softcapping);
        cur = ggml_tanh(ctx0, cur);
        cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
    }

    cb(cur, "result_output", -1);
    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}