Upload juliaslm_svd_model.py with huggingface_hub
juliaslm_svd_model.py  ADDED  (+267 -0)
@@ -0,0 +1,267 @@
"""JuliaSLM-compressed-svd: SVD-compressed inference model.

LLaMA-style decoder with SVD-factored weight matrices. Each linear layer
stores low-rank factors A (out, rank) and B (rank, in) instead of the full
weight matrix, reducing parameter count while preserving model quality.

Architecture: MHA (4 heads), RMSNorm, SwiGLU, RoPE, weight-tied output.
Base config: d_model=256, n_layers=6, n_heads=4, head_dim=64, ctx=256,
vocab=2000, SVD-90 compression (~4.81M params).
"""
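# Parameter-count intuition (illustrative arithmetic; rank 90 is only an example,
# the real per-layer ranks come from the checkpoint): a dense 256 x 256 projection
# holds 256 * 256 = 65,536 weights, while the same projection factored as
# A (256 x 90) plus B (90 x 256) holds 46,080, with smaller ranks shrinking it further.
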
import math
from dataclasses import dataclass, field
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F


# ───────────────────────────────────────────────────────────────────
# Configuration
# ───────────────────────────────────────────────────────────────────


@dataclass
class SVDConfig:
    d_model: int = 256
    n_layers: int = 6
    n_heads: int = 4
    head_dim: int = 64
    ffn_inner: int = 640
    context_length: int = 256
    vocab_size: int = 2000
    weight_tying: bool = True
    rope_base: float = 10000.0
    # Per-layer SVD ranks: list of dicts with keys wq, wk, wv, wo, w1, v, w2
    layer_ranks: list = field(default_factory=list)

    @staticmethod
    def from_checkpoint(state_dict: dict) -> "SVDConfig":
        """Build config by inspecting checkpoint tensor shapes."""
        vocab_size, d_model = state_dict["tok_emb.weight"].shape
        ctx_len = state_dict["rope.cos_cache"].shape[0]
        head_dim = state_dict["rope.cos_cache"].shape[1] * 2  # cos_cache is half
        n_heads = d_model // head_dim
        ffn_inner = state_dict["blocks.0.ffn.w1.A"].shape[0]

        n_layers = max(
            int(k.split(".")[1])
            for k in state_dict
            if k.startswith("blocks.")
        ) + 1

        layer_ranks = []
        for i in range(n_layers):
            ranks = {}
            for name in ("wq", "wk", "wv", "wo"):
                ranks[name] = state_dict[f"blocks.{i}.attn.{name}.A"].shape[1]
            for name in ("w1", "v", "w2"):
                ranks[name] = state_dict[f"blocks.{i}.ffn.{name}.A"].shape[1]
            layer_ranks.append(ranks)

        return SVDConfig(
            d_model=d_model,
            n_layers=n_layers,
            n_heads=n_heads,
            head_dim=head_dim,
            ffn_inner=ffn_inner,
            context_length=ctx_len,
            vocab_size=vocab_size,
            layer_ranks=layer_ranks,
        )


# ───────────────────────────────────────────────────────────────────
# Building blocks
# ───────────────────────────────────────────────────────────────────


class SVDLinear(nn.Module):
    """Linear layer stored as low-rank A @ B factorization.

    Forward: x @ B^T @ A^T (equivalent to x @ (A @ B)^T = x @ W^T)
    where W ≈ A @ B with A: (out, rank), B: (rank, in).
    """

    def __init__(self, out_features: int, rank: int, in_features: int):
        super().__init__()
        self.A = nn.Parameter(torch.empty(out_features, rank))
        self.B = nn.Parameter(torch.empty(rank, in_features))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return F.linear(F.linear(x, self.B), self.A)
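

# Illustrative sketch added for clarity; it is not part of the pipeline that
# produced the released checkpoint. It shows one way the (A, B) factors of an
# SVDLinear could be obtained from a dense weight via truncated SVD; the name
# `svd_factorize` and its `rank` argument are assumptions, not released API.
def svd_factorize(weight: torch.Tensor, rank: int) -> SVDLinear:
    """Factor a dense (out, in) weight into SVDLinear parameters of the given rank."""
    out_features, in_features = weight.shape
    # weight = U @ diag(S) @ Vh; keep the top-`rank` singular triplets.
    U, S, Vh = torch.linalg.svd(weight, full_matrices=False)
    layer = SVDLinear(out_features, rank, in_features)
    with torch.no_grad():
        # Fold the singular values into A so that A @ B ≈ weight.
        layer.A.copy_(U[:, :rank] * S[:rank])
        layer.B.copy_(Vh[:rank, :])
    return layer

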
class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        rms = torch.sqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
        return x / rms * self.weight


class RotaryEmbedding(nn.Module):
    def __init__(self, dim: int, max_seq_len: int = 256, base: float = 10000.0):
        super().__init__()
        freqs = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        positions = torch.arange(max_seq_len).float()
        angles = torch.outer(positions, freqs)
        self.register_buffer("cos_cache", angles.cos())
        self.register_buffer("sin_cache", angles.sin())

    def forward(self, x: torch.Tensor, start_pos: int = 0) -> torch.Tensor:
        seq_len = x.size(2)
        half = x.size(-1) // 2
        x1, x2 = x[..., :half], x[..., half:]
        cos = self.cos_cache[start_pos:start_pos + seq_len, :half].unsqueeze(0).unsqueeze(0)
        sin = self.sin_cache[start_pos:start_pos + seq_len, :half].unsqueeze(0).unsqueeze(0)
        return torch.cat([x1 * cos - x2 * sin, x1 * sin + x2 * cos], dim=-1)


class SVDSwiGLU(nn.Module):
    """SwiGLU FFN with SVD-compressed linear layers."""

    def __init__(self, d_model: int, inner_dim: int, ranks: dict):
        super().__init__()
        self.w1 = SVDLinear(inner_dim, ranks["w1"], d_model)
        self.v = SVDLinear(inner_dim, ranks["v"], d_model)
        self.w2 = SVDLinear(d_model, ranks["w2"], inner_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.w2(F.silu(self.w1(x)) * self.v(x))


class SVDCausalAttention(nn.Module):
    """Multi-head attention with SVD-compressed projections and KV cache."""

    def __init__(self, d_model: int, n_heads: int, head_dim: int, ranks: dict):
        super().__init__()
        self.n_heads = n_heads
        self.head_dim = head_dim
        self.scale = 1.0 / math.sqrt(head_dim)

        self.wq = SVDLinear(n_heads * head_dim, ranks["wq"], d_model)
        self.wk = SVDLinear(n_heads * head_dim, ranks["wk"], d_model)
        self.wv = SVDLinear(n_heads * head_dim, ranks["wv"], d_model)
        self.wo = SVDLinear(d_model, ranks["wo"], n_heads * head_dim)

    def forward(
        self,
        x: torch.Tensor,
        rope: RotaryEmbedding,
        mask: Optional[torch.Tensor],
        kv_cache: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        start_pos: int = 0,
    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        B, T, _ = x.shape
        H, HD = self.n_heads, self.head_dim

        q = self.wq(x).view(B, T, H, HD).transpose(1, 2)
        k = self.wk(x).view(B, T, H, HD).transpose(1, 2)
        v = self.wv(x).view(B, T, H, HD).transpose(1, 2)

        q = rope(q, start_pos)
        k = rope(k, start_pos)

        if kv_cache is not None:
            prev_k, prev_v = kv_cache
            k = torch.cat([prev_k, k], dim=2)
            v = torch.cat([prev_v, v], dim=2)
        new_cache = (k, v)

        attn = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        if mask is not None:
            attn = attn + mask
        attn = F.softmax(attn, dim=-1)
        out = torch.matmul(attn, v)

        out = out.transpose(1, 2).contiguous().view(B, T, H * HD)
        return self.wo(out), new_cache
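
# Cache layout: each per-layer KV cache is a (keys, values) pair of tensors shaped
# (batch, n_heads, cached_len, head_dim); cached decode concatenates the new
# token's keys/values along dim=2, so the returned new_cache holds the full history.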

# ───────────────────────────────────────────────────────────────────
# Transformer block and model
# ───────────────────────────────────────────────────────────────────


class SVDTransformerBlock(nn.Module):
    def __init__(self, config: SVDConfig, layer_idx: int):
        super().__init__()
        ranks = config.layer_ranks[layer_idx]
        self.ln1 = RMSNorm(config.d_model)
        self.attn = SVDCausalAttention(
            config.d_model, config.n_heads, config.head_dim, ranks
        )
        self.ln2 = RMSNorm(config.d_model)
        self.ffn = SVDSwiGLU(config.d_model, config.ffn_inner, ranks)

    def forward(
        self,
        x: torch.Tensor,
        rope: RotaryEmbedding,
        mask: Optional[torch.Tensor],
        kv_cache: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        start_pos: int = 0,
    ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        attn_out, new_cache = self.attn(self.ln1(x), rope, mask, kv_cache, start_pos)
        x = x + attn_out
        x = x + self.ffn(self.ln2(x))
        return x, new_cache


class JuliaSLM_SVD(nn.Module):
    def __init__(self, config: SVDConfig):
        super().__init__()
        self.config = config
        self.tok_emb = nn.Embedding(config.vocab_size, config.d_model)
        self.rope = RotaryEmbedding(config.head_dim, config.context_length, config.rope_base)
        self.blocks = nn.ModuleList(
            [SVDTransformerBlock(config, i) for i in range(config.n_layers)]
        )
        self.ln_f = RMSNorm(config.d_model)

        causal = torch.triu(
            torch.full((config.context_length, config.context_length), float("-inf")),
            diagonal=1,
        )
        self.register_buffer("causal_mask", causal)

    def forward(
        self,
        input_ids: torch.Tensor,
        kv_caches: Optional[list[tuple[torch.Tensor, torch.Tensor]]] = None,
    ) -> tuple[torch.Tensor, list[tuple[torch.Tensor, torch.Tensor]]]:
        """Forward pass with optional KV cache.

        Without cache (prefill): processes the full sequence under a causal mask.
        With cache (decode): processes only the new token(s), attending over the
        cached keys/values instead of recomputing the prefix.
        """
        B, T = input_ids.shape
        x = self.tok_emb(input_ids)

        if kv_caches is not None:
            start_pos = kv_caches[0][0].size(2)
            mask = None
        else:
            start_pos = 0
            mask = self.causal_mask[:T, :T].to(dtype=x.dtype)
            kv_caches = [None] * len(self.blocks)

        new_caches = []
        for block, cache in zip(self.blocks, kv_caches):
            x, new_cache = block(x, self.rope, mask, cache, start_pos)
            new_caches.append(new_cache)

        x = self.ln_f(x)
        # Weight-tied output projection
        logits = F.linear(x, self.tok_emb.weight)

        return logits, new_caches

    @property
    def num_parameters(self) -> int:
        return sum(p.numel() for p in self.parameters())
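

# ───────────────────────────────────────────────────────────────────
# Usage sketch (illustrative)
# ───────────────────────────────────────────────────────────────────
# Minimal greedy-decoding example added for clarity: prefill builds the per-layer
# KV caches, then decode feeds one token at a time and reuses them. The checkpoint
# path "model.pt" and the prompt token ids are placeholders, not shipped artifacts;
# everything else uses only the classes defined above.
if __name__ == "__main__":
    state_dict = torch.load("model.pt", map_location="cpu")  # hypothetical path
    config = SVDConfig.from_checkpoint(state_dict)
    model = JuliaSLM_SVD(config)
    model.load_state_dict(state_dict)
    model.eval()
    print(f"parameters: {model.num_parameters:,}")

    prompt = torch.tensor([[1, 42, 7]])  # (batch=1, seq) placeholder token ids
    with torch.no_grad():
        # Prefill: run the whole prompt once with the causal mask, building caches.
        logits, caches = model(prompt)
        next_token = logits[:, -1].argmax(dim=-1, keepdim=True)
        generated = [next_token]

        # Decode: one token per step, growing the caches instead of re-running the prefix.
        for _ in range(32):
            logits, caches = model(next_token, kv_caches=caches)
            next_token = logits[:, -1].argmax(dim=-1, keepdim=True)
            generated.append(next_token)

    print(torch.cat([prompt] + generated, dim=1).tolist())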