Fix load_quantized.py: TurboQuant-aware dequantization (was broken rowwise_symmetric loader)
Browse files- load_quantized.py +221 -49
load_quantized.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
import json
|
|
|
|
|
|
|
| 4 |
from pathlib import Path
|
| 5 |
|
| 6 |
import torch
|
|
@@ -8,29 +10,230 @@ from safetensors.torch import load_file
|
|
| 8 |
from transformers import AutoProcessor, AutoTokenizer
|
| 9 |
|
| 10 |
MANIFEST_FILENAME = "quant_manifest.json"
|
| 11 |
-
|
| 12 |
|
| 13 |
|
| 14 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
packed = packed.reshape(-1).to(torch.uint8).cpu()
|
| 16 |
-
if bit_width =
|
| 17 |
-
return packed[:total_values].contiguous()
|
| 18 |
values_per_byte = 8 // bit_width
|
| 19 |
mask = (1 << bit_width) - 1
|
| 20 |
packed_i32 = packed.to(torch.int32)
|
| 21 |
parts = []
|
| 22 |
for index in range(values_per_byte):
|
| 23 |
parts.append(((packed_i32 >> (index * bit_width)) & mask).to(torch.uint8))
|
| 24 |
-
return torch.stack(parts, dim=1).reshape(-1)[:total_values].contiguous()
|
| 25 |
|
| 26 |
|
| 27 |
-
def
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
|
| 31 |
def _create_empty_model(repo_dir: Path, loader_kind: str):
|
| 32 |
from transformers import AutoConfig
|
| 33 |
-
|
| 34 |
config = AutoConfig.from_pretrained(repo_dir, trust_remote_code=True)
|
| 35 |
if loader_kind == "causal-lm":
|
| 36 |
from transformers import AutoModelForCausalLM
|
|
@@ -41,63 +244,31 @@ def _create_empty_model(repo_dir: Path, loader_kind: str):
|
|
| 41 |
raise ValueError(f"Unsupported loader kind: {loader_kind}")
|
| 42 |
|
| 43 |
|
| 44 |
-
def _load_model_from_state_dict(repo_dir: Path, loader_kind: str,
|
|
|
|
| 45 |
model = _create_empty_model(repo_dir, loader_kind)
|
| 46 |
incompatible = model.load_state_dict(state_dict, strict=False, assign=True)
|
| 47 |
if hasattr(model, "tie_weights"):
|
| 48 |
model.tie_weights()
|
| 49 |
-
|
| 50 |
allowed_missing = {"lm_head.weight"}
|
| 51 |
allowed_unexpected_prefixes = ("mtp.",)
|
| 52 |
-
disallowed_missing = sorted(
|
|
|
|
|
|
|
| 53 |
disallowed_unexpected = sorted(
|
| 54 |
-
key for key in incompatible.unexpected_keys
|
|
|
|
| 55 |
)
|
| 56 |
if disallowed_missing or disallowed_unexpected:
|
| 57 |
raise RuntimeError(
|
| 58 |
-
"Unexpected state_dict mismatch
|
| 59 |
f"missing={disallowed_missing}, unexpected={disallowed_unexpected}"
|
| 60 |
)
|
| 61 |
return model
|
| 62 |
|
| 63 |
|
| 64 |
-
def load_quantized_state_dict(repo_dir: str | Path):
|
| 65 |
-
repo_dir = Path(repo_dir)
|
| 66 |
-
manifest = json.loads((repo_dir / MANIFEST_FILENAME).read_text(encoding="utf-8"))
|
| 67 |
-
stored = load_file(str(repo_dir / WEIGHTS_FILENAME), device="cpu")
|
| 68 |
-
state_dict = {}
|
| 69 |
-
for name, spec in manifest["parameter_specs"].items():
|
| 70 |
-
prefix = spec["storage_prefix"]
|
| 71 |
-
if spec["quantized"]:
|
| 72 |
-
quantization_scheme = spec.get("quantization_scheme", manifest.get("quantization_scheme", "rowwise_symmetric"))
|
| 73 |
-
bit_width = int(spec["bit_width"])
|
| 74 |
-
group_size = int(spec["group_size"])
|
| 75 |
-
num_groups = int(spec["num_groups"])
|
| 76 |
-
total_group_values = int(spec.get("total_group_values", num_groups * group_size))
|
| 77 |
-
dtype = _name_to_dtype(spec["original_dtype"])
|
| 78 |
-
unpacked = _unpack_codes(stored[f"{prefix}__packed"], bit_width, total_group_values).to(torch.float32)
|
| 79 |
-
unpacked = unpacked.view(num_groups, group_size)
|
| 80 |
-
scales = stored[f"{prefix}__scales"].to(torch.float32).view(num_groups, 1)
|
| 81 |
-
if quantization_scheme != "rowwise_symmetric":
|
| 82 |
-
raise ValueError(f"Unsupported quantization scheme: {quantization_scheme}")
|
| 83 |
-
if bit_width == 1:
|
| 84 |
-
restored_groups = torch.where(unpacked > 0, torch.ones_like(unpacked), -torch.ones_like(unpacked)) * scales
|
| 85 |
-
else:
|
| 86 |
-
qmax = (1 << (bit_width - 1)) - 1
|
| 87 |
-
restored_groups = (unpacked - float(qmax)) * scales
|
| 88 |
-
row_count = int(spec["row_count"])
|
| 89 |
-
row_length = int(spec["row_length"])
|
| 90 |
-
groups_per_row = int(spec["groups_per_row"])
|
| 91 |
-
padded_row_length = groups_per_row * group_size
|
| 92 |
-
restored = restored_groups.view(row_count, groups_per_row, group_size).reshape(row_count, padded_row_length)
|
| 93 |
-
restored = restored[:, :row_length].contiguous().view(tuple(int(value) for value in spec["shape"]))
|
| 94 |
-
state_dict[name] = restored.to(dtype)
|
| 95 |
-
continue
|
| 96 |
-
state_dict[name] = stored[f"{prefix}__passthrough"].to(_name_to_dtype(spec["original_dtype"]))
|
| 97 |
-
return state_dict, manifest
|
| 98 |
-
|
| 99 |
-
|
| 100 |
def load_quantized_model(repo_dir: str | Path, device: str | torch.device = "cpu"):
|
|
|
|
| 101 |
repo_dir = Path(repo_dir)
|
| 102 |
state_dict, manifest = load_quantized_state_dict(repo_dir)
|
| 103 |
model = _load_model_from_state_dict(repo_dir, manifest["loader_kind"], state_dict)
|
|
@@ -107,6 +278,7 @@ def load_quantized_model(repo_dir: str | Path, device: str | torch.device = "cpu
|
|
| 107 |
|
| 108 |
|
| 109 |
def load_tokenizer(repo_dir: str | Path):
|
|
|
|
| 110 |
repo_dir = Path(repo_dir)
|
| 111 |
if (repo_dir / "preprocessor_config.json").exists():
|
| 112 |
processor = AutoProcessor.from_pretrained(repo_dir, trust_remote_code=True)
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
import json
|
| 4 |
+
import math
|
| 5 |
+
from functools import lru_cache
|
| 6 |
from pathlib import Path
|
| 7 |
|
| 8 |
import torch
|
|
|
|
| 10 |
from transformers import AutoProcessor, AutoTokenizer
|
| 11 |
|
| 12 |
MANIFEST_FILENAME = "quant_manifest.json"
|
| 13 |
+
TURBOQUANT_WEIGHTS_FILENAME = "turboquant_weights.safetensors"
|
| 14 |
|
| 15 |
|
| 16 |
+
def _name_to_dtype(name: str) -> torch.dtype:
|
| 17 |
+
return getattr(torch, name)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _unpack_indices(packed: torch.Tensor, bit_width: int, total_values: int) -> torch.Tensor:
|
| 21 |
packed = packed.reshape(-1).to(torch.uint8).cpu()
|
| 22 |
+
if bit_width >= 8:
|
| 23 |
+
return packed[:total_values].to(torch.int64).contiguous()
|
| 24 |
values_per_byte = 8 // bit_width
|
| 25 |
mask = (1 << bit_width) - 1
|
| 26 |
packed_i32 = packed.to(torch.int32)
|
| 27 |
parts = []
|
| 28 |
for index in range(values_per_byte):
|
| 29 |
parts.append(((packed_i32 >> (index * bit_width)) & mask).to(torch.uint8))
|
| 30 |
+
return torch.stack(parts, dim=1).reshape(-1)[:total_values].to(torch.int64).contiguous()
|
| 31 |
|
| 32 |
|
| 33 |
def _unpack_signs(packed: torch.Tensor, total_values: int) -> torch.Tensor:
    """Expand a bit-packed sign tensor into int8 values in {-1, +1}.

    A stored bit of 1 maps to +1 and a bit of 0 maps to -1, via the affine
    map ``2*b - 1``.
    """
    raw_bits = _unpack_indices(packed, 1, total_values).to(torch.int8)
    return raw_bits * 2 - 1
| 36 |
+
|
| 37 |
+
|
| 38 |
+
@lru_cache(maxsize=None)
|
| 39 |
+
def _sphere_coordinate_density(dimension: int, grid_size: int):
|
| 40 |
+
"""Lemma 1: Beta distribution density on unit sphere coordinates.
|
| 41 |
+
|
| 42 |
+
f_X(x) = Gamma(d/2) / (sqrt(pi) * Gamma((d-1)/2)) * (1 - x^2)^((d-3)/2)
|
| 43 |
+
Reference: arXiv:2504.19874v1, Lemma 1
|
| 44 |
+
"""
|
| 45 |
+
eps = 1e-6
|
| 46 |
+
grid = torch.linspace(-1.0 + eps, 1.0 - eps, steps=grid_size, dtype=torch.float64)
|
| 47 |
+
exponent = 0.5 * float(dimension - 3)
|
| 48 |
+
log_const = (
|
| 49 |
+
torch.lgamma(torch.tensor(float(dimension) / 2.0, dtype=torch.float64))
|
| 50 |
+
- 0.5 * math.log(math.pi)
|
| 51 |
+
- torch.lgamma(torch.tensor(float(dimension - 1) / 2.0, dtype=torch.float64))
|
| 52 |
+
)
|
| 53 |
+
interior = torch.clamp(1.0 - grid.square(), min=1e-24)
|
| 54 |
+
density = torch.exp(log_const + exponent * torch.log(interior))
|
| 55 |
+
step = (2.0 - 2.0 * eps) / float(grid_size - 1)
|
| 56 |
+
weights = density * step
|
| 57 |
+
weights = weights / torch.sum(weights)
|
| 58 |
+
return grid, weights
|
| 59 |
+
|
| 60 |
+
|
| 61 |
@lru_cache(maxsize=None)
def _compute_codebook(dimension: int, bit_width: int, grid_size: int = 8193, iterations: int = 96):
    """Lloyd-Max optimal scalar codebook for the sphere-coordinate distribution.

    Solves Eq (4): argmin_{c_1,...,c_K} E[min_k |X - c_k|^2]
    Reference: arXiv:2504.19874v1, Section 3.1, Eq (4)

    Returns a sorted float64 tensor of ``2**bit_width`` centroids (empty for
    ``bit_width == 0``). Cached per argument tuple.
    """
    if bit_width == 0:
        return torch.empty(0, dtype=torch.float64)
    support, probs = _sphere_coordinate_density(dimension, grid_size)
    n_codes = 1 << bit_width
    # Initialize centroids at equally spaced quantiles of the CDF.
    cdf = torch.cumsum(probs, dim=0)
    cdf = cdf / cdf[-1]
    quantiles = (torch.arange(n_codes, dtype=torch.float64) + 0.5) / float(n_codes)
    start_positions = torch.clamp(torch.searchsorted(cdf, quantiles), max=support.numel() - 1)
    codes = torch.sort(support[start_positions]).values
    for _ in range(iterations):
        if codes.numel() == 1:
            break
        # Partition step: cell boundaries are midpoints between neighbors.
        edges = torch.empty(n_codes + 1, dtype=torch.float64)
        edges[0] = -1.0
        edges[-1] = 1.0
        edges[1:-1] = 0.5 * (codes[:-1] + codes[1:])
        cell_of = torch.bucketize(support, edges[1:-1])
        # Centroid step: each code moves to its cell's conditional mean.
        refined = codes.clone()
        for cell in range(n_codes):
            selected = cell_of == cell
            if not bool(torch.any(selected)):
                continue  # empty cell: keep previous centroid
            cell_probs = probs[selected]
            cell_mass = torch.sum(cell_probs)
            if float(cell_mass.item()) <= 0.0:
                continue
            refined[cell] = torch.sum(support[selected] * cell_probs) / cell_mass
        refined = torch.sort(refined).values
        # Converged: the (negligibly different) refinement is discarded on
        # purpose, matching the fixed-point definition of the iteration.
        if float(torch.max(torch.abs(refined - codes)).item()) < 1e-12:
            break
        codes = refined
    return codes
| 100 |
+
|
| 101 |
+
|
| 102 |
+
@lru_cache(maxsize=None)
|
| 103 |
+
def _cached_rotation_matrix(dimension: int, seed: int):
|
| 104 |
+
"""Random rotation via QR decomposition of Gaussian matrix.
|
| 105 |
+
|
| 106 |
+
Reference: arXiv:2504.19874v1, Algorithm 1 (step: generate random rotation Pi)
|
| 107 |
+
"""
|
| 108 |
+
gen = torch.Generator(device="cpu")
|
| 109 |
+
gen.manual_seed(seed)
|
| 110 |
+
gaussian = torch.randn((dimension, dimension), generator=gen, dtype=torch.float64)
|
| 111 |
+
q, r = torch.linalg.qr(gaussian, mode="reduced")
|
| 112 |
+
diag = torch.sign(torch.diag(r))
|
| 113 |
+
diag = torch.where(diag == 0, torch.ones_like(diag), diag)
|
| 114 |
+
q = q * diag.unsqueeze(0)
|
| 115 |
+
return q
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
@lru_cache(maxsize=None)
|
| 119 |
+
def _cached_projection_matrix(dimension: int, seed: int):
|
| 120 |
+
"""Random Gaussian projection matrix for QJL.
|
| 121 |
+
|
| 122 |
+
Reference: arXiv:2504.19874v1, Definition 1 (S with iid N(0,1) entries)
|
| 123 |
+
"""
|
| 124 |
+
gen = torch.Generator(device="cpu")
|
| 125 |
+
gen.manual_seed(seed)
|
| 126 |
+
return torch.randn((dimension, dimension), generator=gen, dtype=torch.float64)
|
| 127 |
+
|
| 128 |
+
|
| 129 |
def _dequantize_mse(indices: torch.Tensor, norms: torch.Tensor, dimension: int,
                    bit_width: int, seed: int, grid_size: int = 8193,
                    iterations: int = 96):
    """TurboQuant_MSE dequantization (Algorithm 1).

    Dequant_mse(idx):
        y_j = c_{idx_j} (codebook lookup)
        x_hat = Pi^T * y (rotate back)
        return x_hat * ||x||

    Reference: arXiv:2504.19874v1, Algorithm 1

    Args:
        indices: int64 codebook indices, shape (rows, dimension).
        norms: per-row original L2 norms, shape (rows,).
        dimension: vector dimension d.
        bit_width: bits per coordinate (codebook has 2**bit_width entries).
        seed: seed used to regenerate the rotation matrix.
        grid_size, iterations: Lloyd-Max parameters; must match quantization.

    Returns:
        float32 tensor of reconstructed rows, shape (rows, dimension).
    """
    # Note: the original implementation computed an unused ``row_count`` here;
    # the row dimension is carried implicitly by ``indices``.
    codebook = _compute_codebook(dimension, bit_width, grid_size, iterations).to(torch.float32)
    rotation = _cached_rotation_matrix(dimension, seed).to(torch.float32)
    # Lookup yields the quantized rows in the rotated basis; right-multiplying
    # by the rotation maps them back to the original basis.
    quantized_rotated = codebook[indices]
    reconstructed = quantized_rotated @ rotation
    # Undo the unit-norm normalization applied before quantization.
    return reconstructed * norms.to(torch.float32).unsqueeze(-1)
| 147 |
+
|
| 148 |
+
|
| 149 |
def _dequantize_prod(mse_indices: torch.Tensor, qjl_signs: torch.Tensor,
                     norms: torch.Tensor, residual_norms: torch.Tensor,
                     dimension: int, bit_width: int, seed: int,
                     grid_size: int = 8193, iterations: int = 96):
    """TurboQuant_prod dequantization (Algorithm 2).

    Dequant_prod(idx, qjl, gamma):
        x_mse = Dequant_mse(idx)
        x_qjl = sqrt(pi/2)/d * S^T * qjl
        x_hat = x_mse + gamma * x_qjl
        return x_hat * ||x||

    Reference: arXiv:2504.19874v1, Algorithm 2

    Args:
        mse_indices: int64 coarse-stage indices, shape (rows, dimension).
        qjl_signs: {-1, +1} sign tensor, shape (rows, dimension).
        norms: per-row original L2 norms, shape (rows,).
        residual_norms: per-row residual scales gamma, shape (rows,).
        dimension: vector dimension d.
        bit_width: total bits per coordinate; one bit goes to the QJL stage,
            the remaining ``bit_width - 1`` to the MSE stage.
        seed: rotation seed; the QJL projection uses ``seed + 1``.
        grid_size, iterations: Lloyd-Max parameters; must match quantization.

    Returns:
        float32 tensor of reconstructed rows, shape (rows, dimension).
    """
    mse_bit_width = max(bit_width - 1, 0)
    if mse_bit_width > 0:
        # Coarse estimate: codebook lookup in the rotated basis, rotated back.
        # (Previously a dummy zeros(1) codebook and the rotation matrix were
        # built even when this branch was dead; both are now branch-local.)
        codebook = _compute_codebook(
            dimension, mse_bit_width, grid_size, iterations
        ).to(torch.float32)
        rotation = _cached_rotation_matrix(dimension, seed).to(torch.float32)
        mse_part = codebook[mse_indices] @ rotation
    else:
        # All bits went to the QJL stage; the coarse estimate is zero.
        mse_part = torch.zeros(mse_indices.shape[0], dimension, dtype=torch.float32)
    projection = _cached_projection_matrix(dimension, seed + 1).to(torch.float32)
    # QJL inner-product estimator: sqrt(pi/2)/d * S^T * sign(Sx).
    qjl_part = (math.sqrt(math.pi / 2.0) / float(dimension)) * (
        qjl_signs.to(torch.float32) @ projection
    )
    unit_estimate = mse_part + residual_norms.to(torch.float32).unsqueeze(-1) * qjl_part
    # Undo the unit-norm normalization applied before quantization.
    return unit_estimate * norms.to(torch.float32).unsqueeze(-1)
| 178 |
+
|
| 179 |
+
|
| 180 |
def load_quantized_state_dict(repo_dir: str | Path):
    """Load and dequantize a TurboQuant checkpoint into a plain state dict.

    Reads the JSON manifest and the packed safetensors file from ``repo_dir``
    and reconstructs every parameter at its original shape and dtype.

    Returns:
        ``(state_dict, manifest)`` where ``state_dict`` maps parameter names
        to dequantized tensors.

    Raises:
        ValueError: on an unrecognized ``quantizer_type`` in the manifest.
    """
    repo_dir = Path(repo_dir)
    manifest = json.loads((repo_dir / MANIFEST_FILENAME).read_text(encoding="utf-8"))
    tensors = load_file(str(repo_dir / TURBOQUANT_WEIGHTS_FILENAME), device="cpu")
    state_dict: dict[str, torch.Tensor] = {}
    for param_name, spec in manifest["parameter_specs"].items():
        prefix = spec["storage_prefix"]
        target_dtype = _name_to_dtype(spec["original_dtype"])
        if not spec["quantized"]:
            # Stored verbatim -- only the dtype needs restoring.
            state_dict[param_name] = tensors[f"{prefix}__passthrough"].to(target_dtype)
            continue
        quantizer_type = spec["quantizer_type"]
        dimension = int(spec["dimension"])
        row_count = int(spec["row_count"])
        bit_width = int(spec["bit_width"])
        seed = int(spec["seed"])
        grid_size = int(spec["grid_size"])
        iterations = int(spec["iterations"])
        target_shape = [int(v) for v in spec["shape"]]
        if quantizer_type == "turboquant_mse":
            total_idx = int(spec["total_index_values"])
            indices = _unpack_indices(
                tensors[f"{prefix}__packed_indices"], bit_width, total_idx
            ).reshape(row_count, dimension)
            norms = tensors[f"{prefix}__norms"].to(torch.float32)
            dequantized = _dequantize_mse(
                indices, norms, dimension, bit_width, seed, grid_size, iterations
            )
        elif quantizer_type == "turboquant_prod":
            mse_bit_width = max(bit_width - 1, 0)
            total_idx = int(spec["total_index_values"])
            total_signs = int(spec["total_sign_values"])
            if mse_bit_width > 0:
                mse_indices = _unpack_indices(
                    tensors[f"{prefix}__packed_indices"], mse_bit_width, total_idx
                ).reshape(row_count, dimension)
            else:
                # No coarse stage was stored; use all-zero indices.
                mse_indices = torch.zeros(row_count, dimension, dtype=torch.int64)
            qjl_signs = _unpack_signs(
                tensors[f"{prefix}__packed_signs"], total_signs
            ).reshape(row_count, dimension)
            norms = tensors[f"{prefix}__norms"].to(torch.float32)
            residual_norms = tensors[f"{prefix}__residual_norms"].to(torch.float32)
            dequantized = _dequantize_prod(
                mse_indices, qjl_signs, norms, residual_norms,
                dimension, bit_width, seed, grid_size, iterations
            )
        else:
            raise ValueError(f"Unknown quantizer_type: {quantizer_type}")
        state_dict[param_name] = dequantized.reshape(target_shape).to(target_dtype)
    return state_dict, manifest
| 233 |
|
| 234 |
|
| 235 |
def _create_empty_model(repo_dir: Path, loader_kind: str):
|
| 236 |
from transformers import AutoConfig
|
|
|
|
| 237 |
config = AutoConfig.from_pretrained(repo_dir, trust_remote_code=True)
|
| 238 |
if loader_kind == "causal-lm":
|
| 239 |
from transformers import AutoModelForCausalLM
|
|
|
|
| 244 |
raise ValueError(f"Unsupported loader kind: {loader_kind}")
|
| 245 |
|
| 246 |
|
| 247 |
def _load_model_from_state_dict(repo_dir: Path, loader_kind: str,
                                state_dict: dict[str, torch.Tensor]):
    """Materialize an empty model and populate it from ``state_dict``.

    A missing ``lm_head.weight`` is tolerated (restored by ``tie_weights``),
    as are unexpected keys under the ``mtp.`` prefix; any other mismatch
    raises ``RuntimeError``.
    """
    model = _create_empty_model(repo_dir, loader_kind)
    load_result = model.load_state_dict(state_dict, strict=False, assign=True)
    if hasattr(model, "tie_weights"):
        model.tie_weights()
    allowed_missing = {"lm_head.weight"}
    allowed_unexpected_prefixes = ("mtp.",)
    bad_missing = sorted(
        k for k in load_result.missing_keys if k not in allowed_missing
    )
    bad_unexpected = sorted(
        k for k in load_result.unexpected_keys
        if not k.startswith(allowed_unexpected_prefixes)
    )
    if bad_missing or bad_unexpected:
        raise RuntimeError(
            "Unexpected state_dict mismatch: "
            f"missing={bad_missing}, unexpected={bad_unexpected}"
        )
    return model
| 268 |
|
| 269 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
def load_quantized_model(repo_dir: str | Path, device: str | torch.device = "cpu"):
|
| 271 |
+
"""Load the quantized model, dequantize weights, and return (model, manifest)."""
|
| 272 |
repo_dir = Path(repo_dir)
|
| 273 |
state_dict, manifest = load_quantized_state_dict(repo_dir)
|
| 274 |
model = _load_model_from_state_dict(repo_dir, manifest["loader_kind"], state_dict)
|
|
|
|
| 278 |
|
| 279 |
|
| 280 |
def load_tokenizer(repo_dir: str | Path):
|
| 281 |
+
"""Load the tokenizer from the repo directory."""
|
| 282 |
repo_dir = Path(repo_dir)
|
| 283 |
if (repo_dir / "preprocessor_config.json").exists():
|
| 284 |
processor = AutoProcessor.from_pretrained(repo_dir, trust_remote_code=True)
|