drbh committed
Commit 5035aed · 0 Parent(s)

feat: mvp kernel

.gitignore ADDED
@@ -0,0 +1,2 @@
+ .venv
+ __pycache__
build.toml ADDED
@@ -0,0 +1,3 @@
+ [general]
+ name = "triton_moe"
+ universal = true
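
Note: as I read kernel-builder's conventions, `universal = true` marks this as a pure-Python (Triton-only) kernel that needs no per-architecture native compilation; that interpretation is an assumption, not something stated in this commit.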
flake.lock ADDED
@@ -0,0 +1,168 @@
+ {
+   "nodes": {
+     "flake-compat": {
+       "locked": {
+         "lastModified": 1747046372,
+         "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
+         "owner": "edolstra",
+         "repo": "flake-compat",
+         "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
+         "type": "github"
+       },
+       "original": {
+         "owner": "edolstra",
+         "repo": "flake-compat",
+         "type": "github"
+       }
+     },
+     "flake-compat_2": {
+       "locked": {
+         "lastModified": 1733328505,
+         "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=",
+         "owner": "edolstra",
+         "repo": "flake-compat",
+         "rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec",
+         "type": "github"
+       },
+       "original": {
+         "owner": "edolstra",
+         "repo": "flake-compat",
+         "type": "github"
+       }
+     },
+     "flake-utils": {
+       "inputs": {
+         "systems": "systems"
+       },
+       "locked": {
+         "lastModified": 1731533236,
+         "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+         "owner": "numtide",
+         "repo": "flake-utils",
+         "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+         "type": "github"
+       },
+       "original": {
+         "owner": "numtide",
+         "repo": "flake-utils",
+         "type": "github"
+       }
+     },
+     "flake-utils_2": {
+       "inputs": {
+         "systems": "systems_2"
+       },
+       "locked": {
+         "lastModified": 1731533236,
+         "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
+         "owner": "numtide",
+         "repo": "flake-utils",
+         "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
+         "type": "github"
+       },
+       "original": {
+         "owner": "numtide",
+         "repo": "flake-utils",
+         "type": "github"
+       }
+     },
+     "hf-nix": {
+       "inputs": {
+         "flake-compat": "flake-compat_2",
+         "flake-utils": "flake-utils_2",
+         "nixpkgs": "nixpkgs"
+       },
+       "locked": {
+         "lastModified": 1747919133,
+         "narHash": "sha256-VvF1naQOvv7yulQ5/cDiaxkNxlh1Y84QMZnderv1szk=",
+         "owner": "huggingface",
+         "repo": "hf-nix",
+         "rev": "9c71e026d6c7c8588ef85a5f7c77f57d598e038c",
+         "type": "github"
+       },
+       "original": {
+         "owner": "huggingface",
+         "repo": "hf-nix",
+         "type": "github"
+       }
+     },
+     "kernel-builder": {
+       "inputs": {
+         "flake-compat": "flake-compat",
+         "flake-utils": "flake-utils",
+         "hf-nix": "hf-nix",
+         "nixpkgs": [
+           "kernel-builder",
+           "hf-nix",
+           "nixpkgs"
+         ]
+       },
+       "locked": {
+         "lastModified": 1748620233,
+         "narHash": "sha256-VULm9HgGXvo3pyfsPy3SOhoqgkuqbGSaSemvzNUbdIU=",
+         "owner": "huggingface",
+         "repo": "kernel-builder",
+         "rev": "da3340e5b3cbb6086600420f4814b033395788d1",
+         "type": "github"
+       },
+       "original": {
+         "owner": "huggingface",
+         "repo": "kernel-builder",
+         "type": "github"
+       }
+     },
+     "nixpkgs": {
+       "locked": {
+         "lastModified": 1747820358,
+         "narHash": "sha256-fTqsZsUX6M3yeEvgyQvXcbGmT2CaRVyVwsi8eK29Oj4=",
+         "owner": "danieldk",
+         "repo": "nixpkgs",
+         "rev": "d3c1681180717528068082103bf323147de6ab0b",
+         "type": "github"
+       },
+       "original": {
+         "owner": "danieldk",
+         "ref": "cudatoolkit-12.9-kernel-builder",
+         "repo": "nixpkgs",
+         "type": "github"
+       }
+     },
+     "root": {
+       "inputs": {
+         "kernel-builder": "kernel-builder"
+       }
+     },
+     "systems": {
+       "locked": {
+         "lastModified": 1681028828,
+         "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+         "owner": "nix-systems",
+         "repo": "default",
+         "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+         "type": "github"
+       },
+       "original": {
+         "owner": "nix-systems",
+         "repo": "default",
+         "type": "github"
+       }
+     },
+     "systems_2": {
+       "locked": {
+         "lastModified": 1681028828,
+         "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+         "owner": "nix-systems",
+         "repo": "default",
+         "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+         "type": "github"
+       },
+       "original": {
+         "owner": "nix-systems",
+         "repo": "default",
+         "type": "github"
+       }
+     }
+   },
+   "root": "root",
+   "version": 7
+ }
flake.nix ADDED
@@ -0,0 +1,17 @@
+ {
+   description = "Flake for triton_moe kernel";
+
+   inputs = {
+     kernel-builder.url = "github:huggingface/kernel-builder";
+   };
+
+   outputs =
+     {
+       self,
+       kernel-builder,
+     }:
+     kernel-builder.lib.genFlakeOutputs {
+       path = ./.;
+       rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
+     };
+ }
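
Note: with Nix installed, `nix build . -L` through the kernel-builder flake is the usual way to build such a kernel locally; this is an assumption based on kernel-builder's standard workflow, as the commit itself documents no build command.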
tests/__init__.py ADDED
File without changes
tests/test_triton_moe.py ADDED
@@ -0,0 +1,9 @@
+ import torch
+ import torch.nn.functional as F
+
+ import triton_moe
+
+
+ # Placeholder left over from the kernel template; `triton_moe` exposes no
+ # `relu` op, so this stays commented out.
+ # def test_relu():
+ #     x = torch.randn(1024, 1024, dtype=torch.float32, device="cuda")
+ #     torch.testing.assert_close(F.relu(x), triton_moe.relu(x))
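
Since the shipped test is only a commented-out template, a minimal smoke test for the kernel that actually landed could look like this sketch (it assumes a CUDA device and that `fused_glu_triton` is importable from the package, as defined in `torch-ext/triton_moe/__init__.py`; the tolerances are illustrative):

```python
import torch

import triton_moe


def test_fused_glu_matches_eager_reference():
    torch.manual_seed(0)
    # (batch, tokens, 2 * intermediate): the kernel splits the last dim in half.
    gate_up = torch.randn(2, 16, 256, dtype=torch.float32, device="cuda")
    alpha = 1.702  # hypothetical scaling constant, for illustration only

    out = triton_moe.fused_glu_triton(gate_up, alpha)

    # Eager reference mirroring the kernel, including its pre-sigmoid clamp:
    # gate * sigmoid(clamp(alpha * gate)) * (up + 1)
    gate, up = gate_up.chunk(2, dim=-1)
    ref = gate * torch.sigmoid((gate * alpha).clamp(-20.0, 20.0)) * (up + 1.0)

    torch.testing.assert_close(out, ref, rtol=1e-5, atol=1e-5)
```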
torch-ext/triton_moe/__init__.py ADDED
@@ -0,0 +1,65 @@
+ import torch
+ import triton
+ import triton.language as tl
+
+ from ._ops import ops
+
+
+ # Triton kernel for fused GLU + scaling operations
+ @triton.jit
+ def fused_glu_kernel(
+     gate_ptr,
+     up_ptr,
+     output_ptr,
+     n_elements,
+     alpha: tl.constexpr,
+     BLOCK_SIZE: tl.constexpr,
+ ):
+     pid = tl.program_id(axis=0)
+     block_start = pid * BLOCK_SIZE
+     offsets = block_start + tl.arange(0, BLOCK_SIZE)
+     mask = offsets < n_elements
+
+     # Load gate and up values - cast to float32 for computation stability
+     gate = tl.load(gate_ptr + offsets, mask=mask).to(tl.float32)
+     up = tl.load(up_ptr + offsets, mask=mask).to(tl.float32)
+
+     # Compute GLU: gate * sigmoid(gate * alpha) * (up + 1)
+     # Clamp scaled_gate to prevent overflow in sigmoid
+     scaled_gate = tl.math.fma(gate, alpha, 0.0)  # gate * alpha
+     scaled_gate = tl.clamp(scaled_gate, -20.0, 20.0)  # Prevent sigmoid overflow
+     sigmoid_gate = tl.sigmoid(scaled_gate)
+     glu = gate * sigmoid_gate
+     result = glu * (up + 1.0)
+
+     # Store result - implicitly cast back to the output pointer's dtype
+     tl.store(output_ptr + offsets, result, mask=mask)
+
+
+ def fused_glu_triton(gate_up_out: torch.Tensor, alpha: float) -> torch.Tensor:
+     batch_size, max_tokens, doubled_dim = gate_up_out.shape
+     gate, up = gate_up_out.chunk(2, dim=-1)
+
+     # Flatten for kernel processing
+     gate_flat = gate.contiguous().view(-1)
+     up_flat = up.contiguous().view(-1)
+     output_flat = torch.empty_like(gate_flat)
+
+     n_elements = gate_flat.numel()
+
+     # Launch Triton kernel
+     grid = (triton.cdiv(n_elements, 1024),)
+     fused_glu_kernel[grid](
+         gate_flat, up_flat, output_flat, n_elements, alpha, BLOCK_SIZE=1024
+     )
+
+     return output_flat.view(batch_size, max_tokens, -1)
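
For quick interactive use, a sketch of the expected call shape (hypothetical sizes; a CUDA device is required since the activation launches through Triton):

```python
import torch

import triton_moe

# Last dim is 2 * intermediate: first half is gate, second half is up.
gate_up = torch.randn(4, 8, 2 * 64, device="cuda", dtype=torch.bfloat16)
out = triton_moe.fused_glu_triton(gate_up, alpha=1.702)
print(out.shape)  # torch.Size([4, 8, 64])
```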
torch-ext/triton_moe/layers.py ADDED
@@ -0,0 +1,118 @@
+ import torch
+ import torch.nn as nn
+
+ from . import fused_glu_triton
+
+
+ class MoE(nn.Module):
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         router_idx: torch.Tensor,
+         router_wt: torch.Tensor,
+         alpha: float,
+         gate_up_weights: torch.Tensor,
+         gate_up_bias: torch.Tensor,
+         down_weights: torch.Tensor,
+         down_bias: torch.Tensor,
+     ):
+         num_tokens, hidden_dim = hidden_states.shape
+         num_experts = gate_up_weights.shape[0]
+
+         # Flatten routing indices and weights
+         flat_idx = router_idx.view(-1)
+         flat_wt = router_wt.view(-1)
+
+         # Create token indices for each routing decision
+         token_idx = (
+             torch.arange(num_tokens, device=hidden_states.device)
+             .unsqueeze(1)
+             .expand(-1, router_idx.shape[1])
+             .reshape(-1)
+         )
+
+         # Filter out invalid routes
+         valid_mask = flat_idx >= 0
+         if not valid_mask.all():
+             flat_idx = flat_idx[valid_mask]
+             flat_wt = flat_wt[valid_mask]
+             token_idx = token_idx[valid_mask]
+
+         if len(flat_idx) == 0:
+             return torch.zeros_like(hidden_states)
+
+         # Count tokens per expert for efficient batching
+         expert_counts = torch.bincount(flat_idx, minlength=num_experts)
+         active_experts = (expert_counts > 0).nonzero().squeeze(-1)
+
+         if len(active_experts) == 0:
+             return torch.zeros_like(hidden_states)
+
+         # Prepare batched tensors
+         max_tokens_per_expert = expert_counts.max().item()
+         batch_size = len(active_experts)
+
+         batched_tokens = torch.zeros(
+             batch_size,
+             max_tokens_per_expert,
+             hidden_dim,
+             device=hidden_states.device,
+             dtype=hidden_states.dtype,
+         )
+         batched_weights = torch.zeros(
+             batch_size,
+             max_tokens_per_expert,
+             device=hidden_states.device,
+             dtype=hidden_states.dtype,
+         )
+         batched_token_indices = torch.full(
+             (batch_size, max_tokens_per_expert),
+             -1,
+             device=hidden_states.device,
+             dtype=torch.long,
+         )
+
+         # Fill batched tensors
+         for i, expert_id in enumerate(active_experts):
+             expert_mask = flat_idx == expert_id
+             expert_token_indices = token_idx[expert_mask]
+             expert_weights = flat_wt[expert_mask]
+             num_expert_tokens = len(expert_token_indices)
+
+             if num_expert_tokens > 0:
+                 batched_tokens[i, :num_expert_tokens] = hidden_states[
+                     expert_token_indices
+                 ]
+                 batched_weights[i, :num_expert_tokens] = expert_weights
+                 batched_token_indices[i, :num_expert_tokens] = expert_token_indices
+
+         # Gate-up projection
+         gate_up_weights = gate_up_weights[active_experts]
+         gate_up_bias = gate_up_bias[active_experts]
+         gate_up_out = torch.bmm(
+             batched_tokens, gate_up_weights
+         ) + gate_up_bias.unsqueeze(1)
+
+         # Triton Fused GLU activation
+         fused = fused_glu_triton(gate_up_out, alpha)
+
+         # Down projection
+         down_weights = down_weights[active_experts]
+         down_bias = down_bias[active_experts]
+         expert_outputs = torch.bmm(fused, down_weights) + down_bias.unsqueeze(1)
+
+         # Apply routing weights and scatter back
+         weighted_outputs = expert_outputs * batched_weights.unsqueeze(-1)
+         output = torch.zeros_like(hidden_states)
+
+         for i in range(batch_size):
+             valid_indices = batched_token_indices[i][batched_token_indices[i] >= 0]
+             if len(valid_indices) > 0:
+                 valid_outputs = weighted_outputs[i, : len(valid_indices)]
+                 output.index_add_(0, valid_indices, valid_outputs)
+
+         return output
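
To make the expected tensor layouts concrete, here is a hedged usage sketch. The sizes and `alpha` are hypothetical, chosen only for illustration; the weight shapes follow from the `torch.bmm` calls above, and a CUDA device is needed for the Triton activation:

```python
import torch

from triton_moe.layers import MoE

# Hypothetical sizes for illustration.
num_tokens, hidden_dim, intermediate = 8, 64, 128
num_experts, top_k = 4, 2
dev = "cuda"

hidden_states = torch.randn(num_tokens, hidden_dim, device=dev)
router_idx = torch.randint(0, num_experts, (num_tokens, top_k), device=dev)
router_wt = torch.softmax(torch.randn(num_tokens, top_k, device=dev), dim=-1)

# The bmm calls in forward() imply these layouts:
#   gate_up_weights: (num_experts, hidden_dim, 2 * intermediate)
#   down_weights:    (num_experts, intermediate, hidden_dim)
gate_up_weights = torch.randn(num_experts, hidden_dim, 2 * intermediate, device=dev)
gate_up_bias = torch.zeros(num_experts, 2 * intermediate, device=dev)
down_weights = torch.randn(num_experts, intermediate, hidden_dim, device=dev)
down_bias = torch.zeros(num_experts, hidden_dim, device=dev)

moe = MoE()
out = moe(
    hidden_states, router_idx, router_wt, 1.702,
    gate_up_weights, gate_up_bias, down_weights, down_bias,
)
assert out.shape == hidden_states.shape
```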