mlc-ai
diff --git a/‎python/mlc_chat/interface/convert_weight.py‎
Lines changed: 4 additions & 1 deletion b/‎python/mlc_chat/interface/convert_weight.py‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎python/mlc_chat/model/gpt2/gpt2_model.py‎
Lines changed: 7 additions & 21 deletions b/‎python/mlc_chat/model/gpt2/gpt2_model.py‎
Lines changed: 7 additions & 21 deletions
diff --git a/‎python/mlc_chat/model/llama/llama_model.py‎
Lines changed: 2 additions & 2 deletions b/‎python/mlc_chat/model/llama/llama_model.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎python/mlc_chat/model/mistral/mistral_model.py‎
Lines changed: 1 addition & 1 deletion b/‎python/mlc_chat/model/mistral/mistral_model.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎python/mlc_chat/model/mixtral/__init__.py‎ b/‎python/mlc_chat/model/mixtral/__init__.py‎
diff --git a/‎python/mlc_chat/model/mixtral/mixtral_loader.py‎
Lines changed: 129 additions & 0 deletions b/‎python/mlc_chat/model/mixtral/mixtral_loader.py‎
Lines changed: 129 additions & 0 deletions
diff --git a/‎python/mlc_chat/model/mixtral/mixtral_model.py‎
Lines changed: 174 additions & 0 deletions b/‎python/mlc_chat/model/mixtral/mixtral_model.py‎
Lines changed: 174 additions & 0 deletions
@@ -51,7 +51,10 @@ def _device_to_str(device: Device) -> str:
 
 
 def _calc_total_params(model: nn.Module) -> int:
-    _, named_params, _ = model.export_tvm(spec=model.get_default_spec(), allow_extern=True)
+    _, named_params, _ = model.export_tvm(  # type: ignore[misc]
+        spec=model.get_default_spec(),  # type: ignore[attr-defined]
+        allow_extern=True,
+    )
     total_params = 0
     for _, param in named_params:
         total_params += math.prod(param.shape)
 
@@ -3,13 +3,13 @@
 TODO: add docstring
 """
 import dataclasses
-import math
 from typing import Any, Dict, Optional
 
 from tvm import te, tir
 from tvm.relax.frontend import nn
 from tvm.relax.frontend.nn import Tensor, op
 
+from mlc_chat import op as op_ext
 from mlc_chat.support import logging
 from mlc_chat.support.config import ConfigBase
 from mlc_chat.support.style import bold
@@ -110,29 +110,15 @@ def forward(
 
         self.k_cache.append(op.squeeze(k, axis=0))
         self.v_cache.append(op.squeeze(v, axis=0))
-        k = op.reshape(self.k_cache.view(t), (b, t, h, d))
-        v = op.reshape(self.v_cache.view(t), (b, t, h, d))
-
-        q = q.permute_dims([0, 2, 1, 3])  # [b, h, s, d]
-        k = k.permute_dims([0, 2, 1, 3])  # [b, h, t, d]
-        v = v.permute_dims([0, 2, 1, 3])  # [b, h, t, d]
-
-        attn_weights = op.matmul(
-            q, k.permute_dims([0, 1, 3, 2])  # [b, h, s, d] x [b, h, d, t] = [b, h, s, t]
-        ) / math.sqrt(d)
+        k = self.k_cache.view(t)
+        v = self.v_cache.view(t)
 
         if self.scale_attn_by_inverse_layer_idx:
-            attn_weights = attn_weights / float(self.layer_idx + 1)
-
-        dtype = attn_weights.dtype
-        attn_weights = attn_weights.maximum(tir.min_value(dtype)).minimum(attention_mask)
-        if dtype == "float32":
-            attn_weights = op.softmax(attn_weights, axis=-1)
+            attn_score_scaling_factor = 1.0 / float(self.layer_idx + 1)
         else:
-            attn_weights = op.softmax(attn_weights.astype("float32"), axis=-1).astype(dtype)
-        # [b, h, s, t] x [b, h, t, d] => [b, h, s, d] => [b, s, h, d]
-        output = op.matmul(attn_weights, v)
-        return self.c_proj(output.permute_dims([0, 2, 1, 3]).reshape((b, s, h * d)))
+            attn_score_scaling_factor = 1.0
+        output = op_ext.attention(q, k, v, attention_mask, attn_score_scaling_factor)
+        return self.c_proj(output)
 
 
 class GPT2MLP(nn.Module):
 
@@ -10,7 +10,7 @@
 from tvm.relax.frontend.nn import Tensor, op
 
 from mlc_chat import op as op_ext
-from mlc_chat.nn.kv_cache import FlashInferPagedKVCache, PagedKVCache
+from mlc_chat.nn import FlashInferPagedKVCache, PagedKVCache
 from mlc_chat.support import logging
 from mlc_chat.support import tensor_parallel as tp
 from mlc_chat.support.config import ConfigBase
@@ -342,7 +342,7 @@ def create_flashinfer_paged_kv_cache(
         num_kv_heads = self.num_key_value_heads // self.tensor_parallel_shards
         # Note: Right now we only have FlashInfer-based KV cache supported.
         # TIR version will be introduced soon.
-        return FlashInferPagedKVCache.create(
+        return FlashInferPagedKVCache(
             max_batch_size=max_batch_size,
             max_total_seq_len=max_total_seq_len,
             page_size=page_size,
 
@@ -358,7 +358,7 @@ def __init__(self, config: MistralConfig):
             [MistralDecoderLayer(config, rotary_embedding) for _ in range(config.num_hidden_layers)]
         )
         self.norm = nn.RMSNorm(config.hidden_size, -1, config.rms_norm_eps, bias=False)
-        self.tensor_parallel_shards = config.tensor_parallel_shards > 1
+        self.tensor_parallel_shards = config.tensor_parallel_shards
 
     def forward(  # pylint: disable=too-many-arguments
         self,
 
@@ -0,0 +1,129 @@
+"""
+This file specifies how MLC's Mixtral parameter maps from other formats, for example HuggingFace
+PyTorch, HuggingFace safetensors.
+"""
+import functools
+
+import numpy as np
+
+from mlc_chat.loader import ExternMapping
+from mlc_chat.quantization import Quantization
+
+from .mixtral_model import MixtralConfig, MixtralForCasualLM
+
+
+def huggingface(model_config: MixtralConfig, quantization: Quantization) -> ExternMapping:
+    """Returns a parameter mapping that maps from the names of MLC LLM parameters to
+    the names of HuggingFace PyTorch parameters.
+
+    Parameters
+    ----------
+    model_config : MixtralConfig
+        The configuration of the Mixtral model.
+
+    quantization : Quantization
+        The quantization configuration; if not None, the model is cast to its model_dtype.
+
+    Returns
+    -------
+    param_map : ExternMapping
+        The parameter mapping from MLC to HuggingFace PyTorch.
+    """
+    model = MixtralForCasualLM(model_config)
+    if quantization is not None:
+        model.to(quantization.model_dtype)
+    _, _named_params, _ = model.export_tvm(  # type: ignore[misc]
+        spec=model.get_default_spec(),
+        allow_extern=True,
+    )
+    named_parameters = dict(_named_params)
+
+    mapping = ExternMapping()
+
+    for i in range(model_config.num_hidden_layers):
+        # Fuse q_proj / k_proj / v_proj into qkv_proj by concatenating along axis 0
+        attn = f"model.layers.{i}.self_attn"
+        mlc_name = f"{attn}.qkv_proj.weight"
+        mlc_param = named_parameters[mlc_name]
+        mapping.add_mapping(
+            mlc_name,
+            [
+                f"{attn}.q_proj.weight",
+                f"{attn}.k_proj.weight",
+                f"{attn}.v_proj.weight",
+            ],
+            functools.partial(
+                lambda q, k, v, dtype: np.concatenate([q, k, v], axis=0).astype(dtype),
+                dtype=mlc_param.dtype,
+            ),
+        )
+
+        # Fuse every expert's w1 and w3 weights into the single stacked e1_e3 parameter
+        mlp = f"model.layers.{i}.block_sparse_moe"
+        mlc_mlp = f"model.layers.{i}.moe"
+        mlc_name = f"{mlc_mlp}.e1_e3.weight"
+        mlc_param = named_parameters[mlc_name]
+
+        def combine_expert_gate_up(*hf_params, dtype):  # hf_params: w1, w3 per expert, in order
+            stack = []
+            for i in range(0, len(hf_params), 2):  # local i (shadows the layer index here only)
+                stack.append(np.concatenate([hf_params[i], hf_params[i + 1]], axis=0))
+            return np.stack(stack, axis=0).astype(dtype)
+
+        mapping.add_mapping(
+            mlc_name,
+            functools.reduce(  # flattens the per-expert [w1, w3] pairs into one list
+                lambda a, b: a + b,
+                [
+                    [
+                        f"{mlp}.experts.{expert_id}.w1.weight",
+                        f"{mlp}.experts.{expert_id}.w3.weight",
+                    ]
+                    for expert_id in range(model_config.num_local_experts)
+                ],
+            ),
+            functools.partial(
+                combine_expert_gate_up,
+                dtype=mlc_param.dtype,
+            ),
+        )
+
+        mlc_name = f"{mlc_mlp}.e2.weight"  # every expert's w2, stacked on a new leading axis
+        mlc_param = named_parameters[mlc_name]
+        mapping.add_mapping(
+            mlc_name,
+            [
+                f"{mlp}.experts.{expert_id}.w2.weight"
+                for expert_id in range(model_config.num_local_experts)
+            ],
+            functools.partial(
+                lambda *hf_params, dtype: np.stack(hf_params, axis=0).astype(dtype),
+                dtype=mlc_param.dtype,
+            ),
+        )
+
+        mlc_name = f"{mlc_mlp}.gate.weight"  # gate weight: renamed and cast only
+        mlc_param = named_parameters[mlc_name]
+        mapping.add_mapping(
+            mlc_name,
+            [f"{mlp}.gate.weight"],
+            functools.partial(
+                lambda x, dtype: x.astype(dtype),
+                dtype=mlc_param.dtype,
+            ),
+        )
+
+        # inv_freq is not used in the model
+        mapping.add_unused(f"{attn}.rotary_emb.inv_freq")
+
+    for mlc_name, mlc_param in named_parameters.items():  # the rest map 1:1 under the same name
+        if mlc_name not in mapping.param_map:
+            mapping.add_mapping(
+                mlc_name,
+                [mlc_name],
+                functools.partial(
+                    lambda x, dtype: x.astype(dtype),
+                    dtype=mlc_param.dtype,
+                ),
+            )
+    return mapping
@@ -0,0 +1,174 @@
+"""Implementation for Mixtral architecture."""
+import dataclasses
+
+from tvm import tir
+from tvm.relax.frontend import nn
+from tvm.relax.frontend.nn import Tensor, op
+
+from mlc_chat import op as op_ext
+from mlc_chat.model.mistral.mistral_model import (
+    MistralAttention,
+    MistralConfig,
+    MistralForCasualLM,
+    MistralModel,
+    RotaryEmbedding,
+)
+from mlc_chat.nn.expert import MixtralExperts
+from mlc_chat.support import logging
+from mlc_chat.support import tensor_parallel as tp
+
+logger = logging.getLogger(__name__)
+
+
+@dataclasses.dataclass
+class MixtralConfig(MistralConfig):  # pylint: disable=too-many-instance-attributes
+    """Configuration of the Mixtral model: MistralConfig plus the two MoE sizes below."""
+
+    num_local_experts: int = 0  # total number of experts in each MoE block
+    num_experts_per_tok: int = 0  # number of experts activated per token (top-k)
+
+
+# pylint: disable=invalid-name,missing-docstring,too-many-locals,fixme
+
+
+class MixtralMoE(nn.Module):
+    """Mixture of experts: picks top-k experts per token and sums their weighted outputs."""
+
+    def __init__(self, config: MixtralConfig):
+        super().__init__()
+        self.num_experts_per_tok = config.num_experts_per_tok
+        self.num_local_experts = config.num_local_experts
+        self.intermediate_size = config.intermediate_size // config.tensor_parallel_shards
+        self.gate = nn.Linear(
+            in_features=config.hidden_size,
+            out_features=config.num_local_experts,
+            bias=False,
+        )
+        self.e1_e3 = MixtralExperts(  # fused projection; its output is split in two in forward
+            self.num_local_experts,
+            in_features=config.hidden_size,
+            out_features=2 * self.intermediate_size,
+        )
+        self.e2 = MixtralExperts(  # maps the intermediate size back to hidden_size
+            self.num_local_experts,
+            in_features=self.intermediate_size,
+            out_features=config.hidden_size,
+        )
+        self.dtype = "float32"  # dtype the softmaxed expert weights are cast back to
+
+    def forward(self, x: Tensor):
+        def _expert_forward(x: Tensor, indptr: Tensor):
+            x1_x3 = self.e1_e3(x, indptr)
+            x1, x3 = op.split(x1_x3, indices_or_sections=2, axis=-1)
+            x = self.e2(op.silu(x1) * x3, indptr)
+            return x
+
+        experts_per_tok = self.num_experts_per_tok  # activated experts per token
+        local_experts = self.num_local_experts  # total number of experts
+        batch_size, seq_len, hidden_size = x.shape
+        num_tokens = batch_size * seq_len
+        x = x.reshape(num_tokens, hidden_size)
+        # gate: [num_tokens, local_experts]
+        gate: Tensor = self.gate(x)
+        # expert_weights: [num_tokens, experts_per_tok]
+        # expert_indices: [num_tokens, experts_per_tok]
+        expert_weights, expert_indices = op_ext.moe_misc.topk(gate, experts_per_tok)
+        expert_weights = op.softmax(expert_weights.astype("float32"), axis=-1).astype(self.dtype)
+        if num_tokens == 1:  # single token: expert_indices is passed in place of an indptr
+            # x: [num_tokens * experts_per_tok, hidden_size]
+            x = _expert_forward(x, expert_indices)
+        else:
+            # cumsum: [num_tokens * total_experts]
+            cumsum = op_ext.moe_misc.moe_cumsum(expert_indices, local_experts)
+            # indices: [num_tokens * experts_per_tok]
+            indices = op_ext.moe_misc.get_indices(cumsum, expert_indices)
+            # indptr: [num_local_experts + 1]
+            indptr = op_ext.moe_misc.get_indptr(cumsum, local_experts)
+            # x: [num_tokens * experts_per_tok, hidden_size]
+            x = op.take(x, indices / experts_per_tok, axis=0)  # NOTE(review): confirm int division
+            x = _expert_forward(x, indptr)
+            x = op_ext.moe_misc.scatter_output(x, indices)
+        # x: [num_tokens, experts_per_tok, hidden_size]
+        x = x.reshape(  # pylint: disable=too-many-function-args
+            num_tokens, experts_per_tok, hidden_size
+        ) * expert_weights.reshape(  # pylint: disable=too-many-function-args
+            num_tokens, experts_per_tok, 1
+        )
+        # x: [num_tokens, hidden_size]
+        x = op_ext.moe_misc.moe_sum(x, dim=1)
+        x = x.reshape(batch_size, seq_len, hidden_size)  # pylint: disable=too-many-function-args
+        return x
+
+
+class MixtralDecoderLayer(nn.Module):
+    """Mixtral decoder layer: RMSNorm + self-attention, then RMSNorm + MoE, each with residual."""
+
+    def __init__(self, config: MixtralConfig, rotary_embedding: RotaryEmbedding):
+        eps = config.rms_norm_eps  # NOTE(review): unlike MixtralMoE, no super().__init__() here
+        self.self_attn = MistralAttention(config, rotary_embedding)
+        self.moe = MixtralMoE(config)
+        self.input_layernorm = nn.RMSNorm(config.hidden_size, -1, eps, bias=False)
+        self.post_attention_layernorm = nn.RMSNorm(config.hidden_size, -1, eps, bias=False)
+
+        def _set_tp():  # attaches a "shard_strategy" hint to each weight that is sharded
+            def _set(layer, hint):
+                layer.weight.attrs["shard_strategy"] = hint
+
+            hd = config.head_dim
+            q = self.self_attn.num_q_heads * hd
+            k = self.self_attn.num_kv_heads * hd
+            v = self.self_attn.num_kv_heads * hd
+            i = self.moe.intermediate_size  # already divided by tensor_parallel_shards
+            _set(self.self_attn.qkv_proj, tp.ShardSingleDim("_shard_qkv", segs=[q, k, v], dim=0))
+            _set(self.self_attn.o_proj, tp.ShardSingleDim("_shard_o", dim=1))
+            _set(self.moe.e1_e3, tp.ShardSingleDim("_shard_mlp_up", segs=[i, i], dim=1))
+            _set(self.moe.e2, tp.ShardSingleDim("_shard_mlp_down", dim=2))
+
+        self.tensor_parallel_shards = config.tensor_parallel_shards
+        _set_tp()
+
+    def forward(  # pylint: disable=too-many-arguments
+        self,
+        hidden_states: Tensor,
+        attention_mask: Tensor,
+        rolling_cache_len: tir.Var,
+        kv_seq_len: tir.Var,
+        cache_offset: tir.Var,
+    ):
+        """Forward pass of a decoder layer: attention, then MoE, each with a residual connection."""
+
+        def _apply_residual(out, residual):
+            if self.tensor_parallel_shards > 1:  # pre-scale residual for the "sum" reduce
+                return op.ccl_allreduce(out + residual / self.tensor_parallel_shards, "sum")
+            return out + residual
+
+        out = self.self_attn(
+            self.input_layernorm(hidden_states),
+            attention_mask,
+            rolling_cache_len,
+            kv_seq_len,
+            cache_offset,
+        )
+        hidden_states = _apply_residual(out, residual=hidden_states)
+        out = self.moe(self.post_attention_layernorm(hidden_states))
+        hidden_states = _apply_residual(out, residual=hidden_states)
+        return hidden_states
+
+
+class MixtralModel(MistralModel):
+    """MistralModel whose `layers` are replaced by a list of MixtralDecoderLayer."""
+
+    def __init__(self, config: MixtralConfig):
+        super().__init__(config)  # NOTE(review): presumably builds layers that are replaced below
+        rotary_embedding = RotaryEmbedding(config)
+        self.layers = nn.ModuleList(
+            [MixtralDecoderLayer(config, rotary_embedding) for _ in range(config.num_hidden_layers)]
+        )
+
+
+class MixtralForCasualLM(MistralForCasualLM):
+    """MistralForCasualLM whose inner `model` is replaced by a MixtralModel."""
+
+    def __init__(self, config: MixtralConfig):
+        super().__init__(config)
+        self.model = MixtralModel(config)  # overrides the model set up by the base constructor
Original file line number	Diff line number	Diff line change
`@@ -358,7 +358,7 @@ def __init__(self, config: MistralConfig):`
`358`	`358`	`[MistralDecoderLayer(config, rotary_embedding) for _ in range(config.num_hidden_layers)]`
`359`	`359`	`)`
`360`	`360`	`self.norm = nn.RMSNorm(config.hidden_size, -1, config.rms_norm_eps, bias=False)`
`361`		`- self.tensor_parallel_shards = config.tensor_parallel_shards > 1`
	`361`	`+ self.tensor_parallel_shards = config.tensor_parallel_shards`
`362`	`362`
`363`	`363`	`def forward( # pylint: disable=too-many-arguments`
`364`	`364`	`self,`