pfnet
/

plamo-2-1b

@@ -30,7 +30,7 @@
     "mamba_num_heads": 32,
     "mamba_step": 2,
     "max_position_embeddings": 10485760,
-    "model_type": "plamo",
     "n_expert": null,
     "num_attention_heads": 16,
     "num_hidden_layers": 16,

     "mamba_num_heads": 32,
     "mamba_step": 2,
     "max_position_embeddings": 10485760,
+    "model_type": "plamo2",
     "n_expert": null,
     "num_attention_heads": 16,
     "num_hidden_layers": 16,

modeling_plamo.py CHANGED Viewed

@@ -551,6 +551,68 @@ def _ssd_chunk_scan_combined_naive(
     return torch.cat(ys, dim=1), ssm_state
 def ssd_chunk_scan_combined(
     x: torch.Tensor,
     dt: torch.Tensor,
@@ -587,19 +649,19 @@ def ssd_chunk_scan_combined(
     To avoid updating state, we set dt to -inf and x to 0
     because `softplus(-inf) = 0` and `exp(0) = 1`
     """
-    if dt.is_cuda:
-        pad = (chunk_size - length % chunk_size) % chunk_size
-        x = torch.nn.functional.pad(x, pad=[0, 0, 0, 0, pad, 0], value=0.0)
-        dt = torch.nn.functional.pad(dt, pad=[0, 0, pad, 0], value=float("-inf"))
-        B = torch.nn.functional.pad(B, pad=[0, 0, 0, 0, pad, 0], value=0.0)
-        C = torch.nn.functional.pad(C, pad=[0, 0, 0, 0, pad, 0], value=0.0)
-        z = torch.nn.functional.pad(z, pad=[0, 0, 0, 0, pad, 0], value=0.0)
-        if seq_idx is not None:
-            seq_idx = torch.nn.functional.pad(seq_idx, pad=[pad, 0], value=0)
-        length = x.shape[1]
-        assert length % chunk_size == 0, (length, chunk_size)
         dtype = _get_trition_dtype(x.dtype)
         out = mamba_ssm.ops.triton.ssd_combined.mamba_chunk_scan_combined(  # type: ignore
             x.to(dtype),
@@ -622,19 +684,75 @@ def ssd_chunk_scan_combined(
             assert isinstance(out, torch.Tensor)
             return out[:, pad:]
     else:
-        if ssm_state is None:
-            bsize, _, num_heads, channel = x.shape
-            state = B.shape[-1]
-            ssm_state = torch.zeros(bsize, num_heads, channel, state, dtype=torch.float32, device=x.device)
-        tmp = _ssd_chunk_scan_combined_naive(
-            x, dt, A, B, C, D, z=z, dt_bias=dt_bias, dt_softplus=dt_softplus, seq_idx=seq_idx, ssm_state=ssm_state
-        )
         if return_final_states:
             return tmp
         else:
             return tmp[0]
 def _causal_conv1d(
     conv_state: torch.Tensor | None, weight: torch.Tensor, x: torch.Tensor, seq_idx: torch.Tensor | None
 ) -> tuple[torch.Tensor, torch.Tensor | None]:
@@ -670,52 +788,27 @@ def _causal_conv1d(
         else:
             x = tmp
     else:
-        if conv_state is None:
-            bsize = x.shape[0]
-            dim = weight.shape[0]
-            d_conv = weight.shape[-1]
-            conv_state = torch.zeros(bsize, dim, d_conv - 1, dtype=x.dtype, device=x.device)
-        length = x.shape[-1]
-        out = torch.zeros_like(x)
-        for i in range(length):
-            if i != 0 and seq_idx is not None:
-                conv_state = torch.where(
-                    (seq_idx[:, i - 1] != seq_idx[:, i])[:, None, None],
-                    torch.zeros_like(conv_state),
-                    conv_state,
-                )
-            out[:, :, i : i + 1], conv_state = _causal_conv1d_update(conv_state, weight, x[:, :, i : i + 1])
-        x = out
     if return_final_states:
         return x, conv_state
     else:
         return x, None
-def _causal_conv1d_update(
-    conv_state: torch.Tensor, weight: torch.Tensor, xBC: torch.Tensor
-) -> tuple[torch.Tensor, torch.Tensor]:
-    dtype = conv_state.dtype
-    xBC = xBC.to(dtype)
-    weight = weight.to(dtype)
-    if conv_state.is_cuda:
-        x = causal_conv1d.causal_conv1d_update(
-            x=xBC,
-            conv_state=conv_state,
-            weight=weight[:, 0, :],
-            activation="silu",
-        )
-        return x, conv_state
-    else:
-        x = causal_conv1d.causal_conv1d_update_ref(
-            x=xBC,
-            conv_state=conv_state,
-            weight=weight[:, 0, :],
-            activation="silu",
-        )
-        return x, conv_state
 class Mamba(torch.nn.Module):
     def __init__(self, config: PlamoConfig, layer_idx: int) -> None:
         super().__init__()

     return torch.cat(ys, dim=1), ssm_state
+def _ssd_chunk_scan_combined_cpu(
+    x: torch.Tensor,
+    dt: torch.Tensor,
+    A: torch.Tensor,
+    B: torch.Tensor,
+    C: torch.Tensor,
+    chunk_size: int,
+    D: torch.Tensor,
+    z: torch.Tensor,
+    dt_bias: torch.Tensor,
+    dt_softplus: bool,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    # (bsize, nhead, nchunk, chunk_size)
+    dt = dt.float()  # We want high precision for this before cumsum
+    dt = dt.permute(0, 2, 1).unflatten(2, (-1, chunk_size))  # type: ignore
+    if dt_bias is not None:
+        dt = dt + dt_bias[None, :, None, None]
+    if dt_softplus:
+        dt = F.softplus(dt)
+    dA = dt * A[None, :, None, None]
+    dA_cumsum = torch.cumsum(dA, dim=-1)
+    _, _, nheads, _ = x.shape
+    dstate = B.shape[-1]
+    _ = dt.shape[2]
+    with torch.profiler.record_function("ssd_chunk_scan_combined_cpu_chunk_state"):
+        # Following is equivalent to `mamba_ssm.ops.triton.ssd_combined.chunk_state_ref(B, x, dt, dA_cumsum)`
+        # But `einsum` in the above function is too slow in CPU.
+        x_ = torch.unflatten(x, 1, (-1, chunk_size))
+        assert B.shape[2] == nheads  # B should be already expanded
+        B_ = torch.unflatten(B, 1, (-1, chunk_size)).to(x.dtype)  # (bsize, nchunk, chunk_size, nheads, dstate)
+        decay_states = torch.exp((dA_cumsum[:, :, :, -1:] - dA_cumsum)).to(x.dtype)
+        dt_ = dt.to(x.dtype)
+        # einsum("bclhn,bhcl,bhcl,bclhp->bchpn", B_, decay_states, dt_, x_)
+        B_ = B_.permute(0, 1, 3, 4, 2)  # bchnl
+        tmp = dt_ * decay_states  # bhcl
+        tmp = tmp.permute(0, 2, 1, 3)[:, :, :, None]  # bch1l
+        tmp = B_ * tmp  # bchnl
+        x_ = x_.permute(0, 1, 3, 2, 4)  # bchlp
+        tmp = tmp @ x_  # bchnp
+        states = tmp.permute(0, 1, 2, 4, 3)  # bchpn
+    states_dtype = states.dtype
+    if states.dtype not in [torch.float32, torch.float64]:
+        states = states.to(torch.float32)
+    with torch.profiler.record_function("ssd_chunk_scan_combined_cpu_state_passing"):
+        out, last_state = mamba_ssm.ops.triton.ssd_combined.state_passing_ref(
+            states.flatten(start_dim=-2, end_dim=-1),
+            dA_cumsum[:, :, :, -1],
+        )
+    states = torch.unflatten(out, -1, (-1, dstate))
+    last_state = torch.unflatten(last_state, -1, (-1, dstate))
+    states = states.to(states_dtype)
+    with torch.profiler.record_function("ssd_chunk_scan_combined_cpu_chunk_scan"):
+        out = mamba_ssm.ops.triton.ssd_combined.chunk_scan_ref(B, C, x, dt, dA_cumsum, states, D=D, z=z)
+    return out, last_state
+@torch.profiler.record_function("ssd_chunk_scan_combined")
 def ssd_chunk_scan_combined(
     x: torch.Tensor,
     dt: torch.Tensor,
     To avoid updating state, we set dt to -inf and x to 0
     because `softplus(-inf) = 0` and `exp(0) = 1`
     """
+    pad = (chunk_size - length % chunk_size) % chunk_size
+    x = torch.nn.functional.pad(x, pad=[0, 0, 0, 0, pad, 0], value=0.0)
+    dt = torch.nn.functional.pad(dt, pad=[0, 0, pad, 0], value=float("-inf"))
+    B = torch.nn.functional.pad(B, pad=[0, 0, 0, 0, pad, 0], value=0.0)
+    C = torch.nn.functional.pad(C, pad=[0, 0, 0, 0, pad, 0], value=0.0)
+    z = torch.nn.functional.pad(z, pad=[0, 0, 0, 0, pad, 0], value=0.0)
+    if seq_idx is not None:
+        seq_idx = torch.nn.functional.pad(seq_idx, pad=[pad, 0], value=0)
+    length = x.shape[1]
+    assert length % chunk_size == 0, (length, chunk_size)
+    if dt.is_cuda:
         dtype = _get_trition_dtype(x.dtype)
         out = mamba_ssm.ops.triton.ssd_combined.mamba_chunk_scan_combined(  # type: ignore
             x.to(dtype),
             assert isinstance(out, torch.Tensor)
             return out[:, pad:]
     else:
+        if ssm_state is None and seq_idx is None:
+            tmp = _ssd_chunk_scan_combined_cpu(
+                x,
+                dt,
+                A,
+                B,
+                C,
+                chunk_size,
+                D=D,
+                z=z,
+                dt_bias=dt_bias.float(),
+                dt_softplus=dt_softplus,
+            )
+        else:
+            if ssm_state is None:
+                bsize, _, num_heads, channel = x.shape
+                state = B.shape[-1]
+                ssm_state = torch.zeros(bsize, num_heads, channel, state, dtype=torch.float32, device=x.device)
+            tmp = _ssd_chunk_scan_combined_naive(
+                x, dt, A, B, C, D, z=z, dt_bias=dt_bias, dt_softplus=dt_softplus, seq_idx=seq_idx, ssm_state=ssm_state
+            )
+        tmp = (tmp[0][:, pad:], tmp[1])
         if return_final_states:
             return tmp
         else:
             return tmp[0]
+def _causal_conv1d_update(
+    conv_state: torch.Tensor, weight: torch.Tensor, xBC: torch.Tensor
+) -> tuple[torch.Tensor, torch.Tensor]:
+    dtype = conv_state.dtype
+    xBC = xBC.to(dtype)
+    weight = weight.to(dtype)
+    if conv_state.is_cuda:
+        x = causal_conv1d.causal_conv1d_update(
+            x=xBC,
+            conv_state=conv_state,
+            weight=weight[:, 0, :],
+            activation="silu",
+        )
+        return x, conv_state
+    else:
+        x = causal_conv1d.causal_conv1d_update_ref(
+            x=xBC,
+            conv_state=conv_state,
+            weight=weight[:, 0, :],
+            activation="silu",
+        )
+        return x, conv_state
+def _causal_conv1d_naive(
+    conv_state: torch.Tensor, weight: torch.Tensor, x: torch.Tensor, seq_idx: torch.Tensor | None
+) -> tuple[torch.Tensor, torch.Tensor]:
+    length = x.shape[-1]
+    out = torch.zeros_like(x)
+    for i in range(length):
+        if i != 0 and seq_idx is not None:
+            conv_state = torch.where(
+                (seq_idx[:, i - 1] != seq_idx[:, i])[:, None, None],
+                torch.zeros_like(conv_state),
+                conv_state,
+            )
+        out[:, :, i : i + 1], conv_state = _causal_conv1d_update(conv_state, weight, x[:, :, i : i + 1])
+    return out, conv_state
+@torch.profiler.record_function("causal_conv1d")
 def _causal_conv1d(
     conv_state: torch.Tensor | None, weight: torch.Tensor, x: torch.Tensor, seq_idx: torch.Tensor | None
 ) -> tuple[torch.Tensor, torch.Tensor | None]:
         else:
             x = tmp
     else:
+        if seq_idx is None:
+            x, conv_state = causal_conv1d.causal_conv1d_ref(
+                x=x,
+                initial_states=conv_state,
+                return_final_states=True,
+                weight=weight[:, 0, :],
+                activation="silu",
+            )
+        else:
+            if conv_state is None:
+                bsize = x.shape[0]
+                dim = weight.shape[0]
+                d_conv = weight.shape[-1]
+                conv_state = torch.zeros(bsize, dim, d_conv - 1, dtype=x.dtype, device=x.device)
+            x, conv_state = _causal_conv1d_naive(conv_state, weight, x, seq_idx)
     if return_final_states:
         return x, conv_state
     else:
         return x, None
 class Mamba(torch.nn.Module):
     def __init__(self, config: PlamoConfig, layer_idx: int) -> None:
         super().__init__()