linzheng committed on
Commit 474addc · verified · 1 Parent(s): ceadb38

Upload EvaByteForCausalLM

README.md ADDED
@@ -0,0 +1,199 @@
1
+ ---
2
+ library_name: transformers
3
+ tags: []
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+ This is the model card of a 🤗 transformers model that has been pushed to the Hub. This model card has been automatically generated.
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
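+ A minimal loading sketch (an assumption, not an official recipe): the repository id below is a hypothetical placeholder, a tokenizer is assumed to be available in the same repository, and `trust_remote_code=True` is assumed to be needed because the config registers the custom `EvaByteConfig`/`EvaByteForCausalLM` classes via `auto_map`.
+
+ ```python
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ repo_id = "your-org/EvaByte"  # hypothetical placeholder; replace with the actual Hub repo id
+ tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained(
+     repo_id,
+     torch_dtype=torch.bfloat16,   # matches `torch_dtype` in config.json
+     trust_remote_code=True,       # required for the custom EvaByte classes
+ ).eval()
+
+ inputs = tokenizer("Hello, world!", return_tensors="pt")
+ outputs = model.generate(**inputs, max_new_tokens=32)
+ print(tokenizer.decode(outputs[0]))
+ ```
+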
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
config.json ADDED
@@ -0,0 +1,48 @@
1
+ {
2
+ "_name_or_path": null,
3
+ "architectures": [
4
+ "EvaByteForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_class": "eva",
8
+ "attention_dropout": 0.0,
9
+ "auto_map": {
10
+ "AutoConfig": "configuration_evabyte.EvaByteConfig",
11
+ "AutoModelForCausalLM": "modeling_evabyte.EvaByteForCausalLM"
12
+ },
13
+ "bos_token_id": 1,
14
+ "chunk_size": 16,
15
+ "eos_token_id": 11,
16
+ "fp32_ln": false,
17
+ "fp32_logits": true,
18
+ "fp32_skip_add": true,
19
+ "hidden_act": "silu",
20
+ "hidden_size": 4096,
21
+ "init_cutoff_factor": null,
22
+ "init_fn": "v2",
23
+ "init_std": 0.01275,
24
+ "initializer_range": 0.01275,
25
+ "intermediate_size": 11008,
26
+ "lazy_init": true,
27
+ "max_position_embeddings": 32768,
28
+ "max_seq_length": 32768,
29
+ "mixedp_attn": true,
30
+ "model_type": "evabyte",
31
+ "norm_add_unit_offset": true,
32
+ "num_attention_heads": 32,
33
+ "num_chunks": null,
34
+ "num_hidden_layers": 32,
35
+ "num_key_value_heads": 32,
36
+ "num_pred_heads": 8,
37
+ "pad_token_id": 0,
38
+ "return_dict": false,
39
+ "rms_norm_eps": 1e-05,
40
+ "rope_scaling": null,
41
+ "rope_theta": 100000,
42
+ "tie_word_embeddings": false,
43
+ "torch_dtype": "bfloat16",
44
+ "transformers_version": "4.47.1",
45
+ "use_cache": true,
46
+ "vocab_size": 320,
47
+ "window_size": 2048
48
+ }
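For orientation, the EVA-specific fields above fix how attention is tiled: each window spans `window_size` tokens and is summarized into `window_size // chunk_size` chunk-level states (the `chunks_per_window` value computed in `eva.py`). A small sketch of the derived layout, using only values from this config:

```python
# Derived attention layout from config.json (values copied from above).
window_size = 2048
chunk_size = 16
max_seq_length = 32768

chunks_per_window = window_size // chunk_size   # 128 chunk-level RFA states per window
num_windows = max_seq_length // window_size     # 16 windows at the maximum sequence length
num_chunks = max_seq_length // chunk_size       # 2048 chunk summaries at the maximum sequence length
print(chunks_per_window, num_windows, num_chunks)
```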
configuration_evabyte.py ADDED
@@ -0,0 +1,99 @@
1
+ """ EvaByte configuration"""
2
+
3
+ from transformers.configuration_utils import PretrainedConfig
4
+
5
+ class EvaByteConfig(PretrainedConfig):
6
+ model_type = "evabyte"
7
+ keys_to_ignore_at_inference = ["past_key_values"]
8
+
9
+ def __init__(
10
+ self,
11
+ vocab_size=320,
12
+ hidden_size=4096,
13
+ intermediate_size=11008,
14
+ num_hidden_layers=32,
15
+ num_attention_heads=32,
16
+ num_key_value_heads=None,
17
+ hidden_act="silu",
18
+ max_position_embeddings=2048,
19
+ initializer_range=0.02,
20
+ rms_norm_eps=1e-6,
21
+ use_cache=True,
22
+ pad_token_id=None,
23
+ bos_token_id=1,
24
+ eos_token_id=2,
25
+ tie_word_embeddings=False,
26
+ rope_theta=10000.0,
27
+ rope_scaling=None,
28
+ attention_bias=False,
29
+ attention_dropout=0.0,
30
+ norm_add_unit_offset=False,
31
+ init_fn="mitchell",
32
+ init_std=0.006,
33
+ init_cutoff_factor=None,
34
+ attention_class="mha",
35
+ window_size=512,
36
+ num_chunks=None,
37
+ chunk_size=256,
38
+ **kwargs,
39
+ ):
40
+ self.vocab_size = vocab_size
41
+ self.max_position_embeddings = max_position_embeddings
42
+ self.hidden_size = hidden_size
43
+ self.intermediate_size = intermediate_size
44
+ self.num_hidden_layers = num_hidden_layers
45
+ self.num_attention_heads = num_attention_heads
46
+
47
+ # for backward compatibility
48
+ if num_key_value_heads is None:
49
+ num_key_value_heads = num_attention_heads
50
+
51
+ self.num_key_value_heads = num_key_value_heads
52
+ self.hidden_act = hidden_act
53
+ self.initializer_range = initializer_range
54
+ self.rms_norm_eps = rms_norm_eps
55
+ self.use_cache = use_cache
56
+ self.rope_theta = rope_theta
57
+ self.rope_scaling = rope_scaling
58
+ self._rope_scaling_validation()
59
+ self.attention_bias = attention_bias
60
+ self.attention_dropout = attention_dropout
61
+
62
+ self.norm_add_unit_offset = norm_add_unit_offset
63
+ self.init_fn = init_fn
64
+ self.init_std = init_std
65
+ self.init_cutoff_factor = init_cutoff_factor
66
+
67
+ # Attention-specific parameters
68
+ self.attention_class = attention_class
69
+ self.window_size = window_size
70
+ self.num_chunks = num_chunks
71
+ self.chunk_size = chunk_size
72
+
73
+ super().__init__(
74
+ pad_token_id=pad_token_id,
75
+ bos_token_id=bos_token_id,
76
+ eos_token_id=eos_token_id,
77
+ tie_word_embeddings=tie_word_embeddings,
78
+ **kwargs,
79
+ )
80
+
81
+ def _rope_scaling_validation(self):
82
+ """
83
+ Validate the `rope_scaling` configuration.
84
+ """
85
+ if self.rope_scaling is None:
86
+ return
87
+
88
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
89
+ raise ValueError(
90
+ "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}"
91
+ )
92
+ rope_scaling_type = self.rope_scaling.get("type", None)
93
+ rope_scaling_factor = self.rope_scaling.get("factor", None)
94
+ if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
95
+ raise ValueError(
96
+ f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
97
+ )
98
+ if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
99
+ raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
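A minimal usage sketch of `EvaByteConfig`, assuming this file is importable from the working directory; the keyword values mirror `config.json` above, and the second call illustrates the `rope_scaling` validation.

```python
from configuration_evabyte import EvaByteConfig  # assumes configuration_evabyte.py is on the import path

# Keyword values mirror config.json above.
config = EvaByteConfig(
    vocab_size=320,
    hidden_size=4096,
    num_hidden_layers=32,
    attention_class="eva",
    window_size=2048,
    chunk_size=16,
    rope_theta=100000,
)
print(config.num_key_value_heads)  # defaults to num_attention_heads (32) when not given

# The validator rejects malformed rope_scaling settings, e.g. a factor <= 1.
try:
    EvaByteConfig(rope_scaling={"type": "linear", "factor": 0.5})
except ValueError as err:
    print(err)
```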
eva.py ADDED
@@ -0,0 +1,419 @@
1
+ from typing import Dict, Optional, Tuple, List, Any, Union
2
+ import torch
3
+ from torch import nn
4
+ import torch.nn.functional as F
5
+ from .eva_agg_kernel import triton_eva_agg_fwd
6
+ from .eva_prep_kv_kernel import triton_eva_prep_kv_fwd
7
+ try:
8
+ import triton
9
+ USE_TRITON_IMPL = True
10
+ except ImportError:
11
+ USE_TRITON_IMPL = False
12
+ raise ImportError("Triton is not installed. Please install it by running `pip install triton`.")
13
+
14
+ def rotate_half(x: torch.Tensor) -> torch.Tensor:
15
+ """
16
+ Rotates half the hidden dims (last dim) of the input.
17
+ Args:
18
+ x: Rotary embedded tensor
19
+ Return:
20
+ Tensor with half of last dim negated and rotated to the front.
21
+ """
22
+ x1, x2 = x.split(x.shape[-1] // 2, dim=-1)
23
+ return torch.cat((-x2, x1), dim=-1)
24
+
25
+ def apply_rotary_pos_emb(q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor,
26
+ position_ids: torch.Tensor) -> torch.Tensor:
27
+ """
28
+ Apply rotary embedding (cos, sin) to the query and key tensor on the sequence dimension.
29
+
30
+ The legends for dimensions are defined as:
31
+ num_heads: number of attention heads
32
+ current_seq_len: the current batch's sequence length, should be either 1 or max_seq_len
33
+ max_seq_len: the static sequence length, different from current_seq_len in cached inference case where it is always
34
+ maximum lenghth, e.g. the length of static sequence length of KV cache
35
+
36
+
37
+ Args:
38
+ q: Query tensor, of size (batch_size, num_heads, current_seq_len, head_dim)
39
+ k: Key tensor, of size (batch_size, num_key_value_heads, current_seq_len, head_dim)
40
+ cos: Cosine base of rotary embedding, of size (max_seq_len, head_dim)
41
+ sin: Sine base of rotary embedding, of size (max_seq_len, head_dim)
42
+ position_ids: The position indices of the tokens corresponding to the query and key tensors. It has a size of
43
+ (batch_size, current_seq_len).
44
+
45
+ Returns:
46
+ Embedded query and key tensor of same size as input.
47
+
48
+ """
49
+ bs, nheads, cur_seq_len, head_dim = q.shape
50
+ assert len(
51
+ k.shape) == 4, f"k should be of shape (batch_size, num_heads, current_seq_len, head_dim), got {k.shape} instead"
52
+ assert k.shape[0] == bs, f"k has a different batch_size {k.shape[0]} compared to q {bs}"
53
+ assert list(k.shape[2:]) == [cur_seq_len,
54
+ head_dim], f"k has different current_seq_len and/or head_dim compared to q"
55
+ assert cos.shape[3] == head_dim, f"cos should have dim of head dim {head_dim}, got {cos.shape[3]} instead"
56
+ assert list(position_ids.shape) in [[bs, cur_seq_len], [1, cur_seq_len]],\
57
+ f"position_ids should be of shape {[bs, cur_seq_len]} or {[1, cur_seq_len]}, got {position_ids.shape} instead"
58
+
59
+ q_embed = (q * cos) + (rotate_half(q) * sin)
60
+ k_embed = (k * cos) + (rotate_half(k) * sin)
61
+ return q_embed, k_embed
62
+
63
+ class EvaAttention(nn.Module):
64
+ """
65
+ Causal EVA for language modeling.
66
+ """
67
+
68
+ def __init__(self, config, layer_idx: Optional[int] = None):
69
+ super().__init__()
70
+ self.config = config
71
+ self.layer_idx = layer_idx
72
+ self.hidden_size = config.hidden_size
73
+ self.num_heads = config.num_attention_heads
74
+ self.head_dim = self.hidden_size // self.num_heads
75
+ self.head_dim_scaling = self.head_dim ** -0.5
76
+
77
+ self.max_position_embeddings = config.max_position_embeddings
78
+
79
+ if (self.head_dim * self.num_heads) != self.hidden_size:
80
+ raise ValueError(
81
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
82
+ f" and `num_heads`: {self.num_heads})."
83
+ )
84
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
85
+ self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
86
+ self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
87
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
88
+
89
+ self.window_size = config.window_size
90
+
91
+ self.num_chunks = config.num_chunks
92
+ self.chunk_size = config.chunk_size
93
+ if self.chunk_size is not None:
94
+ assert self.window_size >= self.chunk_size and self.window_size % self.chunk_size == 0
95
+ # chunk_size overrides the number of landmarks
96
+ self.num_chunks = None
97
+
98
+ self.chunks_per_window = int(self.window_size // self.chunk_size)
99
+ self.adaptive_phi = nn.Parameter(
100
+ torch.randn(
101
+ 1,
102
+ self.num_heads,
103
+ 1,
104
+ 1,
105
+ self.head_dim
106
+ ).clamp(-1., 1.) * self.head_dim_scaling
107
+ )
108
+ self.adaptive_mu_k = nn.Parameter(
109
+ torch.randn(
110
+ 1,
111
+ self.num_heads,
112
+ 1,
113
+ 1,
114
+ self.head_dim
115
+ ).clamp(-1., 1.) * self.head_dim_scaling
116
+ )
117
+
118
+ def _triton_forward(
119
+ self,
120
+ hidden_states: torch.Tensor,
121
+ attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
122
+ position_ids: Optional[torch.LongTensor] = None,
123
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
124
+ output_attentions: bool = False,
125
+ use_cache: bool = False,
126
+ cos: Optional[torch.Tensor] = None,
127
+ sin: Optional[torch.Tensor] = None,
128
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
129
+ assert not output_attentions
130
+ bsz, q_len, _ = hidden_states.size()
131
+
132
+ if use_cache and past_key_value is None:
133
+ raise ValueError
134
+
135
+ assert isinstance(attention_mask, tuple)
136
+
137
+ # infer the model's running mode
138
+ is_prefilling = use_cache and past_key_value.get_seq_length(self.layer_idx) == 0
139
+ is_decoding = use_cache and past_key_value.get_seq_length(self.layer_idx) > 0
140
+
141
+ if is_prefilling:
142
+ assert len(attention_mask) == 2
143
+ window_mask, intra_chunk_mask = attention_mask
144
+ chunk_dummy_mask = None
145
+ elif is_decoding:
146
+ assert len(attention_mask) == 3
147
+ window_mask, intra_chunk_mask, chunk_dummy_mask = attention_mask
148
+ else:
149
+ window_mask, intra_chunk_mask = attention_mask
150
+ chunk_dummy_mask = None
151
+
152
+ ############################################
153
+ # compute q, k, v from hidden states
154
+ ############################################
155
+ # [b, h, q_len, d]
156
+ q = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
157
+ # [b, h, kv_len, d]
158
+ k = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
159
+ # [b, h, kv_len, d]
160
+ v = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
161
+
162
+ if use_cache:
163
+ past_key_value.update_past_len(q.shape[-2], self.layer_idx)
164
+
165
+ ############################################
166
+ # apply rotary positional embeddings to q, k
167
+ ############################################
168
+ q, k = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
169
+
170
+ ############################################
171
+ # update and get cached singleton tokens
172
+ # update and cache k and v for calculating chunk-level RFAs
173
+ ############################################
174
+ if use_cache:
175
+ s_k, s_v, dump_k, dump_v = past_key_value.update_singletons_and_chunks(
176
+ k,
177
+ v,
178
+ self.layer_idx,
179
+ self.window_size,
180
+ )
181
+ else:
182
+ s_k, s_v = k, v
183
+ dump_k, dump_v = k, v
184
+
185
+ if use_cache:
186
+ singleton_mask, dump_rf_mask = past_key_value.update_mask(
187
+ s_mask=window_mask,
188
+ rf_mask=intra_chunk_mask,
189
+ layer_idx=self.layer_idx,
190
+ window_size=self.window_size,
191
+ )
192
+ else:
193
+ singleton_mask = window_mask
194
+ dump_rf_mask = intra_chunk_mask
195
+
196
+ if dump_k is not None and dump_v is not None:
197
+ # 1. in prefilling, the input shape is
198
+ # dump_k/dump_v: [b, h, n, d]
199
+ # rfa_k/rfa_v: [b, h, n // c, d]
200
+ # 2. in decoding, the input shape is
201
+ # k/v: [b, h, w, d]
202
+ # rfa_k/rfa_v: [b, h, w//c, d]
203
+ # 3. in forward inference; the seq_len is already divisible
204
+ rfa_k, rfa_v = triton_eva_prep_kv_fwd(
205
+ dump_k, dump_v,
206
+ self.adaptive_mu_k, self.adaptive_phi,
207
+ dump_rf_mask, self.head_dim_scaling, self.chunk_size
208
+ )
209
+ # rfa_mask = get_rfa_chunk_mask(dump_rf_mask)
210
+ if use_cache:
211
+ rfa_k, rfa_v = past_key_value.update_chunk_rfas(
212
+ rfa_k, rfa_v, self.layer_idx
213
+ )
214
+ elif use_cache:
215
+ # if there are not enough elements within the last chunk,
216
+ # we will only use the cached chunk-level RFAs
217
+ rfa_k, rfa_v = past_key_value.get_chunk_rfas(self.layer_idx)
218
+ else:
219
+ rfa_k, rfa_v = None, None
220
+
221
+ ############################################
222
+ # compute the full attention output
223
+ ############################################
224
+ if is_prefilling:
225
+ # prefilling
226
+ # 1. in prefilling, the input shape is
227
+ # q: [b, h, n, d]
228
+ # k/v: [b, h, n, d]
229
+ # rfa_k/rfa_v: [b, h, n // c, d]
230
+ attn_output = triton_eva_agg_fwd(
231
+ q, s_k, s_v,
232
+ rfa_k, rfa_v,
233
+ singleton_mask, self.head_dim_scaling, self.window_size, self.chunks_per_window
234
+ )
235
+ elif is_decoding:
236
+ # 2. in decoding, the input shape is
237
+ # q: [b, h, 1, d] or [b, h, z, d] (for multi-byte prediction)
238
+ # k/v: [b, h, 1 + s, d]
239
+ # rfa_k/rfa_v: [b, h, n // c, d]
240
+ if rfa_k is not None and rfa_v is not None:
241
+ # we only take the chunk-level RFAs not in the current window
242
+ seen_seq_len = past_key_value.get_seq_length(self.layer_idx)
243
+ if seen_seq_len <= self.window_size:
244
+ agg_k = s_k
245
+ agg_v = s_v
246
+ attn_mask = singleton_mask
247
+ else:
248
+ # NOTE: we already updated the cache so the length now
249
+ # includes the current token
250
+ # we subtract 1 from seen_seq_len because we want
251
+ # if seen_seq_len = 2048 -> num_windows_seen_so_far = 0
252
+ # if seen_seq_len = 4096 -> num_windows_seen_so_far = 1
253
+ # if seen_seq_len = 4097 -> num_windows_seen_so_far = 2
254
+ # NOTE the cat order should be taken care of;
255
+ # should align with the order based on which
256
+ # the attention mask is constructed
257
+ num_windows_seen_so_far = (seen_seq_len - 1) // self.window_size
258
+ agg_k = torch.cat([s_k, rfa_k[..., :num_windows_seen_so_far * self.chunks_per_window, :]], dim=-2)
259
+ agg_v = torch.cat([s_v, rfa_v[..., :num_windows_seen_so_far * self.chunks_per_window, :]], dim=-2)
260
+ if singleton_mask is not None:
261
+ assert chunk_dummy_mask is not None
262
+ attn_mask = torch.cat([singleton_mask, chunk_dummy_mask], dim=-1)
263
+ else:
264
+ attn_mask = singleton_mask
265
+ else:
266
+ agg_k = s_k
267
+ agg_v = s_v
268
+ attn_mask = singleton_mask
269
+ attn_output = F.scaled_dot_product_attention(
270
+ q, agg_k, agg_v,
271
+ attn_mask=attn_mask,
272
+ is_causal=False,
273
+ dropout_p=0.0,
274
+ scale=self.head_dim_scaling
275
+ )
276
+ else:
277
+ # 3. in single-forward inference
278
+ attn_output = triton_eva_agg_fwd(
279
+ q, s_k, s_v,
280
+ rfa_k, rfa_v,
281
+ singleton_mask, self.head_dim_scaling, self.window_size, self.chunks_per_window
282
+ )
283
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
284
+ raise ValueError(
285
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
286
+ f" {attn_output.size()}"
287
+ )
288
+ attn_output = attn_output.transpose(1, 2).reshape(bsz, q_len, self.hidden_size)
289
+ attn_output = self.o_proj(attn_output)
290
+ attn_weights = None
291
+ return attn_output, attn_weights, past_key_value
292
+
293
+ def _multibyte_decoding_forward(
294
+ self,
295
+ hidden_states: torch.Tensor,
296
+ attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
297
+ position_ids: Optional[torch.LongTensor] = None,
298
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
299
+ output_attentions: bool = False,
300
+ use_cache: bool = False,
301
+ cos: Optional[torch.Tensor] = None,
302
+ sin: Optional[torch.Tensor] = None,
303
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
304
+ # during multi-byte forwarding, we only read caches and do not update them
305
+ assert not output_attentions
306
+ bsz, q_len, _ = hidden_states.size()
307
+
308
+ if use_cache and past_key_value is None:
309
+ raise ValueError
310
+
311
+ assert USE_TRITON_IMPL
312
+ assert isinstance(attention_mask, torch.Tensor) and attention_mask.dtype == torch.bool
313
+
314
+ assert use_cache and past_key_value.get_seq_length(self.layer_idx) > 0
315
+
316
+ ############################################
317
+ # compute q, k, v from hidden states
318
+ ############################################
319
+ # [b, h, q_len, d]
320
+ q = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
321
+ # [b, h, kv_len, d]
322
+ k = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
323
+ # [b, h, kv_len, d]
324
+ v = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
325
+
326
+ ############################################
327
+ # apply rotary positional embeddings to q, k
328
+ ############################################
329
+ q, k = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
330
+
331
+ ############################################
332
+ # update and get cached singleton tokens
333
+ ############################################
334
+ input_len = k.shape[-2]
335
+ window_pos = past_key_value.past_window_pos[self.layer_idx]
336
+ new_window_pos = window_pos + input_len
337
+
338
+ past_key_value.past_window_k[self.layer_idx][:, :, window_pos : new_window_pos, :] = k
339
+ past_key_value.past_window_v[self.layer_idx][:, :, window_pos : new_window_pos, :] = v
340
+ s_k = past_key_value.past_window_k[self.layer_idx][:, :, : new_window_pos, :]
341
+ s_v = past_key_value.past_window_v[self.layer_idx][:, :, : new_window_pos, :]
342
+
343
+ rfa_k, rfa_v = past_key_value.get_chunk_rfas(self.layer_idx)
344
+
345
+ ############################################
346
+ # compute the full attention output
347
+ ############################################
348
+ # 2. in decoding, the input shape is
349
+ # q: [b, h, 1, d] or [b, h, z, d] (for multi-byte prediction)
350
+ # k/v: [b, h, 1 + s, d]
351
+ # rfa_k/rfa_v: [b, h, n // c, d]
352
+ if rfa_k is not None and rfa_v is not None:
353
+ # NOTE the cat order should be taken care of;
354
+ # should align with the order based on which
355
+ # the attention mask is constructed
356
+ # agg_k = torch.cat([s_k, rfa_k], dim=-2)
357
+ # agg_v = torch.cat([s_v, rfa_v], dim=-2)
358
+ agg_k = torch.cat([rfa_k, s_k], dim=-2)
359
+ agg_v = torch.cat([rfa_v, s_v], dim=-2)
360
+ else:
361
+ agg_k = s_k
362
+ agg_v = s_v
363
+ attn_output = F.scaled_dot_product_attention(
364
+ q, agg_k, agg_v,
365
+ attn_mask=attention_mask,
366
+ is_causal=False,
367
+ dropout_p=0.0,
368
+ scale=self.head_dim_scaling
369
+ )
370
+
371
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
372
+ raise ValueError(
373
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
374
+ f" {attn_output.size()}"
375
+ )
376
+ attn_output = attn_output.transpose(1, 2).reshape(bsz, q_len, self.hidden_size)
377
+ attn_output = self.o_proj(attn_output)
378
+ attn_weights = None
379
+ return attn_output, attn_weights, past_key_value
380
+
381
+ def forward(
382
+ self,
383
+ hidden_states: torch.Tensor,
384
+ attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
385
+ position_ids: Optional[torch.LongTensor] = None,
386
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
387
+ output_attentions: bool = False,
388
+ use_cache: bool = False,
389
+ cos: Optional[torch.Tensor] = None,
390
+ sin: Optional[torch.Tensor] = None,
391
+ multibyte_decoding: Optional[bool] = False,
392
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
393
+ assert not output_attentions
394
+ if use_cache and past_key_value is None:
395
+ raise ValueError
396
+
397
+ assert USE_TRITON_IMPL
398
+ if use_cache and multibyte_decoding:
399
+ return self._multibyte_decoding_forward(
400
+ hidden_states,
401
+ attention_mask=attention_mask,
402
+ position_ids=position_ids,
403
+ past_key_value=past_key_value,
404
+ output_attentions=output_attentions,
405
+ use_cache=use_cache,
406
+ cos=cos,
407
+ sin=sin,
408
+ )
409
+ else:
410
+ return self._triton_forward(
411
+ hidden_states,
412
+ attention_mask=attention_mask,
413
+ position_ids=position_ids,
414
+ past_key_value=past_key_value,
415
+ output_attentions=output_attentions,
416
+ use_cache=use_cache,
417
+ cos=cos,
418
+ sin=sin,
419
+ )
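A shape-level sketch of the rotary helpers defined at the top of this file. The cos/sin construction below is a standard RoPE recipe and is an assumption (the actual rotary module lives in `modeling_evabyte.py`, which is not shown in this commit view); `apply_rotary_pos_emb` is used as defined above, e.g. pasted into a REPL, since `eva.py` itself relies on relative imports.

```python
import torch

# apply_rotary_pos_emb / rotate_half as defined in eva.py above.
b, h, n, d = 1, 2, 8, 16
q = torch.randn(b, h, n, d)
k = torch.randn(b, h, n, d)
position_ids = torch.arange(n)[None, :]               # (1, n), accepted by the shape assert

# Assumed standard RoPE tables with the 4-D shape the asserts expect: (1, 1, n, d).
inv_freq = 1.0 / (10000.0 ** (torch.arange(0, d, 2).float() / d))
freqs = position_ids[..., None].float() * inv_freq    # (1, n, d/2)
emb = torch.cat((freqs, freqs), dim=-1)                # (1, n, d)
cos, sin = emb.cos()[:, None], emb.sin()[:, None]      # (1, 1, n, d)

q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
assert q_rot.shape == q.shape and k_rot.shape == k.shape
```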
eva_agg_kernel.py ADDED
@@ -0,0 +1,469 @@
1
+
2
+ import math
3
+ import torch
4
+ import triton
5
+ import triton.language as tl
6
+
7
+ # Disabling autotune for now, set num_warps=4 if headdim=64 and num_warps=8 if headdim=128
8
+ # @triton.autotune(
9
+ # configs=[
10
+ # triton.Config({"BLOCK_M": 128, "BLOCK_N": 128}, num_warps=4, num_stages=1),
11
+ # # This config has a race condition when EVEN_M == False, disabling it for now.
12
+ # # triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_warps=4, num_stages=1),
13
+ # ],
14
+ # key=['CACHE_KEY_SEQLEN_Q', 'CACHE_KEY_SEQLEN_K', 'BIAS_TYPE', 'IS_CAUSAL', 'BLOCK_HEADDIM']
15
+ # )
16
+ @triton.heuristics(
17
+ {
18
+ "EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0,
19
+ "EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0,
20
+ "EVEN_C": lambda args: args["nchunks"] % args["BLOCK_N"] == 0,
21
+ "EVEN_W": lambda args: args["WINDOW_SIZE"] % args["BLOCK_N"] == 0,
22
+ "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
23
+ }
24
+ )
25
+ @triton.jit
26
+ def _fwd_eva_agg_kernel(
27
+ Q,
28
+ K,
29
+ V,
30
+ RFA_K,
31
+ RFA_V,
32
+ WindowMask,
33
+ Out,
34
+ softmax_scale,
35
+ stride_qb, stride_qh, stride_qm,
36
+ stride_kb, stride_kh, stride_kn,
37
+ stride_vb, stride_vh, stride_vn,
38
+ stride_rfa_kb, stride_rfa_kh, stride_rfa_kc,
39
+ stride_rfa_vb, stride_rfa_vh, stride_rfa_vc,
40
+ stride_mb, stride_mm,
41
+ stride_ob, stride_oh, stride_om,
42
+ nheads,
43
+ seqlen_q,
44
+ seqlen_k,
45
+ nchunks,
46
+ headdim,
47
+ CACHE_KEY_SEQLEN_Q, # TODO: why keeping this
48
+ CACHE_KEY_SEQLEN_K, # TODO: why keeping this
49
+ CACHE_KEY_NCHUNKS, # TODO: why keeping this
50
+ CHUNKS_PER_WINDOW: tl.constexpr,
51
+ WINDOW_SIZE: tl.constexpr,
52
+ MASK_TYPE: tl.constexpr,
53
+ EMPTY_RFA_KV: tl.constexpr,
54
+ BLOCK_HEADDIM: tl.constexpr,
55
+ EVEN_M: tl.constexpr,
56
+ EVEN_N: tl.constexpr,
57
+ EVEN_W: tl.constexpr,
58
+ EVEN_C: tl.constexpr,
59
+ EVEN_HEADDIM: tl.constexpr,
60
+ BLOCK_M: tl.constexpr,
61
+ BLOCK_N: tl.constexpr,
62
+ ):
63
+ start_m = tl.program_id(0)
64
+ off_bh = tl.program_id(1)
65
+ off_h = off_bh % nheads
66
+ off_b = off_bh // nheads
67
+ # initialize offsets
68
+ offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
69
+ offs_w = (start_m * BLOCK_M) // WINDOW_SIZE
70
+ offs_n = tl.arange(0, BLOCK_N)
71
+ offs_c = tl.arange(0, BLOCK_N)
72
+ offs_d = tl.arange(0, BLOCK_HEADDIM)
73
+ # TODO: add parentheses or not
74
+ q_ptrs = (
75
+ Q +
76
+ off_b * stride_qb +
77
+ off_h * stride_qh +
78
+ (offs_m[:, None] * stride_qm + offs_d[None, :])
79
+ )
80
+ k_ptrs = (
81
+ K +
82
+ off_b * stride_kb +
83
+ off_h * stride_kh +
84
+ (offs_n[:, None] * stride_kn + offs_d[None, :])
85
+ )
86
+ v_ptrs = (
87
+ V +
88
+ off_b * stride_vb +
89
+ off_h * stride_vh +
90
+ (offs_n[:, None] * stride_vn + offs_d[None, :])
91
+ )
92
+ if EMPTY_RFA_KV == 0:
93
+ rfa_k_ptrs = (
94
+ RFA_K +
95
+ off_b * stride_rfa_kb +
96
+ off_h * stride_rfa_kh +
97
+ (offs_c[:, None] * stride_rfa_kc + offs_d[None, :])
98
+ )
99
+ rfa_v_ptrs = (
100
+ RFA_V +
101
+ off_b * stride_rfa_vb +
102
+ off_h * stride_rfa_vh +
103
+ (offs_c[:, None] * stride_rfa_vc + offs_d[None, :])
104
+ )
105
+
106
+ qk_scale = softmax_scale
107
+ qk_scale *= 1.4426950408889634 # log2(e)
108
+ if MASK_TYPE == 1:
109
+ m_ptrs = (
110
+ WindowMask +
111
+ off_b * stride_mb +
112
+ (offs_m[:, None] * stride_mm + offs_n[None, :])
113
+ )
114
+ m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
115
+ d_i = tl.zeros([BLOCK_M], dtype=tl.float32)
116
+ acc_o = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32)
117
+ # load q: it will stay in SRAM throughout
118
+ # [2022-10-30] TD: Triton bug - in the case of EVEN_M=True and EVEN_N=False, if we just call
119
+ # tl.load(q_ptrs), we get the wrong output!
120
+ if EVEN_M & EVEN_N:
121
+ if EVEN_HEADDIM:
122
+ q = tl.load(
123
+ q_ptrs
124
+ )
125
+ else:
126
+ q = tl.load(
127
+ q_ptrs,
128
+ mask=offs_d[None, :] < headdim,
129
+ other=0.0
130
+ )
131
+ else:
132
+ if EVEN_HEADDIM:
133
+ q = tl.load(
134
+ q_ptrs,
135
+ mask=offs_m[:, None] < seqlen_q,
136
+ other=0.0
137
+ )
138
+ else:
139
+ q = tl.load(
140
+ q_ptrs,
141
+ mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
142
+ other=0.0
143
+ )
144
+ # loop over k, v and update accumulator
145
+ # Iterate over local singletons;
146
+ # so we only iterate over blocks within the current window
147
+ start_idx_n = offs_w * WINDOW_SIZE
148
+ end_idx_n = tl.minimum((start_m + 1) * BLOCK_M, seqlen_k)
149
+ for start_n in range(start_idx_n, end_idx_n, BLOCK_N):
150
+ start_n = tl.multiple_of(start_n, BLOCK_N)
151
+ # -- compute qk ----
152
+ if EVEN_N & EVEN_M:
153
+ if EVEN_HEADDIM:
154
+ k = tl.load(
155
+ k_ptrs + start_n * stride_kn
156
+ )
157
+ else:
158
+ k = tl.load(
159
+ k_ptrs + start_n * stride_kn,
160
+ mask=offs_d[None, :] < headdim,
161
+ other=0.0
162
+ )
163
+ else:
164
+ if EVEN_HEADDIM:
165
+ k = tl.load(
166
+ k_ptrs + start_n * stride_kn,
167
+ mask=(start_n + offs_n)[:, None] < seqlen_k,
168
+ other=0.0,
169
+ )
170
+ else:
171
+ k = tl.load(
172
+ k_ptrs + start_n * stride_kn,
173
+ mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
174
+ other=0.0,
175
+ )
176
+ qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
177
+ qk += tl.dot(q, tl.trans(k))
178
+ # Trying to combine the two masks seems to make the result wrong
179
+ if not EVEN_N: # Need to mask out otherwise the softmax is wrong
180
+ qk += tl.where((start_n + offs_n)[None, :] < seqlen_k, 0, float("-inf"))
181
+
182
+ if MASK_TYPE == 1:
183
+ if EVEN_M & EVEN_W:
184
+ mask = tl.load(
185
+ m_ptrs + start_n - start_idx_n
186
+ ).to(tl.float32)
187
+ else:
188
+ mask = tl.load(
189
+ m_ptrs + start_n - start_idx_n,
190
+ mask=(offs_m[:, None] < seqlen_q)
191
+ & ((start_n - start_idx_n + offs_n)[None, :] < WINDOW_SIZE),
192
+ other=0.0,
193
+ ).to(tl.float32)
194
+ # Slightly faster to multiply the softmax_scale in the tl.exp below since the compiler
195
+ # can then fuse the mult and add into an fma instruction. But if we have bias we need to
196
+ # multiply with softmax_scale here.
197
+ # we assume mask already implies the causal masking
198
+ qk = qk * qk_scale + mask
199
+ m_ij = tl.maximum(tl.max(qk, 1), m_i)
200
+ p = tl.exp2(qk - m_ij[:, None])
201
+ else:
202
+ qk += tl.where(offs_m[:, None] >= (start_n + offs_n)[None, :], 0, float("-inf"))
203
+ m_ij = tl.maximum(tl.max(qk, 1) * qk_scale, m_i)
204
+ p = tl.exp2(qk * qk_scale - m_ij[:, None])
205
+
206
+ d_ij = tl.sum(p, 1)
207
+
208
+ # scale acc_o
209
+ prev_scale = tl.exp2(m_i - m_ij)
210
+ # # -- update output accumulator --
211
+ acc_o = acc_o * prev_scale[:, None]
212
+ # update acc_o
213
+ if EVEN_N & EVEN_M: # If we just do "if EVEN_N", there seems to be some race condition
214
+ if EVEN_HEADDIM:
215
+ v = tl.load(
216
+ v_ptrs + start_n * stride_vn
217
+ )
218
+ else:
219
+ v = tl.load(
220
+ v_ptrs + start_n * stride_vn,
221
+ mask=offs_d[None, :] < headdim,
222
+ other=0.0
223
+ )
224
+ else:
225
+ if EVEN_HEADDIM:
226
+ v = tl.load(
227
+ v_ptrs + start_n * stride_vn,
228
+ mask=(start_n + offs_n)[:, None] < seqlen_k,
229
+ other=0.0,
230
+ )
231
+ else:
232
+ v = tl.load(
233
+ v_ptrs + start_n * stride_vn,
234
+ mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
235
+ other=0.0,
236
+ )
237
+ p = p.to(v.dtype)
238
+ acc_o = tl.dot(p, v, acc_o)
239
+
240
+ # -- update statistics
241
+ d_i = d_i * prev_scale + d_ij
242
+ m_i = m_ij
243
+
244
+ if EMPTY_RFA_KV == 0:
245
+ # Iterate over RFA chunks
246
+ # we only iterate over chunks before the current local singleton window
247
+ end_idx_c = tl.minimum(offs_w * CHUNKS_PER_WINDOW, nchunks)
248
+ for start_c in range(0, end_idx_c, BLOCK_N):
249
+ start_c = tl.multiple_of(start_c, BLOCK_N)
250
+ # -- compute qk ----
251
+ if EVEN_C & EVEN_M:
252
+ if EVEN_HEADDIM:
253
+ rfa_k = tl.load(
254
+ rfa_k_ptrs + start_c * stride_rfa_kc
255
+ )
256
+ else:
257
+ rfa_k = tl.load(
258
+ rfa_k_ptrs + start_c * stride_rfa_kc,
259
+ mask=offs_d[None, :] < headdim,
260
+ other=0.0
261
+ )
262
+ else:
263
+ if EVEN_HEADDIM:
264
+ rfa_k = tl.load(
265
+ rfa_k_ptrs + start_c * stride_rfa_kc,
266
+ mask=(start_c + offs_c)[:, None] < nchunks,
267
+ other=0.0,
268
+ )
269
+ else:
270
+ rfa_k = tl.load(
271
+ rfa_k_ptrs + start_c * stride_rfa_kc,
272
+ mask=((start_c + offs_c)[:, None] < nchunks) & (offs_d[None, :] < headdim),
273
+ other=0.0,
274
+ )
275
+ qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
276
+ qk += tl.dot(q, tl.trans(rfa_k))
277
+ # Trying to combine the two masks seems to make the result wrong
278
+ if not EVEN_C: # Need to mask out otherwise the softmax is wrong
279
+ qk += tl.where((start_c + offs_c)[None, :] < nchunks, 0, float("-inf"))
280
+
281
+ m_ij = tl.maximum(tl.max(qk, 1) * qk_scale, m_i)
282
+ p = tl.exp2(qk * qk_scale - m_ij[:, None])
283
+
284
+ d_ij = tl.sum(p, 1)
285
+
286
+ # scale acc_o
287
+ prev_scale = tl.exp2(m_i - m_ij)
288
+ # # -- update output accumulator --
289
+ acc_o = acc_o * prev_scale[:, None]
290
+ # update acc_o
291
+ # TODO: If we just do "if EVEN_N", there seems to be some race condition ?
292
+ if EVEN_C & EVEN_M:
293
+ if EVEN_HEADDIM:
294
+ rfa_v = tl.load(
295
+ rfa_v_ptrs + start_c * stride_rfa_vc
296
+ )
297
+ else:
298
+ rfa_v = tl.load(
299
+ rfa_v_ptrs + start_c * stride_rfa_vc,
300
+ mask=offs_d[None, :] < headdim,
301
+ other=0.0
302
+ )
303
+ else:
304
+ if EVEN_HEADDIM:
305
+ rfa_v = tl.load(
306
+ rfa_v_ptrs + start_c * stride_rfa_vc,
307
+ mask=(start_c + offs_n)[:, None] < nchunks,
308
+ other=0.0,
309
+ )
310
+ else:
311
+ rfa_v = tl.load(
312
+ rfa_v_ptrs + start_c * stride_rfa_vc,
313
+ mask=((start_c + offs_n)[:, None] < nchunks) & (offs_d[None, :] < headdim),
314
+ other=0.0,
315
+ )
316
+ p = p.to(rfa_v.dtype)
317
+ acc_o = tl.dot(p, rfa_v, acc_o)
318
+
319
+ # -- update statistics
320
+ d_i = d_i * prev_scale + d_ij
321
+ m_i = m_ij
322
+
323
+ # BUG: have to store and immediately load
324
+ acc_o = acc_o / d_i[:, None]
325
+ # TODO: understand why rematerializing the offsets saves registers
326
+ start_m = tl.program_id(0)
327
+ offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
328
+ offs_d = tl.arange(0, BLOCK_HEADDIM)
329
+ out_ptrs = (
330
+ Out +
331
+ off_b * stride_ob +
332
+ off_h * stride_oh +
333
+ (offs_m[:, None] * stride_om + offs_d[None, :])
334
+ )
335
+ if EVEN_M:
336
+ if EVEN_HEADDIM:
337
+ tl.store(
338
+ out_ptrs, acc_o
339
+ )
340
+ else:
341
+ tl.store(
342
+ out_ptrs, acc_o,
343
+ mask=offs_d[None, :] < headdim
344
+ )
345
+ else:
346
+ if EVEN_HEADDIM:
347
+ tl.store(
348
+ out_ptrs, acc_o,
349
+ mask=offs_m[:, None] < seqlen_q
350
+ )
351
+ else:
352
+ tl.store(
353
+ out_ptrs, acc_o,
354
+ mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim)
355
+ )
356
+
357
+ def triton_eva_agg_fwd(q, k, v, rfa_k, rfa_v, window_mask, softmax_scale, window_size, chunks_per_window):
358
+ if rfa_k is None and rfa_v is None:
359
+ empty_rfa_kv = 1
360
+
361
+ q, k, v = [
362
+ x if x.stride(-1) == 1 else x.contiguous()
363
+ for x in [q, k, v]
364
+ ]
365
+ else:
366
+ assert rfa_k is not None and rfa_v is not None, "Both rfa_k and rfa_v must either be None or have values at the same time."
367
+ empty_rfa_kv = 0
368
+
369
+ q, k, v, rfa_k, rfa_v = [
370
+ x if x.stride(-1) == 1 else x.contiguous()
371
+ for x in [q, k, v, rfa_k, rfa_v]
372
+ ]
373
+
374
+ # shape constraints
375
+ batch, nheads, seqlen_q, head_dim = q.shape
376
+ _, _, seqlen_k, _ = k.shape
377
+ if empty_rfa_kv == 0:
378
+ nchunks = rfa_k.shape[-2]
379
+ assert rfa_k.shape == (batch, nheads, nchunks, head_dim)
380
+ assert rfa_v.shape == (batch, nheads, nchunks, head_dim)
381
+ assert q.dtype == k.dtype == v.dtype == rfa_k.dtype == rfa_v.dtype
382
+ else:
383
+ nchunks = 0
384
+ assert q.dtype == k.dtype == v.dtype, "All tensors must have the same type"
385
+ assert k.shape == (batch, nheads, seqlen_k, head_dim)
386
+ assert v.shape == (batch, nheads, seqlen_k, head_dim)
387
+
388
+ assert head_dim <= 128, "We only test head dimensions up to 128"
389
+ # assert q.dtype in [torch.float16, torch.bfloat16], "Only support fp16 and bf16"
390
+ assert q.dtype in [torch.bfloat16, torch.float], "Only support bf16 and fp32 for now"
391
+ assert q.is_cuda and k.is_cuda and v.is_cuda
392
+ softmax_scale = softmax_scale or 1.0 / math.sqrt(head_dim)
393
+
394
+ mask_type = 0
395
+ if window_mask is not None:
396
+ mask_type = 1
397
+ assert window_mask.dtype == q.dtype, f"window_mask dtype {window_mask.dtype} must match q dtype {q.dtype}"
398
+ assert window_mask.is_cuda
399
+ assert window_mask.dim() == 4
400
+ assert window_mask.shape == (batch, 1, seqlen_q, window_size)
401
+ if window_mask.stride(-1) != 1:
402
+ window_mask = window_mask.contiguous()
403
+ mask_strides = (
404
+ (window_mask.stride(0), window_mask.stride(2))
405
+ if mask_type == 1 else
406
+ (0, 0)
407
+ )
408
+
409
+ rfa_k_strides = (
410
+ (rfa_k.stride(0), rfa_k.stride(1), rfa_k.stride(2))
411
+ if empty_rfa_kv == 0 else
412
+ (0, 0, 0)
413
+ )
414
+ rfa_v_strides = (
415
+ (rfa_v.stride(0), rfa_v.stride(1), rfa_v.stride(2))
416
+ if empty_rfa_kv == 0 else
417
+ (0, 0, 0)
418
+ )
419
+ assert chunks_per_window > 0, "chunks_per_window must be greater than 0"
420
+
421
+ o = torch.empty_like(q)
422
+
423
+ BLOCK_HEADDIM = max(triton.next_power_of_2(head_dim), 16)
424
+ if q.dtype == torch.float:
425
+ BLOCK = 64
426
+ else:
427
+ BLOCK = 128
428
+ num_warps = 4 if head_dim <= 64 else 8
429
+ assert chunks_per_window >= BLOCK, "chunks_per_window must be at least BLOCK"
430
+ # WINDOW_MASK_TYPE:
431
+ # - 0: regular causal mask, simply None
432
+ # - 1: the shape must be B, 1, W, I, J
433
+
434
+ grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads)
435
+ _fwd_eva_agg_kernel[grid](
436
+ q,
437
+ k,
438
+ v,
439
+ rfa_k,
440
+ rfa_v,
441
+ window_mask,
442
+ o,
443
+ softmax_scale,
444
+ q.stride(0), q.stride(1), q.stride(2),
445
+ k.stride(0), k.stride(1), k.stride(2),
446
+ v.stride(0), v.stride(1), v.stride(2),
447
+ rfa_k_strides[0], rfa_k_strides[1], rfa_k_strides[2],
448
+ rfa_v_strides[0], rfa_v_strides[1], rfa_v_strides[2],
449
+ mask_strides[0], mask_strides[1],
450
+ o.stride(0), o.stride(1), o.stride(2),
451
+ nheads,
452
+ seqlen_q,
453
+ seqlen_k,
454
+ nchunks,
455
+ head_dim,
456
+ seqlen_q // 32,
457
+ seqlen_k // 32,
458
+ nchunks // 32,
459
+ chunks_per_window,
460
+ window_size,
461
+ mask_type,
462
+ empty_rfa_kv,
463
+ BLOCK_HEADDIM,
464
+ BLOCK_M=BLOCK,
465
+ BLOCK_N=BLOCK,
466
+ num_warps=num_warps,
467
+ num_stages=1,
468
+ )
469
+ return o
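A shape-level sketch of calling `triton_eva_agg_fwd` as defined above, based only on the asserts in the wrapper; it assumes a CUDA device with Triton installed, reuses the window/chunk sizes from `config.json`, and passes `window_mask=None` (the plain causal-mask path).

```python
import math
import torch
from eva_agg_kernel import triton_eva_agg_fwd  # assumes this file is on the import path

b, h, d = 1, 4, 128                              # head_dim <= 128 per the assert above
window_size, chunk_size = 2048, 16
seqlen = 4096                                    # two full windows
chunks_per_window = window_size // chunk_size    # 128, satisfies chunks_per_window >= BLOCK
nchunks = seqlen // chunk_size

device, dtype = "cuda", torch.bfloat16
q = torch.randn(b, h, seqlen, d, device=device, dtype=dtype)
k = torch.randn(b, h, seqlen, d, device=device, dtype=dtype)
v = torch.randn(b, h, seqlen, d, device=device, dtype=dtype)
rfa_k = torch.randn(b, h, nchunks, d, device=device, dtype=dtype)
rfa_v = torch.randn(b, h, nchunks, d, device=device, dtype=dtype)

out = triton_eva_agg_fwd(
    q, k, v, rfa_k, rfa_v,
    window_mask=None,                            # MASK_TYPE == 0: plain causal masking
    softmax_scale=1.0 / math.sqrt(d),
    window_size=window_size,
    chunks_per_window=chunks_per_window,
)
print(out.shape)                                 # torch.Size([1, 4, 4096, 128])
```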
eva_cache.py ADDED
@@ -0,0 +1,761 @@
1
+ from typing import Dict, Optional, Tuple, List, Any, Union
2
+ import torch
3
+ from transformers.cache_utils import Cache
4
+
5
+ class EvaCache(Cache):
6
+ """
7
+ A dynamically growing cache for EVA attention, used during generation.
+
+ For each layer it stores the within-window key/value states, the running chunk statistics used to
+ build chunk-level RFAs, and temporary attention-mask buffers. Window key/value tensors have the
+ expected shape `[batch_size, num_heads, seq_len, head_dim]`.
11
+ """
12
+
13
+ def __init__(self) -> None:
14
+ self.w_k: List[torch.Tensor] = []
15
+ self.w_v: List[torch.Tensor] = []
16
+
17
+ self.rf_q: List[torch.Tensor] = []
18
+ self.rf_k: List[torch.Tensor] = []
19
+ self.rf_v: List[torch.Tensor] = []
20
+
21
+ self.softmax_phi_k_v: List[torch.Tensor] = []
22
+ self.log_sum_phi_k: List[torch.Tensor] = []
23
+ self.rf_k_bar: List[torch.Tensor] = []
24
+ self._seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen
25
+
26
+ # attention masks temporary buffer
27
+ self.rf_mask: List[Optional[torch.Tensor]] = []
28
+ self.s_mask: List[torch.Tensor] = []
29
+ self.chunk_mask: List[torch.Tensor] = []
30
+
31
+ def __len__(self):
32
+ """
33
+ Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds
34
+ to the number of layers in the model.
35
+ """
36
+ return len(self.w_k)
37
+
38
+ def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = 0) -> int:
39
+ """Given the sequence length of the new inputs, returns the usable length of the cache."""
40
+ # Cache without size limit -> all cache is usable
41
+ # Cache with size limit -> if the length cache plus the length of the new inputs is larger the maximum cache
42
+ # length, we will need to evict part of the cache (and thus not all cache is usable)
43
+ max_length = self.get_max_length()
44
+ previous_seq_length = self.get_seq_length(layer_idx)
45
+ if max_length is not None and previous_seq_length + new_seq_length > max_length:
46
+ return max_length - new_seq_length
47
+ return previous_seq_length
48
+
49
+ def reorder_cache(self, beam_idx: torch.LongTensor):
50
+ """Reorders the cache for beam search, given the selected beam indices."""
51
+ for layer_idx in range(len(self.w_k)):
52
+ device = self.w_k[layer_idx].device
53
+ self.w_k[layer_idx] = self.w_k[layer_idx].index_select(0, beam_idx.to(device))
54
+
55
+ device = self.w_v[layer_idx].device
56
+ self.w_v[layer_idx] = self.w_v[layer_idx].index_select(0, beam_idx.to(device))
57
+
58
+ device = self.rf_q[layer_idx].device
59
+ self.rf_q[layer_idx] = self.rf_q[layer_idx].index_select(0, beam_idx.to(device))
60
+
61
+ device = self.rf_k[layer_idx].device
62
+ self.rf_k[layer_idx] = self.rf_k[layer_idx].index_select(0, beam_idx.to(device))
63
+
64
+ device = self.rf_v[layer_idx].device
65
+ self.rf_v[layer_idx] = self.rf_v[layer_idx].index_select(0, beam_idx.to(device))
66
+
67
+ device = self.softmax_phi_k_v[layer_idx].device
68
+ self.softmax_phi_k_v[layer_idx] = self.softmax_phi_k_v[layer_idx].index_select(0, beam_idx.to(device))
69
+
70
+ device = self.log_sum_phi_k[layer_idx].device
71
+ self.log_sum_phi_k[layer_idx] = self.log_sum_phi_k[layer_idx].index_select(0, beam_idx.to(device))
72
+
73
+ device = self.rf_k_bar[layer_idx].device
74
+ self.rf_k_bar[layer_idx] = self.rf_k_bar[layer_idx].index_select(0, beam_idx.to(device))
75
+
76
+ device = self.rf_mask[layer_idx].device
77
+ self.rf_mask[layer_idx] = self.rf_mask[layer_idx].index_select(0, beam_idx.to(device))
78
+
79
+ device = self.s_mask[layer_idx].device
80
+ self.s_mask[layer_idx] = self.s_mask[layer_idx].index_select(0, beam_idx.to(device))
81
+
82
+ device = self.chunk_mask[layer_idx].device
83
+ self.chunk_mask[layer_idx] = self.chunk_mask[layer_idx].index_select(0, beam_idx.to(device))
84
+ @property
85
+ def seen_tokens(self):
86
+ if hasattr(self, "_seen_tokens"):
87
+ return self._seen_tokens
88
+ else:
89
+ return None
90
+
91
+ def update_past_len(
92
+ self,
93
+ cur_q_len: int,
94
+ layer_idx: int
95
+ ):
96
+ # Update the number of seen tokens
97
+ if layer_idx == 0:
98
+ self._seen_tokens += cur_q_len
99
+ return self._seen_tokens
100
+
101
+ def update_mask(
102
+ self,
103
+ prev_s_mask,
104
+ cur_s_mask,
105
+ chunk_mask,
106
+ rf_mask,
107
+ layer_idx,
108
+ window_size,
109
+ chunk_size,
110
+ ):
111
+ ############################################
112
+ # compute masks for singletons
113
+ ############################################
114
+ q_len = None
115
+ if len(self.s_mask) <= layer_idx:
116
+ q_len = chunk_mask.shape[-2]
117
+ # prefill stage
118
+ # q is of shape [b, h, n, d]
119
+ if q_len < window_size:
120
+ assert prev_s_mask is None
121
+
122
+ # w_v = # [b, h, 1, j, d]
123
+ # store the past window-wise key-value pairs
124
+ self.s_mask.append(cur_s_mask[..., -1:, :] if cur_s_mask is not None else prev_s_mask[..., -1, -1:, :])
125
+ else:
126
+ # decoding stage
127
+ prev_s_mask = None
128
+
129
+ cached_s_mask = self.s_mask[layer_idx]
130
+ assert cached_s_mask is not None
131
+ if cached_s_mask.shape[-1] == window_size:
132
+ cur_s_mask = cur_s_mask
133
+ else:
134
+ cur_s_mask = torch.cat([cached_s_mask, cur_s_mask], dim=-1)
135
+
136
+ # store the past window-wise key-value pairs
137
+ self.s_mask[layer_idx] = cur_s_mask
138
+
139
+ ############################################
140
+ # compute masks for intra-chunks
141
+ ############################################
142
+ dump_rf_mask = None
143
+ if len(self.rf_mask) <= layer_idx:
144
+ # initialize chunk stats
145
+ # prefill stage
146
+ if q_len < chunk_size:
147
+ cur_rf_mask = rf_mask
148
+ else:
149
+ if q_len % chunk_size == 0:
150
+ dump_rf_mask = rf_mask
151
+ cur_rf_mask = None
152
+ else:
153
+ remainder_tokens = q_len % chunk_size
154
+ if rf_mask is not None:
155
+ dump_rf_mask, cur_rf_mask = torch.split(rf_mask, [q_len - remainder_tokens, remainder_tokens], dim=-2)
156
+ else:
157
+ dump_rf_mask = None
158
+ cur_rf_mask = None
159
+ self.rf_mask.append(cur_rf_mask)
160
+ else:
161
+ past_rf_mask = self.rf_mask[layer_idx]
162
+ if past_rf_mask is not None:
163
+ # when decoding tokens, we always assume the
164
+ # incoming token mask is 0 (not masked)
165
+ cur_rf_mask = torch.cat([past_rf_mask, rf_mask], dim=-2)
166
+ else:
167
+ # we do not need to use rf_mask anymore after we receive generated tokens
168
+ cur_rf_mask = None
169
+ # We need to store rf_k_bar and RFA-results that
170
+ # compute the per-chunk RFA.
171
+
172
+ # Dump the chunk if the len of current chunk reaches <chunk_size>.
173
+ if cur_rf_mask is not None and cur_rf_mask.shape[-2] == chunk_size:
174
+ dump_rf_mask = cur_rf_mask
175
+ cur_rf_mask = None
176
+
177
+ self.rf_mask[layer_idx] = cur_rf_mask
178
+
179
+ ############################################
180
+ # compute masks for inter chunks
181
+ ############################################
182
+ if len(self.chunk_mask) <= layer_idx:
183
+ # prefill stage
184
+ # q is of shape [b, h, n, d]
185
+ if q_len < window_size:
186
+ cur_chunk_mask = chunk_mask
187
+ prev_chunk_mask = None
188
+ else:
189
+ if q_len % window_size == 0:
190
+ cur_chunk_mask = None
191
+ prev_chunk_mask = chunk_mask
192
+ else:
193
+ remainder_tokens = q_len % window_size
194
+ # [b, h, n-r, d] [b, h, r, d]
195
+ prev_chunk_mask, cur_chunk_mask = torch.split(chunk_mask, [q_len - remainder_tokens, remainder_tokens], dim=-2)
196
+ bsz, num_heads, _, head_dim = prev_chunk_mask.shape
197
+ prev_chunk_mask = prev_chunk_mask.reshape(bsz, num_heads, -1, window_size, head_dim)
198
+
199
+ assert prev_s_mask is not None
200
+ if prev_s_mask.shape[-3] == 1 and prev_chunk_mask.shape[-3] > 1:
201
+ # need to expand
202
+ prev_s_mask = prev_s_mask.expand(-1, -1, prev_chunk_mask.shape[-3], -1, -1)
203
+ # w_v = # [b, h, 1, j, d]
204
+ # store the past window-wise key-value pairs
205
+ self.chunk_mask.append(cur_chunk_mask[..., -1:, :] if cur_chunk_mask is not None else prev_chunk_mask[..., -1, -1:, :])
206
+ else:
207
+ # decoding stage
208
+ prev_chunk_mask = None
209
+ cur_chunk_mask = self.chunk_mask[layer_idx]
210
+
211
+ # if the current sequence length reaches <chunk_size>,
212
+ # we append a new 1 to the end of chunk_mask
213
+ seen_seq_len = self.get_seq_length(layer_idx)
214
+ if seen_seq_len > 0 and seen_seq_len % chunk_size == 0:
215
+ past_chunk_mask = self.chunk_mask[layer_idx]
216
+ if past_chunk_mask is not None:
217
+ # when decoding tokens, we always assume the
218
+ # incoming token mask is 0 (not masked)
219
+ cur_chunk_mask = torch.cat([past_chunk_mask, chunk_mask], dim=-1)
220
+ else:
221
+ cur_chunk_mask = chunk_mask
222
+ self.chunk_mask[layer_idx] = cur_chunk_mask
223
+
224
+ # if the len of current sequence reaches <window_size> + 1,
225
+ # we turn on the mask for most recent chunks
226
+ if seen_seq_len > 0 and seen_seq_len % window_size == 1:
227
+ cur_chunk_mask = self.chunk_mask[layer_idx]
228
+ # we do not need to use rf_mask anymore after we receive generated tokens
229
+ num_chunks_per_window = window_size // chunk_size
230
+ cur_chunk_mask[..., -num_chunks_per_window:] = False
231
+ self.chunk_mask[layer_idx] = cur_chunk_mask
232
+
233
+ return (prev_s_mask, cur_s_mask, prev_chunk_mask, cur_chunk_mask, dump_rf_mask)
234
+
235
+ def update_singletons(
236
+ self,
237
+ q,
238
+ k,
239
+ v,
240
+ layer_idx,
241
+ window_size,
242
+ ):
243
+ if len(self.w_k) <= layer_idx:
244
+ # prefill stage
245
+ # q is of shape [b, h, n, d]
246
+ q_len = q.shape[-2]
247
+ if q_len < window_size:
248
+ w_q = q
249
+ w_k = k
250
+ w_v = v
251
+ past_w_q = past_w_k = past_w_v = None
252
+ else:
253
+ if q_len % window_size == 0:
254
+ w_q = None
255
+ w_k = None
256
+ w_v = None
257
+ past_w_q = q
258
+ past_w_k = k
259
+ past_w_v = v
260
+ else:
261
+ remainder_tokens = q_len % window_size
262
+ # [b, h, n-r, d] [b, h, r, d]
263
+ past_w_q, w_q = torch.split(q, [q_len - remainder_tokens, remainder_tokens], dim=-2)
264
+ past_w_k, w_k = torch.split(k, [q_len - remainder_tokens, remainder_tokens], dim=-2)
265
+ past_w_v, w_v = torch.split(v, [q_len - remainder_tokens, remainder_tokens], dim=-2)
266
+ bsz, num_heads, _, head_dim = past_w_q.shape
267
+ past_w_q = past_w_q.reshape(bsz, num_heads, -1, window_size, head_dim)
268
+ past_w_k = past_w_k.reshape(bsz, num_heads, -1, window_size, head_dim)
269
+ past_w_v = past_w_v.reshape(bsz, num_heads, -1, window_size, head_dim)
270
+ # w_q = q[..., None, -window_size:, :] # [b, h, 1, j, d]
271
+ # w_k = # [b, h, 1, j, d]
272
+ # w_v = # [b, h, 1, j, d]
273
+ # store the past window-wise key-value pairs
274
+ # if w_k is None, it means we happen to pass in a sequence that is divisible by window_size
275
+ # we leave the cache with window_size-sized kv pairs to be cleared next iteration
276
+ self.w_k.append(w_k if w_k is not None else past_w_k[..., -1, :, :])
277
+ self.w_v.append(w_v if w_v is not None else past_w_v[..., -1, :, :])
278
+ else:
279
+ # decoding stage
280
+ past_w_q = past_w_k = past_w_v = None
281
+ # this is implemented as either a sliding window or fixed window
282
+ w_q = q # [b, h, 1, d]
283
+ w_k = k # [b, h, 1, d]
284
+ w_v = v # [b, h, 1, d]
285
+
286
+ cached_w_k = self.w_k[layer_idx]
287
+ assert cached_w_k is not None # [b, h, j, d]
288
+ if cached_w_k.shape[-2] == window_size:
289
+ w_k = w_k
290
+ else:
291
+ w_k = torch.cat([cached_w_k, w_k], dim=-2)
292
+
293
+ cached_w_v = self.w_v[layer_idx]
294
+ assert cached_w_v is not None
295
+ if cached_w_v.shape[-2] == window_size:
296
+ w_v = w_v
297
+ else:
298
+ w_v = torch.cat([cached_w_v, w_v], dim=-2)
299
+
300
+ # store the past window-wise key-value pairs
301
+ self.w_k[layer_idx] = w_k
302
+ self.w_v[layer_idx] = w_v
303
+ return (past_w_q, past_w_k, past_w_v), (w_q, w_k, w_v)
304
+
305
+ def update_chunks(
306
+ self,
307
+ q,
308
+ k,
309
+ v,
310
+ layer_idx,
311
+ chunk_size
312
+ ):
313
+ q_len = q.shape[-2]
314
+ dump_q = None
315
+ dump_k = None
316
+ dump_v = None
317
+ if len(self.rf_q) <= layer_idx:
318
+ # initialize chunk stats
319
+ # prefill stage
320
+ if q_len < chunk_size:
321
+ rf_q = q
322
+ rf_k = k
323
+ rf_v = v
324
+ else:
325
+ if q_len % chunk_size == 0:
326
+ rf_q = None
327
+ rf_k = None
328
+ rf_v = None
329
+ dump_q = q
330
+ dump_k = k
331
+ dump_v = v
332
+ else:
333
+ remainder_tokens = q_len % chunk_size
334
+ # [b, h, n-r, d] [b, h, r, d]
335
+ dump_q, rf_q = torch.split(q, [q_len - remainder_tokens, remainder_tokens], dim=-2)
336
+ dump_k, rf_k = torch.split(k, [q_len - remainder_tokens, remainder_tokens], dim=-2)
337
+ dump_v, rf_v = torch.split(v, [q_len - remainder_tokens, remainder_tokens], dim=-2)
338
+ self.rf_q.append(rf_q)
339
+ self.rf_k.append(rf_k)
340
+ self.rf_v.append(rf_v)
341
+ else:
342
+ # decode tokens
343
+ # add query, key & value to the current chunk.
344
+ past_rf_q = self.rf_q[layer_idx]
345
+ if past_rf_q is not None:
346
+ rf_q = torch.cat([past_rf_q, q], dim=-2)
347
+ else:
348
+ rf_q = q
349
+
350
+ past_rf_k = self.rf_k[layer_idx]
351
+ if past_rf_k is not None:
352
+ rf_k = torch.cat([past_rf_k, k], dim=-2)
353
+ else:
354
+ rf_k = k
355
+
356
+ past_rf_v = self.rf_v[layer_idx]
357
+ if past_rf_v is not None:
358
+ rf_v = torch.cat([past_rf_v, v], dim=-2)
359
+ else:
360
+ rf_v = v
361
+
362
+ # We need to store rf_k_bar and the RFA statistics used to
+ # compute the per-chunk RFA.
364
+
365
+ # Dump the chunk if the len of current chunk reaches <chunk_size>.
366
+ if rf_q.shape[-2] == chunk_size:
367
+ dump_q = rf_q
368
+ dump_k = rf_k
369
+ dump_v = rf_v
370
+ # clear the chunk
371
+ rf_q = None
372
+ rf_k = None
373
+ rf_v = None
374
+
375
+ self.rf_q[layer_idx] = rf_q
376
+ self.rf_k[layer_idx] = rf_k
377
+ self.rf_v[layer_idx] = rf_v
378
+
379
+ return dump_q, dump_k, dump_v
380
+
381
+ def update_chunk_rfas(
382
+ self,
383
+ softmax_phi_k_v,
384
+ log_sum_phi_k,
385
+ rf_k_bar,
386
+ layer_idx,
387
+ random_feature_dim
388
+ ):
389
+ if len(self.softmax_phi_k_v) <= layer_idx:
390
+ # prefill stage
391
+ self.softmax_phi_k_v.append(softmax_phi_k_v)
392
+ self.log_sum_phi_k.append(log_sum_phi_k)
393
+ self.rf_k_bar.append(rf_k_bar)
394
+ else:
395
+ # token decoding
396
+ past_softmax_phi_k_v = self.softmax_phi_k_v[layer_idx]
397
+ past_log_sum_phi_k = self.log_sum_phi_k[layer_idx]
398
+ past_rf_k_bar = self.rf_k_bar[layer_idx]
399
+
400
+ if past_softmax_phi_k_v is not None:
401
+ if random_feature_dim == 1:
402
+ dim = -2
403
+ else:
404
+ dim = -3
405
+ softmax_phi_k_v = torch.cat([past_softmax_phi_k_v, softmax_phi_k_v], dim=dim)
406
+
407
+ if past_log_sum_phi_k is not None:
408
+ if random_feature_dim == 1:
409
+ dim = -2
410
+ else:
411
+ dim = -3
412
+ log_sum_phi_k = torch.cat([past_log_sum_phi_k, log_sum_phi_k], dim=dim)
413
+
414
+ if past_rf_k_bar is not None:
415
+ rf_k_bar = torch.cat([past_rf_k_bar, rf_k_bar], dim=-2)
416
+
417
+ self.softmax_phi_k_v[layer_idx] = softmax_phi_k_v
418
+ self.log_sum_phi_k[layer_idx] = log_sum_phi_k
419
+ self.rf_k_bar[layer_idx] = rf_k_bar
420
+
421
+ return softmax_phi_k_v, log_sum_phi_k, rf_k_bar
422
+
423
+ def get_chunk_rfas(self, layer_idx):
424
+ if len(self.softmax_phi_k_v) <= layer_idx:
425
+ return (
426
+ None,
427
+ None,
428
+ None
429
+ )
430
+ else:
431
+ return (
432
+ self.softmax_phi_k_v[layer_idx],
433
+ self.log_sum_phi_k[layer_idx],
434
+ self.rf_k_bar[layer_idx]
435
+ )
436
+
437
+ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
438
+ """Returns the sequence length of the cached states. A layer index can be optionally passed."""
439
+ if len(self.w_k) <= layer_idx:
440
+ return 0
441
+ return self._seen_tokens
442
+
443
+ def get_max_length(self) -> Optional[int]:
444
+ """Returns the maximum sequence length of the cached states. This cache does not have a maximum length."""
445
+ return None
446
+
447
+ def update(
448
+ self,
449
+ layer_idx: int,
450
+ cache_kwargs: Optional[Dict[str, Any]] = None,
451
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
452
+ raise NotImplementedError("`update` is not used in Eva Cache.")
453
+
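+ # Sketch of the intended per-layer call pattern (a reading of the methods above,
+ # not part of the original file): during prefill, each `update_*` method appends a
+ # new layer-indexed entry (the `len(...) <= layer_idx` branch); during decoding it
+ # concatenates the new token's stats onto that entry and dumps a chunk/window once
+ # it fills up.
+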
454
+ class EvaStaticCacheForTriton(Cache):
455
+ """
456
+ A variant of EvaCache for eva's triton kernels
457
+ """
458
+
459
+ def __init__(
460
+ self,
461
+ batch_size,
462
+ num_key_value_heads,
463
+ window_size,
464
+ head_dim,
465
+ num_layers,
466
+ dtype,
467
+ device
468
+ ) -> None:
469
+ self.past_window_k: List[torch.Tensor] = []
470
+ self.past_window_v: List[torch.Tensor] = []
471
+
472
+ cache_shape = (batch_size, num_key_value_heads, window_size, head_dim)
473
+ for idx in range(num_layers):
474
+ new_window_k = torch.zeros(cache_shape, dtype=dtype, device=device)
475
+ new_window_v = torch.zeros(cache_shape, dtype=dtype, device=device)
476
+ self.past_window_k.append(new_window_k)
477
+ self.past_window_v.append(new_window_v)
478
+
479
+ self.past_window_pos: List[int] = []
480
+
481
+ self.rfa_k: List[torch.Tensor] = []
482
+ self.rfa_v: List[torch.Tensor] = []
483
+ # self.rfa_mask: List[torch.Tensor] = []
484
+
485
+ self._seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen
486
+
487
+ # attention masks temporary buffer
488
+ self.rf_mask: List[Optional[torch.Tensor]] = []
489
+ self.s_mask: List[torch.Tensor] = []
490
+
491
+ def __len__(self):
492
+ """
493
+ Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds
494
+ to the number of layers in the model.
495
+ """
496
+ return len(self.past_window_pos)
497
+
498
+ def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = 0) -> int:
499
+ """Given the sequence length of the new inputs, returns the usable length of the cache."""
500
+ # Cache without size limit -> all cache is usable
501
+ # Cache with size limit -> if the cache length plus the length of the new inputs is larger than the maximum cache
+ # length, we will need to evict part of the cache (and thus not all cache is usable)
503
+ max_length = self.get_max_length()
504
+ previous_seq_length = self.get_seq_length(layer_idx)
505
+ if max_length is not None and previous_seq_length + new_seq_length > max_length:
506
+ return max_length - new_seq_length
507
+ return previous_seq_length
508
+
509
+ def reorder_cache(self, beam_idx: torch.LongTensor):
510
+ """Reorders the cache for beam search, given the selected beam indices."""
511
+ for layer_idx in range(len(self.past_window_k)):
512
+ device = self.past_window_k[layer_idx].device
513
+ self.past_window_k[layer_idx] = self.past_window_k[layer_idx].index_select(0, beam_idx.to(device))
514
+
515
+ device = self.past_window_v[layer_idx].device
516
+ self.past_window_v[layer_idx] = self.past_window_v[layer_idx].index_select(0, beam_idx.to(device))
517
+
518
+ device = self.rfa_k[layer_idx].device
519
+ self.rfa_k[layer_idx] = self.rfa_k[layer_idx].index_select(0, beam_idx.to(device))
520
+
521
+ device = self.rfa_v[layer_idx].device
522
+ self.rfa_v[layer_idx] = self.rfa_v[layer_idx].index_select(0, beam_idx.to(device))
523
+
524
+ # device = self.rfa_mask[layer_idx].device
525
+ # self.rfa_mask[layer_idx] = self.rfa_mask[layer_idx].index_select(0, beam_idx.to(device))
526
+
527
+ device = self.rf_mask[layer_idx].device
528
+ self.rf_mask[layer_idx] = self.rf_mask[layer_idx].index_select(0, beam_idx.to(device))
529
+
530
+ device = self.s_mask[layer_idx].device
531
+ self.s_mask[layer_idx] = self.s_mask[layer_idx].index_select(0, beam_idx.to(device))
532
+
533
+ @property
534
+ def seen_tokens(self):
535
+ if hasattr(self, "_seen_tokens"):
536
+ return self._seen_tokens
537
+ else:
538
+ return None
539
+
540
+ def update_past_len(
541
+ self,
542
+ cur_q_len: int,
543
+ layer_idx: int
544
+ ):
545
+ # Update the number of seen tokens
546
+ if layer_idx == 0:
547
+ self._seen_tokens += cur_q_len
548
+ return self._seen_tokens
549
+
550
+ def update_mask(
551
+ self,
552
+ s_mask,
553
+ rf_mask,
554
+ layer_idx,
555
+ window_size,
556
+ ):
557
+ ############################################
558
+ # compute masks for singletons
559
+ ############################################
560
+ if len(self.s_mask) <= layer_idx:
561
+ # prefill stage
562
+ # q is of shape [b, h, n, d]
563
+ # s_v = # [b, h, 1, j, d]
564
+ # store the past window-wise key-value pairs
565
+ if s_mask is None:
566
+ cur_s_mask = None
567
+ else:
568
+ q_len = s_mask.shape[-2]
569
+ # s_mask is of shape [b, h, n, w]
570
+ # let r = q_len % window_size
571
+ # if r == 0, the mask to be appended is of shape [b, h, 1, w]
572
+ # otherwise, r < w, the mask to be appended is of shape [b, h, 1, r]
573
+ remainder_tokens = q_len % window_size
574
+ if remainder_tokens == 0:
575
+ cur_s_mask = None
576
+ else:
577
+ cur_s_mask = s_mask[..., -1:, :remainder_tokens]
578
+ self.s_mask.append(cur_s_mask)
579
+ # we use the passed s_mask for subsequent computations
580
+ dump_s_mask = s_mask
581
+ else:
582
+ # decoding stage
583
+ past_s_mask = self.s_mask[layer_idx]
584
+ if past_s_mask is None:
585
+ assert s_mask is None
586
+ cur_s_mask = None
587
+ else:
588
+ assert s_mask is not None
589
+ cur_s_mask = torch.cat([past_s_mask, s_mask], dim=-1)
590
+
591
+ dump_s_mask = cur_s_mask
592
+ if cur_s_mask is not None and cur_s_mask.shape[-1] == window_size:
593
+ cur_s_mask = None
594
+ # store the past window-wise key-value pairs
595
+ self.s_mask[layer_idx] = cur_s_mask
596
+
597
+ ############################################
598
+ # compute masks for intra-chunks
599
+ ############################################
600
+ dump_rf_mask = None
601
+ if len(self.rf_mask) <= layer_idx:
602
+ # initialize chunk stats
603
+ # prefill stage
604
+ if rf_mask is None:
605
+ cur_rf_mask = None
606
+ else:
607
+ q_len = rf_mask.shape[-2]
608
+ if q_len < window_size:
609
+ dump_rf_mask = None
610
+ cur_rf_mask = rf_mask
611
+ else:
612
+ if q_len % window_size == 0:
613
+ dump_rf_mask = rf_mask
614
+ cur_rf_mask = None
615
+ else:
616
+ remainder_tokens = q_len % window_size
617
+ dump_rf_mask, cur_rf_mask = torch.split(rf_mask, [q_len - remainder_tokens, remainder_tokens], dim=-2)
618
+ self.rf_mask.append(cur_rf_mask)
619
+ else:
620
+ past_rf_mask = self.rf_mask[layer_idx]
621
+ if past_rf_mask is not None:
622
+ # when decoding tokens, we always assume the
623
+ # incoming token mask is 0 (not masked)
624
+ cur_rf_mask = torch.cat([past_rf_mask, rf_mask], dim=-2)
625
+ else:
626
+ cur_rf_mask = None
627
+
628
+ if cur_rf_mask is not None and cur_rf_mask.shape[-2] == window_size:
629
+ dump_rf_mask = cur_rf_mask
630
+ cur_rf_mask = None
631
+
632
+ self.rf_mask[layer_idx] = cur_rf_mask
633
+
634
+ return dump_s_mask, dump_rf_mask
635
+
636
+ def update_singletons_and_chunks(
637
+ self,
638
+ k,
639
+ v,
640
+ layer_idx,
641
+ window_size,
642
+ ):
643
+ if len(self.past_window_pos) <= layer_idx:
644
+ # prefill stage
645
+ s_k = k
646
+ s_v = v
647
+ input_len = k.shape[-2]
648
+ window_pos = 0
649
+ if input_len <= window_size:
650
+ new_window_pos = window_pos + input_len
651
+
652
+ cached_window_k = k
653
+ cached_window_v = v
654
+ dump_k = None
655
+ dump_v = None
656
+ else:
657
+ remainder_tokens = input_len % window_size
658
+ if remainder_tokens == 0:
659
+ remainder_tokens = window_size
660
+ new_window_pos = window_pos + remainder_tokens
661
+
662
+ # [b, h, n-r, d] [b, h, r, d]
663
+ cached_window_k = k[..., -remainder_tokens:, :]
664
+ cached_window_v = v[..., -remainder_tokens:, :]
665
+ dump_k = k[..., :-remainder_tokens, :]
666
+ dump_v = v[..., :-remainder_tokens, :]
667
+ # store the past window-wise key-value pairs
668
+ self.past_window_k[layer_idx][:, :, window_pos : new_window_pos, :] = cached_window_k
669
+ self.past_window_v[layer_idx][:, :, window_pos : new_window_pos, :] = cached_window_v
670
+ self.past_window_pos.append(new_window_pos)
671
+ else:
672
+ # decoding stage
673
+ # if the previous cache has full tokens,
674
+ # roll back to the first elements
675
+ if self.past_window_pos[layer_idx] == window_size:
676
+ self.past_window_pos[layer_idx] = 0
677
+ dump_k = self.past_window_k[layer_idx].clone()
678
+ dump_v = self.past_window_v[layer_idx].clone()
679
+ else:
680
+ dump_k = None
681
+ dump_v = None
682
+
683
+ input_len = k.shape[-2]
684
+ window_pos = self.past_window_pos[layer_idx]
685
+ new_window_pos = window_pos + input_len
686
+
687
+ self.past_window_k[layer_idx][:, :, window_pos : new_window_pos, :] = k
688
+ self.past_window_v[layer_idx][:, :, window_pos : new_window_pos, :] = v
689
+
690
+ s_k = self.past_window_k[layer_idx][:, :, : new_window_pos, :]
691
+ s_v = self.past_window_v[layer_idx][:, :, : new_window_pos, :]
692
+
693
+ self.past_window_pos[layer_idx] = new_window_pos
694
+
695
+ return s_k, s_v, dump_k, dump_v
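+
+ # Note (added summary, not in the original file): past_window_k/past_window_v act
+ # as a fixed-size per-layer buffer of length window_size; once past_window_pos
+ # reaches window_size during decoding, the filled window is dumped for chunk-level
+ # RFA and the write position wraps back to 0.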
696
+
697
+ def update_chunk_rfas(
698
+ self,
699
+ rfa_k,
700
+ rfa_v,
701
+ layer_idx,
702
+ ):
703
+ if len(self.rfa_k) <= layer_idx:
704
+ # prefill stage
705
+ self.rfa_k.append(rfa_k)
706
+ self.rfa_v.append(rfa_v)
707
+ else:
708
+ # token decoding
709
+ past_rfa_k = self.rfa_k[layer_idx]
710
+ past_rfa_v = self.rfa_v[layer_idx]
711
+
712
+ if past_rfa_k is not None:
713
+ rfa_k = torch.cat([past_rfa_k, rfa_k], dim=-2)
714
+
715
+ if past_rfa_v is not None:
716
+ rfa_v = torch.cat([past_rfa_v, rfa_v], dim=-2)
717
+
718
+ self.rfa_k[layer_idx] = rfa_k
719
+ self.rfa_v[layer_idx] = rfa_v
720
+
721
+ return rfa_k, rfa_v
722
+
723
+ def get_past_window_pos(self, layer_idx):
724
+ if len(self.past_window_pos) <= layer_idx:
725
+ return None
726
+ else:
727
+ return self.past_window_pos[layer_idx]
728
+
729
+ def get_past_window_kv(self, layer_idx):
730
+ if len(self.past_window_pos) <= layer_idx:
731
+ return None, None
732
+ else:
733
+ return (
734
+ self.past_window_k[layer_idx][:, :, : self.past_window_pos[layer_idx], :],
735
+ self.past_window_v[layer_idx][:, :, : self.past_window_pos[layer_idx], :]
736
+ )
737
+
738
+ def get_chunk_rfas(self, layer_idx):
739
+ if len(self.rfa_k) <= layer_idx:
740
+ return None, None
741
+ else:
742
+ return self.rfa_k[layer_idx], self.rfa_v[layer_idx]
743
+
744
+ def get_seq_length(self, layer_idx = 0) -> int:
745
+ """Returns the sequence length of the cached states. A layer index can be optionally passed."""
746
+ # layer_idx should be passed explicitly: otherwise any layer > 0 would
+ # only see _seen_tokens after layer 0 has already updated it
748
+ if len(self.past_window_pos) <= layer_idx:
749
+ return 0
750
+ return self._seen_tokens
751
+
752
+ def get_max_length(self) -> Optional[int]:
753
+ """Returns the maximum sequence length of the cached states. This cache does not have a maximum length."""
754
+ return None
755
+
756
+ def update(
757
+ self,
758
+ layer_idx: int,
759
+ cache_kwargs: Optional[Dict[str, Any]] = None,
760
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
761
+ raise NotImplementedError("`update` is not used in Eva Cache.")
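+
+ # A minimal construction sketch for the Triton-path cache (values, tensors, and
+ # `prompt_len` below are illustrative assumptions, not part of the original file):
+ # cache = EvaStaticCacheForTriton(
+ #     batch_size=1, num_key_value_heads=8, window_size=256, head_dim=64,
+ #     num_layers=32, dtype=torch.bfloat16, device="cuda",
+ # )
+ # cache.update_past_len(cur_q_len=prompt_len, layer_idx=0)
+ # s_k, s_v, dump_k, dump_v = cache.update_singletons_and_chunks(k, v, 0, window_size=256)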
eva_prep_kv_kernel.py ADDED
@@ -0,0 +1,357 @@
1
+
2
+ import math
3
+ import torch
4
+ import triton
5
+ import triton.language as tl
6
+
7
+ @triton.heuristics(
8
+ {
9
+ "EVEN_N": lambda args: args["seqlen"] % args["BLOCK_N"] == 0,
10
+ "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
11
+ }
12
+ )
13
+ @triton.jit
14
+ def _fwd_eva_prep_kv_kernel(
15
+ K, # [b, h, n, d]
16
+ V, # [b, h, n, d]
17
+ PARAM_MU, # [1, h, 1, 1, d]
18
+ PARAM_PHI, # [1, h, 1, 1, d]
19
+ ChunkMask, # [b, h, n, 1]
20
+ Out_RFA_K, # [b, h, c, d]
21
+ Out_RFA_V, # [b, h, c, d]
22
+ softmax_scale,
23
+ stride_kb, stride_kh, stride_kn,
24
+ stride_vb, stride_vh, stride_vn,
25
+ stride_mu_h,
26
+ stride_phi_h,
27
+ stride_mb, stride_mn,
28
+ stride_ok_b, stride_ok_h, stride_ok_c,
29
+ stride_ov_b, stride_ov_h, stride_ov_c,
30
+ nheads,
31
+ seqlen,
32
+ nchunks,
33
+ headdim,
34
+ CACHE_KEY_SEQLEN, # TODO: why keeping this
35
+ CACHE_KEY_NCHUNKS, # TODO: why keeping this
36
+ CHUNKS_PER_BLOCK: tl.constexpr,
37
+ CHUNK_SIZE: tl.constexpr,
38
+ MASK_TYPE: tl.constexpr,
39
+ BLOCK_HEADDIM: tl.constexpr,
40
+ EVEN_N: tl.constexpr,
41
+ EVEN_HEADDIM: tl.constexpr,
42
+ BLOCK_N: tl.constexpr,
43
+ ):
44
+ start_n = tl.program_id(0)
45
+ offs_bh = tl.program_id(1)
46
+ offs_h = offs_bh % nheads
47
+ offs_b = offs_bh // nheads
48
+ # initialize offsets
49
+ # we load BLOCK_N keys and values each time, and
50
+ # reshape it to [CHUNKS_PER_BLOCK, CHUNK_SIZE]
51
+ offs_c = tl.arange(0, CHUNKS_PER_BLOCK)
52
+ offs_m = tl.arange(0, CHUNK_SIZE)
53
+ offs_d = tl.arange(0, BLOCK_HEADDIM)
54
+
55
+ k_ptrs = (
56
+ K +
57
+ offs_b * stride_kb +
58
+ offs_h * stride_kh +
59
+ (
60
+ (
61
+ start_n * BLOCK_N +
62
+ offs_c[:, None, None] * CHUNK_SIZE +
63
+ offs_m[None, :, None]
64
+ ) * stride_kn +
65
+ offs_d[None, None, :]
66
+ )
67
+ )
68
+ v_ptrs = (
69
+ V +
70
+ offs_b * stride_vb +
71
+ offs_h * stride_vh +
72
+ (
73
+ (
74
+ start_n * BLOCK_N +
75
+ offs_c[:, None, None] * CHUNK_SIZE +
76
+ offs_m[None, :, None]
77
+ ) * stride_vn +
78
+ offs_d[None, None, :]
79
+ )
80
+ )
81
+ param_mu_ptrs = (
82
+ PARAM_MU +
83
+ offs_h * stride_mu_h +
84
+ offs_d[None, None, :]
85
+ )
86
+ param_phi_ptrs = (
87
+ PARAM_PHI +
88
+ offs_h * stride_phi_h +
89
+ offs_d[None, None, :]
90
+ )
91
+ log2e = 1.4426950408889634
92
+ if MASK_TYPE == 1:
93
+ m_ptrs = (
94
+ ChunkMask +
95
+ offs_b * stride_mb +
96
+ (
97
+ (
98
+ start_n * BLOCK_N +
99
+ offs_c[:, None] * CHUNK_SIZE +
100
+ offs_m[None, :]
101
+ ) * stride_mn
102
+ )
103
+ )
104
+ if EVEN_N:
105
+ if EVEN_HEADDIM:
106
+ k = tl.load(
107
+ k_ptrs
108
+ )
109
+ else:
110
+ k = tl.load(
111
+ k_ptrs,
112
+ mask=offs_d[None, None, :] < headdim,
113
+ other=0.0
114
+ )
115
+ else:
116
+ if EVEN_HEADDIM:
117
+ k = tl.load(
118
+ k_ptrs,
119
+ mask=(
120
+ start_n * BLOCK_N +
121
+ offs_c[:, None, None] * CHUNK_SIZE +
122
+ offs_m[None, :, None]
123
+ ) < seqlen,
124
+ other=0.0
125
+ )
126
+ else:
127
+ k = tl.load(
128
+ k_ptrs,
129
+ mask=(
130
+ (
131
+ start_n * BLOCK_N +
132
+ offs_c[:, None, None] * CHUNK_SIZE +
133
+ offs_m[None, :, None]
134
+ ) < seqlen
135
+ ) & (offs_d[None, None, :] < headdim),
136
+ other=0.0
137
+ )
138
+
139
+ param_mu = tl.load(param_mu_ptrs).to(k.dtype)
140
+ rfa_k_c_w = tl.zeros([CHUNKS_PER_BLOCK, CHUNK_SIZE], dtype=tl.float32)
141
+ rfa_k_c_w += tl.sum(k * param_mu, axis=-1)
142
+ rfa_k_c_w *= log2e
143
+ if MASK_TYPE == 1:
144
+ if EVEN_N:
145
+ mask = tl.load(
146
+ m_ptrs
147
+ ).to(tl.float32)
148
+ else:
149
+ mask = tl.load(
150
+ m_ptrs,
151
+ mask=(
152
+ start_n * BLOCK_N +
153
+ offs_c[:, None] * CHUNK_SIZE +
154
+ offs_m[None, :]
155
+ ) < seqlen,
156
+ other=0.0,
157
+ ).to(tl.float32)
158
+ rfa_k_c_w = rfa_k_c_w + mask
159
+
160
+ rfa_k_c_w = tl.exp2(rfa_k_c_w - tl.max(rfa_k_c_w, axis=-1)[:, None])
161
+ rfa_k_c_w = rfa_k_c_w / tl.sum(rfa_k_c_w, axis=-1)[:, None]
162
+ rfa_k_c = tl.sum(k * rfa_k_c_w[:, :, None].to(k.dtype), axis=-2)
163
+ # TODO: understand why rematerializing offsets here saves registers
164
+ offs_out_c = start_n * CHUNKS_PER_BLOCK + tl.arange(0, CHUNKS_PER_BLOCK)
165
+ out_rfa_k_ptrs = (
166
+ Out_RFA_K +
167
+ offs_b * stride_ok_b +
168
+ offs_h * stride_ok_h +
169
+ (offs_out_c[:, None] * stride_ok_c + offs_d[None, :])
170
+ )
171
+
172
+ if EVEN_N:
173
+ if EVEN_HEADDIM:
174
+ tl.store(
175
+ out_rfa_k_ptrs, rfa_k_c
176
+ )
177
+ else:
178
+ tl.store(
179
+ out_rfa_k_ptrs, rfa_k_c,
180
+ mask=offs_d[None, :] < headdim
181
+ )
182
+ else:
183
+ if EVEN_HEADDIM:
184
+ tl.store(
185
+ out_rfa_k_ptrs, rfa_k_c,
186
+ mask=offs_out_c[:, None] < nchunks
187
+ )
188
+ else:
189
+ tl.store(
190
+ out_rfa_k_ptrs, rfa_k_c,
191
+ mask=(offs_out_c[:, None] < nchunks) & (offs_d[None, :] < headdim)
192
+ )
193
+
194
+
195
+ param_phi = tl.load(param_phi_ptrs).to(k.dtype)
196
+ rfa_v_c_w = tl.zeros([CHUNKS_PER_BLOCK, CHUNK_SIZE], dtype=tl.float32)
197
+ rfa_v_c_w += tl.sum(k * param_phi, axis=-1)
198
+ rfa_v_c_w -= (0.5 * tl.sum(k * k, axis=-1))
199
+ rfa_v_c_w *= log2e * softmax_scale
200
+ if not EVEN_N: # Need to mask out otherwise the softmax is wrong
201
+ rfa_v_c_w += tl.where(
202
+ (
203
+ start_n * BLOCK_N +
204
+ offs_c[:, None] * CHUNK_SIZE +
205
+ offs_m[None, :]
206
+ ) < seqlen,
207
+ 0,
208
+ float("-inf")
209
+ )
210
+
211
+ if MASK_TYPE == 1:
212
+ rfa_v_c_w = rfa_v_c_w + mask
213
+
214
+ if EVEN_N:
215
+ if EVEN_HEADDIM:
216
+ v = tl.load(
217
+ v_ptrs
218
+ )
219
+ else:
220
+ v = tl.load(
221
+ v_ptrs,
222
+ mask=offs_d[None, None, :] < headdim,
223
+ other=0.0
224
+ )
225
+ else:
226
+ if EVEN_HEADDIM:
227
+ v = tl.load(
228
+ v_ptrs,
229
+ mask=(
230
+ start_n * BLOCK_N +
231
+ offs_c[:, None, None] * CHUNK_SIZE +
232
+ offs_m[None, :, None]
233
+ ) < seqlen,
234
+ other=0.0
235
+ )
236
+ else:
237
+ v = tl.load(
238
+ v_ptrs,
239
+ mask=(
240
+ (
241
+ start_n * BLOCK_N +
242
+ offs_c[:, None, None] * CHUNK_SIZE +
243
+ offs_m[None, :, None]
244
+ ) < seqlen
245
+ ) & (offs_d[None, None, :] < headdim),
246
+ other=0.0
247
+ )
248
+
249
+ rfa_v_c_w = tl.exp2(rfa_v_c_w - tl.max(rfa_v_c_w, axis=-1)[:, None])
250
+ rfa_v_c_w = rfa_v_c_w / tl.sum(rfa_v_c_w, axis=-1)[:, None]
251
+ rfa_v_c = tl.sum(v * rfa_v_c_w[:, :, None].to(v.dtype), axis=-2)
252
+
253
+ offs_out_c = start_n * CHUNKS_PER_BLOCK + tl.arange(0, CHUNKS_PER_BLOCK)
254
+ out_rfa_v_ptrs = (
255
+ Out_RFA_V +
256
+ offs_b * stride_ov_b +
257
+ offs_h * stride_ov_h +
258
+ (offs_out_c[:, None] * stride_ov_c + offs_d[None, :])
259
+ )
260
+ if EVEN_N:
261
+ if EVEN_HEADDIM:
262
+ tl.store(
263
+ out_rfa_v_ptrs, rfa_v_c
264
+ )
265
+ else:
266
+ tl.store(
267
+ out_rfa_v_ptrs, rfa_v_c,
268
+ mask=offs_d[None, :] < headdim
269
+ )
270
+ else:
271
+ if EVEN_HEADDIM:
272
+ tl.store(
273
+ out_rfa_v_ptrs, rfa_v_c,
274
+ mask=offs_out_c[:, None] < nchunks
275
+ )
276
+ else:
277
+ tl.store(
278
+ out_rfa_v_ptrs, rfa_v_c,
279
+ mask=(offs_out_c[:, None] < nchunks) & (offs_d[None, :] < headdim)
280
+ )
281
+
282
+ def triton_eva_prep_kv_fwd(k, v, param_mu, param_phi, chunk_mask, softmax_scale, chunksize):
283
+ k, v, param_mu, param_phi = [
284
+ x if x.stride(-1) == 1 else x.contiguous()
285
+ for x in [k, v, param_mu, param_phi]
286
+ ]
287
+
288
+ # shape constraints
289
+ batch, nheads, seqlen, head_dim = k.shape
290
+ assert seqlen % chunksize == 0, "seqlen must be divisible by chunksize"
291
+ nchunks = seqlen // chunksize
292
+ assert k.shape == (batch, nheads, seqlen, head_dim)
293
+ assert v.shape == (batch, nheads, seqlen, head_dim)
294
+ assert param_mu.shape == (1, nheads, 1, 1, head_dim)
295
+ assert param_phi.shape == (1, nheads, 1, 1, head_dim)
296
+ assert head_dim <= 128, "We only test head dimensions up to 128"
297
+ assert k.dtype == v.dtype == param_mu.dtype == param_phi.dtype, "All tensors must have the same type"
298
+ assert k.dtype in [torch.bfloat16, torch.float], "Only support bf16 and fp32 for now"
299
+ assert k.is_cuda and v.is_cuda
300
+ softmax_scale = softmax_scale or 1.0 / math.sqrt(head_dim)
301
+
302
+ mask_type = 0
303
+ if chunk_mask is not None:
304
+ mask_type = 1
305
+ assert chunk_mask.dtype == k.dtype
306
+ assert chunk_mask.is_cuda
307
+ assert chunk_mask.dim() == 4
308
+ assert chunk_mask.shape == (batch, 1, seqlen, 1)
309
+ if chunk_mask.stride(-1) != 1:
310
+ chunk_mask = chunk_mask.contiguous()
311
+ mask_strides = (
312
+ (chunk_mask.stride(0), chunk_mask.stride(2))
313
+ if mask_type == 1 else
314
+ (0, 0)
315
+ )
316
+ out_rfa_k = torch.empty((batch, nheads, nchunks, head_dim), dtype=k.dtype, device=k.device)
317
+ out_rfa_v = torch.empty((batch, nheads, nchunks, head_dim), dtype=v.dtype, device=v.device)
318
+
319
+ BLOCK_HEADDIM = max(triton.next_power_of_2(head_dim), 16)
320
+ BLOCK = 128
321
+ num_warps = 4 if head_dim <= 64 else 8
322
+
323
+ assert BLOCK > chunksize and BLOCK % chunksize == 0, "BLOCK must be larger than and divisible by chunksize"
324
+ chunks_per_block = BLOCK // chunksize
325
+
326
+ grid = lambda META: (triton.cdiv(seqlen, META["BLOCK_N"]), batch * nheads)
327
+ _fwd_eva_prep_kv_kernel[grid](
328
+ k,
329
+ v,
330
+ param_mu,
331
+ param_phi,
332
+ chunk_mask,
333
+ out_rfa_k,
334
+ out_rfa_v,
335
+ softmax_scale,
336
+ k.stride(0), k.stride(1), k.stride(2),
337
+ v.stride(0), v.stride(1), v.stride(2),
338
+ param_mu.stride(1),
339
+ param_phi.stride(1),
340
+ mask_strides[0], mask_strides[1],
341
+ out_rfa_k.stride(0), out_rfa_k.stride(1), out_rfa_k.stride(2),
342
+ out_rfa_v.stride(0), out_rfa_v.stride(1), out_rfa_v.stride(2),
343
+ nheads,
344
+ seqlen,
345
+ nchunks,
346
+ head_dim,
347
+ seqlen // 32,
348
+ nchunks // 32,
349
+ chunks_per_block,
350
+ chunksize,
351
+ mask_type,
352
+ BLOCK_HEADDIM,
353
+ BLOCK_N=BLOCK,
354
+ num_warps=num_warps,
355
+ num_stages=1,
356
+ )
357
+ return out_rfa_k, out_rfa_v
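+
+ # A minimal usage sketch of the wrapper above (added for illustration; the sizes
+ # below are assumptions, and a CUDA device with bf16 support is assumed):
+ if __name__ == "__main__":
+     if torch.cuda.is_available():
+         b, h, n, d, c = 2, 4, 256, 64, 16  # batch, heads, seqlen, head_dim, chunksize
+         k = torch.randn(b, h, n, d, dtype=torch.bfloat16, device="cuda")
+         v = torch.randn(b, h, n, d, dtype=torch.bfloat16, device="cuda")
+         mu = torch.randn(1, h, 1, 1, d, dtype=torch.bfloat16, device="cuda")
+         phi = torch.randn(1, h, 1, 1, d, dtype=torch.bfloat16, device="cuda")
+         # no chunk mask, default softmax scale (1/sqrt(d))
+         rfa_k, rfa_v = triton_eva_prep_kv_fwd(k, v, mu, phi, None, None, c)
+         # one aggregated key/value per chunk of size c
+         assert rfa_k.shape == (b, h, n // c, d) and rfa_v.shape == (b, h, n // c, d)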
eva_pt_ref.py ADDED
@@ -0,0 +1,422 @@
1
+ from typing import Optional, Tuple, Union
2
+ import torch
3
+ from torch import nn
4
+
5
+ MASK_MIN_VALUE = -10e10
6
+
7
+ def rotate_half(x: torch.Tensor) -> torch.Tensor:
8
+ """
9
+ Rotates half the hidden dims (last dim) of the input.
10
+ Args:
11
+ x: Rotary embedded tensor
12
+ Return:
13
+ Tensor with half of last dim negated and rotated to the front.
14
+ """
15
+ x1, x2 = x.split(x.shape[-1] // 2, dim=-1)
16
+ return torch.cat((-x2, x1), dim=-1)
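+
+ # Illustrative example (added; not in the original file):
+ # rotate_half(torch.tensor([[1., 2., 3., 4.]])) -> tensor([[-3., -4., 1., 2.]])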
17
+
18
+ def apply_rotary_pos_emb(q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor,
19
+ position_ids: torch.Tensor) -> torch.Tensor:
20
+ """
21
+ Apply rotary embedding (cos, sin) to the query and key tensor on the sequence dimension.
22
+
23
+ The legends for dimensions are defined as:
24
+ num_heads: number of attention heads
25
+ current_seq_len: the current batch's sequence length, should be either 1 or max_seq_len
26
+ max_seq_len: the static sequence length, different from current_seq_len in the cached-inference case, where it is
+ always the maximum length, e.g. the static sequence length of the KV cache
28
+
29
+
30
+ Args:
31
+ q: Query tensor, of size (batch_size, num_heads, current_seq_len, head_dim)
32
+ k: Key tensor, of size (batch_size, num_key_value_heads, current_seq_len, head_dim)
33
+ cos: Cosine base of rotary embedding, of size (max_seq_len, head_dim)
34
+ sin: Sine base of rotary embedding, of size (max_seq_len, head_dim)
35
+ position_ids: The position indices of the tokens corresponding to the query and key tensors. It has a size of
36
+ (batch_size, current_seq_len).
37
+
38
+ Returns:
39
+ Embedded query and key tensor of same size as input.
40
+
41
+ """
42
+ bs, nheads, cur_seq_len, head_dim = q.shape
43
+ assert len(
44
+ k.shape) == 4, f"k should be of shape (batch_size, num_heads, current_seq_len, head_dim), got {k.shape} instead"
45
+ assert k.shape[0] == bs, f"k has a different batch_size {k.shape[0]} compared to q {bs}"
46
+ assert list(k.shape[2:]) == [cur_seq_len,
47
+ head_dim], f"k has different current_seq_len and/or head_dim compared to q"
48
+ assert cos.shape[3] == head_dim, f"cos should have dim of head dim {head_dim}, got {cos.shape[3]} instead"
49
+ assert list(position_ids.shape) in [[bs, cur_seq_len], [1, cur_seq_len]],\
50
+ f"position_ids should be of shape {[bs, cur_seq_len]} or {[1, cur_seq_len]}, got {position_ids.shape} instead"
51
+
52
+ q_embed = (q * cos) + (rotate_half(q) * sin)
53
+ k_embed = (k * cos) + (rotate_half(k) * sin)
54
+ return q_embed, k_embed
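+
+ # A minimal call sketch (added; shapes and the cos/sin construction below are
+ # assumptions, shown in broadcastable 4-D form):
+ # b, h, n, d = 1, 2, 8, 16
+ # q = torch.randn(b, h, n, d); k = torch.randn(b, h, n, d)
+ # inv_freq = 1.0 / (10000 ** (torch.arange(0, d, 2).float() / d))
+ # angles = torch.arange(n).float()[:, None] * inv_freq[None, :]   # [n, d/2]
+ # emb = torch.cat([angles, angles], dim=-1)                       # [n, d]
+ # cos, sin = emb.cos()[None, None], emb.sin()[None, None]         # [1, 1, n, d]
+ # q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, torch.arange(n)[None, :])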
55
+
56
+ def attention_op(
57
+ q,
58
+ k,
59
+ v,
60
+ attn_mask,
61
+ mixedp_attn,
62
+ head_dim_scaling
63
+ ):
64
+ attn = torch.matmul(q, k.transpose(-2, -1))
65
+ if mixedp_attn:
66
+ attn = attn.to(torch.float)
67
+ attn = attn * head_dim_scaling
68
+ if attn_mask is not None:
69
+ attn = attn.masked_fill(attn_mask, MASK_MIN_VALUE)
70
+
71
+ attn_weights = torch.softmax(attn, dim=-1).to(q.dtype)
72
+ attn_output = torch.matmul(attn_weights, v)
73
+ return attn_output
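+
+ # Usage note (added): with q, k, v of shape [b, h, n, d] and a boolean attn_mask
+ # broadcastable to [b, h, n, n] in which True marks masked-out positions,
+ # attention_op(q, k, v, attn_mask, mixedp_attn=False, head_dim_scaling=d ** -0.5)
+ # returns an output of shape [b, h, n, d].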
74
+
75
+ def prm_projection(
76
+ x: torch.Tensor,
77
+ projection_matrix: torch.Tensor,
78
+ mixedp_attn: bool = False
79
+ ):
80
+ """
81
+ Constructs nonnegative kernel features for fast softmax attention.
82
+ Args:
83
+ x: input for which features are computed
84
+ projection_matrix: random matrix used to compute features
85
+ Returns:
86
+ Random features for fast attention.
87
+ """
88
+ # x : [..., m, d]
89
+ # proj : [..., r, d]
90
+ scaling_factor = (x.shape[-1] ** -0.5)
91
+ proj_x = torch.matmul(projection_matrix, x.transpose(-1, -2)) # [..., r, m]
92
+ norm = torch.sum(x ** 2, dim=-1).unsqueeze(-2) * 0.5 # [..., 1]
93
+ if mixedp_attn:
94
+ proj_x = proj_x.to(torch.float)
95
+ norm = norm.to(torch.float)
96
+ phi_x = scaling_factor * (proj_x - norm)
97
+ return phi_x
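+
+ # Added note: for each token x_m and projection row w_r this computes the
+ # log-domain score d**-0.5 * (w_r @ x_m - ||x_m||**2 / 2), i.e. the exponent of
+ # the positive random-feature estimator of the softmax kernel used for the
+ # chunk-level RFA.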
98
+
99
+ class EvaAttention(nn.Module):
100
+ def __init__(self, config, layer_idx: Optional[int] = None):
101
+ super().__init__()
102
+ self.config = config
103
+ self.layer_idx = layer_idx
104
+ self.hidden_size = config.hidden_size
105
+ self.num_heads = config.num_attention_heads
106
+ self.head_dim = self.hidden_size // self.num_heads
107
+ self.head_dim_scaling = self.head_dim ** -0.5
108
+
109
+ self.max_position_embeddings = config.max_position_embeddings
110
+
111
+ if (self.head_dim * self.num_heads) != self.hidden_size:
112
+ raise ValueError(
113
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
114
+ f" and `num_heads`: {self.num_heads})."
115
+ )
116
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
117
+ self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
118
+ self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
119
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
120
+
121
+ self.window_size = config.window_size
122
+
123
+ self.num_chunks = config.num_chunks
124
+ self.chunk_size = config.chunk_size
125
+ if self.chunk_size is not None:
126
+ assert self.window_size >= self.chunk_size and self.window_size % self.chunk_size == 0
127
+ # chunk_size overrides the number of landmarks
128
+ self.num_chunks = None
129
+
130
+ self.chunks_per_window = int(self.window_size // self.chunk_size)
131
+ self.random_feature_dim = 1
132
+ self.adaptive_phi = nn.Parameter(
133
+ torch.randn(
134
+ 1,
135
+ self.num_heads,
136
+ 1,
137
+ 1,
138
+ self.head_dim
139
+ ).clamp(-1., 1.) * self.head_dim_scaling
140
+ )
141
+ self.adaptive_mu_k = nn.Parameter(
142
+ torch.randn(
143
+ 1,
144
+ self.num_heads,
145
+ 1,
146
+ 1,
147
+ self.head_dim
148
+ ).clamp(-1., 1.) * self.head_dim_scaling
149
+ )
150
+
151
+ def _generate_feature_map(self, rf_q, rf_k, rf_v):
152
+ rf_k_logits = torch.sum(self.adaptive_mu_k.to(rf_k.dtype) * rf_k, dim=-1, keepdim=True) # b h c m 1
153
+ if self.config.mixedp_attn:
154
+ rf_k_logits = rf_k_logits.to(torch.float)
155
+ rf_k_weights = torch.softmax(rf_k_logits, dim=-2).to(rf_k.dtype)
156
+ rf_k_bar = torch.sum(rf_k_weights * rf_k, dim=-2)
157
+ weights = self.adaptive_phi.to(rf_k.dtype)
158
+ return weights, rf_k_bar
159
+
160
+ def _calculate_chunk_rfa_cache(self, rf_q, rf_k, rf_v, weights, rf_mask=None):
161
+ proj_x = torch.sum(weights * rf_k, dim=-1, keepdim=True)
162
+ norm = torch.sum(rf_k ** 2, dim=-1, keepdim=True) * 0.5 # [..., 1]
163
+ if self.config.mixedp_attn:
164
+ proj_x = proj_x.to(torch.float)
165
+ norm = norm.to(torch.float)
166
+ log_phi_k = self.head_dim_scaling * (proj_x - norm)
167
+
168
+ if rf_mask is not None:
169
+ log_phi_k = log_phi_k.masked_fill(rf_mask, MASK_MIN_VALUE)
170
+
171
+ # [b, h, c, m, r]
172
+ softmax_phi_k = torch.softmax(log_phi_k, dim=-2).to(rf_k.dtype)
173
+ softmax_phi_k_v = torch.sum(softmax_phi_k * rf_v, dim=-2)
174
+ # [b, h, c, r, m] [b, h, c, m, d] -> [b, h, c, r, d]
175
+ # softmax_phi_k_v = torch.matmul(softmax_phi_k.transpose(-1, -2), rf_v).squeeze(-2)
176
+ log_sum_phi_k = None
177
+ return softmax_phi_k_v, log_sum_phi_k
178
+
179
+ def _calculate_chunk_rfa(self, q, softmax_phi_k_v, log_sum_phi_k, weights):
180
+ if self.random_feature_dim == 1:
181
+ # when r = 1, the snis weights becomes 1, so this takes no effect
182
+ # [b, h, c, r, d] -> [b, h, c, d]
183
+ return softmax_phi_k_v
184
+ else:
185
+ # [b, h, c, r, d] [b, h, 1, s, d] -> [b, h, c, r, s]
186
+ log_phi_q = prm_projection(q.unsqueeze(-3), weights, self.config.mixedp_attn)
187
+ # [b, h, c, r, s] [b, h, c, r, 1] -> [b, h, c, r, s]
188
+ sniw = torch.softmax(log_phi_q + log_sum_phi_k, dim=-1).to(q.dtype)
189
+ # [b, h, c, r, s] [b, h, c, r, d] -> [b, h, c, s, d] -> [b, h, s, c, d]
190
+ rfa_per_chunk = torch.matmul(sniw.transpose(-1, -2), softmax_phi_k_v).transpose(-3, -2)
191
+ return rfa_per_chunk
192
+
193
+ def window_partition(self, x, window_size=None):
194
+ window_size = window_size if window_size is not None else self.window_size
195
+
196
+ gw, d = x.shape[-2:]
197
+ leading_dims = x.shape[:-2]
198
+ n_groups = gw // window_size
199
+ return x.reshape(*leading_dims, n_groups, window_size, d)
200
+
201
+ def window_merge(self, x, window_size=None):
202
+ g, w, d = x.shape[-3:]
203
+ leading_dims = x.shape[:-3]
204
+ return x.reshape(*leading_dims, g * w, d)
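+
+ # Shape example (added): window_partition on [b, h, 1024, d] with window_size=512
+ # yields [b, h, 2, 512, d]; window_merge reverses the operation.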
205
+
206
+ def forward(
207
+ self,
208
+ hidden_states: torch.Tensor,
209
+ attention_mask: Optional[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]] = None,
210
+ position_ids: Optional[torch.LongTensor] = None,
211
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
212
+ output_attentions: bool = False,
213
+ use_cache: bool = False,
214
+ cos: Optional[torch.Tensor] = None,
215
+ sin: Optional[torch.Tensor] = None,
216
+ multibyte_decoding: Optional[bool] = False,
217
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
218
+ assert not output_attentions
219
+ bsz, q_len, _ = hidden_states.size()
220
+
221
+ ############################################
222
+ # initialize past states if not provided
223
+ ############################################
224
+ if use_cache and past_key_value is None:
225
+ raise ValueError
226
+ if use_cache and multibyte_decoding:
227
+ raise NotImplementedError("Multibyte decoding is not supported for PyTorch native implementation")
228
+ # assert isinstance(attention_mask, tuple)
229
+ if len(attention_mask) == 4:
230
+ assert use_cache
231
+ prev_causal_mask, cur_causal_mask, chunk_causal_mask, intra_chunk_mask = attention_mask
232
+ elif len(attention_mask) == 3:
233
+ assert not use_cache
234
+ window_causal_mask, chunk_causal_mask, intra_chunk_mask = attention_mask
235
+ else:
236
+ raise NotImplementedError("Only attention-mask tuple with length 2 or 3 is supported")
237
+
238
+ ############################################
239
+ # compute q, k, v from hidden states
240
+ ############################################
241
+ # [b, h, q_len, d]
242
+ q = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
243
+ # [b, h, kv_len, d]
244
+ k = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
245
+ # [b, h, kv_len, d]
246
+ v = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
247
+
248
+ if use_cache:
249
+ past_key_value.update_past_len(q.shape[-2], self.layer_idx)
250
+
251
+ ############################################
252
+ # apply rotary positional embeddings to q, k
253
+ ############################################
254
+ q, k = apply_rotary_pos_emb(q, k, cos, sin, position_ids)
255
+
256
+ ############################################
257
+ # compute q, k, v stats for the local window
258
+ ############################################
259
+ if use_cache:
260
+ (prev_w_q, prev_w_k, prev_w_v), (cur_w_q, cur_w_k, cur_w_v) = past_key_value.update_singletons(
261
+ q,
262
+ k,
263
+ v,
264
+ self.layer_idx,
265
+ self.window_size,
266
+ self.singleton_update
267
+ )
268
+ else:
269
+ prev_w_q = self.window_partition(q) # [b, h, w, i, d]
270
+ prev_w_k = self.window_partition(k) # [b, h, w, j, d]
271
+ prev_w_v = self.window_partition(v) # [b, h, w, j, d]
272
+ # during training, we assume window_size divides seq_len so no remainders
273
+ cur_w_q = cur_w_k = cur_w_v = None
274
+
275
+ ############################################
276
+ # compute q, k, v stats for chunk-level RFAs
277
+ ############################################
278
+ if use_cache:
279
+ dump_q, dump_k, dump_v = past_key_value.update_chunks(q, k, v, self.layer_idx, self.chunk_size)
280
+ else:
281
+ dump_q, dump_k, dump_v = q, k, v
282
+
283
+ if use_cache:
284
+ prev_s_mask, cur_s_mask, prev_chunk_mask, cur_chunk_mask, dump_rf_mask = past_key_value.update_mask(
285
+ prev_s_mask=prev_causal_mask,
286
+ cur_s_mask=cur_causal_mask,
287
+ chunk_mask=chunk_causal_mask,
288
+ rf_mask=intra_chunk_mask,
289
+ layer_idx=self.layer_idx,
290
+ window_size=self.window_size,
291
+ chunk_size=self.chunk_size,
292
+ singleton_update=self.singleton_update
293
+ )
294
+ else:
295
+ prev_s_mask = window_causal_mask # [1, 1, w, i, j]
296
+ cur_s_mask = None
297
+ prev_chunk_mask = self.window_partition(chunk_causal_mask)
298
+ cur_chunk_mask = None
299
+ dump_rf_mask = intra_chunk_mask
300
+ if prev_s_mask.shape[-3] == 1:
301
+ # need to expand
302
+ prev_s_mask = prev_s_mask.expand(-1, -1, prev_chunk_mask.shape[-3], -1, -1)
303
+
304
+ if (
305
+ dump_q is not None and
306
+ dump_k is not None and
307
+ dump_v is not None
308
+ ):
309
+ # [b, h, c, j, d]
310
+ rf_q = self.window_partition(dump_q, window_size=self.chunk_size)
311
+ # [b, h, c, j, d]
312
+ rf_k = self.window_partition(dump_k, window_size=self.chunk_size)
313
+ # [b, h, c, j, d]
314
+ rf_v = self.window_partition(dump_v, window_size=self.chunk_size)
315
+
316
+ if dump_rf_mask is not None:
317
+ rf_mask = self.window_partition(dump_rf_mask, window_size=self.chunk_size)
318
+ rf_q = rf_q.masked_fill(rf_mask, 0.)
319
+ rf_k = rf_k.masked_fill(rf_mask, 0.)
320
+ rf_v = rf_v.masked_fill(rf_mask, 0.)
321
+ else:
322
+ rf_mask = None
323
+ else:
324
+ rf_q = None
325
+ rf_k = None
326
+ rf_v = None
327
+ rf_mask = None
328
+
329
+
330
+ if rf_q is not None:
331
+ # import pdb; pdb.set_trace()
332
+ weights, rf_k_bar = self._generate_feature_map(rf_q, rf_k, rf_v)
333
+ softmax_phi_k_v, log_sum_phi_k = self._calculate_chunk_rfa_cache(rf_q, rf_k, rf_v, weights, rf_mask=rf_mask)
334
+ if use_cache:
335
+ softmax_phi_k_v, log_sum_phi_k, rf_k_bar = past_key_value.update_chunk_rfas(
336
+ softmax_phi_k_v, log_sum_phi_k, rf_k_bar, self.layer_idx, 1
337
+ )
338
+ elif use_cache:
339
+ weights = None
340
+ softmax_phi_k_v, log_sum_phi_k, rf_k_bar = past_key_value.get_chunk_rfas(self.layer_idx)
341
+ else:
342
+ weights = None
343
+ softmax_phi_k_v = None
344
+ log_sum_phi_k = None
345
+ rf_k_bar = None
346
+
347
+ if rf_k_bar is not None:
348
+ rfa_per_chunk = self._calculate_chunk_rfa(q, softmax_phi_k_v, log_sum_phi_k, weights)
349
+ ############################################
350
+ # compute meta-attention weights for
351
+ # - group-wise RFAs and
352
+ # - singletons (equivalent to exact local attention)
353
+ ############################################
354
+ if prev_w_k is not None:
355
+ if rf_k_bar is not None:
356
+ num_windows = prev_w_k.shape[-3]
357
+ # rf_k_bar and rfa_per_chunk take the shape [b, h, c, d]
358
+ # -> [b, h, 1, c, d] -> [b, h, w, c, d]
359
+ prev_rf_k_bar = rf_k_bar.unsqueeze(-3).expand(-1, -1, num_windows, -1, -1)
360
+ prev_rfa_per_chunk = rfa_per_chunk.unsqueeze(-3).expand(-1, -1, num_windows, -1, -1)
361
+ prev_agg_k = torch.cat([prev_w_k, prev_rf_k_bar], dim=-2)
362
+ prev_agg_v = torch.cat([prev_w_v, prev_rfa_per_chunk], dim=-2)
363
+
364
+ prev_attn_mask = torch.cat([prev_s_mask, prev_chunk_mask], dim=-1)
365
+ else:
366
+ prev_agg_k = prev_w_k
367
+ prev_agg_v = prev_w_v
368
+ prev_attn_mask = prev_s_mask
369
+
370
+ prev_attn_output = attention_op(
371
+ q=prev_w_q,
372
+ k=prev_agg_k,
373
+ v=prev_agg_v,
374
+ attn_mask=prev_attn_mask,
375
+ mixedp_attn=self.config.mixedp_attn,
376
+ head_dim_scaling=self.head_dim_scaling
377
+ )
378
+ prev_attn_output = self.window_merge(prev_attn_output)
379
+
380
+ if cur_w_k is not None:
381
+ if rf_k_bar is not None:
382
+ # rf_k_bar and rfa_per_chunk take the shape [b, h, c, d]
383
+ # cur_w_k and cur_w_v also has shape [b, h, m, d]
384
+ cur_agg_k = torch.cat([cur_w_k, rf_k_bar], dim=-2)
385
+ cur_agg_v = torch.cat([cur_w_v, rfa_per_chunk], dim=-2)
386
+
387
+ cur_attn_mask = torch.cat([cur_s_mask, cur_chunk_mask], dim=-1)
388
+ else:
389
+ cur_agg_k = cur_w_k
390
+ cur_agg_v = cur_w_v
391
+ cur_attn_mask = cur_s_mask
392
+
393
+ cur_attn_output = attention_op(
394
+ q=cur_w_q,
395
+ k=cur_agg_k,
396
+ v=cur_agg_v,
397
+ attn_mask=cur_attn_mask,
398
+ mixedp_attn=self.config.mixedp_attn,
399
+ head_dim_scaling=self.head_dim_scaling
400
+ )
401
+
402
+ if prev_w_k is not None and cur_w_k is not None:
403
+ attn_output = torch.cat([prev_attn_output, cur_attn_output], dim=-2)
404
+ elif prev_w_k is not None:
405
+ attn_output = prev_attn_output
406
+ elif cur_w_k is not None:
407
+ attn_output = cur_attn_output
408
+ else:
409
+ raise ValueError("There must be some bug")
410
+
411
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
412
+ raise ValueError(
413
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
414
+ f" {attn_output.size()}"
415
+ )
416
+
417
+ attn_output = attn_output.transpose(1, 2).reshape(bsz, q_len, self.hidden_size)
418
+ attn_output = self.o_proj(attn_output)
419
+
420
+ attn_weights = None
421
+
422
+ return attn_output, attn_weights, past_key_value
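+
+ # A minimal construction sketch (added; the config fields below are the ones read
+ # by this module, with illustrative values):
+ # from types import SimpleNamespace
+ # cfg = SimpleNamespace(hidden_size=512, num_attention_heads=8,
+ #                       max_position_embeddings=4096, window_size=256,
+ #                       chunk_size=64, num_chunks=None, mixedp_attn=False)
+ # attn = EvaAttention(cfg, layer_idx=0)
+ # In training (use_cache=False), attention_mask is the 3-tuple
+ # (window_causal_mask, chunk_causal_mask, intra_chunk_mask) unpacked in forward().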
generation_config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 11,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.47.1"
7
+ }
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8f009b49caa79bbd15766b5b29b2dcf4a74a030af1a2043e53c3f15971cf33d
3
+ size 4994268984
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6131455aa5c8cfb07ecf35a9d7cb99f2f7e3a5e0fc9aa90c0c2e00f7e13d583
3
+ size 4947590376
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c497c85b8d257c9b78a1db9caccf6f07718b9b0aa06351fc357f64c9ca896b79
3
+ size 3034842568
model.safetensors.index.json ADDED
@@ -0,0 +1,362 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 12976660480
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00003-of-00003.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
13
+ "model.layers.0.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
14
+ "model.layers.0.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
15
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
16
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
18
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
19
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
20
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
21
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
22
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
23
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
24
+ "model.layers.1.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
25
+ "model.layers.1.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
26
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
27
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
28
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
29
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
30
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
31
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
32
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
33
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
34
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
35
+ "model.layers.10.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
36
+ "model.layers.10.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
37
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
38
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
39
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
40
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
41
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors",
42
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
43
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
44
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
45
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
46
+ "model.layers.11.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
47
+ "model.layers.11.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
48
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
49
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
50
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
51
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
52
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
53
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
54
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
55
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
56
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
57
+ "model.layers.12.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
58
+ "model.layers.12.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
59
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
60
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
61
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
62
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
63
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
64
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
65
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
66
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
67
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
68
+ "model.layers.13.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
69
+ "model.layers.13.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
70
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
71
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
72
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
73
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
74
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
75
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
76
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
77
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
78
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
79
+ "model.layers.14.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
80
+ "model.layers.14.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
81
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
82
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
83
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
84
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
85
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
86
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
87
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
88
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
89
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
90
+ "model.layers.15.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
91
+ "model.layers.15.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
92
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
93
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
94
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
95
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
96
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
97
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
98
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
99
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
100
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
101
+ "model.layers.16.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
102
+ "model.layers.16.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
103
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
104
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
105
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
106
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
107
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
108
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
109
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
110
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
111
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
112
+ "model.layers.17.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
113
+ "model.layers.17.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
114
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
115
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
116
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
117
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
118
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
119
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
120
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
121
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
122
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
123
+ "model.layers.18.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
124
+ "model.layers.18.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
125
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
126
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
127
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
128
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
129
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
130
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
131
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
132
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
133
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
134
+ "model.layers.19.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
135
+ "model.layers.19.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
136
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
137
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
138
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
139
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
140
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
141
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
142
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
143
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
144
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
145
+ "model.layers.2.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
146
+ "model.layers.2.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
147
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
148
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
149
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
150
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
151
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
152
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
153
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
154
+ "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
155
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
156
+ "model.layers.20.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
157
+ "model.layers.20.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
158
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
159
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
160
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
161
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
162
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
163
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
164
+ "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
165
+ "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
166
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
167
+ "model.layers.21.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
168
+ "model.layers.21.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
169
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
170
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
171
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
172
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
173
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
174
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
175
+ "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
176
+ "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
177
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
178
+ "model.layers.22.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
179
+ "model.layers.22.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
180
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
181
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
182
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
183
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
184
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
185
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
186
+ "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
187
+ "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
188
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
189
+ "model.layers.23.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
190
+ "model.layers.23.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
191
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
192
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
193
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
194
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
195
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
196
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
197
+ "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
198
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
199
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
200
+ "model.layers.24.self_attn.adaptive_mu_k": "model-00002-of-00003.safetensors",
201
+ "model.layers.24.self_attn.adaptive_phi": "model-00002-of-00003.safetensors",
202
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
203
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
204
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
205
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
206
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
207
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
208
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
209
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
210
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
211
+ "model.layers.25.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
212
+ "model.layers.25.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
213
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
214
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
215
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
216
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
217
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
218
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
219
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
220
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
221
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
222
+ "model.layers.26.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
223
+ "model.layers.26.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
224
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
225
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
226
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
227
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
228
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
229
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
230
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
231
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
232
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
233
+ "model.layers.27.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
234
+ "model.layers.27.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
235
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
236
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
237
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
238
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
239
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
240
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
241
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
242
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
243
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
244
+ "model.layers.28.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
245
+ "model.layers.28.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
246
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
247
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
248
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
249
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
250
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
251
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
252
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
253
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
254
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
255
+ "model.layers.29.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
256
+ "model.layers.29.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
257
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
258
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
259
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
260
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
261
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
262
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
263
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
264
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
265
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
266
+ "model.layers.3.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
267
+ "model.layers.3.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
268
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
269
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
270
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
271
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
272
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
273
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
274
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
275
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
276
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
277
+ "model.layers.30.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
278
+ "model.layers.30.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
279
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
280
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
281
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
282
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
283
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
284
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
285
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
286
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
287
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
288
+ "model.layers.31.self_attn.adaptive_mu_k": "model-00003-of-00003.safetensors",
289
+ "model.layers.31.self_attn.adaptive_phi": "model-00003-of-00003.safetensors",
290
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
291
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
292
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
293
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
294
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
295
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
296
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
297
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
298
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
299
+ "model.layers.4.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
300
+ "model.layers.4.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
301
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
302
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
303
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
304
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
305
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
306
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
307
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
308
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
309
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
310
+ "model.layers.5.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
311
+ "model.layers.5.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
312
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
313
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
314
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
315
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
316
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
317
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
318
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
319
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
320
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
321
+ "model.layers.6.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
322
+ "model.layers.6.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
323
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
324
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
325
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
326
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
327
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
328
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
329
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
330
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
331
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
332
+ "model.layers.7.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
333
+ "model.layers.7.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
334
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
335
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
336
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
337
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
338
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
339
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
340
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
341
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
342
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
343
+ "model.layers.8.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
344
+ "model.layers.8.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
345
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
346
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
347
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
348
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
349
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
350
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
351
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
352
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
353
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
354
+ "model.layers.9.self_attn.adaptive_mu_k": "model-00001-of-00003.safetensors",
355
+ "model.layers.9.self_attn.adaptive_phi": "model-00001-of-00003.safetensors",
356
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
357
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
358
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
359
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
360
+ "model.norm.weight": "model-00003-of-00003.safetensors"
361
+ }
362
+ }
modeling_evabyte.py ADDED
@@ -0,0 +1,1092 @@
+ from typing import List, Optional, Tuple, Union
2
+ import math
3
+ import torch
4
+ import torch.nn.functional as F
5
+ import torch.utils.checkpoint
6
+ from torch import nn
7
+ from torch.nn import CrossEntropyLoss
8
+ from transformers.activations import ACT2FN
9
+ from transformers.cache_utils import Cache
10
+ from transformers.modeling_outputs import (
11
+ BaseModelOutputWithPast,
12
+ CausalLMOutputWithPast,
13
+ )
14
+ from transformers.modeling_utils import PreTrainedModel
15
+
16
+ from .configuration_evabyte import EvaByteConfig
17
+ from .multibyte_decoding_evabyte import MultiByteDecodingMixin
18
+ try:
19
+ import triton
20
+ USE_TRITON_IMPL = True
21
+ from .eva import EvaAttention
22
+ from .eva_agg_kernel import triton_eva_agg_fwd
23
+ from .eva_prep_kv_kernel import triton_eva_prep_kv_fwd
24
+ except ImportError:
25
+ USE_TRITON_IMPL = False
26
+ print("WARNING: triton is not installed, using fallback EVA which might be slow and throw errors")
27
+ from .eva_pt_ref import EvaAttention
28
+ from .eva_cache import EvaCache, EvaStaticCacheForTriton
29
+
30
+ MASK_MIN_VALUE = -10e10
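+ # Large negative value used in place of -inf when converting boolean masks to
+ # additive float masks (see the triton mask conversion in EvaByteModel.forward).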
31
+
32
+ def prepare_eva_attention_mask(
33
+ seq_len,
34
+ device,
35
+ chunk_size,
36
+ window_size,
37
+ use_cache=False,
38
+ cache=None
39
+ ):
40
+ """
41
+ Prepare attention masks for EVA.
42
+
43
+ """
44
+ chunk_causal_mask = None
45
+ window_causal_mask = None
46
+ if use_cache:
47
+ cached_seq_len = cache.get_seq_length()
48
+ total_seq_len = seq_len + cached_seq_len
49
+ # cached_seq_len will be 0 during prefilling
50
+ # padded_seq_len = chunk_size * math.ceil(total_seq_len / chunk_size)
51
+ padded_seq_len = window_size * math.ceil(total_seq_len / window_size)
52
+ num_chunks = padded_seq_len // chunk_size
53
+ else:
54
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
55
+ assert seq_len % chunk_size == 0
56
+ num_chunks = seq_len // chunk_size
57
+
58
+ assert seq_len % window_size == 0
59
+
60
+ # create causal mask
61
+ ################################
62
+ # generate chunked causal masks
63
+ ################################
64
+ # [b, h, j, c, c]
65
+ chunks_per_window = window_size // chunk_size
66
+ if num_chunks >= chunks_per_window:
67
+ chunk_causal_mask = torch.ones(
68
+ (chunk_size, num_chunks, num_chunks),
69
+ device=device,
70
+ dtype=torch.bool
71
+ ).triu(0)
72
+
73
+ num_blocks = num_chunks // chunks_per_window
74
+ chunk_causal_mask = chunk_causal_mask.reshape(
75
+ chunk_size,
76
+ num_blocks,
77
+ chunks_per_window,
78
+ num_blocks,
79
+ chunks_per_window
80
+ ).transpose(-2, -3)
81
+
82
+ block_diag_zero = (
83
+ torch.eye(num_blocks, device=device, dtype=torch.bool)
84
+ .unsqueeze(-1)
85
+ .unsqueeze(-1)
86
+ .unsqueeze(0)
87
+ )
88
+
89
+ # Mask out (set to True) the block-diagonal entries: chunks within the same window are covered by exact window attention
90
+ chunk_causal_mask = chunk_causal_mask.masked_fill(block_diag_zero, True)
91
+
92
+ # Reshape back to original size
93
+ chunk_causal_mask = (
94
+ chunk_causal_mask
95
+ .transpose(-2, -3)
96
+ .reshape(chunk_size, num_chunks, num_chunks)
97
+ .transpose(-2, -3)
98
+ .reshape(chunk_size * num_chunks, num_chunks)
99
+ .unsqueeze(0)
100
+ .unsqueeze(0)
101
+ )
102
+ else:
103
+ chunk_causal_mask = torch.ones(
104
+ (1, 1, chunk_size, num_chunks, num_chunks),
105
+ device=device,
106
+ dtype=torch.bool,
107
+ ).triu(0).transpose(-2, -3) # [1, 1, c, j, c]
108
+ chunk_causal_mask = chunk_causal_mask.reshape(
109
+ 1, 1, chunk_size * num_chunks, num_chunks
110
+ ) # [1, 1, n, c]
111
+
112
+ if use_cache:
113
+ chunk_causal_mask = chunk_causal_mask[..., cached_seq_len : cached_seq_len + seq_len, :]
114
+
115
+ window_causal_mask = torch.ones(
116
+ (1, 1, 1, window_size, window_size),
117
+ device=device
118
+ ).triu(1).to(torch.bool)
119
+ return (chunk_causal_mask, window_causal_mask)
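+ # Illustrative shapes (hypothetical settings: seq_len=256, chunk_size=16, window_size=64, use_cache=False):
+ #   chunk_causal_mask:  [1, 1, 256, 16]   -- True marks chunks a query must not attend to
+ #   window_causal_mask: [1, 1, 1, 64, 64] -- strict upper-triangular causal mask within each window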
120
+
121
+ def pad_to_multiple(tensor, multiple, dim=-2, value=0, create_mask=False, left_padding=False):
122
+ assert dim < 0 # only accept ``dim'' index in a reverse manner
123
+ seqlen = int(tensor.shape[dim])
124
+ m = seqlen / multiple
125
+ if m.is_integer():
126
+ if create_mask:
127
+ return tensor, torch.ones(size=(tensor.shape[0], tensor.shape[dim]), dtype=torch.bool, device=tensor.device)
128
+ else:
129
+ return tensor
130
+ remainder = math.ceil(m) * multiple - seqlen
131
+ pad_offset = (0,) * (-1 - dim) * 2
132
+ if left_padding:
133
+ padded_res = F.pad(tensor, (*pad_offset, remainder, 0), value=value)
134
+ else:
135
+ padded_res = F.pad(tensor, (*pad_offset, 0, remainder), value=value)
136
+ if create_mask:
137
+ # assume dim 0 is the batch size
138
+ padding_mask = torch.ones(size=(padded_res.shape[0], padded_res.shape[dim]), dtype=torch.bool, device=padded_res.device)
139
+ if left_padding:
140
+ padding_mask[:, :remainder] = False
141
+ else:
142
+ padding_mask[:, -remainder:] = False
143
+ return padded_res, padding_mask
144
+ else:
145
+ return padded_res
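+ # Example (hypothetical shapes): pad_to_multiple(torch.zeros(2, 10, 8), multiple=4, dim=-2)
+ # right-pads the sequence dimension from 10 to 12; with create_mask=True it additionally
+ # returns a [2, 12] boolean mask that is False over the padded positions.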
146
+
147
+ class EvaByteRMSNorm(nn.Module):
148
+ def __init__(self, config):
149
+ super().__init__()
150
+ self.config = config
151
+ self.fp32_ln = config.fp32_ln
152
+ self.variance_epsilon = config.rms_norm_eps
153
+ self.add_unit_offset = config.norm_add_unit_offset
154
+ if self.add_unit_offset:
155
+ self.weight = nn.Parameter(torch.zeros(config.hidden_size))
156
+ else:
157
+ self.weight = nn.Parameter(torch.ones(config.hidden_size))
158
+
159
+ def forward(self, hidden_states):
160
+ if hasattr(self, 'config'):
161
+ fp32_ln = self.config.fp32_ln
162
+ else:
163
+ fp32_ln = self.fp32_ln
164
+ hidden_states = hidden_states.to(torch.float32 if fp32_ln else torch.bfloat16)
165
+
166
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
167
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
168
+ if self.add_unit_offset:
169
+ return (1 + self.weight) * hidden_states
170
+ else:
171
+ return self.weight * hidden_states
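+ # RMSNorm: y = x / sqrt(mean(x^2) + eps) * w, with w parameterized as (1 + weight)
+ # when norm_add_unit_offset is set, so the learned offset starts at zero.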
172
+
173
+ class EvaByteRotaryEmbedding(torch.nn.Module):
174
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
175
+ super().__init__()
176
+
177
+ self.dim = dim
178
+ self.max_position_embeddings = max_position_embeddings
179
+ self.base = base
180
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
181
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
182
+
183
+ self._set_cos_sin_cache(seq_len=max_position_embeddings,
184
+ device=self.inv_freq.device,
185
+ dtype=torch.get_default_dtype())
186
+
187
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
188
+ self.max_seq_len_cached = seq_len
189
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
190
+
191
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
192
+ emb = torch.cat((freqs, freqs), dim=-1)
193
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
194
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
195
+
196
+
197
+ def forward(self, x, seq_len=None):
198
+ # x: [bs, num_attention_heads, seq_len, head_size]
199
+ if seq_len > self.max_seq_len_cached:
200
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
201
+
202
+ # return (
203
+ # self.cos_cached[:seq_len].to(dtype=x.dtype),
204
+ # self.sin_cached[:seq_len].to(dtype=x.dtype),
205
+ # )
206
+ if seq_len < self.max_seq_len_cached:
207
+ cos_slice = self.cos_cached.split(seq_len, dim=0)[0]
208
+ sin_slice = self.sin_cached.split(seq_len, dim=0)[0]
209
+ else:
210
+ cos_slice = self.cos_cached
211
+ sin_slice = self.sin_cached
212
+
213
+ return (
214
+ cos_slice.to(dtype=x.dtype),
215
+ sin_slice.to(dtype=x.dtype),
216
+ )
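+ # The returned cos/sin caches are 2-D tensors of shape (seq_len, head_dim);
+ # EvaByteModel.forward indexes them with position_ids and unsqueezes a head
+ # dimension before handing them to the attention layers.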
217
+
218
+
219
+
220
+ class EvaByteLinearScalingRotaryEmbedding(EvaByteRotaryEmbedding):
221
+ """EvaByteRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
222
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
223
+ self.scaling_factor = scaling_factor
224
+ super().__init__(dim, max_position_embeddings, base, device)
225
+
226
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
227
+ self.max_seq_len_cached = seq_len
228
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
229
+ t = t / self.scaling_factor
230
+
231
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
232
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
233
+ emb = torch.cat((freqs, freqs), dim=-1)
234
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
235
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
236
+
237
+
238
+ class EvaByteDynamicNTKScalingRotaryEmbedding(EvaByteRotaryEmbedding):
239
+ """EvaByteRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
240
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
241
+ self.scaling_factor = scaling_factor
242
+ super().__init__(dim, max_position_embeddings, base, device)
243
+
244
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
245
+ self.max_seq_len_cached = seq_len
246
+
247
+ if seq_len > self.max_position_embeddings:
248
+ base = self.base * ((self.scaling_factor * seq_len / self.max_position_embeddings) -
249
+ (self.scaling_factor - 1))**(self.dim / (self.dim - 2))
250
+ inv_freq = 1.0 / (base**(torch.arange(0, self.dim, 2).float().to(device) / self.dim))
251
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
252
+
253
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
254
+
255
+ freqs = torch.einsum("i,j->ij", t, self.inv_freq)
256
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
257
+ emb = torch.cat((freqs, freqs), dim=-1)
258
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
259
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
260
+
261
+
262
+ class EvaByteMLP(nn.Module):
263
+ def __init__(self, config, layer_idx: int = None):
264
+ super().__init__()
265
+ self.hidden_size = config.hidden_size
266
+ self.intermediate_size = config.intermediate_size
267
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
268
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
269
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
270
+ self.act_fn = ACT2FN[config.hidden_act]
271
+ self.layer_idx = layer_idx
272
+ self.config = config
273
+
274
+ def forward(self, x):
275
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
276
+ return down_proj
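+ # SwiGLU-style feed-forward: down_proj(act_fn(gate_proj(x)) * up_proj(x)).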
277
+
278
+ class EvaByteDecoderLayer(nn.Module):
279
+ def __init__(self, config: EvaByteConfig, layer_idx: int = None):
280
+ super().__init__()
281
+ self.config = config
282
+ self.hidden_size = config.hidden_size
283
+ self.self_attn = EvaAttention(config=config, layer_idx=layer_idx)
284
+ self.mlp = EvaByteMLP(config, layer_idx=layer_idx)
285
+ self.input_layernorm = EvaByteRMSNorm(config)
286
+ self.post_attention_layernorm = EvaByteRMSNorm(config)
287
+
288
+ def forward(
289
+ self,
290
+ hidden_states: torch.Tensor,
291
+ attention_mask: Optional[torch.Tensor] = None,
292
+ position_ids: Optional[torch.LongTensor] = None,
293
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
294
+ output_attentions: Optional[bool] = False,
295
+ use_cache: Optional[bool] = False,
296
+ cos: Optional[torch.Tensor] = None,
297
+ sin: Optional[torch.Tensor] = None,
298
+ multibyte_decoding: Optional[bool] = False,
299
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
300
+ residual = hidden_states
301
+ if self.config.fp32_skip_add:
302
+ residual = residual.float()
303
+
304
+ hidden_states = self.input_layernorm(hidden_states)
305
+
306
+ # Self Attention
307
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(hidden_states=hidden_states,
308
+ attention_mask=attention_mask,
309
+ position_ids=position_ids,
310
+ past_key_value=past_key_value,
311
+ output_attentions=output_attentions,
312
+ use_cache=use_cache,
313
+ cos=cos,
314
+ sin=sin,
315
+ multibyte_decoding=multibyte_decoding)
316
+ hidden_states = residual + hidden_states
317
+
318
+ # Fully Connected
319
+ residual = hidden_states
320
+ if self.config.fp32_skip_add:
321
+ residual = residual.float()
322
+ hidden_states = self.post_attention_layernorm(hidden_states)
323
+ hidden_states = self.mlp(hidden_states)
324
+ hidden_states = residual + hidden_states
325
+
326
+ outputs = (hidden_states, )
327
+
328
+ if output_attentions:
329
+ outputs += (self_attn_weights, )
330
+
331
+ if use_cache:
332
+ outputs += (present_key_value, )
333
+ return outputs
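+ # outputs layout: (hidden_states, [self_attn_weights if output_attentions],
+ # [present_key_value if use_cache]).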
334
+
335
+ class EvaBytePreTrainedModel(PreTrainedModel):
336
+ config_class = EvaByteConfig
337
+ base_model_prefix = "model"
338
+ supports_gradient_checkpointing = True
339
+ _no_split_modules = ["EvaByteDecoderLayer"]
340
+ _skip_keys_device_placement = "past_key_values"
341
+
342
+ def _init_weights(self, module):
343
+ std = getattr(self.config, "initializer_range", 0.02)
344
+ if isinstance(module, nn.Linear):
345
+ module.weight.data.normal_(mean=0.0, std=std)
346
+ if module.bias is not None:
347
+ module.bias.data.zero_()
348
+ elif isinstance(module, nn.Embedding):
349
+ module.weight.data.normal_(mean=0.0, std=std)
350
+ if module.padding_idx is not None:
351
+ module.weight.data[module.padding_idx].zero_()
352
+
353
+ def _set_gradient_checkpointing(self, module, value=False):
354
+ if isinstance(module, EvaByteModel):
355
+ module.gradient_checkpointing = value
356
+
357
+ class EvaByteModel(EvaBytePreTrainedModel):
358
+ """
359
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`EvaByteDecoderLayer`]
360
+
361
+ Args:
362
+ config: EvaByteConfig
363
+ """
364
+ def __init__(self, config: EvaByteConfig):
365
+ super().__init__(config)
366
+ self.padding_idx = config.pad_token_id
367
+ self.vocab_size = config.vocab_size
368
+ self.hidden_size = config.hidden_size
369
+ self.num_heads = config.num_attention_heads
370
+ self.head_dim = self.hidden_size // self.num_heads
371
+ self.max_position_embeddings = self.config.max_position_embeddings
372
+
373
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
374
+ self.layers = nn.ModuleList([EvaByteDecoderLayer(config, layer_idx=layer_idx) for layer_idx in range(config.num_hidden_layers)])
375
+ self.norm = EvaByteRMSNorm(config)
376
+
377
+ self.gradient_checkpointing = False
378
+ self.rope = config.rope_theta
379
+ # Initialize weights and apply final processing
380
+ self.post_init()
381
+ self._init_rope()
382
+
383
+ def _init_rope(self):
384
+ if self.config.rope_scaling is None:
385
+ self.rotary_emb = EvaByteRotaryEmbedding(self.head_dim,
386
+ max_position_embeddings=self.max_position_embeddings,
387
+ base=self.rope)
388
+ else:
389
+ scaling_type = self.config.rope_scaling["type"]
390
+ scaling_factor = self.config.rope_scaling["factor"]
391
+ if scaling_type == "linear":
392
+ self.rotary_emb = EvaByteLinearScalingRotaryEmbedding(
393
+ self.head_dim,
394
+ max_position_embeddings=self.max_position_embeddings,
395
+ scaling_factor=scaling_factor,
396
+ base=self.rope)
397
+ elif scaling_type == "dynamic":
398
+ self.rotary_emb = EvaByteDynamicNTKScalingRotaryEmbedding(
399
+ self.head_dim,
400
+ max_position_embeddings=self.max_position_embeddings,
401
+ scaling_factor=scaling_factor,
402
+ base=self.rope)
403
+ else:
404
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
405
+
406
+ def get_input_embeddings(self):
407
+ return self.embed_tokens
408
+
409
+ def set_input_embeddings(self, value):
410
+ self.embed_tokens = value
411
+
412
+ def _helper_padding_mask(
413
+ self,
414
+ padding_mask,
415
+ causal_mask
416
+ ):
417
+ padding_mask = torch.logical_or(padding_mask, padding_mask.transpose(-1, -2))
418
+ return torch.logical_or(padding_mask, causal_mask)
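+ # Symmetrizes the padding mask (a position is masked if either the query or the
+ # key is padding) and ORs it with the causal mask; True means "do not attend".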
419
+
420
+ def _prepare_eva_generation_attn_mask_triton(
421
+ self,
422
+ attention_mask,
423
+ input_ids,
424
+ use_cache,
425
+ past_key_values
426
+ ):
427
+ batch_size, seq_len = input_ids.shape
428
+ if use_cache and past_key_values.get_seq_length() > 0:
429
+ # decoding phase
430
+ if past_key_values.rf_mask[0] is not None:
431
+ cur_rf_mask = torch.zeros(
432
+ (batch_size, 1, seq_len, 1),
433
+ dtype=past_key_values.rf_mask[0].dtype,
434
+ device=past_key_values.rf_mask[0].device
435
+ )
436
+ else:
437
+ cur_rf_mask = None
438
+
439
+ if past_key_values.s_mask[0] is not None:
440
+ cur_s_mask = torch.zeros(
441
+ (batch_size, 1, seq_len, 1),
442
+ dtype=past_key_values.s_mask[0].dtype,
443
+ device=past_key_values.s_mask[0].device
444
+ )
445
+ else:
446
+ cur_s_mask = None
447
+
448
+ seen_tokens = past_key_values.get_seq_length()
449
+ if seen_tokens <= self.config.window_size:
450
+ rfa_chunks_dummy_mask = None
451
+ else:
452
+ if cur_s_mask is not None:
453
+ chunks_per_window = int(self.config.window_size // self.config.chunk_size)
454
+ # the ongoing decoding step would be (seen_seq_len + 1)-th token
455
+ num_windows_seen_so_far = seen_tokens // self.config.window_size
456
+ rfa_chunks_dummy_mask = torch.zeros(
457
+ (batch_size, 1, seq_len, num_windows_seen_so_far * chunks_per_window),
458
+ dtype=past_key_values.s_mask[0].dtype,
459
+ device=past_key_values.s_mask[0].device
460
+ )
461
+ else:
462
+ rfa_chunks_dummy_mask = None
463
+ # all masks for the current decoding step are zeros because we do not want to mask it
464
+ return (cur_s_mask, cur_rf_mask, rfa_chunks_dummy_mask)
465
+
466
+ if attention_mask is not None and torch.any(attention_mask == 0.0):
467
+ # convert 0 -> padding to 1 -> padding
468
+ padded_attention_mask = pad_to_multiple(
469
+ attention_mask,
470
+ self.config.window_size,
471
+ dim=-1,
472
+ value=0,
473
+ create_mask=False,
474
+ left_padding=False
475
+ )
476
+ # convert 0 -> padding to 1 -> padding
477
+ padded_rf_mask = ~padded_attention_mask.unsqueeze(1).unsqueeze(-1).to(torch.bool) # [b, 1, n, 1]
478
+ # [b, 1, w, j, 1]
479
+ padded_w_attn_mask = padded_rf_mask.reshape(batch_size, 1, -1, self.config.window_size, 1).to(torch.bool)
480
+ # [b, 1, w, j, 1] [b, 1, w, 1, j] -> [b, 1, w, j, j]
481
+ w_padding_mask = torch.logical_or(padded_w_attn_mask, padded_w_attn_mask.transpose(-1, -2))
482
+ w_causal_mask = torch.ones(
483
+ (1, 1, 1, self.config.window_size, self.config.window_size),
484
+ device=input_ids.device
485
+ ).triu(1).to(torch.bool)
486
+ s_mask = torch.logical_or(w_padding_mask, w_causal_mask)
487
+ s_mask = s_mask.reshape(batch_size, 1, -1, self.config.window_size)
488
+ s_mask = s_mask[..., :seq_len, :]
489
+ # negate the attention mask to get the padding mask
490
+ rf_mask = ~attention_mask.unsqueeze(1).unsqueeze(-1).to(torch.bool) # [b, 1, n, 1]
491
+ return (s_mask, rf_mask)
492
+ else:
493
+ return (None, None)
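+ # Summary of return values for the triton path:
+ #   prefill:  (s_mask, rf_mask)                       -- or (None, None) when there is no padding
+ #   decoding: (cur_s_mask, cur_rf_mask, rfa_chunks_dummy_mask)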
494
+
495
+ def _prepare_eva_generation_attn_mask(
496
+ self,
497
+ attention_mask,
498
+ input_ids,
499
+ use_cache,
500
+ past_key_values
501
+ ):
502
+ batch_size, seq_len = input_ids.shape
503
+ if use_cache and past_key_values.get_seq_length() > 0:
504
+ # decoding phase
505
+ if past_key_values.rf_mask[0] is not None:
506
+ rf_mask = torch.zeros(
507
+ (batch_size, 1, seq_len, 1),
508
+ dtype=past_key_values.rf_mask[0].dtype,
509
+ device=past_key_values.rf_mask[0].device
510
+ )
511
+ else:
512
+ rf_mask = None
513
+
514
+ cur_causal_mask = torch.zeros(
515
+ (batch_size, 1, seq_len, 1),
516
+ dtype=torch.bool,
517
+ device=input_ids.device
518
+ )
519
+
520
+ chunk_causal_mask = torch.ones(
521
+ (batch_size, 1, seq_len, 1),
522
+ dtype=torch.bool,
523
+ device=input_ids.device
524
+ )
525
+ # chunk_causal_mask is all ones because these positions are masked by default
+ # and are unmasked once the current singleton attention has been processed
527
+ return (None, cur_causal_mask, chunk_causal_mask, rf_mask)
528
+
529
+ true_num_chunks = seq_len // self.config.chunk_size
530
+ chunk_causal_mask, _ = prepare_eva_attention_mask(
531
+ seq_len,
532
+ input_ids.device,
533
+ self.config.chunk_size,
534
+ self.config.window_size,
535
+ use_cache=use_cache,
536
+ cache=past_key_values
537
+ )
538
+ chunk_causal_mask = chunk_causal_mask[..., :seq_len, :true_num_chunks]
539
+ if attention_mask is not None and torch.any(attention_mask == 0.0):
540
+ # convert 0 -> padding to 1 -> padding
541
+ rf_mask = ~attention_mask.unsqueeze(1).unsqueeze(-1).to(torch.bool) # [b, 1, n, 1]
542
+ else:
543
+ rf_mask = None
544
+
545
+ if seq_len < self.config.window_size:
546
+ cur_window_mask = torch.ones(
547
+ (1, 1, seq_len, seq_len),
548
+ device=input_ids.device
549
+ ).triu(1).to(torch.bool)
550
+ if rf_mask is not None:
551
+ cur_window_mask = self._helper_padding_mask(rf_mask, cur_window_mask)
552
+ prev_window_mask = None
553
+ else:
554
+ if seq_len % self.config.window_size == 0:
555
+ num_windows = seq_len // self.config.window_size
556
+ cur_window_mask = None
557
+ prev_window_mask = torch.ones(
558
+ (1, 1, num_windows, self.config.window_size, self.config.window_size),
559
+ device=input_ids.device
560
+ ).triu(1).to(torch.bool)
561
+ if rf_mask is not None:
562
+ prev_rf_mask = rf_mask.reshape(batch_size, 1, -1, self.config.window_size, 1)
563
+ prev_window_mask = self._helper_padding_mask(prev_rf_mask, prev_window_mask)
564
+ else:
565
+ num_windows = seq_len // self.config.window_size
566
+ remainder_tokens = seq_len % self.config.window_size
567
+ cur_window_mask = torch.ones(
568
+ (1, 1, remainder_tokens, remainder_tokens),
569
+ device=input_ids.device
570
+ ).triu(1).to(torch.bool)
571
+ prev_window_mask = torch.ones(
572
+ (1, 1, num_windows, self.config.window_size, self.config.window_size),
573
+ device=input_ids.device
574
+ ).triu(1).to(torch.bool)
575
+ if rf_mask is not None:
576
+ prev_rf_mask, cur_rf_mask = torch.split(rf_mask, [seq_len - remainder_tokens, remainder_tokens], dim=-2)
577
+ cur_window_mask = self._helper_padding_mask(cur_rf_mask, cur_window_mask)
578
+ prev_rf_mask = prev_rf_mask.reshape(batch_size, 1, -1, self.config.window_size, 1)
579
+ prev_window_mask = self._helper_padding_mask(prev_rf_mask, prev_window_mask)
580
+
581
+ return (prev_window_mask, cur_window_mask, chunk_causal_mask, rf_mask)
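+ # Non-triton path returns a 4-tuple of boolean masks (True = masked):
+ # (prev_window_mask, cur_window_mask, chunk_causal_mask, rf_mask); any entry may
+ # be None when the corresponding mask is not needed.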
582
+
583
+ def forward(
584
+ self,
585
+ input_ids: torch.LongTensor = None,
586
+ attention_mask: Optional[torch.Tensor] = None,
587
+ position_ids: Optional[torch.LongTensor] = None,
588
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
589
+ inputs_embeds: Optional[torch.FloatTensor] = None,
590
+ use_cache: Optional[bool] = None,
591
+ output_attentions: Optional[bool] = None,
592
+ output_hidden_states: Optional[bool] = None,
593
+ return_dict: Optional[bool] = None,
594
+ multibyte_decoding: Optional[bool] = None,
595
+ ) -> Tuple:
596
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
597
+ output_hidden_states = (output_hidden_states
598
+ if output_hidden_states is not None else self.config.output_hidden_states)
599
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
600
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
601
+
602
+ if (input_ids is None) ^ (inputs_embeds is not None):
603
+ raise ValueError(
604
+ "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
605
+ )
606
+
607
+ if self.gradient_checkpointing and self.training and use_cache:
608
+ raise ValueError("`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
609
+
610
+ batch_size, seq_len = input_ids.shape if input_ids is not None else inputs_embeds.shape[:2]
611
+ #### Step 0. Hack
612
+ if (not self.training) and (not use_cache) and (not multibyte_decoding):
613
+ # forward-only inference mode: force use_cache=True so the generation
+ # code path below can be reused
615
+ use_cache = True
616
+ device = input_ids.device if input_ids is not None else None
617
+ if position_ids is None:
618
+ position_ids = torch.arange(0, seq_len, device=device, dtype=int).reshape(1, -1).expand(batch_size, -1)
619
+
620
+ #### Step 1. Prepare caches if in inference mode
621
+ if use_cache:
622
+ if past_key_values is not None:
623
+ assert isinstance(past_key_values, Cache)
624
+ else:
625
+ if not USE_TRITON_IMPL:
626
+ past_key_values = EvaCache()
627
+ else:
628
+ past_key_values = EvaStaticCacheForTriton(
629
+ input_ids.shape[0],
630
+ self.config.num_attention_heads,
631
+ self.config.window_size,
632
+ self.config.hidden_size // self.config.num_attention_heads,
633
+ self.config.num_hidden_layers,
634
+ self.embed_tokens.weight.dtype,
635
+ self.embed_tokens.weight.device,
636
+ )
637
+
638
+ if not multibyte_decoding:
639
+ if use_cache:
640
+ if USE_TRITON_IMPL:
641
+ causal_mask = self._prepare_eva_generation_attn_mask_triton(
642
+ attention_mask,
643
+ input_ids,
644
+ use_cache,
645
+ past_key_values
646
+ )
647
+ else:
648
+ causal_mask = self._prepare_eva_generation_attn_mask(
649
+ attention_mask,
650
+ input_ids,
651
+ use_cache,
652
+ past_key_values
653
+ )
654
+ else:
655
+ assert self.training
656
+ assert seq_len % self.config.window_size == 0
657
+ # for training, we need to pass in the attention mask
658
+ # usually calculated by _prepare_training_attn_mask()
659
+ causal_mask = attention_mask
660
+ else:
661
+ assert use_cache
662
+ causal_mask = attention_mask
663
+
664
+ if inputs_embeds is None:
665
+ inputs_embeds = self.embed_tokens(input_ids)
666
+
667
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
668
+ max_seq_length = past_seen_tokens + inputs_embeds.shape[1]
669
+
670
+ hidden_states = inputs_embeds
671
+
672
+ if position_ids is None:
673
+ assert not use_cache, "during decoding we must explicitly pass position_ids to the model call"
674
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
675
+ position_ids = torch.arange(past_seen_tokens, max_seq_length, device=device, dtype=int).reshape(1, -1).expand(batch_size, -1)
676
+
677
+ cos, sin = self.rotary_emb(hidden_states, seq_len=max_seq_length)
678
+ assert len(cos.shape) == 2, f"cos should be of shape (max_seq_len, head_dim), got {cos.shape} instead"
679
+ assert sin.shape == cos.shape, f"sin should be of shape (max_seq_len, head_dim), got {sin.shape} instead"
680
+ assert len(position_ids.shape) == 2, f"position_ids should be of 2D, got {position_ids.shape} instead"
681
+ cos = cos[position_ids, :]
682
+ sin = sin[position_ids, :]
683
+ cos = cos.unsqueeze(1)
684
+ sin = sin.unsqueeze(1)
685
+
686
+ if USE_TRITON_IMPL and (not multibyte_decoding):
687
+ # the masks generated above for triton kernels are boolean. Convert them to floats
688
+ if (
689
+ (not use_cache) or
690
+ (use_cache and past_seen_tokens == 0)
691
+ ):
692
+ window_mask, intra_chunk_mask = causal_mask
693
+
694
+ if window_mask is not None:
695
+ assert window_mask.dtype == torch.bool
696
+ window_mask_float = window_mask.to(torch.float)
697
+ window_mask_float = window_mask_float.masked_fill(window_mask.to(torch.bool), MASK_MIN_VALUE)
698
+ window_mask_float = window_mask_float.reshape(batch_size, 1, -1, self.config.window_size)
699
+ window_mask = window_mask_float.to(hidden_states.dtype)
700
+
701
+ if intra_chunk_mask is not None:
702
+ assert intra_chunk_mask.dtype == torch.bool
703
+ intra_chunk_mask_float = intra_chunk_mask.to(torch.float)
704
+ intra_chunk_mask_float = intra_chunk_mask_float.masked_fill(intra_chunk_mask.to(torch.bool), MASK_MIN_VALUE)
705
+ intra_chunk_mask = intra_chunk_mask_float.to(hidden_states.dtype)
706
+ causal_mask = (window_mask, intra_chunk_mask)
707
+
708
+ if self.config.fp32_skip_add:
709
+ hidden_states = hidden_states.float()
710
+
711
+ # decoder layers
712
+ all_hidden_states = () if output_hidden_states else None
713
+ all_self_attns = () if output_attentions else None
714
+ next_decoder_cache = None
715
+
716
+ for decoder_layer in self.layers:
717
+ if output_hidden_states:
718
+ all_hidden_states += (hidden_states, )
719
+
720
+ if self.gradient_checkpointing and self.training:
721
+
722
+ def create_custom_forward(module):
723
+ def custom_forward(*inputs):
724
+ # None for past_key_value
725
+ return module(*inputs, output_attentions, use_cache=None)
726
+
727
+ return custom_forward
728
+
729
+ layer_outputs = torch.utils.checkpoint.checkpoint(
730
+ create_custom_forward(decoder_layer),
731
+ hidden_states,
732
+ causal_mask,
733
+ position_ids,
734
+ None,
735
+ )
736
+ else:
737
+ layer_outputs = decoder_layer(
738
+ hidden_states,
739
+ attention_mask=causal_mask,
740
+ position_ids=position_ids,
741
+ past_key_value=past_key_values,
742
+ output_attentions=output_attentions,
743
+ use_cache=use_cache,
744
+ cos=cos,
745
+ sin=sin,
746
+ multibyte_decoding=multibyte_decoding,
747
+ )
748
+
749
+ hidden_states = layer_outputs[0]
750
+
751
+ if use_cache:
752
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
753
+
754
+ if output_attentions:
755
+ all_self_attns += (layer_outputs[1], )
756
+
757
+ hidden_states = self.norm(hidden_states)
758
+
759
+ # add hidden states from the last decoder layer
760
+ if output_hidden_states:
761
+ all_hidden_states += (hidden_states, )
762
+
763
+ next_cache = next_decoder_cache if use_cache else None
764
+ if not return_dict:
765
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
766
+
767
+ return BaseModelOutputWithPast(
768
+ last_hidden_state=hidden_states,
769
+ past_key_values=next_cache,
770
+ hidden_states=all_hidden_states,
771
+ attentions=all_self_attns,
772
+ )
773
+
774
+
775
+ class EvaByteForCausalLM(EvaBytePreTrainedModel, MultiByteDecodingMixin):
776
+ _tied_weights_keys = ["lm_head.weight"]
777
+
778
+ def __init__(self, config):
779
+ EvaBytePreTrainedModel.__init__(self, config)
780
+
781
+ self.model = EvaByteModel(config)
782
+ self.vocab_size = config.vocab_size
783
+ # define multibyte prediction heads
784
+ if hasattr(config, "num_pred_heads") and config.num_pred_heads > 1:
785
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size * config.num_pred_heads, bias=False)
786
+ else:
787
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
788
+
789
+ self.post_init()
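+ # With num_pred_heads > 1 the lm_head emits vocab_size * num_pred_heads logits per
+ # position; presumably these are split into one byte distribution per prediction
+ # head by the MultiByteDecodingMixin (the exact reshape lives in that mixin).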
790
+
791
+ def get_input_embeddings(self):
792
+ return self.model.embed_tokens
793
+
794
+ def set_input_embeddings(self, value):
795
+ self.model.embed_tokens = value
796
+
797
+ def get_output_embeddings(self):
798
+ return self.lm_head
799
+
800
+ def set_output_embeddings(self, new_embeddings):
801
+ self.lm_head = new_embeddings
802
+
803
+ def set_decoder(self, decoder):
804
+ self.model = decoder
805
+
806
+ def get_decoder(self):
807
+ return self.model
808
+
809
+ def _prepare_training_attn_mask(
810
+ self,
811
+ target_token_type_ids,
812
+ use_doc_boundary_attention,
813
+ EOS_TOKEN_TYPE_ID=None,
814
+ PAD_TOKEN_TYPE_ID=None,
815
+ ):
816
+ '''
817
+ This function prepares the attention mask for training byte models.
818
+ target_token_type_ids:
819
+ Tensor of shape (batch_size, seq_len), marking the token type ids
820
+ for the target sequence. In particular, we should have
821
+ - target_token_type_ids[i, j] = EOS_TOKEN_TYPE_ID
822
+ if the j-th token in the i-th sequence is the end of an article.
823
+ - target_token_type_ids[i, j] = PAD_TOKEN_TYPE_ID
824
+ if the j-th token in the i-th sequence is the padding token.
825
+ use_doc_boundary_attention: bool,
826
+ whether to enable doc boundary attention.
827
+ EOS_TOKEN_TYPE_ID: int,
828
+ the token type id for the end of an article.
829
+ PAD_TOKEN_TYPE_ID: int,
830
+ the token type id for the padding token.
831
+ '''
832
+ assert self.training
833
+ batch_size, num_tokens = target_token_type_ids.shape
834
+
835
+ chunk_causal_mask, window_causal_mask = prepare_eva_attention_mask(
836
+ num_tokens,
837
+ target_token_type_ids.device,
838
+ chunk_size=self.config.chunk_size,
839
+ window_size=self.config.window_size,
840
+ use_cache=False,
841
+ cache=None
842
+ )
843
+ if use_doc_boundary_attention:
844
+ #### step 1: mark each document with a unique id
845
+ end_token_ids = {EOS_TOKEN_TYPE_ID, PAD_TOKEN_TYPE_ID}
846
+ token_types = torch.zeros(batch_size, num_tokens)
847
+ for sequence_idx, sequence in enumerate(target_token_type_ids):
848
+ num_articles = 0
849
+ start_index = 0
850
+ # for each sample in the batch, the collapsed attention mask looks like:
851
+ # [1, 1, .... 1, 0, 2, 2, ... 2, 0, ... n, n ..... n], assuming there are n articles in the sequence.
852
+ # Each of the n articles is separated by a 0.
853
+ for token_idx, token_type_id in enumerate(sequence):
854
+ if start_index is not None and token_type_id.item() in end_token_ids:
855
+ num_articles += 1
856
+ end_index = token_idx if token_type_id == PAD_TOKEN_TYPE_ID else token_idx + 1
857
+ token_types[sequence_idx][start_index:end_index] = num_articles
858
+ start_index = None
859
+ elif start_index is None and token_type_id not in end_token_ids:
860
+ start_index = token_idx + 1
861
+
862
+ assert num_tokens % self.config.chunk_size == 0, "Number of tokens must be divisible by chunk size"
863
+ assert num_tokens % self.config.window_size == 0, "Number of tokens must be divisible by window size"
864
+ num_chunks = num_tokens // self.config.chunk_size
865
+ num_windows = num_tokens // self.config.window_size
866
+
867
+ article_separator = 0
868
+
869
+ #### step 2: generate attention masks for each window
870
+ #### NOTE: we perform exact attention within each window,
871
+ #### so we only need to mask out different documents
872
+ #### for each window.
873
+ token_types_windows = token_types.reshape(batch_size, num_windows, self.config.window_size, 1)
874
+ token_types_windows_t = token_types_windows.transpose(-1, -2)
875
+ # replace all elements in TOKEN_SEPS with -1
876
+ token_types_windows = torch.where(token_types_windows == article_separator, -1, token_types_windows)
877
+ window_3d_mask = (token_types_windows == token_types_windows_t)
878
+ window_3d_mask = ~window_3d_mask
879
+
880
+ #### step 3: generate chunk-level 3D masks
881
+ #### NOTE: this is a bit tricky, as we aim to mask out different
882
+ #### documents to avoid cross-doc attention across chunks.
883
+ #### Example: suppose we have a sequence of length 12 with 3 documents:
884
+ #### [1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3].
885
+ #### The chunk-size and window-size are both 4.
886
+ #### The chunk-level mask of shape (batch_size, seq_len, num_chunks) is:
887
+ #### [
888
+ #### [0, 0, 0],
889
+ #### [0, 0, 0],
890
+ #### [0, 0, 0],
891
+ #### [0, 0, 0],
892
+ ####
893
+ #### [1, 0, 0],
894
+ #### [0, 0, 0],
895
+ #### [0, 0, 0],
896
+ #### [0, 0, 0],
897
+ ####
898
+ #### [0, 1, 0],
899
+ #### [0, 1, 0],
900
+ #### [0, 1, 0],
901
+ #### [0, 1, 0],
902
+ #### ]
903
+ #### Explanation:
904
+ #### - Tokens will not attend to their own and future chunks.
905
+ #### (as tokens within a chunk are captured by the window-level exact attention)
906
+ #### - Tokens will attend to a chunk only if there are tokens
907
+ #### from the same document in that chunk.
908
+ #### The mask within each chunk of shape (batch_size, num_chunks, chunk_size) is:
909
+ #### [
910
+ #### [1, 1, 1, 1],
911
+ #### [0, 0, 0, 1],
912
+ #### [1, 1, 1, 1],
913
+ #### ]
914
+ #### Explanation:
915
+ #### - If all tokens in a chunk are from the same document,
916
+ #### no tokens will be masked out.
917
+ #### - If there are tokens from different documents in a chunk,
918
+ #### only tokens from the rightmost document will be kept.
919
+ #### (b/c the future chunks might contain tokens from the rightmost document,
920
+ #### but all the remaining docs will never get attended by other docs)
921
+ token_types_chunks = token_types.reshape(batch_size, num_chunks, self.config.chunk_size)
922
+ inter_chunk_mask = torch.zeros((batch_size, num_tokens, num_chunks), dtype=torch.bool)
923
+ intra_chunk_mask = torch.ones_like(token_types_chunks, dtype=torch.bool)
924
+
925
+ for chunk_idx in range(num_chunks):
926
+ for batch_idx in range(batch_size):
927
+ # Identify tokens in the current chunk belonging to each sequence
928
+ chunk = token_types_chunks[batch_idx, chunk_idx]
929
+ unique_elements = torch.unique(chunk, sorted=True).tolist()
930
+
931
+ # Create a mask for whether each token can attend to the current chunk
932
+ for token_type in unique_elements:
933
+ if token_type == article_separator:
934
+ continue
935
+ token_mask = (token_types[batch_idx] == token_type)
936
+ inter_chunk_mask[batch_idx, :, chunk_idx] |= token_mask
937
+
938
+ # Create a mask within each chunk
939
+ unique_elements = [x for x in unique_elements if x != article_separator]
940
+ if len(unique_elements) > 1 and chunk[-1] != article_separator:
941
+ intra_chunk_mask[batch_idx, chunk_idx] = (chunk == unique_elements[-1])
942
+
943
+ inter_chunk_mask = ~inter_chunk_mask
944
+ intra_chunk_mask = ~intra_chunk_mask
945
+
946
+ window_mask = torch.logical_or(window_causal_mask, window_3d_mask.unsqueeze(1))
947
+ inter_chunk_mask = torch.logical_or(chunk_causal_mask, inter_chunk_mask.unsqueeze(1))
948
+ intra_chunk_mask = intra_chunk_mask.unsqueeze(1).unsqueeze(-1)
949
+
950
+ joint_mask = torch.cat([window_mask, inter_chunk_mask.reshape(*window_mask.shape)], dim=-1)
951
+ attention_mask = (joint_mask, intra_chunk_mask)
952
+ else:
953
+ joint_mask = torch.cat([window_causal_mask, chunk_causal_mask.reshape(*window_causal_mask.shape)], dim=-1)
954
+ attention_mask = (joint_mask, None)
955
+ return attention_mask
956
+
957
+ def forward(
958
+ self,
959
+ input_ids: torch.LongTensor = None,
960
+ attention_mask: Optional[torch.Tensor] = None,
961
+ position_ids: Optional[torch.LongTensor] = None,
962
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
963
+ inputs_embeds: Optional[torch.FloatTensor] = None,
964
+ labels: Optional[torch.LongTensor] = None,
965
+ use_cache: Optional[bool] = None,
966
+ output_attentions: Optional[bool] = None,
967
+ output_hidden_states: Optional[bool] = None,
968
+ return_dict: Optional[bool] = None,
969
+ return_all_pred_logits: Optional[bool] = None,
970
+ multibyte_decoding: Optional[bool] = None) -> Union[Tuple, CausalLMOutputWithPast]:
971
+
972
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
973
+ output_hidden_states = (output_hidden_states
974
+ if output_hidden_states is not None else self.config.output_hidden_states)
975
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
976
+
977
+ if input_ids is None:
978
+ assert past_key_values is None
979
+
980
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
981
+ outputs = self.model(
982
+ input_ids=input_ids,
983
+ attention_mask=attention_mask,
984
+ position_ids=position_ids,
985
+ past_key_values=past_key_values,
986
+ inputs_embeds=inputs_embeds,
987
+ use_cache=use_cache,
988
+ output_attentions=output_attentions,
989
+ output_hidden_states=output_hidden_states,
990
+ return_dict=return_dict,
991
+ multibyte_decoding=multibyte_decoding,
992
+ )
993
+
994
+ hidden_states = outputs[0]
995
+
996
+ logits = self.lm_head(hidden_states)
997
+ if self.config.fp32_logits:
998
+ logits = logits.float()
999
+
1000
+ loss = None
1001
+ if labels is not None:
1002
+ loss_fct = CrossEntropyLoss(reduction="none")
1003
+ if hasattr(self.config, "num_pred_heads") and self.config.num_pred_heads > 1:
1004
+ shift_logits = logits.view(logits.shape[0], logits.shape[1], self.config.num_pred_heads, self.config.vocab_size)
1005
+ # shift_logits = shift_logits.view(-1, logits.shape[1] * self.config.num_pred_heads, self.config.vocab_size)
1006
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1007
+ else:
1008
+ shift_logits = logits.view(-1, self.config.vocab_size)
1009
+ shift_labels = labels.view(-1)
1010
+ # Enable model parallelism
1011
+ shift_labels = shift_labels.to(shift_logits.device)
1012
+ loss = loss_fct(shift_logits, shift_labels)
1013
+
1014
+ if hasattr(self.config, "num_pred_heads") and self.config.num_pred_heads > 1:
1015
+ all_pred_logits = logits.reshape(logits.shape[0], logits.shape[1], self.config.num_pred_heads, self.config.vocab_size)
1016
+
1017
+ if return_all_pred_logits:
1018
+ logits = all_pred_logits
1019
+ else:
1020
+ logits = all_pred_logits[..., 0, :]
1021
+
1022
+ if not return_dict:
1023
+ output = (logits, ) + outputs[1:]
1024
+ return (loss, ) + output if loss is not None else output
1025
+
1026
+ return CausalLMOutputWithPast(
1027
+ loss=loss,
1028
+ logits=logits,
1029
+ past_key_values=outputs.past_key_values,
1030
+ hidden_states=outputs.hidden_states,
1031
+ attentions=outputs.attentions,
1032
+ )
1033
+
1034
+
1035
+ def prepare_inputs_for_generation(self,
1036
+ input_ids,
1037
+ past_key_values=None,
1038
+ attention_mask=None,
1039
+ inputs_embeds=None,
1040
+ use_cache=True,
1041
+ **kwargs):
1042
+ # prefill phase:
1043
+ # input_ids: b x s
1044
+ # attention_mask: None if no padding or b x s
1045
+ # position_ids : b x s
1046
+
1047
+ # token gen phase:
1048
+ # input_ids : b x 1
1049
+ # attention_mask: b x 1 x s
1050
+ # position_ids: b x 1
1051
+ past_length = 0
1052
+ if past_key_values is not None:
1053
+ assert isinstance(past_key_values, Cache)
1054
+ past_length = past_key_values.get_seq_length()
1055
+
1056
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
1057
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
1058
+ elif past_length < input_ids.shape[1]:
1059
+ input_ids = input_ids[:, past_length:]
1060
+
1061
+ position_ids = kwargs.get("position_ids", None)
1062
+ if attention_mask is not None and position_ids is None:
1063
+ position_ids = attention_mask.long().cumsum(-1) - 1
1064
+ position_ids.masked_fill_(attention_mask == 0, 1)
1065
+ if past_key_values:
1066
+ position_ids = position_ids[:, -input_ids.shape[1]:]
1067
+
1068
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1069
+ if inputs_embeds is not None and past_key_values is None:
1070
+ model_inputs = {"inputs_embeds": inputs_embeds}
1071
+ else:
1072
+ model_inputs = {"input_ids": input_ids}
1073
+
1074
+ # must initialize position_ids at each step during GPU inference
1075
+ assert position_ids is not None
1076
+ model_inputs.update(
1077
+ {
1078
+ "position_ids": position_ids,
1079
+ "past_key_values": past_key_values,
1080
+ "use_cache": use_cache,
1081
+ "attention_mask": attention_mask,
1082
+ }
1083
+ )
1084
+ return model_inputs
1085
+
1086
+ @staticmethod
1087
+ def _reorder_cache(past_key_values, beam_idx):
1088
+ reordered_past = ()
1089
+ for layer_past in past_key_values:
1090
+ reordered_past += (tuple(
1091
+ past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), )
1092
+ return reordered_past
multibyte_decoding_evabyte.py ADDED
@@ -0,0 +1,881 @@
1
+
2
+ # The implementation of multibyte decoding is largely adapted from
3
+ # Medusa decoding: https://github.com/FasterDecoding/Medusa
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from transformers.generation.stopping_criteria import (
7
+ MaxLengthCriteria,
8
+ StoppingCriteriaList,
9
+ )
10
+ from typing import Union, List
11
+ from .eva_cache import EvaStaticCacheForTriton
12
+ from .eva_prep_kv_kernel import triton_eva_prep_kv_fwd
13
+
14
+ class MultibyteEosTokenCriteria:
15
+ """
16
+ This class implements a simple stopping criteria to stop generation whenever
17
+ the "end-of-sequence" token is generated in the last `new_tokens` tokens.
18
+
19
+ Adapted from
20
+ https://github.com/huggingface/transformers/blob/main/src/transformers/generation/stopping_criteria.py#L446
21
+ By default, it uses the `model.generation_config.eos_token_id`.
22
+
23
+ Args:
24
+ eos_token_id (`Union[int, List[int]]`):
25
+ The id(s) of the *end-of-sequence* token.
26
+ """
27
+
28
+ def __init__(self, eos_token_ids: Union[int, List[int]]):
29
+ if isinstance(eos_token_ids, int):
30
+ eos_token_ids = [eos_token_ids]
31
+ self.eos_token_ids = eos_token_ids
32
+
33
+ def __call__(self, input_ids: torch.LongTensor, new_tokens: int) -> bool:
34
+ current_input_len = input_ids.shape[-1]
35
+ new_token_ids = input_ids[:, current_input_len - new_tokens:]
36
+ for eos_token_id in self.eos_token_ids:
37
+ if torch.any(new_token_ids == eos_token_id):
38
+ return True
39
+ return False
40
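+ # Illustrative usage (the eos id below is a made-up example, not the model's actual config):
+ # >>> criteria = MultibyteEosTokenCriteria(eos_token_ids=2)
+ # >>> criteria(torch.tensor([[5, 7, 2]]), new_tokens=2)
+ # True # an EOS byte appears among the last 2 generated tokens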
+
41
+ def build_tree(spec):
42
+ nodes_at_depth = []
43
+ nodes_at_depth.append([()]) # Root at depth 1
44
+
45
+ for d in range(1, len(spec) + 1):
46
+ prev_nodes = nodes_at_depth[d - 1]
47
+ spec_list = spec[d - 1]
48
+ current_nodes = []
49
+ for node_idx, node in enumerate(prev_nodes):
50
+ if node_idx < len(spec_list):
51
+ num_children = spec_list[node_idx]
52
+ else:
53
+ num_children = 0
54
+ for child_idx in range(num_children):
55
+ new_node = node + (child_idx,)
56
+ current_nodes.append(new_node)
57
+ nodes_at_depth.append(current_nodes)
58
+
59
+ # Flatten the list of nodes, excluding the root node if desired
60
+ all_nodes = [node for depth_nodes in nodes_at_depth for node in depth_nodes if node]
61
+ return all_nodes
62
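+ # Illustrative example: a toy two-level spec where the first extra head keeps 2
+ # candidates and each of them keeps 1 child produces the flattened node list
+ # >>> build_tree([[2], [1, 1]])
+ # [(0,), (1,), (0, 0), (1, 0)]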
+
63
+ evabyte_7b_95 = build_tree(
64
+ [
65
+ [10],
66
+ [10, 8, 2, 2, 1, 1],
67
+ [10, 4, 2, 1, 0, 0, 0, 0, 0, 0, 2, 1, 1, 0, 0, 0, 0, 0, 1],
68
+ [8, 2, 2, 1, 0, 0, 0, 0, 0, 0, 1],
69
+ [6, 2, 1, 1],
70
+ [4, 2, 1, 1],
71
+ [4, 2, 1],
72
+ ]
73
+ )
74
+ evabyte_7b_31 = build_tree(
75
+ [
76
+ [4],
77
+ [3, 2, 1, 1],
78
+ [3, 2, 1, 1],
79
+ [2, 1, 1],
80
+ [2, 1],
81
+ [2, 1],
82
+ [2, 1],
83
+ ]
84
+ )
85
+ TOPK = 10 # number of top candidates kept per Medusa head when building the sparse tree (10 is sufficient for the trees above)
86
+
87
+ def pad_path(path, length, pad_value=-2):
88
+ """
89
+ Pad the given path list with a specific value up to a specified length.
90
+
91
+ Parameters:
92
+ - path (list): The original list that needs padding.
93
+ - length (int): The desired length of the padded list.
94
+ - pad_value (optional, default=-2): The value to use for padding.
95
+
96
+ Returns:
97
+ - list: A new list based on the original path but padded to the desired length.
98
+
99
+ Example:
100
+ >>> pad_path([1,2,3], 5)
101
+ [1, 2, 3, -2, -2]
102
+
103
+ Note:
104
+ If the given path is already longer than the specified length,
105
+ then no padding occurs, and the original path is returned.
106
+ """
107
+ return path + [pad_value] * (length - len(path))
108
+
109
+ def reset_past_key_values(passed_key_values):
110
+ """
111
+ Resets the current lengths in the passed key-values to zero.
112
+
113
+ This function is designed to be used during the evaluation of a baseline model.
114
+ It iterates through each layer's key-values and sets their current lengths to zero,
115
+ effectively resetting their state.
116
+
117
+ Args:
118
+ - passed_key_values (list of torch.Tensor): Contains past hidden states and past attention values for each layer.
119
+
120
+ Returns:
121
+ - passed_key_values (list of torch.Tensor): Updated past hidden states and past attention values with reset lengths.
122
+ """
123
+ for i in range(len(passed_key_values)):
124
+ for j in range(2):
125
+ passed_key_values[i][j].current_length.fill_(0)
126
+ return passed_key_values
127
+
128
+ def get_nucleus_one_token(logit, temperature, top_p):
129
+ """
130
+ Performs token sampling based on the nucleus (top-p) sampling method.
131
+
132
+ This function selects a token from a given logit distribution using the nucleus sampling strategy.
133
+ It allows for more controlled and diverse generation compared to traditional top-k sampling.
134
+
135
+ Args:
136
+ logit (torch.Tensor): The logits from a language model output, expected to be a 2D tensor (BxC).
137
+ temperature (float): A temperature parameter to control the randomness in sampling.
138
+ Higher values increase diversity, lower values make selections more deterministic.
139
+ top_p (float): The cumulative probability threshold for nucleus sampling.
140
+ It controls the size of the set of high-probability tokens to consider for sampling.
141
+
142
+ Returns:
143
+ torch.Tensor: A tensor containing the indices of the sampled tokens.
144
+ """
145
+ if top_p >= 1:
146
+ return torch.multinomial(F.softmax(logit / temperature, dim=-1), 1)
147
+ logit = logit / temperature
148
+ probs = torch.softmax(logit, dim=-1)
149
+ sorted_logits, sorted_indices = torch.sort(probs, descending=True)
150
+ cum_probs = torch.cumsum(sorted_logits, dim=-1)
151
+ sorted_indices_to_remove = cum_probs > top_p
152
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
153
+ sorted_indices_to_remove[..., 0] = 0
154
+ indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove)
155
+ logit[indices_to_remove] = float('-inf')
156
+ sampled_tokens = torch.multinomial(F.softmax(logit, dim=-1), 1)
157
+ return sampled_tokens
158
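+ # e.g. with probabilities [0.6, 0.3, 0.1] and top_p = 0.8, the cut keeps the
+ # smallest prefix reaching 0.8 ({0.6, 0.3}); the 0.1 token is masked before sampling.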
+
159
+ def get_typical_one_token(logit, temperature, posterior_threshold, posterior_alpha):
160
+ """
161
+ Implements token sampling based on the typical sampling method.
162
+
163
+ This function selects a token from a given logit distribution using the typical sampling strategy,
164
+ aiming to balance between diversity and likelihood in a more nuanced way compared to traditional methods.
165
+
166
+ Args:
167
+ logit (torch.Tensor): The logits from a language model output, expected to be a 2D tensor.
168
+ temperature (float): A parameter to control the randomness in sampling.
169
+ Higher values increase diversity, lower values make selections more deterministic.
170
+ posterior_threshold (float): A threshold to decide the lower bound of probabilities to be considered for sampling.
171
+ posterior_alpha (float): A scaling factor applied to the entropy-based adaptive threshold.
172
+
173
+ Returns:
174
+ torch.Tensor: A tensor containing the indices of the sampled tokens.
175
+ """
176
+ logit = logit / temperature
177
+ probs = torch.softmax(logit, dim=-1)
178
+ entropy = -torch.sum(
179
+ probs * torch.log(probs + 1e-5), dim=-1
180
+ )
181
+ threshold = torch.minimum(
182
+ torch.ones_like(entropy) * posterior_threshold,
183
+ torch.exp(-entropy) * posterior_alpha,
184
+ )
185
+ indices_to_remove = probs < threshold.unsqueeze(-1)
186
+ logit[indices_to_remove] = float('-inf')
187
+ sampled_tokens = torch.multinomial(F.softmax(logit, dim=-1), 1)
188
+ return sampled_tokens
189
+
190
+
191
+
192
+ def generate_medusa_buffers(medusa_choices, device="cuda"):
193
+ """
194
+ Generate buffers for the Medusa structure based on the provided choices.
195
+
196
+ Parameters:
197
+ - medusa_choices (list): A nested list representing the tree in the Medusa structure.
198
+ - device (str): Device to which the tensors should be moved. Default is "cuda".
199
+
200
+ Returns:
201
+ - dict: A dictionary containing buffers related to the Medusa structure.
202
+ """
203
+
204
+ # Sort the medusa_choices based on their lengths and then their values
205
+ sorted_medusa_choices = sorted(medusa_choices, key=lambda x: (len(x), x))
206
+ medusa_len = len(sorted_medusa_choices) + 1
207
+
208
+ # Initialize depth_counts to keep track of how many choices have a particular depth
209
+ depth_counts = [0] * max([len(path) for path in sorted_medusa_choices])
210
+ for path in sorted_medusa_choices:
211
+ depth_counts[len(path) - 1] += 1
212
+
213
+ # Create the attention mask for Medusa
214
+ medusa_attn_mask = torch.eye(medusa_len, medusa_len)
215
+ medusa_attn_mask[:, 0] = 1
216
+ start = 0
217
+ for i in range(len(depth_counts)):
218
+ for j in range(depth_counts[i]):
219
+ cur_medusa_choice = sorted_medusa_choices[start + j]
220
+ # retrieve ancestor position
221
+ if len(cur_medusa_choice) == 1:
222
+ continue
223
+ ancestor_idx = []
224
+ for c in range(len(cur_medusa_choice) - 1):
225
+ ancestor_idx.append(sorted_medusa_choices.index(cur_medusa_choice[:c+1]) + 1)
226
+ medusa_attn_mask[j + start + 1, ancestor_idx] = 1
227
+ start += depth_counts[i]
228
+
229
+ # Generate tree indices for the Medusa structure
230
+ medusa_tree_indices = torch.zeros(medusa_len, dtype=torch.long)
231
+ medusa_tree_indices[0] = 0
232
+ start = 0
233
+ for i in range(len(depth_counts)):
234
+ for j in range(depth_counts[i]):
235
+ cur_medusa_choice = sorted_medusa_choices[start + j]
236
+ medusa_tree_indices[start + j + 1] = cur_medusa_choice[-1] + TOPK * i + 1
237
+ start += depth_counts[i]
238
+
239
+ # Generate position IDs for the Medusa structure
240
+ medusa_position_ids = torch.zeros(medusa_len, dtype=torch.long)
241
+ start = 0
242
+ for i in range(len(depth_counts)):
243
+ medusa_position_ids[start + 1: start + depth_counts[i] + 1] = i + 1
244
+ start += depth_counts[i]
245
+
246
+ # Generate retrieval indices for Medusa structure verification
247
+ retrieve_indices_nest = []
248
+ retrieve_paths = []
249
+ for i in range(len(sorted_medusa_choices)):
250
+ cur_medusa_choice = sorted_medusa_choices[-i-1]
251
+ retrieve_indice = []
252
+ if cur_medusa_choice in retrieve_paths:
253
+ continue
254
+ else:
255
+ for c in range(len(cur_medusa_choice)):
256
+ retrieve_indice.append(sorted_medusa_choices.index(cur_medusa_choice[:c+1]))
257
+ retrieve_paths.append(cur_medusa_choice[:c+1])
258
+ retrieve_indices_nest.append(retrieve_indice)
259
+ max_length = max([len(x) for x in retrieve_indices_nest])
260
+ retrieve_indices = [pad_path(path, max_length) for path in retrieve_indices_nest]
261
+ retrieve_indices = torch.tensor(retrieve_indices, dtype=torch.long)
262
+ retrieve_indices = retrieve_indices + 1
263
+ retrieve_indices = torch.cat([torch.zeros((retrieve_indices.shape[0], 1), dtype=torch.long), retrieve_indices], dim=1)
264
+
265
+ # Aggregate the generated buffers into a dictionary
266
+ medusa_buffers = {
267
+ "medusa_attn_mask": medusa_attn_mask.unsqueeze(0).unsqueeze(0),
268
+ "tree_indices": medusa_tree_indices,
269
+ "medusa_position_ids": medusa_position_ids.unsqueeze(0),
270
+ "retrieve_indices": retrieve_indices,
271
+ }
272
+
273
+ # Move the tensors in the dictionary to the specified device
274
+ medusa_buffers = {
275
+ k: v.clone().to(device)
276
+ if isinstance(v, torch.Tensor)
277
+ else torch.tensor(v, device=device)
278
+ for k, v in medusa_buffers.items()
279
+ }
280
+ return medusa_buffers
281
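+ # Resulting buffer shapes (derived from the construction above; medusa_len = number
+ # of tree nodes + 1 for the root):
+ # medusa_attn_mask: (1, 1, medusa_len, medusa_len)
+ # tree_indices: (medusa_len,)
+ # medusa_position_ids: (1, medusa_len)
+ # retrieve_indices: (num_leaf_paths, max_tree_depth + 1)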
+
282
+ def generate_candidates(
283
+ medusa_logits,
284
+ logits,
285
+ tree_indices,
286
+ retrieve_indices,
287
+ temperature = 0,
288
+ posterior_threshold=0.3,
289
+ posterior_alpha = 0.09,
290
+ top_p=0.8,
291
+ sampling = 'typical',
292
+ fast = False
293
+ ):
294
+ # Say we have 3 heads, and the top-4 for each head are:
295
+ # [10, 3, 8, 4]
296
+ # [9, 5, 1, 6]
297
+ # [7, 16, 3, 2]
298
+
299
+ # candidates_id = 10
300
+ if temperature == 0 or fast:
301
+ candidates_ids = torch.argmax(logits[:, -1]).unsqueeze(0)
302
+ else:
303
+ if sampling == 'typical':
304
+ candidates_ids = get_typical_one_token(logits[:, -1], temperature, posterior_threshold, posterior_alpha).squeeze(0)
305
+ elif sampling == 'nucleus':
306
+ candidates_ids = get_nucleus_one_token(logits[:, -1], temperature, top_p).squeeze(0)
307
+ else:
308
+ raise NotImplementedError
309
+
310
+ # this calculates the top-k medusa logits
311
+ # candidates_medusa_id = [
312
+ # [9, 5, 1, 6]
313
+ # [7, 16, 3, 2]
314
+ # ]
315
+ candidates_medusa_ids = torch.topk(medusa_logits[:, 0, -1], TOPK, dim=-1).indices
316
+
317
+ # [10, 9, 5, 1, 6, 7, 16, 3, 2]
318
+ candidate_ids = torch.cat([candidates_ids, candidates_medusa_ids.view(-1)], dim=-1)
319
+
320
+ # based on the pre-defined tree_indices, select the corresponding candidates
321
+ # if we select top-2 and top-3 for the two heads (we select top-1 for the first head):
322
+ # tree_candidates = [10, 9, 5, 7, 16, 3, 7, 16, 3]
323
+ tree_candidate_ids = candidate_ids[tree_indices]
324
+
325
+ # tree_candidate_ids = [10, 9, 5, 7, 16, 3, 7, 16, 3, 0]
326
+ # The retrieve_indices may be padded (pad value -2 becomes -1 after the shift), so we append a zero here
327
+ # so that all padded indices select the appended zero.
328
+ tree_candidate_ids_ext = torch.cat(
329
+ [
330
+ tree_candidate_ids,
331
+ torch.zeros((1), dtype=torch.long, device=tree_candidate_ids.device)
332
+ ],
333
+ dim=0
334
+ )
335
+ # [[10, 9, 7], [10, 9, 16], [10, 9, 3], [10, 5, 7], [10, 5, 16], [10, 5, 3]]
336
+ unflattened_candidate_ids = tree_candidate_ids_ext[retrieve_indices]
337
+
338
+ tree_candidate_ids = tree_candidate_ids.unsqueeze(0)
339
+
340
+ return tree_candidate_ids, unflattened_candidate_ids
341
+
342
+ def get_nucleus_posterior_mask(logits, candidates, temperature, top_p):
343
+ """
344
+ Generates a posterior mask for token candidates using nucleus (top-p) sampling.
345
+
346
+ This function applies nucleus sampling to a set of logits, and then generates a mask indicating
347
+ which candidate tokens are selected. It adapts the sampling strategy to accommodate for
348
+ temperature scaling and cumulative probability thresholding.
349
+
350
+ Args:
351
+ logits (torch.Tensor): A tensor of logits from a language model output.
352
+ candidates (torch.Tensor): A tensor of candidate tokens to compare against sampled tokens.
353
+ temperature (float): A parameter to scale the logits, controlling randomness in sampling.
354
+ top_p (float): The cumulative probability threshold for nucleus sampling.
355
+
356
+ Returns:
357
+ torch.Tensor: A posterior mask indicating which candidate tokens match the sampled tokens.
358
+ """
359
+ # adapted from https://github.com/huggingface/transformers/blob/18a879f47576822aa1a5c49aecb27d89bfa5fa69/examples/run_generation.py#L79
360
+
361
+ # Apply temperature
362
+ logits = logits[:, :-1] / temperature
363
+ n_samples, n_tokens = logits.shape[0], logits.shape[1]
364
+ logits = logits.view(n_samples*n_tokens, -1)
365
+ if top_p >= 1:
366
+ sampled_tokens = torch.multinomial(F.softmax(logits, dim=-1), 1)
367
+ sampled_tokens = sampled_tokens.view(n_samples, n_tokens)
368
+ posterior_mask = (candidates[:, 1:] == sampled_tokens).int()
369
+ return posterior_mask
370
+ # Convert to probabilities (softmax)
371
+ probs = F.softmax(logits, dim=-1)
372
+ # Sort the probabilities
373
+ sorted_logits, sorted_indices = torch.sort(probs, descending=True)
374
+
375
+ # Compute cumulative probabilities
376
+ cum_probs = torch.cumsum(sorted_logits, dim=-1)
377
+
378
+ # Create mask for the top-p nucleus
379
+ sorted_indices_to_remove = cum_probs > top_p
380
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
381
+ sorted_indices_to_remove[..., 0] = 0
382
+
383
+ indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove)
384
+
385
+
386
+ # Remove low-probability tokens
387
+ logits[indices_to_remove] = float('-inf')
388
+ # Sample from the remaining tokens
389
+ sampled_tokens = torch.multinomial(F.softmax(logits, dim=-1), 1)
390
+ sampled_tokens = sampled_tokens.view(n_samples, n_tokens)
391
+ # Create a mask for selected tokens
392
+ posterior_mask = (candidates[:, 1:] == sampled_tokens).int()
393
+
394
+ return posterior_mask
395
+
396
+ def get_typical_posterior_mask(logits, candidates, temperature, posterior_threshold, posterior_alpha):
397
+ """
398
+ Args:
399
+ logits (torch.Tensor): A tensor of logits from a language model output.
400
+ candidates (torch.Tensor): A tensor of candidate tokens to compare against sampled tokens.
401
+ temperature (float): A parameter to scale the logits, controlling randomness in sampling.
402
+ posterior_threshold (float): The minimum threshold for probabilities to be considered in sampling.
403
+ posterior_alpha (float): A scaling factor applied to the entropy-based adaptive threshold.
404
+
405
+ Returns:
406
+ torch.Tensor: A posterior mask indicating which candidate tokens match the sampled tokens.
407
+ """
408
+ logits = logits[:, :-1] / temperature
409
+ n_samples, n_tokens = logits.shape[0], logits.shape[1]
410
+ logits = logits.view(n_samples*n_tokens, -1)
411
+ probs = F.softmax(logits, dim=-1)
412
+ entropy = -torch.sum(
413
+ probs * torch.log(probs + 1e-5), dim=-1
414
+ )
415
+ threshold = torch.minimum(
416
+ torch.ones_like(entropy) * posterior_threshold,
417
+ torch.exp(-entropy) * posterior_alpha,
418
+ )
419
+ indices_to_remove = probs < threshold.unsqueeze(-1)
420
+ logits[indices_to_remove] = float('-inf')
421
+ sampled_tokens = torch.multinomial(F.softmax(logits, dim=-1), 1)
422
+ sampled_tokens = sampled_tokens.view(n_samples, n_tokens)
423
+ posterior_mask = (candidates[:, 1:] == sampled_tokens).int()
424
+ return posterior_mask
425
+
426
+
427
+
428
+ def evaluate_posterior(
429
+ logits,
430
+ candidates,
431
+ temperature,
432
+ posterior_threshold=0.3,
433
+ posterior_alpha = 0.09,
434
+ top_p=0.8,
435
+ sampling = 'typical',
436
+ fast = True
437
+ ):
438
+ if logits.shape[1] <= 1:
439
+ return torch.tensor(0, dtype=torch.long, device=candidates.device), 0
440
+ # Greedy decoding based on temperature value
441
+ if temperature == 0:
442
+ # Find the tokens that match the maximum logits for each position in the sequence
443
+ posterior_mask = (
444
+ candidates[:, 1:] == torch.argmax(logits[:, :-1], dim=-1)
445
+ ).int()
446
+ candidates_accept_length = (torch.cumprod(posterior_mask, dim=1)).sum(dim=1)
447
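+ # e.g. if a candidate path matches the verified argmax at positions [1, 1, 0, 1],
+ # cumprod gives [1, 1, 0, 0] and its accepted length is 2: acceptance stops at
+ # the first mismatch.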
+ accept_length = candidates_accept_length.max().item()
448
+ # Choose the best candidate
449
+ if accept_length == 0:
450
+ # Default to the first candidate if none are accepted
451
+ best_candidate = torch.tensor(0, dtype=torch.long, device=candidates.device)
452
+ else:
453
+ best_candidate = torch.argmax(candidates_accept_length).to(torch.long)
454
+ return best_candidate, accept_length
455
+ elif sampling == 'typical':
456
+ if fast:
457
+ posterior_prob = torch.softmax(logits[:, :-1] / temperature, dim=-1)
458
+ candidates_prob = torch.gather(
459
+ posterior_prob, dim=-1, index=candidates[:, 1:].unsqueeze(-1)
460
+ ).squeeze(-1)
461
+ posterior_entropy = -torch.sum(
462
+ posterior_prob * torch.log(posterior_prob + 1e-5), dim=-1
463
+ ) # torch.sum(torch.log(*)) is faster than torch.prod
464
+ threshold = torch.minimum(
465
+ torch.ones_like(posterior_entropy) * posterior_threshold,
466
+ torch.exp(-posterior_entropy) * posterior_alpha,
467
+ )
468
+ posterior_mask = candidates_prob > threshold
469
+ candidates_accept_length = (torch.cumprod(posterior_mask, dim=1)).sum(dim=1)
470
+
471
+ # Choose the best candidate based on the evaluated posterior probabilities
472
+ accept_length = candidates_accept_length.max().item()
473
+ if accept_length == 0:
474
+ # If no candidates are accepted, just choose the first one
475
+ best_candidate = torch.tensor(0, dtype=torch.long, device=candidates.device)
476
+ else:
477
+ best_candidates = torch.where(candidates_accept_length == accept_length)[0]
478
+ # Accept the best one according to likelihood
479
+ likelihood = torch.sum(
480
+ torch.log(candidates_prob[best_candidates, :accept_length]), dim=-1
481
+ )
482
+ best_candidate = best_candidates[torch.argmax(likelihood)]
483
+ return best_candidate, accept_length
484
+ # Calculate posterior probabilities and thresholds for candidate selection
485
+ posterior_mask = get_typical_posterior_mask(logits, candidates, temperature, posterior_threshold, posterior_alpha)
486
+ candidates_accept_length = (torch.cumprod(posterior_mask, dim=1)).sum(dim=1)
487
+ # Choose the best candidate based on the evaluated posterior probabilities
488
+ accept_length = candidates_accept_length.max().item()
489
+
490
+ if accept_length == 0:
491
+ # If no candidates are accepted, just choose the first one
492
+ best_candidate = torch.tensor(0, dtype=torch.long, device=candidates.device)
493
+ else:
494
+ best_candidate = torch.argmax(candidates_accept_length).to(torch.long)
495
+ # Accept the best one according to likelihood
496
+ return best_candidate, accept_length
497
+ elif sampling == 'nucleus':
498
+ assert top_p < 1.0 + 1e-6, "top_p should be between 0 and 1"
499
+ posterior_mask = get_nucleus_posterior_mask(logits, candidates, temperature, top_p)
500
+ candidates_accept_length = (torch.cumprod(posterior_mask, dim=1)).sum(dim=1)
501
+ accept_length = candidates_accept_length.max().item()
502
+ # Choose the best candidate
503
+ if accept_length == 0:
504
+ # Default to the first candidate if none are accepted
505
+ best_candidate = torch.tensor(0, dtype=torch.long, device=candidates.device)
506
+ else:
507
+ best_candidate = torch.argmax(candidates_accept_length).to(torch.long)
508
+ return best_candidate, accept_length
509
+ else:
510
+ raise NotImplementedError
511
+
512
+ def update_inference_inputs(
513
+ input_ids,
514
+ medusa_logits,
515
+ logits,
516
+ candidate_ids,
517
+ best_candidate,
518
+ accept_length,
519
+ ):
520
+ input_ids = torch.cat(
521
+ [
522
+ input_ids,
523
+ candidate_ids[None, best_candidate, : accept_length + 1]
524
+ ],
525
+ dim=-1
526
+ )
527
+ logits = logits[
528
+ None, best_candidate, accept_length : accept_length + 1
529
+ ]
530
+ medusa_logits = medusa_logits[
531
+ :, None, best_candidate, accept_length : accept_length + 1
532
+ ]
533
+ # Update the new token counter
534
+ new_token = accept_length + 1
535
+ return input_ids, medusa_logits, logits, new_token
536
+
537
+ def split_logits(full_logits):
538
+ # logits has shape [b, n, heads, vocab_size]
539
+ logits = full_logits[..., 0, :]
540
+ medusa_logits = full_logits[..., 1:, :].permute(2, 0, 1, 3)
541
+ return medusa_logits, logits
542
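+ # e.g. assuming 8 prediction heads and a byte vocabulary of size V (both are
+ # config-dependent), full_logits of shape [b, n, 8, V] splits into
+ # logits [b, n, V] for the next-byte head and medusa_logits [7, b, n, V]
+ # for the remaining heads.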
+
543
+ class MultiByteDecodingMixin:
544
+ def multi_byte_pred_update_cache(
545
+ self,
546
+ past_key_values,
547
+ retrieve_indices,
548
+ best_candidate,
549
+ new_tokens,
550
+ ):
551
+ prev_window_len = past_key_values.get_past_window_pos(0)
552
+ select_indices = (
553
+ retrieve_indices[best_candidate, : new_tokens] + prev_window_len
554
+ )
555
+ for layer_idx in range(self.config.num_hidden_layers):
556
+
557
+ past_key_values.update_past_len(new_tokens, layer_idx)
558
+
559
+ past_window_k = past_key_values.past_window_k[layer_idx]
560
+ past_window_v = past_key_values.past_window_v[layer_idx]
561
+
562
+ tgt_window_k = past_window_k[..., select_indices, :]
563
+ tgt_window_v = past_window_v[..., select_indices, :]
564
+
565
+ dst_window_k = past_window_k[..., prev_window_len : prev_window_len + new_tokens, :]
566
+ dst_window_v = past_window_v[..., prev_window_len : prev_window_len + new_tokens, :]
567
+
568
+ dst_window_k.copy_(tgt_window_k, non_blocking=True)
569
+ dst_window_v.copy_(tgt_window_v, non_blocking=True)
570
+
571
+ new_window_len = prev_window_len + new_tokens
572
+ if new_window_len >= self.config.window_size:
573
+ assert new_window_len < 2 * self.config.window_size
574
+
575
+ dump_k = past_window_k[..., :self.config.window_size, :].clone()
576
+ dump_v = past_window_v[..., :self.config.window_size, :].clone()
577
+
578
+ _window_len = new_window_len - self.config.window_size
579
+
580
+ if _window_len > 0:
581
+ new_window_k = past_window_k[..., self.config.window_size : new_window_len, :]
582
+ new_window_v = past_window_v[..., self.config.window_size : new_window_len, :]
583
+
584
+ _dst_window_k = past_window_k[..., : _window_len, :]
585
+ _dst_window_v = past_window_v[..., : _window_len, :]
586
+
587
+ _dst_window_k.copy_(new_window_k, non_blocking=True)
588
+ _dst_window_v.copy_(new_window_v, non_blocking=True)
589
+
590
+ past_key_values.past_window_pos[layer_idx] = _window_len
591
+ else:
592
+ dump_k = None
593
+ dump_v = None
594
+ past_key_values.past_window_pos[layer_idx] = new_window_len
595
+
596
+ if dump_k is not None and dump_v is not None:
597
+ rfa_k, rfa_v = triton_eva_prep_kv_fwd(
598
+ dump_k, dump_v,
599
+ self.model.layers[layer_idx].self_attn.adaptive_mu_k,
600
+ self.model.layers[layer_idx].self_attn.adaptive_phi,
601
+ None,
602
+ self.model.layers[layer_idx].self_attn.head_dim_scaling,
603
+ self.model.layers[layer_idx].self_attn.chunk_size
604
+ )
605
+ rfa_k, rfa_v = past_key_values.update_chunk_rfas(
606
+ rfa_k, rfa_v, layer_idx
607
+ )
608
+ return past_key_values
609
+
610
+ def _multi_byte_pred_update_cache_when_prefil_len_eq_window_size(
611
+ self,
612
+ past_key_values,
613
+ ):
614
+ prev_window_len = past_key_values.get_past_window_pos(0)
615
+ for layer_idx in range(self.config.num_hidden_layers):
616
+
617
+ past_window_k = past_key_values.past_window_k[layer_idx]
618
+ past_window_v = past_key_values.past_window_v[layer_idx]
619
+
620
+ new_window_len = prev_window_len
621
+ if new_window_len == self.config.window_size:
622
+ dump_k = past_window_k[..., :self.config.window_size, :].clone()
623
+ dump_v = past_window_v[..., :self.config.window_size, :].clone()
624
+ past_key_values.past_window_pos[layer_idx] = 0
625
+
626
+ if dump_k is not None and dump_v is not None:
627
+ rfa_k, rfa_v = triton_eva_prep_kv_fwd(
628
+ dump_k, dump_v,
629
+ self.model.layers[layer_idx].self_attn.adaptive_mu_k,
630
+ self.model.layers[layer_idx].self_attn.adaptive_phi,
631
+ None,
632
+ self.model.layers[layer_idx].self_attn.head_dim_scaling,
633
+ self.model.layers[layer_idx].self_attn.chunk_size
634
+ )
635
+ rfa_k, rfa_v = past_key_values.update_chunk_rfas(
636
+ rfa_k, rfa_v, layer_idx
637
+ )
638
+ return past_key_values
639
+
640
+ def multi_byte_pred_update_attn_mask(
641
+ self,
642
+ last_iter_new_tokens,
643
+ tree_candidate_ids,
644
+ past_attn_mask,
645
+ medusa_attn_mask,
646
+ past_key_values,
647
+ ):
648
+ batch_size, tree_candidate_len = tree_candidate_ids.shape
649
+ seen_tokens = past_key_values.get_seq_length()
650
+ # NOTE: past_key_values has been updated so now
651
+ # seen_tokens includes new tokens from the last tree iteration
652
+ assert seen_tokens > 0
653
+ # so one iteration would not cross two windows
654
+ assert last_iter_new_tokens < self.config.window_size
655
+
656
+ if past_attn_mask is not None and seen_tokens < self.config.window_size:
657
+ past_attn_mask = torch.cat(
658
+ [
659
+ past_attn_mask,
660
+ torch.ones(
661
+ [batch_size, 1, tree_candidate_len, last_iter_new_tokens],
662
+ dtype=torch.bool,
663
+ device=self.device
664
+ )
665
+ ],
666
+ dim=-1
667
+ )
668
+ else:
669
+ # we initialize attn mask each time when
670
+ # 1. the model crosses the window boundary, or
671
+ # 2. after prefilling
672
+ chunks_per_window = int(self.config.window_size // self.config.chunk_size)
673
+
674
+ window_tokens = seen_tokens % self.config.window_size
675
+ num_windows_seen_so_far = seen_tokens // self.config.window_size
676
+ attn_mask_len = num_windows_seen_so_far * chunks_per_window + window_tokens
677
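+ # e.g. with window_size = 2048 and chunk_size = 256 (illustrative values only),
+ # after 5000 seen tokens: 2 past windows contribute 2 * 8 = 16 chunk-level
+ # summaries and the current window holds 5000 % 2048 = 904 exact tokens,
+ # so attn_mask_len = 16 + 904 = 920.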
+ past_attn_mask = torch.ones(
678
+ (batch_size, 1, tree_candidate_len, attn_mask_len),
679
+ dtype=torch.bool,
680
+ device=self.device
681
+ )
682
+
683
+ # note that 1 indicates the position is not masked
684
+ tree_attn_mask = torch.cat(
685
+ [
686
+ past_attn_mask,
687
+ medusa_attn_mask.to(torch.bool)
688
+ ],
689
+ dim=-1
690
+ )
691
+ return tree_attn_mask, past_attn_mask
692
+
693
+ @torch.no_grad()
694
+ def multi_byte_generate(
695
+ self,
696
+ input_ids,
697
+ attention_mask=None,
698
+ temperature=0.0,
699
+ max_length=None,
700
+ max_new_tokens=None,
701
+ stopping_criteria=None,
702
+ posterior_threshold=0.09,
703
+ posterior_alpha=0.3,
704
+ top_p=0.8,
705
+ sampling='typical',
706
+ fast=True,
707
+ do_sample=False,
708
+ medusa_choices=None,
709
+ return_acc_lengths=False
710
+ ):
711
+ if do_sample or temperature > 0.0:
712
+ fast = False
713
+
714
+ ### Prepare `max_length` depending on other stopping criteria.
715
+ if max_new_tokens is not None:
716
+ max_length = max_new_tokens + input_ids.shape[-1]
717
+ elif max_new_tokens is None and max_length is None:
718
+ max_length = getattr(self.config, "max_position_embeddings", 32768)
719
+
720
+ ### Set up stopping criteria
721
+ eos_stop_criteria = MultibyteEosTokenCriteria(self.generation_config.eos_token_id)
722
+ stop_criteria = StoppingCriteriaList()
723
+ if max_length is not None:
724
+ max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
725
+ stop_criteria.append(
726
+ MaxLengthCriteria(
727
+ max_length=max_length,
728
+ max_position_embeddings=max_position_embeddings,
729
+ )
730
+ )
731
+ if stopping_criteria is not None and len(stopping_criteria) > 0:
732
+ stop_criteria.extend(stopping_criteria)
733
+
734
+ assert input_ids.shape[0] == 1, "Only support batch size 1 for now"
735
+ assert attention_mask is None, "Only support attention mask None for now"
736
+ # Avoid modifying the input_ids in-place
737
+ input_ids = input_ids.clone()
738
+ position_ids = torch.arange(0, input_ids.shape[1], device=self.device, dtype=int).reshape(1, -1)
739
+
740
+ ####################################################
741
+ # 0. initialize the medusa buffers
742
+ ####################################################
743
+ if medusa_choices is None:
744
+ medusa_choices = evabyte_7b_95
745
+ medusa_buffers = generate_medusa_buffers(
746
+ medusa_choices, device=self.device
747
+ )
748
+
749
+ past_key_values = EvaStaticCacheForTriton(
750
+ input_ids.shape[0],
751
+ self.config.num_attention_heads,
752
+ # we add 256 slots to leave room for the tree candidate tokens
753
+ self.config.window_size + 256,
754
+ self.config.hidden_size // self.config.num_attention_heads,
755
+ self.config.num_hidden_layers,
756
+ self.lm_head.weight.dtype,
757
+ self.lm_head.weight.device,
758
+ )
759
+ # prefill to get medusa logits and logits
760
+ full_logits, past_key_values = self.forward(
761
+ input_ids,
762
+ attention_mask=attention_mask,
763
+ position_ids=position_ids,
764
+ use_cache=True,
765
+ past_key_values=past_key_values,
766
+ return_all_pred_logits=True,
767
+ multibyte_decoding=False,
768
+ )
769
+ # handles an edge case where the prefill length == window_size
770
+ # we force the previous window to be dumped into RFA chunks
771
+ past_key_values = self._multi_byte_pred_update_cache_when_prefil_len_eq_window_size(
772
+ past_key_values
773
+ )
774
+ medusa_logits, logits = split_logits(full_logits)
775
+
776
+ past_attn_mask = None
777
+ last_iter_new_tokens = 0
778
+ max_iters = 32768
779
+ if return_acc_lengths:
780
+ acc_lengths = []
781
+ for _ in range(max_iters):
782
+ ####################################################
783
+ # 1. generate candidate_ids with topk predictions from Medusa heads
784
+ ####################################################
785
+ tree_candidate_ids, unflattened_candidate_ids = generate_candidates(
786
+ medusa_logits,
787
+ logits,
788
+ medusa_buffers["tree_indices"],
789
+ medusa_buffers["retrieve_indices"],
790
+ temperature=temperature,
791
+ posterior_alpha=posterior_alpha,
792
+ posterior_threshold=posterior_threshold,
793
+ top_p=top_p,
794
+ sampling=sampling,
795
+ fast=fast,
796
+ )
797
+
798
+ ####################################################
799
+ # 2. Build the medusa attention mask and position ids
800
+ ####################################################
801
+ # NOTE: 1 indicates the position is not masked
802
+ medusa_attn_mask, past_attn_mask = self.multi_byte_pred_update_attn_mask(
803
+ last_iter_new_tokens,
804
+ tree_candidate_ids,
805
+ past_attn_mask,
806
+ medusa_buffers["medusa_attn_mask"],
807
+ past_key_values,
808
+ )
809
+ medusa_position_ids = medusa_buffers["medusa_position_ids"] + input_ids.shape[1]
810
+
811
+ ####################################################
812
+ # 3. tree decoding
813
+ ####################################################
814
+ tree_full_logits, past_key_values = self.forward(
815
+ tree_candidate_ids,
816
+ past_key_values=past_key_values,
817
+ attention_mask=medusa_attn_mask,
818
+ position_ids=medusa_position_ids,
819
+ return_all_pred_logits=True,
820
+ multibyte_decoding=True,
821
+ )
822
+ _medusa_logits, _logits = split_logits(tree_full_logits)
823
+ medusa_logits = _medusa_logits[..., 0, medusa_buffers["retrieve_indices"], :]
824
+ logits = _logits[..., 0, medusa_buffers["retrieve_indices"], :]
825
+
826
+ ####################################################
827
+ # 4. candidate selection
828
+ ####################################################
829
+ # if the current iteration, with tree tokens, crosses window
830
+ # boundaries, trim the candidate_ids to be within the window
831
+ # so that tokens beyond the window (whose predictions would be inaccurate)
832
+ # will not be considered
833
+ tree_depth = unflattened_candidate_ids.shape[-1]
834
+ if tree_depth + past_key_values.get_past_window_pos(0) > self.config.window_size:
835
+ max_acc_len = self.config.window_size - past_key_values.get_past_window_pos(0)
836
+ _trimmed_unflattened_candidate_ids = unflattened_candidate_ids[:, :max_acc_len]
837
+ _trimmed_logits = logits[:, :max_acc_len]
838
+ else:
839
+ _trimmed_unflattened_candidate_ids = unflattened_candidate_ids
840
+ _trimmed_logits = logits
841
+ best_candidate, accept_length = evaluate_posterior(
842
+ _trimmed_logits,
843
+ _trimmed_unflattened_candidate_ids,
844
+ temperature,
845
+ posterior_threshold,
846
+ posterior_alpha,
847
+ top_p=top_p,
848
+ sampling=sampling,
849
+ fast=fast
850
+ )
851
+
852
+ ####################################################
853
+ # 5. update model inputs and caches
854
+ ####################################################
855
+ input_ids, medusa_logits, logits, last_iter_new_tokens = update_inference_inputs(
856
+ input_ids,
857
+ medusa_logits,
858
+ logits,
859
+ unflattened_candidate_ids,
860
+ best_candidate,
861
+ accept_length,
862
+ )
863
+
864
+ past_key_values = self.multi_byte_pred_update_cache(
865
+ past_key_values,
866
+ medusa_buffers["retrieve_indices"],
867
+ best_candidate,
868
+ last_iter_new_tokens,
869
+ )
870
+
871
+ if return_acc_lengths:
872
+ acc_lengths.append(last_iter_new_tokens)
873
+ if stop_criteria(input_ids, None) or eos_stop_criteria(input_ids, last_iter_new_tokens):
874
+ if return_acc_lengths:
875
+ return input_ids, acc_lengths
876
+ else:
877
+ return input_ids
878
+ if return_acc_lengths:
879
+ return input_ids, acc_lengths
880
+ else:
881
+ return input_ids