farzadab committed · verified
Commit e3c2ab5 · 1 Parent(s): 5d09428

Upload 4 files

Files changed (3)
  1. processor_config.json +1 -1
  2. ultravox_model.py +85 -48
  3. ultravox_processing.py +188 -75
processor_config.json CHANGED
@@ -5,7 +5,7 @@
   "auto_map": {
     "AutoProcessor": "ultravox_processing.UltravoxProcessor"
   },
-  "encoder_ds_factor": 320,
+  "encoder_ds_factor": 2,
   "processor_class": "UltravoxProcessor",
   "stack_factor": 8
 }
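The drop from 320 to 2 is a change of units, not of the effective token rate: the processor now counts encoder downsampling in mel frames (Whisper's convolutions reduce the ~100 mel frames per second by a factor of 2) instead of in raw 16 kHz samples (320 samples per encoder frame). A minimal sketch of the resulting token-count arithmetic, assuming Whisper-style features; the helper name below is illustrative and not part of the repo:

import math

def expected_audio_token_len(num_mel_frames: int, encoder_ds_factor: int = 2, stack_factor: int = 8) -> int:
    # Mirrors the new processor logic: ceil(audio_lens / (encoder_ds_factor * stack_factor)),
    # where audio_lens is measured in mel frames rather than raw samples.
    return math.ceil(num_mel_frames / (encoder_ds_factor * stack_factor))

# 10 s of 16 kHz audio -> ~1000 mel frames -> 63 audio tokens,
# matching what the old sample-based formula (320 samples per encoder frame) produced.
print(expected_audio_token_len(1000))  # 63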
ultravox_model.py CHANGED
@@ -1,6 +1,6 @@
 import logging
 import re
-from typing import Any, Dict, Optional, Set, Tuple, Union
+from typing import Any, Dict, Generator, Optional, Set, Tuple, Union

 import peft
 import torch
@@ -10,6 +10,7 @@ import transformers
 import transformers.activations
 import transformers.modeling_outputs
 import transformers.models
+from transformers.generation.utils import GenerationMixin
 from transformers.models.whisper import modeling_whisper as whisper

 # We must use relative import in this directory to allow uploading to HF Hub
@@ -19,7 +20,7 @@ from .ultravox_config import LossFunction
 from .ultravox_config import UltravoxConfig


-class UltravoxModel(transformers.LlamaPreTrainedModel):
+class UltravoxModel(transformers.LlamaPreTrainedModel, GenerationMixin):
     """
     The Ultravox model which consists of an audio encoder and a language model.

@@ -37,6 +38,9 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
     config: UltravoxConfig  # for type hinting
     # Usually we load encoder and LLM weights from a pretrained model separately, so they are allowed to be missing
     _keys_to_ignore_on_load_missing = ["audio_tower.*", "language_model.*"]
+    # Since we have kwargs in forward, we need to set this to False, otherwise grad_accum_steps will cause incorrect train loss to be reported
+    # see https://github.com/huggingface/transformers/issues/35856 and https://github.com/huggingface/trl/pull/2615/files
+    accepts_loss_kwargs = False

     def __init__(self, config: UltravoxConfig):
         super().__init__(config)
@@ -46,15 +50,16 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         self.vocab_size = config.vocab_size

         self.audio_tower = self._create_audio_tower(config)
+        self.audio_tower_context_length: Optional[int] = None
+        self.audio_tower_context_length = self.audio_tower.max_context_length
+
         self.multi_modal_projector = self._create_multi_modal_projector(config)
         self.language_model = self._create_language_model(config)

         # Determine no_split_modules dynamically to use with FSDP auto_wrap policy.
         # FSDP throws an error if some of the layer types are not found in the model.
-        # This would be something like ["LlamaDecoderLayer", "WhisperEncoderLayer"]
-        self._no_split_modules = (self.language_model._no_split_modules or []) + (
-            self.audio_tower._no_split_modules or []
-        )
+        # This would be something like ["LlamaDecoderLayer"] as we don't split audio encoder layers.
+        self._no_split_modules = self.language_model._no_split_modules

         self.loss_config = LossConfig()
         self.post_init()
@@ -141,6 +146,24 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         )
         return {"loss": kl_loss}

+    def _audio_iter(
+        self, audio_batch_size: torch.Tensor
+    ) -> Generator[Tuple[int, int], None, None]:
+        """
+        Iterate over the audio batch size and yield the batch index and audio index of each audio item.
+
+        Args:
+            audio_batch_size: A tensor of shape (B,) where B is the batch size.
+
+        Returns:
+            A generator that yields a tuple of (batch index, audio index) for each audio item.
+        """
+        audio_index = 0
+        for i_b, batch_count in enumerate(audio_batch_size):
+            for _ in range(batch_count):
+                yield i_b, audio_index
+                audio_index += 1
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -149,8 +172,9 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         labels: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         audio_token_start_idx: Optional[torch.Tensor] = None,
-        audio_len: Optional[torch.Tensor] = None,
+        audio_lens: Optional[torch.Tensor] = None,
         audio_token_len: Optional[torch.Tensor] = None,
+        audio_batch_size: Optional[torch.Tensor] = None,
         past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None,
         # the alt_* fields are needed for KL divergence loss
         alt_input_ids: Optional[torch.Tensor] = None,
@@ -181,29 +205,37 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         # B x T -> B x T x D
         inputs_embeds = self.get_input_embeddings().forward(input_ids)

-        if audio_values is not None:
+        if audio_values is not None and len(audio_values) > 0:
             assert (
-                audio_token_start_idx is not None and audio_token_len is not None
-            ), "audio_token_start_idx and audio_token_len must be provided if audio_values are provided."
+                audio_token_start_idx is not None
+                and audio_token_len is not None
+                and audio_lens is not None
+                and audio_batch_size is not None
+            ), "audio_token_start_idx/audio_token_len/audio_lens must be provided if audio_values are provided."
             assert (
-                len(audio_token_start_idx) == len(audio_token_len) == len(audio_values)
-            ), "audio_token_start_idx, audio_token_len, and audio_values must have the same batch size."
-
-            # B x A/3200 x D
+                len(audio_token_start_idx)
+                == len(audio_token_len)
+                == len(audio_lens)
+                == len(audio_values)
+            ), "audio_token_start_idx/audio_token_len/audio_lens/audio_values must have the same batch size."
+            assert len(audio_batch_size) == len(
+                inputs_embeds
+            ), "audio_batch_size and inputs_embeds must have the same batch size."
+
+            # B x A/3200 x (D=max-audio-length-in-batch)
             audio_tower_output = self.audio_tower.forward(
                 audio_values.to(self.audio_tower.dtype),
-                audio_len=audio_len,
+                audio_len=audio_lens,
             ).last_hidden_state
             audio_tower_output = audio_tower_output.to(inputs_embeds.dtype)
-
             audio_embeds = self.multi_modal_projector.forward(audio_tower_output)

             # combine audio and text embeddings
-            for i, (audio, start, length) in enumerate(
-                zip(audio_embeds, audio_token_start_idx, audio_token_len)
-            ):
-                length = min(length, audio.shape[0])
-                inputs_embeds[i, start : start + length] = audio[:length]
+            for i_b, i_a in self._audio_iter(audio_batch_size):
+                start_idx = audio_token_start_idx[i_a]
+                token_len = audio_token_len[i_a]
+                item_embedding = audio_embeds[i_a][:token_len]
+                inputs_embeds[i_b][start_idx : start_idx + token_len] = item_embedding

         lm_output = self.language_model.forward(
             inputs_embeds=inputs_embeds,
@@ -238,7 +270,8 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         audio_values: Optional[torch.FloatTensor] = None,
         audio_token_start_idx: Optional[torch.Tensor] = None,
         audio_token_len: Optional[torch.Tensor] = None,
-        audio_len: Optional[torch.Tensor] = None,
+        audio_lens: Optional[torch.Tensor] = None,
+        audio_batch_size: Optional[torch.Tensor] = None,
         past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]] = None,
         attention_mask: Optional[torch.Tensor] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
@@ -267,7 +300,8 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
                 audio_token_start_idx - prefill_start_idx
             )
             model_input["audio_token_len"] = audio_token_len
-            model_input["audio_len"] = audio_len
+            model_input["audio_batch_size"] = audio_batch_size
+            model_input["audio_lens"] = audio_lens

         return model_input

@@ -284,7 +318,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         cls, config: UltravoxConfig
     ) -> Union[transformers.Wav2Vec2Model, "ModifiedWhisperEncoder"]:
         if config.audio_model_id is not None:
-            if "whisper" in config.audio_model_id is not None:
+            if "whisper" in config.audio_model_id.lower():
                 audio_tower = ModifiedWhisperEncoder.from_pretrained(
                     config.audio_model_id, torch_dtype=config.torch_dtype
                 )
@@ -300,7 +334,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
                     config.audio_model_id, torch_dtype=config.torch_dtype
                 )
         else:
-            if "whisper" in config.audio_config._name_or_path:
+            if "whisper" in config.audio_config._name_or_path.lower():
                 audio_tower = ModifiedWhisperEncoder(config.audio_config)
                 audio_tower.init_latency_mask(
                     config.audio_latency_block_size, dtype=config.torch_dtype
@@ -393,13 +427,17 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):
         if state_dict is None:
             state_dict = super().state_dict()

-        named_params = dict(self.named_parameters())
+        trainable_params = {k for k, v in self.named_parameters() if v.requires_grad}
+        # normalize the keys to match the original model
+        # Example: audio_tower.base_model.model.layers.0._fsdp_wrapped_module.self_attn.k_proj.lora_B.default.weight
+        trainable_params = {
+            k.replace("_fsdp_wrapped_module.", "") for k in trainable_params
+        }

         state_dict = {
             k: v
             for k, v in state_dict.items()
-            if k in self.keep_params
-            or (k in named_params and named_params[k].requires_grad)
+            if k in self.keep_params or k in trainable_params
         }

         return state_dict
@@ -445,7 +483,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):

 # TODO: refactor common parts to a shared module
 def is_cache_empty(
-    past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]]
+    past_key_values: Optional[Union[Tuple, transformers.cache_utils.Cache]],
 ) -> bool:
     """
     Check if the cache is empty.
@@ -481,12 +519,8 @@ def apply_lora(model: torch.nn.Module, lora_config: dict) -> torch.nn.Module:

 class StackAudioFrames(nn.Module):
     """
-    Stack the audio embedding frames to reduce the sequence length by a factor of `stack_factor`.
-
-    The number of output frames will be `ceil(T / stack_factor) + 1` where `T` is the number of input frames.
-    NOTE: the extra +1 is intentional: in case the number of audio tokens are over-estimated by the processor,
-    we want to make sure `processor.audio_token_replacement` (i.e. EOS) doesn't get leaked into the middle of embeddings.
-    In most cases this extra padding will get removed in the model's forward function so it has no effect.
+    Stack the audio embedding frames to reduce the sequence length by a factor
+    of `stack_factor`.
     """

     def __init__(self, stack_factor: int = 8):
@@ -496,7 +530,7 @@ class StackAudioFrames(nn.Module):
     def forward(self, audio_embeds: torch.Tensor) -> torch.Tensor:
         B, T, C = audio_embeds.shape
         T_pad = (T + self.stack_factor - 1) // self.stack_factor * self.stack_factor
-        audio_embeds = F.pad(audio_embeds, (0, 0, 0, T_pad - T + self.stack_factor))
+        audio_embeds = F.pad(audio_embeds, (0, 0, 0, T_pad - T))
         B, T, C = audio_embeds.shape
         audio_embeds = audio_embeds.view(
             B, T // self.stack_factor, C * self.stack_factor
@@ -568,17 +602,25 @@ class ModifiedWhisperEncoder(
     base_model_prefix = "model.encoder"
     _no_split_modules = ["WhisperEncoderLayer"]

-    def init_latency_mask(self, audio_latency_block_size: int, dtype: torch.dtype):
-        if audio_latency_block_size is None:
-            self.audio_streaming_mask = None
-            return
+    def __init__(self, config: transformers.WhisperConfig):
+        super().__init__(config)
+        self.config.is_decoder = False

-        # maximum sequence length
-        max_seqlen = (
+    @property
+    def max_context_length(self):
+        return (
             self.config.max_source_positions
             * self.conv1.stride[0]
             * self.conv2.stride[0]
         )
+
+    def init_latency_mask(self, audio_latency_block_size: int, dtype: torch.dtype):
+        if audio_latency_block_size is None:
+            self.audio_streaming_mask = None
+            return
+
+        # Use max_context_length directly in the calculation
+        max_seqlen = self.max_context_length
         assert (
             max_seqlen > 0
         ), f"maximum sequence length must be positive, got {max_seqlen}"
@@ -610,11 +652,7 @@ class ModifiedWhisperEncoder(
         output_hidden_states=None,
         return_dict=None,
     ):
-        expected_seq_length = (
-            self.config.max_source_positions
-            * self.conv1.stride[0]
-            * self.conv2.stride[0]
-        )
+        expected_seq_length = self.max_context_length
         if input_features.shape[-1] > expected_seq_length:
             raise ValueError(
                 f"Whisper expects the mel input features to be of length {expected_seq_length} or less, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
@@ -665,7 +703,6 @@ class ModifiedWhisperEncoder(
             attention_mask = self.get_extended_attention_mask(
                 attention_mask,
                 None,
-                device=hidden_states.device,
                 dtype=hidden_states.dtype,
             )

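For reference, the new splicing logic in forward() can be reduced to the following self-contained sketch: audio_batch_size maps the flat list of audio chunks back to batch rows (the same iteration order as _audio_iter), and each chunk's projected embeddings overwrite the corresponding run of placeholder token embeddings. Shapes and index values below are toy examples, not taken from the model.

import torch

def splice_audio_embeds(inputs_embeds, audio_embeds, audio_token_start_idx, audio_token_len, audio_batch_size):
    # Chunks are laid out batch-row by batch-row, audio_batch_size[i] chunks per row.
    audio_index = 0
    for i_b, num_chunks in enumerate(audio_batch_size.tolist()):
        for _ in range(num_chunks):
            start = int(audio_token_start_idx[audio_index])
            length = int(audio_token_len[audio_index])
            inputs_embeds[i_b, start : start + length] = audio_embeds[audio_index][:length]
            audio_index += 1
    return inputs_embeds

# Toy batch: sample 0 carries two audio chunks, sample 1 carries one.
B, T, D = 2, 10, 4
out = splice_audio_embeds(
    torch.zeros(B, T, D),
    torch.ones(3, 3, D),  # 3 chunks, up to 3 audio tokens each
    audio_token_start_idx=torch.tensor([1, 4, 2]),
    audio_token_len=torch.tensor([3, 2, 3]),
    audio_batch_size=torch.tensor([2, 1]),
)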
ultravox_processing.py CHANGED
@@ -1,5 +1,5 @@
1
  import dataclasses
2
- from typing import Optional, Union
3
 
4
  import numpy as np
5
  import torch
@@ -15,7 +15,13 @@ class DataCollatorForSeq2SeqWithAudio(transformers.DataCollatorForSeq2Seq):
15
  include_alt_fields: bool = False
16
 
17
  def __call__(self, features, *args, **kwargs):
18
- audio_values = [f.pop("audio_values", None) for f in features]
 
 
 
 
 
 
19
  if self.include_alt_fields:
20
  # these fields are hard-coded in the transformer data collator, so they need special handling before calling the super method
21
  alt_features = [
@@ -34,8 +40,12 @@ class DataCollatorForSeq2SeqWithAudio(transformers.DataCollatorForSeq2Seq):
34
  batch["alt_attention_mask"] = alt_batch["attention_mask"]
35
  batch["alt_labels"] = alt_batch["labels"]
36
 
 
 
 
 
37
  # Pad the last dimension of all audio_values to the same length, with 0s on the right.
38
- if audio_values and audio_values[0] is not None:
39
  max_len = max([x.shape[-1] for x in audio_values])
40
  batch["audio_values"] = torch.stack(
41
  [F.pad(x, (0, max_len - x.shape[-1])) for x in audio_values]
@@ -45,10 +55,12 @@ class DataCollatorForSeq2SeqWithAudio(transformers.DataCollatorForSeq2Seq):
45
  [f["input_ids"].shape[-1] for f in features]
46
  )
47
  displacement = batch["input_ids"].shape[-1] - input_ids_lens
 
 
 
48
  batch["audio_token_start_idx"] += displacement.to(
49
  batch["audio_token_start_idx"].device
50
  )
51
-
52
  return batch
53
 
54
 
@@ -62,11 +74,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
62
  """
63
 
64
  attributes = ["audio_processor", "tokenizer"]
65
- audio_processor_class = (
66
- "Wav2Vec2Processor",
67
- "SeamlessM4TFeatureExtractor",
68
- "WhisperProcessor",
69
- )
70
  tokenizer_class = (
71
  "PreTrainedTokenizer",
72
  "PreTrainedTokenizerFast",
@@ -80,27 +88,32 @@ class UltravoxProcessor(transformers.ProcessorMixin):
80
  audio_processor=None,
81
  tokenizer=None,
82
  audio_padding: str = "longest",
83
- encoder_ds_factor: int = 320,
84
  stack_factor: int = 8,
85
  audio_placeholder: str = "<|audio|>",
 
 
86
  ):
87
  """
88
  Args:
89
  audio_processor: The audio processor for the audio encoder.
90
  tokenizer: The tokenizer for the language model.
91
  audio_padding: The padding strategy for the audio encoder.
92
- encoder_ds_factor: The downsample factor of the audio encoder.
93
  stack_factor: The factor by which the audio encoder output is stacked in the multimodal projector.
 
94
  audio_placeholder: The placeholder for the audio in the text.
 
95
  """
96
  self.audio_padding = audio_padding
97
  self.encoder_ds_factor = encoder_ds_factor
98
  self.stack_factor = stack_factor
99
  self.audio_placeholder = audio_placeholder
100
- self.audio_token_replacement = tokenizer.eos_token
101
  assert (
102
- self.audio_token_replacement is not None
103
  ), "The tokenizer has no EOS token. Cannot recover."
 
 
104
  if tokenizer.pad_token_id is None:
105
  tokenizer.pad_token_id = tokenizer.eos_token_id
106
 
@@ -114,7 +127,7 @@ class UltravoxProcessor(transformers.ProcessorMixin):
114
  audio_processor = transformers.AutoProcessor.from_pretrained(
115
  config.audio_model_id
116
  or config.audio_config._name_or_path
117
- or "facebook/wav2vec2-base-960h"
118
  )
119
 
120
  tokenizer = transformers.AutoTokenizer.from_pretrained(
@@ -129,30 +142,100 @@ class UltravoxProcessor(transformers.ProcessorMixin):
129
  stack_factor=config.stack_factor,
130
  )
131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  def __call__(
133
  self,
134
  text: Optional[str] = None,
135
  audio: Optional[Union[np.ndarray, torch.Tensor]] = None,
 
 
 
 
 
136
  sampling_rate: Optional[int] = None,
137
  return_tensors: Optional[
138
  Union[str, transformers.TensorType]
139
  ] = transformers.TensorType.PYTORCH,
 
140
  **kwargs,
141
  ) -> transformers.BatchFeature:
142
  """
143
  Main method to prepare for the model one text sequence and audio. This method forwards the `text`
144
  and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
145
  the text. To prepare the audio(s), this method forwards the `audio`, `sampling_rate` and `kwargs` arguments to
146
- audio processor's [`~Wav2Vec2Processor.__call__`] if `audio` is not `None`. Please refer to the docstring
147
  of the above two methods for more information.
148
 
149
  Args:
150
  text (`str`, `List[str]`):
151
  The sequence to be encoded. Sequence can be a string or (pretokenized string).
152
  audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
153
- The audio to be prepared. Audio can be NumPy array or PyTorch tensor. In case of a
154
- NumPy array/PyTorch tensor, each audio should be of shape (C, T), where C is a number of channels, and T the
155
- sample length of the audio.
156
  sampling_rate (`int`, *optional*, defaults to 16000):
157
  Sampling rate of the input audio. We expect 16kHz audio. Don't change this value unless you know what
158
  you are doing.
@@ -176,75 +259,105 @@ class UltravoxProcessor(transformers.ProcessorMixin):
176
  Returned when `audio` is not `None`.
177
  - **audio_token_start_idx** -- The index in the tokenized text where the audio starts. Returned when `audio` is not `None`.
178
  """
179
- # TODO: Add support for multiple audio and text inputs.
 
 
 
 
 
 
 
180
  data = {}
181
- audio_embed_frames = 0
182
- if audio is not None and len(audio) > 0:
183
- if self.audio_padding == "max_length":
184
- # 30 seconds is the expected length for Whisper
185
- assert sampling_rate is not None, "Sampling rate must be provided."
186
- audio_len = 30 * sampling_rate
187
- else:
188
- audio_len = audio.shape[-1]
189
- # It's guaranteed that the number of frames is less than or equal to this amount.
190
- # For Whisper this is exact AFAICT, but for Wav2Vec2 it's an upper bound.
191
- # Currently, StackAudioFrames makes sure an over-estimation won't cause issues by padding the audio embeddings.
192
- nb_encoder_frames = int(round(audio_len / self.encoder_ds_factor + 1e-4))
193
- audio_embed_frames = int(np.ceil(nb_encoder_frames / self.stack_factor))
194
- data["audio_token_len"] = [audio_embed_frames]
195
 
196
  # Main audio processing. The processor is model-specific.
197
- x = self.audio_processor(
198
- audio,
199
  sampling_rate=sampling_rate,
200
  padding="longest",
201
- max_length=audio_len,
 
202
  return_attention_mask=True,
203
  **kwargs,
204
  )
205
- if "input_features" in x:
206
- data["audio_values"] = x.input_features
207
- else:
208
- data["audio_values"] = x.input_values
209
-
210
- # data["audio_len"] is the number of frames in the audio, used for creating attention masks in whisper encoder
211
- if (
212
- self.audio_padding == "max_length"
213
- ): # audio is padded to max length, so we rely on the attention mask to determine audio_len
214
- data["audio_len"] = (
215
- x.attention_mask.sum(-1) - 1
216
- ) # Whisper attention mask includes an extra 1 at the end that needs to be subtracted
217
- else: # audio is not padded, so we can directly use the audio length
218
- data["audio_len"] = [torch.as_tensor(data["audio_values"]).shape[-1]]
219
 
220
- if text is not None:
221
- assert isinstance(
222
- text, str
223
- ), "Text must be a string. Batch mode not supported yet."
224
- if self.audio_placeholder in text:
225
- if "audio_token_len" not in data:
226
- raise ValueError(
227
- f"audio must be provided when using audio placeholder ({self.audio_placeholder}) in text."
228
- )
229
-
230
- start_idx = len(
231
- self.tokenizer.encode(
232
- text[: text.index(self.audio_placeholder)],
233
- add_special_tokens=False,
234
- )
235
- )
236
- data["audio_token_start_idx"] = [start_idx]
237
-
238
- # Replace the audio placeholder with the audio token.
239
- # e.g. "Transcribe\n<|audio|>" -> "Transcribe </s></s></s></s></s></s></s></s>"
240
- # where the number of </s> is the number of audio frames.
241
- text = text.replace(
242
- self.audio_placeholder,
243
- self.audio_token_replacement * audio_embed_frames,
244
  )
 
 
 
 
 
 
 
 
 
 
245
 
246
  # Special tokens like BOS should already have been added by the caller.
247
- data.update(self.tokenizer([text], add_special_tokens=False, **kwargs))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
 
249
  return transformers.BatchFeature(data=data, tensor_type=return_tensors)
250
 
 
1
  import dataclasses
2
+ from typing import Any, Dict, List, Optional, Union
3
 
4
  import numpy as np
5
  import torch
 
15
  include_alt_fields: bool = False
16
 
17
  def __call__(self, features, *args, **kwargs):
18
+ audio_values = [x for f in features for x in f.pop("audio_values", [])]
19
+ audio_lens = [x for f in features for x in f.pop("audio_lens", [])]
20
+ audio_token_len = [x for f in features for x in f.pop("audio_token_len", [])]
21
+ audio_token_start_idx = [
22
+ x for f in features for x in f.pop("audio_token_start_idx", [])
23
+ ]
24
+
25
  if self.include_alt_fields:
26
  # these fields are hard-coded in the transformer data collator, so they need special handling before calling the super method
27
  alt_features = [
 
40
  batch["alt_attention_mask"] = alt_batch["attention_mask"]
41
  batch["alt_labels"] = alt_batch["labels"]
42
 
43
+ batch["audio_token_start_idx"] = torch.stack(audio_token_start_idx)
44
+ batch["audio_lens"] = torch.stack(audio_lens)
45
+ batch["audio_token_len"] = torch.stack(audio_token_len)
46
+
47
  # Pad the last dimension of all audio_values to the same length, with 0s on the right.
48
+ if audio_values:
49
  max_len = max([x.shape[-1] for x in audio_values])
50
  batch["audio_values"] = torch.stack(
51
  [F.pad(x, (0, max_len - x.shape[-1])) for x in audio_values]
 
55
  [f["input_ids"].shape[-1] for f in features]
56
  )
57
  displacement = batch["input_ids"].shape[-1] - input_ids_lens
58
+ displacement = displacement.repeat_interleave(
59
+ batch["audio_batch_size"].squeeze(-1)
60
+ )
61
  batch["audio_token_start_idx"] += displacement.to(
62
  batch["audio_token_start_idx"].device
63
  )
 
64
  return batch
65
 
66
 
 
74
  """
75
 
76
  attributes = ["audio_processor", "tokenizer"]
77
+ audio_processor_class = ("WhisperProcessor",)
 
 
 
 
78
  tokenizer_class = (
79
  "PreTrainedTokenizer",
80
  "PreTrainedTokenizerFast",
 
88
  audio_processor=None,
89
  tokenizer=None,
90
  audio_padding: str = "longest",
91
+ encoder_ds_factor: int = 2,
92
  stack_factor: int = 8,
93
  audio_placeholder: str = "<|audio|>",
94
+ # Defaults to whisper encoder context size
95
+ audio_context_size: Optional[int] = 3000,
96
  ):
97
  """
98
  Args:
99
  audio_processor: The audio processor for the audio encoder.
100
  tokenizer: The tokenizer for the language model.
101
  audio_padding: The padding strategy for the audio encoder.
 
102
  stack_factor: The factor by which the audio encoder output is stacked in the multimodal projector.
103
+ encoder_ds_factor: The downsampling factor of the audio encoder.
104
  audio_placeholder: The placeholder for the audio in the text.
105
+ audio_context_size: The maximum number of frames that the audio encoder can handle.
106
  """
107
  self.audio_padding = audio_padding
108
  self.encoder_ds_factor = encoder_ds_factor
109
  self.stack_factor = stack_factor
110
  self.audio_placeholder = audio_placeholder
111
+ self.audio_context_size = audio_context_size
112
  assert (
113
+ tokenizer.eos_token is not None
114
  ), "The tokenizer has no EOS token. Cannot recover."
115
+ self.vocab = tokenizer.get_vocab()
116
+ self.audio_token_replacement = tokenizer.eos_token
117
  if tokenizer.pad_token_id is None:
118
  tokenizer.pad_token_id = tokenizer.eos_token_id
119
 
 
127
  audio_processor = transformers.AutoProcessor.from_pretrained(
128
  config.audio_model_id
129
  or config.audio_config._name_or_path
130
+ or "openai/whisper-tiny"
131
  )
132
 
133
  tokenizer = transformers.AutoTokenizer.from_pretrained(
 
142
  stack_factor=config.stack_factor,
143
  )
144
 
145
+ def _chunk_and_pad_audio(
146
+ self,
147
+ audio_values: torch.Tensor,
148
+ audio_lens: torch.Tensor,
149
+ include_audio_num_chunks: bool = False,
150
+ ) -> Dict[str, Any]:
151
+ """
152
+ Processes the audio batch by chunking any items in the batch according to the audio_context_size,
153
+ padding the last chunk if needed, and returns a dictionary with updated audio data.
154
+
155
+ Args:
156
+ audio_values (torch.Tensor): A tensor of audio values (e.g., in B, D, T format).
157
+ audio_lens (torch.Tensor): A tensor of audio lengths.
158
+
159
+ Returns:
160
+ Dict[str, Any]: Dictionary with the following keys:
161
+ - "audio_values": The concatenated audio tensor after chunking and padding.
162
+ - "audio_lens": Tensor of lengths for each chunk.
163
+ - "audio_is_continuation": Tensor of booleans indicating if the chunk is a continuation of the previous chunk.
164
+ - "audio_batch_size": A Tensor with one integer representing the number of chunks.
165
+
166
+ """
167
+ chunked_audio_values: List[torch.Tensor] = []
168
+ chunked_audio_lens: List[int] = []
169
+ is_continuation_list: List[bool] = []
170
+ num_chunks: List[int] = []
171
+ context_size = self.audio_context_size or audio_values.shape[-1]
172
+
173
+ for i in range(audio_values.shape[0]): # iterate over the batch
174
+ num_chunks.append(int(np.ceil(audio_lens[i] / context_size)))
175
+ for offset in range(0, audio_lens[i], context_size):
176
+ is_continuation = offset > 0
177
+ chunk = audio_values[i, :, offset : offset + context_size]
178
+ if is_continuation and chunk.shape[-1] < context_size:
179
+ # N.B. We only need to pad continuation chunks. If none of the samples require chunking, the
180
+ # batch might not (need to) be padded all the way to the audio_context_size, in which case
181
+ # we've already included the padding above. On the other hand, if we have any continuation
182
+ # chunks we know that the batch needs to be padded to audio_context_size because that's what
183
+ # we're slicing to.
184
+ chunk = F.pad(chunk, (0, context_size - chunk.shape[-1]))
185
+ chunked_audio_values.append(chunk)
186
+ chunked_audio_lens.append(
187
+ min(int(audio_lens[i].item()) - offset, context_size)
188
+ )
189
+ is_continuation_list.append(is_continuation)
190
+
191
+ data = {
192
+ "audio_values": torch.stack(chunked_audio_values, dim=0),
193
+ "audio_lens": torch.tensor(
194
+ chunked_audio_lens, dtype=torch.int64, device=audio_values.device
195
+ ),
196
+ "audio_is_continuation": torch.tensor(
197
+ is_continuation_list, dtype=torch.bool, device=audio_values.device
198
+ ),
199
+ "audio_batch_size": torch.tensor(
200
+ [len(chunked_audio_values)], device=audio_values.device
201
+ ),
202
+ }
203
+ if include_audio_num_chunks:
204
+ data["audio_num_chunks"] = torch.tensor(
205
+ num_chunks, dtype=torch.int64, device=audio_values.device
206
+ )
207
+ return data
208
+
209
  def __call__(
210
  self,
211
  text: Optional[str] = None,
212
  audio: Optional[Union[np.ndarray, torch.Tensor]] = None,
213
+ audios: Optional[
214
+ Union[
215
+ List[Union[np.ndarray, torch.Tensor]], Union[np.ndarray, torch.Tensor]
216
+ ]
217
+ ] = None,
218
  sampling_rate: Optional[int] = None,
219
  return_tensors: Optional[
220
  Union[str, transformers.TensorType]
221
  ] = transformers.TensorType.PYTORCH,
222
+ include_audio_num_chunks: bool = False,
223
  **kwargs,
224
  ) -> transformers.BatchFeature:
225
  """
226
  Main method to prepare for the model one text sequence and audio. This method forwards the `text`
227
  and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
228
  the text. To prepare the audio(s), this method forwards the `audio`, `sampling_rate` and `kwargs` arguments to
229
+ audio processor's [`~WhisperProcessor.__call__`] if `audio` is not `None`. Please refer to the docstring
230
  of the above two methods for more information.
231
 
232
  Args:
233
  text (`str`, `List[str]`):
234
  The sequence to be encoded. Sequence can be a string or (pretokenized string).
235
  audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
236
+ The audio to be prepared. Audio can be a single-channel (1-dimensional) NumPy array or PyTorch tensor.
237
+ audios (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
238
+ A list or two dimensional array of audio to be prepared.
239
  sampling_rate (`int`, *optional*, defaults to 16000):
240
  Sampling rate of the input audio. We expect 16kHz audio. Don't change this value unless you know what
241
  you are doing.
 
259
  Returned when `audio` is not `None`.
260
  - **audio_token_start_idx** -- The index in the tokenized text where the audio starts. Returned when `audio` is not `None`.
261
  """
262
+ # TODO: Add support for multiple text inputs.
263
+ if audio is not None and audios is not None:
264
+ raise ValueError("Only one of `audio` or `audios` should be provided.")
265
+ elif audio is not None:
266
+ audios = audio if isinstance(audio, list) or audio.ndim == 2 else [audio]
267
+ elif audios is None:
268
+ audios = []
269
+
270
  data = {}
271
+ audio_is_continuation = []
272
+ if len(audios) > 0:
273
+ audios = [x.numpy() if isinstance(x, torch.Tensor) else x for x in audios]
274
+
275
+ # Pad out each audio to at least 2 hops (the minimum required by the processor).
276
+ hop_length = self.audio_processor.feature_extractor.hop_length
277
+ audios = [
278
+ (
279
+ np.pad(x, (0, 2 * hop_length - len(x)), mode="constant")
280
+ if len(x) < 2 * hop_length
281
+ else x
282
+ )
283
+ for x in audios
284
+ ]
285
 
286
  # Main audio processing. The processor is model-specific.
287
+ x: transformers.BatchFeature = self.audio_processor(
288
+ audios,
289
  sampling_rate=sampling_rate,
290
  padding="longest",
291
+ pad_to_multiple_of=hop_length, # The attention mask effectively gets padded to the hop length, so pad the audio to be consistent.
292
+ truncation=False,
293
  return_attention_mask=True,
294
  **kwargs,
295
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
 
297
+ data.update(
298
+ self._chunk_and_pad_audio(
299
+ audio_values=torch.as_tensor(
300
+ x.input_features if "input_features" in x else x.input_values
301
+ ),
302
+ audio_lens=torch.as_tensor(x.attention_mask).sum(-1),
303
+ include_audio_num_chunks=include_audio_num_chunks,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
  )
305
+ )
306
+
307
+ audio_is_continuation = data.pop("audio_is_continuation")
308
+ data["audio_token_len"] = torch.ceil(
309
+ data["audio_lens"] / (self.encoder_ds_factor * self.stack_factor)
310
+ ).to(dtype=torch.int)
311
+
312
+ if text is not None:
313
+ if not isinstance(text, str):
314
+ raise ValueError("Text must be a string. Batch mode not supported yet.")
315
 
316
  # Special tokens like BOS should already have been added by the caller.
317
+ tokenized_parts = self.tokenizer(
318
+ text.split(
319
+ "<|audio|>" # The placeholder isn't part of the vocabulary, so split the text around it.
320
+ ),
321
+ add_special_tokens=False,
322
+ **kwargs,
323
+ )
324
+
325
+ audio_token_start_idx = []
326
+ placeholder_index = -1
327
+ split_input_ids = tokenized_parts["input_ids"]
328
+ input_ids: List[int] = []
329
+
330
+ audio_token_replacement_token_id = self.vocab[self.audio_token_replacement]
331
+
332
+ for i, token_len in enumerate(data.get("audio_token_len", [])):
333
+ if not audio_is_continuation[i]:
334
+ placeholder_index += 1
335
+ if placeholder_index >= len(split_input_ids):
336
+ raise ValueError(
337
+ f"Text contains too few audio placeholders. (Expected {len(audios)} placeholders)"
338
+ )
339
+
340
+ input_ids.extend(split_input_ids[placeholder_index])
341
+
342
+ audio_token_start_idx.append(len(input_ids))
343
+
344
+ input_ids.extend([audio_token_replacement_token_id] * token_len)
345
+
346
+ # Include any tokens after the last audio.
347
+ placeholder_index += 1
348
+ if placeholder_index != len(split_input_ids) - 1:
349
+ raise ValueError(
350
+ f"Text contains too many audio placeholders. (Expected {len(audios)} placeholders)"
351
+ )
352
+ input_ids.extend(split_input_ids[placeholder_index])
353
+
354
+ if "audio_token_len" in data:
355
+ data["audio_token_start_idx"] = torch.as_tensor(audio_token_start_idx)
356
+
357
+ data["input_ids"] = [input_ids]
358
+ data["attention_mask"] = [[1] * len(input_ids)]
359
+
360
+ # Ensure that there are no audio placeholders after the last audio.
361
 
362
  return transformers.BatchFeature(data=data, tensor_type=return_tensors)
363