Upload 9 files

- models/__init__.py +10 -0
- models/arguments_live.py +54 -0
- models/configuration_live.py +21 -0
- models/live_llama/__init__.py +2 -0
- models/live_llama/configuration_live_llama.py +7 -0
- models/live_llama/modeling_live_llama.py +154 -0
- models/modeling_live.py +222 -0
- models/tokenization_live.py +153 -0
- models/vision_live.py +61 -0
models/__init__.py
ADDED
@@ -0,0 +1,10 @@
from transformers import HfArgumentParser

from .arguments_live import LiveTrainingArguments, get_args_class
from .live_llama import build_live_llama as build_model_and_tokenizer
from .modeling_live import fast_greedy_generate

def parse_args() -> LiveTrainingArguments:
    args, = HfArgumentParser(LiveTrainingArguments).parse_args_into_dataclasses()
    args, = HfArgumentParser(get_args_class(args.live_version)).parse_args_into_dataclasses()
    return args
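A minimal usage sketch of these exports (illustrative only, not part of the uploaded files; it assumes the package is importable as `models` and mirrors the `__main__` block in modeling_live_llama.py):

from models import parse_args, build_model_and_tokenizer

if __name__ == '__main__':
    args = parse_args()  # resolves live_version first, then re-parses with the matching dataclass
    model, tokenizer = build_model_and_tokenizer(is_training=True, **args.to_dict())
    print(type(model).__name__, type(tokenizer).__name__)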
models/arguments_live.py
ADDED
@@ -0,0 +1,54 @@
from dataclasses import dataclass, field
from transformers import TrainingArguments

@dataclass
class LiveTrainingArguments(TrainingArguments):
    live_version: str = 'live1+'
    system_prompt: str = (
        "A multimodal AI assistant is helping users with some activities."
        " Below is their conversation, interleaved with the list of video frames received by the assistant."
    )
    train_datasets: list[str] = None
    eval_datasets: list[str] = None
    stream_loss_weight: float = 1.0
    llm_pretrained: str = 'meta-llama/Meta-Llama-3-8B-Instruct'
    vision_pretrained: str = 'google/siglip-large-patch16-384'
    lora_modules: str = "model.*(q_proj|k_proj|v_proj|o_proj|gate_proj|up_proj|down_proj)|lm_head$"
    lora_r: int = 128
    lora_alpha: int = 256
    finetune_modules: list[str] = field(default_factory=lambda: ['connector'])
    frame_fps: int = 2 # for training. inference can be 10
    frame_token_cls: bool = None
    frame_token_pooled: list[int] = None
    frame_resolution: int = 384
    frame_token_interval: str = None
    frame_token_interval_threshold: float = 0.0
    augmentation: bool = False
    attn_implementation: str = 'flash_attention_2'
    output_dir: str = 'outputs/debug'

@dataclass
class LiveOneTrainingArguments(LiveTrainingArguments):
    live_version: str = 'live1'
    frame_token_cls: bool = True
    frame_num_tokens: int = 1
    frame_token_interval: str = ''
    embed_mark: str = '2fps_384_1'
    max_num_frames: int = 7200 # 1h, 2fps, 7200 frames

@dataclass
class LiveOnePlusTrainingArguments(LiveTrainingArguments):
    live_version: str = 'live1+'
    frame_token_cls: bool = True
    frame_token_pooled: list[int] = field(default_factory=lambda: [3,3])
    frame_num_tokens: int = 10 # 1+3x3
    embed_mark: str = '2fps_384_1+3x3'
    frame_token_interval: str = ','
    max_num_frames: int = 1200 # 10min, 2fps, 1200 frames

def get_args_class(live_version: str):
    if live_version == 'live1':
        return LiveOneTrainingArguments
    elif live_version == 'live1+':
        return LiveOnePlusTrainingArguments
    raise NotImplementedError
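A quick, hypothetical check of how get_args_class resolves the version-specific defaults above (not part of the uploaded files):

args = get_args_class('live1+')()
assert isinstance(args, LiveOnePlusTrainingArguments)
assert args.frame_num_tokens == 10 and args.frame_token_interval == ','
assert get_args_class('live1')().max_num_frames == 7200  # 1h of frames at 2 fps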
models/configuration_live.py
ADDED
@@ -0,0 +1,21 @@
from transformers import PretrainedConfig

class LiveConfigMixin(PretrainedConfig):
    def __init__(self, *, vision_pretrained: str = None,
        frame_resolution: int = None, frame_token_cls: bool = None, frame_token_pooled: list[int] = None, frame_num_tokens: int = None,
        v_placeholder: str = '<v>', frame_token_interval: str = None, v_placeholder_id: int = None, frame_token_interval_id: int = None,
        stream_loss_weight: float = 1.0, vision_hidden_size=1024, **kwargs
    ):
        super().__init__(**kwargs)
        self.vision_pretrained = vision_pretrained
        self.frame_resolution = frame_resolution
        self.frame_token_cls = frame_token_cls
        self.frame_token_pooled = frame_token_pooled
        self.frame_num_tokens = frame_num_tokens
        self.vision_hidden_size = vision_hidden_size
        self.stream_loss_weight = stream_loss_weight
        self.v_placeholder = v_placeholder
        self.frame_token_interval = frame_token_interval
        self.v_placeholder_id = v_placeholder_id
        self.frame_token_interval_id = frame_token_interval_id
models/live_llama/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .configuration_live_llama import LiveLlamaConfig
from .modeling_live_llama import LiveLlamaForCausalLM, build_live_llama
models/live_llama/configuration_live_llama.py
ADDED
@@ -0,0 +1,7 @@
from transformers import LlamaConfig

from ..configuration_live import LiveConfigMixin

class LiveLlamaConfig(LlamaConfig, LiveConfigMixin):
    pass
models/live_llama/modeling_live_llama.py
ADDED
@@ -0,0 +1,154 @@
import torch
from torch import nn
from transformers import LlamaForCausalLM, Cache
from transformers.activations import GELUActivation
from transformers.utils import logging

from .configuration_live_llama import LiveLlamaConfig
from ..modeling_live import build_live, LiveMixin

logger = logging.get_logger(__name__)

class LiveLlamaForCausalLM(LlamaForCausalLM, LiveMixin):
    config_class = LiveLlamaConfig
    _keys_to_ignore_on_load_missing = ['vision_encoder', 'connector']

    def __init__(self, config: LiveLlamaConfig):
        super().__init__(config)
        self.connector = torch.nn.Sequential(
            torch.nn.Linear(config.vision_hidden_size, config.hidden_size, bias=True),
            GELUActivation(config.hidden_size),
            torch.nn.Linear(config.hidden_size, config.hidden_size, bias=True),
        )

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        frames: torch.FloatTensor = None,
        attention_mask: torch.Tensor = None,
        position_ids: torch.LongTensor = None,
        past_key_values: list[torch.FloatTensor] = None,
        inputs_embeds: torch.FloatTensor = None,
        labels: torch.LongTensor = None,
        use_cache: bool = None,
        output_attentions: bool = None,
        output_hidden_states: bool = None,
        return_dict: bool = None,
        cache_position: torch.LongTensor = None,
        **kwargs,
    ):
        if inputs_embeds is None:
            inputs_embeds = self.joint_embed(input_ids, frames)
        outputs = super().forward(
            attention_mask = attention_mask,
            position_ids = position_ids,
            past_key_values = past_key_values,
            inputs_embeds = inputs_embeds,
            # labels are intentionally not forwarded; the stream-weighted loss is computed below
            use_cache = use_cache,
            output_attentions = output_attentions,
            output_hidden_states = output_hidden_states,
            return_dict = return_dict,
            cache_position=cache_position,
        )

        loss = None
        if labels is not None:
            logits = outputs[0]
            v_mask = input_ids.flatten(0, 1) == self.config.v_placeholder_id
            weight = v_mask * self.config.stream_loss_weight + ~v_mask
            loss = nn.functional.cross_entropy(logits.flatten(0, 1), labels.flatten(), reduction='none') * weight
            loss = loss.sum() / (labels >= 0).sum()

        if not return_dict:
            return (loss,) + outputs[1:] if loss is not None else outputs

        outputs.loss = loss
        return outputs

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        use_cache=True,
        **kwargs,
    ):
        past_length = 0
        if past_key_values is not None:
            if isinstance(past_key_values, Cache):
                past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length()
                max_cache_length = (
                    torch.tensor(past_key_values.get_max_length(), device=input_ids.device)
                    if past_key_values.get_max_length() is not None
                    else None
                )
                cache_length = past_length if max_cache_length is None else torch.min(max_cache_length, past_length)
            # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects
            else:
                cache_length = past_length = past_key_values[0][0].shape[2]
                max_cache_length = None

            # Keep only the unprocessed tokens:
            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
            # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as input)
            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
            # input_ids based on the past_length.
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]
            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
            if (
                max_cache_length is not None
                and attention_mask is not None
                and cache_length + input_ids.shape[1] > max_cache_length
            ):
                attention_mask = attention_mask[:, -max_cache_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if past_key_values:
                position_ids = position_ids[:, past_length:] # NOTE

        # NOTE
        if inputs_embeds is not None and past_length < inputs_embeds.size(1):
            model_inputs = {"inputs_embeds": inputs_embeds[:, past_length:]}
        else:
            # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise
            # recompiles graphs as the stride of the inputs is a guard. Ref: https://github.com/huggingface/transformers/pull/29114
            # TODO: use `next_tokens` directly instead.
            model_inputs = {"input_ids": input_ids.contiguous()}

        input_length = position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1]
        if cache_position is None:
            cache_position = torch.arange(past_length, past_length + input_length, device=input_ids.device)
        elif use_cache:
            cache_position = cache_position[-input_length:]

        model_inputs.update(
            {
                "position_ids": position_ids,  # covers only the new inputs, starting from past_length
                "cache_position": cache_position,  # the positions not yet in the cache
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,  # length of cache + new inputs
            }
        )
        return model_inputs

def build_live_llama(**kwargs):
    return build_live(config_class=LiveLlamaConfig, model_class=LiveLlamaForCausalLM, **kwargs)

if __name__ == '__main__':
    from ..arguments_live import LiveOnePlusTrainingArguments
    print(LiveOnePlusTrainingArguments().to_dict())
    model, tokenizer = build_live_llama(is_training=True, **LiveOnePlusTrainingArguments().to_dict())
    print(model.config, tokenizer)
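To make the per-token weighting in forward above concrete, here is a small standalone sketch with toy tensors (the vocabulary size and placeholder id are hypothetical, not part of the uploaded files): stream tokens (v_placeholder_id) are scaled by stream_loss_weight, all other tokens keep weight 1, and ignored labels (-100) contribute zero loss.

import torch
from torch import nn

v_placeholder_id, stream_loss_weight = 128256, 1.0  # hypothetical values
logits = torch.randn(1, 4, 128257)                  # (batch, seq, vocab)
input_ids = torch.tensor([[1, v_placeholder_id, v_placeholder_id, 2]])
labels = torch.tensor([[5, -100, 6, 7]])            # -100 positions are ignored by cross_entropy

v_mask = input_ids.flatten(0, 1) == v_placeholder_id
weight = v_mask * stream_loss_weight + ~v_mask      # stream tokens weighted, text tokens keep 1
loss = nn.functional.cross_entropy(logits.flatten(0, 1), labels.flatten(), reduction='none') * weight
loss = loss.sum() / (labels >= 0).sum()
print(loss)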
models/modeling_live.py
ADDED
@@ -0,0 +1,222 @@
import torch, os
from peft import LoraConfig, get_peft_model, PeftModel
from transformers import AutoModelForCausalLM, Cache
from transformers.utils import logging

from .tokenization_live import build_live_tokenizer_and_update_config
from .vision_live import build_live_vision

logger = logging.get_logger(__name__)

class LiveMixin(AutoModelForCausalLM):
    def set_vision_inside(self):
        logger.warning_once("!!! Putting the vision encoder inside the model is only recommended for in-the-wild inference. "
            "Please do not call this for efficient training & evaluation; do visual feature pre-extraction instead.")
        self.vision_encoder, self.vision_encode = build_live_vision(self.config)

    def unset_vision_inside(self):
        del self.vision_encoder
        del self.vision_encode

    def visual_embed(self, frames: torch.Tensor):
        if hasattr(self, 'vision_encode'):
            with torch.cuda.amp.autocast():
                frames = self.vision_encode(self.vision_encoder, frames)
            frames = frames.to(self.dtype)
        frames = self.connector(frames)
        return frames.view(-1, frames.shape[-1])

    def joint_embed(
        self,
        input_ids: torch.Tensor = None,
        frames: torch.Tensor = None,
    ):
        if frames is None:
            return self.get_input_embeddings()(input_ids)
        if input_ids is None:
            return self.visual_embed(frames)
        inputs_embeds = self.get_input_embeddings()(input_ids.clamp(max=self.vocab_size-1))
        v_mask = input_ids == self.config.v_placeholder_id
        if v_mask.any():
            inputs_embeds[v_mask] = self.visual_embed(frames)
        return inputs_embeds

    @torch.no_grad()
    def stream_evaluate(
        self,
        input_ids: torch.LongTensor,
        labels: torch.LongTensor,
        frames: torch.ByteTensor,
        ignore_token_id: int = -100,
        frame_token_interval_threshold: float = 0.0,
        **kwargs
    ):
        # 0. evaluation only supports batch_size = 1
        assert input_ids.size(0) == labels.size(0) == 1
        input_id, label = input_ids[0], labels[0]
        device = input_id.device
        zero = torch.tensor(0, dtype=torch.int, device=device)
        one = torch.tensor(1, dtype=torch.int, device=device)

        # 1. prepare multi-turn start and stop
        turn_stops = ((input_id == self.config.eos_token_id).nonzero() + 1)[:,0].tolist()
        turn_starts = [0] + turn_stops[:-1]
        num_turns = len(turn_starts)

        # 2. forward the full input_ids and labels, get tokenwise logits and losses
        outputs = self.forward(input_ids=input_ids, frames=frames, return_dict=True, use_cache=True)
        logit, past_key_values = outputs.logits[0], outputs.past_key_values

        # 3. compute metrics for each turn
        v_placeholder_id = self.config.v_placeholder_id
        use_interval = self.config.frame_token_interval_id is not None
        frame_token_interval_id = self.config.frame_token_interval_id if use_interval else self.config.eos_token_id
        frame_num_tokens = self.config.frame_token_cls
        if self.config.frame_token_pooled:
            frame_num_tokens += self.config.frame_token_pooled[0] * self.config.frame_token_pooled[1]
        past_num_frames = 0
        lm_ppls, frame_diffs, fluencies, lm_correctness = [], [], [], []
        for r, (turn_start, turn_stop) in enumerate(zip(turn_starts, turn_stops)):
            ## 3.1. there are only two losses: the stream loss on frame tokens and the lm loss. prepare the corresponding mask for each
            turn_label = label[turn_start:turn_stop]
            turn_learn_mask = turn_label != ignore_token_id
            if not turn_learn_mask.any():
                continue
            turn_logit = logit[turn_start:turn_stop]
            turn_input_id = input_id[turn_start:turn_stop]
            turn_v_mask = turn_input_id == v_placeholder_id
            turn_num_frames = turn_v_mask.sum() // frame_num_tokens
            turn_stream_mask = turn_v_mask & turn_learn_mask
            turn_lm_mask = turn_learn_mask & ~turn_stream_mask

            ## 3.2 ppl, offline metric
            if turn_lm_mask.any():
                turn_lm_masked_logit, turn_lm_masked_label = turn_logit[turn_lm_mask], turn_label[turn_lm_mask]
                lm_ppl = torch.nn.functional.cross_entropy(turn_lm_masked_logit, turn_lm_masked_label).exp()
                lm_ppls.append(lm_ppl)
                turn_lm_masked_wrong_mask = turn_lm_masked_logit.argmax(dim=-1) != turn_lm_masked_label
                if turn_lm_masked_wrong_mask.any():
                    num_lm_correct_tokens = turn_lm_masked_wrong_mask.nonzero()[0,0]
                else:
                    num_lm_correct_tokens = (~turn_lm_masked_wrong_mask).sum()
                lm_correctness.append(num_lm_correct_tokens / turn_lm_masked_label.numel())

            ## 3.3. frame_diff (will be cast to time_diff in compute_metrics)
            if turn_stream_mask.any():
                ## 3.3.1: reply before (at) turn_num_frames
                turn_score = turn_logit.softmax(dim=-1)
                turn_stream_masked_score = turn_score[turn_stream_mask]
                if frame_token_interval_threshold > 0:
                    lower_threshold_mask = turn_stream_masked_score[:, frame_token_interval_id] < frame_token_interval_threshold
                    turn_stream_masked_score[lower_threshold_mask] = 0
                turn_stream_masked_pred_mask = turn_stream_masked_score.argmax(dim=-1) != frame_token_interval_id
                if turn_stream_masked_pred_mask.any():
                    frame_diff = turn_stream_mask.sum() - turn_stream_masked_pred_mask.nonzero()[0,0] - 1
                else:
                    ## 3.3.2: the most complex part, reply after turn_num_frames. we assume the 'Assistant: ...' text does not exist
                    turn_last_stream_idx = turn_stream_mask.nonzero()[-1,0]
                    past_key_values_before_assistant = self.trim_past_key_values(past_key_values, 0, turn_start + turn_last_stream_idx + 1)
                    if r == num_turns - 1: # no future frame. we assume the model receives a signal when streaming ends (e.g. a close button)
                        frame_diff = zero
                    else:
                        next_turn_num_frames = (input_id[turn_starts[r+1]:turn_stops[r+1]] == v_placeholder_id).sum() // frame_num_tokens
                        to_append_num_frames = min(next_turn_num_frames, turn_num_frames - 1) # avoid bias: keep the current turn as the center, with equally many frames on each side
                        if to_append_num_frames == 0:
                            frame_diff = zero
                        else:
                            to_append_frames = frames[past_num_frames+turn_num_frames:past_num_frames+turn_num_frames+to_append_num_frames]
                            frame_placeholder = [v_placeholder_id] * frame_num_tokens
                            if use_interval:
                                frame_placeholder = [frame_token_interval_id] + frame_placeholder
                            to_append_input_id = torch.tensor(frame_placeholder * to_append_num_frames, dtype=torch.long, device=device)
                            to_append_logit = self.forward(
                                input_ids=to_append_input_id[None],
                                past_key_values=past_key_values_before_assistant,
                                frames=to_append_frames,
                                return_dict=True, use_cache=True
                            ).logits[0]
                            # we only use the last idx of each frame
                            idxs = torch.arange(len(frame_placeholder)-1, len(to_append_input_id), len(frame_placeholder), device=device)
                            to_append_score = to_append_logit[idxs].softmax(dim=-1)
                            if frame_token_interval_threshold > 0:
                                lower_threshold_mask = to_append_score[:, frame_token_interval_id] < frame_token_interval_threshold
                                to_append_score[lower_threshold_mask] = 0
                            to_append_score_pred_mask = to_append_score.argmax(dim=-1) != frame_token_interval_id
                            if to_append_score_pred_mask.any():
                                frame_diff = -(to_append_score_pred_mask.nonzero()[0,0] + 1)
                            else:
                                frame_diff = -to_append_num_frames
                frame_diffs.append(frame_diff.abs())

            ## 3.4 fluency
            if turn_lm_mask.any() and turn_stream_mask.any():
                num_learn_v_tokens = turn_stream_mask.sum()
                num_learn_valid_tokens = turn_lm_masked_label.numel() + num_learn_v_tokens
                if frame_diff == 0:
                    fluency = (num_learn_v_tokens + num_lm_correct_tokens) / num_learn_valid_tokens
                elif frame_diff > 0:
                    fluency = (num_learn_v_tokens - frame_diff) / num_learn_valid_tokens
                else:
                    fluency = (num_learn_v_tokens - 1) / num_learn_valid_tokens
                fluencies.append(fluency)
            ## 3.5 next turn
            past_num_frames += turn_num_frames
        lm_ppl = torch.stack(lm_ppls).mean() if lm_ppls else one
        frame_diff = torch.stack(frame_diffs).float().mean() if frame_diffs else zero
        fluency = torch.stack(fluencies).float().mean() if fluencies else one
        lm_correctness = torch.stack(lm_correctness).float().mean() if lm_correctness else one
        return torch.stack([lm_ppl, frame_diff, fluency, lm_correctness])

    def trim_past_key_values(self, past_key_values, start, stop):
        return [[past_keys[:,:,start:stop], past_values[:,:,start:stop]] for past_keys, past_values in past_key_values]

def fast_greedy_generate(*, model: LiveMixin, inputs_embeds: torch.Tensor, past_key_values: Cache, eos_token_id: int, inplace_output_ids: torch.Tensor):
    for i in range(inplace_output_ids.size(1)):
        outputs = model(inputs_embeds=inputs_embeds, past_key_values=past_key_values, use_cache=True)
        past_key_values = outputs.past_key_values
        new_token_id = outputs.logits[:, -1:].argmax(dim=-1)
        inplace_output_ids[:, i] = new_token_id
        if new_token_id == eos_token_id:
            break
        inputs_embeds = model.get_input_embeddings()(new_token_id)
    return inplace_output_ids[:, :i+1], past_key_values

def build_live(
    *,
    is_training: bool,
    config_class: type,
    model_class: type,
    llm_pretrained: str = None,
    finetune_modules: list[str] = None,
    lora_modules: str = None,
    lora_r: int = None,
    lora_alpha: int = None,
    set_vision_inside: bool = False,
    resume_from_checkpoint: str = '',
    attn_implementation: str = 'flash_attention_2',
    torch_dtype: str | torch.dtype = 'auto',
    **kwargs
):
    model = model_class.from_pretrained(llm_pretrained, config=config_class.from_pretrained(llm_pretrained, **kwargs), torch_dtype=torch_dtype, attn_implementation=attn_implementation)
    tokenizer = build_live_tokenizer_and_update_config(llm_pretrained, model.config)
    if is_training:
        lora_config = LoraConfig(
            r=lora_r,
            lora_alpha=lora_alpha,
            target_modules=lora_modules,
            lora_dropout=0.05,
            task_type="CAUSAL_LM",
            modules_to_save=finetune_modules,
            inference_mode=False,
        )
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()
    else:
        if resume_from_checkpoint and os.path.exists(resume_from_checkpoint):
            model = PeftModel.from_pretrained(model, resume_from_checkpoint, is_trainable=False)
        else:
            logger.warning(f'!!! Failed to load checkpoint: {resume_from_checkpoint}. Returning a newly initialized model.')
        if set_vision_inside:
            model.set_vision_inside()
        model.requires_grad_(False)
    return model, tokenizer
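For context, a rough decoding sketch around fast_greedy_generate (hypothetical, not part of the uploaded files; it assumes `model`, `tokenizer`, `input_ids`, and pre-extracted `frames` already exist, e.g. from build_live and the tokenizer above):

import torch

inplace_output_ids = torch.full((1, 256), tokenizer.eos_token_id, dtype=torch.long, device=model.device)
inputs_embeds = model.joint_embed(input_ids, frames)  # prompt token ids interleaved with frame features
output_ids, past_key_values = fast_greedy_generate(
    model=model, inputs_embeds=inputs_embeds, past_key_values=None,  # no cached tokens yet
    eos_token_id=tokenizer.eos_token_id, inplace_output_ids=inplace_output_ids,
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))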
models/tokenization_live.py
ADDED
@@ -0,0 +1,153 @@
import torch
from transformers import AutoTokenizer
from functools import partial

from .configuration_live import LiveConfigMixin

def get_stream_placeholder_len(num_frames: int, model_config: LiveConfigMixin) -> int:
    return num_frames * model_config.frame_num_tokens * len(model_config.v_placeholder) + len(model_config.frame_token_interval) * (num_frames - 1)

def get_stream_placeholder_jinja2(model_config: LiveConfigMixin) -> str:
    return f"'{model_config.frame_token_interval}'.join([{model_config.frame_num_tokens} * '{model_config.v_placeholder}'] * message['num_frames'])"

def get_stream_learn_ranges(num_frames: int, model_config: LiveConfigMixin) -> torch.Tensor:
    len_frame_placeholder_with_interval = model_config.frame_num_tokens * len(model_config.v_placeholder) + len(model_config.frame_token_interval)
    intermediate_interval_idxs = torch.arange(
        len_frame_placeholder_with_interval,
        len_frame_placeholder_with_interval * num_frames + 1,
        len_frame_placeholder_with_interval
    ) - len(model_config.frame_token_interval)
    len_learn = len(model_config.frame_token_interval) if model_config.frame_token_interval else len(model_config.v_placeholder)
    learn_ranges = torch.stack([
        intermediate_interval_idxs,
        intermediate_interval_idxs + len_learn
    ], dim=1)
    return learn_ranges

def chat_template(self, stream_placeholder_jinja2: str):
    """
    system prompt
    [<v>,<v>,<v>]
    User: ...
    Assistant: ...</s>
    [<v>,<v>]
    Assistant: ...</s>
    User: ...
    Assistant: ...</s>
    """
    template = (
        "{% if messages[0]['role'] == 'system' %}"
        "{{ bos_token + messages[0]['content'] + '\n' }}" # system
        "{% set messages = messages[1:] %}"
        "{% endif %}"
        "{% for message in messages %}"
        "{% if message['role'] == 'user' %}"
        "{% if add_stream_query_prompt %}"
        "{{ ']\nUser: ' + message['content'] }}"
        "{% else %}"
        "{{ '\nUser: ' + message['content'] }}"
        "{% endif %}"
        "{% elif message['role'] == 'assistant' %}"
        "{{ '\nAssistant: ' + message['content'] + eos_token }}"
        "{% elif message['role'] == 'stream' and message['num_frames'] > 0 %}"
        "{{ '\n[' + STREAM_PLACEHOLDER + ']' }}"
        "{% endif %}"
        "{% endfor %}"
        "{% if add_generation_prompt %}"
        "{{ '\nAssistant:' }}"
        "{% elif add_stream_prompt %}"
        "{{ '\n[' }}"
        "{% elif add_stream_generation_prompt %}"
        "{{ ']\nAssistant:' }}"
        "{% endif %}"
    )
    template = template.replace('STREAM_PLACEHOLDER', stream_placeholder_jinja2)
    return template

def chat_template_transition(tokenizer):
    return {
        (None, 'system'): tokenizer.bos_token,
        ('system', 'user'): '\n\nUser: ',
        ('system', 'stream'): '\n\n[',
        ('user', 'assistant'): '\nAssistant: ',
        ('user', 'stream'): '\n[',
        ('user', 'user'): '\nUser: ',
        ('assistant', 'user'): f'{tokenizer.eos_token}\nUser: ',
        ('assistant', 'stream'): f'{tokenizer.eos_token}\n[',
        ('stream', 'user'): ']\nUser: ',
        ('stream', 'assistant'): ']\nAssistant: ',
        'assistant': 'Assistant: ',
        'eos_token': tokenizer.eos_token,
    }

def chat_template_offsets(tokenizer):
    return {k:len(v) for k, v in chat_template_transition(tokenizer).items()}

def get_learn_ranges(conversation: list[dict], *, chat_template_offsets: dict[tuple, int], model_config: LiveConfigMixin):
    offset = 0
    learn_ranges = []
    last_role = None
    for message in conversation:
        role = message['role']
        offset += chat_template_offsets[(last_role, role)]
        last_role = role
        if role == 'stream':
            if message.get('learn', False):
                ranges = get_stream_learn_ranges(message['num_frames'], model_config) + offset
                # the last range is followed by ']\n', so also count the '\n'
                ranges[-1, 1] += 1
                if not isinstance(message['learn'], bool):
                    ranges = ranges[:message['learn']]
                learn_ranges.extend([range(r[0], r[1]) for r in ranges])
            offset += get_stream_placeholder_len(message['num_frames'], model_config)
        else:
            if role == 'assistant':
                if message.get('learn', False):
                    learn_ranges.append(range(offset - chat_template_offsets['assistant'], offset + len(message['content']) + chat_template_offsets['eos_token']))
            offset += len(message['content'])
    return learn_ranges

def build_live_tokenizer_and_update_config(llm_pretrained: str, model_config: LiveConfigMixin) -> AutoTokenizer:
    tokenizer = AutoTokenizer.from_pretrained(llm_pretrained, use_fast=True, padding_side='left')
    tokenizer.add_special_tokens({'additional_special_tokens': [model_config.v_placeholder]})
    v_placeholder_id = len(tokenizer) - 1
    if model_config.frame_token_interval:
        frame_token_interval_id = tokenizer.convert_tokens_to_ids(model_config.frame_token_interval)
    else:
        frame_token_interval_id = None
    tokenizer.pad_token = tokenizer.eos_token
    model_config.update(dict(v_placeholder_id=v_placeholder_id, frame_token_interval_id=frame_token_interval_id, eos_token_id=tokenizer.eos_token_id))
    tokenizer.chat_template = chat_template(tokenizer, get_stream_placeholder_jinja2(model_config))
    tokenizer.get_learn_ranges = partial(get_learn_ranges, chat_template_offsets=chat_template_offsets(tokenizer), model_config=model_config)
    return tokenizer

if __name__ == '__main__':
    config = LiveConfigMixin(frame_token_interval=',', frame_token_cls=True, frame_token_pooled=[3,3], frame_num_tokens=10)
    tokenizer = build_live_tokenizer_and_update_config('meta-llama/Meta-Llama-3-8B-Instruct', config)
    chat = [
        {'role': 'system', 'content': 'cool.'},
        {'role': 'stream', 'num_frames': 2, 'learn': 1},
        {'role': 'user', 'content': 'cool?'},
        {'role': 'assistant', 'content': 'cool.', 'learn': True},
        {'role': 'stream', 'num_frames': 3, 'learn': 3},
        {'role': 'assistant', 'content': 'so cool.', 'learn': True},
    ]
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=False)
    learn_ranges = tokenizer.get_learn_ranges(chat)
    batch = tokenizer([prompt], return_offsets_mapping=True, add_special_tokens=False, return_tensors="pt", padding=True)
    batch_labels = torch.full_like(batch.input_ids, -100, dtype=torch.long)
    for text, labels, input_ids, offset_mapping, learn_range in zip(
        [prompt], batch_labels, batch.input_ids, batch.offset_mapping, [learn_ranges]
    ):
        for learn_r in learn_range:
            start = torch.nonzero(offset_mapping[:,0] == learn_r.start).item()
            if offset_mapping[:,0][-1] >= learn_r.stop:
                stop = torch.nonzero(offset_mapping[:,0] == learn_r.stop).item()
            else: # the last eos token
                stop = len(input_ids)
            labels[start-1:stop-1] = input_ids[start:stop]
        # NOTE: some label positions hold the v_placeholder_id (the token appended at the end of the tokenizer),
        # which lies outside the original vocabulary; replace those targets with the eos token.
        labels[labels >= len(tokenizer) - 1] = tokenizer.eos_token_id
    print(batch.input_ids)
    print(batch_labels)
models/vision_live.py
ADDED
@@ -0,0 +1,61 @@
import math, torch
from functools import partial
from torch import nn, Tensor
from torchvision.transforms.functional import normalize
from transformers import AutoModel
from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD

from .configuration_live import LiveConfigMixin

def _siglip_vision_encode(vision_model: nn.Module, frames: Tensor, frame_token_cls: bool, frame_token_pooled: tuple,
    mean=[0.5,0.5,0.5], std=[0.5,0.5,0.5], rescale_factor=0.00392156862745098, **kwargs):
    frames = normalize(frames * rescale_factor, mean=mean, std=std)
    with torch.cuda.amp.autocast():
        vision_outputs = vision_model(frames)
        last_hidden_state = vision_outputs.last_hidden_state
    if frame_token_pooled:
        s = int(math.sqrt(last_hidden_state.shape[1]))
        spatial_tokens = torch.nn.functional.adaptive_avg_pool2d(
            last_hidden_state.reshape(
                last_hidden_state.shape[0], s, s, last_hidden_state.shape[-1]
            ).permute(0, 3, 1, 2),
            frame_token_pooled
        ).flatten(2, 3).permute(0, 2, 1)
        if not frame_token_cls:
            return spatial_tokens
    if frame_token_cls:
        cls_token = vision_outputs.pooler_output[:, None]
        if not frame_token_pooled:
            return cls_token
    return torch.cat([cls_token, spatial_tokens], dim=1)

def _clip_vision_encode(vision_model: nn.Module, frames: Tensor, frame_token_cls: bool, frame_token_pooled: tuple,
    mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, rescale_factor=0.00392156862745098, **kwargs):
    frames = normalize(frames * rescale_factor, mean=mean, std=std)
    with torch.cuda.amp.autocast():
        vision_outputs = vision_model(frames)
        last_hidden_state = vision_outputs.last_hidden_state
    if frame_token_pooled:
        s = int(math.sqrt(last_hidden_state.shape[1]))
        spatial_tokens = torch.nn.functional.adaptive_avg_pool2d(
            last_hidden_state[:,1:].reshape(
                last_hidden_state.shape[0], s, s, last_hidden_state.shape[-1]
            ).permute(0, 3, 1, 2),
            frame_token_pooled
        ).flatten(2, 3).permute(0, 2, 1)
        if not frame_token_cls:
            return spatial_tokens
    if frame_token_cls:
        cls_token = last_hidden_state[:, :1] # keep shape (batch, 1, hidden) so it can be concatenated with the pooled tokens
        if not frame_token_pooled:
            return cls_token
    return torch.cat([cls_token, spatial_tokens], dim=1)

def build_live_vision(config: LiveConfigMixin):
    model = AutoModel.from_pretrained(config.vision_pretrained).vision_model
    if 'google/siglip-large-patch16-384' == config.vision_pretrained:
        return model, partial(_siglip_vision_encode, frame_token_cls=config.frame_token_cls, frame_token_pooled=config.frame_token_pooled)
    elif 'laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90k' == config.vision_pretrained or 'openai/clip-vit-large-patch14-336' == config.vision_pretrained:
        # mirror the siglip branch: bind the pooling options, leaving (vision_model, frames) to the caller
        return model, partial(_clip_vision_encode, frame_token_cls=config.frame_token_cls, frame_token_pooled=config.frame_token_pooled)
    else:
        raise ValueError(f'Unverified vision_pretrained: {config.vision_pretrained}')
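A hypothetical feature pre-extraction sketch using build_live_vision (not part of the uploaded files; the frame tensor layout and dtype handling are assumptions):

import torch

config = LiveConfigMixin(vision_pretrained='google/siglip-large-patch16-384',
    frame_token_cls=True, frame_token_pooled=[3, 3], frame_num_tokens=10)
vision_model, vision_encode = build_live_vision(config)
frames = torch.randint(0, 256, (4, 3, 384, 384)).float()  # 4 RGB frames in 0..255
with torch.no_grad():
    features = vision_encode(vision_model, frames)  # expected (4, 10, 1024): 1 CLS token + 3x3 pooled tokens
print(features.shape)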