Upload 2 files
- hf_model.py +297 -0
- lm_harness.py +112 -0
hf_model.py
ADDED
@@ -0,0 +1,297 @@
import torch
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
from transformers.modeling_utils import PreTrainedModel
from .hf_config import HFConfig
import torch.nn as nn
from lit_gpt.model import Block, MBlock
try:
    from mamba_ssm.ops.triton.layernorm import RMSNorm, layer_norm_fn, rms_norm_fn
except ImportError:
    RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None
from typing import Any, Literal, Optional, Type, Union, List, Tuple
from lit_gpt.config import Config

RoPECache = Tuple[torch.Tensor, torch.Tensor]
KVCache = Tuple[torch.Tensor, torch.Tensor]


class HF_GPTPreTrainedModel(PreTrainedModel):
    config_class = HFConfig
    supports_gradient_checkpointing = True
    _no_split_modules = ["Block"]

    def __init__(self, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)


class HF_GPTModel(HF_GPTPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.lit_config = Config.from_name(config.name)
        config = self.lit_config
        self.embeddings = nn.Embedding(config.vocab_size, config.n_embd)
        self.h = nn.ModuleList([Block(config, i) for i in range(config.n_layer)])
        self.ln_f = config.norm_class(config.n_embd, eps=config.norm_eps)

        self.rope_cache: Optional[RoPECache] = None
        self.mask_cache: Optional[torch.Tensor] = None
        self.kv_caches: List[KVCache] = []
        self.max_len = self.lit_config.block_size
        self.mamba_init = self.lit_config.mamba or self.lit_config.mamba_init

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = True,
        max_seq_length: Optional[int] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        idx = input_ids
        input_pos = position_ids

        assert inputs_embeds is None

        if self.lit_config.mamba:
            hidden_states = self.embeddings(idx)
            residual = None
            for block in self.h:
                hidden_states, residual = block(
                    hidden_states, residual, inference_params=None
                )
            norm_f = self.ln_f
            if not self.lit_config.fused_add_norm:
                residual = (hidden_states + residual) if residual is not None else hidden_states
                hidden_states = norm_f(residual.to(dtype=norm_f.weight.dtype))
            else:
                # Set prenorm=False here since we don't need the residual
                fused_add_norm_fn = rms_norm_fn if isinstance(norm_f, RMSNorm) else layer_norm_fn
                hidden_states = fused_add_norm_fn(
                    hidden_states,
                    norm_f.weight,
                    norm_f.bias,
                    eps=norm_f.eps,
                    residual=residual,
                    prenorm=False,
                    residual_in_fp32=self.lit_config.residual_in_fp32,
                )

        else:
            B, T = idx.size()
            use_kv_cache = input_pos is not None
            block_size = self.lit_config.block_size
            if max_seq_length is None:
                max_seq_length = block_size
            if use_kv_cache:  # not relevant otherwise
                assert (
                    max_seq_length >= T
                ), f"Cannot forward sequence of length {T}, max seq length is only {max_seq_length}"
            # assert max_seq_length <= block_size, f"Cannot attend to {max_seq_length}, block size is only {block_size}"
            # assert block_size >= T, f"Cannot forward sequence of length {T}, block size is only {block_size}"
            if not self.lit_config.nope:
                if self.rope_cache is None:
                    self.rope_cache = self.build_rope_cache(idx, self.max_len)
                elif T > self.max_len:
                    self.max_len = T
                    self.rope_cache = self.build_rope_cache(idx, self.max_len)
                cos, sin = self.rope_cache
            # passing `attn_mask` to SDPA downgrades it to use the inefficient implementation. since we only need the mask
            # for the kv-cache support (only during inference), we only create it in that situation
            # this will be resolved by https://github.com/pytorch/pytorch/issues/96099
            if use_kv_cache and self.mask_cache is None:
                self.mask_cache = self.build_mask_cache(idx)
            if use_kv_cache:
                if not self.lit_config.nope:
                    cos = cos.index_select(0, input_pos)
                    sin = sin.index_select(0, input_pos)
                mask = self.mask_cache.index_select(2, input_pos)
                mask = mask[:, :, :, :max_seq_length]
            else:
                if not self.lit_config.nope:
                    cos = cos[:T]
                    sin = sin[:T]
                mask = None
            if self.lit_config.nope:
                rope = None
            else:
                rope = (cos, sin)
            # forward the model itself
            x = self.embeddings(idx)  # token embeddings of shape (b, t, n_embd)
            if not use_kv_cache:
                for block in self.h:
                    x, *_ = block(x, rope, max_seq_length)
            else:
                if self.lit_config.nope:
                    self.kv_caches = self.kv_caches or self.build_kv_caches(x, max_seq_length, None)
                else:
                    self.kv_caches = self.kv_caches or self.build_kv_caches(x, max_seq_length, cos.size(-1) * 2)
                for i, block in enumerate(self.h):
                    x, self.kv_caches[i] = block(x, rope, max_seq_length, mask, input_pos, self.kv_caches[i])

            hidden_states = self.ln_f(x)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=None,
            hidden_states=hidden_states,
            attentions=None,
        )

    def build_rope_cache(self, idx: torch.Tensor, seq_len: int) -> RoPECache:
        return build_rope_cache(
            seq_len=seq_len,
            n_elem=int(self.lit_config.rotary_percentage * self.lit_config.head_size),
            dtype=torch.float32,
            device=idx.device,
            condense_ratio=self.lit_config.condense_ratio,
        )

    def build_mask_cache(self, idx: torch.Tensor) -> torch.Tensor:
        ones = torch.ones((self.lit_config.block_size, self.lit_config.block_size), device=idx.device, dtype=torch.bool)
        return torch.tril(ones).unsqueeze(0).unsqueeze(0)

    def build_kv_caches(self, idx: torch.Tensor, max_seq_length: int, rope_cache_length: int) -> List[KVCache]:
        B = idx.size(0)
        heads = 1 if self.lit_config.n_query_groups == 1 else self.lit_config.n_query_groups
        if rope_cache_length is not None:
            k_cache_shape = (
                B,
                max_seq_length,
                heads,
                rope_cache_length + self.lit_config.head_size - int(self.lit_config.rotary_percentage * self.lit_config.head_size),
            )
        else:
            k_cache_shape = (
                B,
                max_seq_length,
                heads,
                self.lit_config.head_size,
            )
        v_cache_shape = (B, max_seq_length, heads, self.lit_config.head_size)
        device = idx.device
        return [
            (torch.zeros(k_cache_shape, device=device), torch.zeros(v_cache_shape, device=device))
            for _ in range(self.lit_config.n_layer)
        ]


class HF_GPTForCausalLM(HF_GPTPreTrainedModel):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.lit_config = Config.from_name(config.name)
        self.config = config
        self.transformer = HF_GPTModel(config)
        config = self.lit_config
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.n_layer = config.n_layer
        self.post_init()

    def get_input_embeddings(self):
        return self.transformer.embeddings

    def set_input_embeddings(self, value):
        self.transformer.embeddings = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs):
        inputs = {"input_ids": input_ids}
        if past is not None:
            inputs["past_key_values"] = past
        return inputs

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        transformer_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        if not return_dict:
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

    def _reorder_cache(self, past, beam_idx):
        return tuple(
            tuple(past_state.index_select(0, beam_idx) for past_state in layer_past)
            for layer_past in past
        )


def build_rope_cache(
    seq_len: int, n_elem: int, dtype: torch.dtype, device: torch.device, base: int = 10000, condense_ratio: int = 1
) -> RoPECache:
    """Enhanced Transformer with Rotary Position Embedding.

    Derived from: https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/labml_nn/
    transformers/rope/__init__.py. MIT License:
    https://github.com/labmlai/annotated_deep_learning_paper_implementations/blob/master/license.
    """
    # $\Theta = {\theta_i = 10000^{\frac{2(i-1)}{d}}, i \in [1, 2, ..., \frac{d}{2}]}$
    theta = 1.0 / (base ** (torch.arange(0, n_elem, 2, device=device) / n_elem))

    # Create position indexes `[0, 1, ..., seq_len - 1]`
    seq_idx = torch.arange(seq_len, device=device) / condense_ratio

    # Calculate the product of position index and $\theta_i$
    idx_theta = torch.outer(seq_idx, theta)

    cos, sin = torch.cos(idx_theta), torch.sin(idx_theta)

    # added by peiyuan to ensure same data type with q, k, to use fused rotary embedding
    if dtype == torch.bfloat16:
        return cos.bfloat16(), sin.bfloat16()
    # this is to mimic the behaviour of complex32, else we will get different results
    if dtype in (torch.float16, torch.bfloat16, torch.int8):
        return cos.half(), sin.half()
    return cos, sin
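
For reference, the classes above can be smoke-tested directly, without going through the conversion script below. This is a minimal sketch and not part of the upload: it assumes the hf_gpt package layout used by lm_harness.py and that the name "Samba_421M" (the default model_name below) resolves in the lit_gpt Config registry.

import torch

from hf_gpt.hf_config import HFConfig
from hf_gpt.hf_model import HF_GPTForCausalLM

# Build a randomly initialized model from the lit_gpt config name (assumed to exist).
config = HFConfig(name="Samba_421M")
model = HF_GPTForCausalLM(config)

# Tiny forward pass; passing labels exercises the shifted cross-entropy loss.
input_ids = torch.randint(0, model.lit_config.vocab_size, (1, 16))
out = model(input_ids=input_ids, labels=input_ids)
print(out.logits.shape)  # (1, 16, vocab_size)
print(out.loss)          # scalar language-modeling loss
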
lm_harness.py
ADDED
@@ -0,0 +1,112 @@
# -*- coding: utf-8 -*-

from __future__ import annotations

import fla  # noqa
from lm_eval.__main__ import cli_evaluate
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from hf_gpt.hf_model import HF_GPTForCausalLM
from hf_gpt.hf_config import HFConfig
import requests
import wandb
import lm_eval
# from lm_eval.loggers import WandbLogger
import argparse
AutoConfig.register("hf_gpt", HFConfig)
AutoModelForCausalLM.register(HFConfig, HF_GPTForCausalLM)
import logging
logging.basicConfig(level=logging.INFO)
import torch
import os
import pdb

os.environ['HF_HOME'] = '/lustre/fs8/portfolios/nvr/users/ahatamizadeh/hf_cache/'


def is_directory_non_empty(directory):
    if not os.path.isdir(directory):
        return "The provided path is not a directory."

    return len(os.listdir(directory)) > 0


def main(args):
    ### First convert to Huggingface models when necessary
    import datasets

    datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
    hf_save_dir = args.hf_save_dir or os.path.dirname(args.ckpt_path)
    # for example: /lustre/fsw/portfolios/nvr/users/soyang/code/next_gen_llm-1/checkpoint/outputs/tsz512x4k_20B_Samba_421M_tsz512x4k_20B_Samba_421M_sy_stream_v11/iter-009198-ckpt.pth
    ckpt = torch.load(args.ckpt_path)
    print("Checkpoint loaded")
    hf_config = HFConfig(name=args.model_name)
    hf_model = HF_GPTForCausalLM(hf_config)
    model_weight = ckpt['model']
    new_weight = {}
    for k, v in model_weight.items():
        if 'wte' in k:
            new_weight[k.replace("wte", "embeddings")] = v
        elif 'beta_proj' in k:
            new_weight[k.replace("beta_proj", "b_proj")] = v
        elif 'bias_proj' in k:
            new_weight[k.replace("bias_proj", "b_proj")] = v
        else:
            new_weight[k] = v

    hf_model.load_state_dict(new_weight)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, trust_remote_code=True)
    hf_model.save_pretrained(hf_save_dir)
    tokenizer.save_pretrained(hf_save_dir)
    print("Huggingface model saved")

    ### Then call lm_eval
    tasks = args.tasks.split(',')
    assert hf_save_dir is not None
    assert args.dtype in ['bfloat16', 'float32']
    # wandb_logger = wandb.init(project="llm_next_gen", name=args.exp_name, id=args.exp_name, group=args.wandb_group_name)  # or empty if wandb.init(...) already called before
    print("Start lm eval....")

    results = lm_eval.simple_evaluate(
        model="hf",
        model_args=f"pretrained={hf_save_dir},trust_remote_code=True,dtype={args.dtype}",
        tasks=tasks,
        device="cuda",
        log_samples=False,
        batch_size=1,
        num_fewshot=args.num_fewshot,
    )['results']

    # swde and squad_completion are not in the default task list, so only report them when they were evaluated
    if 'swde' in results:
        print('swde: {}'.format(results['swde']['contains,none']))
    if 'squad_completion' in results:
        print('squad_completion: {}'.format(results['squad_completion']['contains,none']))
    # print('mmlu: {}'.format(results['mmlu']['acc,none']))
    print('piqa: {}'.format(results['piqa']['acc,none']))
    print('hellaswag: {}'.format(results['hellaswag']['acc_norm,none']))
    print('winogrande: {}'.format(results['winogrande']['acc,none']))
    print('arc_easy: {}'.format(results['arc_easy']['acc,none']))
    print('arc_challenge: {}'.format(results['arc_challenge']['acc_norm,none']))
    print('wikitext, ppl: {}'.format(results['wikitext']['word_perplexity,none']))
    print('lambada_openai, acc: {}'.format(results['lambada_openai']['acc,none']))
    print('lambada_openai, ppl: {}'.format(results['lambada_openai']['perplexity,none']))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='LM evaluation harness for converted checkpoints')
    parser.add_argument('--ckpt_path', type=str, default=None, help='Path to the ckpt directory')
    parser.add_argument('--hf_save_dir', type=str, default=None, help='(Optional) Path to the saved HF model directory')
    parser.add_argument('--dtype', type=str, default='bfloat16', help='Data type to use for inference')
    parser.add_argument('--model_name', type=str, default='Samba_421M', help='Model name')
    parser.add_argument('--exp_name', type=str, default='hf_eval', help='Experiment name')
    parser.add_argument('--wandb_dir', type=str, default='/lustre/fsw/portfolios/nvr/users/soyang/code/next_gen_llm-1/checkpoint/outputs', help='Wandb directory')
    parser.add_argument('--wandb_group_name', type=str, default='lm-eval-harness', help='Wandb group name')
    parser.add_argument('--tasks', type=str, default='wikitext,lambada_openai,piqa,hellaswag,winogrande,arc_easy,arc_challenge,mmlu', help='Tasks to evaluate')
    parser.add_argument('--tokenizer_name', type=str, default="TinyLlama/TinyLlama_v1.1", help="tokenizer name or path")
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--num_fewshot', type=int, default=0)
    # do convert or not
    parser.add_argument('--skip_convert', action='store_true', help='Whether to skip converting to a Huggingface model')

    args = parser.parse_args()
    main(args)
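
Once main() has written the converted model and tokenizer to hf_save_dir, that directory can be reloaded through the registered auto classes for a quick sanity check before (or instead of) running the harness. This is a minimal illustrative sketch, not part of the upload; the hf_save_dir path is a placeholder.

from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

from hf_gpt.hf_config import HFConfig
from hf_gpt.hf_model import HF_GPTForCausalLM

# Re-register the custom architecture in a fresh process, mirroring the calls at the top of lm_harness.py.
AutoConfig.register("hf_gpt", HFConfig)
AutoModelForCausalLM.register(HFConfig, HF_GPTForCausalLM)

hf_save_dir = "/path/to/hf_save_dir"  # placeholder: the directory written by save_pretrained() in main()
tokenizer = AutoTokenizer.from_pretrained(hf_save_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(hf_save_dir, trust_remote_code=True)

# Plain forward pass on a short prompt; logits should have shape (1, prompt_len, vocab_size).
inputs = tokenizer("The capital of France is", return_tensors="pt")
logits = model(input_ids=inputs["input_ids"]).logits
print(logits.shape)
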