In [90]:
# @title # ðŸŒŠ AutoBitnet

# @markdown ---

# @markdown ### âœ¨ Model Parameters

MODEL_CONFIG = "NousResearch/Nous-Hermes-llama-2-7b" # @param {type:"string"}
HEADS = 6 # @param {type: "number"}
DIMENSIONS = 768 # @param {type: "number"}
LAYERS = 6 # @param {type: "number"}
INTERMEDIATE_SIZE= 1024 # @param {type: "number"}
CONTEXT_LENGTH = 256 # @param {type: "number"}
HUGGINGFACE_ID = "saadnaeem" # @param {type:"string"}
NEW_MODEL = "Llama2-70M-Cosmopedia-100k-Pretrained" # @param {type:"string"}
WANDB_TOKEN=''
HF_TOKEN=''

# @markdown ---

# @markdown ### ðŸ’¥ Training Parameters

DATASET = "abideen/Cosmopedia-100k-pretrain" # @param {type:"string"}
BATCH_SIZE = 32 # @param {type:"number"}
LEARNING_RATE = 1.5e-4 # @param {type:"number"}
EPOCHS = 1 # @param {type:"number"}
!pip install datasets wandb accelerate
from torch import nn
from transformers.models.llama.modeling_llama import *
from transformers import (AutoTokenizer, AutoConfig, LlamaForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments, AutoModel)
from datasets import load_dataset
from huggingface_hub import login
import wandb
# wandb.ai/saadnaeem-dev

from huggingface_hub import create_repo, HfApi

def activation_quant(x):
    scale = 127.0 / x.abs().max(dim=-1, keepdim=True).values.clamp_(min=1e-5)
    y = (x * scale).round().clamp_(-128, 127) / scale
    return y
def weight_quant(w):
    scale = 1.0 / w.abs().mean().clamp_(min=1e-5)
    u = (w * scale).round().clamp_(-1, 1) / scale
    return u

class BitLinear(nn.Linear):
    def forward(self, x):
        w = self.weight # a weight tensor with shape [d, k]
        x = x.to(w.device)
        RMSNorm = LlamaRMSNorm(x.shape[-1]).to(w.device)
        x_norm = RMSNorm(x)
        # A trick for implementing Straightâˆ’Throughâˆ’Estimator (STE) using detach()
        x_quant = x_norm + (activation_quant(x_norm) - x_norm).detach()
        w_quant = w + (weight_quant(w) - w).detach()
        y = F.linear(x_quant, w_quant)
        return y

def convert_to_bitnet(model, copy_weights):
    for name, module in model.named_modules():
        # Replace linear layers with BitNet
        if isinstance(module, LlamaSdpaAttention) or isinstance(module, LlamaMLP):
            for child_name, child_module in module.named_children():
                if isinstance(child_module, nn.Linear):
                    bitlinear = BitLinear(child_module.in_features, child_module.out_features, child_module.bias is not None).to(device="cuda:0")
                    if copy_weights:
                        bitlinear.weight = child_module.weight
                        if child_module.bias is not None:
                            bitlinear.bias = child_module.bias
                    setattr(module, child_name, bitlinear)
        # Remove redundant input_layernorms
        elif isinstance(module, LlamaDecoderLayer):
            for child_name, child_module in module.named_children():
                if isinstance(child_module, LlamaRMSNorm) and child_name == "input_layernorm":
                    setattr(module, child_name, nn.Identity().to(device="cuda:0"))


wandb.login(key=WANDB_TOKEN)
login(token=HF_TOKEN)
data = load_dataset(DATASET)






Token is valid (permission: write).
Your token has been saved to C:\Users\saad.naeem\.cache\huggingface\token
Login successful


In [80]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CONFIG)

def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=False,
        max_length=CONTEXT_LENGTH,
        return_overflowing_tokens=True,
        return_length=True,
    )
    # Combine all tokens
    combined = []
    for tokenized_doc in outputs['input_ids']:
        combined += tokenized_doc + [tokenizer.eos_token_id]
    # Chunk
    input_batch = []
    for i in range(0, len(combined) - CONTEXT_LENGTH, CONTEXT_LENGTH):
        input_batch.append(combined[i:i+CONTEXT_LENGTH])
    return {"input_ids": input_batch}



tokenized_data = data.map(
    tokenize, batched=True, remove_columns=data["train"].column_names,
)

In [81]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 476702
    })
})

In [82]:
from datasets import DatasetDict

# Set the number of rows
tokenized_data['train'].set_format(type='pandas')

In [83]:
sampled_dataset = tokenized_data['train'].select(range(1000))
sampled_dataset_dict = DatasetDict({
    'train': sampled_dataset
})

In [85]:
sampled_dataset_dict['train'][0]

Unnamed: 0,input_ids
0,"[1, 2266, 338, 385, 6597, 515, 263, 24499, 299..."


In [86]:
tokenized_data = sampled_dataset_dict
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 1000
    })
})

In [87]:
total_tokens = tokenized_data['train'].num_rows * CONTEXT_LENGTH
print(f"Training on {total_tokens:_} tokens")

config = AutoConfig.from_pretrained(
    MODEL_CONFIG,
    vocab_size=len(tokenizer),
    n_ctx=CONTEXT_LENGTH,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

config.hidden_size = DIMENSIONS
config.max_position_embeddings = DIMENSIONS
config.num_attention_heads = HEADS
config.num_hidden_layers = LAYERS
config.num_key_value_heads = HEADS
config.intermediate_size = INTERMEDIATE_SIZE

### Create the llama model with our custom config. Convert it to bitnet.
model = LlamaForCausalLM(config)
convert_to_bitnet(model, copy_weights=False)
model_size = sum(t.numel() for t in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

output_path = "./Llama2-70M-Cosmopedia-100k-Pretrained"
args = TrainingArguments(
    output_dir=output_path,
    per_device_train_batch_size=BATCH_SIZE,
    logging_steps=100,
    gradient_accumulation_steps=2,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    warmup_steps=0.1,
    lr_scheduler_type="cosine",
    learning_rate=LEARNING_RATE,
    # max_steps=5000,
    save_steps=0.25,
    fp16=True,
    report_to="wandb"
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_data["train"],
)

Training on 256_000 tokens
Model size: 77.5M parameters


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [88]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=16, training_loss=9.391032218933105, metrics={'train_runtime': 110.6973, 'train_samples_per_second': 9.034, 'train_steps_per_second': 0.145, 'total_flos': 81244717056000.0, 'train_loss': 9.391032218933105, 'epoch': 1.0})

In [None]:
trainer.save_model(f"{output_path}")
folder = f"{output_path}"
api = HfApi()
create_repo(
    repo_id = f"{HUGGINGFACE_ID}/{NEW_MODEL}",
    repo_type="model",
    exist_ok=True,
    token=HF_TOKEN,
)

api.upload_folder(
    folder_path=folder,
    repo_type="model",
    repo_id=f"{HUGGINGFACE_ID}/{NEW_MODEL}",
    token=HF_TOKEN,
)

Upload 9 LFS files:   0%|          | 0/9 [00:00<?, ?it/s]

optimizer.pt:   0%|          | 0.00/620M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/620M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/310M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/310M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/310M [00:00<?, ?B/s]

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.models.llama.modeling_llama import *
# Load a pretrained BitNet model
model = "saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained"
tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForCausalLM.from_pretrained(model)


def activation_quant(x):
    scale = 127.0 / x.abs().max(dim=-1, keepdim=True).values.clamp_(min=1e-5)
    y = (x * scale).round().clamp_(-128, 127)
    y = y / scale
    return y
def weight_quant(w):
    scale = 1.0 / w.abs().mean().clamp_(min=1e-5)
    u = (w * scale).round().clamp_(-1, 1)
    u = u / scale
    return u

class BitLinear(nn.Linear):
    def forward(self, x):
        w = self.weight # a weight tensor with shape [d, k]
        x = x.to(w.device)
        RMSNorm = LlamaRMSNorm(x.shape[-1]).to(w.device)
        x_norm = RMSNorm(x)
        # A trick for implementing Straightâˆ’Throughâˆ’Estimator (STE) using detach()
        x_quant = x_norm + (activation_quant(x_norm) - x_norm).detach()
        w_quant = w + (weight_quant(w) - w).detach()
        y = F.linear(x_quant, w_quant)
        return y

def convert_to_bitnet(model, copy_weights):
    for name, module in model.named_modules():
        # Replace linear layers with BitNet
        if isinstance(module, LlamaSdpaAttention) or isinstance(module, LlamaMLP):
            for child_name, child_module in module.named_children():
                if isinstance(child_module, nn.Linear):
                    bitlinear = BitLinear(child_module.in_features, child_module.out_features, child_module.bias is not None).to(device="cuda:0")
                    if copy_weights:
                        bitlinear.weight = child_module.weight
                        if child_module.bias is not None:
                            bitlinear.bias = child_module.bias
                    setattr(module, child_name, bitlinear)
        # Remove redundant input_layernorms
        elif isinstance(module, LlamaDecoderLayer):
            for child_name, child_module in module.named_children():
                if isinstance(child_module, LlamaRMSNorm) and child_name == "input_layernorm":
                    setattr(module, child_name, nn.Identity().to(device="cuda:0"))


convert_to_bitnet(model, copy_weights=True)
model.to(device="cuda:0")

prompt = "What is Machine Learning?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generate_ids = model.generate(inputs.input_ids, max_length=50)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at abideen/Bitnet-Llama-70M and are newly initialized: ['model.layers.0.input_layernorm.weight', 'model.layers.1.input_layernorm.weight', 'model.layers.2.input_layernorm.weight', 'model.layers.3.input_layernorm.weight', 'model.layers.4.input_layernorm.weight', 'model.layers.5.input_layernorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


'What is Machine Learning?\n\nIn todayâ€™s digital age, machine learning has become a crucial aspect of our lives. With the increasing popularity of machine learning, machine learning has become a powerful tool for learning and learning. With the'

In [4]:
prompt = "Write a short poem"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generate_ids = model.generate(inputs.input_ids, max_length=50)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

'Write a short poem about a "The Witcher" by the author of the book "The Witcher of the Book of the Book of the Book of the Book of the Book of the Book of the Book of the Book of the'

In [5]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32001, 768, padding_idx=0)
    (layers): ModuleList(
      (0-5): 6 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=768, out_features=768, bias=False)
          (k_proj): Linear(in_features=768, out_features=768, bias=False)
          (v_proj): Linear(in_features=768, out_features=768, bias=False)
          (o_proj): Linear(in_features=768, out_features=768, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): BitLinear(in_features=768, out_features=1024, bias=False)
          (up_proj): BitLinear(in_features=768, out_features=1024, bias=False)
          (down_proj): BitLinear(in_features=1024, out_features=768, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Identity()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): 

In [6]:
# print number of parameters
model_size = sum(t.numel() for t in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")

Model size: 77.5M parameters


In [8]:
# Save the model to disk
import torch

# Assuming that `model` is your model
torch.save(model.state_dict(), 'Llama2-70M-Cosmopedia-100k-Pretrain.pth')

In [9]:
!pwd

'pwd' is not recognized as an internal or external command,
operable program or batch file.


In [12]:
Trainer

transformers.trainer.Trainer

In [14]:
trainer.save_model("Llama2-70M-Cosmopedia-100k-Pretrained")
tokenizer.save_pretrained("Llama2-70M-Cosmopedia-100k-Pretrained"),

(('Llama2-70M-Cosmopedia-100k-Pretrained\\tokenizer_config.json',
  'Llama2-70M-Cosmopedia-100k-Pretrained\\special_tokens_map.json',
  'Llama2-70M-Cosmopedia-100k-Pretrained\\tokenizer.json'),)

### Testing model from Huggingface Hub

In [29]:
prompt = "what is machine learning"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generate_ids = model.generate(inputs.input_ids, max_length=50)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

"what is machine learning' di'''.icaiaian, isÃ© isÃ©\ninestinieninamentWriteinieningienAienAest\ninamenterninest\ninest\ninest\ninament"

In [30]:
folder = r"C:\Users\saad.naeem\PycharmProjects\NLP-Projects-NHV-1-Bit-LLM\Llama2-70M-Cosmopedia-100k-Pretrain"
api = HfApi()


In [31]:
api.create_repo(
    repo_id = f"saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained",
    repo_type="model",
    exist_ok=True
)

RepoUrl('https://huggingface.co./saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained', endpoint='https://huggingface.co.', repo_type='model', repo_id='saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained')

In [32]:
api.upload_folder(
    folder_path=folder,
    repo_type="model",
    repo_id=f"saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained",
    token="",
)

optimizer.pt:   0%|          | 0.00/620M [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/627 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.47k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/310M [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co./saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained/commit/b3b67c7a7dcb199a07244be3a493cd649cf3731f', commit_message='Upload folder using huggingface_hub', commit_description='', oid='b3b67c7a7dcb199a07244be3a493cd649cf3731f', pr_url=None, pr_revision=None, pr_num=None)

In [35]:
from transformers import (AutoTokenizer, AutoModelForCausalLM)

In [36]:
# Load a pretrained BitNet model
model = "saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained"

In [37]:
tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForCausalLM.from_pretrained(model)

Some weights of LlamaForCausalLM were not initialized from the model checkpoint at saadnaeem/Llama2-70M-Cosmopedia-100k-Pretrained and are newly initialized: ['model.layers.0.input_layernorm.weight', 'model.layers.1.input_layernorm.weight', 'model.layers.2.input_layernorm.weight', 'model.layers.3.input_layernorm.weight', 'model.layers.4.input_layernorm.weight', 'model.layers.5.input_layernorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

In [38]:
convert_to_bitnet(model, copy_weights=True)

In [39]:
model.to(device="cuda:0")

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32001, 768, padding_idx=0)
    (layers): ModuleList(
      (0-5): 6 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=768, out_features=768, bias=False)
          (k_proj): Linear(in_features=768, out_features=768, bias=False)
          (v_proj): Linear(in_features=768, out_features=768, bias=False)
          (o_proj): Linear(in_features=768, out_features=768, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): BitLinear(in_features=768, out_features=1024, bias=False)
          (up_proj): BitLinear(in_features=768, out_features=1024, bias=False)
          (down_proj): BitLinear(in_features=1024, out_features=768, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Identity()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): 

In [40]:
prompt = "What is Machine Learning?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
generate_ids = model.generate(inputs.input_ids, max_length=50)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

"What is Machine Learning?\n\nI've been working on a project for a project that has been working on a project that has been working on a project. I am not sure what I am doing. I am not sure what I do"