# Installing Dependencies

In [3]:
! pip install -q datasets transformers trl peft accelerate bitsandbytes auto-gptq optimum pandas scikit-learn matplotlib

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


# Loading Dataset

In [5]:
from datasets import load_dataset
# Load the dataset identified by this repo id; the repr in the next cell shows
# it arrives as a DatasetDict with 'train' and 'test' splits.
dataset = load_dataset("Harshvardhan27/Wikicorpus_Fine_Tuned_Mistral_FinalCheckpoint")

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'text', 'input_prompt', 'output_text', 'output_length', 'output_cleaned'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['id', 'title', 'text', 'input_prompt', 'output_text', 'output_length', 'output_cleaned'],
        num_rows: 200
    })
})

In [7]:
dataset['train'][0]

{'id': '5599879',
 'title': 'Amine (singer)',
 'text': 'Amine Mounder is a French R&B singer born in Casablanca, Morocco. He is known for his singular music style, Raï\'n\'B, which is a mix of R&B and Raï.\n\nHis biggest hits are "Sobri (notre Destin)," a duet with French singer Leslie, and "J\'voulais." Both reached the #1 spot on the French singles chart.\n\n\n\nDiscography.\n\nAlbums.\n\nAu delà des rêves (31 December 2005) #16 FR;\n\n\n\nSingles.\n\n2005 "Ma Vie" #15 FR;\n\n2006 "J\'voulais" #1 FR;\n\n2006 "My Girl" #29;\n\n\n\nExternal links.\n\nOfficial website (in French);\n\nEMI Music France;',
 'input_prompt': 'Amine Mounder is a French R&B singer born',
 'output_text': "ень 1982 in mohammedia algeria he is best known for his 2009 single   which has over 16 million views on youtube as of 2013 he has released three albums and several singles since 2004 and has collaborated with french rappers including saïd and mc solaar and singers like kenza farah and zied  he has also acted 

# Truncating output_cleaned to 126 tokens


In [12]:
from transformers import AutoTokenizer

# Checkpoint whose (fast) tokenizer is used to measure and truncate the samples below.
model_checkpoint = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# Display the tokenizer repr (vocab size, special tokens, padding side).
tokenizer

LlamaTokenizerFast(name_or_path='mistralai/Mistral-7B-v0.1', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [14]:
# Tally, per split, how many `output_cleaned` samples tokenize to more than 126 tokens.
train = sum(
    1
    for row in dataset['train']
    if len(tokenizer.tokenize(row['output_cleaned'])) > 126
)
test = sum(
    1
    for row in dataset['test']
    if len(tokenizer.tokenize(row['output_cleaned'])) > 126
)
print(train, test)

1000 200


In [28]:
dataset['train'][0]

{'id': '5599879',
 'title': 'Amine (singer)',
 'text': 'Amine Mounder is a French R&B singer born in Casablanca, Morocco. He is known for his singular music style, Raï\'n\'B, which is a mix of R&B and Raï.\n\nHis biggest hits are "Sobri (notre Destin)," a duet with French singer Leslie, and "J\'voulais." Both reached the #1 spot on the French singles chart.\n\n\n\nDiscography.\n\nAlbums.\n\nAu delà des rêves (31 December 2005) #16 FR;\n\n\n\nSingles.\n\n2005 "Ma Vie" #15 FR;\n\n2006 "J\'voulais" #1 FR;\n\n2006 "My Girl" #29;\n\n\n\nExternal links.\n\nOfficial website (in French);\n\nEMI Music France;',
 'input_prompt': 'Amine Mounder is a French R&B singer born',
 'output_text': "ень 1982 in mohammedia algeria he is best known for his 2009 single   which has over 16 million views on youtube as of 2013 he has released three albums and several singles since 2004 and has collaborated with french rappers including saïd and mc solaar and singers like kenza farah and zied  he has also acted 

In [30]:
# Truncate every training sample's `output_cleaned` text to at most
# `max_token_count` tokenizer tokens and decode the kept tokens back to a string.
# Iterating over the split itself (rather than a hard-coded range(1000)) keeps the
# cell correct if the split size ever changes; the token counts that were computed
# but never used (including a second tokenization per sample) are dropped.
max_token_count = 126
list1 = []
for row in dataset['train']:
    tokens = tokenizer.tokenize(row['output_cleaned'])
    list1.append(tokenizer.convert_tokens_to_string(tokens[:max_token_count]))

In [32]:
# Truncate every test sample's `output_cleaned` text to at most
# `max_token_count` tokenizer tokens and decode the kept tokens back to a string.
# Iterating over the split itself (rather than a hard-coded range(200)) keeps the
# cell correct if the split size ever changes; the token counts that were computed
# but never used (including a second tokenization per sample) are dropped.
max_token_count = 126
list2 = []
for row in dataset['test']:
    tokens = tokenizer.tokenize(row['output_cleaned'])
    list2.append(tokenizer.convert_tokens_to_string(tokens[:max_token_count]))

In [34]:
import pandas as pd
# Wrap the truncated strings in single-column ('text') DataFrames, one per split.
train = pd.DataFrame({'text': list1})
test = pd.DataFrame({'text': list2})

In [36]:
from datasets import Dataset, DatasetDict
import pandas as pd

# Convert both frames to Hugging Face Datasets and rebind `dataset` to a DatasetDict
# that now only carries the truncated 'text' column (this replaces the loaded dataset).
train, test = Dataset.from_pandas(train), Dataset.from_pandas(test)

dataset = DatasetDict({'train': train, 'test': test})

In [40]:
len(tokenizer.tokenize(dataset['train'][0]['text']))

126

# Dataset saving

In [41]:
from datasets import load_dataset
import pandas as pd
# Write each split of `dataset` to "<split>_data.csv" in the working directory.
for split in dataset:
    save_data = dataset[split]
    # Materialise the split as a DataFrame, then export it without the index column.
    df = pd.DataFrame(save_data)
    df.to_csv(f"{split}_data.csv", index=False)

In [42]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 200
    })
})

In [43]:
dataset['train'][10]

{'text': 'щем by the irish recorded music association ( irma ) and published by the  интернет на  в интервью с ёжиком 2009 года из 100 самых знаменитых людей по версии газеты « Комсомольская правда» занял 22 @@ е место на 2 @@ е месте удона ( 1 @@ е место в ноябре 2008 года ) находится 4 @@ е место среди самых'}

In [44]:
dataset['test'][10]

{'text': 'й team that plays in the Arizona League the Giants are affiliated with the San Francisco Giants and play their home games at Scottsdale Stadium in Scottsdale Arizona the Giants franchise began play in 2007 as the Scottsdale Phillies when it was affiliated with the philadelphia phillies the team became the Scottsdale Giants in 2010 when the franchise affiliation switched to the san francisco giants the arizona league season runs from june through august the team is owned and operated by the й  corporation which also owns and operates the major league spring '}

# Importing Dependencies

In [45]:
import torch
import pandas as pd
from datasets import load_dataset, Dataset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, BitsAndBytesConfig, TrainingArguments, TrainerCallback
from trl import SFTTrainer
import warnings
warnings.filterwarnings("ignore")

# Fine-Tuning Config

In [46]:
class Config:
    '''
    Central place for every tunable used by ModelTrainer: model/dataset fields,
    quantization, LoRA, optimisation schedule and precision settings.
    '''

    # --- Model and dataset fields -------------------------------------------
    MODEL_ID = "mistralai/Mistral-7B-v0.1" # mistralai/Mistral-7B-v0.1 TheBloke/Mistral-7B-v0.1-GPTQ
    # DATASET_ID = dataset_dict
    CONTEXT_FIELD= ""                # not read by ModelTrainer in this notebook
    INSTRUCTION_FIELD = "text"       # source column that gets the EOS marker appended
    TARGET_FIELD = "label"           # not read by ModelTrainer in this notebook
    DATASET_TEXT_FIELD = "text"      # column handed to SFTTrainer as dataset_text_field

    # --- Model loading --------------------------------------------------------
    BITS = 4                         # only referenced by the GPTQ alternative, which is not used
    DISABLE_EXLLAMA = True           # only referenced by the GPTQ alternative, which is not used
    DEVICE_MAP = "auto"
    USE_CACHE = False

    # --- LoRA adapter (passed to peft.LoraConfig) ---------------------------
    LORA_R = 16
    LORA_ALPHA = 32
    LORA_DROPOUT = 0.05
    BIAS = "none"
    TARGET_MODULES = ['q_proj','k_proj','v_proj','o_proj']
    TASK_TYPE = "CAUSAL_LM"

    # Output directory given to TrainingArguments (per-epoch checkpoints).
    OUTPUT_DIR = "./Mistral_finetuned_adapters"

    # --- Optimisation schedule (passed to TrainingArguments) ----------------
    BATCH_SIZE = 16
    GRAD_ACCUMULATION_STEPS = 1
    OPTIMIZER = "paged_adamw_32bit"
    LR = 1e-4
    LR_SCHEDULER = "cosine"
    LOGGING_STEPS = 50
    SAVE_STRATEGY = "epoch"
    EVALUATION_STRATEGY = "epoch"
    NUM_TRAIN_EPOCHS = 4 # epochs setting different for reference model

    # --- Precision and sequence handling -------------------------------------
    # MAX_STEPS = 250
    FP16 = False
    BF16 = True
    PUSH_TO_HUB = False
    MAX_SEQ_LENGTH = 128
    PACKING = False

    # --- bitsandbytes 4-bit quantization (passed to BitsAndBytesConfig) -----
    use_4bit = True
    # Keep the 4-bit matmul compute dtype in step with the training precision flag
    # above; it was previously hard-coded to "float16" while training ran in bf16.
    bnb_4bit_compute_dtype = "bfloat16" if BF16 else "float16"
    bnb_4bit_quant_type = "nf4"
    bnb_4bit_use_double_quant=True

# Model Trainer

In [47]:
class ModelTrainer:
    '''
    End-to-end fine-tuning pipeline for the causal LM named by Config.MODEL_ID.

    The pipeline reads the notebook-level `dataset` DatasetDict ('train' and 'test'
    splits), loads the base model with 4-bit bitsandbytes quantization, attaches LoRA
    adapters, trains with SFTTrainer and saves the adapter together with the tokenizer.
    '''

    # Rule line printed above and below every progress banner.
    _RULE = "\n====================================================================\n"

    def __init__(self):

        '''
        Builds the run configuration and loads the tokenizer for Config.MODEL_ID.

        Initialized:
        config: Parameters required for the trainer to create and process dataset, train and save model finally
        tokenizer: Tokenizer required in training loop
        '''

        self.config = Config()
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.MODEL_ID)
        # Reuse <unk> as the padding token so padded batches can be built.
        self.tokenizer.pad_token = self.tokenizer.unk_token

    def _print_banner(self, title, *details):

        '''
        Prints a progress banner: rule line, tab-indented title, any extra objects, rule line.

        Args:
        title: Text shown between the rule lines
        details: Optional objects printed right after the title (e.g. a sample row or the model)
        '''

        print(self._RULE)
        print(f"\t\t\t{title}")
        for detail in details:
            print(detail)
        print(self._RULE)

    def process_data_sample(self, example):

        '''
        Formats one sample for training by appending the EOS marker to its text.

        Args:
        example: Mapping-like row that contains Config.INSTRUCTION_FIELD

        Returns:
        processed_example: The field's text followed by " </s>"
        '''

        processed_example =f'''{example[self.config.INSTRUCTION_FIELD]} </s>'''
        return processed_example

    def _build_split_dataset(self, split):

        '''
        Shared implementation behind create_dataset / create_test_dataset.

        Args:
        split: Name of the split to read from the notebook-level `dataset` ('train' or 'test')

        Returns:
        processed_data: Dataset holding only Config.DATASET_TEXT_FIELD, each row ending in " </s>"
        '''

        # data = load_dataset(self.config.DATASET_ID, split="train")
        # Relies on the `dataset` DatasetDict defined at notebook level.
        data = dataset
        self._print_banner("DOWNLOADED DATASET")

        # Materialise the split as a DataFrame (one row per sample).
        df = pd.DataFrame([item for item in data[split]])

        # Append the EOS marker row by row and store the result in the text column.
        df[self.config.DATASET_TEXT_FIELD] = df[[self.config.INSTRUCTION_FIELD]].apply(self.process_data_sample, axis=1)

        self._print_banner("PROCESSED DATASET", df.iloc[0])

        processed_data = Dataset.from_pandas(df[[self.config.DATASET_TEXT_FIELD]])
        return processed_data

    def create_dataset(self):

        '''
        Processes the 'train' split of the notebook-level dataset

        Returns:
        processed_data: Training ready processed dataset
        '''

        return self._build_split_dataset('train')

    def create_test_dataset(self):

        '''
        Processes the 'test' split of the notebook-level dataset

        Returns:
        processed_data: Evaluation ready processed dataset
        '''

        return self._build_split_dataset('test')

    def prepare_model(self):

        '''
        Prepares model for finetuning by quantizing it and attaching lora modules to the model

        Returns:
        model - Model ready for finetuning
        peft_config - LoRA Adapter config
        '''

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=self.config.use_4bit,
            bnb_4bit_quant_type=self.config.bnb_4bit_quant_type,
            bnb_4bit_compute_dtype=self.config.bnb_4bit_compute_dtype,
            bnb_4bit_use_double_quant=self.config.bnb_4bit_use_double_quant,
        )

        model = AutoModelForCausalLM.from_pretrained(
            self.config.MODEL_ID,
            quantization_config=bnb_config,
            device_map=self.config.DEVICE_MAP
        )

        self._print_banner("DOWNLOADED MODEL", model)

        if torch.cuda.device_count() > 1: # If more than 1 GPU
            model.is_parallelizable = True
            model.model_parallel = True

        model.config.use_cache=self.config.USE_CACHE
        model.config.pretraining_tp=1
        model.gradient_checkpointing_enable()
        model = prepare_model_for_kbit_training(model)

        self._print_banner("MODEL CONFIG UPDATED")

        peft_config = LoraConfig(
            r=self.config.LORA_R,
            lora_alpha=self.config.LORA_ALPHA,
            lora_dropout=self.config.LORA_DROPOUT,
            bias=self.config.BIAS,
            task_type=self.config.TASK_TYPE,
            target_modules=self.config.TARGET_MODULES
        )

        model = get_peft_model(model, peft_config)
        model.print_trainable_parameters()

        self._print_banner("PREPARED MODEL FOR FINETUNING", model)

        return model, peft_config

    def set_training_arguments(self):

        '''
        Sets the arguments for the training loop in TrainingArguments class

        Returns:
        training_arguments - TrainingArguments built from the Config values
        '''

        training_arguments = TrainingArguments(
            output_dir=self.config.OUTPUT_DIR,
            per_device_train_batch_size=self.config.BATCH_SIZE,
            gradient_accumulation_steps=self.config.GRAD_ACCUMULATION_STEPS,
            optim=self.config.OPTIMIZER,
            learning_rate=self.config.LR,
            lr_scheduler_type=self.config.LR_SCHEDULER,
            save_strategy=self.config.SAVE_STRATEGY,
            evaluation_strategy=self.config.EVALUATION_STRATEGY,
            logging_steps=self.config.LOGGING_STEPS,
            num_train_epochs=self.config.NUM_TRAIN_EPOCHS,
            # max_steps=self.config.MAX_STEPS,
            fp16=self.config.FP16,
            bf16=self.config.BF16,
            push_to_hub=self.config.PUSH_TO_HUB
        )

        return training_arguments

    class PerplexityCallback(TrainerCallback):
        '''
        Trainer callback that adds a 'perplexity' entry (exp of 'eval_loss') to the evaluation metrics
        '''

        def __init__(self):
            # No state is needed; the callback only reacts to evaluation events
            pass

        def on_evaluate(self, args, state, control, metrics=None, **kwargs):
            # Runs after each evaluation; perplexity is derived from the reported eval loss
            if metrics and 'eval_loss' in metrics:
                eval_loss = metrics['eval_loss']
                perplexity = torch.exp(torch.tensor(eval_loss)).item()
                # Update the metrics with the Perplexity value
                metrics['perplexity'] = perplexity
            else:
                print("Eval loss not found in metrics at this step.")

    def train(self):

        '''
        Trains the model on the notebook-level dataset and saves the resulting adapter and tokenizer
        '''

        train_data = self.create_dataset()
        test_data = self.create_test_dataset()
        model, peft_config = self.prepare_model()
        training_args = self.set_training_arguments()
        ppl_callback = self.PerplexityCallback()

        self._print_banner("PREPARED FOR FINETUNING")

        # NOTE(review): `model` is already wrapped by get_peft_model in prepare_model and
        # peft_config is passed again here - confirm SFTTrainer does not wrap it twice.
        trainer = SFTTrainer(
            model=model,
            train_dataset=train_data,
            eval_dataset=test_data,
            peft_config=peft_config,
            dataset_text_field=self.config.DATASET_TEXT_FIELD,
            args=training_args,
            tokenizer=self.tokenizer,
            packing=self.config.PACKING,
            max_seq_length=self.config.MAX_SEQ_LENGTH,
            callbacks=[ppl_callback]
        )
        trainer.train()

        self._print_banner("FINETUNING COMPLETED")

        # trainer.push_to_hub()
        # Save the tokenizer and model in the same directory; the inference cell
        # loads both from this exact path.
        output_dir = "./Mistral_finetuned_Final_Adapter"
        model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)

        self._print_banner("FINETUNED MODEL SAVED")

In [48]:
# Run the whole pipeline: build both splits, load and wrap the model, train, and
# save the adapter + tokenizer. The banners and the per-epoch loss table below are
# this cell's output.
if __name__ == "__main__":
    Model_trainer = ModelTrainer()
    Model_trainer.train()



			DOWNLOADED DATASET




			PROCESSED DATASET
text    ень 1982 in mohammedia algeria he is best know...
Name: 0, dtype: object




			DOWNLOADED DATASET




			PROCESSED DATASET
text    ол the city of cincinnati ohio in hamilton cou...
Name: 0, dtype: object




config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]



			DOWNLOADED MODEL
MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNo

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Unnamed: 3
1,1.8565,1.672414,5.325006
2,1.5298,1.635722,5.133161
3,1.3719,1.664973,5.285533
4,1.1578,1.713535,5.548539




			FINETUNING COMPLETED




			FINETUNED MODEL SAVED




# Inference Code

In [49]:
from peft import AutoPeftModelForCausalLM
from transformers import GenerationConfig
from transformers import AutoTokenizer
import torch

def process_data_sample(example, add_eos=True):
    """Format a sample's text, optionally terminated with the ``</s>`` marker.

    Args:
        example: Mapping with a ``"text"`` entry.
        add_eos: When True (the default, matching the training-time format) the
            text is followed by ``" </s>"``. Pass False when the string is used
            as a generation prompt, so the prompt does not already end with the
            end-of-sequence marker before the model starts generating.

    Returns:
        The formatted string.

    NOTE(review): the recorded generations in this notebook show the prompt
    followed by ``</s>`` plus a "right-padding was detected" warning; presumably
    both come from the prompt ending in EOS while ``pad_token_id`` is set to
    ``eos_token_id`` - confirm before switching the call sites to
    ``add_eos=False``.
    """
    if add_eos:
        return f'''{example["text"]} </s>'''
    return f'''{example["text"]}'''

# Directory that holds both the saved adapter and the tokenizer files.
adapter_dir = "./Mistral_finetuned_Final_Adapter"

tokenizer = AutoTokenizer.from_pretrained(adapter_dir)
tokenizer.padding_side = 'left'

# Load the adapter-backed model in float16 on the GPU.
model = AutoPeftModelForCausalLM.from_pretrained(
    adapter_dir,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cuda",
)

# Sampling settings shared by every generation cell below; EOS doubles as the pad id.
generation_config = GenerationConfig(
    do_sample=True,
    temperature=0.5,
    top_p=0.95,
    top_k=40,
    max_new_tokens=128,
    pad_token_id=tokenizer.eos_token_id,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [52]:
# Build the prompt string (alternative prompt tried earlier: 'What are animals?')
# and tokenize it straight onto the GPU.
prompt_example = {"text": 'Walker off to running start with Irish Its'}
inp_str = process_data_sample(prompt_example)

inputs = tokenizer(inp_str, return_tensors="pt").to("cuda")

In [51]:
import time

# Time one sampled generation for the prompt currently held in `inputs`.
# perf_counter() is a monotonic clock, so the elapsed value cannot be skewed by
# wall-clock adjustments the way time.time() differences can.
st_time = time.perf_counter()
with torch.no_grad():
    outputs = model.generate(**inputs, generation_config=generation_config)
    # Special tokens are kept so the <s> / </s> boundaries stay visible in the printout.
    print(tokenizer.decode(outputs[0], skip_special_tokens=False))
print("\n\nExecution Time:", time.perf_counter() - st_time)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


<s> What are animals? </s>м 1994 the term animals was used to refer to all multicellular eukaryotes that are not plants or fungi however this definition is not universally accepted and the term is often used more restrictively to refer to a specific clade of organisms that includes all animals except the sea sponges (porifera) the term animals is sometimes used more broadly to refer to all heterotrophic organisms ( all organisms that consume other organisms for food )  including protists and some algae  this broader  definition is used in some textbooks and popular works 


Execution Time: 8.000709295272827


In [53]:
import time

# Time one sampled generation for the prompt currently held in `inputs`.
# perf_counter() is a monotonic clock, so the elapsed value cannot be skewed by
# wall-clock adjustments the way time.time() differences can.
st_time = time.perf_counter()
with torch.no_grad():
    outputs = model.generate(**inputs, generation_config=generation_config)
    # Special tokens are kept so the <s> / </s> boundaries stay visible in the printout.
    print(tokenizer.decode(outputs[0], skip_special_tokens=False))
print("\n\nExecution Time:", time.perf_counter() - st_time)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


<s> Walker off to running start with Irish Its </s> 1988 89 season the first under new head coach david wilkinson the team finished with a record of 13 @@ 15 and a third @@ place finish in the big east conference the team was led by senior guard john walker who averaged 18 @@ 6 points per game and was named to the all @@ big east second team walker was also the team's leading scorer for the 1987 88 season when  он  finished with a record of 14 14  и  в  и 


Execution Time: 6.86424446105957


In [41]:
## Old execution
# NOTE(review): the execution count (41) is lower than the cells above, so the
# recorded output presumably comes from an earlier kernel state - confirm before
# comparing it with the newer generations.

import time

# Time one sampled generation for the prompt currently held in `inputs`.
# perf_counter() is a monotonic clock, so the elapsed value cannot be skewed by
# wall-clock adjustments the way time.time() differences can.
st_time = time.perf_counter()
with torch.no_grad():
    outputs = model.generate(**inputs, generation_config=generation_config)
    # Special tokens are kept so the <s> / </s> boundaries stay visible in the printout.
    print(tokenizer.decode(outputs[0], skip_special_tokens=False))
print("\n\nExecution Time:", time.perf_counter() - st_time)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


<s> Walker off to running start with Irish Its lineage is as long as it is distinguished, conjuring up a who #39;s who among Notre Dame football greats. Some of the best players in Fighting Irish history have graced the No. </s>  position, including Paul Hornung, Joe Theismann, Tim Brown and Raghib Ismail.



The latest in the line of great Irish No. 3s is junior running back Armando Allen, who has taken over the position from fellow junior Robert Hughes.



Allen 5 11 </s>


Execution Time: 3.0769588947296143


# Model Size Calculation Code

In [None]:
import os

def get_size(start_path = '.'):
    """Return the combined size, in bytes, of every file found under ``start_path``.

    The directory tree is walked recursively with ``os.walk``; entries that are
    symbolic links are left out of the total.
    """
    file_paths = (
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(start_path)
        for name in names
    )
    # Symlinks are excluded from the total.
    return sum(os.path.getsize(path) for path in file_paths if not os.path.islink(path))

# Called without arguments, so this totals every file under the current working
# directory ('.'), not just a single model folder.
print(get_size(), 'bytes')

1723852637 bytes
