sdyy committed on
Commit 96601f8 · verified · 1 Parent(s): 1dffc00

Upload 3 files

Files changed (3)
  1. autotrain_llm.py +97 -0
  2. finetune.py +63 -0
  3. script.py +60 -0
autotrain_llm.py ADDED
@@ -0,0 +1,97 @@
+ # -*- coding: utf-8 -*-
+ """AutoTrain_LLM.ipynb
+ Automatically generated by Colab.
+ Original file is located at
+ https://colab.research.google.com/github/huggingface/autotrain-advanced/blob/main/colabs/AutoTrain_LLM.ipynb
+ """
+
+ #@title 🤗 AutoTrain LLM
+ #@markdown In order to use this Colab:
+ #@markdown - upload train.csv to a folder named `data/`
+ #@markdown - train.csv must contain a `text` column
+ #@markdown - choose a project name if you wish
+ #@markdown - change the model if you wish; you can use most of the text-generation models from the Hugging Face Hub
+ #@markdown - add your Hugging Face information (token) if you wish to push the trained model to the Hugging Face Hub
+ #@markdown - update hyperparameters if you wish
+ #@markdown - click `Runtime > Run all` or run each cell individually
+ #@markdown - report issues / feature requests here: https://github.com/huggingface/autotrain-advanced/issues
+
+ import os
+ !pip install -U autotrain-advanced > install_logs.txt
+ !autotrain setup --colab > setup_logs.txt
+
+ #@markdown ---
+ #@markdown #### Project Config
+ #@markdown Note: if you are using a restricted/private model, you need to enter your Hugging Face token in the next step.
+ project_name = 'my-autotrain-llm' # @param {type:"string"}
+ model_name = 'abhishek/llama-2-7b-hf-small-shards' # @param {type:"string"}
+
+ #@markdown ---
+ #@markdown #### Push to Hub?
+ #@markdown Use these only if you want to push your trained model to a private repo in your Hugging Face account.
+ #@markdown If you don't use these, the model will be saved in Google Colab and you will need to download it manually.
+ #@markdown Please enter your Hugging Face write token. The trained model will be saved to your Hugging Face account.
+ #@markdown You can find your token here: https://huggingface.co/settings/tokens
+ push_to_hub = False # @param ["False", "True"] {type:"raw"}
+ hf_token = "hf_XXX" #@param {type:"string"}
+ hf_username = "abc" #@param {type:"string"}
+
+ #@markdown ---
+ #@markdown #### Hyperparameters
+ learning_rate = 2e-4 # @param {type:"number"}
+ num_epochs = 1 #@param {type:"number"}
+ batch_size = 1 # @param {type:"slider", min:1, max:32, step:1}
+ block_size = 1024 # @param {type:"number"}
+ trainer = "sft" # @param ["default", "sft", "orpo"] {type:"raw"}
+ warmup_ratio = 0.1 # @param {type:"number"}
+ weight_decay = 0.01 # @param {type:"number"}
+ gradient_accumulation = 4 # @param {type:"number"}
+ mixed_precision = "fp16" # @param ["fp16", "bf16", "none"] {type:"raw"}
+ peft = True # @param ["False", "True"] {type:"raw"}
+ quantization = "int4" # @param ["int4", "int8", "none"] {type:"raw"}
+ lora_r = 16 #@param {type:"number"}
+ lora_alpha = 32 #@param {type:"number"}
+ lora_dropout = 0.05 #@param {type:"number"}
+
+ os.environ["PROJECT_NAME"] = project_name
+ os.environ["MODEL_NAME"] = model_name
+ os.environ["PUSH_TO_HUB"] = str(push_to_hub)
+ os.environ["HF_TOKEN"] = hf_token
+ os.environ["LEARNING_RATE"] = str(learning_rate)
+ os.environ["NUM_EPOCHS"] = str(num_epochs)
+ os.environ["BATCH_SIZE"] = str(batch_size)
+ os.environ["BLOCK_SIZE"] = str(block_size)
+ os.environ["WARMUP_RATIO"] = str(warmup_ratio)
+ os.environ["WEIGHT_DECAY"] = str(weight_decay)
+ os.environ["GRADIENT_ACCUMULATION"] = str(gradient_accumulation)
+ os.environ["MIXED_PRECISION"] = str(mixed_precision)
+ os.environ["PEFT"] = str(peft)
+ os.environ["QUANTIZATION"] = str(quantization)
+ os.environ["LORA_R"] = str(lora_r)
+ os.environ["LORA_ALPHA"] = str(lora_alpha)
+ os.environ["LORA_DROPOUT"] = str(lora_dropout)
+ os.environ["HF_USERNAME"] = hf_username
+ os.environ["TRAINER"] = trainer
+
+ !autotrain llm \
+ --train \
+ --model ${MODEL_NAME} \
+ --project-name ${PROJECT_NAME} \
+ --data-path data/ \
+ --text-column text \
+ --lr ${LEARNING_RATE} \
+ --batch-size ${BATCH_SIZE} \
+ --epochs ${NUM_EPOCHS} \
+ --block-size ${BLOCK_SIZE} \
+ --warmup-ratio ${WARMUP_RATIO} \
+ --lora-r ${LORA_R} \
+ --lora-alpha ${LORA_ALPHA} \
+ --lora-dropout ${LORA_DROPOUT} \
+ --weight-decay ${WEIGHT_DECAY} \
+ --gradient-accumulation ${GRADIENT_ACCUMULATION} \
+ --quantization ${QUANTIZATION} \
+ --mixed-precision ${MIXED_PRECISION} \
+ --username ${HF_USERNAME} \
+ --trainer ${TRAINER} \
+ $( [[ "$PEFT" == "True" ]] && echo "--peft" ) \
+ $( [[ "$PUSH_TO_HUB" == "True" ]] && echo "--push-to-hub --token ${HF_TOKEN}" )
finetune.py ADDED
@@ -0,0 +1,63 @@
+ # -*- coding: utf-8 -*-
+ """Finetune.ipynb
+ Automatically generated by Colab.
+ Original file is located at
+ https://colab.research.google.com/drive/1b_AA5GHhblSKrQymYs_uYYDEqvqklfrV
+ """
+
+ !pip install datasets transformers[torch]
+
+ !pip install evaluate
+
+ !pip install accelerate -U
+
+ from datasets import load_dataset
+
+ dataset = load_dataset("yelp_review_full")
+ dataset["train"][100]
+
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+
+
+ def tokenize_function(examples):
+     return tokenizer(examples["text"], padding="max_length", truncation=True)
+
+
+ tokenized_datasets = dataset.map(tokenize_function, batched=True)
+
+ small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
+ small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
+
+ from transformers import AutoModelForSequenceClassification
+
+ model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+
+ from transformers import TrainingArguments
+
+ training_args = TrainingArguments(output_dir="test_trainer")
+
+ import numpy as np
+ import evaluate
+
+ metric = evaluate.load("accuracy")
+
+ def compute_metrics(eval_pred):
+     logits, labels = eval_pred
+     predictions = np.argmax(logits, axis=-1)
+     return metric.compute(predictions=predictions, references=labels)
+
+ from transformers import TrainingArguments, Trainer
+
+ training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=small_train_dataset,
+     eval_dataset=small_eval_dataset,
+     compute_metrics=compute_metrics,
+ )
+
+ trainer.train()
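
A small follow-up sketch (not part of the uploaded file) showing how the fine-tuned classifier might be checked and reused after `trainer.train()`; the directory name comes from `output_dir="test_trainer"` above, everything else is an assumption:

# Hedged sketch: evaluate and try the fine-tuned model after training.
eval_results = trainer.evaluate()      # runs compute_metrics on small_eval_dataset
print(eval_results)

trainer.save_model("test_trainer")     # write config + weights to the output dir

from transformers import pipeline
# Reuse the tokenizer object from above; it was not saved alongside the model.
classifier = pipeline("text-classification", model="test_trainer", tokenizer=tokenizer)
print(classifier("The food was great and the staff were friendly."))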
script.py ADDED
@@ -0,0 +1,60 @@
+ # -*- coding: utf-8 -*-
+ """Finetune.ipynb
+
+ Automatically generated by Colab.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1b_AA5GHhblSKrQymYs_uYYDEqvqklfrV
+ """
+ from datasets import load_dataset
+
+ dataset = load_dataset("yelp_review_full")
+ dataset["train"][100]
+
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+
+
+ def tokenize_function(examples):
+     return tokenizer(examples["text"], padding="max_length", truncation=True)
+
+
+ tokenized_datasets = dataset.map(tokenize_function, batched=True)
+
+ small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(100))
+ small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(100))
+
+ from transformers import AutoModelForSequenceClassification
+
+ model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+
+ from transformers import TrainingArguments
+
+ training_args = TrainingArguments(output_dir="test_trainer")
+
+ import numpy as np
+ import evaluate
+
+ metric = evaluate.load("accuracy")
+
+ def compute_metrics(eval_pred):
+     logits, labels = eval_pred
+     predictions = np.argmax(logits, axis=-1)
+     return metric.compute(predictions=predictions, references=labels)
+
+ from transformers import TrainingArguments, Trainer
+
+ training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=small_train_dataset,
+     eval_dataset=small_eval_dataset,
+     compute_metrics=compute_metrics,
+ )
+
+ trainer.train()
+
+ trainer.push_to_hub()
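
Unlike finetune.py, script.py ends with `trainer.push_to_hub()`, which by default uploads the checkpoint to a Hub repo named after `output_dir` under the logged-in account (a write token is needed, e.g. via `huggingface-cli login`). A hedged sketch of loading it back; the repo id is an assumption, and the tokenizer is loaded from `bert-base-cased` because it was not passed to the `Trainer` and is therefore not pushed:

from transformers import AutoTokenizer, AutoModelForSequenceClassification

repo_id = "your-username/test_trainer"  # hypothetical; replace with your actual repo
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained(repo_id)

inputs = tokenizer("Terrible service, cold food.", return_tensors="pt")
pred = model(**inputs).logits.argmax(dim=-1).item()
print(pred)  # predicted class index 0-4 (yelp_review_full star buckets)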