GPU = NVIDIA RTX 4000 Ada Generation. Max memory = max_memoryGB.
16.811 GB of memory reserved.
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only
import torch
import datetime
from datasets import load_dataset
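# Assumed base-model load (a minimal sketch): unsloth/gemma-2-2b-it matches base_model_name
# below and the 10,383,360 trainable LoRA parameters reported in the log; the 2048-token
# context and the 4-bit load are assumptions, adjust them to match the actual run.
max_seq_length = 2048
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-2-2b-it",
    max_seq_length = max_seq_length,
    dtype = None,        # auto-detect; bfloat16 on an RTX 4000 Ada
    load_in_4bit = True, # assumption: QLoRA-style 4-bit load
)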
model = FastLanguageModel.get_peft_model(
    model,
    r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0.1, # Supports any, but = 0 is optimized
    bias = "none",      # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False, # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)
dataset = load_dataset('chris7374/esg-net-zero', revision='100_per_class_v3')
val_dataset = load_dataset('chris7374/esg-net-zero', revision='validation')
prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
You are an expert ESG (Environmental, Social, and Governance) analyst who conducts ESG research by analyzing texts to identify the presence of climate balance targets. Your primary task is to classify identified targets into one of four predefined classes and determine the target year for the climate balance target. Only consider overall climate balance targets, meaning that they are company-wide.
The possible classes are “Carbon neutral(ity)”, “Emissions reduction target”, “Net zero”, and “No target”.
Each class has equal importance, and the correct classification should reflect the most explicit target mentioned in the text. In cases where multiple classes are present:
• “Net zero” should only be prioritized if explicitly mentioned as a company’s overarching target.
• “Carbon neutral(ity)” takes precedence over “Emissions reduction target” only if it is the primary focus of the text.
• “Emissions reduction target” should be classified if it is directly stated and not overshadowed by “Net zero” or “Carbon neutral(ity)” commitments.
• If no explicit target is mentioned, classify as “No target”.
Ensure the classification is based on explicit information from the text, without assuming that one target implies another unless clearly stated.
### Context:
{}
### Response Formatting:
Only answer in the following XML format:\n<answer><classification><end_target>Target</end_target></classification><extraction><end_target_year>Year</end_target_year></extraction><quote>...</quote></answer>
"""
output = """
<answer>
<classification>
<end_target>{}</end_target>
</classification>
<extraction>
<end_target_year>{}</end_target_year>
</extraction>
<quote>{}</quote>
</answer>
"""
def formatting_prompts_func(examples):
    end_targets = examples['end_target']
    end_target_years = examples['end_target_year']
    contexts = examples['custom_text']
    quotes = examples['custom_short_description']
    texts = []
    for end_target, end_target_year, context, quote in zip(end_targets, end_target_years, contexts, quotes):
        messages = [
            {"role": "user", "content": prompt.format(context)},
            {"role": "assistant", "content": output.format(end_target, end_target_year, quote)},
        ]
        # https://huggingface.co./docs/transformers/main/chat_templating
        text = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt = False,
            tokenize = False,
            add_special_tokens = False,
        )
        texts.append(text)
    return {"text": texts}
pass
dataset = dataset.map(formatting_prompts_func, batched = True)
dataset = dataset['train']
dataset = dataset.shuffle(seed=1234) # Shuffle dataset with the same seed for reproducible runs.
train_data = dataset

val_dataset = val_dataset.map(formatting_prompts_func, batched = True)
val_dataset = val_dataset['train']
val_dataset = val_dataset.shuffle(seed=1234) # Shuffle dataset with the same seed for reproducible runs.
project = "esg"
base_model_name = "gemma2-2b-it"
run_name = base_model_name + "-" + project
output_dir = "outputs/" + run_name
num_train_epochs = 3
batch_size = 2
gradient_accumulation_steps = 4
eval_frequency = 1

# Needed for training on completions only; the turn templates below are specific to Gemma 2 2B
# and must be updated for other models.
instruction_template = "<start_of_turn>user"
response_template = "<start_of_turn>model"
collator = DataCollatorForCompletionOnlyLM(
    instruction_template = instruction_template,
    response_template = response_template,
    tokenizer = tokenizer,
    mlm = False,
)
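# Optional sanity check (illustrative, assumes the "text" field built above): verify that the
# collator masks everything before "<start_of_turn>model" with -100, so loss is computed on the
# model's response only.
_example = tokenizer(train_data[0]["text"], add_special_tokens = False)
_batch = collator([_example])
print(_batch["labels"][0][:20]) # leading label ids should all be -100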
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_data,
    eval_dataset = val_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    data_collator = collator,
    #data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    args = TrainingArguments(
        per_device_train_batch_size = batch_size,
        per_device_eval_batch_size = 2,
        gradient_accumulation_steps = gradient_accumulation_steps,
        warmup_steps = 0,
        warmup_ratio = 0.1,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        num_train_epochs = num_train_epochs,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        do_eval = True,
        eval_strategy = "steps",
        eval_steps = 5,
        save_strategy = "epoch",
        report_to = "wandb",
        logging_steps = 5,
        seed = 3407,
        output_dir = output_dir,
        run_name = f"{run_name}-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')}",
    ),
)
#trainer = train_on_responses_only(trainer, instruction_template,response_template)
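# Launch fine-tuning; the Unsloth banner, the wandb log, and the loss table below are the
# console output of this call.
trainer.train()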
==((====))== Unsloth - 2x faster free finetuning | Num GPUs = 1
\\ /| Num examples = 419 | Num Epochs = 3
O^O/ \_/ \ Batch size per device = 2 | Gradient Accumulation steps = 4
\ / Total batch size = 8 | Total steps = 156
"-____-" Number of trainable parameters = 10,383,360
wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: chris382. Use `wandb login --relogin` to force relogin
Tracking run with wandb version 0.18.2
Run data is saved locally in /workspace/wandb/run-20240928_164716-8kjxmhj3
Syncing run gemma2-2b-it-esg-2024-09-28-16-47 to Weights & Biases (docs)
View project at https://wandb.ai/chris382/huggingface
View run at https://wandb.ai/chris382/huggingface/runs/8kjxmhj3
[156/156 13:50, Epoch 2/3]
Step Training Loss Validation Loss
5 0.510000 0.288729
10 0.182900 0.077718
15 0.075600 0.047878
20 0.076000 0.044263
25 0.048800 0.040280
30 0.047800 0.037031
35 0.068300 0.033870
40 0.073200 0.034806
45 0.066400 0.036861
50 0.031900 0.032882
55 0.044000 0.034806
60 0.050800 0.037792
65 0.025600 0.034382
70 0.040600 0.034536
75 0.025800 0.033386
80 0.020400 0.033675
85 0.030500 0.034336
90 0.034500 0.034875
95 0.035200 0.035417
100 0.037200 0.033550
105 0.026700 0.035225
110 0.030800 0.037146
115 0.016100 0.036048
120 0.014800 0.034015
125 0.016900 0.033881
130 0.015900 0.034430
135 0.022800 0.035105
140 0.027400 0.035405
145 0.011300 0.035336
150 0.016100 0.035120
155 0.015300 0.035540
TrainOutput(global_step=156, training_loss=0.055958470449042626, metrics={'train_runtime': 856.8681, 'train_samples_per_second': 1.467, 'train_steps_per_second': 0.182, 'total_flos': 2.5379759787734016e+16, 'train_loss': 0.055958470449042626, 'epoch': 2.9714285714285715})
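# Inference sketch (assumed usage, not part of the logged run): classify a new passage with the
# fine-tuned adapter using the same prompt template. The sample text below is purely illustrative.
FastLanguageModel.for_inference(model) # enable Unsloth's fast inference path
sample_text = "We commit to achieving net zero greenhouse gas emissions across our operations by 2040."
inference_messages = [{"role": "user", "content": prompt.format(sample_text)}]
input_ids = tokenizer.apply_chat_template(
    inference_messages,
    add_generation_prompt = True,
    return_tensors = "pt",
).to(model.device)
generated = model.generate(input_ids = input_ids, max_new_tokens = 128, use_cache = True)
print(tokenizer.decode(generated[0][input_ids.shape[-1]:], skip_special_tokens = True))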