|
GPU = NVIDIA RTX 4000 Ada Generation. Max memory = max_memoryGB. |
|
16.811 GB of memory reserved. |
|
|
|
# Import unsloth before transformers/trl so its patches are picked up.
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import train_on_responses_only

from transformers import TrainingArguments, DataCollatorForSeq2Seq
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from datasets import load_dataset

import torch
import datetime
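
The cells below reference model, tokenizer, and max_seq_length before defining them, so the base-model loading step appears to have been cut from this excerpt. A minimal sketch of that step, assuming Unsloth's 4-bit build of Gemma-2 2B-it and a 2048-token context (both assumptions, not taken from the original run):

# Hypothetical loading cell; model name, context length, and 4-bit flag are assumptions.
max_seq_length = 2048  # must match the max_seq_length passed to SFTTrainer below
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-2-2b-it-bnb-4bit",  # assumed from base_model_name = "gemma2-2b-it"
    max_seq_length = max_seq_length,
    dtype = None,          # auto-detect; bfloat16 on Ada-generation GPUs
    load_in_4bit = True,   # assumed QLoRA-style 4-bit loading
)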
|
|
|
|
|
model = FastLanguageModel.get_peft_model( |
|
model, |
|
r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128 |
|
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", |
|
"gate_proj", "up_proj", "down_proj"], |
|
lora_alpha = 16, |
|
lora_dropout = 0.1, # Supports any, but = 0 is optimized |
|
bias = "none", # Supports any, but = "none" is optimized |
|
use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context |
|
random_state = 3407, |
|
use_rslora = False, # We support rank stabilized LoRA |
|
loftq_config = None, # And LoftQ |
|
) |
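
The Unsloth banner further down reports 10,383,360 trainable parameters. A quick optional check (not part of the original cell) makes that visible right after the adapters are attached:

# PEFT models can print trainable vs. total parameter counts.
model.print_trainable_parameters()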
|
|
|
dataset = load_dataset('chris7374/esg-net-zero', revision='100_per_class_v3') |
|
val_dataset = load_dataset('chris7374/esg-net-zero', revision='validation') |
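
A quick look at the loaded data (an optional check, not in the original notebook) confirms the column names the formatting function below relies on:

# Both revisions load as DatasetDicts with a 'train' split.
print(dataset)
print(dataset["train"].column_names)  # should include 'end_target', 'end_target_year',
                                      # 'custom_text', 'custom_short_description'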
|
|
|
prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. |
|
|
|
### Instruction: |
|
You are an expert ESG (Environmental, Social, and Governance) analyst who conducts ESG research by analyzing texts to identify the presence of climate balance targets. Your primary task is to classify identified targets into one of four predefined classes and determine the target year for the climate balance target. Only consider overall climate balance targets, meaning that they are company-wide. |
|
The possible classes are “Carbon neutral(ity)”, “Emissions reduction target”, “Net zero”, and “No target”. |
|
Each class has equal importance, and the correct classification should reflect the most explicit target mentioned in the text. In cases where multiple classes are present: |
|
• “Net zero” should only be prioritized if explicitly mentioned as a company’s overarching target. |
|
• “Carbon neutral(ity)” takes precedence over “Emissions reduction target” only if it is the primary focus of the text. |
|
• “Emissions reduction target” should be classified if it is directly stated and not overshadowed by “Net zero” or “Carbon neutral(ity)” commitments. |
|
• If no explicit target is mentioned, classify as “No target”. |
|
Ensure the classification is based on explicit information from the text, without assuming that one target implies another unless clearly stated. |
|
|
|
### Context: |
|
{} |
|
|
|
### Response Formatting: |
|
Only answer in the following XML format:\n<answer><classification><end_target>Target</end_target></classification><extraction><end_target_year>Year</end_target_year></extraction><quote>...</quote></answer> |
|
""" |
|
|
|
output = """ |
|
<answer> |
|
<classification> |
|
<end_target>{}</end_target> |
|
</classification> |
|
<extraction> |
|
<end_target_year>{}</end_target_year> |
|
</extraction> |
|
<quote>{}</quote> |
|
</answer> |
|
""" |
|
|
|
def formatting_prompts_func(examples):
    end_targets = examples['end_target']
    end_target_years = examples['end_target_year']
    contexts = examples['custom_text']
    quotes = examples['custom_short_description']
    texts = []
    for end_target, end_target_year, context, quote in zip(end_targets, end_target_years, contexts, quotes):
        messages = [
            {"role": "user", "content": prompt.format(context)},
            {"role": "assistant", "content": output.format(end_target, end_target_year, quote)},
        ]
        # https://huggingface.co./docs/transformers/main/chat_templating
        text = tokenizer.apply_chat_template(
            messages,
            tokenize = False,                # return the formatted string, not token ids
            add_generation_prompt = False,   # the assistant turn is already part of messages
        )
        texts.append(text)
    return {"text": texts}
|
|
|
dataset = dataset.map(formatting_prompts_func, batched = True)
train_data = dataset['train'].shuffle(seed = 1234)   # fixed seed for reproducible runs

val_dataset = val_dataset.map(formatting_prompts_func, batched = True)
val_dataset = val_dataset['train'].shuffle(seed = 1234)   # same fixed seed for reproducible runs
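
Before training, it helps to eyeball one fully formatted example (an optional check, not in the original notebook) to confirm the Gemma chat template wrapped the prompt and the XML answer as intended:

# Should show <start_of_turn>user ... <end_of_turn> followed by <start_of_turn>model
# and the filled-in <answer>...</answer> block.
print(train_data[0]["text"])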
|
|
|
project = "esg" |
|
base_model_name = "gemma2-2b-it" |
|
run_name = base_model_name + "-" + project |
|
output_dir = "outputs" + run_name |
|
|
|
num_train_epochs = 3 |
|
batch_size = 2 |
|
gradient_accumulation_steps = 4 |
|
eval_frequency = 1 |
|
|
|
# Needed for training on completions only. These turn markers are specific to the Gemma-2 chat template and must be updated for other models.
|
instruction_template = "<start_of_turn>user" |
|
response_template = "<start_of_turn>model" |
|
collator = DataCollatorForCompletionOnlyLM(
    instruction_template = instruction_template,
    response_template = response_template,
    tokenizer = tokenizer,
    mlm = False,
)
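
A small optional check (not in the original code) that the completion-only collator really masks the prompt: every label up to and including the <start_of_turn>model marker should be -100, so only the XML answer contributes to the loss.

# Tokenize one formatted example and run it through the collator.
enc = tokenizer(train_data[0]["text"], add_special_tokens = False)
batch = collator([{"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"]}])
masked = (batch["labels"][0] == -100).sum().item()
print(f"{masked} of {batch['labels'].shape[1]} labels masked (prompt tokens)")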
|
trainer = SFTTrainer( |
|
model = model, |
|
tokenizer = tokenizer, |
|
train_dataset=train_data, |
|
eval_dataset=val_dataset, |
|
dataset_text_field="text", |
|
max_seq_length = max_seq_length, |
|
dataset_num_proc = 2, |
|
packing = False, # Packing can make training up to 5x faster for short sequences, but it does not work with the completion-only collator.
|
data_collator=collator, |
|
#data_collator=DataCollatorForSeq2Seq(tokenizer = tokenizer), |
|
args = TrainingArguments( |
|
per_device_train_batch_size = batch_size, |
|
per_device_eval_batch_size = 2, |
|
gradient_accumulation_steps = gradient_accumulation_steps, |
|
warmup_steps = 0,           # 0 here means warmup_ratio below takes effect
learning_rate = 2e-4,
warmup_ratio = 0.1,         # warm up the LR over the first 10% of training steps
|
fp16 = not is_bfloat16_supported(), |
|
bf16 = is_bfloat16_supported(), |
|
num_train_epochs=num_train_epochs, |
|
optim = "adamw_8bit", |
|
weight_decay = 0.01, |
|
lr_scheduler_type = "cosine", |
|
do_eval=True, |
|
eval_strategy="steps", |
|
eval_steps = 5, |
|
save_strategy="epoch", |
|
report_to="wandb", |
|
logging_steps=5, |
|
seed = 3407, |
|
output_dir = output_dir, |
|
run_name=f"{run_name}-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')}" |
|
), |
|
) |
|
#trainer = train_on_responses_only(trainer, instruction_template,response_template) |
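
The excerpt jumps straight to the training log, so the launch cell is presumably omitted; it would be the usual call:

# Start fine-tuning; the returned TrainOutput matches the one shown at the end of this section.
trainer_stats = trainer.train()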
|
|
|
|
|
|
|
==((====))== Unsloth - 2x faster free finetuning | Num GPUs = 1 |
|
\\ /| Num examples = 419 | Num Epochs = 3 |
|
O^O/ \_/ \ Batch size per device = 2 | Gradient Accumulation steps = 4 |
|
\ / Total batch size = 8 | Total steps = 156 |
|
"-____-" Number of trainable parameters = 10,383,360 |
|
wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information. |
|
wandb: Currently logged in as: chris382. Use `wandb login --relogin` to force relogin |
|
|
|
Tracking run with wandb version 0.18.2 |
|
Run data is saved locally in /workspace/wandb/run-20240928_164716-8kjxmhj3 |
|
Syncing run gemma2-2b-it-esg-2024-09-28-16-47 to Weights & Biases (docs) |
|
View project at https://wandb.ai/chris382/huggingface |
|
View run at https://wandb.ai/chris382/huggingface/runs/8kjxmhj3 |
|
[156/156 13:50, Epoch 2/3] |
|
Step Training Loss Validation Loss |
|
5 0.510000 0.288729 |
|
10 0.182900 0.077718 |
|
15 0.075600 0.047878 |
|
20 0.076000 0.044263 |
|
25 0.048800 0.040280 |
|
30 0.047800 0.037031 |
|
35 0.068300 0.033870 |
|
40 0.073200 0.034806 |
|
45 0.066400 0.036861 |
|
50 0.031900 0.032882 |
|
55 0.044000 0.034806 |
|
60 0.050800 0.037792 |
|
65 0.025600 0.034382 |
|
70 0.040600 0.034536 |
|
75 0.025800 0.033386 |
|
80 0.020400 0.033675 |
|
85 0.030500 0.034336 |
|
90 0.034500 0.034875 |
|
95 0.035200 0.035417 |
|
100 0.037200 0.033550 |
|
105 0.026700 0.035225 |
|
110 0.030800 0.037146 |
|
115 0.016100 0.036048 |
|
120 0.014800 0.034015 |
|
125 0.016900 0.033881 |
|
130 0.015900 0.034430 |
|
135 0.022800 0.035105 |
|
140 0.027400 0.035405 |
|
145 0.011300 0.035336 |
|
150 0.016100 0.035120 |
|
155 0.015300 0.035540 |
|
|
|
TrainOutput(global_step=156, training_loss=0.055958470449042626, metrics={'train_runtime': 856.8681, 'train_samples_per_second': 1.467, 'train_steps_per_second': 0.182, 'total_flos': 2.5379759787734016e+16, 'train_loss': 0.055958470449042626, 'epoch': 2.9714285714285715}) |