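"""Domain-adapt a German BERT masked language model (deepset/gbert-base) to Twitter data:
tokenize the tweets, concatenate and chunk them, fine-tune with masked language modeling,
report perplexity before and after training, and push the result to the Hugging Face Hub."""
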
import collections
import math
import os

import numpy as np
import torch
from datasets import Dataset
from huggingface_hub import login
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    default_data_collator,
)


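# Tokenize the raw tweet text; word_ids are kept so that sub-tokens can later be grouped
# back into whole words by the whole-word masking collator.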
def tokenize_function(examples):
    # No padding here: group_texts() concatenates and re-chunks the token stream,
    # so padding at this stage would only scatter [PAD] tokens through the chunks.
    result = tokenizer(examples["text"], truncation=True)
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    print(f"tokenize function result: {result}")
    return result


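# Concatenate all tokenized tweets and re-split the token stream into fixed-length chunks
# of chunk_size tokens (chunk_size is set in __main__).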
def group_texts(examples):
    # Concatenate every field (input_ids, attention_mask, word_ids, ...) across the batch.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the remainder so that every chunk is exactly chunk_size tokens long.
    total_length = (total_length // chunk_size) * chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Keep a copy of the input ids as labels; the data collator decides which positions
    # are masked and which are ignored (-100).
    result["labels"] = result["input_ids"].copy()
    print(f"group texts result: {result}")
    return result


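# Alternative collator that masks whole words (all sub-tokens of a word at once) with
# probability wwm_probability. It is defined here but not passed to the Trainer below,
# which uses DataCollatorForLanguageModeling instead. Switching would roughly require
# (an assumption, not wired in): data_collator=whole_word_masking_data_collator plus
# remove_unused_columns=False in TrainingArguments so the word_ids column reaches the collator.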
def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Map each word index to the positions of its sub-tokens.
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly select whole words to mask.
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                # Predict the original token only at masked positions.
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)


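# Fine-tune the masked language model and report perplexity (exp of the evaluation loss)
# before and after training. Relies on the globals set up in __main__ below.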
def train_model():
    batch_size = 64
    # Show the training loss roughly once per epoch (single-device assumption).
    logging_steps = len(tw_dataset["train"]) // batch_size
    model_name = model_checkpoint.split("/")[-1]

    training_args = TrainingArguments(
        output_dir=f"{model_name}-finetuned-twitter",
        save_total_limit=3,
        overwrite_output_dir=True,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        weight_decay=0.01,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        push_to_hub=True,
        fp16=False,
        logging_steps=logging_steps,
    )

    # Return torch tensors, placed on the GPU when one is available.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tw_dataset["train"].set_format("torch", device=device)
    tw_dataset["test"].set_format("torch", device=device)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tw_dataset["train"],
        eval_dataset=tw_dataset["test"],
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    # Perplexity is exp(cross-entropy loss); report it before and after fine-tuning.
    eval_results = trainer.evaluate()
    print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
    trainer.train()
    eval_results = trainer.evaluate()
    print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

    trainer.push_to_hub()


if __name__ == "__main__":
    # Read the Hugging Face access token from the environment instead of hardcoding it.
    token = os.environ["HF_TOKEN"]
    login(token=token)

    model_checkpoint = "deepset/gbert-base"
    model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    tw_dataset = Dataset.from_file('../data/complete_sosec_dataset/data.arrow')
    tw_dataset = tw_dataset.rename_column('topic', 'labels')
    print(f"tw_dataset sample: {tw_dataset}")

    # Tokenize the tweets and drop all raw metadata columns; only the tokenizer outputs
    # (plus word_ids) are needed from here on.
    tokenized_datasets = tw_dataset.map(
        tokenize_function, batched=True,
        remove_columns=["text", "labels", 'id', 'sentiment', 'annotator', 'comment', 'topic_alt', 'lang',
                        'conversation_id', 'created_at', 'author_id', 'query', 'public_metrics.like_count',
                        'public_metrics.quote_count', 'public_metrics.reply_count', 'public_metrics.retweet_count',
                        'public_metrics.impression_count', '__index_level_0__']
    )
    print(f"tokenized_datasets: {tokenized_datasets}")

    chunk_size = 128
    lm_datasets = tokenized_datasets.map(group_texts, batched=True)
    print(f"lm_datasets: {lm_datasets}")

    tw_dataset = lm_datasets.train_test_split(
        train_size=0.9, test_size=0.1, seed=42
    )

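    # Random token-level masking with the standard 15% masking probability (BERT's default).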
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
    print(f"data collator: {data_collator}")
    # Only used by whole_word_masking_data_collator, which is not passed to the Trainer here.
    wwm_probability = 0.2

    train_model()
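
    # Optional sanity check after training (a sketch, not part of the original pipeline;
    # the local output directory name is an assumption based on output_dir above):
    #   from transformers import pipeline
    #   mask_filler = pipeline("fill-mask", model="gbert-base-finetuned-twitter")
    #   for pred in mask_filler(f"Das Wetter ist heute {mask_filler.tokenizer.mask_token}."):
    #       print(pred["sequence"])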