import math
import os
from datasets import Dataset
from huggingface_hub import login
from transformers import (
    TrainingArguments,
    DataCollatorForLanguageModeling,
    AutoTokenizer,
    AutoModelForMaskedLM,
    Trainer,
    default_data_collator
)
import torch
import collections
import numpy as np

def tokenize_function(examples):
    # No padding here: texts are concatenated and re-chunked in group_texts, so padding
    # would only inject [PAD] tokens into the middle of the training chunks.
    result = tokenizer(examples["text"], truncation=True)
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    print(f"tokenize function result: {result}")
    return result


def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column (a copy of the inputs; the collator decides what gets masked)
    result["labels"] = result["input_ids"].copy()
    print(f"group texts result: {result}")
    return result
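
# Minimal sketch of what group_texts does (illustrative token ids, not from the dataset):
# with chunk_size = 4, two tokenized tweets
#     input_ids = [[101, 7, 8, 102], [101, 9, 10, 11, 102]]
# are concatenated to [101, 7, 8, 102, 101, 9, 10, 11, 102] (length 9), trimmed to the
# largest multiple of chunk_size (8 tokens), and re-split into
#     [[101, 7, 8, 102], [101, 9, 10, 11]]
# so the trailing partial chunk is dropped and "labels" starts as a copy of "input_ids".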


def whole_word_masking_data_collator(features):
    # When using this whole-word masking collator, set remove_unused_columns=False in
    # TrainingArguments so the word_ids column is not dropped before it reaches the collator.
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)
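
# A sketch (not wired in below) of how this collator would be used: pass it to the Trainer
# and keep the word_ids column so it survives preprocessing, e.g.
#     training_args = TrainingArguments(..., remove_unused_columns=False)
#     trainer = Trainer(..., data_collator=whole_word_masking_data_collator)
# As written, the script trains with DataCollatorForLanguageModeling (token-level masking),
# so whole_word_masking_data_collator and wwm_probability are currently unused.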

def train_model():

    batch_size = 64
    # Show the training loss with every epoch
    logging_steps = len(tw_dataset["train"]) // batch_size
    model_name = model_checkpoint.split("/")[-1]

    training_args = TrainingArguments(
        output_dir=f"{model_name}-finetuned-twitter",
        save_total_limit=3,
        overwrite_output_dir=True,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        weight_decay=0.01,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        push_to_hub=True,
        fp16=False,  # set to True for mixed-precision training on a GPU with fp16 support
        logging_steps=logging_steps,
        # remove_unused_columns=False,  # needed only when training with whole_word_masking_data_collator
    )
    tw_dataset["train"].set_format("torch", device="cuda")
    tw_dataset["test"].set_format("torch", device="cuda")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tw_dataset["train"],
        eval_dataset=tw_dataset["test"],
        data_collator=data_collator,
        tokenizer=tokenizer,
    )


    eval_results = trainer.evaluate()
    print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
    trainer.train()
    eval_results = trainer.evaluate()
    print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

    trainer.push_to_hub()


if __name__ == "__main__":
    token = "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"
    login(token=token)
    model_checkpoint = "deepset/gbert-base"
    model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    tw_dataset = Dataset.from_file('../data/complete_sosec_dataset/data.arrow')
    tw_dataset = tw_dataset.rename_column('topic', 'labels')
    #sample dataset
    #tw_dataset = tw_dataset.train_test_split(
    #    train_size=1000, test_size=10, seed=42
    #)
    print(f"tw_dataset sample: {tw_dataset}")
    tokenized_datasets = tw_dataset.map(
        tokenize_function, batched=True,
        remove_columns=["text", "labels", 'id', 'sentiment', 'annotator', 'comment', 'topic_alt', 'lang',
                        'conversation_id', 'created_at', 'author_id', 'query', 'public_metrics.like_count',
                        'public_metrics.quote_count', 'public_metrics.reply_count', 'public_metrics.retweet_count',
                        'public_metrics.impression_count', '__index_level_0__']
    )
    print(f"tokenized_datsets: {tokenized_datasets}")
    chunk_size = 128

    lm_datasets = tokenized_datasets.map(group_texts, batched=True)
    print(f"lm_datasets: {lm_datasets}")

    tw_dataset = lm_datasets.train_test_split(
        train_size=0.9, test_size=0.1, seed=42
    )


    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
    print(f"data collator: {data_collator}")
    wwm_probability = 0.2  # masking probability for whole_word_masking_data_collator (unused with the collator above)

    train_model()