tomaarsen (HF staff) committed
Commit 1aa7c41 · verified · 1 Parent(s): d6235f6

Create train_script.py

Files changed (1)
  1. train_script.py +190 -0
train_script.py ADDED
@@ -0,0 +1,190 @@
+ import logging
+ import traceback
+ from collections import defaultdict
+
+ from datasets import load_dataset
+ from datasets.load import load_from_disk
+ from torch import nn
+ import torch
+
+ from sentence_transformers import SentenceTransformer
+ from sentence_transformers.cross_encoder import CrossEncoder
+ from sentence_transformers.cross_encoder.evaluation.CENanoBEIREvaluator import CENanoBEIREvaluator
+ from sentence_transformers.cross_encoder.evaluation.CERerankingEvaluator import CERerankingEvaluator
+ from sentence_transformers.cross_encoder.losses.BinaryCrossEntropyLoss import BinaryCrossEntropyLoss
+ from sentence_transformers.cross_encoder.losses.CachedMultipleNegativesRankingLoss import (
+     CachedMultipleNegativesRankingLoss,
+ )
+ from sentence_transformers.cross_encoder.trainer import CrossEncoderTrainer
+ from sentence_transformers.cross_encoder.training_args import CrossEncoderTrainingArguments
+ from sentence_transformers.evaluation.SequentialEvaluator import SequentialEvaluator
+ from sentence_transformers.training_args import BatchSamplers
+ from sentence_transformers.util import mine_hard_negatives
+
+
+ def main():
+     model_name = "answerdotai/ModernBERT-base"
+
+     # Set the log level to INFO to get more information
+     logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)
+
+     train_batch_size = 64
+     num_epochs = 1
+
+     # 1. Define our CrossEncoder model
+     model = CrossEncoder(model_name)
+     print("Model max length:", model.max_length)
+     print("Model num labels:", model.num_labels)
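+     # Note: ModernBERT-base is a plain encoder without a sequence-classification head, so CrossEncoder
+     # initializes a fresh head here (num_labels is expected to default to 1, i.e. a single relevance score).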
+
+     # 2. Load the Natural Questions dataset:
+     logging.info("Read train dataset")
+
+     # Static embedding model, used here only to mine hard negatives
+     embedding_model = SentenceTransformer("sentence-transformers/static-retrieval-mrl-en-v1")
+
+     full_dataset = load_dataset("sentence-transformers/natural-questions", split="train")
+     dataset_dict = full_dataset.train_test_split(test_size=1_000, seed=12)
+     train_dataset = dataset_dict["train"]
+     eval_dataset = dataset_dict["test"]
+
+     # '''
+     # Mine up to 30 hard negatives per eval query from the full answer corpus, for the reranking evaluator
+     hard_eval_dataset = mine_hard_negatives(
+         eval_dataset,
+         embedding_model,
+         corpus=full_dataset["answer"],
+         num_negatives=30,
+         batch_size=256,
+         positive_among_negatives=True,
+         as_triplets=False,
+         # faiss_batch_size=4096,
+         use_faiss=True,
+     )
+     print(hard_eval_dataset)
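+     # With as_triplets=False, the mined eval set keeps one row per query: the "query" and "answer" columns
+     # plus one extra column per mined negative, which the CERerankingEvaluator below consumes.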
+     # # breakpoint()
+     # indices = []
+     # for sample in eval_dataset:
+     #     try:
+     #         idx = list(sample.values())[2:].index(sample["answer"])
+     #     except ValueError:
+     #         idx = len(eval_dataset.column_names) - 2
+     #     indices.append(idx)
+     # print(sum(indices) / len(indices))
+     # breakpoint()
+
+     hard_train_dataset = mine_hard_negatives(
+         train_dataset,
+         embedding_model,
+         num_negatives=5,  # 5 negatives per question-answer pair
+         margin=0,  # Negatives must not be more similar to the query than the positive is
+         range_min=0,  # Do not skip any of the most similar candidates
+         range_max=100,  # Consider only the 100 most similar candidates
+         sampling_strategy="top",  # Take the hardest (most similar) negatives from that range
+         batch_size=256,
+         as_triplets=False,  # We want 7 columns: query, positive, and 5 negative columns
+         use_faiss=True,
+     )
+     # breakpoint()
+     # hard_train_dataset.save_to_disk("nq-train-hard-negatives")
+     # hard_eval_dataset.save_to_disk("nq-eval-hard-negatives")
+     # '''
+     # hard_train_dataset = load_from_disk("nq-train-hard-negatives")
+     # hard_eval_dataset = load_from_disk("nq-eval-hard-negatives")
+
+     def mapper(batch):
+         # Input columns: query, positive answer, then zero or more mined negative columns
+         batch_size = len(batch["query"])
+         num_negatives = len(batch) - 2
+         num_candidates = len(batch) - 1
+         # Flatten into (query, response, label) pairs: label 1 for the positive, 0 for every negative
+         return {
+             "query": batch["query"] * num_candidates,
+             "response": sum(list(batch.values())[1:], []),
+             "label": [1] * batch_size + [0] * num_negatives * batch_size,
+         }
+
+     hard_train_dataset = hard_train_dataset.map(mapper, batched=True, remove_columns=hard_train_dataset.column_names)
+     eval_dataset = eval_dataset.map(mapper, batched=True, remove_columns=eval_dataset.column_names)
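+     # Note that the raw eval split (query + answer only) is mapped here, so the trainer's eval_dataset
+     # holds only positive pairs; the mined hard_eval_dataset is used separately by the evaluator below.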
+
+     # 3. Define our training loss
+     loss = BinaryCrossEntropyLoss(model=model, pos_weight=torch.tensor(5))
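+     # pos_weight=5 counterbalances the 1:5 positive-to-negative ratio produced by num_negatives=5 above,
+     # so the positive pairs are not overwhelmed by the negatives in the binary cross-entropy loss.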
+
+     # 4. Define the evaluators. We combine a CERerankingEvaluator on our mined NQ dev set with the
+     # CENanoBEIREvaluator, a light-weight reranking evaluator over a few English NanoBEIR datasets
+     reranking_evaluator = CERerankingEvaluator(
+         samples=[
+             {
+                 "query": sample["query"],
+                 "positive": [sample["answer"]],
+                 "negative": [sample[column_name] for column_name in hard_eval_dataset.column_names[2:]],
+             }
+             for sample in hard_eval_dataset
+         ],
+         batch_size=train_batch_size,
+         negatives_are_ranked=True,
+         name="nq-dev",
+     )
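+     # Above, hard_eval_dataset.column_names[2:] picks every column after "query" and "answer", i.e. the
+     # mined negatives, so each evaluation sample ranks the true answer against its hard negatives.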
+     nano_beir_evaluator = CENanoBEIREvaluator(
+         dataset_names=["msmarco", "nfcorpus", "nq"],
+         batch_size=train_batch_size,
+     )
+     evaluator = SequentialEvaluator([reranking_evaluator, nano_beir_evaluator])
+     evaluator(model)
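+     # Evaluate once before training to get a baseline for the untrained reranker; this makes the effect
+     # of fine-tuning visible in the logs and in the model card.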
+
+     # 5. Define the training arguments
+     short_model_name = model_name if "/" not in model_name else model_name.split("/")[-1]
+     run_name = f"reranker-{short_model_name}-nq-bce-static-retriever-hardest"
+     args = CrossEncoderTrainingArguments(
+         # Required parameter:
+         output_dir=f"models/{run_name}",
+         # Optional training parameters:
+         num_train_epochs=num_epochs,
+         per_device_train_batch_size=train_batch_size,
+         per_device_eval_batch_size=train_batch_size,
+         learning_rate=2e-5,
+         warmup_ratio=0.1,
+         fp16=False,  # Set to False if you get an error that your GPU can't run on FP16
+         bf16=True,  # Set to True if you have a GPU that supports BF16
+         dataloader_num_workers=4,
+         load_best_model_at_end=True,
+         metric_for_best_model="eval_nq-dev_ndcg@10",
+         # Optional tracking/debugging parameters:
+         eval_strategy="steps",
+         eval_steps=1000,
+         save_strategy="steps",
+         save_steps=1000,
+         save_total_limit=2,
+         logging_steps=200,
+         logging_first_step=True,
+         run_name=run_name,  # Will be used in W&B if `wandb` is installed
+         seed=12,
+     )
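+     # eval_steps and save_steps are aligned at 1000 because load_best_model_at_end requires the save and
+     # eval schedules to line up; the best checkpoint is then selected by eval_nq-dev_ndcg@10.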
+
+     # 6. Create the trainer & start training
+     trainer = CrossEncoderTrainer(
+         model=model,
+         args=args,
+         train_dataset=hard_train_dataset,
+         eval_dataset=eval_dataset,
+         loss=loss,
+         evaluator=evaluator,
+     )
+     trainer.train()
+
+     # 7. Evaluate the final model; these results are useful to include in the model card
+     evaluator(model)
+
+     # 8. Save the final model
+     final_output_dir = f"models/{run_name}/final"
+     model.save_pretrained(final_output_dir)
+
+     # 9. (Optional) save the model to the Hugging Face Hub!
+     # It is recommended to run `huggingface-cli login` to log into your Hugging Face account first
+     try:
+         model.push_to_hub(run_name)
+     except Exception:
+         logging.error(
+             f"Error uploading model to the Hugging Face Hub:\n{traceback.format_exc()}To upload it manually, you can run "
+             f"`huggingface-cli login`, followed by loading the model using `model = CrossEncoder({final_output_dir!r})` "
+             f"and saving it using `model.push_to_hub('{run_name}')`."
+         )
+
+
+ if __name__ == "__main__":
+     main()