Trained using Tevatron, with local code that has not been pushed yet.

# Target batch size per optimizer step, and the learning rate.
bs=32
lr=7e-6

# The per-device batch size passed to the trainer is the target batch size
# divided by the number of gradient-accumulation steps.
gradient_accumulation_steps=1
# Integer division would silently truncate (or divide by zero), so refuse
# values that do not divide the target batch size evenly.
if [ "$gradient_accumulation_steps" -le 0 ] ||
  [ $((bs % gradient_accumulation_steps)) -ne 0 ]; then
  echo "error: bs ($bs) must be a positive multiple of gradient_accumulation_steps ($gradient_accumulation_steps)" >&2
  exit 1
fi
real_bs=$((bs / gradient_accumulation_steps))
echo "real_bs: $real_bs"
echo "expected_bs: $bs"
# Brief pause after printing the sizes (presumably so they can be read before
# the run starts). Plain `1` is used because the `s` suffix is not POSIX.
sleep 1

# Number of training epochs and the teacher model used for distillation.
epoch=5
teacher=crystina-z/monoXLMR.pft-msmarco

# Training dataset and the short tag that identifies it in the run name.
dataset=Tevatron/msmarco-passage
dataset_name=enMarco

# The run directory name encodes the main hyperparameters. `${teacher##*/}`
# keeps the part after the last slash (what `basename` returned here) without
# spawning a subshell.
# NOTE(review): the literal "x2" presumably mirrors --train_n_passages 2 — confirm.
# NOTE(review): commit_id is not set anywhere in the visible part of this
# script; if it is empty the directory name ends with a trailing ".".
output_dir="margin-mse.distill/teacher-${teacher##*/}.student-mbert.epoch-${epoch}.${bs}x2.lr.${lr}.data-${dataset_name}.${commit_id}"
# Quoted so a value containing spaces or glob characters cannot split the path.
mkdir -p -- "$output_dir"

# Run the distillation training script with the settings defined above.
#
# --gradient_accumulation_steps now takes the same variable that real_bs was
# derived from. The previous hard-coded 4 multiplied the batch size per
# optimizer step to 4 * real_bs instead of the echoed expected_bs.
# The last option carries no trailing backslash, so whatever follows this
# command in the file is not appended to the python argument list.
#
# NOTE(review): `device` is not set in the visible part of this script; an
# empty CUDA_VISIBLE_DEVICES hides every GPU from the process — confirm it is
# provided by the caller.
CUDA_VISIBLE_DEVICES="$device" WANDB_PROJECT=distill \
python examples/distill_marginmse/distil_train.py \
  --output_dir "$output_dir" \
  --model_name_or_path bert-base-multilingual-cased \
  --teacher_model_name_or_path "$teacher" \
  --save_steps 1000 \
  --dataset_name "$dataset" \
  --fp16 \
  --per_device_train_batch_size "$real_bs" \
  --gradient_accumulation_steps "$gradient_accumulation_steps" \
  --train_n_passages 2 \
  --learning_rate "$lr" \
  --q_max_len 16 \
  --p_max_len 128 \
  --num_train_epochs "$epoch" \
  --logging_steps 500 \
  --overwrite_output_dir \
  --dataloader_num_workers 4
Downloads last month
3
Inference Examples
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social visibility and check back later, or deploy to Inference Endpoints (dedicated) instead.

Dataset used to train crystina-z/marginmse.teacher-monoXLMR.pft-msmarco.epoch-5.32x2.lr.7e-6.pft-msmarco