|
#!/bin/bash |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Multi-node training launcher.
#
# Starts one `accelerate launch` per node through `srun`, inside the container
# image named by CONTAINER_IMAGE, and runs the training script named by CMD.
#
# Required environment (normally provided by the scheduler):
#   SLURM_NNODES        number of nodes in the allocation
#   SLURM_JOB_NODELIST  compact node list of the allocation
#
# Exit status: the exit status of `srun`, so a failed training run is reported
# as a failed job instead of being masked by the final log line.
# ---------------------------------------------------------------------------

# Print a diagnostic on stderr and abort the job with a non-zero status.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# --- Host environment -------------------------------------------------------
module purge
module load slurm

LAB_HOME=/mnt/home/ntuspeechlabtaipei1

source "${LAB_HOME}/miniconda3/etc/profile.d/conda.sh"
conda activate base

# --- Job parameters ---------------------------------------------------------
# NOTE: relative path, resolved against the directory the job runs in.
CONTAINER_IMAGE="./eric/trl.sqsh"
GPUS_PER_NODE=8

# Log the allocation before validating it, so a broken environment is still
# visible in the job output. The node list is quoted: lists such as
# "gpu[01-04]" are glob patterns and must not undergo filename expansion.
echo "SLURM_NNODES=${SLURM_NNODES}"
echo "NODELIST=${SLURM_JOB_NODELIST}"
echo "SLURM_NODEID=${SLURM_NODEID}"
echo "SLURM_ARRAY_TASK_ID=${SLURM_ARRAY_TASK_ID}"

# Fail fast outside an allocation; otherwise the process count below would
# silently evaluate to 0.
: "${SLURM_NNODES:?must be set (run this script inside a Slurm allocation)}"
: "${SLURM_JOB_NODELIST:?must be set (run this script inside a Slurm allocation)}"

# --- Rendezvous endpoint ----------------------------------------------------
# The first host of the allocation is used as the main process address.
# Assignment is separate from `export` so a scontrol failure is not masked.
job_hosts=$(scontrol show hostnames "${SLURM_JOB_NODELIST}") \
  || die "scontrol could not expand node list '${SLURM_JOB_NODELIST}'"
MASTER_ADDR=${job_hosts%%$'\n'*}
[[ -n "${MASTER_ADDR}" ]] || die "node list '${SLURM_JOB_NODELIST}' expanded to no hosts"
export MASTER_ADDR
export MASTER_PORT=12345

export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 makes CUDA kernel launches synchronous.
# It is normally a debugging aid and slows training; confirm it is intended.
export CUDA_LAUNCH_BLOCKING=1

# --- Library search path ----------------------------------------------------
# Replaces (does not extend) any inherited LD_LIBRARY_PATH. Order matters:
# earlier directories win.
ld_dirs=(
  "${LAB_HOME}/miniconda3/lib64"
  "${LAB_HOME}/local/lib"
  "${LAB_HOME}/miniconda3/envs/whisper/lib"
  /usr/local/cuda/lib64
  /usr/local/cuda/compat/lib.real
  /usr/local/lib/python3.10/dist-packages/torch/lib
  /usr/local/lib/python3.10/dist-packages/torch_tensorrt/lib
  /usr/local/cuda/compat/lib
  /usr/local/nvidia/lib
  /usr/local/nvidia/lib64
)
LD_LIBRARY_PATH=$(IFS=:; printf '%s' "${ld_dirs[*]}")
export LD_LIBRARY_PATH

# --- srun options -----------------------------------------------------------
# Kept as an array so each option stays exactly one argument.
SRUN_ARGS=(
  --wait=60
  --kill-on-bad-exit=1
  --mpi=pmix
  "--container-image=${CONTAINER_IMAGE}"
  --container-writable
  "--container-mounts=${LAB_HOME}/:${LAB_HOME}/,${LAB_HOME}/.cache:/root/.cache"
)

# --- Command executed by every srun task ------------------------------------
# Runs inside the task's shell before the launcher: sets the distributed
# timeout and activates conda there as well.
PRE_LAUNCH="export TORCH_DISTRIBUTED_TIMEOUT=7200; source ${LAB_HOME}/miniconda3/etc/profile.d/conda.sh; conda activate base;"

NUM_PROCESSES=$((SLURM_NNODES * GPUS_PER_NODE))

# Everything is expanded here, on the submitting shell, except
# \${SLURM_NODEID}: it is escaped on purpose so that each task's own shell
# expands it and every node passes its own rank.
LAUNCHER="accelerate launch \
  --num_processes ${NUM_PROCESSES} \
  --num_machines ${SLURM_NNODES} \
  --machine_rank \${SLURM_NODEID} \
  --rdzv_backend c10d \
  --main_process_ip ${MASTER_ADDR} \
  --main_process_port ${MASTER_PORT} \
  --deepspeed_config_file ${LAB_HOME}/ds_config.json \
  --deepspeed_hostfile ${LAB_HOME}/eric/hostfile \
  --deepspeed_multinode_launcher standard \
  --dynamo_backend no \
  --use_deepspeed \
  --mixed_precision bf16 \
"

CMD="${LAB_HOME}/train_conv_slurm_full.py"

# --- Launch -----------------------------------------------------------------
srun "${SRUN_ARGS[@]}" bash -c "${PRE_LAUNCH}${LAUNCHER} ${CMD}"
srun_status=$?

echo "END TIME: $(date)"

# Propagate the training result so the scheduler records failures as failures.
exit "${srun_status}"
|
|
|
|
|
|