#!/bin/bash
#SBATCH -N 13
#SBATCH -p tp1-user
#SBATCH --exclusive
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=200
#SBATCH --mem=200G
#SBATCH --gres=gpu:8
#SBATCH --time=30-00:00:00
#SBATCH --output=/mnt/home/ntuspeechlabtaipei1/eric/result/%j-slurm.out
#SBATCH --exclude=cnode3-004,cnode3-019
#
# Slurm batch script: starts one task per node via srun, and each task runs
# `accelerate launch` (DeepSpeed, bf16) on train_conv_slurm_full.py inside the
# container image given by CONTAINER_IMAGE.
#
# The #SBATCH directives above must stay directly after the shebang, one per
# line; sbatch stops reading directives at the first executable command.
#
# The script exits with srun's exit status, so a failed launch is reported to
# Slurm as a failed job instead of being hidden by the final log line.

module purge
module load slurm

source /mnt/home/ntuspeechlabtaipei1/miniconda3/etc/profile.d/conda.sh
conda activate base

CONTAINER_IMAGE="./eric/trl.sqsh"
GPUS_PER_NODE=8

echo "START TIME: $(date)"
echo "SLURM_NNODES=${SLURM_NNODES}"
# Quoted on purpose: compressed node lists contain [..], which the shell would
# otherwise treat as a glob pattern.
echo "NODELIST=${SLURM_JOB_NODELIST}"
echo "SLURM_NODEID=${SLURM_NODEID}"
echo "SLURM_ARRAY_TASK_ID=${SLURM_ARRAY_TASK_ID}"

# The first hostname of the allocation is used as the rendezvous address.
# Assign and export separately so an scontrol failure is not masked by export.
MASTER_ADDR=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1)
if [[ -z "${MASTER_ADDR}" ]]; then
  echo "ERROR: could not resolve MASTER_ADDR from SLURM_JOB_NODELIST='${SLURM_JOB_NODELIST}'" >&2
  exit 1
fi
export MASTER_ADDR
export MASTER_PORT=12345

export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid and is
# expected to slow training; kept as in the original -- confirm it is intended.
export CUDA_LAUNCH_BLOCKING=1

# Library search path, in priority order. The list replaces any inherited
# LD_LIBRARY_PATH rather than extending it.
LD_DIRS=(
  /mnt/home/ntuspeechlabtaipei1/miniconda3/lib64
  /mnt/home/ntuspeechlabtaipei1/local/lib
  /mnt/home/ntuspeechlabtaipei1/miniconda3/envs/whisper/lib
  /usr/local/cuda/lib64
  /usr/local/cuda/compat/lib.real
  /usr/local/lib/python3.10/dist-packages/torch/lib
  /usr/local/lib/python3.10/dist-packages/torch_tensorrt/lib
  /usr/local/cuda/compat/lib
  /usr/local/nvidia/lib
  /usr/local/nvidia/lib64
)
# Join with ':' in a subshell so the script's own IFS is left untouched.
LD_LIBRARY_PATH=$(IFS=:; printf '%s' "${LD_DIRS[*]}")
export LD_LIBRARY_PATH

# srun options as an array so each option is passed as exactly one argument.
# NOTE(review): the --container-* options presumably come from a container
# plugin for srun (e.g. pyxis/enroot) -- confirm against the cluster setup.
SRUN_ARGS=(
  --wait=60
  --kill-on-bad-exit=1
  --mpi=pmix
  "--container-image=${CONTAINER_IMAGE}"
  --container-writable
  --container-mounts=/mnt/home/ntuspeechlabtaipei1/:/mnt/home/ntuspeechlabtaipei1/,/mnt/home/ntuspeechlabtaipei1/.cache:/root/.cache
)

# Shell snippet executed at the start of every task, before the launcher.
PRE_LAUNCH="export TORCH_DISTRIBUTED_TIMEOUT=7200; source /mnt/home/ntuspeechlabtaipei1/miniconda3/etc/profile.d/conda.sh; conda activate base;"

# Launcher command line. ${SLURM_NODEID} is single-quoted so it stays literal
# here and is expanded by the per-task `bash -c`, giving each node its own
# machine rank; everything else is expanded now, in the batch shell.
LAUNCHER_ARGS=(
  accelerate launch
  --num_processes "$((SLURM_NNODES * GPUS_PER_NODE))"
  --num_machines "${SLURM_NNODES}"
  --machine_rank '${SLURM_NODEID}'
  --rdzv_backend c10d
  --main_process_ip "${MASTER_ADDR}"
  --main_process_port "${MASTER_PORT}"
  --deepspeed_config_file /mnt/home/ntuspeechlabtaipei1/ds_config.json
  --deepspeed_hostfile /mnt/home/ntuspeechlabtaipei1/eric/hostfile
  --deepspeed_multinode_launcher standard
  --dynamo_backend no
  --use_deepspeed
  --mixed_precision bf16
)
LAUNCHER="${LAUNCHER_ARGS[*]}"

CMD="/mnt/home/ntuspeechlabtaipei1/train_conv_slurm_full.py"

srun "${SRUN_ARGS[@]}" bash -c "${PRE_LAUNCH}${LAUNCHER} ${CMD}"
srun_status=$?

echo "END TIME: $(date)"

# Hand srun's status back to Slurm; without this the job would end with the
# exit status of the echo above.
exit "${srun_status}"