# mmlm-conv-training-full / train_conv_slurm_full.sh
# Source revision: e7affe4 ("Training in progress, step 200", uploaded by voidful)
# NOTE(review): the original paste carried Hugging Face file-viewer chrome
# ("raw / history blame / 2.42 kB") as bare text, which is not valid shell;
# converted to comments so the script parses. Ideally strip these lines
# entirely so the shebang below is the first line when submitted via sbatch.
#!/bin/bash
# Slurm batch header: 13 exclusive nodes, one task per node
# (accelerate itself spawns the 8 per-GPU worker processes on each node).
#SBATCH -N 13
#SBATCH -p tp1-user
#SBATCH --exclusive
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=200
#SBATCH --mem=200G
# 8 GPUs per node; must match GPUS_PER_NODE set below.
#SBATCH --gres=gpu:8
# Wall-clock limit: 30 days.
#SBATCH --time=30-00:00:00
# %j expands to the Slurm job id.
#SBATCH --output=/mnt/home/ntuspeechlabtaipei1/eric/result/%j-slurm.out
# Known-bad nodes excluded from the allocation.
#SBATCH --exclude=cnode3-004,cnode3-019
module purge
module load slurm

# Activate the submit-side conda env; each srun task re-activates it again
# inside the container via PRE_LAUNCH below.
source /mnt/home/ntuspeechlabtaipei1/miniconda3/etc/profile.d/conda.sh
conda activate base

# Enroot/pyxis container image used by every srun task.
CONTAINER_IMAGE="./eric/trl.sqsh"
# Must match the #SBATCH --gres=gpu:8 request above.
GPUS_PER_NODE=8

echo "SLURM_NNODES=${SLURM_NNODES}"
echo "NODELIST=${SLURM_JOB_NODELIST}"
echo "SLURM_NODEID=${SLURM_NODEID}"
echo "SLURM_ARRAY_TASK_ID=${SLURM_ARRAY_TASK_ID}"

# Rendezvous endpoint for torch.distributed: first hostname in the allocation.
export MASTER_ADDR=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1)
export MASTER_PORT=12345
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 serializes every CUDA kernel launch —
# a debugging setting that significantly slows training. Confirm it is still
# intentionally enabled before long runs.
export CUDA_LAUNCH_BLOCKING=1
# Library search path (duplicate entries removed; lookup order preserved).
export LD_LIBRARY_PATH=/mnt/home/ntuspeechlabtaipei1/miniconda3/lib64:/mnt/home/ntuspeechlabtaipei1/local/lib:/mnt/home/ntuspeechlabtaipei1/miniconda3/envs/whisper/lib:/usr/local/cuda/lib64:/usr/local/cuda/compat/lib.real:/usr/local/lib/python3.10/dist-packages/torch/lib:/usr/local/lib/python3.10/dist-packages/torch_tensorrt/lib:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
# srun options for every launched step, assembled one option at a time.
# Kept as a single space-separated string because the call site expands it
# unquoted ($SRUN_ARGS) and relies on word-splitting; none of the values
# contain whitespace.
SRUN_ARGS="--wait=60"
SRUN_ARGS+=" --kill-on-bad-exit=1"
SRUN_ARGS+=" --mpi=pmix"
SRUN_ARGS+=" --container-image=${CONTAINER_IMAGE}"
SRUN_ARGS+=" --container-writable"
SRUN_ARGS+=" --container-mounts=/mnt/home/ntuspeechlabtaipei1/:/mnt/home/ntuspeechlabtaipei1/,/mnt/home/ntuspeechlabtaipei1/.cache:/root/.cache"
PRE_LAUNCH="export TORCH_DISTRIBUTED_TIMEOUT=7200; source /mnt/home/ntuspeechlabtaipei1/miniconda3/etc/profile.d/conda.sh; conda activate base;"
# Per-node accelerate launch command. $SLURM_NNODES, $MASTER_ADDR and
# $MASTER_PORT are expanded once here on the submit node, while
# \${SLURM_NODEID} is backslash-escaped so it survives into the remote
# `bash -c` string and is expanded per task (each node reports its own rank).
# NOTE(review): --rdzv_backend c10d is combined with
# --deepspeed_multinode_launcher standard — confirm accelerate honors both
# together, and that --mixed_precision bf16 agrees with any precision setting
# inside ds_config.json.
LAUNCHER="accelerate launch \
--num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \
--num_machines $SLURM_NNODES \
--machine_rank \${SLURM_NODEID} \
--rdzv_backend c10d \
--main_process_ip $MASTER_ADDR \
--main_process_port $MASTER_PORT \
--deepspeed_config_file /mnt/home/ntuspeechlabtaipei1/ds_config.json \
--deepspeed_hostfile /mnt/home/ntuspeechlabtaipei1/eric/hostfile \
--deepspeed_multinode_launcher standard \
--dynamo_backend no \
--use_deepspeed \
--mixed_precision bf16 \
"
# Training entry point, executed via accelerate on every node.
CMD="/mnt/home/ntuspeechlabtaipei1/train_conv_slurm_full.py"

# ('clear' removed: there is no TTY under sbatch, so it only wrote terminal
# escape codes into the slurm output file.)
# $SRUN_ARGS is intentionally unquoted so it word-splits into srun options.
# shellcheck disable=SC2086
srun $SRUN_ARGS bash -c "$PRE_LAUNCH$LAUNCHER $CMD"
rc=$?

echo "END TIME: $(date)"
# Propagate srun's exit status so sbatch/sacct report failure when training
# fails (previously the script always exited with echo's status, i.e. 0).
exit $rc