# mmlm-conv-training-full / train_conv_slurm_full.sh
# Source revision: e7affe4 ("Training in progress, step 200", uploaded by voidful)
# NOTE(review): the original paste carried Hugging Face file-viewer chrome
# ("raw / history blame / 2.42 kB") as bare text, which is not valid shell;
# converted to comments so the script parses. Ideally strip these lines
# entirely so the shebang below is the first line when submitted via sbatch.
#!/bin/bash
# Slurm batch header: 13 exclusive nodes, one task per node
# (accelerate itself spawns the 8 per-GPU worker processes on each node).
#SBATCH -N 13
#SBATCH -p tp1-user
#SBATCH --exclusive
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=200
#SBATCH --mem=200G
# 8 GPUs per node; must match GPUS_PER_NODE set below.
#SBATCH --gres=gpu:8
# Wall-clock limit: 30 days.
#SBATCH --time=30-00:00:00
# %j expands to the Slurm job id.
#SBATCH --output=/mnt/home/ntuspeechlabtaipei1/eric/result/%j-slurm.out
# Known-bad nodes excluded from the allocation.
#SBATCH --exclude=cnode3-004,cnode3-019
module purge
module load slurm

# Activate the submit-side conda env; each srun task re-activates it again
# inside the container via PRE_LAUNCH below.
source /mnt/home/ntuspeechlabtaipei1/miniconda3/etc/profile.d/conda.sh
conda activate base

# Enroot/pyxis container image used by every srun task.
CONTAINER_IMAGE="./eric/trl.sqsh"
# Must match the #SBATCH --gres=gpu:8 request above.
GPUS_PER_NODE=8

echo "SLURM_NNODES=${SLURM_NNODES}"
echo "NODELIST=${SLURM_JOB_NODELIST}"
echo "SLURM_NODEID=${SLURM_NODEID}"
echo "SLURM_ARRAY_TASK_ID=${SLURM_ARRAY_TASK_ID}"

# Rendezvous endpoint for torch.distributed: first hostname in the allocation.
export MASTER_ADDR=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1)
export MASTER_PORT=12345
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 serializes every CUDA kernel launch —
# a debugging setting that significantly slows training. Confirm it is still
# intentionally enabled before long runs.
export CUDA_LAUNCH_BLOCKING=1
# Library search path (duplicate entries removed; lookup order preserved).
export LD_LIBRARY_PATH=/mnt/home/ntuspeechlabtaipei1/miniconda3/lib64:/mnt/home/ntuspeechlabtaipei1/local/lib:/mnt/home/ntuspeechlabtaipei1/miniconda3/envs/whisper/lib:/usr/local/cuda/lib64:/usr/local/cuda/compat/lib.real:/usr/local/lib/python3.10/dist-packages/torch/lib:/usr/local/lib/python3.10/dist-packages/torch_tensorrt/lib:/usr/local/cuda/compat/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
# srun options for every launched step, assembled one option at a time.
# Kept as a single space-separated string because the call site expands it
# unquoted ($SRUN_ARGS) and relies on word-splitting; none of the values
# contain whitespace.
SRUN_ARGS="--wait=60"
SRUN_ARGS+=" --kill-on-bad-exit=1"
SRUN_ARGS+=" --mpi=pmix"
SRUN_ARGS+=" --container-image=${CONTAINER_IMAGE}"
SRUN_ARGS+=" --container-writable"
SRUN_ARGS+=" --container-mounts=/mnt/home/ntuspeechlabtaipei1/:/mnt/home/ntuspeechlabtaipei1/,/mnt/home/ntuspeechlabtaipei1/.cache:/root/.cache"
PRE_LAUNCH="export TORCH_DISTRIBUTED_TIMEOUT=7200; source /mnt/home/ntuspeechlabtaipei1/miniconda3/etc/profile.d/conda.sh; conda activate base;"
# Per-node accelerate launch command. $SLURM_NNODES, $MASTER_ADDR and
# $MASTER_PORT are expanded once here on the submit node, while
# \${SLURM_NODEID} is backslash-escaped so it survives into the remote
# `bash -c` string and is expanded per task (each node reports its own rank).
# NOTE(review): --rdzv_backend c10d is combined with
# --deepspeed_multinode_launcher standard — confirm accelerate honors both
# together, and that --mixed_precision bf16 agrees with any precision setting
# inside ds_config.json.
LAUNCHER="accelerate launch \
--num_processes $((SLURM_NNODES * GPUS_PER_NODE)) \
--num_machines $SLURM_NNODES \
--machine_rank \${SLURM_NODEID} \
--rdzv_backend c10d \
--main_process_ip $MASTER_ADDR \
--main_process_port $MASTER_PORT \
--deepspeed_config_file /mnt/home/ntuspeechlabtaipei1/ds_config.json \
--deepspeed_hostfile /mnt/home/ntuspeechlabtaipei1/eric/hostfile \
--deepspeed_multinode_launcher standard \
--dynamo_backend no \
--use_deepspeed \
--mixed_precision bf16 \
"
# Training entry point, executed via accelerate on every node.
CMD="/mnt/home/ntuspeechlabtaipei1/train_conv_slurm_full.py"

# ('clear' removed: there is no TTY under sbatch, so it only wrote terminal
# escape codes into the slurm output file.)
# $SRUN_ARGS is intentionally unquoted so it word-splits into srun options.
# shellcheck disable=SC2086
srun $SRUN_ARGS bash -c "$PRE_LAUNCH$LAUNCHER $CMD"
rc=$?

echo "END TIME: $(date)"
# Propagate srun's exit status so sbatch/sacct report failure when training
# fails (previously the script always exited with echo's status, i.e. 0).
exit $rc