root
initial commit
a344f64
# CHECK_EVERY=900
# DURATION_DAYS=10
# CHECK_TOTAL=$((DURATION_DAYS*86400/CHECK_EVERY))
# NEPOCH_PRE=99
# NEPOCH_SFT=159
# NAME="audio-gen-train_audiogen"
# for (( i = 1; i <= $CHECK_TOTAL; i++ ))
# do
# RUNNING_JOBS=$(sacct -o JobName%-150,JobID,Partition%-15,State | grep -v inference | grep RUNNING | grep polar | sort)
# PENDING_JOBS=$(sacct -o JobName%-150,JobID,Partition%-15,State | grep -v inference | grep PENDING | grep polar | sort)
# for STATE in "RUNNING" "PENDING" "NOT-RUN"
# do
# echo "===========${STATE}=========="
# if [[ ${STATE} == "RUNNING" && ${RUNNING_JOBS} =~ "${NAME}" ]]; then
# echo ${NAME}
# elif [[ ${STATE} == "PENDING" && ${PENDING_JOBS} =~ "${NAME}" ]]; then
# echo ${NAME}
# elif [[ ${STATE} == "NOT-RUN" && ! ${RUNNING_JOBS} =~ "${NAME}" && ! ${PENDING_JOBS} =~ "${NAME}" ]]; then
# base_path="/lustre/fsw/portfolios/adlr/users/sreyang/ckpts/stable_llm/harmonai_train/"
# # Find the last subfolder
# last_subfolder=$(ls -d "$base_path"*/ | sort -V | tail -n 1)
# # Find the last checkpoint in the subfolder
# last_ckpt=$(ls "$last_subfolder/checkpoints/"*.ckpt | sort -V | tail -n 1)
# echo $last_ckpt
# sh submit_job.sh "True" $last_ckpt
# sleep 1
# fi
# done
# echo "============================"
# sleep $CHECK_EVERY
# done
# Job watchdog: every CHECK_EVERY seconds, for DURATION_DAYS days, look at the
# sacct job table and run submit.sh whenever no RUNNING or PENDING job whose
# row contains NAME is found.
#
# Only rows that contain "polar" and do not contain "inference" are considered.
# NAME is matched as a literal substring of the filtered rows.
CHECK_EVERY=900      # seconds between two checks
DURATION_DAYS=10     # total time the watchdog keeps polling
CHECK_TOTAL=$((DURATION_DAYS*86400/CHECK_EVERY))   # number of polling rounds
# NOTE(review): NEPOCH_PRE / NEPOCH_SFT are not referenced in this section and
# are not exported; kept in case a later part of the file reads them.
NEPOCH_PRE=99
NEPOCH_SFT=159
NAME="eval"
for (( i = 1; i <= CHECK_TOTAL; i++ ))
do
  # Take ONE snapshot of the job table per round. Querying sacct separately for
  # RUNNING and PENDING lets a job that changes state between the two calls
  # vanish from both lists, which would trigger a duplicate submission.
  if ! JOB_TABLE=$(sacct -o JobName%-150,JobID,Partition%-15,State); then
    # An empty table caused by a failed query must not be read as
    # "job is gone": skip this round instead of resubmitting.
    echo "sacct failed; skipping this check" >&2
    sleep "$CHECK_EVERY"
    continue
  fi
  CANDIDATE_JOBS=$(grep -v inference <<< "$JOB_TABLE" | grep polar)
  RUNNING_JOBS=$(grep RUNNING <<< "$CANDIDATE_JOBS" | sort)
  PENDING_JOBS=$(grep PENDING <<< "$CANDIDATE_JOBS" | sort)

  for STATE in "RUNNING" "PENDING" "NOT-RUN"
  do
    echo "===========${STATE}=========="
    if [[ ${STATE} == "RUNNING" && ${RUNNING_JOBS} == *"${NAME}"* ]]; then
      echo "${NAME}"
    elif [[ ${STATE} == "PENDING" && ${PENDING_JOBS} == *"${NAME}"* ]]; then
      echo "${NAME}"
    elif [[ ${STATE} == "NOT-RUN" && ${RUNNING_JOBS} != *"${NAME}"* && ${PENDING_JOBS} != *"${NAME}"* ]]; then
      # Neither running nor queued: submit again. A failed submission is
      # reported but does not stop the watchdog; the next round retries.
      sh submit.sh || echo "submit.sh exited with status $?" >&2
      sleep 1
    fi
  done
  echo "============================"
  sleep "$CHECK_EVERY"
done