[add]上传训练benchmark by z00560161
This commit is contained in:
@@ -0,0 +1,68 @@
|
||||
#!/bin/bash
# Launcher for the tf_bert_large training benchmark.
#
# Positional arguments:
#   $1  rank_size      number of ranks; also picks config/<rank_size>p.json
#   $2  yamlPath       yaml configuration file handed to get_params_for_yaml.sh
#   $3  toolsPath      directory that contains get_params_for_yaml.sh
#   $4  CLUSTER        read only inside a container; "True" selects the mpirun path
#   $5  MPIRUN_ALL_IP  read only inside a container; whitespace-separated host
#                      IPs that receive a copy of the yaml file
rank_size=$1
yamlPath=$2
toolsPath=$3

# The presence of /.dockerenv is used as the signal that the script runs inside
# a container. Outside a container CLUSTER and MPIRUN_ALL_IP are not assigned
# here, so whatever the caller's environment already holds stays in effect.
if [[ -f /.dockerenv ]]; then
  CLUSTER=$4
  MPIRUN_ALL_IP="$5"
  export CLUSTER
fi
|
||||
|
||||
# Absolute path of the parent of the directory that holds this script.
currentDir=$(cd "$(dirname "$0")/.."; pwd)
# Timestamp (YYYYmmddHHMMSS) that names this run's result directory.
currtime=$(date +%Y%m%d%H%M%S)

# Result directory for this run. "${currentDir%train*}" drops the shortest
# trailing match of "train*" from currentDir (the path is left unchanged when
# it does not contain "train"). The trailing slash is kept because later code
# concatenates onto this value.
train_job_dir=${currentDir%train*}/train/result/tf_bert_large/training_job_${currtime}/
mkdir -p "${train_job_dir}"

echo "[$(date +%Y%m%d-%H:%M:%S)] [INFO] see more config info in ${currentDir}/config"
echo "[$(date +%Y%m%d-%H:%M:%S)] [INFO] train result in ${train_job_dir}"
|
||||
|
||||
# Load the configuration from the "tensorflow_config" section of the yaml file.
# The helper's output is evaluated as shell code in this shell, so both the
# helper and the yaml file must be trusted. The substitution is intentionally
# left unquoted to keep the original word-splitting of the helper output.
# NOTE(review): the helper's output format is not visible here; presumably it
# prints name=value assignments such as device_group_<N>p — confirm.
# shellcheck disable=SC2046
eval $("${toolsPath}/get_params_for_yaml.sh" "${yamlPath}" "tensorflow_config")

#######################################
# Choose the device list for a run.
# Arguments: $1 - rank size
#            $2 - device list configured for that rank size (may be empty)
# Outputs:   the configured list, or the ids 0..rank_size-1 (one per line)
#            when nothing is configured or rank size is 8 or more
#######################################
resolve_device_group() {
  local size=$1
  local configured=$2
  if [[ -z "${configured}" ]] || [[ "${size}" -ge 8 ]]; then
    seq 0 "$(( size - 1 ))"
  else
    printf '%s\n' "${configured}"
  fi
}

#######################################
# Print the first whitespace-separated entry of a device list.
# Arguments: $1 - device list
# Outputs:   the first entry, or nothing for an empty list
#######################################
first_device_of() {
  local id
  # Unquoted on purpose: the list is split on whitespace.
  for id in $1; do
    printf '%s\n' "${id}"
    return 0
  done
}

# Device list: when no devices are specified for this rank size, pick them
# sequentially according to rank_size.
device_group_var="device_group_${rank_size}p"
device_group="$(resolve_device_group "${rank_size}" "${!device_group_var-}")"

# First device id in device_group; the hw log is linked from the directory
# named after it. The whole first entry is taken (not just its first
# character) so that multi-digit device ids are handled.
device_group_str=${device_group//[[:space:]]/}
first_device_id=$(first_device_of "${device_group}")
|
||||
|
||||
# user env
# Exported so that child processes started by this script inherit them.
# NOTE(review): the meaning of the individual values (job id 9999001, device 0)
# is defined by their consumers, which are not visible here — confirm before
# changing.
export JOB_ID=9999001
# Rank table is selected by rank size: config/<rank_size>p.json.
export RANK_TABLE_FILE="${currentDir}/config/${rank_size}p.json"
export RANK_SIZE="${rank_size}"
export SLOG_PRINT_TO_STDOUT=0
export DEVICE_ID=0
export DEVICE_INDEX="${DEVICE_ID}"
|
||||
|
||||
# Start training: either one mpirun launch across hosts (CLUSTER == "True")
# or one background train.sh per device on this host.
if [[ "${CLUSTER}" == "True" ]]; then
  # ln hw log: (re)create the hw_bert.log symlink inside the result directory,
  # pointing at the log under sub-directory 0.
  ln -snf "${train_job_dir}/0/hw_bert.log" "${train_job_dir}"

  # Copy the yaml file to every listed host except this one, at the same path.
  this_ip=$(hostname -I | awk '{print $1}')
  # MPIRUN_ALL_IP is unquoted on purpose: it is a whitespace-separated list.
  for ip in ${MPIRUN_ALL_IP}; do
    if [[ "${ip}" != "${this_ip}" ]]; then
      scp "${yamlPath}" "root@${ip}:${yamlPath}"
    fi
  done

  export PATH=$PATH:/usr/local/mpirun4.0/bin
  # NOTE(review): mpirun_ip is not assigned in this script; presumably it comes
  # from the evaluated yaml configuration — confirm.
  # Each continuation has a space before the backslash so that options never
  # fuse with the following line.
  mpirun -H "${mpirun_ip}" \
    --bind-to none -map-by slot \
    --allow-run-as-root \
    --mca btl_tcp_if_exclude lo,docker0,endvnic,virbr0,vethf40501b,docker_gwbridge,br-f42ac38052b4 \
    --prefix /usr/local/mpirun4.0/ \
    "${currentDir}/scripts/train.sh" 0 "${currtime}" "${yamlPath}" 0 True "${toolsPath}" "${rank_size}"
else
  # ln hw log: (re)create the hw_bert.log symlink inside the result directory,
  # pointing at the log under the first device's sub-directory.
  ln -snf "${train_job_dir}/${first_device_id}/hw_bert.log" "${train_job_dir}"

  # One background training process per device; rank ids are assigned in list
  # order starting from 0. device_group is unquoted on purpose so that it is
  # split on whitespace.
  rank_id=0
  for device_id in ${device_group}; do
    #echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] start: train ${device_id} & " >> ./main.log
    "${currentDir}/scripts/train.sh" "${device_id}" "${currtime}" "${yamlPath}" "${rank_id}" solo "${toolsPath}" "${rank_size}" &
    rank_id=$(( rank_id + 1 ))
  done
fi

# Block until every background job has finished. The jobs' exit statuses are
# not inspected here.
wait
|
||||
|
||||
|
||||
Reference in New Issue
Block a user