Files
ascend-tools/train/atlas_benchmark-master/utils/shell/start.sh
T
2020-10-19 20:22:23 +08:00

140 lines
4.5 KiB
Bash

#!/bin/bash
model=$1
hardware=$2
yamlPath=$3
modelDir=$4
framework=$5
modelScripts="$modelDir/scripts"
currentDir=$(cd "$(dirname "$0")"; pwd)
yamlDir=$(cd "$(dirname "${yamlPath}")";pwd)
train_dir=${currentDir%train*}/train
timeout=360000
# 从 yaml 获取配置
if [ x"${framework}" == x"tensorflow" ]; then
config_section="tensorflow_config"
elif [ x"${framework}" == x"pytorch" ]; then
config_section="pytorch_config"
else
config_section="mindspore_config"
fi
eval $(${currentDir}/get_params_for_yaml.sh ${yamlPath} ${config_section})
if [ x"${hardware}" == x"cluster" ];then
export CLUSTER=True
IFS=","
array=($mpirun_ip)
m=${array[0]#*:}
rank_size=0
mpirun_all_ip=""
for var in ${array[@]}; do
n=${var#*:}
mpirun_all_ip+=" ${var%:*}"
let a="$n & ($n-1)"
let rank_size+=$n
if [ $a -ne 0 ] || [ $n -ne $m ];then
echo "mpirun_ip: $mpirun_ip error"
exit 1
fi
done
export MPIRUN_ALL_IP=${mpirun_all_ip#?}
else
rank_size=${hardware%?}
fi
eval device_group=\$device_group_${rank_size}p
if [ x"${device_group}" == x"" ] || [ ${rank_size} -ge 8 ];then
device_group="$(seq 0 "$(expr $rank_size - 1)")"
fi
#tensorflow docker时要映射的路径
if [ x"${framework}" == x"tensorflow" ]; then
if [ x"${hardware}" != x"cluster" ];then
# 仅单机执行需要配置 json
bash ${currentDir}/set_json.sh ${rank_size} ${yamlPath} ${modelDir} ${config_section} || exit 1
fi
yaml_file_name=${yamlPath##*/}
train_model_name=${yaml_file_name%%.*}
if [ x"${train_model_name}" == x"Bert-Base" ] || [ x"${train_model_name}" == x"Bert-Large" ]; then
data_urls="-v ${input_files_dir}:${input_files_dir} -v ${eval_files_dir}:${eval_files_dir}"
elif [ x"${train_model_name}" == x"MobileNet" ] || [ x"${train_model_name}" == x"YoLoV3" ]; then
data_urls="-v ${data_url}:${data_url} -v ${ckpt_path}:${ckpt_path}"
elif [ x"${train_model_name}" == x"SSD-Resnet34" ]; then
raw_data=${training_file_pattern%raw_data*}raw_data
data_urls="-v ${raw_data}:${raw_data}"
else
data_urls="-v ${data_url}:${data_url}"
fi
fi
if [ x"${framework}" == x"pytorch" ]; then
if [ x"${train_model_name}" == x"ResNet50" ]; then
data_urls="-v ${data_url}:${data_url} -v ${ckpt_path}:${ckpt_path}"
else
data_urls="-v ${data_url}:${data_url}"
fi
fi
if [ x"$model" == x"docker" ];then
# docker 侧执行
if [ x"${hardware}" == x"cluster" ];then
# docker多机
docker exec -i mpirun /bin/bash -c "${modelScripts}/run.sh ${rank_size} ${yamlPath} ${currentDir} ${CLUSTER} '${MPIRUN_ALL_IP}'" &
else
DEVICE_DEV=""
for device_id in $device_group;do
DEVICE_DEV=`echo "${DEVICE_DEV}" --device=/dev/davinci${device_id}`
done
docker run -i --ipc=host \
${DEVICE_DEV} --device=/dev/davinci_manager \
--device=/dev/devmm_svm --device=/dev/hisi_hdc \
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
-v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
-v ${train_dir}:${train_dir} \
-v ${modelDir}:${modelDir} \
${data_urls} \
-v ${yamlDir}:${yamlDir} \
-v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf \
-v /var/log/npu/slog/:/var/log/npu/slog -v /var/log/npu/profiling/:/var/log/npu/profiling \
-v /var/log/npu/dump/:/var/log/npu/dump -v /var/log/npu/:/usr/slog ${docker_image} \
/bin/bash -c "${modelScripts}/run.sh ${rank_size} ${yamlPath} ${currentDir}" &
fi
elif [ x"$model" == x"host" ]; then
# host 侧执行
bash ${modelScripts}/run.sh ${rank_size} ${yamlPath} ${currentDir} &
fi
workshell=$!
timeused=0
while true
do
ret=`ps -ef | grep ${modelScripts}/run.sh | grep ${workshell} | grep -v grep`
if [ x"${ret}" = x ];
then
break
else
echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] train job is working, wait more 5s "
sleep 5
let timeused+=5
#如果超过配置的timeout时间,则kill 掉python训练进程
if [ ${timeused} -gt ${timeout} ];
then
echo "[`date +%Y%m%d-%H:%M:%S`] [ERROR] training timeout ! "
#获取python进程ID
train_sh_pid=`pgrep -P $(pgrep -P $workshell)`
for pid in $train_sh_pid
do
id=`pgrep -P $pid`
kill -9 $id
done
break
fi
fi
done
echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] process end "