[add]上传训练benchmark by z00560161
This commit is contained in:
@@ -0,0 +1,139 @@
|
||||
#!/bin/bash
|
||||
|
||||
|
||||
model=$1
|
||||
hardware=$2
|
||||
yamlPath=$3
|
||||
modelDir=$4
|
||||
framework=$5
|
||||
|
||||
modelScripts="$modelDir/scripts"
|
||||
|
||||
currentDir=$(cd "$(dirname "$0")"; pwd)
|
||||
yamlDir=$(cd "$(dirname "${yamlPath}")";pwd)
|
||||
train_dir=${currentDir%train*}/train
|
||||
timeout=360000
|
||||
# 从 yaml 获取配置
|
||||
if [ x"${framework}" == x"tensorflow" ]; then
|
||||
config_section="tensorflow_config"
|
||||
elif [ x"${framework}" == x"pytorch" ]; then
|
||||
config_section="pytorch_config"
|
||||
else
|
||||
config_section="mindspore_config"
|
||||
fi
|
||||
eval $(${currentDir}/get_params_for_yaml.sh ${yamlPath} ${config_section})
|
||||
|
||||
if [ x"${hardware}" == x"cluster" ];then
|
||||
export CLUSTER=True
|
||||
IFS=","
|
||||
array=($mpirun_ip)
|
||||
m=${array[0]#*:}
|
||||
rank_size=0
|
||||
mpirun_all_ip=""
|
||||
for var in ${array[@]}; do
|
||||
n=${var#*:}
|
||||
mpirun_all_ip+=" ${var%:*}"
|
||||
let a="$n & ($n-1)"
|
||||
let rank_size+=$n
|
||||
if [ $a -ne 0 ] || [ $n -ne $m ];then
|
||||
echo "mpirun_ip: $mpirun_ip error"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
export MPIRUN_ALL_IP=${mpirun_all_ip#?}
|
||||
else
|
||||
rank_size=${hardware%?}
|
||||
fi
|
||||
|
||||
eval device_group=\$device_group_${rank_size}p
|
||||
if [ x"${device_group}" == x"" ] || [ ${rank_size} -ge 8 ];then
|
||||
device_group="$(seq 0 "$(expr $rank_size - 1)")"
|
||||
fi
|
||||
|
||||
#tensorflow docker时要映射的路径
|
||||
if [ x"${framework}" == x"tensorflow" ]; then
|
||||
if [ x"${hardware}" != x"cluster" ];then
|
||||
# 仅单机执行需要配置 json
|
||||
bash ${currentDir}/set_json.sh ${rank_size} ${yamlPath} ${modelDir} ${config_section} || exit 1
|
||||
fi
|
||||
yaml_file_name=${yamlPath##*/}
|
||||
train_model_name=${yaml_file_name%%.*}
|
||||
if [ x"${train_model_name}" == x"Bert-Base" ] || [ x"${train_model_name}" == x"Bert-Large" ]; then
|
||||
data_urls="-v ${input_files_dir}:${input_files_dir} -v ${eval_files_dir}:${eval_files_dir}"
|
||||
elif [ x"${train_model_name}" == x"MobileNet" ] || [ x"${train_model_name}" == x"YoLoV3" ]; then
|
||||
data_urls="-v ${data_url}:${data_url} -v ${ckpt_path}:${ckpt_path}"
|
||||
elif [ x"${train_model_name}" == x"SSD-Resnet34" ]; then
|
||||
raw_data=${training_file_pattern%raw_data*}raw_data
|
||||
data_urls="-v ${raw_data}:${raw_data}"
|
||||
else
|
||||
data_urls="-v ${data_url}:${data_url}"
|
||||
fi
|
||||
fi
|
||||
|
||||
|
||||
if [ x"${framework}" == x"pytorch" ]; then
|
||||
if [ x"${train_model_name}" == x"ResNet50" ]; then
|
||||
data_urls="-v ${data_url}:${data_url} -v ${ckpt_path}:${ckpt_path}"
|
||||
else
|
||||
data_urls="-v ${data_url}:${data_url}"
|
||||
fi
|
||||
fi
|
||||
|
||||
|
||||
if [ x"$model" == x"docker" ];then
|
||||
# docker 侧执行
|
||||
if [ x"${hardware}" == x"cluster" ];then
|
||||
# docker多机
|
||||
docker exec -i mpirun /bin/bash -c "${modelScripts}/run.sh ${rank_size} ${yamlPath} ${currentDir} ${CLUSTER} '${MPIRUN_ALL_IP}'" &
|
||||
else
|
||||
DEVICE_DEV=""
|
||||
for device_id in $device_group;do
|
||||
DEVICE_DEV=`echo "${DEVICE_DEV}" --device=/dev/davinci${device_id}`
|
||||
done
|
||||
docker run -i --ipc=host \
|
||||
${DEVICE_DEV} --device=/dev/davinci_manager \
|
||||
--device=/dev/devmm_svm --device=/dev/hisi_hdc \
|
||||
-v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
|
||||
-v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
|
||||
-v ${train_dir}:${train_dir} \
|
||||
-v ${modelDir}:${modelDir} \
|
||||
${data_urls} \
|
||||
-v ${yamlDir}:${yamlDir} \
|
||||
-v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf \
|
||||
-v /var/log/npu/slog/:/var/log/npu/slog -v /var/log/npu/profiling/:/var/log/npu/profiling \
|
||||
-v /var/log/npu/dump/:/var/log/npu/dump -v /var/log/npu/:/usr/slog ${docker_image} \
|
||||
/bin/bash -c "${modelScripts}/run.sh ${rank_size} ${yamlPath} ${currentDir}" &
|
||||
fi
|
||||
elif [ x"$model" == x"host" ]; then
|
||||
# host 侧执行
|
||||
bash ${modelScripts}/run.sh ${rank_size} ${yamlPath} ${currentDir} &
|
||||
fi
|
||||
workshell=$!
|
||||
timeused=0
|
||||
while true
|
||||
do
|
||||
ret=`ps -ef | grep ${modelScripts}/run.sh | grep ${workshell} | grep -v grep`
|
||||
if [ x"${ret}" = x ];
|
||||
then
|
||||
break
|
||||
else
|
||||
echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] train job is working, wait more 5s "
|
||||
sleep 5
|
||||
let timeused+=5
|
||||
#如果超过配置的timeout时间,则kill 掉python训练进程
|
||||
if [ ${timeused} -gt ${timeout} ];
|
||||
then
|
||||
echo "[`date +%Y%m%d-%H:%M:%S`] [ERROR] training timeout ! "
|
||||
#获取python进程ID
|
||||
train_sh_pid=`pgrep -P $(pgrep -P $workshell)`
|
||||
for pid in $train_sh_pid
|
||||
do
|
||||
id=`pgrep -P $pid`
|
||||
kill -9 $id
|
||||
done
|
||||
break
|
||||
fi
|
||||
fi
|
||||
done
|
||||
|
||||
echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] process end "
|
||||
Reference in New Issue
Block a user