#!/bin/bash
#
# Launch a training job (on the host or through docker) and supervise it
# until it finishes or a timeout expires.
#
# Usage:
#   <script> <model> <hardware> <yamlPath> <modelDir> [framework]
#
#   model      "docker" or "host": where ${modelDir}/scripts/run.sh is run.
#   hardware   "cluster" for a multi-node run; any other value has its last
#              character stripped to obtain the device count (the
#              device_group_<N>p naming suggests values like "8p" -- confirm
#              with callers).
#   yamlPath   path to the YAML configuration file.
#   modelDir   model directory; must contain scripts/run.sh.
#   framework  "tensorflow", "pytorch", or anything else (including omitted),
#              which selects the mindspore configuration section.
#
# Variables such as mpirun_ip, docker_image, data_url, ckpt_path and
# device_group_<N>p are expected to be defined by evaluating the output of
# get_params_for_yaml.sh (that helper is not part of this file).

# Print a timestamped log line on stdout: log <LEVEL> <message>.
log() {
  echo "[$(date +%Y%m%d-%H:%M:%S)] [$1] $2"
}

# Print the YAML section name that holds the settings for framework $1.
config_section_for() {
  case "$1" in
    tensorflow) echo "tensorflow_config" ;;
    pytorch) echo "pytorch_config" ;;
    *) echo "mindspore_config" ;;
  esac
}

# Succeed when $1 is a positive decimal integer that is a power of two.
is_power_of_two() {
  local n=$1
  [[ "${n}" =~ ^[1-9][0-9]*$ ]] && (( (n & (n - 1)) == 0 ))
}

# Parse ${mpirun_ip} ("host:count,host:count,...").
# Sets rank_size to the sum of all counts and exports MPIRUN_ALL_IP as the
# space-separated host list. Returns 1 (after printing the error line) when
# the list is empty, an entry has no valid power-of-two count, or the counts
# differ between hosts.
parse_cluster_hosts() {
  local -a entries=()
  local entry count first_count="" hosts=""

  # IFS is changed for this single read only. Assigning IFS globally would
  # break newline splitting of every later unquoted expansion.
  IFS=',' read -r -a entries <<< "${mpirun_ip}"

  rank_size=0
  for entry in "${entries[@]}"; do
    [[ -n "${entry}" ]] || continue
    count=${entry#*:}
    if [[ "${entry}" != *:* ]] || ! is_power_of_two "${count}" ||
      [[ -n "${first_count}" && "${count}" != "${first_count}" ]]; then
      echo "mpirun_ip: $mpirun_ip error"
      return 1
    fi
    first_count=${count}
    hosts+=" ${entry%:*}"
    rank_size=$(( rank_size + count ))
  done

  if (( rank_size == 0 )); then
    echo "mpirun_ip: $mpirun_ip error"
    return 1
  fi
  export MPIRUN_ALL_IP=${hosts# }
}

# Append "-v path:path" to data_mounts. Empty paths are skipped because
# "-v :" is never a valid docker volume specification.
add_mount() {
  [[ -n "$1" ]] || return 0
  data_mounts+=(-v "$1:$1")
}

# Fill the data_mounts array with the docker volume options for the data
# directories required by the given framework and model.
#   $1 - framework
#   $2 - path of the YAML file; its base name up to the first "." is the
#        model name
build_data_mounts() {
  local framework=$1
  local yaml_file=${2##*/}
  local model_name raw_data

  data_mounts=()
  case "${framework}" in
    tensorflow)
      model_name=${yaml_file%%.*}
      case "${model_name}" in
        Bert-Base | Bert-Large)
          add_mount "${input_files_dir}"
          add_mount "${eval_files_dir}"
          ;;
        MobileNet | YoLoV3)
          add_mount "${data_url}"
          add_mount "${ckpt_path}"
          ;;
        SSD-Resnet34)
          # Mount everything up to and including the "raw_data" component.
          raw_data=${training_file_pattern%raw_data*}raw_data
          add_mount "${raw_data}"
          ;;
        *)
          add_mount "${data_url}"
          ;;
      esac
      ;;
    pytorch)
      # train_model_name is not assigned anywhere in this script for
      # pytorch; honour a value coming from the evaluated configuration and
      # otherwise derive it from the YAML file name, like the tensorflow
      # branch does.
      model_name=${train_model_name:-${yaml_file%%.*}}
      add_mount "${data_url}"
      if [[ "${model_name}" == "ResNet50" ]]; then
        add_mount "${ckpt_path}"
      fi
      ;;
    *)
      # No mounts are computed for other frameworks. A data_urls value
      # defined by the evaluated configuration (if any) is passed through,
      # split on whitespace.
      # shellcheck disable=SC2206
      data_mounts=(${data_urls})
      ;;
  esac
}

# Print, comma-separated, the PIDs that are $2 generations below PID $1
# (1 = children, 2 = grandchildren, ...). Prints nothing when there are none.
descendants_at_depth() {
  local pids=$1
  local depth=$2
  local level

  for (( level = 0; level < depth; level++ )); do
    [[ -n "${pids}" ]] || return 0
    # pgrep -P accepts a comma-separated list of parent PIDs.
    pids=$(pgrep -P "${pids}")
    pids=${pids//$'\n'/,}
  done
  [[ -z "${pids}" ]] || printf '%s\n' "${pids}"
}

# Force-kill the processes three generations below PID $1. The original
# comments describe these as the python training processes.
# NOTE(review): for docker launches $1 is the docker client, whose process
# tree may not contain the processes running in the container -- confirm.
kill_training_processes() {
  local pids
  local -a targets=()

  pids=$(descendants_at_depth "$1" 3)
  [[ -n "${pids}" ]] || return 0
  IFS=',' read -r -a targets <<< "${pids}"
  kill -9 "${targets[@]}"
}

# Poll background job $1 every 5 seconds until it exits. Once more than $2
# seconds have been spent waiting, kill the training processes and stop.
supervise() {
  local pid=$1
  local limit=$2
  local elapsed=0

  # kill -0 checks this exact PID; a textual grep over ps output would also
  # match unrelated lines that merely contain the same digits.
  while kill -0 "${pid}" 2>/dev/null; do
    log INFO "train job is working, wait more 5s "
    sleep 5
    elapsed=$(( elapsed + 5 ))
    # Past the configured timeout: kill the python training processes.
    if (( elapsed > limit )); then
      log ERROR "training timeout ! "
      kill_training_processes "${pid}"
      break
    fi
  done
}

main() {
  if (( $# < 4 )); then
    echo "Usage: ${0##*/} <docker|host> <hardware> <yamlPath> <modelDir> [framework]" >&2
    return 1
  fi

  local model=$1
  local hardware=$2
  local yamlPath=$3
  local modelDir=$4
  local framework=${5:-}
  local modelScripts="${modelDir}/scripts"
  local currentDir yamlDir train_dir config_section
  local rank_size device_group group_var device_id
  local workshell=""
  local -a data_mounts=()
  local -a device_args=()

  currentDir=$(cd "$(dirname "$0")" && pwd)
  if ! yamlDir=$(cd "$(dirname "${yamlPath}")" && pwd); then
    echo "yamlPath: ${yamlPath} error" >&2
    return 1
  fi
  train_dir=${currentDir%train*}/train

  # Default timeout in seconds. It is assigned before the configuration is
  # evaluated so that a "timeout" value in the configuration overrides it.
  local timeout=360000

  # Load the configuration from the YAML file.
  config_section=$(config_section_for "${framework}")
  # The substitution is intentionally left unquoted: the helper's output
  # format is not visible here and quoting would change how multi-line
  # output is evaluated.
  # shellcheck disable=SC2046
  eval $("${currentDir}/get_params_for_yaml.sh" "${yamlPath}" "${config_section}")

  if [[ "${hardware}" == "cluster" ]]; then
    export CLUSTER=True
    parse_cluster_hosts || return 1
  else
    rank_size=${hardware%?}
    if [[ ! "${rank_size}" =~ ^[1-9][0-9]*$ ]]; then
      echo "hardware: ${hardware} error" >&2
      return 1
    fi
  fi

  # Use the configured device list for this rank size; fall back to
  # 0..rank_size-1 when none is configured or when 8 or more devices are used.
  group_var="device_group_${rank_size}p"
  device_group=${!group_var}
  if [[ -z "${device_group}" ]] || (( rank_size >= 8 )); then
    device_group=$(seq 0 $(( rank_size - 1 )))
  fi

  if [[ "${framework}" == "tensorflow" && "${hardware}" != "cluster" ]]; then
    # The json configuration is only needed for single-machine runs.
    bash "${currentDir}/set_json.sh" "${rank_size}" "${yamlPath}" \
      "${modelDir}" "${config_section}" || return 1
  fi

  # Host paths that must be mapped into the docker container.
  build_data_mounts "${framework}" "${yamlPath}"

  case "${model}" in
    docker)
      # Run inside docker.
      if [[ "${hardware}" == "cluster" ]]; then
        # Multi-node: execute in the already running "mpirun" container.
        docker exec -i mpirun /bin/bash -c "${modelScripts}/run.sh ${rank_size} ${yamlPath} ${currentDir} ${CLUSTER} '${MPIRUN_ALL_IP}'" &
      else
        # device_group is a whitespace-separated list; splitting is intended.
        # shellcheck disable=SC2086
        for device_id in ${device_group}; do
          device_args+=("--device=/dev/davinci${device_id}")
        done
        docker run -i --ipc=host \
          "${device_args[@]}" --device=/dev/davinci_manager \
          --device=/dev/devmm_svm --device=/dev/hisi_hdc \
          -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
          -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
          -v "${train_dir}:${train_dir}" \
          -v "${modelDir}:${modelDir}" \
          "${data_mounts[@]}" \
          -v "${yamlDir}:${yamlDir}" \
          -v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf \
          -v /var/log/npu/slog/:/var/log/npu/slog -v /var/log/npu/profiling/:/var/log/npu/profiling \
          -v /var/log/npu/dump/:/var/log/npu/dump -v /var/log/npu/:/usr/slog "${docker_image}" \
          /bin/bash -c "${modelScripts}/run.sh ${rank_size} ${yamlPath} ${currentDir}" &
      fi
      workshell=$!
      ;;
    host)
      # Run directly on the host.
      bash "${modelScripts}/run.sh" "${rank_size}" "${yamlPath}" "${currentDir}" &
      workshell=$!
      ;;
    *)
      echo "[WARN] unknown model '${model}' (expected docker or host): nothing launched" >&2
      ;;
  esac

  if [[ -n "${workshell}" ]]; then
    supervise "${workshell}" "${timeout}"
  fi
  log INFO "process end "
}

main "$@"