[add]上传训练benchmark by z00560161

This commit is contained in:
liang_chaoming@huawei.com
2020-10-19 20:22:23 +08:00
parent 22b83024f5
commit 82522e2f61
1225 changed files with 345421 additions and 0 deletions
@@ -0,0 +1,53 @@
#!/bin/bash
# Evaluation driver: environment setup.
# Arguments:
#   $1  number of devices to evaluate (stored in DEVICE_NUM)
#   $2  optional checkpoint directory (stored in ckpt_path)

# setting main path: the directory that contains this script (symlinks resolved).
# Everything is quoted so a checkout path containing whitespace still works.
MAIN_PATH=$(dirname "$(readlink -f "$0")")
echo "$MAIN_PATH"
DEVICE_NUM=$1
ckpt_path=$2

# set env
export DDK_VERSION_FLAG=1.60.T49.0.B201
export NEW_GE_FE_ID=1
export GE_AICPU_FLAG=1
export SOC_VERSION=Ascend910
export JOB_ID=10087
export FUSION_TENSOR_SIZE=1000000000
export RANK_ID=yolo
# Run eval.py once per device id (0 .. DEVICE_NUM-1), one after another.
# Per device: the checkpoint is read from <result dir>/<id>/training/ and the
# console output is captured in <result dir>/eval_<id>.out.
export RANK_SIZE="${DEVICE_NUM}"

# Every path below is relative to the result directory, so stop right away if
# it cannot be entered instead of reading/writing in the wrong place.
cd "${MAIN_PATH}/../result" || {
  echo "cannot cd to ${MAIN_PATH}/../result" >&2
  exit 1
}

for ((i = 0; i < DEVICE_NUM; i++)); do
  export DEVICE_ID=$i
  export DEVICE_INDEX=$i
  #su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[debug]\" --device "$RANK_ID
  if [ -z "${ckpt_path}" ]; then
    # No checkpoint directory given: take the most recently modified entry of
    # the result directory whose name matches the pattern below.
    # NOTE(review): as an extended regex "Train*" matches any name containing
    # "Trai"; kept as-is, tighten it if only a "Train" prefix is intended.
    lastresult=$(ls -t | grep -E "Train*" | head -n 1)
    if [ -z "${lastresult}" ]; then
      # Without this guard the restore path would become "/<id>/training/"
      # and the output would be redirected to "/eval_<id>.out".
      echo "no training result found in ${MAIN_PATH}/../result" >&2
      exit 1
    fi
  else
    lastresult=${ckpt_path}
  fi
  RESTORE_PATH=${lastresult}/${i}/training/
  echo "${RESTORE_PATH}"
  python3.7 "${MAIN_PATH}/../code/eval.py" \
    --save_json True \
    --score_thresh 0.0001 \
    --nms_thresh 0.55 \
    --max_boxes 100 \
    --restore_path "${RESTORE_PATH}" \
    --max_test 10000 \
    --save_json_path "eval_res_D${DEVICE_NUM}.json" > "${lastresult}/eval_${i}.out" 2>&1
done
@@ -0,0 +1,77 @@
#!/bin/bash
# Launcher: reads the run configuration and prepares the environment.
# Arguments:
#   $1  rank size
#   $2  path of the yaml configuration file
#   $3  directory that contains get_params_for_yaml.sh
#   $4  cluster flag   (only read when /.dockerenv exists)
#   $5  list of node IPs (only read when /.dockerenv exists)
rank_size=$1
yamlPath=$2
toolsPath=$3
if [ -f /.dockerenv ]; then
  MPIRUN_ALL_IP="$5"
  export CLUSTER="$4"
fi
# Parent directory of the directory holding this script
currentDir=$(cd "$(dirname "$0")/.."; pwd)
# Get the configuration from the yaml file: the helper's output is evaluated
# as shell code (presumably variable assignments such as runmode, mode and
# batch_size, which later code reads), so only use trusted yaml files.
eval $("${toolsPath}/get_params_for_yaml.sh" "${yamlPath}" "tensorflow_config")
source "${currentDir}/config/npu_set_env.sh"
# Every run mode except "evaluate" gets a fresh, timestamped job directory.
if [ x"$runmode" != x"evaluate" ]; then
  currtime=$(date +%Y%m%d%H%M%S)
  # ${currentDir%train*} drops everything from the last "train" in the path,
  # so the job directory is rooted next to that "train" component.
  train_job_dir=${currentDir%train*}/train/result/tf_yolov3/training_job_${currtime}/
  mkdir -p "${train_job_dir}" || {
    echo "cannot create ${train_job_dir}" >&2
    exit 1
  }
  echo "[$(date +%Y%m%d-%H:%M:%S)] [INFO] ${train_job_dir} &"
fi
# Print the device ids to use for $1 ranks.
# Uses the configured list device_group_<$1>p when it is set; when it is empty,
# or when 8 or more ranks are requested, prints 0 .. $1-1 (one id per line).
resolve_device_group() {
  local ranks=$1
  local group_var="device_group_${ranks}p"
  local group=${!group_var}
  if [ -z "${group}" ] || [ "${ranks:-0}" -ge 8 ]; then
    group=$(seq 0 $((ranks - 1)))
  fi
  printf '%s\n' "${group}"
}

# Print the first id of the whitespace-separated device list $1.
# The whole first token is returned, so ids with several digits stay intact.
first_device_of() {
  local first _rest
  read -r first _rest <<< "$1"
  printf '%s\n' "${first}"
}

# Device list; if no devices are specified they are picked in order from rank_size
device_group=$(resolve_device_group "${rank_size}")
# First device id of the group: its directory holds the hw log that gets linked
# into the job directory
first_device_id=$(first_device_of "${device_group}")
# Write the hyper-parameters read from the yaml into code/args_<mode>.py, in place.
argsFilePath=${currentDir}/code/args_${mode}.py
#echo "argsFilePath is "${argsFilePath}
# The "0,/re/" address (GNU sed) ends at the first matching line, so only the
# first batch_size entry is rewritten; the empty pattern in s// reuses that regex.
sed -i "0,/batch_size.*$/s//batch_size\ = ${batch_size}/g" "${argsFilePath}"
sed -i "s/save_epoch.*$/save_epoch\ = ${save_epoch}/g" "${argsFilePath}"
sed -i "s/total_epoches =.*$/total_epoches\ = ${total_epoches}/g" "${argsFilePath}"
# Remove carriage returns from the file
sed -i 's/\r//g' "${argsFilePath}"
# Link the hw_yolov3.log of device directory $1 into the job directory.
# Does nothing when no job directory is set (evaluate mode never creates one):
# with an empty train_job_dir, ln would receive a single argument and leave a
# dangling "hw_yolov3.log" symlink in the current directory.
link_hw_log() {
  if [ -n "${train_job_dir}" ]; then
    ln -snf "${train_job_dir}/$1/hw_yolov3.log" "${train_job_dir}"
  fi
}

if [ x"${CLUSTER}" == x"True" ]; then
  # Cluster run: a single mpirun starts train.sh on the hosts in mpirun_ip.
  # ln hw log
  link_hw_log 0
  this_ip=$(hostname -I | awk '{print $1}')
  # Copy the yaml and the patched args file to every other node.
  # MPIRUN_ALL_IP is a whitespace-separated list and is split on purpose.
  for ip in $MPIRUN_ALL_IP; do
    if [ x"$ip" != x"$this_ip" ]; then
      scp "$yamlPath" "root@$ip:$yamlPath"
      scp "$argsFilePath" "root@$ip:$argsFilePath"
    fi
  done
  export PATH=$PATH:/usr/local/mpirun4.0/bin
  # The last argument ("True") tells train.sh to obtain rank and device id itself.
  mpirun -H "${mpirun_ip}" \
    --bind-to none -map-by slot \
    --allow-run-as-root \
    --mca btl_tcp_if_exclude lo,docker0,endvnic,virbr0,vethf40501b,docker_gwbridge,br-f42ac38052b4 \
    --prefix /usr/local/mpirun4.0/ \
    "${currentDir}/scripts/train.sh" 0 "$rank_size" "$yamlPath" "$currtime" "${toolsPath}" "${CLUSTER}"
elif [ "$runmode" == "train" ]; then
  # Single-node training: one background train.sh per device, ranks numbered from 0.
  link_hw_log "${first_device_id}"
  rank_id=0
  for device_id in $device_group; do
    #echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] start: train ${device_id} & " >> ${currentDir}/result/main.log
    "${currentDir}/scripts/train.sh" "$device_id" "$rank_size" "$yamlPath" "$currtime" "${toolsPath}" "$rank_id" &
    rank_id=$((rank_id + 1))
  done
else
  # Any other run mode: evaluate the checkpoints in ckpt_path.
  echo "[$(date +%Y%m%d-%H:%M:%S)] [INFO] ${ckpt_path} &"
  link_hw_log "${first_device_id}"
  bash "${currentDir}/scripts/eval.sh" "${rank_size}" "${ckpt_path}"
fi
# Block until all background train.sh processes have finished
wait
#echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] all train exit " >> ${currentDir}/result/main.log
@@ -0,0 +1,115 @@
#!/bin/bash
# Per-device training worker: argument parsing, job directory and configuration.
# Arguments:
#   $1  device id
#   $2  rank size
#   $3  path of the yaml configuration file (also exported as YAML_PATH)
#   $4  timestamp that names the job directory
#   $5  directory that contains get_params_for_yaml.sh
#   $6  rank id, or "True" when rank and device id are obtained at run time
#       (read further down in this script)
scriptDir=$(cd "$(dirname "$0")"; pwd)
mainDir=$(cd "$(dirname "$scriptDir")"; pwd)
device_id=$1
rank_size=$2
yamlPath=$3
currentDir=$(cd "$(dirname "$0")/.."; pwd)
currtime=$4
toolsPath=$5
export YAML_PATH=$3
# ${currentDir%train*} drops everything from the last "train" in the path, so
# the job directory is rooted next to that "train" component.
train_job_dir=${currentDir%train*}/train/result/tf_yolov3/training_job_${currtime}/
export train_job_dir
mkdir -p "${train_job_dir}" || {
  echo "cannot create ${train_job_dir}" >&2
  exit 1
}
# Get the configuration from the yaml file: the helper's output is evaluated
# as shell code, so only use trusted yaml files.
eval $("${toolsPath}/get_params_for_yaml.sh" "${yamlPath}" "tensorflow_config")
source "${currentDir}/config/npu_set_env.sh"
# Print "true" when $1 is exactly "True", otherwise "false".
# The argument is compared as one quoted string, so empty or multi-word values
# give "false" instead of a "[" syntax error.
profiling_flag() {
  if [ "$1" == "True" ]; then
    echo true
  else
    echo false
  fi
}

# Declare variables
# Name of the benchmark log file; must be "hw_" followed by the lowercase model name
export REMARK_LOG_FILE=hw_yolov3.log
# Add the path of the log-marking module
benchmark_log_path=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils
export PYTHONPATH=$PYTHONPATH:${benchmark_log_path}
# user env
export HCCL_CONNECT_TIMEOUT=600
export RANK_TABLE_FILE=${currentDir}/config/${rank_size}p.json
export RANK_SIZE=${rank_size}
export SLOG_PRINT_TO_STDOUT=0
export DEVICE_ID=${device_id}
# NOTE(review): RANK_ID is only assigned further down in this script, so at
# this point DEVICE_INDEX receives whatever RANK_ID the environment provided.
export DEVICE_INDEX=$RANK_ID
export JOB_ID=123678
export FUSION_TENSOR_SIZE=1000000000
# Profiling switches: the configuration uses "True", the exported value is true/false
PROFILING_MODE=$(profiling_flag "${profiling_mode}")
AICPU_PROFILING_MODE=$(profiling_flag "${aicpu_profiling_mode}")
export PROFILING_MODE AICPU_PROFILING_MODE
export PROFILING_OPTIONS=${profiling_options}
export FP_POINT=${fp_point}
export BP_POINT=${bp_point}
# Work from the job directory; the cluster branch below copies *.json files
# from it, so do not continue from some other directory if it is unavailable.
cd "${train_job_dir}" || {
  echo "cannot cd to ${train_job_dir}" >&2
  exit 1
}
curd_dir=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils/atlasboost
export PYTHONPATH=$PYTHONPATH:${curd_dir}
if [ x"$6" != x"True" ]; then
  # The caller passed the rank id directly
  rank_id=$6
  export RANK_ID=$6
else
  # Cluster run: ask atlasboost for the rank and take the device id from its output
  device_id_mo=$(python3.7 -c "import src.tensorflow.mpi_ops as atlasboost;atlasboost.init(); \
device_id = atlasboost.local_rank();cluster_device_id = str(device_id); \
atlasboost.set_device_id(device_id);print(atlasboost.rank())")
  # Unquoted on purpose: collapses all whitespace and newlines to single spaces
  device_id_mo=$(echo $device_id_mo)
  # The last word of the output is the rank printed by the python snippet
  rank_id=${device_id_mo##* }
  export RANK_ID=${rank_id}
  # The device id is the text between the last "deviceid = " and " phyid="
  device=${device_id_mo##*deviceid = }
  device_id=${device%% phyid=*}
  export DEVICE_ID=${device_id}
  # Use the json found in the job directory as this rank size's rank table
  cp "${train_job_dir}"/*.json "${currentDir}/config/${rank_size}p.json"
fi
# Earlier in this script DEVICE_INDEX is exported from RANK_ID before RANK_ID
# has been assigned; export it again now that the rank id is known.
export DEVICE_INDEX=${RANK_ID}
# Print the inclusive CPU range "first-last" for device $1 on a host with $2
# online CPUs shared evenly by 8 devices. Prints nothing when the host has
# fewer than 8 CPUs: the per-device share would be 0 and the range invalid.
cpu_range_for_device() {
  local dev=$1 total=$2
  local share=$((total / 8))
  if [ "${share}" -lt 1 ]; then
    return 0
  fi
  printf '%s-%s\n' "$((share * dev))" "$((share * dev + share - 1))"
}

#mkdir exec path
mkdir -p "${train_job_dir}/${device_id}"
cd "${train_job_dir}/${device_id}" || {
  echo "cannot cd to ${train_job_dir}/${device_id}" >&2
  exit 1
}
num_cpus=$(getconf _NPROCESSORS_ONLN)
cpu_range=$(cpu_range_for_device "${device_id}" "${num_cpus}")
# Pin the training process to this device's CPU slice when one exists;
# otherwise start it unpinned instead of failing on an invalid taskset range.
pin_cmd=()
if [ -n "${cpu_range}" ]; then
  pin_cmd=(taskset -c "${cpu_range}")
fi
startTime=$(date +%Y%m%d-%H:%M:%S)
startTime_s=$(date +%s)
train_log=${train_job_dir}/train_${device_id}.log
"${pin_cmd[@]}" python3.7 "$mainDir/code/train.py" --mode "$mode" > "${train_log}" 2>&1
train_rc=$?
# Report the outcome on stdout, in the training log and in the benchmark log
if [ "${train_rc}" -eq 0 ]; then
  train_status="success"
else
  train_status="failed"
fi
status_line=":::ABK 1.0.0 yolov3 train ${train_status}"
echo "${status_line}"
echo "${status_line}" >> "${train_log}"
echo "${status_line}" >> "${train_job_dir}/${device_id}/hw_yolov3.log"
# Work out how long the training command ran and record it as h:m:s
# (not zero-padded) on stdout and in this device's benchmark log.
endTime=$(date +%Y%m%d-%H:%M:%S)
endTime_s=$(date +%s)
sumTime=$(( $endTime_s - $startTime_s ))
hour=$(( $sumTime / 3600 ))
min=$(( ($sumTime % 3600) / 60 ))
sec=$(( $sumTime % 60 ))
elapsed=${hour}:${min}:${sec}
echo ${elapsed}
echo ":::ABK 1.0.0 yolov3 train total time ${elapsed}" >> ${train_job_dir}/${device_id}/hw_yolov3.log