[add]上传训练benchmark by z00560161

This commit is contained in:
liang_chaoming@huawei.com
2020-10-19 20:22:23 +08:00
parent 22b83024f5
commit 82522e2f61
1225 changed files with 345421 additions and 0 deletions
@@ -0,0 +1,67 @@
#!/bin/bash
rank_size=$1
yamlPath=$2
toolsPath=$3
if [ -f /.dockerenv ];then
CLUSTER=$4
MPIRUN_ALL_IP="$5"
export CLUSTER=${CLUSTER}
fi
currentDir=$(cd "$(dirname "$0")/.."; pwd)
currtime=`date +%Y%m%d%H%M%S`
mkdir -p ${currentDir%train*}/train/result/tf_bert_base/training_job_${currtime}/
train_job_dir=${currentDir%train*}/train/result/tf_bert_base/training_job_${currtime}/
echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] see more config info in ${currentDir}/config"
echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] train result in ${train_job_dir}"
# 从 yaml 获取配置
eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config")
# device 列表, 若无指定 device 根据 rank_size 顺序选择
eval device_group=\$device_group_${rank_size}p
if [ x"${device_group}" == x"" ] || [ ${rank_size} -ge 8 ];then
device_group="$(seq 0 "$(expr $rank_size - 1)")"
fi
# get last device id in device_group, hw log in performance from the dir named first_device_id
device_group_str=`echo ${device_group} | sed 's/ //g'`
first_device_id=`echo ${device_group_str: 0:1}`
# user env
export JOB_ID=9999001
export RANK_TABLE_FILE=${currentDir}/config/${rank_size}p.json
export RANK_SIZE=${rank_size}
export SLOG_PRINT_TO_STDOUT=0
export DEVICE_ID=0
export DEVICE_INDEX=$DEVICE_ID
if [ x"${CLUSTER}" == x"True" ];then
# ln hw log
ln -snf ${train_job_dir}/0/hw_bert.log ${train_job_dir}
this_ip=$(hostname -I |awk '{print $1}')
for ip in $MPIRUN_ALL_IP;do
if [ x"$ip" != x"$this_ip" ];then
scp $yamlPath root@$ip:$yamlPath
fi
done
export PATH=$PATH:/usr/local/mpirun4.0/bin
mpirun -H ${mpirun_ip} \
--bind-to none -map-by slot\
--allow-run-as-root \
--mca btl_tcp_if_exclude lo,docker0,endvnic,virbr0,vethf40501b,docker_gwbridge,br-f42ac38052b4\
--prefix /usr/local/mpirun4.0/ \
${currentDir}/scripts/train.sh 0 $currtime $yamlPath 0 True ${toolsPath} ${rank_size}
else
# ln hw log
ln -snf ${train_job_dir}/${first_device_id}/hw_bert.log ${train_job_dir}
rank_id=0
for device_id in ${device_group};do
#echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] start: train ${device_id} & " >> ./main.log
${currentDir}/scripts/train.sh $device_id $currtime $yamlPath $rank_id solo ${toolsPath} ${rank_size} &
let rank_id++
done
fi
wait
@@ -0,0 +1,157 @@
#!/bin/bash
# 0 $currtime $yamlPath 0 cluster ${toolsPath}
device_id=$1
currtime=$2
yamlPath=$3
toolsPath=$6
rank_size=$7
export YAML_PATH=$3
mainDir=$(cd "$(dirname "$0")/.."; pwd)
mkdir -p ${mainDir%train*}/train/result/tf_bert_base/training_job_${currtime}/
export train_job_dir=${mainDir%train*}/train/result/tf_bert_base/training_job_${currtime}/
#exec_path=${train_job_dir}
cd ${train_job_dir}
export utilDir=$(cd "$(dirname "$yamlPath")/../atlas_benchmark-master/utils"; pwd)
export utilDir=$(cd "$(dirname "$yamlPath")/../atlas_benchmark-master/utils/atlasboost"; pwd)
source ${mainDir}/config/npu_set_env.sh
# 从 yaml 获取配置
eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config")
# 声明变量
export REMARK_LOG_FILE=hw_bert.log # 打点日志文件名称, 必须hw_后跟模型名称小写
# 添加日志打点模块路径
benchmark_log_path=${mainDir%atlas_benchmark-master*}/atlas_benchmark-master/utils
export PYTHONPATH=$PYTHONPATH:${benchmark_log_path}
export JOB_ID=9999001
export RANK_TABLE_FILE=${mainDir}/config/${rank_size}p.json
export RANK_SIZE=${rank_size}
export SLOG_PRINT_TO_STDOUT=0
export DEVICE_ID=${device_id}
export DEVICE_INDEX=$DEVICE_ID
export RANK_INDEX=0
export PROFILING_OPTIONS=${PROFILING_OPTIONS}
export FP_POINT=${FP_POINT}
export BP_POINT=${BP_POINT}
if [ ${PROFILING_MODE} == True ];
then
export PROFILING_MODE=true
else
export PROFILING_MODE=false
fi
if [ ${PROFILING_MODE} == True ];
then
export AICPU_PROFILING_MODE=true
else
export AICPU_PROFILING_MODE=false
fi
if [ x"${device_id}" = x ] ;
then
echo "turing train fail" >> ${exec_path}/train_${device_id}.log
exit
else
export DEVICE_ID=${device_id}
fi
env > ${currentDir}/env_${device_id}.log
cd ${train_job_dir}
if [ x"$5" != x"True" ];then
rank_id=$4
export RANK_ID=$4
else
device_id_mo=$(python3.7 -c "import src.tensorflow.mpi_ops as atlasboost;atlasboost.init(); \
device_id = atlasboost.local_rank();cluster_device_id = str(device_id); \
atlasboost.set_device_id(device_id);print(atlasboost.rank())")
device_id_mo=`echo $device_id_mo`
rank_id=${device_id_mo##* }
#echo rank_id is $rank_id
export RANK_ID=${rank_id}
device=${device_id_mo##*deviceid = }
device_id=${device%% phyid=*}
export DEVICE_ID=${device_id}
#echo device_id is $device_id
hccljson=${train_job_dir}/*.json
cp ${hccljson} ${mainDir}/config/${rank_size}p.json
fi
env > ${currentDir}/env_${device_id}.log
#mkdir exec path
mkdir -p ${train_job_dir}/${device_id}/ckpt${DEVICE_ID}
cd ${train_job_dir}/${device_id}
startTime=`date +%Y%m%d-%H:%M:%S`
startTime_s=`date +%s`
#start exec
python3.7 ${mainDir}/code/pretrain/run_pretraining.py \
--bert_config_file=${mainDir}/config/${bert_config_file} \
--max_seq_length=${max_seq_length} \
--max_predictions_per_seq=${max_predictions_per_seq} \
--train_batch_size=${train_batch_size} \
--learning_rate=${learning_rate} \
--num_warmup_steps=${num_warmup_steps} \
--num_train_steps=${num_train_steps} \
--optimizer_type=${optimizer_type} \
--manual_fp16=${manual_fp16} \
--use_fp16_cls=${use_fp16_cls} \
--input_files_dir=${input_files_dir} \
--eval_files_dir=${eval_files_dir} \
--npu_bert_debug=${npu_bert_debug} \
--npu_bert_use_tdt=${npu_bert_use_tdt} \
--do_train=${do_train} \
--do_eval=${do_eval} \
--num_accumulation_steps=${num_accumulation_steps} \
--npu_bert_job_start_file=None \
--iterations_per_loop=${iterations_per_loop} \
--npu_bert_loss_scale=${npu_bert_loss_scale} \
--distributed=${distributed} \
--save_checkpoints_steps=${save_checkpoints_steps} \
--npu_bert_clip_by_global_norm=${npu_bert_clip_by_global_norm} \
--output_dir=${train_job_dir}/${device_id}/ckpt${DEVICE_ID} > ${train_job_dir}/train_${device_id}.log 2>&1
if [ $? -eq 0 ] ;then
echo ":::ABK 1.0.0 bert train success"
echo ":::ABK 1.0.0 bert train success" >> ${train_job_dir}/train_${device_id}.log
echo ":::ABK 1.0.0 bert train success" >> ${train_job_dir}/${device_id}/hw_bert.log
else
echo ":::ABK 1.0.0 bert train failed"
echo ":::ABK 1.0.0 bert train failed" >> ${train_job_dir}/train_${device_id}.log
echo ":::ABK 1.0.0 bert train failed" >> ${train_job_dir}/${device_id}/hw_bert.log
fi
endTime=`date +%Y%m%d-%H:%M:%S`
endTime_s=`date +%s`
sumTime=$[ $endTime_s - $startTime_s ]
hour=$(( $sumTime/3600 ))
min=$(( ($sumTime-${hour}*3600)/60 ))
sec=$(( $sumTime-${hour}*3600-${min}*60 ))
echo ":::ABK 1.0.0 bert train total time ${hour}:${min}:${sec}"
echo ":::ABK 1.0.0 bert train total time ${hour}:${min}:${sec}" >> ${train_job_dir}/${device_id}/hw_bert.log
#if [ x"$5" == x"solo" ];
#then
# /bin/cp -f hw_bert.log $perfDir/hw_bert.log
#fi