#!/bin/bash # 0 $currtime $yamlPath 0 cluster ${toolsPath} device_id=$1 currtime=$2 yamlPath=$3 toolsPath=$6 rank_size=$7 export YAML_PATH=$3 mainDir=$(cd "$(dirname "$0")/.."; pwd) mkdir -p ${mainDir%train*}/train/result/tf_bert_large/training_job_${currtime}/ export train_job_dir=${mainDir%train*}/train/result/tf_bert_large/training_job_${currtime}/ #exec_path=${train_job_dir} cd ${train_job_dir} export utilDir=$(cd "$(dirname "$yamlPath")/../atlas_benchmark-master/utils"; pwd) export utilDir=$(cd "$(dirname "$yamlPath")/../atlas_benchmark-master/utils/atlasboost"; pwd) source ${mainDir}/config/npu_set_env.sh # 从 yaml 获取配置 eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config") # 声明变量 export REMARK_LOG_FILE=hw_bert.log # 打点日志文件名称, 必须hw_后跟模型名称小写 # 添加日志打点模块路径 benchmark_log_path=${mainDir%atlas_benchmark-master*}/atlas_benchmark-master/utils export PYTHONPATH=$PYTHONPATH:${benchmark_log_path} export JOB_ID=9999001 export RANK_TABLE_FILE=${mainDir}/config/${rank_size}p.json export RANK_SIZE=${rank_size} export SLOG_PRINT_TO_STDOUT=0 export DEVICE_ID=${device_id} export DEVICE_INDEX=$DEVICE_ID export RANK_INDEX=0 if [ ${PROFILING_MODE} == True ]; then export PROFILING_MODE=true else export PROFILING_MODE=false fi if [ ${PROFILING_MODE} == True ]; then export AICPU_PROFILING_MODE=true else export AICPU_PROFILING_MODE=false fi export PROFILING_OPTIONS=${PROFILING_OPTIONS} export FP_POINT=${FP_POINT} export BP_POINT=${BP_POINT} if [ x"${device_id}" = x ] ; then echo "turing train fail" >> ${exec_path}/train_${device_id}.log exit else export DEVICE_ID=${device_id} fi env > ${currentDir}/env_${device_id}.log cd ${train_job_dir} if [ x"$5" != x"True" ];then rank_id=$4 export RANK_ID=$4 else device_id_mo=$(python3.7 -c "import src.tensorflow.mpi_ops as atlasboost;atlasboost.init(); \ device_id = atlasboost.local_rank();cluster_device_id = str(device_id); \ atlasboost.set_device_id(device_id);print(atlasboost.rank())") device_id_mo=`echo $device_id_mo` rank_id=${device_id_mo##* } #echo rank_id is $rank_id export RANK_ID=${rank_id} device=${device_id_mo##*deviceid = } device_id=${device%% phyid=*} export DEVICE_ID=${device_id} #echo device_id is $device_id hccljson=${train_job_dir}/*.json cp ${hccljson} ${mainDir}/config/${rank_size}p.json fi env > ${currentDir}/env_${device_id}.log #mkdir exec path mkdir -p ${train_job_dir}/${device_id}/ckpt${DEVICE_ID} cd ${train_job_dir}/${device_id} startTime=`date +%Y%m%d-%H:%M:%S` startTime_s=`date +%s` #start exec python3.7 ${mainDir}/code/bert-Nv/run_pretraining.py \ --bert_config_file=${mainDir}/config/${bert_config_file} \ --max_seq_length=${max_seq_length} \ --max_predictions_per_seq=${max_predictions_per_seq} \ --train_batch_size=${train_batch_size} \ --learning_rate=${learning_rate} \ --num_warmup_steps=${num_warmup_steps} \ --num_train_steps=${num_train_steps} \ --optimizer_type=${optimizer_type} \ --manual_fp16=${manual_fp16} \ --use_fp16_cls=${use_fp16_cls} \ --input_files_dir=${input_files_dir} \ --eval_files_dir=${eval_files_dir} \ --do_train=${do_train} \ --do_eval=${do_eval} \ --num_accumulation_steps=${num_accumulation_steps} \ --npu_bert_job_start_file=None \ --iterations_per_loop=${iterations_per_loop} \ --npu_bert_loss_scale=${npu_bert_loss_scale} \ --distributed=${distributed} \ --graph_memory_max_size=${graph_memory_max_size} \ --variable_memory_max_size=${variable_memory_max_size} \ --npu_bert_tail_optimize=${npu_bert_tail_optimize} \ --save_checkpoints_steps=${save_checkpoints_steps} \ --npu_bert_clip_by_global_norm=${npu_bert_clip_by_global_norm} \ --output_dir=${train_job_dir}/${device_id}/ckpt${DEVICE_ID} > ${train_job_dir}/train_${device_id}.log 2>&1 if [ $? -eq 0 ] ;then echo ":::ABK 1.0.0 bert train success" echo ":::ABK 1.0.0 bert train success" >> ${train_job_dir}/train_${device_id}.log echo ":::ABK 1.0.0 bert train success" >> ${train_job_dir}/${device_id}/hw_bert.log else echo ":::ABK 1.0.0 bert train failed" echo ":::ABK 1.0.0 bert train failed" >> ${train_job_dir}/train_${device_id}.log echo ":::ABK 1.0.0 bert train failed" >> ${train_job_dir}/${device_id}/hw_bert.log fi endTime=`date +%Y%m%d-%H:%M:%S` endTime_s=`date +%s` sumTime=$[ $endTime_s - $startTime_s ] hour=$(( $sumTime/3600 )) min=$(( ($sumTime-${hour}*3600)/60 )) sec=$(( $sumTime-${hour}*3600-${min}*60 )) echo ":::ABK 1.0.0 bert train total time ${hour}:${min}:${sec}" echo ":::ABK 1.0.0 bert train total time ${hour}:${min}:${sec}" >> ${train_job_dir}/${device_id}/hw_bert.log