[add]上传训练benchmark by z00560161

This commit is contained in:
liang_chaoming@huawei.com
2020-10-19 20:22:23 +08:00
parent 22b83024f5
commit 82522e2f61
1225 changed files with 345421 additions and 0 deletions
@@ -0,0 +1,22 @@
# Ascend NPU environment for a single-device DenseNet-121 evaluation run,
# followed by the evaluation command itself.

# Root of the Ascend installation.
export ASCEND_HOME=/usr/local/Ascend

# Shared-library search path. It is rebuilt from scratch, so any value
# inherited from the caller is discarded.
LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/
LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64
LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/Ascend/driver/lib64/common/
LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/Ascend/driver/lib64/driver/
LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/Ascend/add-ons/
export LD_LIBRARY_PATH

# Python module search path: toolkit packages are appended to the inherited
# value. currentDir is not assigned in this script; when it is empty the path
# simply ends with an empty entry.
PYTHONPATH=${PYTHONPATH}:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe
PYTHONPATH=${PYTHONPATH}:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/te
PYTHONPATH=${PYTHONPATH}:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/topi
PYTHONPATH=${PYTHONPATH}:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/hccl
PYTHONPATH=${PYTHONPATH}:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages
PYTHONPATH=${PYTHONPATH}:${currentDir}
export PYTHONPATH

# Make the ccec compiler binaries reachable.
PATH=${PATH}:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
export PATH

export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
export SLOG_PRINT_TO_STDOUT=0

# Run adc as HwHiAiUser against device 7.
# NOTE(review): presumably this sets the device log level to "error" - confirm.
su HwHiAiUser -c 'adc --host 0.0.0.0:22118 --log "SetLogLevel(0)[error]" --device 7'

export TASK_QUEUE_ENABLE=0

# Evaluate checkpoint.pth.tar on NPU 7, pinned to CPU cores 111-150.
taskset -c 111-150 python3 densenet121_1p_main.py \
  --workers 40 --arch densenet121 --npu 7 \
  --lr 0.1 --momentum 0.9 --amp \
  --batch-size 256 --epoch 90 \
  --evaluate --resume checkpoint.pth.tar \
  --data /opt/npu/dataset/imagenet
@@ -0,0 +1,62 @@
#!/bin/bash
# DenseNet-121 training launcher: argument and location setup.
#
# Positional arguments:
#   $1 rank_size  - number of cards; selects the device group and is passed
#                   on to scripts/train.sh
#   $2 yamlPath   - YAML file handed to get_params_for_yaml.sh
#   $3 toolsPath  - directory that contains get_params_for_yaml.sh
#   $4 CLUSTER    - read only when /.dockerenv exists; exported
#   $5 MPIRUN_ALL_IP - read only when /.dockerenv exists; whitespace-separated
#                   IP list iterated over in cluster mode
rank_size=$1
yamlPath=$2
toolsPath=$3

# Parent directory of the directory holding this script.
currentDir=$(cd "$(dirname "$0")/.."; pwd)

# Name of the directory one level above currentDir. Quoted so that a path
# containing whitespace does not break the cd.
model_name=$(cd "${currentDir}/.."; basename "$(pwd)")

# The cluster arguments are only taken from the command line when running
# inside a container (marked by /.dockerenv).
if [[ -f /.dockerenv ]]; then
  CLUSTER=$4
  MPIRUN_ALL_IP="$5"
  export CLUSTER
fi
# Load the "pytorch_config" settings by evaluating the output of
# get_params_for_yaml.sh in this shell.
# NOTE(review): the substitution is intentionally left unquoted so the helper's
# output is evaluated exactly as before; confirm its output format before
# changing this.
eval $("${toolsPath}/get_params_for_yaml.sh" "${yamlPath}" "pytorch_config")

# Remove logs left over from earlier runs.
rm -rf /var/log/npu/slog/host-0/*
# Guarded so that an empty currentDir can never turn this into /result/*.log.
if [[ -n "${currentDir}" ]]; then
  rm -rf "${currentDir}"/result/*.log
fi

# Create the directory for this training job. ${currentDir%train*} drops
# everything from the last occurrence of "train" in currentDir onwards.
currtime=$(date +%Y%m%d%H%M%S)
train_job_dir=${currentDir%train*}/train/result/pt_densenet121/training_job_${currtime}/
mkdir -p "${train_job_dir}"
export train_job_dir
echo "[$(date +%Y%m%d-%H:%M:%S)] [INFO] ${train_job_dir} &"
#######################################
# Print the first device id of a device list.
# Ids may be separated by spaces, tabs, newlines or commas; the whole first
# id is returned, so multi-digit ids such as 10 are not truncated.
# Arguments: $1 - device list (may be empty)
# Outputs:   the first id, or an empty line when the list is empty
#######################################
first_device_of() {
  local normalized
  local -a ids=()
  normalized=$(printf '%s' "$1" | tr ',\t\n' '   ')
  read -r -a ids <<< "${normalized}"
  printf '%s\n' "${ids[0]:-}"
}

# Device list: taken from the variable device_group_<rank_size>p; when that is
# empty, or when rank_size is 8 or more, devices 0..rank_size-1 are used.
device_group=""
group_var="device_group_${rank_size}p"
# Indirect expansion replaces eval; the name is checked first so that an odd
# rank_size cannot produce an invalid variable name.
if [[ "${group_var}" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
  device_group=${!group_var}
fi
if [[ -z "${device_group}" ]] || [ "${rank_size}" -ge 8 ] 2>/dev/null; then
  if [[ "${rank_size}" =~ ^[0-9]+$ ]]; then
    # 10# forces base 10 so a value such as 08 is not read as octal.
    device_group=$(seq 0 $(( 10#${rank_size} - 1 )))
  else
    echo "[ERROR] rank_size '${rank_size}' is not a non-negative integer; device list left empty" >&2
    device_group=""
  fi
fi

# First device id in device_group; it names the sub-directory whose hw log is
# linked into the job directory by the launch step.
device_group_str=$(echo ${device_group} | sed 's/ //g')
first_device_id=$(first_device_of "${device_group}")
# Start training: through mpirun when CLUSTER is "True", otherwise as a single
# background train.sh process on this host.
job_dir=${currentDir%train*}/train/result/pt_densenet121/training_job_${currtime}
if [[ "${CLUSTER}" == "True" ]]; then
  this_ip=$(hostname -I | awk '{print $1}')
  # Expose the hw log of sub-directory 0 at the top of the job directory.
  ln -snf "${job_dir}/0/hw_densenet121.log" "${job_dir}/"
  # Copy the YAML and JSON files to every other node. MPIRUN_ALL_IP is left
  # unquoted on purpose: it is a whitespace-separated list.
  # NOTE(review): jsonFilePath and mpirun_ip are not assigned in this script;
  # presumably they come from the YAML settings - confirm.
  for ip in $MPIRUN_ALL_IP; do
    if [[ "${ip}" != "${this_ip}" ]]; then
      scp "${yamlPath}" "root@${ip}:${yamlPath}"
      scp "${jsonFilePath}" "root@${ip}:${jsonFilePath}"
    fi
  done
  export PATH=$PATH:/usr/local/mpirun4.0/bin
  # Every continuation has a space before the backslash so that an option can
  # never fuse with the one on the following line.
  mpirun -H "${mpirun_ip}" \
    --bind-to none -map-by slot \
    --allow-run-as-root \
    --mca btl_tcp_if_exclude lo,docker0,endvnic,virbr0,vethf40501b,docker_gwbridge,br-f42ac38052b4 \
    --prefix /usr/local/mpirun4.0/ \
    "${currentDir}/scripts/train.sh" 0 "${rank_size}" "${yamlPath}" "${currtime}" "${toolsPath}" "${CLUSTER}"
else
  rank_id=0
  # A single train.sh is launched; the per-device loop over device_group that
  # once incremented rank_id is disabled.
  # NOTE(review): the link targets ${first_device_id}/ while train.sh is always
  # given 0 as its first argument - confirm these agree when the first device
  # is not 0.
  ln -snf "${job_dir}/${first_device_id}/hw_densenet121.log" "${job_dir}/"
  # Arguments are quoted so that an empty value cannot shift the positional
  # parameters seen by train.sh.
  "${currentDir}/scripts/train.sh" 0 "${rank_size}" "${yamlPath}" "${currtime}" "${toolsPath}" "${rank_id}" &
fi
# Block until the background train.sh (if any) has finished.
wait
@@ -0,0 +1,141 @@
#!/usr/bin/env bash
# Per-process DenseNet-121 training script: argument and environment setup.
#
# Positional arguments read here:
#   $1 device_id  - exported as DEVICE_ID
#   $2 rank_size  - exported as RANK_SIZE; also selects config/<rank_size>p.json
#   $3 yamlPath   - YAML file handed to get_params_for_yaml.sh; exported as
#                   YAML_PATH
#   $4 currtime   - suffix of the training_job_<currtime> directory
#   $5 toolsPath  - directory that contains get_params_for_yaml.sh
device_id=$1
rank_size=$2
yamlPath=$3
# Parent directory of the directory holding this script.
currentDir=$(cd "$(dirname "$0")/.."; pwd)
currtime=$4
toolsPath=$5
export YAML_PATH="$3"

# Job directory. ${currentDir%train*} drops everything from the last
# occurrence of "train" in currentDir onwards.
train_job_dir=${currentDir%train*}/train/result/pt_densenet121/training_job_${currtime}/
mkdir -p "${train_job_dir}"
export train_job_dir

# Load the "pytorch_config" settings by evaluating the helper's output here.
# NOTE(review): the substitution is intentionally left unquoted so the output
# is evaluated exactly as before; confirm its format before changing this.
eval $("${toolsPath}/get_params_for_yaml.sh" "${yamlPath}" "pytorch_config")

# Name of the remark log file; it must be "hw_" followed by the lowercase
# model name.
export REMARK_LOG_FILE=hw_densenet121.log

# Make the benchmark utils importable from Python.
benchmark_log_path=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils
export PYTHONPATH=$PYTHONPATH:${benchmark_log_path}

#source ${currentDir}/config/npu_set_env.sh
source "${currentDir}/config/set_env_b023.sh"

# user env
export HCCL_CONNECT_TIMEOUT=600
export JOB_ID=9999001
export HCCL_RANK_TABLE_PATH=${currentDir}/config/${rank_size}p.json
export RANK_SIZE=${rank_size}
export SLOG_PRINT_TO_STDOUT=0
export DEVICE_ID=${device_id}
# RANK_INDEX is not assigned in this script; an unset or empty value counts as
# 0 in the arithmetic below.
DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 ))
export DEVICE_INDEX

# Continue from inside the job directory. A failure is reported instead of
# being silently ignored; the script still carries on as it did before.
cd "${train_job_dir}" \
  || echo "[WARN] cannot cd to ${train_job_dir}; continuing in $(pwd)" >&2

# Make the atlasboost package directory importable from Python.
curd_dir=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils/atlasboost
export PYTHONPATH=$PYTHONPATH:${curd_dir}
# Resolve this process's rank id and, in cluster mode, its device id.
# The sixth argument is either the literal string "True" (cluster mode) or the
# rank id itself.
if [ x"$6" != x"True" ];then
# Non-cluster run: the sixth argument is used verbatim as the rank id.
rank_id=$6
export RANK_ID=$6
else
# Cluster run: a python3.7 one-liner imports src.tensorflow.mpi_ops as
# atlasboost, calls init(), passes local_rank() to set_device_id() and prints
# rank(). Everything it writes to stdout is captured in device_id_mo.
device_id_mo=$(python3.7 -c "import src.tensorflow.mpi_ops as atlasboost;atlasboost.init(); \
device_id = atlasboost.local_rank();cluster_device_id = str(device_id); \
atlasboost.set_device_id(device_id);print(atlasboost.rank())")
# Re-echoing unquoted collapses newlines and runs of blanks into single spaces.
device_id_mo=`echo $device_id_mo`
# The rank id is the last space-separated word of the captured output.
rank_id=${device_id_mo##* }
export RANK_ID=${rank_id}
# The device id is the text between the last "deviceid = " and the following
# " phyid=".
# NOTE(review): presumably atlasboost prints a line of that shape; if the
# markers are absent, device_id ends up holding the whole output - confirm.
device=${device_id_mo##*deviceid = }
device_id=${device%% phyid=*}
export DEVICE_ID=${device_id}
# The pattern is stored as-is and only expands in the unquoted cp below.
# NOTE(review): cp receives every matching .json file as a source; this looks
# like it expects exactly one match in train_job_dir - confirm.
hccljson=${train_job_dir}/*.json
cp ${hccljson} ${currentDir}/config/${rank_size}p.json
fi
# Create the per-device working directory and run from inside it. Paths are
# quoted so that whitespace cannot split them.
mkdir -p "${train_job_dir}/${device_id}"
# A failed cd is reported rather than ignored, because later relative writes
# (./hw_densenet121.log) would otherwise land in another directory unnoticed.
cd "${train_job_dir}/${device_id}" \
  || echo "[WARN] cannot cd to ${train_job_dir}/${device_id}; continuing in $(pwd)" >&2

# Start of the run: human-readable timestamp and seconds since the epoch (the
# latter is used to compute the total run time at the end).
startTime=$(date +%Y%m%d-%H:%M:%S)
startTime_s=$(date +%s)
#######################################
# Emit the final ABK status marker for this run.
# The marker is printed to stdout and appended to the job-level training log
# and to hw_densenet121.log in the current directory.
# Globals:   train_job_dir (read), rank_size (read)
# Arguments: $1 - exit status of the training command (0 means success)
#######################################
report_train_status() {
  local verdict=failed
  if [[ "${1:-1}" -eq 0 ]]; then
    verdict=success
  fi
  local marker=":::ABK 1.0.0 densenet121 train ${verdict}"
  echo "${marker}"
  echo "${marker}" >> "${train_job_dir}/train_${rank_size}p.log"
  echo "${marker}" >> ./hw_densenet121.log
}

# Choose the launch command from the run mode and the number of cards.
if [[ "$6" == "True" ]]; then
  # Multi-node, multi-card run.
  export CLUSTER=True
fi

train_log="${train_job_dir}/train_${rank_size}p.log"
# Exit status of the training command, recorded explicitly so that no later
# statement can overwrite it before it is reported.
train_status=0

if [[ "${mode}" == "evaluate" ]]; then
  # Evaluation of checkpoint.pth.tar on NPU 7 with fixed hyper-parameters.
  taskset -c 111-150 python3.7 "${currentDir}/code/densenet121_1p_main.py" \
    --workers 40 \
    --arch densenet121 \
    --npu 7 \
    --lr 0.1 \
    --momentum 0.9 \
    --amp \
    --batch-size 256 \
    --epoch 90 \
    --evaluate \
    --resume checkpoint.pth.tar \
    --data "${data_url}" > "${train_log}" 2>&1 || train_status=$?
elif [[ "${rank_size}" == "1" ]]; then
  # Single card.
  #source ${currentDir}/config/set_env_b023.sh
  taskset -c 1-40 python3.7 "${currentDir}/code/densenet121_1p_main.py" \
    --workers 40 \
    --arch densenet121 \
    --npu "${device_single}" \
    --lr 0.1 \
    --momentum 0.9 \
    --amp \
    --batch-size "${batch_size}" \
    --epoch "${epoches}" \
    --data "${data_url}" > "${train_log}" 2>&1 || train_status=$?
elif [ "${rank_size}" -le 8 ] 2>/dev/null; then
  # Single node, multiple cards. A non-numeric rank_size makes the test fail
  # quietly and falls through to the error branch below.
  #source ${currentDir}/config/set_env_b023.sh
  # NOTE(review): device_group_multi is left unquoted because its format
  # (one token or several) is not visible here - confirm, then quote.
  python3.7 "${currentDir}/code/densenet121_8p_main.py" \
    "--addr=$(hostname -I | awk '{print $1}')" \
    --seed 49 \
    --workers 160 \
    --lr "${lr}" \
    --print-freq 1 \
    --eval-freq 5 \
    --arch densenet121 \
    --dist-url 'tcp://127.0.0.1:50000' \
    --dist-backend 'hccl' \
    --multiprocessing-distributed \
    --world-size 1 \
    --batch-size "${batch_size}" \
    --epochs "${epoches}" \
    --rank 0 \
    --amp \
    --benchmark 0 \
    --device-list ${device_group_multi} \
    --data "${data_url}" > "${train_log}" 2>&1 || train_status=$?
else
  # No command exists for this combination. Previously nothing ran, the status
  # stayed 0 and the run was reported as a success.
  unsupported_msg="[ERROR] no launch command for mode='${mode}' rank_size='${rank_size}'"
  echo "${unsupported_msg}" >&2
  echo "${unsupported_msg}" >> "${train_log}"
  train_status=1
fi
# Legacy invocation kept for reference:
#taskset -c 0-20 python3.7 ${currentDir}/code/densenet121.py > ./train.log 2>&1

report_train_status "${train_status}"
#######################################
# Format a duration given in seconds as H:M:S without zero padding.
# Arguments: $1 - duration in whole seconds
# Outputs:   "<hours>:<minutes>:<seconds>" on stdout
#######################################
format_elapsed() {
  local total=$(( ${1:-0} ))
  local h=$(( total / 3600 ))
  local m=$(( (total % 3600) / 60 ))
  local s=$(( total % 60 ))
  printf '%d:%d:%d\n' "${h}" "${m}" "${s}"
}

# End of the run: timestamp, elapsed seconds since startTime_s, and the
# total-time marker appended to the per-device hw log.
endTime=$(date +%Y%m%d-%H:%M:%S)
endTime_s=$(date +%s)
# $(( )) replaces the deprecated $[ ] arithmetic form.
sumTime=$(( endTime_s - startTime_s ))
IFS=: read -r hour min sec <<< "$(format_elapsed "${sumTime}")"
echo ":::ABK 1.0.0 densenet121 train total time ${hour}:${min}:${sec}" >> "${train_job_dir}/${device_id}/hw_densenet121.log"