[add]上传训练benchmark by z00560161
This commit is contained in:
@@ -0,0 +1,182 @@
|
||||
{
|
||||
"board_id" : "0x002F",
|
||||
"chip_info" : "910",
|
||||
"deploy_mode" : "lab",
|
||||
"group_count" : "1",
|
||||
"group_list" : [
|
||||
{
|
||||
"device_num" : "16",
|
||||
"server_num" : "2",
|
||||
"group_name" : "",
|
||||
"instance_count" : "16",
|
||||
"instance_list" : [
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "0",
|
||||
"device_ip" : "192.168.104.101"
|
||||
}
|
||||
],
|
||||
"rank_id" : "0",
|
||||
"server_id" : "90.90.176.104"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "1",
|
||||
"device_ip" : "192.168.105.101"
|
||||
}
|
||||
],
|
||||
"rank_id" : "1",
|
||||
"server_id" : "90.90.176.104"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "2",
|
||||
"device_ip" : "192.168.106.101"
|
||||
}
|
||||
],
|
||||
"rank_id" : "2",
|
||||
"server_id" : "90.90.176.104"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "3",
|
||||
"device_ip" : "192.168.107.101"
|
||||
}
|
||||
],
|
||||
"rank_id" : "3",
|
||||
"server_id" : "90.90.176.104"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "4",
|
||||
"device_ip" : "192.168.108.100"
|
||||
}
|
||||
],
|
||||
"rank_id" : "4",
|
||||
"server_id" : "90.90.176.104"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "5",
|
||||
"device_ip" : "192.168.109.100"
|
||||
}
|
||||
],
|
||||
"rank_id" : "5",
|
||||
"server_id" : "90.90.176.104"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "6",
|
||||
"device_ip" : "192.168.110.100"
|
||||
}
|
||||
],
|
||||
"rank_id" : "6",
|
||||
"server_id" : "90.90.176.104"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "7",
|
||||
"device_ip" : "192.168.111.100"
|
||||
}
|
||||
],
|
||||
"rank_id" : "7",
|
||||
"server_id" : "90.90.176.104"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "0",
|
||||
"device_ip" : "192.168.100.101"
|
||||
}
|
||||
],
|
||||
"rank_id" : "8",
|
||||
"server_id" : "90.90.176.102"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "1",
|
||||
"device_ip" : "192.168.101.101"
|
||||
}
|
||||
],
|
||||
"rank_id" : "9",
|
||||
"server_id" : "90.90.176.102"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "2",
|
||||
"device_ip" : "192.168.102.101"
|
||||
}
|
||||
],
|
||||
"rank_id" : "10",
|
||||
"server_id" : "90.90.176.102"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "3",
|
||||
"device_ip" : "192.168.103.101"
|
||||
}
|
||||
],
|
||||
"rank_id" : "11",
|
||||
"server_id" : "90.90.176.102"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "4",
|
||||
"device_ip" : "192.168.100.100"
|
||||
}
|
||||
],
|
||||
"rank_id" : "12",
|
||||
"server_id" : "90.90.176.102"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "5",
|
||||
"device_ip" : "192.168.101.100"
|
||||
}
|
||||
],
|
||||
"rank_id" : "13",
|
||||
"server_id" : "90.90.176.102"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "6",
|
||||
"device_ip" : "192.168.102.100"
|
||||
}
|
||||
],
|
||||
"rank_id" : "14",
|
||||
"server_id" : "90.90.176.102"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "7",
|
||||
"device_ip" : "192.168.103.100"
|
||||
}
|
||||
],
|
||||
"rank_id" : "15",
|
||||
"server_id" : "90.90.176.102"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"para_plane_nic_location" : "device",
|
||||
"para_plane_nic_name" : [
|
||||
"eth0"
|
||||
],
|
||||
"para_plane_nic_num" : "1",
|
||||
"status" : "completed"
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"group_count": "1",
|
||||
"group_list": [
|
||||
{
|
||||
"group_name": "worker",
|
||||
"device_count": "1",
|
||||
"instance_count": "1",
|
||||
"instance_list": [{"devices":[{"device_id":"2","device_ip":"192.168.101.102"}],"pod_name":"npu1p","server_id":"127.0.0.1"}]
|
||||
}
|
||||
],
|
||||
"status": "completed"
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"group_count": "1",
|
||||
"group_list": [
|
||||
{
|
||||
"group_name": "worker",
|
||||
"device_count": "8",
|
||||
"instance_count": "1",
|
||||
"instance_list": [{"devices":[{"device_id":"0","device_ip":"192.168.190.102"},{"device_id":"1","device_ip":"192.168.191.102"},{"device_id":"2","device_ip":"192.168.192.102"},{"device_id":"3","device_ip":"192.168.193.102"},{"device_id":"4","device_ip":"192.168.190.103"},{"device_id":"5","device_ip":"192.168.191.103"},{"device_id":"6","device_ip":"192.168.192.103"},{"device_id":"7","device_ip":"192.168.193.103"}],"pod_name":"npu8p","server_id":"127.0.0.1"}]
|
||||
}
|
||||
],
|
||||
"status": "completed"
|
||||
}
|
||||
+48
@@ -0,0 +1,48 @@
|
||||
#!/bin/sh
# Job launcher: starts main.sh for the device ids given on the command line.
#
# Usage: <this script> DEVICE_ID...
#
# When exec_type is "docker" the environment is sourced in-process, slogd is
# started and the script polls until no train.sh process is left. Otherwise
# main.sh is run as user HwHiAiUser and the script waits for it.
#
# NOTE(review): "{MODE}" looks like a template placeholder that is presumably
# substituted before this script is executed -- confirm with the generator.
currentDir=$(cd "$(dirname "$0")"; pwd)
cd "${currentDir}" || exit 1

# Kept as one space-separated string; it is expanded unquoted below on purpose
# so that every device id becomes a separate argument again.
DEVICE_LIST=$@

export exec_type={MODE}

# Stop the slogd process when running in docker mode.
prog_exit()
{
    if [ x"${exec_type}" = xdocker ]; then
        # stop slogd process
        bash /usr/local/Ascend/driver/tools/docker_stop_post_sys.sh
    fi
}

# Register prog_exit. POSIX sh only specifies signal names without the "SIG"
# prefix, so TERM (not SIGTERM) is the portable spelling under #!/bin/sh.
trap prog_exit TERM

if [ x"${exec_type}" = xdocker ]; then
    # set env
    . "${currentDir}/npu_set_env.sh"

    # start slogd process
    mkdir -p /var/log/npu/slog/slogd
    /usr/local/Ascend/driver/tools/docker/slogd &

    # start main.sh
    "${currentDir}/main.sh" ${DEVICE_LIST} &

    # Poll every 5 seconds until no train.sh process is listed any more; each
    # round also appends a process snapshot to ps.log.
    flag=1
    while [ $flag -ne 0 ]; do
        sleep 5
        flag=$(ps -ef | grep train.sh | grep -v grep | wc -l)
        ps -ef >> "${currentDir}/ps.log"
        echo "" >> "${currentDir}/ps.log"
    done
else
    # Take the text after "=" from every line of npu_set_env.sh containing "RANK_ID=".
    RANK_ID=$(grep "RANK_ID=" "${currentDir}/npu_set_env.sh" | awk -F"=" '{print $2}')

    # start main.sh
    su - HwHiAiUser -c ". ${currentDir}/npu_set_env.sh;export PROFILING_DIR=/var/log/npu/profiling/container/${RANK_ID};${currentDir}/main.sh ${DEVICE_LIST}" &
    wait
fi
|
||||
|
||||
+6
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"server_count": "1",
|
||||
"server_list": [{"device":[{devices}],"server_id":"127.0.0.1"}],
|
||||
"status": "completed",
|
||||
"version": "1.0"
|
||||
}
|
||||
+18
@@ -0,0 +1,18 @@
|
||||
#!/bin/sh
# Start one background train.sh per device id passed on the command line,
# wait for all of them, and record start/finish lines in main.log.
currentDir=$(cd "$(dirname "$0")"; pwd)
cd ${currentDir}

device_group=$@
device_num=$#

# Append one timestamped INFO line to main.log.
log_info()
{
    echo "[$(date +%Y%m%d-%H:%M:%S)] [INFO] $1" >> ${currentDir}/main.log
}

touch ${currentDir}/main.log

for device_phy_id in ${device_group}
do
    log_info "start: train.sh ${device_phy_id} & "
    ${currentDir}/train.sh ${device_phy_id} &
done

# Block until every background train.sh has finished.
wait

log_info "all train.sh exit "
|
||||
+40
@@ -0,0 +1,40 @@
|
||||
# main env
# Environment file meant to be sourced (it has no shebang and only exports
# variables). The search paths below depend on which Ascend package directory
# exists: /usr/local/Ascend/nnae/latest, or otherwise the ascend-toolkit tree.
if [ -d /usr/local/Ascend/nnae/latest ];then

    # LD_LIBRARY_PATH is replaced (not extended); PYTHONPATH and PATH are appended to.
    export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/Ascend/driver/tools/hccn_tool/:/usr/local/mpirun4.0/lib
    export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages
    export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
    export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp
else
    # NOTE(review): $projectDir is not assigned in this file; presumably the
    # sourcing script sets it -- verify, otherwise an empty entry is appended.
    export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/mpirun4.0/lib
    export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest//fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$projectDir
    export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
    export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/

fi
export SOC_VERSION=Ascend910
export HCCL_CONNECT_TIMEOUT=600

# user env
# NOTE(review): the {NAME} values here and below look like template
# placeholders, presumably filled in before the file is sourced -- confirm.
export JOB_ID={JOB_ID}
export RANK_TABLE_FILE={RANK_TABLE_FILE}
#export RANK_SIZE={RANK_SIZE}
#export RANK_INDEX={RANK_INDEX}
#export RANK_ID={RANK_ID}

# profiling env
export PROFILING_MODE={PROFILING_MODE}
export AICPU_PROFILING_MODE={AICPU_PROFILING_MODE}
export PROFILING_OPTIONS={PROFILING_OPTIONS}
export FP_POINT={FP_POINT}
export BP_POINT={BP_POINT}

# debug env
#export DUMP_GE_GRAPH=2
#export DUMP_OP=1
#export DUMP_OP_LESS=1
#export PRINT_MODEL=1
#export TE_PARALLEL_COMPILER=0

# system env
# Remove the core-dump size limit for processes started from this shell.
ulimit -c unlimited
|
||||
+33
@@ -0,0 +1,33 @@
|
||||
#!/bin/sh
# Run one training process for a single device id.
#
# Usage: train.sh DEVICE_ID
#
# Output of the training command goes to train_<DEVICE_ID>.log next to this
# script, followed by a "turing train success" / "turing train fail" marker.
# The script exits with the status of the training command (1 when DEVICE_ID
# is missing) so that callers can detect failures.
#
# NOTE(review): {RUN_ALGORITHM_CMD} and {CHECKPOINT_DIR} look like template
# placeholders, presumably substituted before this script runs -- confirm.
currentDir=$(cd "$(dirname "$0")"; pwd)
cd "${currentDir}" || exit 1

PWD=${currentDir}

device_id=$1
if [ x"${device_id}" = x ]; then
    echo "turing train fail" >> "${currentDir}/train_${device_id}.log"
    # A bare `exit` would return the status of the echo above (0); report the
    # missing argument as a failure instead.
    exit 1
fi
export DEVICE_ID=${device_id}

# RANK_INDEX is read from the environment here; it is not set in this script.
DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 ))
export DEVICE_INDEX=${DEVICE_INDEX}

# Snapshot the environment the training process will see.
env > "${currentDir}/env_${device_id}.log"

# mkdir exec path: start from an empty per-device working directory.
work_dir=${currentDir}/${device_id}
mkdir -p "${work_dir}"
# ${work_dir:?} aborts instead of expanding to "/*" should the variable be empty.
rm -rf "${work_dir:?}"/*
cd "${work_dir}" || exit 1

# start exec
python3.7 {RUN_ALGORITHM_CMD} {CHECKPOINT_DIR} > "${currentDir}/train_${device_id}.log" 2>&1
train_status=$?
if [ ${train_status} -eq 0 ]; then
    echo "turing train success" >> "${currentDir}/train_${device_id}.log"
else
    echo "turing train fail" >> "${currentDir}/train_${device_id}.log"
fi
exit ${train_status}
|
||||
@@ -0,0 +1,60 @@
|
||||
#!/bin/bash
# Launcher for VGG16 training jobs.
#
# Arguments:
#   $1  rank_size  - number of devices to train on
#   $2  yamlPath   - yaml configuration file
#   $3  toolsPath  - directory containing get_params_for_yaml.sh
#   $4  CLUSTER    - only read inside docker (/.dockerenv exists); "True" selects the mpirun path
#   $5  MPIRUN_ALL_IP - only read inside docker; list of hosts the yaml file is copied to
#
# Creates a timestamped job directory under .../train/result/tf_vgg16/ and
# starts scripts/train.sh either through mpirun (cluster) or once per device
# in the background (single server), then waits for the children.

rank_size=$1
yamlPath=$2
toolsPath=$3
if [ -f /.dockerenv ];then
    CLUSTER=$4
    MPIRUN_ALL_IP="$5"
    export CLUSTER=${CLUSTER}
fi
currentDir=$(cd "$(dirname "$0")/.."; pwd)

# Load the configuration from the yaml file. The helper's output is eval'ed,
# so it defines shell variables in this script.
eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config")

# Timestamp that names this job's result directory.
currtime=`date +%Y%m%d%H%M%S`
mkdir -p ${currentDir%train*}/train/result/tf_vgg16/training_job_${currtime}/
train_job_dir=${currentDir%train*}/train/result/tf_vgg16/training_job_${currtime}/

echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] ${train_job_dir} &"
# Device list: when no device list is configured for this rank_size, or when
# rank_size is 8 or more, pick devices 0..rank_size-1 in order.
eval device_group=\$device_group_${rank_size}p
if [ x"${device_group}" == x"" ] || [ ${rank_size} -ge 8 ];then
    device_group="$(seq 0 "$(expr $rank_size - 1)")"
fi

# Take the first character of the device list (spaces removed) as the first
# device id; the hw log link below points into the directory named after it.
device_group_str=`echo ${device_group} | sed 's/ //g'`
first_device_id=`echo ${device_group_str: 0:1}`

rank_id=0

if [ x"${CLUSTER}" == x"True" ];then
    # ln hw log (cluster mode links the log from directory "0")
    ln -snf ${currentDir%train*}/train/result/tf_vgg16/training_job_${currtime}/0/hw_vgg16.log ${currentDir%train*}/train/result/tf_vgg16/training_job_${currtime}/
    this_ip=$(hostname -I |awk '{print $1}')
    # Copy the yaml file to every listed host except this one.
    for ip in $MPIRUN_ALL_IP;do
        if [ x"$this_ip" != x"$ip" ];then
            scp $yamlPath root@$ip:$yamlPath
        fi
    done
    export PATH=$PATH:/usr/local/mpirun4.0/bin
    # NOTE(review): mpirun_ip is not assigned in this script; presumably it is
    # defined by the yaml eval above -- confirm.
    mpirun -H ${mpirun_ip} \
        --bind-to none -map-by slot\
        --allow-run-as-root \
        --mca btl_tcp_if_exclude lo,docker0,endvnic,virbr0,vethf40501b,docker_gwbridge,br-f42ac38052b4\
        --prefix /usr/local/mpirun4.0/ \
        ${currentDir}/scripts/train.sh 0 $rank_size $yamlPath $currtime ${toolsPath} ${CLUSTER}
else
    # ln hw log
    ln -snf ${currentDir%train*}/train/result/tf_vgg16/training_job_${currtime}/${first_device_id}/hw_vgg16.log ${currentDir%train*}/train/result/tf_vgg16/training_job_${currtime}/
    # One background train.sh per device; rank ids are handed out in list order.
    for device_id in $device_group;do
        #echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] start: train ${device_id} & " >> ${currentDir}/result/main.log
        ${currentDir}/scripts/train.sh $device_id $rank_size $yamlPath $currtime ${toolsPath} $rank_id&
        let rank_id++
    done
fi
wait
|
||||
|
||||
|
||||
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env bash
# Per-device VGG16 training script.
#
# Arguments:
#   $1  device id
#   $2  rank_size
#   $3  yaml configuration file
#   $4  job timestamp (names the training_job_<timestamp> directory)
#   $5  directory containing get_params_for_yaml.sh
#   $6  "True" for cluster mode, otherwise the rank id of this process

device_id=$1
rank_size=$2
yamlPath=$3
currentDir=$(cd "$(dirname "$0")/.."; pwd)
currtime=$4
toolsPath=$5
mkdir -p ${currentDir%train*}/train/result/tf_vgg16/training_job_${currtime}/
export train_job_dir=${currentDir%train*}/train/result/tf_vgg16/training_job_${currtime}/

source ${currentDir}/config/npu_set_env.sh

# Declare variables
export REMARK_LOG_FILE=hw_vgg16.log # Name of the benchmark remark log file; must be "hw_" followed by the lowercase model name
# Add the path of the benchmark logging module
benchmark_log_path=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils
export PYTHONPATH=$PYTHONPATH:${benchmark_log_path}

# Load the configuration from the yaml file (the helper's output is eval'ed)
eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config")

# user env
export HCCL_CONNECT_TIMEOUT=600
export JOB_ID=9999001
export RANK_TABLE_FILE=${currentDir}/config/${rank_size}p.json
export RANK_SIZE=${rank_size}
export RANK_INDEX=0
export SLOG_PRINT_TO_STDOUT=0
export DEVICE_ID=$1
# With RANK_INDEX fixed to 0 above this evaluates to DEVICE_ID.
DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 ))
export DEVICE_INDEX=${DEVICE_INDEX}
export YAML_PATH=$3

cd ${train_job_dir}
curd_dir=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils/atlasboost
export PYTHONPATH=$PYTHONPATH:${curd_dir}

if [ x"$6" != x"True" ];then
    # Non-cluster: the sixth argument is the rank id.
    rank_id=$6
    export RANK_ID=$6
else
    # Cluster: ask atlasboost for the rank; the last whitespace-separated word
    # of the captured output is used as the rank id.
    device_id_mo=$(python3.7 -c "import src.tensorflow.mpi_ops as atlasboost;atlasboost.init(); \
        device_id = atlasboost.local_rank();cluster_device_id = str(device_id); \
        atlasboost.set_device_id(device_id);print(atlasboost.rank())")
    # Unquoted echo collapses the output onto one line with single spaces.
    device_id_mo=`echo $device_id_mo`
    rank_id=${device_id_mo##* }
    export RANK_ID=${rank_id}
    # The device id is the text between "deviceid = " and " phyid=" in that output.
    device=${device_id_mo##*deviceid = }
    device_id=${device%% phyid=*}
    export DEVICE_ID=${device_id}
    # Copy the json file(s) found in the job directory over the rank table path.
    hccljson=${train_job_dir}/*.json
    cp ${hccljson} ${currentDir}/config/${rank_size}p.json
fi

#mkdir exec path
mkdir -p ${train_job_dir}/${device_id}
cd ${train_job_dir}/${device_id}

# Start timestamps: human-readable and seconds since the epoch.
startTime=`date +%Y%m%d-%H:%M:%S`
startTime_s=`date +%s`
|
||||
|
||||
# Choose the training arguments according to single-card / multi-card mode,
# run the training and report the result.
#
# NOTE(review): variables such as mode_1p, max_train_steps or log_name_8p are
# not assigned directly in this script; presumably they come from the eval of
# get_params_for_yaml.sh earlier on -- confirm.

# Step-limited run with the "*_1p" settings (used when rank_size <= 4).
train_by_steps()
{
    # python3.7 ${currentDir}/vgg16/train.py --config_file vgg16_config_1p > ${train_job_dir}/train_${device_id}.log 2>&1
    python3.7 ${currentDir}/vgg16/train.py --rank_size=${rank_size} \
        --mode=${mode_1p} \
        --max_train_steps=${max_train_steps} \
        --iterations_per_loop=${iterations_per_loop_1p} \
        --data_dir=${data_url} \
        --display_every=${display_every} \
        --log_dir=${log_dir} \
        --log_name=${log_name_1p} > ${train_job_dir}/train_${device_id}.log 2>&1
}

# Epoch-limited run with the "*_8p" settings (used for larger rank_size).
train_by_epochs()
{
    # python3.7 ${currentDir}/vgg16/train.py --config_file vgg16_config_${rank_size}p > ${train_job_dir}/train_${device_id}.log 2>&1
    python3.7 ${currentDir}/vgg16/train.py --rank_size=${rank_size} \
        --mode=${mode_8p} \
        --max_epochs=${max_epoches} \
        --iterations_per_loop=${iterations_per_loop_8p} \
        --epochs_between_evals=${epochs_between_evals} \
        --data_dir=${data_url} \
        --lr=${lr} \
        --log_dir=${log_dir} \
        --log_name=${log_name_8p} > ${train_job_dir}/train_${device_id}.log 2>&1
}

# The status is captured right after the training command. Reading $? after
# the whole if/elif chain yields 0 when no branch ran, which used to report a
# rank_size handled by no branch as a success.
train_status=1
if [ x"$6" == x"True" ];then
    # cluster
    export CLUSTER=True
    rm -rf ${currentDir}/result/*.log
    if [ ${rank_size} -le 4 ];then
        train_by_steps
    else
        train_by_epochs
    fi
    train_status=$?
elif [ ${rank_size} -le 4 ];then
    # single card
    train_by_steps
    train_status=$?
elif [ ${rank_size} -le 8 ];then
    # multiple cards on a single server
    train_by_epochs
    train_status=$?
else
    echo "unsupported rank_size ${rank_size} without cluster mode, training not started" >> ${train_job_dir}/train_${device_id}.log
fi

#cp ./hw_vgg16.log ${currentDir}/../../../../performance/

if [ ${train_status} -eq 0 ] ;then
    echo ":::ABK 1.0.0 vgg16 train success"
    echo ":::ABK 1.0.0 vgg16 train success" >> ${train_job_dir}/train_${device_id}.log
    echo ":::ABK 1.0.0 vgg16 train success" >> ${train_job_dir}/${device_id}/hw_vgg16.log
else
    echo ":::ABK 1.0.0 vgg16 train failed"
    echo ":::ABK 1.0.0 vgg16 train failed" >> ${train_job_dir}/train_${device_id}.log
    echo ":::ABK 1.0.0 vgg16 train failed" >> ${train_job_dir}/${device_id}/hw_vgg16.log
fi
|
||||
|
||||
# Report the elapsed wall-clock time since startTime_s as hours:minutes:seconds.
endTime=$(date +%Y%m%d-%H:%M:%S)
endTime_s=$(date +%s)
sumTime=$(( $endTime_s - $startTime_s ))
hour=$(( $sumTime / 3600 ))
# Minutes and seconds are the remainders left after removing whole hours / minutes.
min=$(( ($sumTime % 3600) / 60 ))
sec=$(( $sumTime % 60 ))
elapsed="${hour}:${min}:${sec}"
echo ${elapsed}
echo ":::ABK 1.0.0 vgg16 train total time ${elapsed}" >> ${train_job_dir}/${device_id}/hw_vgg16.log
|
||||
+22
@@ -0,0 +1,22 @@
|
||||
import tensorflow as tf
|
||||
import os,sys
|
||||
|
||||
|
||||
class CreateSession():
    """Build the ``tf.ConfigProto`` stored in ``estimator_config`` and export TF tuning variables."""

    def __init__(self):
        session_config = tf.ConfigProto(
            inter_op_parallelism_threads=10,
            intra_op_parallelism_threads=10,
            allow_soft_placement=True)
        session_config.gpu_options.allow_growth = True
        self.estimator_config = session_config

        self.set_env()

    def set_env(self):
        """Write the four ``TF_*`` tuning variables into ``os.environ``."""
        tuning_env = {
            'TF_GPU_THREAD_MODE': 'gpu_private',
            'TF_GPU_THREAD_COUNT': str(2),
            'TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT': '1',
            'TF_ENABLE_WINOGRAD_NONFUSED': '1',
        }
        for name, value in tuning_env.items():
            os.environ[name] = value
|
||||
|
||||
+133
@@ -0,0 +1,133 @@
|
||||
import numpy as np
|
||||
import preprocessing
|
||||
import tensorflow as tf
|
||||
from tensorflow.python.util import nest
|
||||
import os,sys
|
||||
import numpy as np
|
||||
|
||||
|
||||
class DataLoader:
    """Count the train/validation records under ``args.data_dir`` and build input datasets.

    The record counts are stored both on the loader and on ``args``
    (``num_training_samples`` / ``num_evaluating_samples``).
    """

    def __init__(self, args):
        self.args = args

        # Shard files are matched as "<data_dir>/train-*" and "<data_dir>/validation-*".
        shard_pattern = os.path.join(args.data_dir, '%s-*')

        train_files = sorted(tf.gfile.Glob(shard_pattern % 'train'))
        self.num_training_samples = get_num_records(train_files)
        self.args.num_training_samples = self.num_training_samples

        eval_files = sorted(tf.gfile.Glob(shard_pattern % 'validation'))
        self.num_evaluating_samples = get_num_records(eval_files)
        self.args.num_evaluating_samples = self.num_evaluating_samples

        print('total num_training_sampels: %d' % self.num_training_samples)
        print('total num_evaluating_sampels: %d' % self.num_evaluating_samples)

        self.training_samples_per_rank = self.num_training_samples

    def get_train_input_fn(self):
        """Return the training dataset built by ``make_dataset``."""
        return make_dataset(self.args, self.training_samples_per_rank,
                            self.args.batch_size, training=True)

    def get_eval_input_fn(self):
        """Return the evaluation dataset built by ``make_dataset``."""
        return make_dataset(self.args, self.num_evaluating_samples,
                            self.args.batch_size, training=False)
|
||||
|
||||
|
||||
def get_num_records(filenames):
    """Estimate the total number of records in a list of TFRecord files.

    Only the first and the last file are read: every file except the last
    one is counted as holding as many records as the first file.

    Args:
        filenames: sequence of TFRecord file names.

    Returns:
        The estimated record count; 0 when ``filenames`` is empty.
    """
    # No matching files (e.g. a split that is not present) means no records;
    # indexing an empty list here used to raise IndexError.
    if not filenames:
        return 0

    def count_records(tf_record_filename):
        return sum(1 for _ in tf.python_io.tf_record_iterator(tf_record_filename))

    nfile = len(filenames)
    last_count = count_records(filenames[-1])
    if nfile == 1:
        # The first and last file are the same; avoid reading it twice.
        return last_count
    return count_records(filenames[0]) * (nfile - 1) + last_count
|
||||
|
||||
|
||||
def _parse_example_proto(example_serialized):
    """Parse one serialized Example into ``(encoded image, int32 label, bbox)``.

    The bounding boxes are returned with coordinates ordered
    ``(ymin, xmin, ymax, xmax)`` and a leading dimension of 1.
    """
    string_scalar = dict(dtype=tf.string, default_value='')
    feature_map = {
        'image/encoded': tf.FixedLenFeature([], **string_scalar),
        'image/class/label': tf.FixedLenFeature([], dtype=tf.int64, default_value=-1),
        'image/class/text': tf.FixedLenFeature([], **string_scalar),
    }
    # Sparse features in Example proto: all four bbox keys share one VarLenFeature.
    sparse_float32 = tf.VarLenFeature(dtype=tf.float32)
    for bbox_key in ('image/object/bbox/xmin',
                     'image/object/bbox/ymin',
                     'image/object/bbox/xmax',
                     'image/object/bbox/ymax'):
        feature_map[bbox_key] = sparse_float32

    parsed = tf.parse_single_example(example_serialized, feature_map)
    label = tf.cast(parsed['image/class/label'], dtype=tf.int32)

    def as_row(bbox_key):
        # Dense values of one coordinate with a new leading axis.
        return tf.expand_dims(parsed[bbox_key].values, 0)

    xmin = as_row('image/object/bbox/xmin')
    ymin = as_row('image/object/bbox/ymin')
    xmax = as_row('image/object/bbox/xmax')
    ymax = as_row('image/object/bbox/ymax')

    # Coordinates are stacked in (y, x) order, then reshaped to
    # [1, num_boxes, coords].
    stacked = tf.concat([ymin, xmin, ymax, xmax], 0)
    bbox = tf.transpose(tf.expand_dims(stacked, 0), [0, 2, 1])

    return parsed['image/encoded'], label, bbox
|
||||
|
||||
|
||||
# since the preprocessing is done here, we add args file
|
||||
def parse_record(raw_record, is_training):
    """Turn one raw record into ``(preprocessed image, label - 1)``."""
    encoded_image, class_label, boxes = _parse_example_proto(raw_record)

    processed_image = preprocessing.parse_and_preprocess_image_record(
        encoded_image, boxes, training=is_training)

    # label-1 for VGG16
    shifted_label = class_label - 1
    return processed_image, shifted_label
|
||||
|
||||
|
||||
def make_dataset(args, take_count, batch_size,
                 training=False, shard=False):
    """Build the batched ``tf.data`` pipeline for the train or validation split.

    Reads the ``RANK_SIZE`` and ``DEVICE_INDEX`` environment variables; the
    latter seeds the shuffles. ``shard`` is accepted but not used here.
    """
    shuffle_buffer_size = 10000
    num_readers = 10

    # Both variables must be set to integers; rank_size is parsed but not used below.
    rank_size = int(os.getenv('RANK_SIZE'))
    rank_id = int(os.getenv('DEVICE_INDEX'))

    split_name = 'train' if training else 'validation'
    shard_pattern = os.path.join(args.data_dir, '%s-*')
    shard_files = sorted(tf.gfile.Glob(shard_pattern % split_name))

    dataset = tf.data.Dataset.from_tensor_slices(shard_files)

    if training:
        dataset = dataset.shuffle(1000, seed=7 * (1 + rank_id))
    else:
        # Applied to the dataset of file names, before the files are read.
        dataset = dataset.take(take_count)

    dataset = dataset.interleave(tf.data.TFRecordDataset,
                                 cycle_length=num_readers, block_length=1)
    index_stream = tf.data.Dataset.range(sys.maxsize)
    dataset = tf.data.Dataset.zip((dataset, index_stream))

    if training:
        dataset = dataset.apply(
            tf.data.experimental.shuffle_and_repeat(shuffle_buffer_size,
                                                    seed=5 * (1 + rank_id)))

    dataset = dataset.map(lambda record, index: parse_record(record, training),
                          num_parallel_calls=14)

    return dataset.batch(batch_size, drop_remainder=True)
|
||||
|
||||
|
||||
+45
@@ -0,0 +1,45 @@
|
||||
import tensorflow as tf
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
|
||||
def linear_warmup_lr(current_step, warmup_steps, base_lr, init_lr):
    """Return the learning rate at ``current_step`` of a linear ramp from ``init_lr`` to ``base_lr``."""
    lr_inc = (float(base_lr) - float(init_lr)) / float(warmup_steps)
    return float(init_lr) + lr_inc * current_step


def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch, T_max, eta_min=0):
    """Build a per-step learning-rate table: linear warm-up, then cosine annealing.

    Args:
        lr: base learning rate reached at the end of the warm-up.
        steps_per_epoch: number of steps in one epoch.
        warmup_epochs: number of epochs spent ramping up from 0 to ``lr``.
        max_epoch: number of epochs covered by the table.
        T_max: period (in epochs) used in the cosine term.
        eta_min: lower bound of the cosine term.

    Returns:
        ``numpy`` float32 array with ``int(max_epoch * steps_per_epoch)`` entries.
    """
    base_lr = lr
    warmup_init_lr = 0
    total_steps = int(max_epoch * steps_per_epoch)
    warmup_steps = int(warmup_epochs * steps_per_epoch)

    lr_each_step = []
    for step in range(total_steps):
        if step < warmup_steps:
            # The warm-up helper was referenced but never defined in this
            # module, so any warmup_epochs > 0 used to raise NameError.
            step_lr = linear_warmup_lr(step + 1, warmup_steps, base_lr, warmup_init_lr)
        else:
            # The cosine term changes once per epoch, not once per step.
            last_epoch = step // steps_per_epoch
            step_lr = eta_min + (base_lr - eta_min) * (1. + math.cos(math.pi * last_epoch / T_max)) / 2
        lr_each_step.append(step_lr)

    return np.array(lr_each_step).astype(np.float32)
|
||||
|
||||
|
||||
class HyperParams:
    """Derive step counts from ``args`` and expose the cosine learning-rate schedule."""

    def __init__(self, args):
        self.args = args

        steps_per_epoch = args.num_training_samples // args.global_batch_size
        args.nsteps_per_epoch = steps_per_epoch

        # Total steps: epoch-based when max_epochs is set, otherwise max_train_steps.
        args.nstep = (steps_per_epoch * args.max_epochs
                      if args.max_epochs else args.max_train_steps)

        # No warm-up; the table spans T_max epochs.
        self.cos_lr = warmup_cosine_annealing_lr(
            args.lr, steps_per_epoch, 0, args.T_max, args.T_max, 0.0)

    def get_learning_rate(self):
        """Return a tensor named ``learning_rate`` that looks up the table at the global step."""
        step = tf.train.get_global_step()
        looked_up = tf.gather(tf.convert_to_tensor(self.cos_lr), step)
        return tf.identity(looked_up, 'learning_rate')
|
||||
|
||||
@@ -0,0 +1,22 @@
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class Layers:
    """Metric helpers for the estimator."""

    def get_accuracy(self, labels, predicted_classes, logits, args):
        """Return the ``val-top1acc`` / ``val-top5acc`` eval metric ops.

        When ``args.rank_size`` is not 1 the metric values are summed with an
        HCCL allreduce and divided by ``args.rank_size``.
        """
        top1_value, top1_update = tf.metrics.accuracy(
            labels=labels, predictions=predicted_classes)
        in_top5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
        top5_value, top5_update = tf.metrics.mean(in_top5)

        if args.rank_size != 1:
            from npu_bridge.hccl import hccl_ops
            top1_value = hccl_ops.allreduce(top1_value, "sum") / args.rank_size
            top5_value = hccl_ops.allreduce(top5_value, "sum") / args.rank_size

        return {
            'val-top1acc': (top1_value, top1_update),
            'val-top5acc': (top5_value, top5_update),
        }
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,88 @@
|
||||
from __future__ import print_function
|
||||
import tensorflow as tf
|
||||
import logging
|
||||
import numpy as np
|
||||
import time
|
||||
import sys,os
|
||||
|
||||
from benchmark_log import hwlog
|
||||
|
||||
class LogSessionRunHook(tf.train.SessionRunHook):
    """Session hook that times every run call and logs step, epoch, FPS, losses and LR."""

    def __init__(self, args, warmup_steps=5):
        self.global_batch_size = args.global_batch_size
        # Fall back to one epoch worth of steps when iterations_per_loop is not given.
        self.iterations_per_loop = (args.nsteps_per_epoch
                                    if args.iterations_per_loop is None
                                    else args.iterations_per_loop)
        self.warmup_steps = warmup_steps
        self.iter_times = []
        self.num_records = args.num_training_samples
        self.display_every = args.display_every
        self.logger = get_logger(args.log_name, args.log_dir)
        rank0log(self.logger, 'PY' + str(sys.version) + 'TF' + str(tf.__version__))

    def after_create_session(self, session, coord):
        """Log the column header and reset the interval counters."""
        rank0log(self.logger, 'Step Epoch Speed Loss FinLoss LR')
        self.elapsed_secs = 0.
        self.count = 0

    def before_run(self, run_context):
        """Start the timer and request the tensors that ``after_run`` logs."""
        self.t0 = time.time()
        wanted = [tf.train.get_global_step(), 'loss:0', 'total_loss:0', 'learning_rate:0']
        return tf.train.SessionRunArgs(fetches=wanted)

    def after_run(self, run_context, run_values):
        """Record the run time; log on step 1 and on every ``display_every``-th step."""
        batch_time = time.time() - self.t0
        self.iter_times.append(batch_time)
        self.elapsed_secs += batch_time
        self.count += 1

        global_step, loss, total_loss, lr = run_values.results
        if not (global_step == 1 or global_step % self.display_every == 0):
            return

        # Average time per run call since the previous log line.
        dt = self.elapsed_secs / self.count
        img_per_sec = self.global_batch_size * self.iterations_per_loop / dt
        epoch = global_step * self.global_batch_size / self.num_records
        self.logger.info('step:%6i epoch:%5.1f FPS:%7.1f loss:%6.3f total_loss:%6.3f lr:%7.5f' %
                         (global_step, epoch, img_per_sec, loss, total_loss, lr))
        self.elapsed_secs = 0.
        self.count = 0

        hwlog.remark_print(key=hwlog.FPS, value='%7.1f' % img_per_sec)

    def get_average_speed(self):
        """Return ``global_batch_size`` divided by the mean run time after the warm-up runs."""
        timed_runs = self.iter_times[self.warmup_steps:]
        return self.global_batch_size / np.mean(timed_runs)
|
||||
|
||||
|
||||
|
||||
def rank0log(logger, *args, **kwargs):
    """Log the concatenated ``args`` through ``logger``; print them when no logger is given."""
    if not logger:
        print(*args, **kwargs)
        return
    logger.info(''.join(map(str, args)))
|
||||
|
||||
|
||||
def get_logger(log_name, log_dir):
    """Return the logger ``log_name`` that writes to the console and to ``log_dir/log_name``.

    ``log_dir`` is created when missing. Calling the function again with the
    same arguments returns the same logger without attaching further
    handlers, so messages are not emitted more than once.

    Args:
        log_name: logger name, also used as the log file name.
        log_dir: directory that holds the log file.

    Returns:
        The configured ``logging.Logger`` (level INFO).
    """
    logger = logging.getLogger(log_name)
    logger.setLevel(logging.INFO)  # INFO, ERROR

    # exist_ok covers several ranks creating a common directory (e.g. on NFS).
    os.makedirs(log_dir, exist_ok=True)

    formatter = logging.Formatter('%(message)s')
    log_path = os.path.abspath(os.path.join(log_dir, log_name))

    # logging.getLogger returns the same object for the same name, so only
    # attach the handlers that are not already present on it.
    has_console = any(type(h) is logging.StreamHandler for h in logger.handlers)
    has_file = any(isinstance(h, logging.FileHandler) and h.baseFilename == log_path
                   for h in logger.handlers)

    if not has_console:
        # console handler
        ch = logging.StreamHandler()
        ch.setLevel(logging.INFO)
        ch.setFormatter(formatter)
        logger.addHandler(ch)

    if not has_file:
        # file handler
        fh = logging.FileHandler(log_path)
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(formatter)
        logger.addHandler(fh)

    return logger
|
||||
|
||||
@@ -0,0 +1,71 @@
|
||||
import tensorflow as tf
|
||||
import vgg
|
||||
|
||||
|
||||
class Model(object):
    """Estimator model function for VGG16 training and evaluation."""

    def __init__(self, args, data, hyper_param, layers, logger):
        """Store the collaborators used by ``get_estimator_model_func``."""
        self.args = args
        self.data = data
        self.hyper_param = hyper_param
        self.layers = layers
        self.logger = logger

    def get_estimator_model_func(self, features, labels, mode, params=None):
        """Build the graph for ``mode`` and return a ``tf.estimator.EstimatorSpec``.

        In EVAL mode the spec carries the cross-entropy loss and the metrics
        from ``self.layers.get_accuracy``; in TRAIN mode it carries the total
        loss (cross entropy plus weighted L2) and the training op. Any other
        mode fails the assertion below. ``params`` is not used.
        """
        labels = tf.reshape(labels, (-1,))

        inputs = features
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        inputs = tf.cast(inputs, self.args.dtype)

        top_layer = vgg.vgg_impl(inputs, is_training)

        logits = top_layer
        predicted_classes = tf.argmax(logits, axis=1, output_type=tf.int32)
        # The loss is computed on float32 logits regardless of args.dtype.
        logits = tf.cast(logits, tf.float32)

        labels_one_hot = tf.one_hot(labels, depth=1000)
        loss = tf.losses.softmax_cross_entropy(
            logits=logits, onehot_labels=labels_one_hot, label_smoothing=self.args.label_smoothing)

        # Named 'loss' so the tensor can be fetched by name as 'loss:0'.
        base_loss = tf.identity(loss, name='loss')

        # L2 penalty over all trainable variables, scaled by weight_decay.
        l2_loss = tf.add_n([tf.nn.l2_loss(tf.cast(v, tf.float32)) for v in tf.trainable_variables()])
        l2_loss = tf.multiply(l2_loss, self.args.weight_decay)
        total_loss = base_loss + l2_loss

        # Named 'total_loss' so the tensor can be fetched by name as 'total_loss:0'.
        total_loss = tf.identity(total_loss, name = 'total_loss')

        if mode == tf.estimator.ModeKeys.EVAL:
            with tf.device(None):
                metrics = self.layers.get_accuracy( labels, predicted_classes, logits, self.args)

            return tf.estimator.EstimatorSpec(
                mode, loss=loss, eval_metric_ops=metrics)

        assert (mode == tf.estimator.ModeKeys.TRAIN)

        # NOTE(review): batch_size is not used below -- confirm it can be dropped.
        batch_size = tf.shape(inputs)[0]

        global_step = tf.train.get_global_step()
        learning_rate = self.hyper_param.get_learning_rate()

        momentum = self.args.momentum

        opt = tf.train.MomentumOptimizer(
            learning_rate, momentum, use_nesterov=self.args.use_nesterov)

        # Imported here rather than at module level, so npu_bridge is only
        # needed when the training graph is built.
        from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
        opt = NPUDistributedOptimizer(opt)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) or []

        # Ops in the UPDATE_OPS collection run before the gradient step.
        with tf.control_dependencies(update_ops):
            gate_gradients = tf.train.Optimizer.GATE_NONE
            grads_and_vars = opt.compute_gradients(total_loss, gate_gradients=gate_gradients)
            train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)

        train_op = tf.group(train_op)

        return tf.estimator.EstimatorSpec(mode, loss=total_loss, train_op=train_op)
|
||||
|
||||
+73
@@ -0,0 +1,73 @@
|
||||
import tensorflow as tf
|
||||
from tensorflow.contrib.image.python.ops import distort_image_ops
|
||||
import math
|
||||
import random
|
||||
|
||||
|
||||
def decode_jpeg(imgdata, channels=3):
    """Decode JPEG bytes into an image tensor using the fast integer DCT.

    Args:
        imgdata: encoded JPEG contents.
        channels: number of colour channels to decode (default 3).

    Returns:
        The tensor produced by `tf.image.decode_jpeg`.
    """
    decode_options = dict(
        channels=channels,
        fancy_upscaling=False,
        dct_method='INTEGER_FAST',
    )
    return tf.image.decode_jpeg(imgdata, **decode_options)
|
||||
|
||||
|
||||
def random_horizontal_flip(image, prob):
    """Flip `image` left-right with probability `prob`, decided per evaluation.

    The random draw is a graph op, so a new decision is made every time the
    returned tensor is evaluated. Drawing with Python's `random` module would
    fix a single decision at graph-construction time and apply it to every
    image that flows through the graph.

    Args:
        image: image tensor accepted by `tf.image.flip_left_right`.
        prob: probability of flipping.

    Returns:
        Either the flipped image or the unchanged input.
    """
    should_flip = tf.less(tf.random.uniform([], minval=0.0, maxval=1.0), prob)
    return tf.cond(
        should_flip,
        lambda: tf.image.flip_left_right(image),
        lambda: image)
|
||||
|
||||
|
||||
def decode_crop_and_resize(record, bbox, size, scale, ratio):
    """Sample a random crop window, decode only that window, and resize it.

    Args:
        record: encoded JPEG contents.
        bbox: bounding boxes passed to `tf.image.sample_distorted_bounding_box`.
        size: output edge length; the result is resized to `[size, size]`.
        scale: `area_range` for the sampled crop (fraction of the image area).
        ratio: `aspect_ratio_range` for the sampled crop.

    Returns:
        The cropped and resized image tensor.
    """
    with tf.name_scope('decode_crop_and_resize'):
        # Honour the requested output size instead of a hard-coded 224.
        height = size
        width = size

        # The shape is read from the JPEG header once and reused for sampling.
        jpeg_shape = tf.image.extract_jpeg_shape(record)

        bbox_begin, bbox_size, _ = tf.image.sample_distorted_bounding_box(
            jpeg_shape,
            bounding_boxes=bbox,
            min_object_covered=0.1,
            aspect_ratio_range=ratio,
            area_range=scale,
            max_attempts=10,
            use_image_if_no_bounding_boxes=True)

        # Reassemble the bounding box in the format the crop op requires.
        offset_y, offset_x, _ = tf.unstack(bbox_begin)
        target_height, target_width, _ = tf.unstack(bbox_size)
        crop_window = tf.stack([offset_y, offset_x, target_height, target_width])

        image = tf.image.decode_and_crop_jpeg(record, crop_window, channels=3)
        image = tf.image.resize_images(image, [height, width])

        return image
|
||||
|
||||
|
||||
def parse_and_preprocess_image_record(record, bbox, training):
    """Turn an encoded JPEG record into a normalized 224x224 image tensor.

    Training: random crop-and-resize, random horizontal flip, normalize.
    Otherwise: full decode, resize to 256x256, central crop, normalize.

    Args:
        record: encoded JPEG contents.
        bbox: bounding boxes used only by the training crop.
        training: truthy to select the training pipeline.

    Returns:
        The preprocessed image tensor.
    """
    with tf.name_scope('preprocess'):
        if not training:
            decoded = decode_jpeg(record, channels=3)
            resized = tf.image.resize_images(decoded, [256, 256])
            # Keep the central 224/256 fraction of the resized image.
            cropped = tf.image.central_crop(resized, 224.0/256)
            return normalize(cropped)

        cropped = decode_crop_and_resize(record, bbox, 224, (0.08, 1.0), (0.75, 1.333))
        flipped = random_horizontal_flip(cropped, 0.5)
        return normalize(flipped)
|
||||
|
||||
|
||||
def normalize(inputs):
    """Subtract the per-channel mean and divide by the per-channel std.

    The constants are the fractions written below scaled by 255, and are
    expanded to two leading singleton dimensions so they broadcast against
    `inputs`.

    Args:
        inputs: image tensor whose last dimension has three channels.

    Returns:
        The normalized tensor.
    """
    mean_rgb = [0.485 * 255, 0.456 * 255, 0.406 * 255]
    std_rgb = [0.229 * 255, 0.224 * 255, 0.225 * 255]

    mean = tf.expand_dims(tf.expand_dims(mean_rgb, 0), 0)
    std = tf.expand_dims(tf.expand_dims(std_rgb, 0), 0)

    centered = inputs - mean
    return centered * (1.0 / std)
|
||||
|
||||
@@ -0,0 +1,143 @@
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
import os
|
||||
import sys
|
||||
import ast
|
||||
|
||||
sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), '../')))
|
||||
sys.path.append(os.path.realpath(os.path.join(os.path.dirname(__file__), '../config')))
|
||||
sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)),'../../../../utils'))
|
||||
sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)),'../../../../utils/atlasboost'))
|
||||
|
||||
|
||||
import data_loader as dl
|
||||
import model as ml
|
||||
import hyper_param as hp
|
||||
import layers as ly
|
||||
import logger as lg
|
||||
import trainer as tr
|
||||
import create_session as cs
|
||||
|
||||
print(os.getcwd())
|
||||
|
||||
import argparse
|
||||
|
||||
#import hwlog
|
||||
from benchmark_log import hwlog
|
||||
from benchmark_log.basic_utils import get_environment_info
|
||||
from benchmark_log.basic_utils import get_model_parameter
|
||||
|
||||
|
||||
def parse_args():
    """Parse the training script's command-line options.

    Returns:
        argparse.Namespace holding every option defined below.

    Raises:
        ValueError: if any unrecognised argument is present; each one is
            printed before the exception is raised.
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--rank_size', default=1, type=int,
                        help="""number of NPUs to use.""")

    # mode and parameters related
    parser.add_argument('--mode', default='train_and_evaluate',
                        help="""mode to run the program e.g. train, evaluate, and
                        train_and_evaluate""")
    parser.add_argument('--max_train_steps', default=100, type=int,
                        help="""max steps to train""")
    parser.add_argument('--iterations_per_loop', default=10, type=int,
                        help="""the number of steps in devices for each iteration""")
    parser.add_argument('--max_epochs', default=None, type=int,
                        help="""total epochs for training""")
    parser.add_argument('--epochs_between_evals', default=5, type=int,
                        help="""the interval between train and evaluation , only meaningful
                        when the mode is train_and_evaluate""")

    # dataset
    parser.add_argument('--data_dir', default='path/data',
                        help="""directory of dataset.""")

    # path for evaluation
    parser.add_argument('--eval_dir', default='path/eval',
                        help="""directory to evaluate.""")

    parser.add_argument('--dtype', default=tf.float32,
                        help="""data type of inputs.""")
    parser.add_argument('--use_nesterov', default=True, type=ast.literal_eval,
                        help="""whether to use Nesterov in optimizer""")
    parser.add_argument('--label_smoothing', default=0.1, type=float,
                        help="""label smoothing factor""")
    # type=float so a value given on the command line is numeric, not a str.
    parser.add_argument('--weight_decay', default=0.0001, type=float,
                        help="""weight decay for regularization""")
    parser.add_argument('--batch_size', default=32, type=int,
                        help="""batch size for one NPU""")

    # learning rate and momentum
    parser.add_argument('--lr', default=0.01, type=float,
                        help="""initial learning rate""")
    parser.add_argument('--T_max', default=150, type=int,
                        help="""T_max for cosing_annealing learning rate""")
    parser.add_argument('--momentum', default=0.9, type=float,
                        help="""momentum used in optimizer.""")

    # display frequency
    parser.add_argument('--display_every', default=1, type=int,
                        help="""the frequency to display info""")

    # log file
    parser.add_argument('--log_name', default='vgg16.log',
                        help="""name of log file""")
    parser.add_argument('--log_dir', default='./model_1p',
                        help="""log directory""")

    args, unknown_args = parser.parse_known_args()
    if unknown_args:
        # Report every offending argument before failing.
        for bad_arg in unknown_args:
            print("ERROR: Unknown command line arg: %s" % bad_arg)
        raise ValueError("Invalid command line arg(s)")

    return args
|
||||
|
||||
|
||||
def main():
    """Wire up the training components and run the mode chosen on the command line.

    Raises:
        ValueError: if `args.mode` is not 'train', 'evaluate' or
            'train_and_evaluate'.
    """
    args = parse_args()
    # Total batch across all devices = per-device batch times the number of devices.
    args.global_batch_size = args.batch_size * args.rank_size

    session = cs.CreateSession()
    data = dl.DataLoader(args)
    hyper_param = hp.HyperParams(args)
    layers = ly.Layers()
    logger = lg.LogSessionRunHook(args)
    model = ml.Model(args, data, hyper_param, layers, logger)

    trainer = tr.Trainer(session, args, data, model, logger)

    # Map each supported mode name to the trainer method that implements it.
    actions = {
        'train': trainer.train,
        'evaluate': trainer.evaluate,
        'train_and_evaluate': trainer.train_and_evaluate,
    }
    run = actions.get(args.mode)
    if run is None:
        raise ValueError("Invalid mode.")
    run()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: record environment and run metadata through hwlog,
    # then start the run.
    hwlog.ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

    cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("tensorflow")
    config_info = get_model_parameter("tensorflow_config")

    # Fixed values reported to the benchmark log (they are not read from the
    # parsed command-line arguments).
    initinal_data = {
        "base_lr": 0.01,
        "dataset": "imagenet1024",
        "optimizer": "SGD",
        "loss_scale": 512,
        "batchsize": 32,
    }

    remarks = (
        (hwlog.CPU_INFO, cpu_info),
        (hwlog.NPU_INFO, npu_info),
        (hwlog.OS_INFO, os_info),
        (hwlog.FRAMEWORK_INFO, framework_info),
        (hwlog.BENCHMARK_VERSION, benchmark_version),
        (hwlog.CONFIG_INFO, config_info),
        (hwlog.BASE_LR, initinal_data.get("base_lr")),
        (hwlog.DATASET, initinal_data.get("dataset")),
        (hwlog.OPT_NAME, initinal_data.get("optimizer")),
        (hwlog.LOSS_SCALE, initinal_data.get("loss_scale")),
        (hwlog.INPUT_BATCH_SIZE, initinal_data.get("batchsize")),
    )
    for remark_key, remark_value in remarks:
        hwlog.remark_print(key=remark_key, value=remark_value)

    tf.logging.set_verbosity(tf.logging.INFO)
    main()
|
||||
|
||||
+21
@@ -0,0 +1,21 @@
|
||||
import tensorflow as tf
|
||||
from tensorflow.python.ops import data_flow_ops
|
||||
import re
|
||||
import os
|
||||
from operator import itemgetter
|
||||
|
||||
|
||||
def sort_and_load_ckpts(log_dir):
    """List the checkpoints found in `log_dir`, ordered by training step.

    A checkpoint is recognised by a file named exactly
    `model.ckpt-<step>.index`. The dots are matched literally and the whole
    file name must match, so names such as `model.ckpt-7.index.bak` are
    ignored instead of yielding a wrong checkpoint path.

    Args:
        log_dir: directory to scan.

    Returns:
        A list of dicts sorted by ascending 'step', each with:
            'step':  the integer step parsed from the file name,
            'path':  the index file path without its `.index` extension,
            'mtime': modification time of the index file (from `os.stat`).
        The list is empty when no checkpoint index file is present.
    """
    index_pattern = re.compile(r'model\.ckpt-([0-9]+)\.index\Z')

    ckpts = []
    for fname in os.listdir(log_dir):
        m = index_pattern.match(fname)
        if m is None:
            continue
        fullpath = os.path.join(log_dir, fname)
        ckpts.append({
            'step': int(m.group(1)),
            'path': os.path.splitext(fullpath)[0],
            'mtime': os.stat(fullpath).st_mtime,
        })
    ckpts.sort(key=itemgetter('step'))
    return ckpts
|
||||
|
||||
@@ -0,0 +1,142 @@
|
||||
import tensorflow as tf
|
||||
import math
|
||||
import time
|
||||
import os
|
||||
import train_helper
|
||||
from logger import rank0log
|
||||
|
||||
import sys
|
||||
sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)),'../../../../utils'))
|
||||
from benchmark_log import hwlog
|
||||
|
||||
|
||||
class Trainer(object):
    """Drive training and evaluation of a model through an NPUEstimator."""

    # Kept byte-for-byte: header and row format used for the result table.
    _RESULT_HEADER = ' step epoch top1 top5 loss checkpoint_time(UTC)'
    _RESULT_ROW = '{:5d} {:5.1f} {:5.3f} {:6.2f} {:6.2f} {time}'

    def __init__(self, session, args, data, model, logger):
        """Store collaborators and build the estimator.

        Args:
            session: object exposing `estimator_config` (the session config).
            args: run configuration namespace.
            data: object exposing `get_train_input_fn()` / `get_eval_input_fn()`.
            model: object exposing `get_estimator_model_func`.
            logger: session-run hook that also exposes a `.logger` attribute.
        """
        self.sess = session
        self.args = args
        self.data = data
        self.model = model
        self.logger = logger
        self.print_logger = self.logger.logger
        self.all_preds = []
        self.all_targets = []

        self.classifier, self.training_hook = self.get_npu_classifier()

    def get_npu_classifier(self):
        """Create the NPUEstimator and the list of training hooks.

        Returns:
            (classifier, training_hooks): the estimator and a list holding the
            logger hook.
        """
        # Imported lazily so the module can be imported without the NPU bridge installed.
        from npu_bridge.estimator.npu.npu_config import NPURunConfig
        from npu_bridge.estimator.npu.npu_estimator import NPUEstimator

        run_config = NPURunConfig(
            hcom_parallel=True,
            precision_mode="allow_mix_precision",
            enable_data_pre_proc=True,
            save_checkpoints_steps=self.args.nsteps_per_epoch,
            session_config=self.sess.estimator_config,
            model_dir=self.args.log_dir,
            iterations_per_loop=self.args.iterations_per_loop,
            keep_checkpoint_max=5)

        classifier = NPUEstimator(
            model_fn=self.model.get_estimator_model_func,
            config=run_config
        )

        training_hooks = []
        training_hooks.append(self.logger)

        return classifier, training_hooks

    def train(self):
        """Train until the global step reaches `args.nstep`."""
        hwlog.remark_print(key=hwlog.CURRENT_EPOCH, value=self.args.max_epochs)

        print ('training steps: %d' % self.args.nstep)
        self.classifier.train(input_fn=lambda: self.data.get_train_input_fn(),
                              max_steps=self.args.nstep,
                              hooks=self.training_hook
                              )

    def _epoch_of_step(self, step):
        """Return the epoch that global step `step` belongs to.

        One step consumes `batch_size * rank_size` samples, so the number of
        steps per epoch is `num_training_samples` divided by that product.
        """
        global_batch = self.args.batch_size * self.args.rank_size
        return math.ceil(step / (self.args.num_training_samples / global_batch))

    def _evaluate_checkpoint(self, ckpt):
        """Evaluate one checkpoint dict in place, adding epoch/top1/top5/loss keys."""
        eval_result = self.classifier.evaluate(
            input_fn=lambda: self.data.get_eval_input_fn(),
            checkpoint_path=ckpt['path'])

        hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1, value=float(eval_result.get("val-top1acc")))
        hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP5, value=float(eval_result.get("val-top5acc")))

        ckpt['epoch'] = self._epoch_of_step(ckpt['step'])
        ckpt['top1'] = eval_result['val-top1acc']
        ckpt['top5'] = eval_result['val-top5acc']
        ckpt['loss'] = eval_result['loss']

    def _log_result_row(self, ckpt):
        """Log one evaluated checkpoint as a row of the result table."""
        # gmtime so the timestamp matches the "(UTC)" label in the header.
        stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(ckpt['mtime']))
        rank0log(self.print_logger,
                 self._RESULT_ROW.format(ckpt['step'],
                                         ckpt['epoch'],
                                         ckpt['top1'] * 100,
                                         ckpt['top5'] * 100,
                                         ckpt['loss'],
                                         time=stamp))

    def evaluate(self):
        """Evaluate every checkpoint found in `args.eval_dir` and log a summary table.

        A KeyboardInterrupt is caught and logged instead of propagating.
        """
        rank0log(self.print_logger, "Evaluating")
        rank0log(self.print_logger, "Validation dataset size: {}".format(self.args.num_evaluating_samples))
        time.sleep(5)  # a little extra margin...
        try:
            ckpts = train_helper.sort_and_load_ckpts(self.args.eval_dir)
            print("=========ckpt==========")
            print(ckpts)
            print("=========ckpt==========")
            for ckpt in ckpts:
                self._evaluate_checkpoint(ckpt)

            rank0log(self.print_logger, self._RESULT_HEADER)
            for ckpt in ckpts:
                if 'top1' not in ckpt:
                    continue
                self._log_result_row(ckpt)
            rank0log(self.print_logger, "Finished evaluation")
        except KeyboardInterrupt:
            self.print_logger.error("Keyboard interrupt")

    def train_and_evaluate(self):
        """Alternate training cycles with evaluation of the newest checkpoint.

        Runs `max_epochs // epochs_between_evals` cycles; each cycle trains for
        `nsteps_per_epoch * epochs_between_evals` steps and then evaluates the
        highest-step checkpoint in `args.log_dir`.
        """
        epochs_between_evals = self.args.epochs_between_evals

        for _ in range(self.args.max_epochs // epochs_between_evals):
            rank0log(self.print_logger, "Starting a training cycle")

            hwlog.remark_print(key=hwlog.CURRENT_EPOCH, value=self.args.max_epochs)

            self.classifier.train(input_fn=lambda: self.data.get_train_input_fn(),
                                  steps=self.args.nsteps_per_epoch*epochs_between_evals,
                                  hooks=self.training_hook)

            rank0log(self.print_logger, "Starting to evaluate")
            rank0log(self.print_logger, "Validation dataset size: {}".format(self.args.num_evaluating_samples))
            time.sleep(5)  # a little extra margin...

            ckpts = train_helper.sort_and_load_ckpts(self.args.log_dir)
            # The list is sorted by step, so the last entry is the newest checkpoint.
            newest = ckpts[-1]
            self._evaluate_checkpoint(newest)

            rank0log(self.print_logger, self._RESULT_HEADER)
            self._log_result_row(newest)
|
||||
|
||||
@@ -0,0 +1,63 @@
|
||||
import tensorflow as tf
|
||||
|
||||
from npu_bridge.estimator import npu_ops
|
||||
|
||||
# vgg with initialization method in gluoncv
|
||||
def vgg_impl(inputs, is_training=True):
    """Build the VGG network and return the 1000-way logits.

    Five stages of 3x3 ReLU convolutions (64x2, 128x2, 256x3, 512x3, 512x3),
    each followed by a 2x2 stride-2 max-pool, then three dense layers
    (4096, 4096, 1000). Dropout is applied after the first two dense layers
    only when `is_training` is true.

    Args:
        inputs: input image batch. The flatten step reshapes to 7 * 7 * 512
            features per example, so the spatial size after five poolings
            must be 7x7.
        is_training: whether to insert the dropout ops.

    Returns:
        Logits tensor produced by the final dense layer (no activation).
    """

    def conv3x3_relu(tensor, filters):
        # 3x3 'SAME' convolution with ReLU and fan-out variance-scaling init.
        return tf.layers.conv2d(
            tensor, filters, kernel_size=(3, 3), activation=tf.nn.relu,
            padding='SAME', use_bias=True,
            kernel_initializer=tf.initializers.variance_scaling(scale=2.0, mode='fan_out'))

    def fully_connected(tensor, units, activation):
        # Dense layer with normally-distributed initial weights (stddev 0.01).
        return tf.layers.dense(
            tensor, units, activation=activation, use_bias=True,
            kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01))

    # (filters, number of conv layers) for conv1..conv5.
    stages = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))

    x = inputs
    for filters, depth in stages:
        for _ in range(depth):
            x = conv3x3_relu(x, filters)
        x = tf.layers.max_pooling2d(x, (2, 2), (2, 2), padding='SAME')

    x = tf.reshape(x, [-1, 7 * 7 * 512])

    # fc6 + drop6
    x = fully_connected(x, 4096, tf.nn.relu)
    if is_training:
        x = npu_ops.dropout(x, 0.5)

    # fc7 + drop7
    x = fully_connected(x, 4096, tf.nn.relu)
    if is_training:
        x = npu_ops.dropout(x, 0.5)

    # fc8
    return fully_connected(x, 1000, None)
|
||||
|
||||
Reference in New Issue
Block a user