[add]上传训练benchmark by z00560161

This commit is contained in:
liang_chaoming@huawei.com
2020-10-19 20:22:23 +08:00
parent 22b83024f5
commit 82522e2f61
1225 changed files with 345421 additions and 0 deletions
@@ -0,0 +1,182 @@
{
"board_id" : "0x002F",
"chip_info" : "910",
"deploy_mode" : "lab",
"group_count" : "1",
"group_list" : [
{
"device_num" : "16",
"server_num" : "2",
"group_name" : "",
"instance_count" : "16",
"instance_list" : [
{
"devices" : [
{
"device_id" : "0",
"device_ip" : "192.168.104.101"
}
],
"rank_id" : "0",
"server_id" : "90.90.176.104"
},
{
"devices" : [
{
"device_id" : "1",
"device_ip" : "192.168.105.101"
}
],
"rank_id" : "1",
"server_id" : "90.90.176.104"
},
{
"devices" : [
{
"device_id" : "2",
"device_ip" : "192.168.106.101"
}
],
"rank_id" : "2",
"server_id" : "90.90.176.104"
},
{
"devices" : [
{
"device_id" : "3",
"device_ip" : "192.168.107.101"
}
],
"rank_id" : "3",
"server_id" : "90.90.176.104"
},
{
"devices" : [
{
"device_id" : "4",
"device_ip" : "192.168.108.100"
}
],
"rank_id" : "4",
"server_id" : "90.90.176.104"
},
{
"devices" : [
{
"device_id" : "5",
"device_ip" : "192.168.109.100"
}
],
"rank_id" : "5",
"server_id" : "90.90.176.104"
},
{
"devices" : [
{
"device_id" : "6",
"device_ip" : "192.168.110.100"
}
],
"rank_id" : "6",
"server_id" : "90.90.176.104"
},
{
"devices" : [
{
"device_id" : "7",
"device_ip" : "192.168.111.100"
}
],
"rank_id" : "7",
"server_id" : "90.90.176.104"
},
{
"devices" : [
{
"device_id" : "0",
"device_ip" : "192.168.100.101"
}
],
"rank_id" : "8",
"server_id" : "90.90.176.102"
},
{
"devices" : [
{
"device_id" : "1",
"device_ip" : "192.168.101.101"
}
],
"rank_id" : "9",
"server_id" : "90.90.176.102"
},
{
"devices" : [
{
"device_id" : "2",
"device_ip" : "192.168.102.101"
}
],
"rank_id" : "10",
"server_id" : "90.90.176.102"
},
{
"devices" : [
{
"device_id" : "3",
"device_ip" : "192.168.103.101"
}
],
"rank_id" : "11",
"server_id" : "90.90.176.102"
},
{
"devices" : [
{
"device_id" : "4",
"device_ip" : "192.168.100.100"
}
],
"rank_id" : "12",
"server_id" : "90.90.176.102"
},
{
"devices" : [
{
"device_id" : "5",
"device_ip" : "192.168.101.100"
}
],
"rank_id" : "13",
"server_id" : "90.90.176.102"
},
{
"devices" : [
{
"device_id" : "6",
"device_ip" : "192.168.102.100"
}
],
"rank_id" : "14",
"server_id" : "90.90.176.102"
},
{
"devices" : [
{
"device_id" : "7",
"device_ip" : "192.168.103.100"
}
],
"rank_id" : "15",
"server_id" : "90.90.176.102"
}
]
}
],
"para_plane_nic_location" : "device",
"para_plane_nic_name" : [
"eth0"
],
"para_plane_nic_num" : "1",
"status" : "completed"
}
@@ -0,0 +1,12 @@
{
"group_count": "1",
"group_list": [
{
"group_name": "worker",
"device_count": "1",
"instance_count": "1",
"instance_list": [{"devices":[{"device_id":"2","device_ip":"192.168.101.102"}],"pod_name":"npu1p","server_id":"127.0.0.1"}]
}
],
"status": "completed"
}
@@ -0,0 +1,12 @@
{
"group_count": "1",
"group_list": [
{
"group_name": "worker",
"device_count": "8",
"instance_count": "1",
"instance_list": [{"devices":[{"device_id":"0","device_ip":"192.168.190.102"},{"device_id":"1","device_ip":"192.168.191.102"},{"device_id":"2","device_ip":"192.168.192.102"},{"device_id":"3","device_ip":"192.168.193.102"},{"device_id":"4","device_ip":"192.168.190.103"},{"device_id":"5","device_ip":"192.168.191.103"},{"device_id":"6","device_ip":"192.168.192.103"},{"device_id":"7","device_ip":"192.168.193.103"}],"pod_name":"npu8p","server_id":"127.0.0.1"}]
}
],
"status": "completed"
}
@@ -0,0 +1,48 @@
#!/bin/sh
# Launcher entry point (template file).
#
# Usage: <this script> <device_id>...
#
# Resolves the directory this script lives in, makes it the working directory,
# remembers the device ids given on the command line and exports exec_type,
# which the rest of the script compares against "docker" to pick its mode.
# NOTE(review): the brace-wrapped value assigned to exec_type looks like a
# placeholder that is substituted before the script is run - confirm.

# Absolute path of the directory containing this script.
currentDir=$(cd "$(dirname "$0")"; pwd)
# Quoted so an install path containing whitespace still works; abort rather
# than continue from an unexpected working directory.
cd "${currentDir}" || exit 1
# All positional arguments joined by single spaces (one device id each).
DEVICE_LIST="$*"
export exec_type={MODE}
#######################################
# Termination hook of the launcher.
# In docker mode it runs the driver's post-stop script (the original note
# describes this as stopping slogd); in any other mode it does nothing.
# Globals:   exec_type (read)
# Arguments: none
# Returns:   0 outside docker mode, otherwise the status of the stop script
#######################################
prog_exit() {
  # Anything but docker mode: nothing to clean up.
  [ "${exec_type}" = docker ] || return 0
  bash /usr/local/Ascend/driver/tools/docker_stop_post_sys.sh
}
# Run prog_exit when the launcher is asked to terminate.
# The signal is spelled without the SIG prefix: that is the only form POSIX
# requires sh to accept, and some /bin/sh implementations (e.g. dash) reject
# "SIGTERM" as a bad trap.
trap prog_exit TERM

if [ "${exec_type}" = docker ]; then
  # Docker mode: everything runs in this shell's environment.

  # Load the training environment.
  . "${currentDir}/npu_set_env.sh"

  # Start slogd in the background; create its log directory first.
  mkdir -p /var/log/npu/slog/slogd
  /usr/local/Ascend/driver/tools/docker/slogd &

  # Start main.sh in the background.  DEVICE_LIST is intentionally unquoted so
  # that every device id becomes its own argument.
  "${currentDir}/main.sh" ${DEVICE_LIST} &

  # Poll every 5 seconds until the process table no longer lists train.sh.
  # A plain "wait" is not used here because slogd is a background job of this
  # shell as well.  Each round also appends a process snapshot to ps.log.
  flag=1
  while [ "${flag}" -ne 0 ]; do
    sleep 5
    # Count the ps lines mentioning train.sh, ignoring the grep helpers.
    flag=$(ps -ef | grep train.sh | grep -v -c grep)
    ps -ef >> "${currentDir}/ps.log"
    echo "" >> "${currentDir}/ps.log"
  done
else
  # Non-docker mode: run main.sh as user HwHiAiUser in a login shell.

  # Text after the first "=" on the line(s) of npu_set_env.sh that contain
  # "RANK_ID="; used to build the profiling directory below.
  RANK_ID=$(grep "RANK_ID=" "${currentDir}/npu_set_env.sh" | awk -F"=" '{print $2}')

  # Start main.sh; the login shell sources the environment file itself.
  su - HwHiAiUser -c ". ${currentDir}/npu_set_env.sh;export PROFILING_DIR=/var/log/npu/profiling/container/${RANK_ID};${currentDir}/main.sh ${DEVICE_LIST}" &
  wait
fi
@@ -0,0 +1,6 @@
{
"server_count": "1",
"server_list": [{"device":[{devices}],"server_id":"127.0.0.1"}],
"status": "completed",
"version": "1.0"
}
@@ -0,0 +1,18 @@
#!/bin/sh
# Fan-out helper: starts one background train.sh per device id given on the
# command line, waits for all of them to finish and journals the start of each
# one plus the final completion to main.log next to this script.
#
# Usage: <this script> <device_id>...

# Absolute path of the directory containing this script.
currentDir=$(cd "$(dirname "$0")"; pwd)
# Quoted so an install path containing whitespace still works.
cd "${currentDir}" || exit 1

main_log="${currentDir}/main.log"

# Append one timestamped INFO line to main.log.
# Arguments: $* - message text (written verbatim after the "[INFO] " tag)
log_info() {
  echo "[$(date +%Y%m%d-%H:%M:%S)] [INFO] $*" >> "${main_log}"
}

# All positional arguments joined by single spaces.
device_group="$*"
touch "${main_log}"

# device_group is expanded unquoted on purpose: it is split on whitespace so
# that a single argument such as "0 1 2" still yields one train.sh per id.
for device_phy_id in ${device_group}; do
  log_info "start: train.sh ${device_phy_id} & "
  "${currentDir}/train.sh" "${device_phy_id}" &
done

# Block until every background train.sh has exited.
wait
log_info "all train.sh exit "
@@ -0,0 +1,40 @@
# Environment definitions for a training job; meant to be sourced (there is no
# shebang, and the launcher script loads an npu_set_env.sh with ".").
# NOTE(review): the brace-wrapped values below look like template placeholders
# that are substituted before use - confirm with the tool that renders this file.
# NOTE(review): presumably this is the npu_set_env.sh that the launcher also
# scans as plain text for the rank id line, so keep the layout of the
# commented-out rank lines intact - confirm.

# main env
# Pick the Ascend software stack: the nnae install when its "latest" directory
# exists, otherwise the ascend-toolkit install.
if [ -d /usr/local/Ascend/nnae/latest ];then
# LD_LIBRARY_PATH is replaced outright; a value inherited from the caller is dropped.
export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/Ascend/driver/tools/hccn_tool/:/usr/local/mpirun4.0/lib
# PYTHONPATH and PATH are appended to the inherited values.
# NOTE(review): the tfplugin site-packages entry is listed twice here - presumably harmless; confirm.
export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages
export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp
else
# Same variables for the ascend-toolkit layout; LD_LIBRARY_PATH is again replaced outright.
export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/mpirun4.0/lib
# NOTE(review): projectDir is not assigned anywhere in this file; it has to come
# from the sourcing shell, otherwise the list ends with an empty entry - confirm.
export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest//fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$projectDir
export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
fi
# Settings shared by both stacks.
export SOC_VERSION=Ascend910
# NOTE(review): the unit of this timeout is not stated here - presumably seconds; confirm.
export HCCL_CONNECT_TIMEOUT=600
# user env
# Per-job values; the three rank variables are left commented out in this template.
export JOB_ID={JOB_ID}
export RANK_TABLE_FILE={RANK_TABLE_FILE}
#export RANK_SIZE={RANK_SIZE}
#export RANK_INDEX={RANK_INDEX}
#export RANK_ID={RANK_ID}
# profiling env
export PROFILING_MODE={PROFILING_MODE}
export AICPU_PROFILING_MODE={AICPU_PROFILING_MODE}
export PROFILING_OPTIONS={PROFILING_OPTIONS}
export FP_POINT={FP_POINT}
export BP_POINT={BP_POINT}
# debug env
# Optional debug switches, all disabled; remove the leading "#" to enable one.
#export DUMP_GE_GRAPH=2
#export DUMP_OP=1
#export DUMP_OP_LESS=1
#export PRINT_MODEL=1
#export TE_PARALLEL_COMPILER=0
# system env
# Lift the core-file size limit for this shell and the processes it starts.
ulimit -c unlimited
@@ -0,0 +1,33 @@
#!/bin/sh
# Per-device training runner (template file).
#
# Usage: <this script> <device_id>
#
# Exports DEVICE_ID and DEVICE_INDEX, dumps the environment to
# env_<device_id>.log, runs the python3.7 training command from a freshly
# emptied <device_id>/ work directory and writes its output, followed by a
# "turing train success" / "turing train fail" marker, to train_<device_id>.log.
# NOTE(review): the brace-wrapped tokens on the python3.7 line look like
# placeholders that are substituted before the script is run - confirm.
#
# Exit status: 0 when the training command succeeded; 1 when the device id is
# missing or a directory cannot be entered; otherwise the status of python3.7.
# (Previously the script exited 0 in every case, hiding failures from callers.)

# Absolute path of the directory containing this script.
currentDir=$(cd "$(dirname "$0")"; pwd)
# cd keeps PWD up to date itself, so no manual PWD assignment is needed.
cd "${currentDir}" || exit 1

device_id=$1
# With an empty device id this resolves to train_.log.
log_file="${currentDir}/train_${device_id}.log"

# A device id is mandatory.
if [ -z "${device_id}" ]; then
  echo "turing train fail" >> "${log_file}"
  exit 1
fi
export DEVICE_ID="${device_id}"

# RANK_INDEX is not set in this script; it is expected from the environment.
DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 ))
export DEVICE_INDEX

# Snapshot of the exported environment, for troubleshooting.
env > "${currentDir}/env_${device_id}.log"

# Prepare an empty per-device work directory and run from inside it.
work_dir="${currentDir}/${device_id}"
mkdir -p "${work_dir}"
# ":?" aborts instead of expanding to "/*" should the path ever be empty.
rm -rf "${work_dir:?}"/*
if ! cd "${work_dir}"; then
  echo "turing train fail" >> "${log_file}"
  exit 1
fi

# Start the training command; stdout and stderr replace any previous log.
python3.7 {RUN_ALGORITHM_CMD} {CHECKPOINT_DIR} > "${log_file}" 2>&1
rc=$?
if [ "${rc}" -eq 0 ]; then
  echo "turing train success" >> "${log_file}"
else
  echo "turing train fail" >> "${log_file}"
fi
# Propagate the training result to the caller.
exit "${rc}"