[add]上传训练benchmark by z00560161

This commit is contained in:
liang_chaoming@huawei.com
2020-10-19 20:22:23 +08:00
parent 22b83024f5
commit 82522e2f61
1225 changed files with 345421 additions and 0 deletions
@@ -0,0 +1,6 @@
{
"server_count": "1",
"server_list": [{"device":[{devices}],"server_id":"127.0.0.1"}],
"status": "completed",
"version": "1.0"
}
@@ -0,0 +1,18 @@
#!/bin/sh
currentDir=$(cd "$(dirname "$0")"; pwd)
cd ${currentDir}
device_group=$@
device_num=$#
touch ${currentDir}/main.log
for device_phy_id in ${device_group}
do
echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] start: train.sh ${device_phy_id} & " >> ${currentDir}/main.log
${currentDir}/train.sh ${device_phy_id} &
done
wait
echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] all train.sh exit " >> ${currentDir}/main.log
@@ -0,0 +1,41 @@
# main env
if [ -d /usr/local/Ascend/nnae/latest ];then
export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/Ascend/driver/tools/hccn_tool/:/usr/local/mpirun4.0/lib
export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages
export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp
else
export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/mpirun4.0/lib
export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest//fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$projectDir
export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
fi
export SOC_VERSION=Ascend910
export HCCL_CONNECT_TIMEOUT=600
# user env
export JOB_ID={JOB_ID}
export RANK_TABLE_FILE={RANK_TABLE_FILE}
#export RANK_SIZE={RANK_SIZE}
#export RANK_INDEX={RANK_INDEX}
#export RANK_ID={RANK_ID}
# profiling env
export PROFILING_MODE={PROFILING_MODE}
export AICPU_PROFILING_MODE={AICPU_PROFILING_MODE}
export PROFILING_OPTIONS={PROFILING_OPTIONS}
export FP_POINT={FP_POINT}
export BP_POINT={BP_POINT}
# debug env
#export DUMP_GE_GRAPH=2
#export DUMP_OP=1
#export DUMP_OP_LESS=1
#export PRINT_MODEL=1
#export TE_PARALLEL_COMPILER=0
# system env
ulimit -c unlimited
@@ -0,0 +1,33 @@
#!/bin/sh
currentDir=$(cd "$(dirname "$0")"; pwd)
cd ${currentDir}
PWD=${currentDir}
device_id=$1
if [ x"${device_id}" = x ] ;
then
echo "turing train fail" >> ${currentDir}/train_${device_id}.log
exit
else
export DEVICE_ID=${device_id}
fi
DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 ))
export DEVICE_INDEX=${DEVICE_INDEX}
env > ${currentDir}/env_${device_id}.log
#mkdir exec path
mkdir -p ${currentDir}/${device_id}
rm -rf ${currentDir}/${device_id}/*
cd ${currentDir}/${device_id}
#start exec
python3.7 {RUN_ALGORITHM_CMD} {CHECKPOINT_DIR} > ${currentDir}/train_${device_id}.log 2>&1
if [ $? -eq 0 ] ;
then
echo "turing train success" >> ${currentDir}/train_${device_id}.log
else
echo "turing train fail" >> ${currentDir}/train_${device_id}.log
fi