[add]上传训练benchmark by z00560161
This commit is contained in:
+6
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"server_count": "1",
|
||||
"server_list": [{"device":[{devices}],"server_id":"127.0.0.1"}],
|
||||
"status": "completed",
|
||||
"version": "1.0"
|
||||
}
|
||||
+18
@@ -0,0 +1,18 @@
|
||||
#!/bin/sh
# Launch one train.sh worker per device ID, run them all in parallel, and
# wait until every worker has exited.
#
# Usage:   <this script> DEVICE_ID [DEVICE_ID ...]
# Outputs: appends start/exit progress lines to main.log next to this script.
# Returns: 0 once all workers have exited (worker exit codes are not
#          inspected); 1 if the script directory cannot be resolved.

# Resolve the directory holding this script so train.sh and main.log are
# located independently of the caller's working directory. '&&' keeps a
# failed cd from silently yielding the caller's directory instead.
currentDir=$(cd "$(dirname "$0")" && pwd) || exit 1
cd "${currentDir}" || exit 1

main_log="${currentDir}/main.log"

# Device IDs are deliberately re-split on whitespace, so both
# `script 0 1 2` and `script "0 1 2"` start three workers.
device_group="$*"

touch "${main_log}"

# shellcheck disable=SC2086 # word splitting of the device list is intended
for device_phy_id in ${device_group}; do
  echo "[$(date +%Y%m%d-%H:%M:%S)] [INFO] start: train.sh ${device_phy_id} & " >> "${main_log}"
  "${currentDir}/train.sh" "${device_phy_id}" &
done

# Barrier: block until every background worker has finished.
wait

echo "[$(date +%Y%m%d-%H:%M:%S)] [INFO] all train.sh exit " >> "${main_log}"
|
||||
+41
@@ -0,0 +1,41 @@
|
||||
# main env
# Select the Ascend software stack: use the NNAE runtime when its install
# directory exists, otherwise fall back to ascend-toolkit.
#
# PYTHONPATH is extended with ${VAR:+...} so that an unset/empty PYTHONPATH
# (or projectDir) does not leave an empty path entry behind; Python may
# treat an empty PYTHONPATH entry as the current directory.
# NOTE(review): LD_LIBRARY_PATH is replaced, not extended - confirm that
# dropping any pre-existing value is intended.
if [ -d /usr/local/Ascend/nnae/latest ]; then
  export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/Ascend/driver/tools/hccn_tool/:/usr/local/mpirun4.0/lib
  export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/"
  export PATH="${PATH}:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin"
  export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp
else
  export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/mpirun4.0/lib
  # projectDir is not set in this file; it is appended only when the
  # sourcing environment provides it.
  export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages${projectDir:+:${projectDir}}"
  export PATH="${PATH}:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin"
  export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
fi
export SOC_VERSION=Ascend910
export HCCL_CONNECT_TIMEOUT=600

# user env
# The {NAME} tokens below are literal placeholders; presumably the benchmark
# tooling substitutes real values before this file is used - TODO confirm.
export JOB_ID={JOB_ID}
export RANK_TABLE_FILE={RANK_TABLE_FILE}
#export RANK_SIZE={RANK_SIZE}
#export RANK_INDEX={RANK_INDEX}
#export RANK_ID={RANK_ID}

# profiling env
export PROFILING_MODE={PROFILING_MODE}
export AICPU_PROFILING_MODE={AICPU_PROFILING_MODE}
export PROFILING_OPTIONS={PROFILING_OPTIONS}
export FP_POINT={FP_POINT}
export BP_POINT={BP_POINT}

# debug env
#export DUMP_GE_GRAPH=2
#export DUMP_OP=1
#export DUMP_OP_LESS=1
#export PRINT_MODEL=1
#export TE_PARALLEL_COMPILER=0

# system env
# Remove the core-dump size limit for processes started from this shell.
ulimit -c unlimited
|
||||
+33
@@ -0,0 +1,33 @@
|
||||
#!/bin/sh
# Run one training process bound to a single device.
#
# Usage:   train.sh DEVICE_ID
# Outputs: env_<id>.log   - environment snapshot taken before the run
#          train_<id>.log - python stdout/stderr followed by a final
#                           "turing train success" / "turing train fail" line
#          <id>/          - per-device working directory, emptied on each run
# Returns: the python exit status; 1 when DEVICE_ID is missing or the working
#          directory cannot be prepared.

# Resolve the directory holding this script so logs and the work dir are
# placed next to it regardless of the caller's working directory.
currentDir=$(cd "$(dirname "$0")" && pwd) || exit 1
cd "${currentDir}" || exit 1

device_id=$1
train_log="${currentDir}/train_${device_id}.log"

if [ -z "${device_id}" ]; then
  # No device given: record the failure and report it through the exit code.
  echo "turing train fail" >> "${train_log}"
  exit 1
fi
export DEVICE_ID="${device_id}"

# RANK_INDEX defaults to 0 when the environment does not provide it.
# NOTE(review): the factor 8 looks like devices-per-server - confirm.
DEVICE_INDEX=$(( DEVICE_ID + ${RANK_INDEX:-0} * 8 ))
export DEVICE_INDEX

env > "${currentDir}/env_${device_id}.log"

#mkdir exec path
work_dir="${currentDir}/${device_id}"
if ! mkdir -p "${work_dir}"; then
  echo "turing train fail" >> "${train_log}"
  exit 1
fi
# ':?' aborts instead of expanding to "/*" should work_dir ever be empty.
rm -rf "${work_dir:?}"/*
if ! cd "${work_dir}"; then
  # Never start training from the wrong directory.
  echo "turing train fail" >> "${train_log}"
  exit 1
fi

#start exec
# {RUN_ALGORITHM_CMD} and {CHECKPOINT_DIR} are literal placeholders;
# presumably substituted by the benchmark tooling before use - TODO confirm.
python3.7 {RUN_ALGORITHM_CMD} {CHECKPOINT_DIR} > "${train_log}" 2>&1
train_status=$?

if [ "${train_status}" -eq 0 ]; then
  echo "turing train success" >> "${train_log}"
else
  echo "turing train fail" >> "${train_log}"
fi

# Propagate the training result so callers can detect failures.
exit "${train_status}"
|
||||
Reference in New Issue
Block a user