[add]上传训练benchmark by z00560161
This commit is contained in:
@@ -0,0 +1,182 @@
|
||||
{
|
||||
"board_id" : "0x002F",
|
||||
"chip_info" : "910",
|
||||
"deploy_mode" : "lab",
|
||||
"group_count" : "1",
|
||||
"group_list" : [
|
||||
{
|
||||
"device_num" : "16",
|
||||
"server_num" : "2",
|
||||
"group_name" : "",
|
||||
"instance_count" : "16",
|
||||
"instance_list" : [
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "0",
|
||||
"device_ip" : "192.168.104.101"
|
||||
}
|
||||
],
|
||||
"rank_id" : "0",
|
||||
"server_id" : "90.90.176.104"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "1",
|
||||
"device_ip" : "192.168.105.101"
|
||||
}
|
||||
],
|
||||
"rank_id" : "1",
|
||||
"server_id" : "90.90.176.104"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "2",
|
||||
"device_ip" : "192.168.106.101"
|
||||
}
|
||||
],
|
||||
"rank_id" : "2",
|
||||
"server_id" : "90.90.176.104"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "3",
|
||||
"device_ip" : "192.168.107.101"
|
||||
}
|
||||
],
|
||||
"rank_id" : "3",
|
||||
"server_id" : "90.90.176.104"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "4",
|
||||
"device_ip" : "192.168.108.100"
|
||||
}
|
||||
],
|
||||
"rank_id" : "4",
|
||||
"server_id" : "90.90.176.104"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "5",
|
||||
"device_ip" : "192.168.109.100"
|
||||
}
|
||||
],
|
||||
"rank_id" : "5",
|
||||
"server_id" : "90.90.176.104"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "6",
|
||||
"device_ip" : "192.168.110.100"
|
||||
}
|
||||
],
|
||||
"rank_id" : "6",
|
||||
"server_id" : "90.90.176.104"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "7",
|
||||
"device_ip" : "192.168.111.100"
|
||||
}
|
||||
],
|
||||
"rank_id" : "7",
|
||||
"server_id" : "90.90.176.104"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "0",
|
||||
"device_ip" : "192.168.100.101"
|
||||
}
|
||||
],
|
||||
"rank_id" : "8",
|
||||
"server_id" : "90.90.176.102"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "1",
|
||||
"device_ip" : "192.168.101.101"
|
||||
}
|
||||
],
|
||||
"rank_id" : "9",
|
||||
"server_id" : "90.90.176.102"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "2",
|
||||
"device_ip" : "192.168.102.101"
|
||||
}
|
||||
],
|
||||
"rank_id" : "10",
|
||||
"server_id" : "90.90.176.102"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "3",
|
||||
"device_ip" : "192.168.103.101"
|
||||
}
|
||||
],
|
||||
"rank_id" : "11",
|
||||
"server_id" : "90.90.176.102"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "4",
|
||||
"device_ip" : "192.168.100.100"
|
||||
}
|
||||
],
|
||||
"rank_id" : "12",
|
||||
"server_id" : "90.90.176.102"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "5",
|
||||
"device_ip" : "192.168.101.100"
|
||||
}
|
||||
],
|
||||
"rank_id" : "13",
|
||||
"server_id" : "90.90.176.102"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "6",
|
||||
"device_ip" : "192.168.102.100"
|
||||
}
|
||||
],
|
||||
"rank_id" : "14",
|
||||
"server_id" : "90.90.176.102"
|
||||
},
|
||||
{
|
||||
"devices" : [
|
||||
{
|
||||
"device_id" : "7",
|
||||
"device_ip" : "192.168.103.100"
|
||||
}
|
||||
],
|
||||
"rank_id" : "15",
|
||||
"server_id" : "90.90.176.102"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"para_plane_nic_location" : "device",
|
||||
"para_plane_nic_name" : [
|
||||
"eth0"
|
||||
],
|
||||
"para_plane_nic_num" : "1",
|
||||
"status" : "completed"
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"group_count": "1",
|
||||
"group_list": [
|
||||
{
|
||||
"group_name": "worker",
|
||||
"device_count": "1",
|
||||
"instance_count": "1",
|
||||
"instance_list": [{"devices":[{"device_id":"2","device_ip":"192.168.101.102"}],"pod_name":"npu1p","server_id":"127.0.0.1"}]
|
||||
}
|
||||
],
|
||||
"status": "completed"
|
||||
}
|
||||
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"group_count": "1",
|
||||
"group_list": [
|
||||
{
|
||||
"group_name": "worker",
|
||||
"device_count": "8",
|
||||
"instance_count": "1",
|
||||
"instance_list": [{"devices":[{"device_id":"0","device_ip":"192.168.190.102"},{"device_id":"1","device_ip":"192.168.191.102"},{"device_id":"2","device_ip":"192.168.192.102"},{"device_id":"3","device_ip":"192.168.193.102"},{"device_id":"4","device_ip":"192.168.190.103"},{"device_id":"5","device_ip":"192.168.191.103"},{"device_id":"6","device_ip":"192.168.192.103"},{"device_id":"7","device_ip":"192.168.193.103"}],"pod_name":"npu8p","server_id":"127.0.0.1"}]
|
||||
}
|
||||
],
|
||||
"status": "completed"
|
||||
}
|
||||
+48
@@ -0,0 +1,48 @@
|
||||
#!/bin/sh
# Entry point that launches main.sh for the device ids given as arguments.
#
# Arguments: one or more device ids; they are forwarded to main.sh.
#
# exec_type selects the launch mode (its value below is presumably filled
# in by a templating step before deployment -- confirm with the tooling):
#   docker    : source npu_set_env.sh, start slogd, start main.sh, then poll
#               until no train.sh process is left.
#   otherwise : run main.sh as user HwHiAiUser through su and wait for it.

# "&&" so that a failed cd cannot silently yield the caller's directory.
currentDir=$(cd "$(dirname "$0")" && pwd)
cd "${currentDir}" || exit 1

# All arguments joined by spaces. The value is expanded unquoted further
# down so that every device id becomes a separate argument again.
DEVICE_LIST=$*

export exec_type={MODE}

# SIGTERM handler. In docker mode it runs the driver's stop script; it does
# not terminate this script, so the polling loop below keeps running.
prog_exit() {
  if [ x"${exec_type}" = xdocker ]; then
    # stop slogd progress
    bash /usr/local/Ascend/driver/tools/docker_stop_post_sys.sh
  fi
}

# register prog_exit
# POSIX spells the signal TERM; the SIG-prefixed form is only an optional
# extension and is not accepted by every /bin/sh.
trap prog_exit TERM

if [ x"${exec_type}" = xdocker ]; then
  #set env
  . "${currentDir}/npu_set_env.sh"

  # start slogd progress
  mkdir -p /var/log/npu/slog/slogd
  /usr/local/Ascend/driver/tools/docker/slogd &

  # start main.sh (DEVICE_LIST is split into words on purpose)
  "${currentDir}/main.sh" ${DEVICE_LIST} &

  # Poll every 5 seconds until no train.sh process is left. A plain "wait"
  # is not used here because slogd is a background job of this shell too.
  # Every round also appends a process snapshot to ps.log.
  flag=1
  while [ "${flag:-0}" -ne 0 ]; do
    sleep 5
    # The bracket keeps grep from counting its own command line.
    flag=$(ps -ef | grep -c '[t]rain\.sh')
    ps -ef >> "${currentDir}/ps.log"
    echo "" >> "${currentDir}/ps.log"
  done
else
  # Text after the first "=" on the line(s) of npu_set_env.sh that contain
  # the rank id assignment.
  RANK_ID=$(awk -F"=" '/RANK_ID=/ {print $2}' "${currentDir}/npu_set_env.sh")

  # start main.sh
  # The command string is expanded by this shell before su runs it, so a
  # currentDir containing whitespace is not supported here.
  su - HwHiAiUser -c ". ${currentDir}/npu_set_env.sh;export PROFILING_DIR=/var/log/npu/profiling/container/${RANK_ID};${currentDir}/main.sh ${DEVICE_LIST}" &
  wait
fi
|
||||
|
||||
+6
@@ -0,0 +1,6 @@
|
||||
{
|
||||
"server_count": "1",
|
||||
"server_list": [{"device":[{devices}],"server_id":"127.0.0.1"}],
|
||||
"status": "completed",
|
||||
"version": "1.0"
|
||||
}
|
||||
+18
@@ -0,0 +1,18 @@
|
||||
#!/bin/sh
# Start one train.sh per device id in parallel and wait for all of them.
#
# Arguments: device ids (several ids inside one argument are split on
#            whitespace as well).
# Output:    progress lines are appended to main.log next to this script.
# Exit code: 0 when every train.sh exited with 0, 1 otherwise.

# "&&" so that a failed cd cannot silently yield the caller's directory.
currentDir=$(cd "$(dirname "$0")" && pwd)
cd "${currentDir}" || exit 1

# Append one timestamped line to main.log.
#   $1 - level tag (INFO, ERROR)
#   $2 - message text
log_main() {
  echo "[$(date +%Y%m%d-%H:%M:%S)] [$1] $2" >> "${currentDir}/main.log"
}

# Launch train.sh for every device id, then collect each exit status.
# Returns 0 when all of them succeeded, 1 otherwise.
run_all_devices() {
  device_group=$*
  train_pids=""
  failed=0

  touch "${currentDir}/main.log"

  # Unquoted on purpose: the joined argument string is split back into
  # single device ids.
  for device_phy_id in ${device_group}; do
    log_main INFO "start: train.sh ${device_phy_id} & "
    "${currentDir}/train.sh" "${device_phy_id}" &
    train_pids="${train_pids} $!"
  done

  # Wait per pid instead of a bare "wait" so a failing train.sh is noticed.
  for train_pid in ${train_pids}; do
    wait "${train_pid}"
    train_rc=$?
    if [ "${train_rc}" -ne 0 ]; then
      log_main ERROR "train.sh (pid ${train_pid}) exit status ${train_rc}"
      failed=1
    fi
  done

  log_main INFO "all train.sh exit "
  return "${failed}"
}

run_all_devices "$@"
|
||||
+40
@@ -0,0 +1,40 @@
|
||||
# Environment for Ascend 910 training jobs. Meant to be sourced.
#
# Values wrapped in curly braces look like template slots that are filled
# in before the file is used -- confirm with the deployment tooling. The
# commented-out rank lines are kept unchanged in case other scripts read
# this file as text.

# main env
# ${VAR:+...} is used for PYTHONPATH so that an unset variable does not
# leave an empty entry (a leading or trailing ":") in the search path.
if [ -d /usr/local/Ascend/nnae/latest ]; then
  export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/Ascend/driver/tools/hccn_tool/:/usr/local/mpirun4.0/lib
  # The tfplugin site-packages directory is listed once; a second identical
  # entry at the end would not change the lookup order.
  export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/"
  export PATH="$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin"
  export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp
else
  export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/mpirun4.0/lib
  # projectDir is not set in this file; it is appended only when the
  # sourcing shell provides it.
  export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest//fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages${projectDir:+:${projectDir}}"
  export PATH="$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin"
  export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
fi
export SOC_VERSION=Ascend910
export HCCL_CONNECT_TIMEOUT=600

# user env
export JOB_ID={JOB_ID}
export RANK_TABLE_FILE={RANK_TABLE_FILE}
#export RANK_SIZE={RANK_SIZE}
#export RANK_INDEX={RANK_INDEX}
#export RANK_ID={RANK_ID}

# profiling env
export PROFILING_MODE={PROFILING_MODE}
export AICPU_PROFILING_MODE={AICPU_PROFILING_MODE}
export PROFILING_OPTIONS={PROFILING_OPTIONS}
export FP_POINT={FP_POINT}
export BP_POINT={BP_POINT}

# debug env
#export DUMP_GE_GRAPH=2
#export DUMP_OP=1
#export DUMP_OP_LESS=1
#export PRINT_MODEL=1
#export TE_PARALLEL_COMPILER=0

# system env
# Removes the core file size limit for this shell and its children.
ulimit -c unlimited
|
||||
+33
@@ -0,0 +1,33 @@
|
||||
#!/bin/sh
# Run the training command for a single device.
#
# Argument: $1 - device id (required).
#
# Files written next to this script:
#   env_<id>.log    snapshot of the environment before training starts
#   <id>/           working directory, emptied at the start of every run
#   train_<id>.log  output of the training command, followed by a final
#                   "turing train success" or "turing train fail" line
# Exit code: the status of the training command; 1 when the device id is
#            missing or the working directory cannot be entered.

# "&&" so that a failed cd cannot silently yield the caller's directory.
# PWD is already set by cd, so it is not assigned by hand.
currentDir=$(cd "$(dirname "$0")" && pwd)
cd "${currentDir}" || exit 1

# Prepare the per-device environment and working directory, then run the
# training command.
#   $1 - device id
run_training() {
  device_id=$1

  if [ -z "${device_id}" ]; then
    # With an empty id this log file is named train_.log.
    echo "turing train fail" >> "${currentDir}/train_${device_id}.log"
    return 1
  fi
  export DEVICE_ID="${device_id}"

  # RANK_INDEX counts as 0 when the environment does not provide it.
  DEVICE_INDEX=$(( DEVICE_ID + ${RANK_INDEX:-0} * 8 ))
  export DEVICE_INDEX

  env > "${currentDir}/env_${device_id}.log"

  #mkdir exec path
  mkdir -p "${currentDir}/${device_id}"
  # ":?" aborts instead of expanding to "<dir>//*" should the id be empty.
  rm -rf "${currentDir}/${device_id:?}"/*
  if ! cd "${currentDir}/${device_id}"; then
    echo "turing train fail" >> "${currentDir}/train_${device_id}.log"
    return 1
  fi

  #start exec
  python3.7 {RUN_ALGORITHM_CMD} {CHECKPOINT_DIR} > "${currentDir}/train_${device_id}.log" 2>&1
  train_rc=$?

  if [ "${train_rc}" -eq 0 ]; then
    echo "turing train success" >> "${currentDir}/train_${device_id}.log"
  else
    echo "turing train fail" >> "${currentDir}/train_${device_id}.log"
  fi

  # Hand the training result to the caller instead of always reporting 0.
  return "${train_rc}"
}

run_training "$@"
|
||||
Reference in New Issue
Block a user