[add]上传训练benchmark by z00560161
This commit is contained in:
@@ -0,0 +1,23 @@
|
||||
#!/bin/bash
# Print the entries of one top-level node of a yaml file as key="value" lines.
#
# args:
#   $1: path of the yaml file
#   $2: name of the node (section) to read
#
# output (stdout), one line per entry of the node:
#   key1="value1"
#   key2="value2"
#   ...
#
# exit status:
#   0 on success; 1, with a message on stderr, when nothing could be read
#   (unreadable file, missing node, empty node).
#
# The output can be loaded straight into shell variables with
#   eval "$(./get_params_for_yaml.sh "$yamlPath" "$section")"
#
# NOTE(review): values are written between plain double quotes without any
# escaping, so a value containing `"`, `$` or a backquote is interpreted by the
# caller's eval. Only use this with trusted yaml files.

#######################################
# Read one node of a yaml file and print it as key="value" lines.
# Arguments: $1 - yaml file path, $2 - node name
# Outputs:   key="value" lines on stdout, diagnostics on stderr
# Returns:   0 on success, 1 when no entry could be read
#######################################
get_params() {
  local yaml_file=$1 section=$2 python_bin params

  # The file path and node name are handed over as argv instead of being
  # pasted into the Python source, so quotes in them cannot break the code.
  # NOTE(review): yaml.FullLoader is kept as-is; yaml.safe_load would be the
  # stricter choice if the configs never rely on python-specific tags.
  local py_code='
import sys
import yaml

path, section = sys.argv[1], sys.argv[2]
with open(path) as stream:
    config = yaml.load(stream, Loader=yaml.FullLoader)
node = config.get(section) if isinstance(config, dict) else None
if isinstance(node, dict):
    print("\n".join("%s=\"%s\"" % item for item in node.items()))
'

  # python3.7 stays the preferred interpreter; python3 is only a fallback for
  # machines where that exact name does not exist.
  if command -v python3.7 >/dev/null 2>&1; then
    python_bin=python3.7
  else
    python_bin=python3
  fi

  params=$("${python_bin}" -c "${py_code}" "${yaml_file}" "${section}")
  if [[ -z ${params} ]]; then
    # stderr on purpose: callers eval stdout, so an error text printed there
    # would be executed as a command.
    echo "path: ${yaml_file} not found key: ${section}" >&2
    return 1
  fi

  # printf instead of `echo -e`: backslashes inside values must stay untouched.
  printf '%s\n' "${params}"
}

get_params "$@"
|
||||
@@ -0,0 +1,9 @@
|
||||
{
    "server_count": "1",
    "server_list": [{
        "device": [{devices}],
        "server_id": "127.0.0.1"
    }],
    "status": "completed",
    "version": "1.0"
}
|
||||
@@ -0,0 +1,40 @@
|
||||
#!/bin/bash
# Create <modelDir>/config/<rank_size>p.json from the hccl_sample.json template
# that lives next to this script, replacing the {devices} placeholder with one
# JSON object (device_id, device_ip, rank_id) per selected device.
#
# Usage: set_json.sh <rank_size> <yamlPath> <modelDir> <config_section>
#   rank_size       number of devices to describe
#   yamlPath        yaml configuration file
#   modelDir        model directory; the json is written to <modelDir>/config
#   config_section  yaml node handed to get_params_for_yaml.sh
#
# Exit status: non-zero when rank_size is invalid, the device list does not
# match rank_size, the template cannot be copied, or sed fails.

#######################################
# Print the device ids to use for <rank_size> devices.
# Uses the variable device_group_<rank_size>p (loaded from the yaml section)
# when it is set and non-empty, otherwise the sequence 0 .. rank_size-1.
# Arguments: $1 - rank_size (decimal digits only)
# Outputs:   device ids on stdout
#######################################
resolve_device_group() {
  local size=$1 group_var group
  group_var="device_group_${size}p"
  group=${!group_var:-}   # indirect expansion instead of eval
  if [[ -z ${group} ]]; then
    # 10# forces base 10 so that a value such as "08" is not read as octal.
    group=$(seq 0 "$(( 10#${size} - 1 ))")
  fi
  printf '%s\n' "${group}"
}

#######################################
# Build the replacement text for the {devices} placeholder.
# rank_id counts up from 0 in argument order. The text uses the two-character
# sequence \n as separator, which sed turns into line breaks on substitution.
# Arguments: device ids, one per argument
# Outputs:   comma separated JSON objects on stdout (no trailing comma)
#######################################
build_device_entries() {
  local device_id device_ip entry entries="" rank_id=0
  for device_id in "$@"; do
    # The address is the text after "ipaddr:" in the hccn_tool output.
    device_ip=$(hccn_tool -i "${device_id}" -ip -g | awk -F ":" '/ipaddr/{print $2}')
    if [[ -z ${device_ip} ]]; then
      echo "WARNING: hccn_tool reported no ipaddr for device ${device_id}" >&2
    fi
    printf -v entry '\\n{\\n"device_id": "%s",\\n"device_ip": "%s",\\n"rank_id": "%s"\\n}' \
      "${device_id}" "${device_ip}" "${rank_id}"
    entries+="${entries:+,}${entry}"
    rank_id=$(( rank_id + 1 ))
  done
  printf '%s' "${entries}"
}

main() {
  local rank_size=$1 yamlPath=$2 modelDir=$3 config_section=$4
  local currentDir
  currentDir=$(cd "$(dirname "$0")" && pwd)

  # Load the yaml section as shell variables.
  # NOTE(review): this evaluates the helper's output; a yaml key named like one
  # of the variables above overrides it, as it always did.
  eval "$("${currentDir}/get_params_for_yaml.sh" "${yamlPath}" "${config_section}")"

  if ! [[ ${rank_size} =~ ^[0-9]+$ ]]; then
    echo "ERROR: rank_size must be a non-negative integer, got: '${rank_size}'" >&2
    return 1
  fi

  # Device list; falls back to 0 .. rank_size-1 when none is configured.
  local device_group
  device_group=$(resolve_device_group "${rank_size}")

  # Split the list on whitespace without glob expansion. read returns non-zero
  # at end of input, which is expected here.
  local -a device_ids=()
  read -r -d '' -a device_ids <<< "${device_group}" || true
  if (( ${#device_ids[@]} != 10#${rank_size} )); then
    echo "ERROR: device_group: $device_group, quantity is not equal to rank_size: $rank_size" >&2
    return 1
  fi

  local HCCL_dir="${modelDir}/config"
  local rank_table="${HCCL_dir}/${rank_size}p.json"
  if ! cp -- "${currentDir}/hccl_sample.json" "${rank_table}"; then
    echo "ERROR: cannot create ${rank_table} from ${currentDir}/hccl_sample.json" >&2
    return 1
  fi

  local devices
  devices=$(build_device_entries "${device_ids[@]}")
  sed -i 's#{devices}#'"${devices}"'#g' "${rank_table}"
}

main "$@"
|
||||
@@ -0,0 +1,139 @@
|
||||
#!/bin/bash
# Start one training job through <modelDir>/scripts/run.sh, either inside
# docker or directly on the host, then poll it every 5 seconds until it ends
# or the timeout is exceeded.
#
# Usage: <this script> <model> <hardware> <yamlPath> <modelDir> <framework>
#   model      where to run the job: "docker" or "host"
#   hardware   "cluster" for a multi-node run; otherwise the device count is
#              this value with its last character removed (e.g. "8p" -> 8)
#   yamlPath   yaml configuration file
#   modelDir   model directory
#   framework  "tensorflow", "pytorch", or anything else (mindspore_config)
#
# The yaml section of the chosen framework is loaded into shell variables via
# get_params_for_yaml.sh; the keys read here are mpirun_ip, device_group_<N>p,
# docker_image, timeout and the data paths mounted into the container.

#######################################
# Parse a "ip:count,ip:count,..." node list.
# Every count must be a power of two and equal to the first count.
# Globals:   rank_size (written: sum of all counts)
#            MPIRUN_ALL_IP (exported: the ips, space separated)
# Arguments: $1 - node list
# Returns:   0 on success, 1 after an error message when the list is invalid
#######################################
parse_cluster_nodes() {
  local spec=$1 node count first_count="" total=0
  local -a nodes=() ips=()

  # IFS is changed for this single read only; a script-wide IFS="," would
  # break every later word split (device lists, pid lists).
  IFS=',' read -r -a nodes <<< "${spec}"
  for node in "${nodes[@]}"; do
    [[ -n ${node} ]] || continue
    count=${node#*:}
    : "${first_count:=${count}}"
    # n & (n-1) is 0 only when n has at most one bit set.
    if ! [[ ${count} =~ ^[0-9]+$ ]] \
      || (( (10#${count} & (10#${count} - 1)) != 0 )) \
      || (( 10#${count} != 10#${first_count} )); then
      echo "mpirun_ip: ${spec} error" >&2
      return 1
    fi
    total=$(( total + 10#${count} ))
    ips+=("${node%:*}")
  done

  rank_size=${total}
  export MPIRUN_ALL_IP="${ips[*]}"
}

# Print the model name: the yaml file name up to its first dot.
model_name_from_yaml() {
  local file_name=${1##*/}
  printf '%s\n' "${file_name%%.*}"
}

# Append "-v path:path" to mount_args for every non-empty argument.
add_mount() {
  local path
  for path in "$@"; do
    if [[ -n ${path} ]]; then
      mount_args+=(-v "${path}:${path}")
    else
      echo "[$(date +%Y%m%d-%H:%M:%S)] [WARN] empty data path, docker mount skipped" >&2
    fi
  done
}

#######################################
# Fill mount_args with the docker volume options for the data paths of the
# given framework/model. Other frameworks get no data mounts.
# Globals:   mount_args (written); input_files_dir, eval_files_dir, data_url,
#            ckpt_path, training_file_pattern (read, loaded from the yaml)
# Arguments: $1 - framework, $2 - model name
#######################################
collect_data_mounts() {
  local fw=$1 model_name=$2
  mount_args=()
  case "${fw}" in
    tensorflow)
      case "${model_name}" in
        Bert-Base | Bert-Large)
          add_mount "${input_files_dir}" "${eval_files_dir}"
          ;;
        MobileNet | YoLoV3)
          add_mount "${data_url}" "${ckpt_path}"
          ;;
        SSD-Resnet34)
          # Mount everything up to and including the last "raw_data".
          add_mount "${training_file_pattern%raw_data*}raw_data"
          ;;
        *)
          add_mount "${data_url}"
          ;;
      esac
      ;;
    pytorch)
      case "${model_name}" in
        ResNet50) add_mount "${data_url}" "${ckpt_path}" ;;
        *) add_mount "${data_url}" ;;
      esac
      ;;
  esac
}

#######################################
# Print the pids that are exactly <depth> generations below <pid>.
# Arguments: $1 - root pid, $2 - depth
# Outputs:   one pid per line; nothing when there are none
#######################################
descendants_at_depth() {
  local pids=$1 depth=$2 level
  for (( level = 0; level < depth; level++ )); do
    [[ -n ${pids} ]] || return 0
    # pgrep -P accepts a comma separated list of parent pids.
    pids=$(pgrep -P "${pids//$'\n'/,}")
  done
  [[ -z ${pids} ]] || printf '%s\n' "${pids}"
}

main() {
  local model=$1 hardware=$2 yamlPath=$3 modelDir=$4 framework=$5
  local modelScripts="${modelDir}/scripts"
  local currentDir yamlDir train_dir config_section
  currentDir=$(cd "$(dirname "$0")"; pwd)
  yamlDir=$(cd "$(dirname "${yamlPath}")"; pwd)
  train_dir=${currentDir%train*}/train
  # Default in seconds; set before the yaml is loaded so that a "timeout" key
  # in the yaml section replaces it.
  local timeout=360000

  # Load the configuration of the selected framework from the yaml file.
  case "${framework}" in
    tensorflow) config_section="tensorflow_config" ;;
    pytorch) config_section="pytorch_config" ;;
    *) config_section="mindspore_config" ;;
  esac
  # NOTE(review): evaluates the helper's output; only use trusted yaml files.
  eval "$("${currentDir}/get_params_for_yaml.sh" "${yamlPath}" "${config_section}")"

  local rank_size
  if [[ ${hardware} == "cluster" ]]; then
    export CLUSTER=True
    parse_cluster_nodes "${mpirun_ip}" || return 1
  else
    rank_size=${hardware%?}
  fi

  # Device list: the configured device_group_<N>p, or 0 .. N-1 when none is
  # configured or when N >= 8.
  local device_group="" group_var
  if [[ ${rank_size} =~ ^[0-9]+$ ]]; then
    group_var="device_group_${rank_size}p"
    device_group=${!group_var:-}
    if [[ -z ${device_group} ]] || (( 10#${rank_size} >= 8 )); then
      device_group=$(seq 0 "$(( 10#${rank_size} - 1 ))")
    fi
  fi

  # The json file is only needed for single-node tensorflow runs.
  if [[ ${framework} == "tensorflow" && ${hardware} != "cluster" ]]; then
    bash "${currentDir}/set_json.sh" "${rank_size}" "${yamlPath}" "${modelDir}" "${config_section}" \
      || return 1
  fi

  # Derived for every framework: the pytorch mounts depend on it as well.
  local train_model_name
  train_model_name=$(model_name_from_yaml "${yamlPath}")

  local workshell=""
  case "${model}" in
    docker)
      if [[ ${hardware} == "cluster" ]]; then
        # Multi-node: run inside the already running "mpirun" container.
        docker exec -i mpirun /bin/bash -c "${modelScripts}/run.sh ${rank_size} ${yamlPath} ${currentDir} ${CLUSTER} '${MPIRUN_ALL_IP}'" &
        workshell=$!
      else
        local -a device_args=()
        local device_id
        # Word splitting of the device list is intended here.
        for device_id in ${device_group}; do
          device_args+=("--device=/dev/davinci${device_id}")
        done
        # Data paths that have to be visible inside the container.
        collect_data_mounts "${framework}" "${train_model_name}"
        docker run -i --ipc=host \
          "${device_args[@]}" --device=/dev/davinci_manager \
          --device=/dev/devmm_svm --device=/dev/hisi_hdc \
          -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
          -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
          -v "${train_dir}:${train_dir}" \
          -v "${modelDir}:${modelDir}" \
          "${mount_args[@]}" \
          -v "${yamlDir}:${yamlDir}" \
          -v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf \
          -v /var/log/npu/slog/:/var/log/npu/slog -v /var/log/npu/profiling/:/var/log/npu/profiling \
          -v /var/log/npu/dump/:/var/log/npu/dump -v /var/log/npu/:/usr/slog "${docker_image}" \
          /bin/bash -c "${modelScripts}/run.sh ${rank_size} ${yamlPath} ${currentDir}" &
        workshell=$!
      fi
      ;;
    host)
      bash "${modelScripts}/run.sh" "${rank_size}" "${yamlPath}" "${currentDir}" &
      workshell=$!
      ;;
    *)
      echo "[$(date +%Y%m%d-%H:%M:%S)] [WARN] unknown run mode '${model}', no job started" >&2
      ;;
  esac

  # Poll the background job; kill -0 only tests whether the pid still exists.
  local timeused=0 stale_pids
  while [[ -n ${workshell} ]] && kill -0 "${workshell}" 2>/dev/null; do
    echo "[$(date +%Y%m%d-%H:%M:%S)] [INFO] train job is working, wait more 5s "
    sleep 5
    timeused=$(( timeused + 5 ))
    # Past the configured timeout: kill the processes three generations below
    # the job (presumably the python training processes - TODO confirm).
    if [ "${timeused}" -gt "${timeout}" ]; then
      echo "[$(date +%Y%m%d-%H:%M:%S)] [ERROR] training timeout ! "
      stale_pids=$(descendants_at_depth "${workshell}" 3)
      if [[ -n ${stale_pids} ]]; then
        # One pid per line; splitting into separate arguments is intended.
        # shellcheck disable=SC2086
        kill -9 ${stale_pids}
      fi
      break
    fi
  done

  echo "[$(date +%Y%m%d-%H:%M:%S)] [INFO] process end "
}

main "$@"
|
||||
Reference in New Issue
Block a user