[add]上传训练benchmark by z00560161

This commit is contained in:
liang_chaoming@huawei.com
2020-10-19 20:22:23 +08:00
parent 22b83024f5
commit 82522e2f61
1225 changed files with 345421 additions and 0 deletions
@@ -0,0 +1,135 @@
# VERSION: 20.0.0.RC1
# Note: before building, place the Ascend packages, the CMake source tarball and
# the Open MPI tarball in the build context directory.
FROM ubuntu:18.04
# Proxy settings are deliberately not hard-coded here: an ENV that embeds a user
# name and password is stored in the image and visible to anyone who can pull it.
# http_proxy and https_proxy are predefined Docker build args, so pass them on the
# command line when a proxy is required, for example:
#   docker build --build-arg http_proxy=http://USER:PASSWORD@HOST:PORT --build-arg https_proxy=http://USER:PASSWORD@HOST:PORT .
ENV no_proxy=127.0.0.1,.huawei.com,localhost,local,.local
# Build-time parameters (file names are resolved relative to /tmp, see WORKDIR below).
ARG TF_PKG=tensorflow-1.15.0-cp37-cp37m-linux_aarch64.whl
ARG HOST_ASCEND_BASE=/usr/local/Ascend
ARG NNAE_PATH=/usr/local/Ascend/nnae/latest
ARG TF_PLUGIN_PATH=/usr/local/Ascend/tfplugin/latest
ARG INSTALL_ASCEND_PKGS_SH=install_ascend_pkgs.sh
ARG PREBUILD_SH=prebuild.sh
ARG POSTBUILD_SH=postbuild.sh
WORKDIR /tmp
# The whole build context (installers, tarballs, hook scripts) is staged in /tmp.
COPY . ./
COPY sources.list /etc/apt/
COPY pip.conf /root/.pip/
# Run the optional prebuild hook; a missing or failing hook does not stop the build.
RUN bash -c "test -f $PREBUILD_SH && bash $PREBUILD_SH || true"
# System packages and benchmark system dependencies.
# `apt-get update` and the installs share one layer so that a cached, stale
# package index is never combined with a changed package list. The index is not
# deleted here because a later step in this file runs `apt-get download`.
# The first group is installed without recommended packages and the second group
# with them, exactly as before. `make` is listed explicitly because the CMake and
# Open MPI build steps later in this file invoke it.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        curl \
        g++ \
        gcc \
        gfortran \
        libblas-dev \
        libblas3 \
        libffi-dev \
        libhdf5-dev \
        libicu60 \
        liblapack-dev \
        liblapack3 \
        libssl-dev \
        libxml2 \
        make \
        pkg-config \
        python3.7 \
        python3.7-dev \
        unzip && \
    apt-get install -y \
        inetutils-ping \
        iproute2 \
        net-tools \
        openssh-client \
        openssh-server \
        psmisc \
        vim \
        wget
# pip for Python 3.7.
# The unversioned get-pip.py only supports currently maintained Python releases,
# so the Python 3.7 specific bootstrap script is fetched instead. `-f` makes curl
# fail on an HTTP error instead of saving the error page as get-pip.py.
# python3-distutils is unpacked by hand (not installed) so that only its files are
# added for python3.7; the working directory is already /tmp.
RUN curl -fsSL https://bootstrap.pypa.io/pip/3.7/get-pip.py -o get-pip.py && \
    apt-get download python3-distutils && \
    dpkg-deb -x python3-distutils_*.deb / && \
    rm python3-distutils_*.deb && \
    python3.7 get-pip.py && \
    rm get-pip.py
# HwHiAiUser account and group.
RUN groupadd HwHiAiUser && \
    useradd -g HwHiAiUser -m -d /home/HwHiAiUser HwHiAiUser
# Python packages.
# Every install goes through pip3.7 (one line previously used plain `pip3`), and
# --no-cache-dir keeps the pip download cache out of the image layer.
# torchvision is installed separately because it must not pull in its dependencies.
RUN pip3.7 install --no-cache-dir \
        numpy \
        decorator \
        attrs \
        sympy==1.4 \
        cffi==1.12.3 \
        pyyaml \
        wheel \
        pathlib2 \
        grpcio \
        grpcio-tools \
        protobuf \
        scipy \
        Pillow==5.3.0 \
        requests && \
    pip3.7 install --no-cache-dir --no-deps torchvision
# Ascend packages
RUN bash $INSTALL_ASCEND_PKGS_SH
# Build and install CMake from the source tarball staged in /tmp.
# Extract, build, install and delete the source tree in one layer: a tree removed
# in a later layer would still be stored in the image. `cd` is used instead of
# WORKDIR so the cleanup can happen inside the same RUN.
RUN mkdir -p /usr/local/cmake-3.18.0 && \
    tar -zxf /tmp/cmake-3.18.0.tar.gz -C /tmp && \
    cd /tmp/cmake-3.18.0 && \
    ./configure --prefix=/usr/local/cmake-3.18.0 && \
    make && \
    make install && \
    cd /tmp && \
    rm -rf /tmp/cmake-3.18.0
# Build and install the Open MPI library the same way.
RUN mkdir -p /usr/local/mpirun4.0.2 && \
    tar -jxf /tmp/openmpi-4.0.2.tar.bz2 -C /tmp && \
    cd /tmp/openmpi-4.0.2 && \
    ./configure --prefix=/usr/local/mpirun4.0.2 && \
    make && \
    make install && \
    cd /tmp && \
    rm -rf /tmp/openmpi-4.0.2
WORKDIR /tmp
# TensorFlow installation.
# Library search path: HDF5 plus the Ascend add-ons, NNAE framework and driver libraries.
ENV LD_LIBRARY_PATH=/usr/lib/aarch64-linux-gnu/hdf5/serial:$HOST_ASCEND_BASE/add-ons:$NNAE_PATH/fwkacllib/lib64:$HOST_ASCEND_BASE/driver/lib64/common:$HOST_ASCEND_BASE/driver/lib64/driver:$LD_LIBRARY_PATH
RUN pip3.7 install $TF_PKG
# Runtime environment variables.
ENV GLOG_v=2
ENV TBE_IMPL_PATH=$NNAE_PATH/opp/op_impl/built-in/ai_core/tbe
ENV TF_PLUGIN_PKG=$TF_PLUGIN_PATH/tfplugin/python/site-packages
ENV FWK_PYTHON_PATH=$NNAE_PATH/fwkacllib/python/site-packages
ENV PATH=$NNAE_PATH/fwkacllib/ccec_compiler/bin:$PATH
ENV ASCEND_OPP_PATH=$NNAE_PATH/opp
ENV PYTHONPATH=$FWK_PYTHON_PATH:$FWK_PYTHON_PATH/auto_tune.egg:$FWK_PYTHON_PATH/schedule_search.egg:$TF_PLUGIN_PKG:$TBE_IMPL_PATH:$PYTHONPATH
ENV OPENMPI=/usr/local/mpirun4.0.2/
# Prepend the Open MPI libraries instead of replacing the variable: assigning only
# $OPENMPI/lib/ discarded the HDF5 and Ascend library paths configured above.
ENV LD_LIBRARY_PATH=$OPENMPI/lib/:$LD_LIBRARY_PATH
ENV PATH=$OPENMPI/bin:$PATH
# Passwordless SSH: generate a root key pair, authorize it for root, and relax
# host key checking for outgoing connections.
# NOTE(review): the root password is hard-coded to "1234" and StrictHostKeyChecking
# is disabled; confirm this image is only used on trusted, isolated networks.
RUN ssh-keygen -t rsa -f ~/.ssh/id_rsa -P '' && cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys && \
    sed -i 's/PermitEmptyPasswords yes/PermitEmptyPasswords no /' /etc/ssh/sshd_config && \
    sed -i 's/PermitRootLogin without-password/PermitRootLogin yes /' /etc/ssh/sshd_config && \
    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config && \
    echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
    echo "root:1234" | chpasswd
# Start sshd, then drop into a shell. No USER instruction is present, so this runs
# as root; `sudo` is never installed by this Dockerfile, and calling it made the
# ssh service start fail.
CMD [ "sh", "-c", "service ssh start; bash"]
# Run the optional postbuild hook, then remove it. `rm -f` keeps the build from
# failing when the hook does not exist, matching the tolerant `test -f ... || true`.
RUN bash -c "test -f $POSTBUILD_SH && bash $POSTBUILD_SH || true" && \
    rm -f $POSTBUILD_SH
@@ -0,0 +1,27 @@
#!/bin/bash
#--------------------------------------------------------------------------------
# VERSION: 20.0.0.RC1
# Installs the Ascend software packages (write this script in bash syntax).
#
# Note: this script is not removed automatically after it runs; delete it in
# postbuild.sh if it should not remain in the image.
#--------------------------------------------------------------------------------
nnae_installer=Ascend-cann-nnae_20.1.0.B030_linux-aarch64.run
tfplugin_installer=Ascend-fwk-tfplugin_20.1.0.B030_linux-aarch64.run

# Make an installer executable and run it quietly against /usr/local/Ascend/.
run_installer() {
    chmod +x "$1"
    "./$1" --install-path=/usr/local/Ascend/ --install --quiet
}

# Before building, copy the host's /etc/ascend_install.info into the build directory.
cp ascend_install.info /etc/

# Before building, copy the host's /usr/local/Ascend/driver/version.info into the
# build directory.
mkdir -p /usr/local/Ascend/driver/
cp version.info /usr/local/Ascend/driver/

# NNAE package first, then the TF plugin package.
run_installer "${nnae_installer}"
run_installer "${tfplugin_installer}"

# The driver files were only needed to install the packages; clean them up. At
# container start the driver is mounted in through ascend docker.
rm -f version.info
rm -rf /usr/local/Ascend/driver/
@@ -0,0 +1,39 @@
#!/bin/bash
#--------------------------------------------------------------------------------
# VERSION: 20.0.0.RC1
# Cleanup hook (write this script in bash syntax): removes installers, scripts,
# proxy configuration and anything else that should not stay in the container.
# It runs after the main build steps have finished.
#
# Note: this script is removed automatically after it runs and does not remain in
# the image; the script location and the working directory are both /tmp.
#--------------------------------------------------------------------------------
rm -f ascend_install.info
rm -f prebuild.sh
rm -f install_ascend_pkgs.sh
rm -f Dockerfile*
# -r is required here: besides the tarballs, the patterns also match the
# extracted source directories (cmake-3.18.0/, openmpi-4.0.2/), which plain
# `rm -f` refuses to delete.
rm -rf cmake*
rm -rf openmpi*
rm -f Ascend-cann-nnae_20.1.0.B030_linux-aarch64.run
rm -f Ascend-fwk-tfplugin_20.1.0.B030_linux-aarch64.run
rm -f tensorflow-1.15.0-cp37-cp37m-linux_aarch64.whl
# rm -f /etc/apt/apt.conf.d/80proxy
# Replace the build-time DNS configuration with public resolvers.
tee /etc/resolv.conf <<- EOF
# This file is managed by man:systemd-resolved(8). Do not edit.
#
# This is a dynamic resolv.conf file for connecting local clients to the
# internal DNS stub resolver of systemd-resolved. This file lists all
# configured search domains.
#
# Run "systemd-resolve --status" to see details about the uplink DNS servers
# currently in use.
#
# Third party programs must not access this file directly, but only through the
# symlink at /etc/resolv.conf. To manage man:resolv.conf(5) in a different way,
# replace this symlink by a static file or a different symlink.
#
# See man:systemd-resolved.service(8) for details about the supported modes of
# operation for /etc/resolv.conf.
options edns0
nameserver 8.8.8.8
nameserver 8.8.4.4
EOF
@@ -0,0 +1,16 @@
#!/bin/bash
#--------------------------------------------------------------------------------
# VERSION: 20.0.0.RC1
# Preparation hook (write this script in bash syntax): performs setup work such as
# proxy configuration. It runs before the main build steps start.
#
# Note: this script is not removed automatically after it runs; delete it in
# postbuild.sh if it should not remain in the image.
#--------------------------------------------------------------------------------
# DNS configuration: overwrite /etc/resolv.conf with the nameserver below.
# Adjust the address to match the local environment.
dns_entry="nameserver 10.72.255.100"
printf '%s\n' "${dns_entry}" | tee /etc/resolv.conf
@@ -0,0 +1,166 @@
## atlasboost
[TOC]
### 产品介绍
atlasboost提供了如下功能:
(1) 一键式地启动单机或多机上的训练脚本,并行执行训练任务;
(2) 自动收集参与训练的device信息,生成rank table file
(3) 通过mpi重定向功能,可实时的监控训练过程;
### 目录结构
源代码的目录结构如下:
```
.
├── atlasboost
│   ├── common
│   │   ├── bin
│   │   ├── CMakeLists.txt
│   │   ├── context.cpp
│   │   ├── context.h
│   │   ├── control.cpp
│   │   ├── control.h
│   │   ├── json.cpp
│   │   ├── json.h
│   │   ├── operations.cpp
│   │   └── operations.h
│   └── tensorflow
│   ├── basics.py
│   ├── __init__.py
│   └── mpi_ops.py
├── build
│   ├── build.sh
│   ├── compile.sh
│   ├── compile_for_ci.sh
│   └── openmpi_setup.sh
├── config
├── lib
├── doc
├── opensource
├── output
├── README.md
└── test
├── mpi_local.sh
├── mpi.sh
└── test_tensorflow.py
```
目录结构说明如下:
(1) atlasboost: 用户在训练python脚本中导入的模块;
(2) common: C++源代码,用于收集device信息,生成rank table file
(3) tensorflow: 支持tensorflow框架,设置环境变量,对外提供python接口;
(4) build: 编译脚本,用于编译common中的C++源代码;
(5) test: 测试脚本,可用于测试运行环境;
### 支持的产品
Ascend 910
### 支持的版本
### atlasboost引入
(1)按照目录结构放入到一个公共的目录中,比如在当前服务器创建一个目录public,把以上目录结构放到public中,然后设置PYTHONPATH=$PYTHONPATH:./public/,外部就可以使用atlasboost接口了。
(2)通过执行./setup --path dir安装(--path可选,root用户的默认目录是/usr/local/atlasboost,非root用户默认目录是/home/username/atlasboost),则会在默认路径或者dir目录下创建atlasboost文件夹,把安装的内容放在此目录下;若dir/atlasboost已经存在,则会有交互提示(是否继续在此目录下安装,请输入y/n),输入y则会覆盖此目录下重名的文件,输入n则会退出安装。
### 环境依赖
atlasboost依赖于开源库Open MPI和Ascend 910软件中的DSMI接口;
(1) 安装Open MPI
下载4.0.2版本的Open MPI,下载地址:
https://www.open-mpi.org/software/ompi/v4.0/
解压
```
tar -jxvf openmpi-4.0.2.tar.bz2
```
配置,编译和安装
```
./configure
make && make install
```
使配置生效
```
ldconfig
```
测试
```
mpirun --version
```
(2) DSMI
atlasboost中调用DSMI接口获取device的相关信息,编译脚本compile.sh内容如下:
```
#!/bin/bash
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/Ascend/driver/kernel/inc/driver
export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/Ascend/driver/lib64/driver
CUR_DIR=$(dirname $(readlink -f $0))
cd ${CUR_DIR}/../atlasboost/common
echo 2 > /proc/sys/kernel/randomize_va_space
cmake .
make
```
其中CPLUS_INCLUDE_PATH和LIBRARY_PATH分别指定了DSMI头文件和对应的动态链接库路径。
### 使用说明
提示:由于通过gethostbyname获取服务器IP,故需要配置host。
#### 1.单机环境测试
将源代码在服务器上解压,然后编译:
```
cd atlasboost/build
./compile.sh
```
然后执行atlasboost/test目录下的测试脚本:
```
./mpi_local.sh
```
该测试程序创建了4条进程,分别收集了服务器上device0到device3的信息,在atlasboost/test生成一份rank_table_file,检查一下该文件中信息是否正确。
#### 2.单机多卡训练
将atlasboost文件夹复制到训练脚本中(只要Python导入模块时能找到) ,在python的启动脚本中导入atlasboost模块:
```
import atlasboost.tensorflow.mpi_ops as atlasboost
```
在python的启动脚本开始时调用atlasboost接口,在main函数中添加如下代码:
```
# 初始化时传入支持的框架(tensorflow或者mindspore),默认是tensorflow
atlasboost.init(frame="tensorflow")
device_id = atlasboost.local_rank()
atlasboost.set_device_id(device_id)
```
提示:若非mpi启动训练任务请不要调用以上接口,并且同一台机器上的device_id不要相同。
atlasboost模块初始化之后,每条进程会动态生成一个进程id,若在一台服务器上创建了n条进程,则进程id分别为0到n-1,用户需要根据进程id为每条进程分配一个device(process_id映射到device_id),可直接使用进程id作为device id,如上所示。
执行命令启动训练脚本:
```
mpirun -np 8 -bind-to none -map-by slot --allow-run-as-root ./start.sh
```
其中,-np参数指定启动进程个数,该命令在当前服务器上启动8条进程,start.sh为模型的启动脚本, atlasboost模块会在当前目录为每一台服务器创建rank_table_file,文件在启动目录中。
#### 3.多机环境部署与测试
首先在每台参与训练的服务器中进行单机环境测试;
在多机环境下使用atlasboost,需要配置启动训练服务器到其他参与训练服务器SSH免密登录;
在启动服务器生成公钥:
```
ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
```
将启动服务器的公钥发送到其他每台服务器
测试:
```
ssh xx.xx.xx.xx
```
若免密登录配置成功,则可直接使用SSH登录到xx.xx.xx.xx。
在每台服务器的相同位置保存一份atlasboost,若OS属于不同的CPU架构(arm或X86),需要重新编译;在启动服务器中,切换到atlasboost/test目录下,配置mpi.sh脚本:
```
#!/bin/bash
mpirun -H xx.xx.xx.xx:2,xx.xx.xx.xx:4 \
--allow-run-as-root \
--mca btl_tcp_if_exclude lo,docker0,endvnic \
python3 test_tensorflow.py
```
该脚本为在多台服务器上同时启动多条进程的命令,其中-H参数指定了启动哪些服务器上的test_tensorflow.py脚本以及每台服务器上启动几条进程,其中冒号后数值即为在该服务器上启动进程数,根据自己的环境进行配置。
执行测试脚本:
```
./mpi.sh
```
若多机环境正常,则会在每台服务器的atlasboost/test目录下生成进程的工作目录,工作目录中生成了rank table file。
#### 4.多机多卡训练
训练脚本经过单机多卡分布式部署的配置之后,将训练脚本复制到每台参与训练服务器的相同位置,然后执行如下命令:
```
mpirun -H xx.xx.xx.xx:8,xx.xx.xx.xx:8 \
--allow-run-as-root \
-bind-to none -map-by slot \
--mca btl_tcp_if_exclude lo,docker0,endvnic \
./mpi_start.sh
```
该命令在每台服务器上都启动了8条进程进行训练,每台服务器都生成了rank_table_file,其中--mca btl_tcp_if_exclude参数用于限制tcp通信时使用的网卡(不使用lo,docker0,endvnic)。
@@ -0,0 +1,48 @@
# -*- coding: utf-8 -*-
import os
import subprocess
import yaml
def get_model_parameter(config_type):
    """Load one configuration section from the YAML file named by ``YAML_PATH``.

    Args:
        config_type: Name of the top-level section to return. When the name
            contains ``"tensorflow"``, the ``mpirun_ip`` and ``docker_image``
            entries are removed from the returned section.

    Returns:
        The section as loaded from the YAML file, or ``None`` when the file has
        no such section.

    Raises:
        TypeError: If the ``YAML_PATH`` environment variable is not set.
        OSError: If the file cannot be opened.
    """
    yaml_path = os.getenv("YAML_PATH")
    with open(yaml_path, 'r') as f:
        # yaml.load() without an explicit Loader is rejected by PyYAML 6 and
        # can construct arbitrary objects on older releases; safe_load avoids both.
        model_parameter_dict = yaml.safe_load(f)
    parameter_dict = model_parameter_dict.get(config_type)
    if parameter_dict is not None and "tensorflow" in config_type:
        # These two entries are optional, so pop with a default instead of
        # raising KeyError when one is absent.
        parameter_dict.pop("mpirun_ip", None)
        parameter_dict.pop("docker_image", None)
    return parameter_dict
def get_environment_info(framework):
    """Collect a description of the machine and software the benchmark runs on.

    Args:
        framework: Framework name; ``"tensorflow"`` and ``"pytorch"`` are
            recognised case-insensitively, and the matching module is imported
            to read its version.

    Returns:
        A 5-tuple ``(cpu_info_dict, NPU_info, framework_info, os_info,
        benchmark_version)``. ``cpu_info_dict`` maps each ``lscpu`` field label
        to its value; ``framework_info`` is an empty string for any other
        framework name.
    """
    cpu_info = subprocess.getstatusoutput('lscpu')[1]
    # Everything from the "Flags" line onwards is dropped.
    cpu_info = cpu_info.split("\nFlags")[0]
    cpu_info_dict = {}
    # lscpu prints one "Label: value" pair per line. Splitting on whitespace
    # tokens mixed multi-word labels into the neighbouring values, so each line
    # is split once at its first colon instead.
    for line in cpu_info.splitlines():
        label, separator, value = line.partition(":")
        if not separator:
            continue
        cpu_info_dict[label.strip()] = value.strip()

    NPU_info = "Ascend910"

    framework_info = ""
    framework_name = framework.lower()
    # Imports stay local so only the framework actually in use has to be installed.
    if framework_name == "tensorflow":
        import tensorflow as tf
        framework_info = "tensorflow {}".format(tf.__version__)
    elif framework_name == "pytorch":
        import torch
        framework_info = "pytorch {}".format(torch.__version__)

    os_info = subprocess.getstatusoutput('cat /proc/version')[1]
    benchmark_version = "v1.0.0"
    return cpu_info_dict, NPU_info, framework_info, os_info, benchmark_version
@@ -0,0 +1,276 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import inspect
import logging
import json
import os
import re
import sys
import uuid
import datetime
# Version string embedded in every log line.
ABK_VERSION = "1.0.0"  # ABK version

# --- Tag names: environment and configuration -------------------------------
CPU_INFO = "cpu_info"
NPU_INFO = "npu_info"
OS_INFO = "os_info"
FRAMEWORK_INFO = "framework_info"
CONFIG_INFO = "config_info"
BENCHMARK_VERSION = "benchmark_version"
YAML_INFO = "yaml_info"
DATA_URL = "data_url"
DATASET = "dataset"

# --- Tag names: run lifecycle and timing ------------------------------------
RUN_START = "run_start"
RUN_STOP = "run_stop"
RUN_FINAL = "run_final"
ITERATION_TIME = "iteration_time"
TOTAL_RUNNING_TIME = "total_running_time"

# --- Tag names: input and optimizer -----------------------------------------
INPUT_SIZE = "input_size"
INPUT_BATCH_SIZE = "input_batch_size"
GLOBAL_BATCH_SIZE = "global_batch_size"
OPT_NAME = "opt_name"
OPT_LR = "opt_learning_rate"
BASE_LR = "base_lr"
OPT_MOMENTUM = "opt_momentum"
OPT_WEIGHT_DECAY = "opt_weight_decay"
LOSS_SCALE = "loss_scale"

# --- Tag names: losses -------------------------------------------------------
LOSS = "loss"
MLM_LOSS = "mlm_loss"
NSP_LOSS = "nsp_loss"
Average_LOSS = "average_loss"
MASKED_LM_LOSS = "masked_lm_loss"
NEXT_SENTENCE_LOSS = "next_sentence_loss"

# --- Tag names: accuracy and quality metrics --------------------------------
MASKED_LM_ACCURACY = "masked_lm_accuracy"
NEXT_SENTENCE_ACCURACY = "next_sentence_accuracy"
ACC = "acc"
F1 = "f1"
PREC = "prec"
REC = "rec"

# --- Tag names: training progress -------------------------------------------
GLOBAL_STEP = "global_step"
CURRENT_STEP = "current_step"
TRAIN_LOOP = "train_loop"
TOTAL_TRAIN_EPOCH = "total_train_epoch"
CURRENT_EPOCH = "current_epoch"
FPS = "fps"
THROWOUT = "throwout"
TRAIN_ACCURACY = "train_accuracy"
TRAIN_ACCURACY_TOP1 = "train_accuracy_top1"
TRAIN_ACCURACY_TOP5 = "train_accuracy_top5"
TRAIN_CHECKPOINT = "train_checkpoint"

# --- Tag names: evaluation ---------------------------------------------------
EVAL_START = "eval_start"
EVAL_SIZE = "eval_size"
EVAL_TARGET = "eval_target"
EVAL_ACCURACY = "eval_accuracy"
EVAL_ACCURACY_TOP1 = "eval_accuracy_top1"
EVAL_ACCURACY_TOP5 = "eval_accuracy_top5"
EVAL_STOP = "eval_stop"
EVAL_ITERATION_ACCURACY = "eval_iteration_accuracy"
EVAL_RESULTS = "eval_results"
# Set by imagenet_main.py
# Ordered tuple of every tag accepted by the logger.
REMARK_TAGS = (
    ABK_VERSION,
    CPU_INFO,
    NPU_INFO,
    OS_INFO,
    FRAMEWORK_INFO,
    CONFIG_INFO,
    BENCHMARK_VERSION,
    YAML_INFO,
    DATA_URL,
    DATASET,
    LOSS_SCALE,
    ITERATION_TIME,
    TOTAL_RUNNING_TIME,
    RUN_START,
    RUN_STOP,
    RUN_FINAL,
    INPUT_SIZE,
    GLOBAL_BATCH_SIZE,
    INPUT_BATCH_SIZE,
    OPT_NAME,
    TRAIN_ACCURACY,
    TRAIN_ACCURACY_TOP1,
    TRAIN_ACCURACY_TOP5,
    OPT_LR,
    BASE_LR,
    OPT_MOMENTUM,
    OPT_WEIGHT_DECAY,
    GLOBAL_STEP,
    CURRENT_STEP,
    TRAIN_LOOP,
    TOTAL_TRAIN_EPOCH,
    CURRENT_EPOCH,
    FPS,
    THROWOUT,
    TRAIN_CHECKPOINT,
    EVAL_START,
    EVAL_SIZE,
    EVAL_TARGET,
    EVAL_ACCURACY,
    EVAL_ACCURACY_TOP1,
    EVAL_ACCURACY_TOP5,
    EVAL_STOP,
    EVAL_ITERATION_ACCURACY,
    MLM_LOSS,
    NSP_LOSS,
    Average_LOSS,
    MASKED_LM_ACCURACY,
    MASKED_LM_LOSS,
    NEXT_SENTENCE_ACCURACY,
    NEXT_SENTENCE_LOSS,
    LOSS,
    EVAL_RESULTS,
    ACC,
    F1,
    PREC,
    REC,
)

# Tags checked when choosing between INFO and DEBUG level for a log line.
# It holds exactly the same tags as REMARK_TAGS, so it is derived from that
# tuple rather than listing every tag a second time.
STDOUT_TAG_SET = set(REMARK_TAGS)
ABK_VERSION = "1.0.0"  # ABK version
ROOT_DIR = None
PATTERN = re.compile('[a-zA-Z0-9]+')
# Optional path of the benchmark log file, taken from the environment.
LOG_FILE = os.getenv("REMARK_LOG_FILE")

# Logger shared by the whole module: INFO and above always go to stdout.
LOGGER = logging.getLogger('benchmark_log')
LOGGER.setLevel(logging.DEBUG)

_STREAM_HANDLER = logging.StreamHandler(stream=sys.stdout)
_STREAM_HANDLER.setLevel(logging.INFO)
LOGGER.addHandler(_STREAM_HANDLER)


def _benchmark_from_log_file(log_file):
    """Return the text between the first "_" and the following "." of log_file.

    Falls back to "unknown" when log_file is unset/empty or contains no "_".
    Previously this lookup raised at import time in those cases, even though
    the code below explicitly supports running without REMARK_LOG_FILE.
    """
    if not log_file:
        return "unknown"
    pieces = log_file.split("_")
    if len(pieces) < 2:
        return "unknown"
    return pieces[1].split(".")[0]


# Benchmark (model) name reported in every log line.
BENCHMARK = _benchmark_from_log_file(LOG_FILE)

if LOG_FILE:
    # With a log file configured, DEBUG and above are written to it.
    _FILE_HANDLER = logging.FileHandler(LOG_FILE)
    _FILE_HANDLER.setLevel(logging.DEBUG)
    LOGGER.addHandler(_FILE_HANDLER)
else:
    # Without a log file, stdout receives the DEBUG lines as well.
    _STREAM_HANDLER.setLevel(logging.DEBUG)
def get_caller(stack_index=2, root_dir=None):
    """Describe a frame on the current call stack as ``"file.py:lineno"``.

    Args:
        stack_index: How many frames above this function to look. The default
            of 2 selects the caller of the function that called ``get_caller``;
            an index of 2 or more fails when used from global scope.
        root_dir: Accepted for interface compatibility; it is not used, because
            the file name is always reduced to its base name.

    Returns:
        The base name of the frame's source file and its line number, joined
        by a colon.
    """
    frame = inspect.stack()[stack_index][0]
    frame_info = inspect.getframeinfo(frame)
    # Only the base name is kept so log lines stay short and readable.
    short_name = os.path.basename(frame_info.filename)
    return "{}:{:d}".format(short_name, frame_info.lineno)
# Default set of keys accepted by remark_print.
TAG_SET = set(REMARK_TAGS)


def remark_print(key, value=None, benchmark=BENCHMARK, stack_offset=0,
                 tag_set=TAG_SET, deferred=False, root_dir=ROOT_DIR,
                 extra_print=False):
    ''' Prints out a benchmark log line.

    key: The benchmark log key such as 'eval_accuracy_top1' or 'fps'.
    value: The value which contains no newlines. It is serialised with json.dumps.
    benchmark: Model type, such as resnet50.
    stack_offset: Increase the value to go deeper into the stack to find the
                  callsite. For example, if this is being called by a
                  wrapper/helper you may want to set stack_offset=1 to use the
                  callsite of the wrapper/helper itself.
    tag_set: The set of tags in which key must belong. When None, key only has
             to start with an alphanumeric character.
    deferred: The value is not presently known. In that case, a unique ID will
              be assigned as the value of this call and will be returned. The
              caller can then include said unique ID when the value is known
              later.
    root_dir: Directory prefix passed on to get_caller.
    extra_print: Print a blank line before logging to clear any text in the line.

    Returns the generated unique ID when deferred is True, otherwise None.
    Raises ValueError for an invalid key, or when deferred is True and a value
    is also supplied.

    Example output:
    :::ABK 1.0.0 resnet50 2020-08-12 06:22:09.670723 (hooks.py:149) fps: 681.8494655321242
    '''
    return_value = None

    # Validate the key. The two cases are kept apart because a membership test
    # against tag_set=None raised TypeError instead of accepting a valid key.
    if tag_set is None:
        key_is_valid = PATTERN.match(key) is not None
    else:
        key_is_valid = key in tag_set
    if not key_is_valid:
        raise ValueError('Invalid key for MLPerf print: ' + str(key))

    if value is not None and deferred:
        raise ValueError("deferred is set to True, but a value was provided")

    if deferred:
        return_value = str(uuid.uuid4())
        value = "DEFERRED: {}".format(return_value)

    if value is None:
        tag = key
    else:
        str_json = json.dumps(value)
        tag = "{key}: {value}".format(key=key, value=str_json)

    # +2 skips this function and get_caller itself.
    callsite = get_caller(2 + stack_offset, root_dir=root_dir)
    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')

    message = ':::ABK {version} {benchmark} {secs} ({callsite}) {tag}'.format(
        version=ABK_VERSION, secs=now, benchmark=benchmark, callsite=callsite, tag=tag)

    if extra_print:
        print()  # There could be prior text on a line

    # Compare the key, not the formatted "key: value" tag: the formatted text
    # is never a member of STDOUT_TAG_SET, so every line carrying a value was
    # logged at DEBUG level.
    if key in STDOUT_TAG_SET:
        LOGGER.info(message)
    else:
        LOGGER.debug(message)

    return return_value
@@ -0,0 +1,47 @@
#!/bin/bash
# Download a training dataset into a directory under /home/datasets.
# args:
#   $1: dataset selector
#       'YoLov3'       -> COCO 2014 (train/val images and annotations)
#       'Bert'         -> Bert data (download command is still a placeholder)
#       anything else  -> ImageNet ILSVRC2012
dataset="$1"

# "$dataset" is quoted so that a missing argument is compared as an empty string
# instead of producing a `[` syntax error. pushd is checked so that nothing is
# downloaded into the wrong directory when the target cannot be entered.
if [ "${dataset}" == 'YoLov3' ]; then
    # Get COCO 2014 data sets
    echo 111
    mkdir -p /home/datasets/coco
    pushd /home/datasets/coco || exit 1
    curl -O http://images.cocodataset.org/zips/train2014.zip
    unzip train2014.zip
    curl -O http://images.cocodataset.org/zips/val2014.zip
    unzip val2014.zip
    curl -O http://images.cocodataset.org/annotations/annotations_trainval2014.zip
    unzip annotations_trainval2014.zip
elif [ "${dataset}" == 'Bert' ]; then
    # Get bert/cule data sets
    echo 222
    mkdir -p /home/datasets/Bertdata
    pushd /home/datasets/Bertdata || exit 1
    # NOTE(review): the next two commands are placeholders and must be replaced
    # with the real download URL and extraction command.
    curl -O xxxxxxxxxxx
    tar xxxxx
else
    # Get imagenet_TF data sets
    echo 333
    mkdir -p /home/datasets/imagenet_TF
    pushd /home/datasets/imagenet_TF || exit 1
    curl -O http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar
    tar xvf ILSVRC2012_img_val.tar
    curl -O http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_train.tar
    tar xvf ILSVRC2012_img_train.tar
    curl -O http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_bbox_train_v2.tar
    tar xvf ILSVRC2012_bbox_train_v2.tar
fi
popd
@@ -0,0 +1,23 @@
#!/bin/bash
# Parse the configuration in a yaml file and print it as key/value pairs.
# args:
#   $1: path of the yaml file
#   $2: name of the section (top-level node) to read
#
# return:
#   key1="value1"
#   key2="value2"
#   ...
#
# Usage:
#   `eval $(./get_params_for_yaml.sh $yamlPath $section)`
# stores the parameters directly as shell variables.

# The path and the section name are handed to Python through sys.argv instead of
# being pasted into the Python source, so quotes or backslashes in either of them
# can no longer break or alter the program.
params=$(python3.7 -c '
import sys
import yaml

with open(sys.argv[1]) as yaml_file:
    section = yaml.load(yaml_file, Loader=yaml.FullLoader).get(sys.argv[2])
print("\n".join("%s=\"%s\"" % item for item in section.items()))
' "$1" "$2")

if [ x"$params" == x"" ];then
    echo "path: $1 not found key: $2"
    exit 1
fi

# printf prints the values verbatim; `echo -e` would expand backslash sequences
# that happen to occur inside a value.
printf '%s\n' "$params"
@@ -0,0 +1,9 @@
{
"server_count": "1",
"server_list": [{
"device": [{devices}],
"server_id": "127.0.0.1"
}],
"status": "completed",
"version": "1.0"
}
@@ -0,0 +1,40 @@
#!/bin/bash
# Generate ${modelDir}/config/${rank_size}p.json from hccl_sample.json by filling
# the {devices} placeholder with one entry per device.
# args:
#   $1: rank_size       number of devices
#   $2: yamlPath        yaml configuration file
#   $3: modelDir        model directory; the json is written to its config/ folder
#   $4: config_section  yaml section to read
rank_size=$1
yamlPath=$2
modelDir=$3
config_section=$4
currentDir=$(cd "$(dirname "$0")"; pwd)

# Load the configuration from the yaml section as shell variables.
eval $(${currentDir}/get_params_for_yaml.sh ${yamlPath} ${config_section})

# Device list: use device_group_<rank_size>p from the configuration when it is
# set, otherwise pick devices 0..rank_size-1 in order.
eval device_group=\$device_group_${rank_size}p
if [ -z "${device_group}" ]; then
    device_group="$(seq 0 $((rank_size - 1)))"
fi

# The number of listed devices must match rank_size.
device_list=($device_group)
if [ ${#device_list[@]} -ne ${rank_size} ]; then
    echo "ERROR: device_group: $device_group, quantity is not equal to rank_size: $rank_size"
    exit 1
fi

hccl_config_dir=$modelDir/config
target_json=${hccl_config_dir}/${rank_size}p.json
cp ${currentDir}/hccl_sample.json ${target_json}

# Build one entry per device; rank ids are assigned in list order. The text keeps
# literal "\n" sequences, which sed turns into line breaks in the replacement.
device_entries=""
rank_id=0
for device_id in $device_group; do
    device_ip=$(hccn_tool -i ${device_id} -ip -g | awk -F ":" '/ipaddr/{print $2}')
    device_entries+="\n{\n\"device_id\": \"${device_id}\",\n\"device_ip\": \"${device_ip}\",\n\"rank_id\": \"${rank_id}\"\n},"
    rank_id=$((rank_id + 1))
done

# ${device_entries%?} drops the trailing comma left after the last entry.
sed -i 's#{devices}#'"${device_entries%?}"'#g' ${target_json}
@@ -0,0 +1,139 @@
#!/bin/bash
# Start a training job on the host or in docker, then wait for it with a timeout.
# args:
#   $1: model      where to run: "docker" or "host"
#   $2: hardware   "cluster", or a device count followed by one trailing
#                  character that is stripped (presumably e.g. "8p" - confirm)
#   $3: yamlPath   yaml configuration file of the model
#   $4: modelDir   model directory; its scripts/run.sh is executed
#   $5: framework  "tensorflow", "pytorch", anything else selects mindspore_config
model=$1
hardware=$2
yamlPath=$3
modelDir=$4
framework=$5
modelScripts="$modelDir/scripts"
currentDir=$(cd "$(dirname "$0")"; pwd)
yamlDir=$(cd "$(dirname "${yamlPath}")";pwd)
train_dir=${currentDir%train*}/train
# Maximum training time in seconds before the job is killed.
timeout=360000

# Load the configuration from the yaml section that matches the framework.
if [ x"${framework}" == x"tensorflow" ]; then
    config_section="tensorflow_config"
elif [ x"${framework}" == x"pytorch" ]; then
    config_section="pytorch_config"
else
    config_section="mindspore_config"
fi
eval $(${currentDir}/get_params_for_yaml.sh ${yamlPath} ${config_section})

# Model name = yaml file name up to its first dot. It is computed here, before the
# framework-specific sections, because the pytorch section reads it too; it used
# to be set only inside the tensorflow section.
yaml_file_name=${yamlPath##*/}
train_model_name=${yaml_file_name%%.*}

if [ x"${hardware}" == x"cluster" ];then
    export CLUSTER=True
    # Split mpirun_ip ("ip:count,ip:count,...") on commas. IFS is changed only for
    # this one `read`; a global IFS="," broke word splitting of the newline
    # separated pgrep output in the timeout handling below.
    IFS="," read -r -a array <<< "${mpirun_ip}"
    m=${array[0]#*:}
    rank_size=0
    mpirun_all_ip=""
    for var in "${array[@]}"; do
        n=${var#*:}
        mpirun_all_ip+=" ${var%:*}"
        # a is 0 only when n is a power of two.
        let a="$n & ($n-1)"
        let rank_size+=$n
        # Every server must use the same, power-of-two number of processes.
        if [ $a -ne 0 ] || [ $n -ne $m ];then
            echo "mpirun_ip: $mpirun_ip error"
            exit 1
        fi
    done
    # Drop the leading space.
    export MPIRUN_ALL_IP=${mpirun_all_ip#?}
else
    # Strip the trailing character of the hardware argument to get the device count.
    rank_size=${hardware%?}
fi

# Device list: use device_group_<rank_size>p from the configuration when it is set
# and fewer than 8 devices are used, otherwise devices 0..rank_size-1.
eval device_group=\$device_group_${rank_size}p
if [ x"${device_group}" == x"" ] || [ ${rank_size} -ge 8 ];then
    device_group="$(seq 0 "$(expr $rank_size - 1)")"
fi

# Paths to map into the container for tensorflow.
if [ x"${framework}" == x"tensorflow" ]; then
    if [ x"${hardware}" != x"cluster" ];then
        # The json only has to be generated for single-server runs.
        bash ${currentDir}/set_json.sh ${rank_size} ${yamlPath} ${modelDir} ${config_section} || exit 1
    fi
    if [ x"${train_model_name}" == x"Bert-Base" ] || [ x"${train_model_name}" == x"Bert-Large" ]; then
        data_urls="-v ${input_files_dir}:${input_files_dir} -v ${eval_files_dir}:${eval_files_dir}"
    elif [ x"${train_model_name}" == x"MobileNet" ] || [ x"${train_model_name}" == x"YoLoV3" ]; then
        data_urls="-v ${data_url}:${data_url} -v ${ckpt_path}:${ckpt_path}"
    elif [ x"${train_model_name}" == x"SSD-Resnet34" ]; then
        raw_data=${training_file_pattern%raw_data*}raw_data
        data_urls="-v ${raw_data}:${raw_data}"
    else
        data_urls="-v ${data_url}:${data_url}"
    fi
fi

# Paths to map into the container for pytorch.
if [ x"${framework}" == x"pytorch" ]; then
    # The checkpoint mount is only added when ckpt_path is configured, so that a
    # configuration without it keeps the plain data mount used so far.
    if [ x"${train_model_name}" == x"ResNet50" ] && [ x"${ckpt_path}" != x"" ]; then
        data_urls="-v ${data_url}:${data_url} -v ${ckpt_path}:${ckpt_path}"
    else
        data_urls="-v ${data_url}:${data_url}"
    fi
fi

if [ x"$model" == x"docker" ];then
    # Run inside docker.
    if [ x"${hardware}" == x"cluster" ];then
        # Multi-server docker run through the already running "mpirun" container.
        docker exec -i mpirun /bin/bash -c "${modelScripts}/run.sh ${rank_size} ${yamlPath} ${currentDir} ${CLUSTER} '${MPIRUN_ALL_IP}'" &
    else
        # Pass every selected device node into the container.
        DEVICE_DEV=""
        for device_id in $device_group;do
            DEVICE_DEV=`echo "${DEVICE_DEV}" --device=/dev/davinci${device_id}`
        done
        docker run -i --ipc=host \
            ${DEVICE_DEV} --device=/dev/davinci_manager \
            --device=/dev/devmm_svm --device=/dev/hisi_hdc \
            -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
            -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
            -v ${train_dir}:${train_dir} \
            -v ${modelDir}:${modelDir} \
            ${data_urls} \
            -v ${yamlDir}:${yamlDir} \
            -v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf \
            -v /var/log/npu/slog/:/var/log/npu/slog -v /var/log/npu/profiling/:/var/log/npu/profiling \
            -v /var/log/npu/dump/:/var/log/npu/dump -v /var/log/npu/:/usr/slog ${docker_image} \
            /bin/bash -c "${modelScripts}/run.sh ${rank_size} ${yamlPath} ${currentDir}" &
    fi
elif [ x"$model" == x"host" ]; then
    # Run directly on the host.
    bash ${modelScripts}/run.sh ${rank_size} ${yamlPath} ${currentDir} &
fi

# PID of the background job started above.
workshell=$!
timeused=0
# Poll every 5 seconds until the job has gone or the timeout is exceeded.
while true
do
    ret=`ps -ef | grep ${modelScripts}/run.sh | grep ${workshell} | grep -v grep`
    if [ x"${ret}" = x ];
    then
        break
    else
        echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] train job is working, wait more 5s "
        sleep 5
        let timeused+=5
        # When the configured timeout is exceeded, kill the python training processes.
        if [ ${timeused} -gt ${timeout} ];
        then
            echo "[`date +%Y%m%d-%H:%M:%S`] [ERROR] training timeout ! "
            # Find the python process IDs: grandchildren of the job, then their children.
            train_sh_pid=`pgrep -P $(pgrep -P $workshell)`
            for pid in $train_sh_pid
            do
                id=`pgrep -P $pid`
                kill -9 $id
            done
            break
        fi
    fi
done
echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] process end "