[add]上传训练benchmark by z00560161
This commit is contained in:
@@ -0,0 +1,135 @@
|
||||
# VERSION: 20.0.0.RC1
# Note: download the Ascend packages, the CMake source tarball and the OpenMPI
# source tarball into the build-context directory before building.
FROM ubuntu:18.04

# Proxy settings are build-time inputs, not image content.
# http_proxy / https_proxy are predefined build arguments: they are available to
# every RUN step without being declared here and are kept out of `docker history`.
# Pass them on the command line when a proxy is required, for example:
#   docker build \
#     --build-arg http_proxy=http://USER:PASSWORD@proxy.example:8080 \
#     --build-arg https_proxy=http://USER:PASSWORD@proxy.example:8080 .
# Never commit proxy credentials to this file: ENV values are stored in the image
# metadata and exported into every container started from the image.
ARG no_proxy=127.0.0.1,.huawei.com,localhost,local,.local
|
||||
|
||||
|
||||
# Build-time parameters: TensorFlow wheel name, Ascend install locations and
# the names of the hook / install scripts found in the build context.
ARG TF_PKG=tensorflow-1.15.0-cp37-cp37m-linux_aarch64.whl
ARG HOST_ASCEND_BASE=/usr/local/Ascend
ARG NNAE_PATH=/usr/local/Ascend/nnae/latest
ARG TF_PLUGIN_PATH=/usr/local/Ascend/tfplugin/latest
ARG INSTALL_ASCEND_PKGS_SH=install_ascend_pkgs.sh
ARG PREBUILD_SH=prebuild.sh
ARG POSTBUILD_SH=postbuild.sh

# Stage the whole build context in /tmp, then put the apt source list and the
# pip configuration where the tools look for them.
WORKDIR /tmp
COPY . /tmp/
COPY sources.list /etc/apt/sources.list
COPY pip.conf /root/.pip/pip.conf
|
||||
|
||||
# Run the optional prebuild hook. A missing script or a failing hook never
# aborts the build.
RUN if [ -f $PREBUILD_SH ]; then bash $PREBUILD_SH || true; fi
|
||||
|
||||
|
||||
# System packages and benchmark runtime tools.
# `apt-get update` and the installs share one layer so a cached, stale package
# index can never be combined with a newer install list; apt-get is used
# because the `apt` front end has no stable command-line interface for scripts.
# The first group is installed without recommended packages and the second
# group with them, exactly as before.
# The package index is intentionally left in place: the pip bootstrap step
# further down runs `apt-get download`.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        curl \
        g++ \
        gcc \
        gfortran \
        libblas-dev \
        libblas3 \
        libffi-dev \
        libhdf5-dev \
        libicu60 \
        liblapack-dev \
        liblapack3 \
        libssl-dev \
        libxml2 \
        pkg-config \
        python3.7 \
        python3.7-dev \
        unzip \
    && apt-get install -y \
        inetutils-ping \
        iproute2 \
        net-tools \
        openssh-client \
        openssh-server \
        psmisc \
        vim \
        wget
|
||||
|
||||
# pip for Python 3.7.
# The generic https://bootstrap.pypa.io/get-pip.py only supports currently
# maintained Python versions, so the Python 3.7 specific script is fetched.
# -f makes curl fail on an HTTP error instead of saving the error page as
# get-pip.py; -L follows redirects.
# python3-distutils is unpacked with dpkg-deb instead of being installed
# (presumably to avoid pulling in the distribution's default python3 -- confirm).
# The working directory is already /tmp, so no directory change is needed.
RUN curl -fsSL https://bootstrap.pypa.io/pip/3.7/get-pip.py -o get-pip.py \
    && apt-get download python3-distutils \
    && dpkg-deb -x python3-distutils_*.deb / \
    && rm python3-distutils_*.deb \
    && python3.7 get-pip.py \
    && rm get-pip.py
|
||||
|
||||
# Create the HwHiAiUser group and a user of the same name whose home directory
# /home/HwHiAiUser is created.
RUN groupadd HwHiAiUser \
    && useradd --gid HwHiAiUser --create-home --home-dir /home/HwHiAiUser HwHiAiUser
|
||||
|
||||
# Python packages.
# Every install uses pip3.7 so all packages land in the Python 3.7 environment,
# and --no-cache-dir keeps pip's download cache out of the image layer.
# The packages are still installed one by one in the original order.
RUN pip3.7 install --no-cache-dir numpy \
    && pip3.7 install --no-cache-dir decorator \
    && pip3.7 install --no-cache-dir attrs \
    && pip3.7 install --no-cache-dir sympy==1.4 \
    && pip3.7 install --no-cache-dir cffi==1.12.3 \
    && pip3.7 install --no-cache-dir pyyaml \
    && pip3.7 install --no-cache-dir wheel \
    && pip3.7 install --no-cache-dir pathlib2 \
    && pip3.7 install --no-cache-dir grpcio \
    && pip3.7 install --no-cache-dir grpcio-tools \
    && pip3.7 install --no-cache-dir protobuf \
    && pip3.7 install --no-cache-dir scipy \
    && pip3.7 install --no-cache-dir Pillow==5.3.0 \
    && pip3.7 install --no-cache-dir --no-deps torchvision \
    && pip3.7 install --no-cache-dir requests
|
||||
|
||||
# Install the Ascend packages by running the script named by
# INSTALL_ASCEND_PKGS_SH from the working directory.
RUN bash ${INSTALL_ASCEND_PKGS_SH}
|
||||
|
||||
# Install CMake 3.18.0 from the source tarball staged in /tmp.
# Extraction, build, install and removal of the build tree happen in a single
# layer so the unpacked sources and object files are not stored in the image.
# The subshell confines the directory change, so the working directory stays
# /tmp and no relative WORKDIR is needed.
RUN tar -zxf cmake-3.18.0.tar.gz \
    && ( cd cmake-3.18.0 \
         && ./configure --prefix=/usr/local/cmake-3.18.0 \
         && make \
         && make install ) \
    && rm -rf cmake-3.18.0
|
||||
|
||||
# Install the Open MPI 4.0.2 library from the source tarball staged in /tmp.
# Extraction, build, install and removal of the build tree happen in a single
# layer so the unpacked sources and object files are not stored in the image.
# The subshell confines the directory change, so the working directory is /tmp
# before and after this step.
WORKDIR /tmp
RUN tar -jxf openmpi-4.0.2.tar.bz2 \
    && ( cd openmpi-4.0.2 \
         && ./configure --prefix=/usr/local/mpirun4.0.2 \
         && make \
         && make install ) \
    && rm -rf openmpi-4.0.2
|
||||
|
||||
# TensorFlow installation.
# Library search path, in order: HDF5 (serial), Ascend add-ons, the NNAE
# framework libraries, the two driver library directories, then whatever
# LD_LIBRARY_PATH held before.
ENV LD_LIBRARY_PATH=/usr/lib/aarch64-linux-gnu/hdf5/serial:${HOST_ASCEND_BASE}/add-ons:${NNAE_PATH}/fwkacllib/lib64:${HOST_ASCEND_BASE}/driver/lib64/common:${HOST_ASCEND_BASE}/driver/lib64/driver:${LD_LIBRARY_PATH}

# Install the TensorFlow wheel named by TF_PKG from the working directory.
RUN pip3.7 install ${TF_PKG}
|
||||
|
||||
# Runtime environment variables.
# Each variable gets its own ENV instruction because later ones reference
# earlier ones, and substitution only sees values set by previous instructions.
ENV GLOG_v=2
ENV TBE_IMPL_PATH=$NNAE_PATH/opp/op_impl/built-in/ai_core/tbe
ENV TF_PLUGIN_PKG=$TF_PLUGIN_PATH/tfplugin/python/site-packages
ENV FWK_PYTHON_PATH=$NNAE_PATH/fwkacllib/python/site-packages
ENV PATH=$NNAE_PATH/fwkacllib/ccec_compiler/bin:$PATH
ENV ASCEND_OPP_PATH=$NNAE_PATH/opp
ENV PYTHONPATH=$FWK_PYTHON_PATH:$FWK_PYTHON_PATH/auto_tune.egg:$FWK_PYTHON_PATH/schedule_search.egg:$TF_PLUGIN_PKG:$TBE_IMPL_PATH:$PYTHONPATH

# Open MPI location. The Open MPI library directory is prepended to the
# existing LD_LIBRARY_PATH; assigning it outright would discard the HDF5 and
# Ascend library directories configured before the TensorFlow install.
ENV OPENMPI=/usr/local/mpirun4.0.2/
ENV LD_LIBRARY_PATH=$OPENMPI/lib/:$LD_LIBRARY_PATH
ENV PATH=$OPENMPI/bin:$PATH
|
||||
|
||||
# Passwordless SSH login for root: generate a key pair, authorize it for root
# itself, and disable host-key checking for outgoing connections.
# NOTE(review): the root password below is hard-coded and trivially guessable,
# and it is readable in the image history. Replace it with a build secret or
# drop password login before using this image outside a closed test network.
RUN ssh-keygen -t rsa -f ~/.ssh/id_rsa -P '' \
    && cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys \
    && sed -i 's/PermitEmptyPasswords yes/PermitEmptyPasswords no /' /etc/ssh/sshd_config \
    && sed -i 's/PermitRootLogin without-password/PermitRootLogin yes /' /etc/ssh/sshd_config \
    && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config \
    && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
    && echo "root:1234" | chpasswd

# Start the SSH service, then hand the container over to an interactive shell.
# The container runs as root and sudo is not installed in this image, so the
# service is started directly; `exec` lets bash replace the wrapper shell.
CMD ["sh", "-c", "service ssh start; exec bash"]
|
||||
|
||||
|
||||
# Run the optional postbuild hook, then delete the hook script.
# `rm -f` keeps the build from failing when the script is absent, consistent
# with the `test -f` guard that already treats the hook as optional.
RUN bash -c "test -f $POSTBUILD_SH && bash $POSTBUILD_SH || true" \
    && rm -f $POSTBUILD_SH
|
||||
@@ -0,0 +1,27 @@
|
||||
#!/bin/bash

#--------------------------------------------------------------------------------
# VERSION: 20.0.0.RC1
# Write bash code here to install the Ascend software packages.
#
# Note: this script is not removed automatically after it runs; delete it in
# postbuild.sh if it should not remain in the image.
#--------------------------------------------------------------------------------

# Stop at the first failing command. Without this the script's exit status is
# that of the final cleanup command, so a failed installer would go unnoticed.
set -e

ASCEND_NNAE=Ascend-cann-nnae_20.1.0.B030_linux-aarch64.run
ASCEND_TFPLUGIN=Ascend-fwk-tfplugin_20.1.0.B030_linux-aarch64.run

# Before building, copy the host's /etc/ascend_install.info into the build directory.
cp ascend_install.info /etc/
# Before building, copy the host's /usr/local/Ascend/driver/version.info into the build directory.
mkdir -p /usr/local/Ascend/driver/
cp version.info /usr/local/Ascend/driver/

# Install the NNAE package.
chmod +x "${ASCEND_NNAE}"
"./${ASCEND_NNAE}" --install-path=/usr/local/Ascend/ --install --quiet

# Install the TFPlugin package.
chmod +x "${ASCEND_TFPLUGIN}"
"./${ASCEND_TFPLUGIN}" --install-path=/usr/local/Ascend/ --install --quiet

# The driver files were only needed to install NNAE, so remove them again; the
# driver is mounted into the container by Ascend Docker when it starts.
rm -f version.info
rm -rf /usr/local/Ascend/driver/
|
||||
@@ -0,0 +1,39 @@
|
||||
#!/bin/bash

#--------------------------------------------------------------------------------
# VERSION: 20.0.0.RC1
# Write bash code here to remove installers, scripts, proxy settings and anything
# else that should not remain in the container.
# This script runs after the regular build steps have finished.
#
# Note: this script is removed automatically after it runs and does not remain
# in the image; its location and working directory are /tmp.
#--------------------------------------------------------------------------------
rm -f ascend_install.info
rm -f prebuild.sh
rm -f install_ascend_pkgs.sh
rm -f Dockerfile*
# cmake* and openmpi* match the source tarballs and also the directories they
# are extracted into, so a recursive remove is required to delete both.
rm -rf cmake*
rm -rf openmpi*
rm -f Ascend-cann-nnae_20.1.0.B030_linux-aarch64.run
rm -f Ascend-fwk-tfplugin_20.1.0.B030_linux-aarch64.run
rm -f tensorflow-1.15.0-cp37-cp37m-linux_aarch64.whl
# rm -f /etc/apt/apt.conf.d/80proxy

# Replace the resolver configuration with public name servers.
tee /etc/resolv.conf <<- EOF
# This file is managed by man:systemd-resolved(8). Do not edit.
#
# This is a dynamic resolv.conf file for connecting local clients to the
# internal DNS stub resolver of systemd-resolved. This file lists all
# configured search domains.
#
# Run "systemd-resolve --status" to see details about the uplink DNS servers
# currently in use.
#
# Third party programs must not access this file directly, but only through the
# symlink at /etc/resolv.conf. To manage man:resolv.conf(5) in a different way,
# replace this symlink by a static file or a different symlink.
#
# See man:systemd-resolved.service(8) for details about the supported modes of
# operation for /etc/resolv.conf.
options edns0
nameserver 8.8.8.8
nameserver 8.8.4.4
EOF
|
||||
@@ -0,0 +1,16 @@
|
||||
#!/bin/bash

#--------------------------------------------------------------------------------
# VERSION: 20.0.0.RC1
# Write bash code here for preparation work such as configuring a proxy.
# This script runs before the regular build steps start.
#
# Note: this script is not removed automatically after it runs; delete it in
# postbuild.sh if it should not remain in the image.
#--------------------------------------------------------------------------------

# DNS configuration: overwrite /etc/resolv.conf with the name server below.
# Adjust the address to your own environment.
printf 'nameserver 10.72.255.100\n' | tee /etc/resolv.conf
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,166 @@
|
||||
## atlasboost
|
||||
[TOC]
|
||||
### 产品介绍
|
||||
atlasboost提供了如下功能:
|
||||
(1) 一键式地启动单机或多机上的训练脚本,并行执行训练任务;
|
||||
(2) 自动收集参与训练的device信息,生成rank table file;
|
||||
(3) 通过mpi重定向功能,可实时的监控训练过程;
|
||||
|
||||
### 目录结构
|
||||
源代码的目录结构如下:
|
||||
```
|
||||
.
|
||||
├── atlasboost
|
||||
│ ├── common
|
||||
│ │ ├── bin
|
||||
│ │ ├── CMakeLists.txt
|
||||
│ │ ├── context.cpp
|
||||
│ │ ├── context.h
|
||||
│ │ ├── control.cpp
|
||||
│ │ ├── control.h
|
||||
│ │ ├── json.cpp
|
||||
│ │ ├── json.h
|
||||
│ │ ├── operations.cpp
|
||||
│ │ └── operations.h
|
||||
│ └── tensorflow
|
||||
│ ├── basics.py
|
||||
│ ├── __init__.py
|
||||
│ └── mpi_ops.py
|
||||
├── build
|
||||
│ ├── build.sh
|
||||
│ ├── compile.sh
|
||||
│ ├── compile_for_ci.sh
|
||||
│ └── openmpi_setup.sh
|
||||
├── config
|
||||
├── lib
|
||||
├── doc
|
||||
├── opensource
|
||||
├── output
|
||||
├── README.md
|
||||
└── test
|
||||
├── mpi_local.sh
|
||||
├── mpi.sh
|
||||
└── test_tensorflow.py
|
||||
```
|
||||
目录结构说明如下:
|
||||
(1) atlasboost: 用户在训练python脚本中导入的模块;
|
||||
(2) common: C++源代码,用于收集device信息,生成rank table file;
|
||||
(3) tensorflow: 支持tensorflow框架,设置环境变量,对外提供python接口;
|
||||
(4) build: 编译脚本,用于编译common中的C++源代码;
|
||||
(5) test: 测试脚本,可用于测试运行环境;
|
||||
### 支持的产品
|
||||
Ascend 910
|
||||
### 支持的版本
|
||||
|
||||
### atlasboost引入
|
||||
(1) 将上述目录结构放入一个公共目录中。例如,在当前服务器上创建目录public,把以上目录结构放到public中,然后设置PYTHONPATH=$PYTHONPATH:./public/,外部即可使用atlasboost接口。
|
||||
(2)通过执行./setup --path dir(可选,root用户的默认目录是/usr/local/atlasboost,非root用户默认目录是/home/username/atlasboost),则会在默认路径或者dir目录下创建atlasboost文件夹,把安装的内容放在此目录下,若dir/atlasboost已经存在,则会有交互提示(是否继续在此目录下安装,请输入y/n),输入y则会覆盖此目录下重名的文件,输入n则会退出安装。
|
||||
|
||||
### 环境依赖
|
||||
atlasboost依赖于开源库Open MPI和Ascend 910软件中的DSMI接口;
|
||||
(1) 安装Open MPI
|
||||
下载4.0.2版本的Open MPI,下载地址:
|
||||
https://www.open-mpi.org/software/ompi/v4.0/
|
||||
解压
|
||||
```
|
||||
tar -jxvf openmpi-4.0.2.tar.bz2
|
||||
```
|
||||
配置,编译和安装
|
||||
```
|
||||
./configure
|
||||
make && make install
|
||||
```
|
||||
使配置生效
|
||||
```
|
||||
ldconfig
|
||||
```
|
||||
测试
|
||||
```
|
||||
mpirun --version
|
||||
```
|
||||
(2) DSMI
|
||||
atlasboost中调用DSMI接口获取device的相关信息,编译脚本compile.sh内容如下:
|
||||
```
|
||||
#!/bin/bash
|
||||
|
||||
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/Ascend/driver/kernel/inc/driver
|
||||
export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/Ascend/driver/lib64/driver
|
||||
|
||||
CUR_DIR=$(dirname $(readlink -f $0))
|
||||
cd ${CUR_DIR}/../atlasboost/common
|
||||
echo 2 > /proc/sys/kernel/randomize_va_space
|
||||
cmake .
|
||||
make
|
||||
```
|
||||
其中CPLUS_INCLUDE_PATH和LIBRARY_PATH分别指定了DSMI头文件和对应的动态链接库路径。
|
||||
### 使用说明
|
||||
提示:由于通过gethostbyname获取服务器IP,故需要配置host。
|
||||
#### 1.单机环境测试
|
||||
将源代码在服务器上解压,然后编译:
|
||||
```
|
||||
cd atlasboost/build
|
||||
./compile.sh
|
||||
```
|
||||
然后执行atlasboost/test目录下的测试脚本:
|
||||
```
|
||||
./mpi_local.sh
|
||||
```
|
||||
该测试程序创建了4条进程,分别收集了服务器上device0到device3的信息,在atlasboost/test生成一份rank_table_file,检查一下该文件中信息是否正确。
|
||||
#### 2.单机多卡训练
|
||||
将atlasboost文件夹复制到训练脚本中(只要Python导入模块时能找到) ,在python的启动脚本中导入atlasboost模块:
|
||||
```
|
||||
import atlasboost.tensorflow.mpi_ops as atlasboost
|
||||
```
|
||||
在python的启动脚本开始时调用atlasboost接口,在main函数中添加如下代码:
|
||||
```
|
||||
# 初始化时传入支持的框架(tensorflow或者mindspore),默认是tensorflow
|
||||
atlasboost.init(frame="tensorflow")
|
||||
device_id = atlasboost.local_rank()
|
||||
atlasboost.set_device_id(device_id)
|
||||
```
|
||||
提示:若非mpi启动训练任务请不要调用以上接口,并且同一台机器上的device_id不要相同。
|
||||
atlasboost模块初始化之后,每条进程会动态生成一个进程id,若在一台服务器上创建了n条进程,则进程id分别为0到n-1,用户需要根据进程id为每条进程分配一个device(process_id映射到device_id),可直接使用进程id作为device id,如上所示。
|
||||
执行命令启动训练脚本:
|
||||
```
|
||||
mpirun -np 8 -bind-to none -map-by slot --allow-run-as-root ./start.sh
|
||||
```
|
||||
其中,-np参数指定启动进程个数,该命令在当前服务器上启动8条进程,start.sh为模型的启动脚本, atlasboost模块会在当前目录为每一台服务器创建rank_table_file,文件在启动目录中。
|
||||
#### 3.多机环境部署与测试
|
||||
首先在每台参与训练的服务器中进行单机环境测试;
|
||||
在多机环境下使用atlasboost,需要配置启动训练服务器到其他参与训练服务器SSH免密登录;
|
||||
在启动服务器生成公钥:
|
||||
```
|
||||
ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
|
||||
```
|
||||
将启动服务器的公钥发送到其他每台服务器
|
||||
测试:
|
||||
```
|
||||
ssh xx.xx.xx.xx
|
||||
```
|
||||
若免密登录配置成功,则可直接使用SSH登录到xx.xx.xx.xx。
|
||||
在每台服务器的相同位置保存一份atlasboost,若OS属于不同的CPU架构(arm或X86),需要重新编译;在启动服务器中,切换到atlasboost/test目录下,配置mpi.sh脚本:
|
||||
```
|
||||
#!/bin/bash
|
||||
|
||||
mpirun -H xx.xx.xx.xx:2,xx.xx.xx.xx:4 \
|
||||
--allow-run-as-root \
|
||||
--mca btl_tcp_if_exclude lo,docker0,endvnic \
|
||||
python3 test_tensorflow.py
|
||||
```
|
||||
该脚本为在多台服务器上同时启动多条进程的命令,其中-H参数指定了启动哪些服务器上的test_tensorflow.py脚本以及每台服务器上启动几条进程,其中冒号后数值即为在该服务器上启动进程数,根据自己的环境进行配置。
|
||||
执行测试脚本:
|
||||
```
|
||||
./mpi.sh
|
||||
```
|
||||
若多机环境正常,则会在每台服务器的atlasboost/test目录下生成进程的工作目录,工作目录中生成了rank table file。
|
||||
#### 4.多机多卡训练
|
||||
训练脚本经过单机多卡分布式部署的配置之后,将训练脚本复制到每台参与训练服务器的相同位置,然后执行如下命令:
|
||||
```
|
||||
mpirun -H xx.xx.xx.xx:8,xx.xx.xx.xx:8 \
|
||||
--allow-run-as-root \
|
||||
-bind-to none -map-by slot \
|
||||
--mca btl_tcp_if_exclude lo,docker0,endvnic \
|
||||
./mpi_start.sh
|
||||
```
|
||||
该命令在每台服务器上都启动了8条进程进行训练,每台服务器都生成了rank_table_file,其中--mca btl_tcp_if_exclude参数用于限制tcp通信时使用的网卡(不使用lo,docker0,endvnic)。
|
||||
|
||||
@@ -0,0 +1,48 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import yaml
|
||||
|
||||
|
||||
def get_model_parameter(config_type):
    """Return one configuration section of the YAML file named by YAML_PATH.

    The file path is read from the ``YAML_PATH`` environment variable.

    Args:
        config_type: Top-level key of the section to return.

    Returns:
        The value stored under ``config_type``, or ``None`` when the file has
        no such key. When ``config_type`` contains "tensorflow", the keys
        "mpirun_ip" and "docker_image" are removed from the returned mapping
        if they are present.
    """
    yaml_path = os.getenv("YAML_PATH")
    with open(yaml_path, 'r') as f:
        # safe_load needs no Loader argument (yaml.load requires one on
        # PyYAML 6) and only builds plain Python objects.
        model_parameter_dict = yaml.safe_load(f) or {}
    parameter_dict = model_parameter_dict.get(config_type)
    if parameter_dict is not None and "tensorflow" in config_type:
        # A default of None makes the removal tolerant of sections that do
        # not define these keys.
        parameter_dict.pop("mpirun_ip", None)
        parameter_dict.pop("docker_image", None)
    return parameter_dict
|
||||
|
||||
|
||||
def get_environment_info(framework):
    """Collect a description of the machine and software running the benchmark.

    Args:
        framework: Framework name, compared case-insensitively. "tensorflow"
            and "pytorch" trigger an import of that framework to read its
            version; any other value leaves the framework description empty.

    Returns:
        A 5-tuple ``(cpu_info_dict, NPU_info, framework_info, os_info,
        benchmark_version)``:
        * cpu_info_dict: ``lscpu`` fields preceding the "Flags" line, as a
          dict mapping field name to value.
        * NPU_info: the fixed string "Ascend910".
        * framework_info: "<framework> <version>" or "".
        * os_info: output of ``cat /proc/version``.
        * benchmark_version: the fixed string "v1.0.0".
    """
    cpu_info = subprocess.getstatusoutput('lscpu')[1]
    # Everything from the "Flags" line onward is dropped.
    cpu_info = cpu_info.split("\nFlags")[0]

    # lscpu prints one "name: value" pair per line. Splitting each line at its
    # first colon keeps multi-word names and values intact, which splitting the
    # whole output on whitespace cannot do.
    cpu_info_dict = {}
    for line in cpu_info.splitlines():
        name, separator, value = line.partition(":")
        if not separator:
            continue  # not a "name: value" line
        cpu_info_dict[name.strip()] = value.strip()

    NPU_info = "Ascend910"

    framework_info = ""
    framework_name = framework.lower()
    if framework_name == "tensorflow":
        import tensorflow as tf
        framework_info = "tensorflow {}".format(tf.__version__)
    if framework_name == "pytorch":
        import torch
        framework_info = "pytorch {}".format(torch.__version__)

    os_info = subprocess.getstatusoutput('cat /proc/version')[1]
    benchmark_version = "v1.0.0"
    return cpu_info_dict, NPU_info, framework_info, os_info, benchmark_version
|
||||
@@ -0,0 +1,276 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import inspect
|
||||
import logging
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import uuid
|
||||
import datetime
|
||||
|
||||
|
||||
ABK_VERSION = "1.0.0"  # ABK version

# --- environment / configuration description ---
CPU_INFO = "cpu_info"
NPU_INFO = "npu_info"
OS_INFO = "os_info"
FRAMEWORK_INFO = "framework_info"
CONFIG_INFO = "config_info"
BENCHMARK_VERSION = "benchmark_version"
YAML_INFO = "yaml_info"
DATA_URL = "data_url"
DATASET = "dataset"

# --- timing and loss ---
LOSS_SCALE = "loss_scale"
ITERATION_TIME = "iteration_time"
TOTAL_RUNNING_TIME = "total_running_time"
LOSS = "loss"
MLM_LOSS = "mlm_loss"
NSP_LOSS = "nsp_loss"
Average_LOSS = "average_loss"
MASKED_LM_ACCURACY = "masked_lm_accuracy"
MASKED_LM_LOSS = "masked_lm_loss"
NEXT_SENTENCE_ACCURACY = "next_sentence_accuracy"
NEXT_SENTENCE_LOSS = "next_sentence_loss"

# --- classification metrics ---
ACC = "acc"
F1 = "f1"
PREC = "prec"
REC = "rec"

# --- run life cycle and input ---
RUN_START = "run_start"
RUN_STOP = "run_stop"
RUN_FINAL = "run_final"
INPUT_SIZE = "input_size"
INPUT_BATCH_SIZE = "input_batch_size"
GLOBAL_BATCH_SIZE = "global_batch_size"

# --- optimizer ---
OPT_NAME = "opt_name"
OPT_LR = "opt_learning_rate"
BASE_LR = "base_lr"
OPT_MOMENTUM = "opt_momentum"
OPT_WEIGHT_DECAY = "opt_weight_decay"

# --- training progress ---
GLOBAL_STEP = "global_step"
CURRENT_STEP = "current_step"
TRAIN_LOOP = "train_loop"
TOTAL_TRAIN_EPOCH = "total_train_epoch"
CURRENT_EPOCH = "current_epoch"
FPS = "fps"
THROWOUT = "throwout"
TRAIN_ACCURACY = "train_accuracy"
TRAIN_ACCURACY_TOP1 = "train_accuracy_top1"
TRAIN_ACCURACY_TOP5 = "train_accuracy_top5"
TRAIN_CHECKPOINT = "train_checkpoint"

# --- evaluation ---
EVAL_RESULTS = "eval_results"
EVAL_START = "eval_start"
EVAL_SIZE = "eval_size"
EVAL_TARGET = "eval_target"
EVAL_ACCURACY = "eval_accuracy"
EVAL_ACCURACY_TOP1 = "eval_accuracy_top1"
EVAL_ACCURACY_TOP5 = "eval_accuracy_top5"
EVAL_STOP = "eval_stop"
EVAL_ITERATION_ACCURACY = "eval_iteration_accuracy"

# Ordered tuple of every accepted tag.
REMARK_TAGS = (
    ABK_VERSION, CPU_INFO, NPU_INFO, OS_INFO, FRAMEWORK_INFO, CONFIG_INFO,
    BENCHMARK_VERSION, YAML_INFO, DATA_URL, DATASET,
    LOSS_SCALE, ITERATION_TIME, TOTAL_RUNNING_TIME,
    RUN_START, RUN_STOP, RUN_FINAL,
    INPUT_SIZE, GLOBAL_BATCH_SIZE, INPUT_BATCH_SIZE,
    OPT_NAME, TRAIN_ACCURACY, TRAIN_ACCURACY_TOP1, TRAIN_ACCURACY_TOP5,
    OPT_LR, BASE_LR, OPT_MOMENTUM, OPT_WEIGHT_DECAY,
    GLOBAL_STEP, CURRENT_STEP, TRAIN_LOOP, TOTAL_TRAIN_EPOCH, CURRENT_EPOCH,
    FPS, THROWOUT, TRAIN_CHECKPOINT,
    EVAL_START, EVAL_SIZE, EVAL_TARGET,
    EVAL_ACCURACY, EVAL_ACCURACY_TOP1, EVAL_ACCURACY_TOP5,
    EVAL_STOP, EVAL_ITERATION_ACCURACY,
    MLM_LOSS, NSP_LOSS, Average_LOSS,
    MASKED_LM_ACCURACY, MASKED_LM_LOSS,
    NEXT_SENTENCE_ACCURACY, NEXT_SENTENCE_LOSS,
    LOSS, EVAL_RESULTS, ACC, F1, PREC, REC,
)

# Set by imagenet_main.py
# The stdout set has exactly the same members as REMARK_TAGS.
STDOUT_TAG_SET = set(REMARK_TAGS)
|
||||
|
||||
|
||||
ABK_VERSION = "1.0.0" # ABK version
|
||||
ROOT_DIR = None
|
||||
PATTERN = re.compile('[a-zA-Z0-9]+')
|
||||
LOG_FILE = os.getenv("REMARK_LOG_FILE")
|
||||
LOGGER = logging.getLogger('benchmark_log')
|
||||
LOGGER.setLevel(logging.DEBUG)
|
||||
_STREAM_HANDLER = logging.StreamHandler(stream=sys.stdout)
|
||||
_STREAM_HANDLER.setLevel(logging.INFO)
|
||||
LOGGER.addHandler(_STREAM_HANDLER)
|
||||
BENCHMARK = (os.getenv("REMARK_LOG_FILE").split("_")[1]).split(".")[0]
|
||||
|
||||
|
||||
if LOG_FILE:
|
||||
_FILE_HANDLER = logging.FileHandler(LOG_FILE)
|
||||
_FILE_HANDLER.setLevel(logging.DEBUG)
|
||||
LOGGER.addHandler(_FILE_HANDLER)
|
||||
else:
|
||||
_STREAM_HANDLER.setLevel(logging.DEBUG)
|
||||
|
||||
|
||||
def get_caller(stack_index=2, root_dir=None):
    """Return "file.py:lineno" for a frame on the current call stack.

    Args:
        stack_index: Index into ``inspect.stack()`` as seen from inside this
            function. 1 is the direct caller; the default 2 is the caller's
            caller and therefore fails when the direct caller runs at global
            scope.
        root_dir: Accepted for interface compatibility; not used.

    Returns:
        The base name of the frame's source file and its current line number,
        joined by a colon.
    """
    frame_record = inspect.stack()[stack_index]
    frame_details = inspect.getframeinfo(frame_record[0])
    # Only the base name is reported, to keep log lines short.
    short_name = os.path.basename(frame_details.filename)
    return "{}:{:d}".format(short_name, frame_details.lineno)
|
||||
|
||||
|
||||
TAG_SET = set(REMARK_TAGS)


def remark_print(key, value=None, benchmark=BENCHMARK, stack_offset=0,
                 tag_set=TAG_SET, deferred=False, root_dir=ROOT_DIR,
                 extra_print=False):
    ''' Prints out a benchmark log line.

    key: The benchmark log key such as 'eval_accuracy_top1' or 'fps'.
    value: The value to log; it is serialized with json.dumps.
    benchmark: Model type, such as resnet50.
    stack_offset: Increase the value to go deeper into the stack to find the
        callsite. For example, if this is being called by a wrapper/helper you
        may want to set stack_offset=1 to use the callsite of the
        wrapper/helper itself.
    tag_set: The set of tags to which key must belong. When None, key only has
        to start with an ASCII letter or digit.
    deferred: The value is not presently known. In that case, a unique ID will
        be assigned as the value of this call and will be returned. The caller
        can then include said unique ID when the value is known later.
    root_dir: Passed through to get_caller.
    extra_print: Print a blank line before logging to clear any text in the
        line.

    Returns the generated unique ID when deferred is True, otherwise None.
    Raises ValueError for an invalid key, or when deferred is True and a value
    is given as well.

    Example output:
    :::ABK 1.0.0 resnet50 2020-08-12 06:22:09.670723 (hooks.py:149) fps: 681.8494655321242
    '''

    return_value = None

    # Validate the key. The two cases are kept apart so that tag_set=None is
    # usable: testing membership in None would raise TypeError.
    if tag_set is None:
        key_is_valid = PATTERN.match(key) is not None
    else:
        key_is_valid = key in tag_set
    if not key_is_valid:
        raise ValueError('Invalid key for MLPerf print: ' + str(key))

    if value is not None and deferred:
        raise ValueError("deferred is set to True, but a value was provided")

    if deferred:
        return_value = str(uuid.uuid4())
        value = "DEFERRED: {}".format(return_value)

    if value is None:
        tag = key
    else:
        str_json = json.dumps(value)
        tag = "{key}: {value}".format(key=key, value=str_json)

    # 2 skips this function and lands on its caller; stack_offset goes further up.
    callsite = get_caller(2 + stack_offset, root_dir=root_dir)
    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')

    message = ':::ABK {version} {benchmark} {secs} ({callsite}) {tag}'.format(
        version=ABK_VERSION, secs=now, benchmark=benchmark, callsite=callsite, tag=tag)

    if extra_print:
        print()  # There could be prior text on a line

    # NOTE(review): the lookup uses the full "key: value" text, so it only
    # matches when no value was given; lines carrying a value are logged at
    # DEBUG level. Confirm whether `key` was intended here before changing it,
    # since that would alter what appears on stdout.
    if tag in STDOUT_TAG_SET:
        LOGGER.info(message)
    else:
        LOGGER.debug(message)

    return return_value
|
||||
@@ -0,0 +1,47 @@
|
||||
#!/bin/bash

# Download and unpack the data set for the model named by $1.
#   YoLov3         COCO 2014 into /home/datasets/coco
#   Bert           /home/datasets/Bertdata (download source not configured yet)
#   anything else  ImageNet ILSVRC2012 into /home/datasets/imagenet_TF
#                  (this is also the behaviour when no argument is given)

# Stop at the first failed download or extraction instead of continuing with
# missing or partial files.
set -e

# fetch URL: download URL into the current directory under its remote name.
# -f turns HTTP errors into a failure; -L follows redirects.
fetch() {
    curl -fLO "$1"
}

# Quoting the argument keeps the comparison valid when it is empty.
case "${1:-}" in
    YoLov3)
        # Get COCO 2014 data sets
        echo 111
        mkdir -p /home/datasets/coco
        pushd /home/datasets/coco

        fetch http://images.cocodataset.org/zips/train2014.zip
        unzip train2014.zip

        fetch http://images.cocodataset.org/zips/val2014.zip
        unzip val2014.zip

        fetch http://images.cocodataset.org/annotations/annotations_trainval2014.zip
        unzip annotations_trainval2014.zip
        ;;
    Bert)
        # Get bert/cule data sets
        echo 222
        mkdir -p /home/datasets/Bertdata
        # The download URL and archive name were never filled in (they were
        # placeholder text), so report that clearly instead of running
        # commands that cannot succeed.
        echo "ERROR: no download source is configured for the Bert data set" >&2
        exit 1
        ;;
    *)
        # Get imagenet_TF data sets
        echo 333
        mkdir -p /home/datasets/imagenet_TF
        pushd /home/datasets/imagenet_TF

        fetch http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar
        tar xvf ILSVRC2012_img_val.tar

        fetch http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_train.tar
        tar xvf ILSVRC2012_img_train.tar

        fetch http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_bbox_train_v2.tar
        tar xvf ILSVRC2012_bbox_train_v2.tar
        ;;
esac

popd
|
||||
@@ -0,0 +1,23 @@
|
||||
#!/bin/bash

# Read one section of a yaml file and print its entries as key="value" lines.
# args:
#   $1: path of the yaml file
#   $2: name of the section (top-level key) to read
#
# output (stdout):
#   key1="value1"
#   key2="value2"
#   ...
#
# exit status: 1 when the file cannot be read or the section is missing/empty.
#
# The output can be loaded into shell variables with
#   `eval $(./get_params_for_yaml.sh $yamlPath $section)`

# The path and section name are handed to Python as arguments instead of being
# pasted into the program text, so quotes or backslashes in them cannot break
# the Python code.
params=$(python3.7 - "$1" "$2" <<'PY'
import sys

import yaml

path, section = sys.argv[1], sys.argv[2]
with open(path) as stream:
    config = yaml.load(stream, Loader=yaml.FullLoader) or {}
entries = (config.get(section) or {}).items()
print('\n'.join('%s="%s"' % entry for entry in entries))
PY
)
if [ x"$params" == x"" ];then
    # Callers eval this script's stdout, so the diagnostic goes to stderr.
    echo "path: $1 not found key: $2" >&2
    exit 1
fi
echo -e "$params"
|
||||
@@ -0,0 +1,9 @@
|
||||
{
|
||||
"server_count": "1",
|
||||
"server_list": [{
|
||||
"device": [{devices}],
|
||||
"server_id": "127.0.0.1"
|
||||
}],
|
||||
"status": "completed",
|
||||
"version": "1.0"
|
||||
}
|
||||
@@ -0,0 +1,40 @@
|
||||
#!/bin/bash

# Generate the rank table <modelDir>/config/<rank_size>p.json from
# hccl_sample.json by replacing its {devices} placeholder with one entry per
# device.
# args:
#   $1: rank_size       number of devices
#   $2: yamlPath        yaml configuration file
#   $3: modelDir        model directory; the file is written to its config/ directory
#   $4: config_section  yaml section to read

rank_size=$1
yamlPath=$2
modelDir=$3
config_section=$4
currentDir=$(cd "$(dirname "$0")"; pwd)

# Load the configuration from yaml; stop if the section cannot be read.
params=$("${currentDir}/get_params_for_yaml.sh" "${yamlPath}" "${config_section}") || {
    echo "ERROR: cannot read section '${config_section}' from ${yamlPath}" >&2
    exit 1
}
eval "${params}"

# Device list: the yaml key device_group_<rank_size>p when set, otherwise
# devices 0 .. rank_size-1 in order.
device_group_key="device_group_${rank_size}p"
device_group=${!device_group_key}
if [ -z "${device_group}" ];then
    device_group="$(seq 0 $((rank_size - 1)))"
fi

arr=($device_group)
if [ ${#arr[@]} -ne ${rank_size} ];then
    echo "ERROR: device_group: $device_group, quantity is not equal to rank_size: $rank_size"
    exit 1
fi

HCCL_dir=$modelDir/config
target_json="${HCCL_dir}/${rank_size}p.json"
# Create the target directory if needed and stop when the template cannot be copied.
mkdir -p "${HCCL_dir}" || exit 1
cp "${currentDir}/hccl_sample.json" "${target_json}" || exit 1

DEVICES=""

rank_id=0
for device_id in $device_group;do
    # hccn_tool prints a line containing "ipaddr:<address>"; take the address.
    DEVICE_IP=`hccn_tool -i ${device_id} -ip -g|awk -F ":" '/ipaddr/{print $2}'`
    if [ -z "${DEVICE_IP}" ];then
        echo "WARNING: no IP address found for device ${device_id}" >&2
    fi
    # The \n sequences stay literal here; sed turns them into line breaks below.
    DEVICES+="\n\
{\n\
\"device_id\": \"${device_id}\",\n\
\"device_ip\": \"${DEVICE_IP}\",\n\
\"rank_id\": \"${rank_id}\"\n\
},"
    rank_id=$((rank_id + 1))
done
# ${DEVICES%?} drops the trailing comma after the last entry.
sed -i 's#{devices}#'"${DEVICES%?}"'#g' "${target_json}"
|
||||
@@ -0,0 +1,139 @@
|
||||
#!/bin/bash

# Start a training job on the host or in docker and wait until it finishes or
# the timeout expires.
# args:
#   $1: model      where to run: "docker" or "host"
#   $2: hardware   "cluster" for multi-server runs, otherwise "<N>p" (e.g. 1p, 8p)
#   $3: yamlPath   yaml configuration file
#   $4: modelDir   model directory; <modelDir>/scripts/run.sh is executed
#   $5: framework  "tensorflow", "pytorch"; any other value selects mindspore_config

model=$1
hardware=$2
yamlPath=$3
modelDir=$4
framework=$5

modelScripts="$modelDir/scripts"

currentDir=$(cd "$(dirname "$0")"; pwd)
yamlDir=$(cd "$(dirname "${yamlPath}")";pwd)
train_dir=${currentDir%train*}/train
# Maximum run time in seconds. Set before the yaml values are loaded, as in the
# original order of statements.
timeout=360000

# Load the configuration from yaml.
if [ x"${framework}" == x"tensorflow" ]; then
    config_section="tensorflow_config"
elif [ x"${framework}" == x"pytorch" ]; then
    config_section="pytorch_config"
else
    config_section="mindspore_config"
fi
eval $(${currentDir}/get_params_for_yaml.sh ${yamlPath} ${config_section})

if [ x"${hardware}" == x"cluster" ];then
    export CLUSTER=True
    # mpirun_ip has the form "ip:n,ip:n,...". IFS is changed for this one read
    # only, so word splitting in the rest of the script keeps its default
    # behaviour (the device loop and the timeout clean-up rely on it).
    IFS=',' read -r -a array <<< "${mpirun_ip}"
    m=${array[0]#*:}
    rank_size=0
    mpirun_all_ip=""
    for var in "${array[@]}"; do
        n=${var#*:}
        mpirun_all_ip+=" ${var%:*}"
        # Every server must use the same process count, and that count must be
        # a positive power of two.
        if ! [[ "${n}" =~ ^[1-9][0-9]*$ ]] \
            || [ $(( n & (n - 1) )) -ne 0 ] \
            || [ "${n}" != "${m}" ];then
            echo "mpirun_ip: $mpirun_ip error"
            exit 1
        fi
        rank_size=$(( rank_size + n ))
    done
    # Drop the leading space.
    export MPIRUN_ALL_IP=${mpirun_all_ip#?}
else
    # "8p" -> 8
    rank_size=${hardware%?}
fi

# Device list: the yaml key device_group_<rank_size>p, or devices
# 0 .. rank_size-1 when it is unset or rank_size is 8 or more.
device_group_key="device_group_${rank_size}p"
device_group=${!device_group_key}
if [ x"${device_group}" == x"" ] || [ ${rank_size} -ge 8 ];then
    device_group="$(seq 0 "$(expr $rank_size - 1)")"
fi

# Model name = yaml file name up to its first dot. It is computed before the
# framework branches because both of them read it.
yaml_file_name=${yamlPath##*/}
train_model_name=${yaml_file_name%%.*}

# Paths to map into the tensorflow docker container.
if [ x"${framework}" == x"tensorflow" ]; then
    if [ x"${hardware}" != x"cluster" ];then
        # The rank table json is only needed for single-server runs.
        bash ${currentDir}/set_json.sh ${rank_size} ${yamlPath} ${modelDir} ${config_section} || exit 1
    fi
    if [ x"${train_model_name}" == x"Bert-Base" ] || [ x"${train_model_name}" == x"Bert-Large" ]; then
        data_urls="-v ${input_files_dir}:${input_files_dir} -v ${eval_files_dir}:${eval_files_dir}"
    elif [ x"${train_model_name}" == x"MobileNet" ] || [ x"${train_model_name}" == x"YoLoV3" ]; then
        data_urls="-v ${data_url}:${data_url} -v ${ckpt_path}:${ckpt_path}"
    elif [ x"${train_model_name}" == x"SSD-Resnet34" ]; then
        raw_data=${training_file_pattern%raw_data*}raw_data
        data_urls="-v ${raw_data}:${raw_data}"
    else
        data_urls="-v ${data_url}:${data_url}"
    fi
fi

# Paths to map into the pytorch docker container.
if [ x"${framework}" == x"pytorch" ]; then
    if [ x"${train_model_name}" == x"ResNet50" ]; then
        data_urls="-v ${data_url}:${data_url} -v ${ckpt_path}:${ckpt_path}"
    else
        data_urls="-v ${data_url}:${data_url}"
    fi
fi

if [ x"$model" == x"docker" ];then
    # Run in docker.
    if [ x"${hardware}" == x"cluster" ];then
        # Multi-server: run inside the container named "mpirun".
        docker exec -i mpirun /bin/bash -c "${modelScripts}/run.sh ${rank_size} ${yamlPath} ${currentDir} ${CLUSTER} '${MPIRUN_ALL_IP}'" &
    else
        # One --device option per selected davinci device.
        device_args=()
        for device_id in $device_group;do
            device_args+=("--device=/dev/davinci${device_id}")
        done
        docker run -i --ipc=host \
            "${device_args[@]}" --device=/dev/davinci_manager \
            --device=/dev/devmm_svm --device=/dev/hisi_hdc \
            -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
            -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
            -v ${train_dir}:${train_dir} \
            -v ${modelDir}:${modelDir} \
            ${data_urls} \
            -v ${yamlDir}:${yamlDir} \
            -v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf \
            -v /var/log/npu/slog/:/var/log/npu/slog -v /var/log/npu/profiling/:/var/log/npu/profiling \
            -v /var/log/npu/dump/:/var/log/npu/dump -v /var/log/npu/:/usr/slog ${docker_image} \
            /bin/bash -c "${modelScripts}/run.sh ${rank_size} ${yamlPath} ${currentDir}" &
    fi
elif [ x"$model" == x"host" ]; then
    # Run on the host.
    bash ${modelScripts}/run.sh ${rank_size} ${yamlPath} ${currentDir} &
else
    # Nothing was started, so there is nothing to wait for.
    echo "[`date +%Y%m%d-%H:%M:%S`] [ERROR] unknown execution side: '${model}' (expected docker or host)" >&2
    exit 1
fi
workshell=$!
timeused=0
while true
do
    # The job counts as running while a process line mentions both run.sh and
    # the PID of the background job.
    ret=`ps -ef | grep ${modelScripts}/run.sh | grep ${workshell} | grep -v grep`
    if [ x"${ret}" = x ];
    then
        break
    else
        echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] train job is working, wait more 5s "
        sleep 5
        let timeused+=5
        # Once the configured timeout is exceeded, kill the python training processes.
        if [ ${timeused} -gt ${timeout} ];
        then
            echo "[`date +%Y%m%d-%H:%M:%S`] [ERROR] training timeout ! "
            # Find the python processes: children of the grandchildren of the background job.
            train_sh_pid=`pgrep -P $(pgrep -P $workshell)`
            for pid in $train_sh_pid
            do
                id=`pgrep -P $pid`
                # Skip entries without children; kill needs at least one PID.
                if [ -n "${id}" ];then
                    kill -9 $id
                fi
            done
            break
        fi
    fi
done

echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] process end "
|
||||
Reference in New Issue
Block a user