[add]上传训练benchmark by z00560161

2020-10-19 20:22:23 +08:00
parent 22b83024f5
commit 82522e2f61
1225 changed files with 345421 additions and 0 deletions
@@ -0,0 +1,135 @@
+# VERSION: 20.0.0.RC1
+# Note: download the Ascend, CMake and Open MPI packages into this directory beforehand.
+#
+# Proxy: no proxy address or credentials are stored in this file. When the build host
+# needs a proxy, supply it at build time through Docker's predefined build arguments:
+#   docker build --build-arg http_proxy=http://USER:PASSWORD@HOST:PORT --build-arg https_proxy=http://USER:PASSWORD@HOST:PORT .
+# The predefined proxy arguments are visible to RUN steps without being declared here.
+FROM ubuntu:18.04
+
+ENV no_proxy=127.0.0.1,.huawei.com,localhost,local,.local
+
+
+# Build-time parameters; override any of them with --build-arg.
+ARG TF_PKG=tensorflow-1.15.0-cp37-cp37m-linux_aarch64.whl
+ARG HOST_ASCEND_BASE=/usr/local/Ascend
+ARG NNAE_PATH=/usr/local/Ascend/nnae/latest
+ARG TF_PLUGIN_PATH=/usr/local/Ascend/tfplugin/latest
+ARG INSTALL_ASCEND_PKGS_SH=install_ascend_pkgs.sh
+ARG PREBUILD_SH=prebuild.sh
+ARG POSTBUILD_SH=postbuild.sh
+# The whole build context (installers, tarballs, helper scripts) is staged in /tmp.
+WORKDIR /tmp
+COPY . ./
+COPY sources.list /etc/apt/
+COPY pip.conf /root/.pip/
+
+# Run prebuild.sh when it exists; its exit status is deliberately ignored (|| true).
+RUN bash -c "test -f $PREBUILD_SH && bash $PREBUILD_SH || true"
+
+
+# System packages.
+# `apt-get update` runs in the same layer as the install that depends on it, so a cached,
+# stale package index is never reused. The package lists are intentionally left in place
+# because the pip bootstrap step that follows calls `apt-get download`.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        curl \
+        g++ \
+        gcc \
+        gfortran \
+        libblas-dev \
+        libblas3 \
+        libffi-dev \
+        libhdf5-dev \
+        libicu60 \
+        liblapack-dev \
+        liblapack3 \
+        libssl-dev \
+        libxml2 \
+        pkg-config \
+        python3.7 \
+        python3.7-dev \
+        unzip
+# System packages needed by the benchmark (installed with recommended packages, as before).
+RUN apt-get update && \
+    apt-get install -y \
+        inetutils-ping \
+        iproute2 \
+        net-tools \
+        openssh-client \
+        openssh-server \
+        psmisc \
+        vim \
+        wget
+
+# pip for Python 3.7.
+# The unversioned https://bootstrap.pypa.io/get-pip.py only supports newer Python releases
+# and aborts on 3.7, so the script published for 3.7 is fetched instead. `curl -f` makes an
+# HTTP error fail the build instead of saving an error page as get-pip.py.
+# python3-distutils is unpacked straight into / so that python3.7 can import distutils.
+# The working directory is already /tmp, so no `cd` is needed.
+RUN curl -fsSL https://bootstrap.pypa.io/pip/3.7/get-pip.py -o get-pip.py && \
+    apt-get download python3-distutils && \
+    dpkg-deb -x python3-distutils_*.deb / && \
+    rm python3-distutils_*.deb && \
+    python3.7 get-pip.py && \
+    rm get-pip.py
+
+# HwHiAiUser account and group
+RUN groupadd HwHiAiUser && \
+    useradd -g HwHiAiUser -m -d /home/HwHiAiUser HwHiAiUser
+
+# Python packages. Each package is installed by its own pip call, in the original order;
+# --no-cache-dir keeps pip's download cache out of the image layer.
+RUN pip3.7 install --no-cache-dir numpy && \
+    pip3.7 install --no-cache-dir decorator && \
+    pip3.7 install --no-cache-dir attrs && \
+    pip3.7 install --no-cache-dir sympy==1.4 && \
+    pip3.7 install --no-cache-dir cffi==1.12.3 && \
+    pip3.7 install --no-cache-dir pyyaml && \
+    pip3.7 install --no-cache-dir wheel && \
+    pip3.7 install --no-cache-dir pathlib2 && \
+    pip3.7 install --no-cache-dir grpcio && \
+    pip3.7 install --no-cache-dir grpcio-tools && \
+    pip3.7 install --no-cache-dir protobuf && \
+    pip3.7 install --no-cache-dir scipy && \
+    pip3.7 install --no-cache-dir Pillow==5.3.0 && \
+    pip3.7 install --no-cache-dir torchvision --no-deps && \
+    pip3.7 install --no-cache-dir requests
+
+# Ascend packages
+RUN bash $INSTALL_ASCEND_PKGS_SH
+
+# Build and install CMake from the tarball in the build context.
+# Extraction, build, install and removal of the source tree happen in ONE layer so the
+# build tree is not stored in the image; that is why `cd` is used here instead of WORKDIR.
+RUN tar -zxf cmake-3.18.0.tar.gz && \
+    cd /tmp/cmake-3.18.0 && \
+    ./configure --prefix=/usr/local/cmake-3.18.0 && \
+    make && \
+    make install && \
+    cd /tmp && \
+    rm -rf /tmp/cmake-3.18.0
+
+# Build and install the Open MPI open-source library the same way.
+RUN tar -jxf openmpi-4.0.2.tar.bz2 && \
+    cd /tmp/openmpi-4.0.2 && \
+    ./configure --prefix=/usr/local/mpirun4.0.2 && \
+    make && \
+    make install && \
+    cd /tmp && \
+    rm -rf /tmp/openmpi-4.0.2
+
+# Later steps expect to run from /tmp.
+WORKDIR /tmp
+
+# TensorFlow installation
+ENV LD_LIBRARY_PATH=\
+/usr/lib/aarch64-linux-gnu/hdf5/serial:\
+$HOST_ASCEND_BASE/add-ons:\
+$NNAE_PATH/fwkacllib/lib64:\
+$HOST_ASCEND_BASE/driver/lib64/common:\
+$HOST_ASCEND_BASE/driver/lib64/driver:$LD_LIBRARY_PATH
+
+RUN pip3.7 install $TF_PKG
+
+# Environment variables
+ENV GLOG_v=2
+ENV TBE_IMPL_PATH=$NNAE_PATH/opp/op_impl/built-in/ai_core/tbe
+ENV TF_PLUGIN_PKG=$TF_PLUGIN_PATH/tfplugin/python/site-packages
+ENV FWK_PYTHON_PATH=$NNAE_PATH/fwkacllib/python/site-packages
+ENV PATH=$NNAE_PATH/fwkacllib/ccec_compiler/bin:$PATH
+ENV ASCEND_OPP_PATH=$NNAE_PATH/opp
+ENV PYTHONPATH=\
+$FWK_PYTHON_PATH:\
+$FWK_PYTHON_PATH/auto_tune.egg:\
+$FWK_PYTHON_PATH/schedule_search.egg:\
+$TF_PLUGIN_PKG:\
+$TBE_IMPL_PATH:\
+$PYTHONPATH
+ENV OPENMPI=/usr/local/mpirun4.0.2/
+# Prepend the Open MPI libraries instead of replacing the variable: assigning only
+# $OPENMPI/lib/ here would discard the HDF5 and Ascend library paths set above.
+ENV LD_LIBRARY_PATH=$OPENMPI/lib/:$LD_LIBRARY_PATH
+ENV PATH=$OPENMPI/bin:$PATH
+
+# Passwordless SSH login: generate a root key pair, authorise it for root itself, relax
+# host-key checking for outgoing connections and set the root password.
+# NOTE(review): the root password is hard-coded and trivial; anyone who can reach the
+# container's sshd can log in as root. Confirm this image is only used on isolated networks.
+RUN ssh-keygen -t rsa -f ~/.ssh/id_rsa -P '' && cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys && \
+    sed -i 's/PermitEmptyPasswords yes/PermitEmptyPasswords no /' /etc/ssh/sshd_config && \
+    sed -i 's/PermitRootLogin without-password/PermitRootLogin yes /' /etc/ssh/sshd_config && \
+    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config && \
+    echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
+    echo "root:1234" | chpasswd
+
+# Run postbuild.sh when it exists, then delete it. `rm -f` keeps the build from failing
+# when the script is absent, matching the "optional script" behaviour of the test above.
+RUN bash -c "test -f $POSTBUILD_SH && bash $POSTBUILD_SH || true" && \
+    rm -f $POSTBUILD_SH
+
+# Start sshd, then drop into a shell. The container runs as root and this image never
+# installs sudo, so the service is started directly.
+CMD [ "sh", "-c", "service ssh start; bash"]
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+#--------------------------------------------------------------------------------
+# VERSION: 20.0.0.RC1
+# 请在此处使用使用bash语法编写脚本代码，安装昇腾软件包
+#
+# 注：本脚本运行结束后不会被自动清除，若无需保留在镜像中请在postbuild.sh脚本中清除
+#--------------------------------------------------------------------------------
+
+ASCEND_NNAE=Ascend-cann-nnae_20.1.0.B030_linux-aarch64.run
+ASCEND_TFPLUGIN=Ascend-fwk-tfplugin_20.1.0.B030_linux-aarch64.run
+
+# 构建之前把host上的/etc/ascend_install.info拷贝一份到当前目录
+cp ascend_install.info /etc/
+# 构建之前把host的/usr/local/Ascend/driver/version.info拷贝一份到当前目录
+mkdir -p /usr/local/Ascend/driver/
+cp version.info /usr/local/Ascend/driver/
+# Ascend-NNAE-20.0.0.B001-arm64-linux_gcc7.3.0.run
+chmod +x ${ASCEND_NNAE}
+./${ASCEND_NNAE} --install-path=/usr/local/Ascend/ --install --quiet
+# Ascend-TFPlugin-20.0.0.B001-arm64-linux_gcc7.3.0.run
+chmod +x ${ASCEND_TFPLUGIN}
+./${ASCEND_TFPLUGIN} --install-path=/usr/local/Ascend/ --install --quiet
+
+# 只为了安装nnae包，所以需要清理，容器启动时通过ascend docker挂载进来
+rm -f version.info
+rm -rf /usr/local/Ascend/driver/
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+#--------------------------------------------------------------------------------
+# VERSION: 20.0.0.RC1
+# 请在此处使用使用bash语法编写脚本代码，清除不需要保留在容器中的安装包、脚本、代理配置等
+# 本脚本将会在正式构建过程结束后被执行
+#
+# 注：本脚本运行结束后会被自动清除，不会残留在镜像中；脚本所在位置和Working Dir位置为/tmp
+#--------------------------------------------------------------------------------
+rm -f ascend_install.info
+rm -f prebuild.sh
+rm -f install_ascend_pkgs.sh
+rm -f Dockerfile*
+rm -f cmake*
+rm -f openmpi*
+rm -f Ascend-cann-nnae_20.1.0.B030_linux-aarch64.run
+rm -f Ascend-fwk-tfplugin_20.1.0.B030_linux-aarch64.run
+rm -f tensorflow-1.15.0-cp37-cp37m-linux_aarch64.whl
+# rm -f /etc/apt/apt.conf.d/80proxy
+tee /etc/resolv.conf <<- EOF
+# This file is managed by man:systemd-resolved(8). Do not edit.
+#
+# This is a dynamic resolv.conf file for connecting local clients to the
+# internal DNS stub resolver of systemd-resolved. This file lists all
+# configured search domains.
+#
+# Run "systemd-resolve --status" to see details about the uplink DNS servers
+# currently in use.
+#
+# Third party programs must not access this file directly, but only through the
+# symlink at /etc/resolv.conf. To manage man:resolv.conf(5) in a different way,
+# replace this symlink by a static file or a different symlink.
+#
+# See man:systemd-resolved.service(8) for details about the supported modes of
+# operation for /etc/resolv.conf.
+options edns0
+nameserver 8.8.8.8
+nameserver 8.8.4.4
+EOF
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+#--------------------------------------------------------------------------------
+# VERSION: 20.0.0.RC1
+# 请在此处使用使用bash语法编写脚本代码，执行安装准备工作，例如配置代理等
+# 本脚本将会在正式构建过程启动前被执行
+#
+# 注：本脚本运行结束后不会被自动清除，若无需保留在镜像中请在postbuild.sh脚本中清除
+#--------------------------------------------------------------------------------
+#dns代理配置，修改“/etc/resolv.conf”文件，在文件中加入如下粗体内容，用户需根据实际情况进行配置。
+tee /etc/resolv.conf <<- EOF
+nameserver 10.72.255.100
+EOF
+
+
+	
@@ -0,0 +1,166 @@
+## atlasboost
+[TOC]
+### 产品介绍
+atlasboost提供了如下功能：
+(1) 一键式地启动单机或多机上的训练脚本，并行执行训练任务；
+(2) 自动收集参与训练的device信息，生成rank table file；
+(3) 通过mpi重定向功能，可实时的监控训练过程；
+
+### 目录结构
+源代码的目录结构如下：
+```
+.
+├── atlasboost
+│   ├── common
+│   │   ├── bin
+│   │   ├── CMakeLists.txt
+│   │   ├── context.cpp
+│   │   ├── context.h
+│   │   ├── control.cpp
+│   │   ├── control.h
+│   │   ├── json.cpp
+│   │   ├── json.h
+│   │   ├── operations.cpp
+│   │   └── operations.h
+│   └── tensorflow
+│       ├── basics.py
+│       ├── __init__.py
+│       └── mpi_ops.py
+├── build
+│   ├── build.sh
+│   ├── compile.sh
+│   ├── compile_for_ci.sh
+│   └── openmpi_setup.sh
+├── config
+├── lib
+├── doc
+├── opensource
+├── output
+├── README.md
+└── test
+    ├── mpi_local.sh
+    ├── mpi.sh
+    └── test_tensorflow.py
+```
+目录结构说明如下：
+(1) atlasboost: 用户在训练python脚本中导入的模块；
+(2) common: C++源代码，用于收集device信息，生成rank table file；
+(3) tensorflow: 支持tensorflow框架，设置环境变量，对外提供python接口；
+(4) build: 编译脚本，用于编译common中的C++源代码；
+(5) test: 测试脚本，可用于测试运行环境；
+### 支持的产品
+Ascend 910
+### 支持的版本
+
+### atlasboost引入
+（1）按照目录结构放入到一个公共的目录中，比如在当前服务器创建一个目录public，把以上目录结构放到public中，然后设置PYTHONPATH=$PYTHONPATH:./public/，外部就可以使用atlasboost接口了。
+（2）通过执行./setup --path dir(可选，root用户的默认目录是/usr/local/atlasboost,非root用户默认目录是/home/username/atlasboost),则会在默认路径或者dir目录下创建atlasboost文件夹,把安装的内容放在此目录下,若dir/atlasboost已经存在,则会有交互提示(是否继续在此目录下安装,请输入y/n),输入y则会覆盖此目录下重名的文件,输入n则会退出安装。
+
+### 环境依赖
+atlasboost依赖于开源库Open MPI和Ascend 910软件中的DSMI接口；
+(1) 安装Open MPI
+下载4.0.2版本的Open MPI，下载地址：
+https://www.open-mpi.org/software/ompi/v4.0/
+解压
+```
+tar -jxvf openmpi-4.0.2.tar.bz2
+```
+配置，编译和安装
+```
+./configure
+make && make install
+```
+使配置生效
+```
+ldconfig
+```
+测试
+```
+mpirun --version
+```
+(2) DSMI
+atlasboost中调用DSMI接口获取device的相关信息，编译脚本compile.sh内容如下：
+```
+#!/bin/bash
+
+export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/Ascend/driver/kernel/inc/driver
+export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/Ascend/driver/lib64/driver
+
+CUR_DIR=$(dirname $(readlink -f $0))
+cd ${CUR_DIR}/../atlasboost/common
+echo 2 > /proc/sys/kernel/randomize_va_space
+cmake .
+make
+```
+其中CPLUS_INCLUDE_PATH和LIBRARY_PATH分别指定了DSMI头文件和对应的动态链接库路径。
+### 使用说明
+提示：由于通过gethostbyname获取服务器IP，故需要配置host。
+#### 1.单机环境测试
+将源代码在服务器上解压，然后编译：
+```
+cd atlasboost/build
+./compile.sh
+```
+然后执行atlasboost/test目录下的测试脚本：
+```
+./mpi_local.sh
+```
+该测试程序创建了4条进程，分别收集了服务器上device0到device3的信息，在atlasboost/test生成一份rank_table_file，检查一下该文件中信息是否正确。
+#### 2.单机多卡训练
+将atlasboost文件夹复制到训练脚本中(只要Python导入模块时能找到) ，在python的启动脚本中导入atlasboost模块：
+```
+import atlasboost.tensorflow.mpi_ops as atlasboost
+```
+在python的启动脚本开始时调用atlasboost接口，在main函数中添加如下代码：
+```
+# 初始化时传入支持的框架(tensorflow或者mindspore)，默认是tensorflow
+atlasboost.init(frame="tensorflow")
+device_id = atlasboost.local_rank()
+atlasboost.set_device_id(device_id)
+```
+提示：若非mpi启动训练任务请不要调用以上接口,并且同一台机器上的device_id不要相同。
+atlasboost模块初始化之后，每条进程会动态生成一个进程id，若在一台服务器上创建了n条进程，则进程id分别为0到n-1，用户需要根据进程id为每条进程分配一个device(process_id映射到device_id)，可直接使用进程id作为device id，如上所示。
+执行命令启动训练脚本：
+```
+mpirun -np 8 -bind-to none -map-by slot --allow-run-as-root ./start.sh
+```
+其中，-np参数指定启动进程个数，该命令在当前服务器上启动8条进程，start.sh为模型的启动脚本， atlasboost模块会在当前目录为每一台服务器创建rank_table_file，文件在启动目录中。
+#### 3.多机环境部署与测试
+首先在每台参与训练的服务器中进行单机环境测试；
+在多机环境下使用atlasboost，需要配置启动训练服务器到其他参与训练服务器SSH免密登录；
+在启动服务器生成公钥：
+```
+ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
+```
+将启动服务器的公钥发送到其他每台服务器
+测试：
+```
+ssh xx.xx.xx.xx
+```
+若免密登录配置成功，则可直接使用SSH登录到xx.xx.xx.xx。
+在每台服务器的相同位置保存一份atlasboost，若OS属于不同的CPU架构(arm或X86)，需要重新编译；在启动服务器中，切换到atlasboost/test目录下，配置mpi.sh脚本：
+```
+#!/bin/bash
+
+mpirun -H xx.xx.xx.xx:2,xx.xx.xx.xx:4 \
+       --allow-run-as-root \
+       --mca btl_tcp_if_exclude lo,docker0,endvnic \
+       python3 test_tensorflow.py
+```
+该脚本为在多台服务器上同时启动多条进程的命令，其中-H参数指定了启动哪些服务器上的test_tensorflow.py脚本以及每台服务器上启动几条进程，其中冒号后数值即为在该服务器上启动进程数，根据自己的环境进行配置。
+执行测试脚本：
+```
+./mpi.sh
+```
+若多机环境正常，则会在每台服务器的atlasboost/test目录下生成进程的工作目录，工作目录中生成了rank table file。
+#### 4.多机多卡训练
+训练脚本经过单机多卡分布式部署的配置之后，将训练脚本复制到每台参与训练服务器的相同位置，然后执行如下命令：
+```
+mpirun -H xx.xx.xx.xx:8,xx.xx.xx.xx:8 \
+       --allow-run-as-root \
+       -bind-to none -map-by slot \
+       --mca btl_tcp_if_exclude lo,docker0,endvnic \
+       ./mpi_start.sh
+```
+该命令在每台服务器上都启动了8条进程进行训练，每台服务器都生成了rank_table_file，其中--mca btl_tcp_if_exclude参数用于限制tcp通信时使用的网卡(不使用lo,docker0,endvnic)。
+
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+
+import os
+import subprocess
+import yaml
+
+
+def get_model_parameter(config_type):
+    yaml_path = os.getenv("YAML_PATH")
+    with open(yaml_path, 'r') as f:
+        model_parameter_dict = yaml.load(f)
+    parameter_dict = model_parameter_dict.get(config_type)
+    if "tensorflow" in config_type:
+        parameter_dict.pop("mpirun_ip")
+    parameter_dict.pop("docker_image")
+    return parameter_dict
+
+
+def get_environment_info(framework):
+    cpu_info = subprocess.getstatusoutput('lscpu')[1]
+    cpu_info = cpu_info.split("\nFlags")[0]
+    cpu_info_list = cpu_info.split()
+    cpu_info_keys = []
+    cpu_info_values = []
+    value_info = ""
+    for i in cpu_info_list:
+        if ":" not in i:
+            value_info += i
+        else:
+            i = i.split(":")[0]
+            cpu_info_keys.append(i)
+            if value_info:
+                cpu_info_values.append(value_info)
+            value_info = ""
+    cpu_info_dict = {}
+    for k, v in zip(cpu_info_keys, cpu_info_values):
+        cpu_info_dict[k] = v
+    NPU_info = "Ascend910"
+    framework_info = ""
+    if framework.lower() == "tensorflow":
+        import tensorflow as tf
+        framework_info = "tensorflow {}".format(tf.__version__)
+    if framework.lower() == "pytorch":
+        import torch
+        framework_info = "pytorch {}".format(torch.__version__)
+    os_info = subprocess.getstatusoutput('cat /proc/version')[1]
+    benchmark_version = "v1.0.0"
+    return cpu_info_dict, NPU_info, framework_info, os_info, benchmark_version
@@ -0,0 +1,276 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import inspect
+import logging
+import json
+import os
+import re
+import sys
+import uuid
+import datetime
+
+
+# Version string that remark_print() puts into every log line.
+ABK_VERSION = "1.0.0"   # ABK version
+# Tag names. Each constant holds the literal key string that is passed to
+# remark_print() and written to the log; the constants are collected into
+# STDOUT_TAG_SET and REMARK_TAGS below.
+CPU_INFO = "cpu_info"
+NPU_INFO = "npu_info"
+OS_INFO = "os_info"
+FRAMEWORK_INFO = "framework_info"
+CONFIG_INFO = "config_info"
+BENCHMARK_VERSION = "benchmark_version"
+YAML_INFO = "yaml_info"
+DATA_URL = "data_url"
+LOSS_SCALE = "loss_scale"
+ITERATION_TIME = "iteration_time"
+TOTAL_RUNNING_TIME = "total_running_time"
+LOSS = "loss"
+MLM_LOSS = "mlm_loss"
+NSP_LOSS = "nsp_loss"
+Average_LOSS = "average_loss"
+MASKED_LM_ACCURACY = "masked_lm_accuracy"
+MASKED_LM_LOSS = "masked_lm_loss"
+NEXT_SENTENCE_ACCURACY = "next_sentence_accuracy"
+NEXT_SENTENCE_LOSS = "next_sentence_loss"
+GLOBAL_BATCH_SIZE = "global_batch_size"
+ACC = "acc"
+F1 = "f1"
+PREC = "prec"
+REC = "rec"
+RUN_START = "run_start"
+RUN_STOP = "run_stop"
+RUN_FINAL = "run_final"
+INPUT_SIZE = "input_size"
+INPUT_BATCH_SIZE = "input_batch_size"
+OPT_NAME = "opt_name"
+OPT_LR = "opt_learning_rate"
+OPT_MOMENTUM = "opt_momentum"
+OPT_WEIGHT_DECAY = "opt_weight_decay"
+GLOBAL_STEP = "global_step"
+CURRENT_STEP = "current_step"
+EVAL_RESULTS = "eval_results"
+TRAIN_LOOP = "train_loop"
+TOTAL_TRAIN_EPOCH = "total_train_epoch"
+CURRENT_EPOCH = "current_epoch"
+FPS = "fps"
+THROWOUT = "throwout"
+TRAIN_ACCURACY = "train_accuracy"
+TRAIN_ACCURACY_TOP1 = "train_accuracy_top1"
+TRAIN_ACCURACY_TOP5 = "train_accuracy_top5"
+TRAIN_CHECKPOINT = "train_checkpoint"
+EVAL_START = "eval_start"
+EVAL_SIZE = "eval_size"
+EVAL_TARGET = "eval_target"
+EVAL_ACCURACY = "eval_accuracy"
+EVAL_ACCURACY_TOP1 = "eval_accuracy_top1"
+EVAL_ACCURACY_TOP5 = "eval_accuracy_top5"
+EVAL_STOP = "eval_stop"
+EVAL_ITERATION_ACCURACY = "eval_iteration_accuracy"
+DATASET = "dataset"
+BASE_LR = "base_lr"
+# Set by imagenet_main.py
+# remark_print() logs a line at INFO level when its formatted tag is a member of this
+# set, and at DEBUG level otherwise.
+# NOTE(review): ABK_VERSION holds the version string "1.0.0", not a tag name, so the
+# first entry is "1.0.0" -- confirm that this is intended.
+STDOUT_TAG_SET = {
+    ABK_VERSION,
+    CPU_INFO,
+    NPU_INFO,
+    OS_INFO,
+    FRAMEWORK_INFO,
+    CONFIG_INFO,
+    BENCHMARK_VERSION,
+    YAML_INFO,
+    DATA_URL,
+    DATASET,
+    TRAIN_ACCURACY,
+    LOSS_SCALE,
+    ITERATION_TIME,
+    TOTAL_RUNNING_TIME,
+    RUN_START,
+    RUN_STOP,
+    RUN_FINAL,
+    INPUT_SIZE,
+    GLOBAL_BATCH_SIZE,
+    INPUT_BATCH_SIZE,
+    OPT_NAME,
+    OPT_LR,
+    BASE_LR,
+    OPT_MOMENTUM,
+    OPT_WEIGHT_DECAY,
+    GLOBAL_STEP,
+    CURRENT_STEP,
+    TRAIN_LOOP,
+    TRAIN_ACCURACY_TOP1,
+    TRAIN_ACCURACY_TOP5,
+    TOTAL_TRAIN_EPOCH,
+    CURRENT_EPOCH,
+    FPS,
+    THROWOUT,
+    TRAIN_CHECKPOINT,
+    EVAL_START,
+    EVAL_SIZE,
+    EVAL_TARGET,
+    EVAL_ACCURACY,
+    EVAL_ACCURACY_TOP1,
+    EVAL_ACCURACY_TOP5,
+    EVAL_STOP,
+    EVAL_ITERATION_ACCURACY,
+    MLM_LOSS,
+    NSP_LOSS,
+    Average_LOSS,
+    MASKED_LM_ACCURACY,
+    MASKED_LM_LOSS,
+    NEXT_SENTENCE_ACCURACY,
+    NEXT_SENTENCE_LOSS,
+    LOSS,
+    EVAL_RESULTS,
+    ACC,
+    F1,
+    PREC,
+    REC,
+}
+
+
+# Keys that remark_print() accepts by default: TAG_SET is built from this tuple and
+# a key outside the set raises ValueError.
+# NOTE(review): ABK_VERSION holds the version string "1.0.0", not a tag name --
+# confirm that this entry is intended.
+REMARK_TAGS = (
+    ABK_VERSION,
+    CPU_INFO,
+    NPU_INFO,
+    OS_INFO,
+    FRAMEWORK_INFO,
+    CONFIG_INFO,
+    BENCHMARK_VERSION,
+    YAML_INFO,
+    DATA_URL,
+    DATASET,
+    LOSS_SCALE,
+    ITERATION_TIME,
+    TOTAL_RUNNING_TIME,
+    RUN_START,
+    RUN_STOP,
+    RUN_FINAL,
+    INPUT_SIZE,
+    GLOBAL_BATCH_SIZE,
+    INPUT_BATCH_SIZE,
+    OPT_NAME,
+    TRAIN_ACCURACY,
+    TRAIN_ACCURACY_TOP1,
+    TRAIN_ACCURACY_TOP5,
+    OPT_LR,
+    BASE_LR,
+    OPT_MOMENTUM,
+    OPT_WEIGHT_DECAY,
+    GLOBAL_STEP,
+    CURRENT_STEP,
+    TRAIN_LOOP,
+    TOTAL_TRAIN_EPOCH,
+    CURRENT_EPOCH,
+    FPS,
+    THROWOUT,
+    TRAIN_CHECKPOINT,
+    EVAL_START,
+    EVAL_SIZE,
+    EVAL_TARGET,
+    EVAL_ACCURACY,
+    EVAL_ACCURACY_TOP1,
+    EVAL_ACCURACY_TOP5,
+    EVAL_STOP,
+    EVAL_ITERATION_ACCURACY,
+    MLM_LOSS,
+    NSP_LOSS,
+    Average_LOSS,
+    MASKED_LM_ACCURACY,
+    MASKED_LM_LOSS,
+    NEXT_SENTENCE_ACCURACY,
+    NEXT_SENTENCE_LOSS,
+    LOSS,
+    EVAL_RESULTS,
+    ACC,
+    F1,
+    PREC,
+    REC,
+)
+
+
+ABK_VERSION = "1.0.0"   # ABK version
+ROOT_DIR = None
+PATTERN = re.compile('[a-zA-Z0-9]+')
+LOG_FILE = os.getenv("REMARK_LOG_FILE")
+LOGGER = logging.getLogger('benchmark_log')
+LOGGER.setLevel(logging.DEBUG)
+_STREAM_HANDLER = logging.StreamHandler(stream=sys.stdout)
+_STREAM_HANDLER.setLevel(logging.INFO)
+LOGGER.addHandler(_STREAM_HANDLER)
+BENCHMARK = (os.getenv("REMARK_LOG_FILE").split("_")[1]).split(".")[0]
+
+
+if LOG_FILE:
+    _FILE_HANDLER = logging.FileHandler(LOG_FILE)
+    _FILE_HANDLER.setLevel(logging.DEBUG)
+    LOGGER.addHandler(_FILE_HANDLER)
+else:
+    _STREAM_HANDLER.setLevel(logging.DEBUG)
+
+
+def get_caller(stack_index=2, root_dir=None):
+    ''' Returns file.py:lineno of your caller. A stack_index of 2 will provide
+        the caller of the function calling this function. Notice that stack_index
+        of 2 or more will fail if called from global scope. '''
+    caller = inspect.getframeinfo(inspect.stack()[stack_index][0])
+    # Trim the filenames for readability.
+    filename = caller.filename
+    filename = os.path.basename(filename)
+    # if root_dir is not None:
+    #  filename = re.sub("^" + root_dir + "/", "", filename)
+    return "%s:%d" % (filename, caller.lineno)
+
+
+TAG_SET = set(REMARK_TAGS)
+
+
+def remark_print(key, value=None, benchmark=BENCHMARK, stack_offset=0,
+                 tag_set=TAG_SET, deferred=False, root_dir=ROOT_DIR,
+                 extra_print=False):
+    ''' Prints out an benchmark Log Line.
+    key: The benchmark log key such as 'EVAL_ACCURACY_TOP1' or 'FPS'.
+    value: The value which contains no newlines.
+    benchmark:  model type: such as resnet50
+    stack_offset: Increase the value to go deeper into the stack to find the callsite. For example, if this
+                  is being called by a wraper/helper you may want to set stack_offset=1 to use the callsite
+                  of the wraper/helper itself.
+    tag_set: The set of tags in which key must belong.
+    deferred: The value is not presently known. In that case, a unique ID will
+              be assigned as the value of this call and will be returned. The
+              caller can then include said unique ID when the value is known
+              later.
+    root_dir: Directory prefix which will be trimmed when reporting calling file
+              for compliance logging.
+    extra_print: Print a blank line before logging to clear any text in the line.
+    Example output:
+      ::::ABK V1.0.0 resnet50 2020-08-12 06:22:09.670723 (hooks.py:149) fps: 681.8494655321242
+    '''
+
+    return_value = None
+    if (tag_set is None and not PATTERN.match(key)) or key not in tag_set:
+        raise ValueError('Invalid key for MLPerf print: ' + str(key))
+    if value is not None and deferred:
+        raise ValueError("deferred is set to True, but a value was provided")
+    if deferred:
+        return_value = str(uuid.uuid4())
+        value = "DEFERRED: {}".format(return_value)
+    if value is None:
+        tag = key
+    else:
+        str_json = json.dumps(value)
+        tag = "{key}: {value}".format(key=key, value=str_json)
+    callsite = get_caller(2 + stack_offset, root_dir=root_dir)
+    # now = time.time()
+    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
+    message = ':::ABK {version} {benchmark} {secs} ({callsite}) {tag}'.format(
+        version=ABK_VERSION, secs=now, benchmark=benchmark, callsite=callsite, tag=tag)
+    if extra_print:
+        print()  # There could be prior text on a line
+    if tag in STDOUT_TAG_SET:
+        LOGGER.info(message)
+    else:
+        LOGGER.debug(message)
+    return return_value
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Get COCO 2014 data sets
+
+if [ $1 == 'YoLov3' ];then
+	echo 111
+	mkdir -p /home/datasets/coco
+	pushd /home/datasets/coco
+
+	curl -O http://images.cocodataset.org/zips/train2014.zip
+	unzip train2014.zip
+
+	curl -O http://images.cocodataset.org/zips/val2014.zip
+	unzip val2014.zip
+
+	curl -O http://images.cocodataset.org/annotations/annotations_trainval2014.zip
+	unzip annotations_trainval2014.zip
+
+# Get bert/cule data sets
+elif [ $1 == 'Bert' ];then
+	echo 222
+	mkdir -p /home/datasets/Bertdata
+	pushd /home/datasets/Bertdata
+
+	curl -O xxxxxxxxxxx
+	tar xxxxx 
+
+# Get imagenet_TF data sets
+else
+	echo 333
+	mkdir -p /home/datasets/imagenet_TF
+	pushd /home/datasets/imagenet_TF
+
+
+	curl -O http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_val.tar
+	tar xvf ILSVRC2012_img_val.tar
+
+	curl -O http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_img_train.tar
+	tar xvf ILSVRC2012_img_train.tar
+
+	curl -O http://www.image-net.org/challenges/LSVRC/2012/nnoupb/ILSVRC2012_bbox_train_v2.tar
+	tar xvf ILSVRC2012_bbox_train_v2.tar
+fi
+
+
+
+popd
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+#  解析 yaml 文件中的配置, 并以键值对形式输出
+#  args:
+#      $1: yaml 文件路径
+#      $2: 要获取的节点名
+#
+#  return:
+#      key1=value1
+#      key2=value2
+#      ...
+#
+#  可以使用
+#  `eval $(./get_params_for_yaml.sh $yamlPath $section)`
+#  直接将参数作为变量存入内存
+
+
+params=$(python3.7 -c "import yaml; print('\n'.join(['%s=\"%s\"' % i for i in yaml.load(open(r'$1'), Loader=yaml.FullLoader).get('$2').items()]))")
+if [ x"$params" == x"" ];then
+    echo "path: $1 not found key: $2"
+    exit 1
+fi
+echo -e "$params"
@@ -0,0 +1,9 @@
+{
+    "server_count": "1",
+    "server_list": [{
+        "device": [{devices}],
+        "server_id": "127.0.0.1"
+    }],
+    "status": "completed",
+    "version": "1.0"
+}
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+# Generate ${modelDir}/config/${rank_size}p.json from hccl_sample.json by filling the
+# {devices} placeholder with one entry per selected device.
+# args: $1 rank size, $2 yaml path, $3 model directory, $4 yaml section name
+rank_size=$1
+yamlPath=$2
+modelDir=$3
+config_section=$4
+currentDir=$(cd "$(dirname "$0")"; pwd)
+
+# Load the settings of the yaml section as shell variables
+eval $(${currentDir}/get_params_for_yaml.sh ${yamlPath} ${config_section})
+
+# Device list; when no device group is configured for this rank size, use devices 0..rank_size-1
+eval device_group=\$device_group_${rank_size}p
+if [ x"${device_group}" == x"" ];then
+    device_group="$(seq 0 "$(expr $rank_size - 1)")"
+fi
+
+# The number of listed devices must equal the rank size.
+arr=($device_group)
+if [ ${#arr[@]} -ne ${rank_size} ];then
+    echo "ERROR: device_group: $device_group, quantity is not equal to rank_size: $rank_size"
+    exit 1
+fi
+
+HCCL_dir=$modelDir/config
+cp ${currentDir}/hccl_sample.json ${HCCL_dir}/${rank_size}p.json
+
+DEVICES=""
+
+# Build one JSON object per device; rank ids are assigned in list order starting at 0.
+rank_id=0
+for device_id in $device_group;do
+    # Take the text after ":" on the ipaddr line printed by hccn_tool.
+    DEVICE_IP=`hccn_tool -i ${device_id} -ip -g|awk -F ":" '/ipaddr/{print $2}'`
+    DEVICES+="\n\
+            {\n\
+                \"device_id\": \"${device_id}\",\n\
+                \"device_ip\": \"${DEVICE_IP}\",\n\
+                \"rank_id\": \"${rank_id}\"\n\
+            },"
+    let rank_id++
+done
+# ${DEVICES%?} drops the trailing comma before the list replaces the {devices} placeholder.
+sed -i 's#{devices}#'"${DEVICES%?}"'#g' ${HCCL_dir}/${rank_size}p.json
@@ -0,0 +1,139 @@
+#!/bin/bash
+
+
+# Positional arguments:
+#   $1 model     - where to run: "docker" or "host"
+#   $2 hardware  - "cluster", or a rank size followed by one character (the last
+#                  character is stripped to obtain the rank size)
+#   $3 yamlPath  - path of the yaml configuration file
+#   $4 modelDir  - model directory; its scripts/run.sh is the job that gets started
+#   $5 framework - "tensorflow", "pytorch", anything else selects the mindspore section
+model=$1
+hardware=$2
+yamlPath=$3
+modelDir=$4
+framework=$5
+
+modelScripts="$modelDir/scripts"
+
+currentDir=$(cd "$(dirname "$0")"; pwd)
+yamlDir=$(cd "$(dirname "${yamlPath}")";pwd)
+# Path up to the last "train" in this script's directory, with "/train" appended.
+train_dir=${currentDir%train*}/train
+# Upper limit for the job's run time, in seconds (checked by the wait loop at the end).
+timeout=360000
+# Load the settings of the framework's yaml section as shell variables
+if [ x"${framework}" == x"tensorflow" ]; then
+    config_section="tensorflow_config"
+elif [ x"${framework}" == x"pytorch" ]; then
+    config_section="pytorch_config"
+else
+    config_section="mindspore_config"
+fi
+eval $(${currentDir}/get_params_for_yaml.sh ${yamlPath} ${config_section})
+
+# Work out the rank size. In cluster mode mpirun_ip is a comma-separated list of
+# "ip:count" entries; every count must be a power of two and equal to the first count.
+if [ x"${hardware}" == x"cluster" ];then
+    export CLUSTER=True
+    # IFS is changed for this one `read` only. Setting it globally would leave "," as
+    # the field separator for the rest of the script and break every later unquoted
+    # expansion that relies on whitespace splitting (e.g. the PID lists in the
+    # timeout handling).
+    IFS="," read -r -a array <<< "$mpirun_ip"
+    m=${array[0]#*:}
+    rank_size=0
+    mpirun_all_ip=""
+    for var in "${array[@]}"; do
+        n=${var#*:}
+        mpirun_all_ip+=" ${var%:*}"
+        # n & (n-1) is 0 exactly when n is a power of two.
+        let a="$n & ($n-1)"
+        let rank_size+=$n
+        if [ $a -ne 0 ] || [ $n -ne $m ];then
+            echo "mpirun_ip: $mpirun_ip error"
+            exit 1
+        fi
+    done
+    # Drop the leading space; the result is a space-separated list of IPs.
+    export MPIRUN_ALL_IP=${mpirun_all_ip#?}
+else
+    # Single machine: strip the last character of $hardware to get the rank size.
+    rank_size=${hardware%?}
+fi
+
+# Device list for this rank size; fall back to 0..rank_size-1 when none is configured
+# or when 8 or more ranks are requested.
+eval device_group=\$device_group_${rank_size}p
+if [ x"${device_group}" == x"" ] || [ ${rank_size} -ge 8 ];then
+    device_group="$(seq 0 "$(expr $rank_size - 1)")"
+fi
+
+# Model name = yaml file name up to its first dot. It is computed before the framework
+# branches because both of them test it; when it was set only inside the tensorflow
+# branch, the pytorch ResNet50 test below could never match.
+yaml_file_name=${yamlPath##*/}
+train_model_name=${yaml_file_name%%.*}
+
+# Paths to bind-mount into the docker container for tensorflow
+if [ x"${framework}" == x"tensorflow" ]; then
+    if [ x"${hardware}" != x"cluster" ];then
+        # The rank table json only has to be generated for single-machine runs
+        bash ${currentDir}/set_json.sh ${rank_size} ${yamlPath} ${modelDir} ${config_section} || exit 1
+    fi
+    if [ x"${train_model_name}" == x"Bert-Base" ] || [ x"${train_model_name}" == x"Bert-Large" ]; then
+        data_urls="-v ${input_files_dir}:${input_files_dir} -v ${eval_files_dir}:${eval_files_dir}"
+    elif [ x"${train_model_name}" == x"MobileNet" ] || [ x"${train_model_name}" == x"YoLoV3" ]; then
+        data_urls="-v ${data_url}:${data_url} -v ${ckpt_path}:${ckpt_path}"
+    elif [ x"${train_model_name}" == x"SSD-Resnet34" ]; then
+        raw_data=${training_file_pattern%raw_data*}raw_data
+        data_urls="-v ${raw_data}:${raw_data}"
+    else
+        data_urls="-v ${data_url}:${data_url}"
+    fi
+fi
+
+
+# Paths to bind-mount into the docker container for pytorch
+if [ x"${framework}" == x"pytorch" ]; then
+    # The checkpoint mount is only added when ckpt_path is configured, so a yaml file
+    # without it keeps producing a valid "-v" argument list.
+    if [ x"${train_model_name}" == x"ResNet50" ] && [ x"${ckpt_path}" != x"" ]; then
+        data_urls="-v ${data_url}:${data_url} -v ${ckpt_path}:${ckpt_path}"
+    else
+        data_urls="-v ${data_url}:${data_url}"
+    fi
+fi
+
+
+# Start run.sh in the background, either inside docker or directly on the host.
+if [ x"$model" == x"docker" ];then
+    # Run inside docker
+    if [ x"${hardware}" == x"cluster" ];then
+        # Multi-machine docker run: execute in the container named "mpirun"
+        docker exec -i mpirun /bin/bash -c "${modelScripts}/run.sh ${rank_size} ${yamlPath} ${currentDir} ${CLUSTER} '${MPIRUN_ALL_IP}'" &
+    else
+        # Pass one --device option per selected davinci device.
+        DEVICE_DEV=""
+        for device_id in $device_group;do
+            DEVICE_DEV=`echo "${DEVICE_DEV}" --device=/dev/davinci${device_id}`
+        done
+        docker run -i --ipc=host \
+        ${DEVICE_DEV} --device=/dev/davinci_manager \
+        --device=/dev/devmm_svm --device=/dev/hisi_hdc \
+        -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
+        -v /usr/local/Ascend/add-ons/:/usr/local/Ascend/add-ons/ \
+        -v ${train_dir}:${train_dir} \
+        -v ${modelDir}:${modelDir}  \
+        ${data_urls} \
+        -v ${yamlDir}:${yamlDir}  \
+        -v /var/log/npu/conf/slog/slog.conf:/var/log/npu/conf/slog/slog.conf \
+        -v /var/log/npu/slog/:/var/log/npu/slog -v /var/log/npu/profiling/:/var/log/npu/profiling \
+        -v /var/log/npu/dump/:/var/log/npu/dump -v /var/log/npu/:/usr/slog ${docker_image} \
+        /bin/bash -c "${modelScripts}/run.sh ${rank_size} ${yamlPath} ${currentDir}" &
+    fi
+elif [ x"$model" == x"host" ]; then
+    # Run on the host
+    bash ${modelScripts}/run.sh ${rank_size} ${yamlPath} ${currentDir} &
+fi
+# PID of the background job started above.
+workshell=$!
+timeused=0
+# Poll every 5 seconds until the background job has disappeared from `ps`.
+while true
+do
+    ret=`ps -ef | grep ${modelScripts}/run.sh | grep ${workshell} | grep -v grep`
+    if [ x"${ret}" = x ];
+    then
+        break
+    else
+        echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] train job is working, wait more 5s "
+        sleep 5
+        let timeused+=5
+        # When the configured timeout is exceeded, kill the python training processes
+        if [ ${timeused} -gt ${timeout} ];
+        then
+          echo "[`date +%Y%m%d-%H:%M:%S`] [ERROR] training  timeout ! "
+          # Find the python process IDs: children of the grandchildren of the background job
+          train_sh_pid=`pgrep -P $(pgrep -P $workshell)`
+          for pid in $train_sh_pid
+          do
+            id=`pgrep -P $pid`
+            kill -9 $id
+          done
+          break
+        fi
+    fi
+done
+
+echo "[`date +%Y%m%d-%H:%M:%S`] [INFO]  process end "