[add]上传训练benchmark by z00560161

2020-10-19 20:22:23 +08:00
parent 22b83024f5
commit 82522e2f61
1225 changed files with 345421 additions and 0 deletions
@@ -0,0 +1,36 @@
+FROM nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04
+
+
+WORKDIR /research
+
+
+# HOME is the work directory; PYENV_ROOT and PATH are derived from it.
+# These stay as separate ENV instructions because each one expands a value
+# set by the previous instruction.
+ENV HOME=/research
+ENV PYENV_ROOT=$HOME/.pyenv
+ENV PATH=$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
+
+
+# Install all OS packages in one layer so `apt-get update` and every
+# `apt-get install` always run against the same, fresh package index, and
+# drop the index in that same layer so it does not stay in the image.
+# The second install intentionally keeps recommended packages, exactly as
+# the original separate layers did, so the resulting package set is unchanged.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        git \
+        python \
+        python-pip \
+    && apt-get install -y \
+        htop \
+        python-pip \
+        python-setuptools \
+        python3-pip \
+        virtualenv \
+    && rm -rf /var/lib/apt/lists/*
+
+# `scikit-learn` is the real package name; `sklearn` on PyPI is only a
+# deprecated alias for it.
+RUN pip3 install --no-cache-dir --upgrade numpy scipy scikit-learn tf-nightly-gpu
+
+
+# Copy the build context (the model sources) into the image.
+COPY . /research/resnet
+
+
+WORKDIR /research/resnet
+RUN pip3 install --no-cache-dir -r official/requirements.txt
+
+
+ENTRYPOINT ["/bin/bash"]
+
@@ -0,0 +1,47 @@
+#!/bin/sh
+currentDir=$(cd "$(dirname "$0")"; pwd)
+cd ${currentDir}
+
+DEVICE_LIST=$@
+
+export exec_type={MODE}
+
+prog_exit()
+{
+    if [ x"${exec_type}" = xdocker ];
+    then
+        # stop slogd progress
+        bash /usr/local/Ascend/driver/tools/docker_stop_post_sys.sh
+    fi
+}
+
+# register prog_exit
+trap "prog_exit" SIGTERM
+
+if [ x"${exec_type}" = xdocker ];
+then
+    #set env
+    . ${currentDir}/npu_set_env.sh
+
+    # start slogd progress
+    mkdir -p /var/log/npu/slog/slogd
+    /usr/local/Ascend/driver/tools/docker/slogd &
+
+    # start main.sh
+    ${currentDir}/main.sh ${DEVICE_LIST} &
+
+    # wait slogd stop
+    flag=1
+    while [ $flag -ne 0 ];
+    do
+        sleep 5;
+        flag=`ps -ef | grep train.sh | grep -v grep | wc -l`
+        ps -ef >> ${currentDir}/ps.log
+        echo "" >> ${currentDir}/ps.log
+    done
+else
+    # start main.sh
+    su - HwHiAiUser -c ". ${currentDir}/npu_set_env.sh;${currentDir}/main.sh ${DEVICE_LIST}" &
+    wait
+fi
+
@@ -0,0 +1,13 @@
+{
+"group_count": "1",
+"group_list": [
+{
+    "group_name": "worker",
+    "device_count": "{device_count}",
+    "instance_count": "{instance_count}",
+    "instance_list": [{instance_list}]
+}
+],
+"status": "completed"
+}
+
@@ -0,0 +1,18 @@
+#!/bin/sh
+currentDir=$(cd "$(dirname "$0")"; pwd)
+cd ${currentDir}
+
+device_group=$@
+device_num=$#
+
+touch ${currentDir}/main.log
+
+for device_phy_id in ${device_group}
+do
+    echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] start: train.sh ${device_phy_id} & " >> ${currentDir}/main.log
+    ${currentDir}/train.sh ${device_phy_id}  &
+done
+
+wait
+
+echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] all train.sh exit " >> ${currentDir}/main.log
@@ -0,0 +1,28 @@
+# main env
+export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
+export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/opp/op_impl/built-in/ai_core/tbe:/code
+export PATH=$PATH:/usr/local/Ascend/fwkacllib/ccec_compiler/bin
+export ASCEND_OPP_PATH=/usr/local/Ascend/opp
+export DDK_VERSION_FLAG=1.60.T17.B830
+export HCCL_CONNECT_TIMEOUT=600
+
+# user env
+export JOB_ID={JOB_ID}
+export RANK_TABLE_FILE={RANK_TABLE_FILE}
+export RANK_SIZE={RANK_SIZE}
+export RANK_INDEX={RANK_INDEX}
+export RANK_ID={RANK_ID}
+
+# profiling env
+export PROFILING_MODE={PROFILING_MODE}
+export AICPU_PROFILING_MODE={AICPU_PROFILING_MODE}
+export PROFILING_OPTIONS={PROFILING_OPTIONS}
+export FP_POINT={FP_POINT}
+export BP_POINT={BP_POINT}
+
+# debug env
+#export DUMP_GE_GRAPH=2
+#export DUMP_OP=1
+#export DUMP_OP_LESS=1
+#export PRINT_MODEL=1
+#export TE_PARALLEL_COMPILER=0
@@ -0,0 +1,33 @@
+#!/bin/sh
+currentDir=$(cd "$(dirname "$0")"; pwd)
+cd ${currentDir}
+
+PWD=${currentDir}
+
+device_id=$1
+if  [ x"${device_id}" = x ] ;
+then
+    echo "turing train fail" >> ${currentDir}/train_${device_id}.log
+    exit
+else
+    export DEVICE_ID=${device_id}
+fi
+
+DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 ))
+export DEVICE_INDEX=${DEVICE_INDEX}
+
+env > ${currentDir}/env_${device_id}.log
+
+#mkdir exec path
+mkdir -p ${currentDir}/${device_id}
+rm -rf ${currentDir}/${device_id}/*
+cd ${currentDir}/${device_id}
+
+#start exec
+python3.7 {RUN_ALGORITHM_CMD} {CHECKPOINT_DIR} > ${currentDir}/train_${device_id}.log 2>&1
+if [ $? -eq 0 ] ;
+then
+    echo "turing train success" >> ${currentDir}/train_${device_id}.log
+else
+    echo "turing train fail" >> ${currentDir}/train_${device_id}.log
+fi
@@ -0,0 +1,203 @@
+Copyright 2015 The TensorFlow Authors.  All rights reserved.
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2015, The TensorFlow Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
@@ -0,0 +1,20 @@
+# Officially Supported TensorFlow 2.1 Models on Cloud TPU
+
+## Natural Language Processing
+
+*   [bert](nlp/bert): A powerful pre-trained language representation model:
+    BERT, which stands for Bidirectional Encoder Representations from
+    Transformers.
+    [BERT FineTuning with Cloud TPU](https://cloud.google.com/tpu/docs/tutorials/bert-2.x) provides step by step instructions on Cloud TPU training. You can also view the [Bert MNLI Tensorboard.dev metrics](https://tensorboard.dev/experiment/mIah5lppTASvrHqWrdr6NA) for the MNLI fine tuning task.
+*   [transformer](nlp/transformer): A transformer model to translate the WMT
+    English to German dataset.
+        See [Training transformer on Cloud TPU](https://cloud.google.com/tpu/docs/tutorials/transformer-2.x) for step by step instructions on Cloud TPU training.
+
+## Computer Vision
+
+*   [mnist](vision/image_classification): A basic model to classify digits
+    from the MNIST dataset. See [Running MNIST on Cloud TPU](https://cloud.google.com/tpu/docs/tutorials/mnist-2.x) tutorial and [Tensorboard.dev metrics](https://tensorboard.dev/experiment/mIah5lppTASvrHqWrdr6NA).
+*   [resnet](vision/image_classification): A deep residual network that can
+    be used to classify ImageNet's dataset of 1000 classes.
+    See [Training ResNet on Cloud TPU](https://cloud.google.com/tpu/docs/tutorials/resnet-2.x) tutorial and [Tensorboard.dev metrics](https://tensorboard.dev/experiment/CxlDK8YMRrSpYEGtBRpOhg).
+*   [retinanet](vision/detection): A fast and powerful object detector. See [Tensorboard.dev training metrics](https://tensorboard.dev/experiment/b8NRnWU3TqG6Rw0UxueU6Q).
@@ -0,0 +1,149 @@
+# TensorFlow Official Models
+
+The TensorFlow official models are a collection of models that use
+TensorFlow's high-level APIs. They are intended to be well-maintained, tested,
+and kept up to date with the latest TensorFlow API. They should also be
+reasonably optimized for fast performance while still being easy to read.
+
+These models are used as end-to-end tests, ensuring that the models run with the
+same or improved speed and performance with each new TensorFlow build.
+
+## Tensorflow releases
+
+The master branch of the models is **in development** with TensorFlow 2.x, and
+they target the
+[nightly binaries](https://github.com/tensorflow/tensorflow#installation) built
+from the
+[master branch of TensorFlow](https://github.com/tensorflow/tensorflow/tree/master).
+You may start from installing with pip:
+
+```shell
+pip3 install tf-nightly
+```
+
+**Stable versions** of the official models targeting releases of TensorFlow are
+available as tagged branches or
+[downloadable releases](https://github.com/tensorflow/models/releases). Model
+repository version numbers match the target TensorFlow release, such that
+[release v2.1.0](https://github.com/tensorflow/models/releases/tag/v2.1.0) is
+compatible with
+[TensorFlow v2.1.0](https://github.com/tensorflow/tensorflow/releases/tag/v2.1.0).
+
+If you are on a version of TensorFlow earlier than 1.4, please
+[update your installation](https://www.tensorflow.org/install/).
+
+## Requirements
+
+Please follow the below steps before running models in this repo:
+
+1.  TensorFlow
+    [nightly binaries](https://github.com/tensorflow/tensorflow#installation)
+
+2.  If users would like to clone this repo but do not care about change history,
+please consider:
+
+  ```shell
+  export repo_version="master"
+  git clone -b ${repo_version} https://github.com/tensorflow/models.git --depth=1
+  ```
+
+3.  Add the top-level ***/models*** folder to the Python path with the command:
+
+  ```shell
+  export PYTHONPATH=$PYTHONPATH:/path/to/models
+  ```
+
+  Using Colab:
+
+  ```python
+  import os
+  os.environ['PYTHONPATH'] += ":/path/to/models"
+  ```
+
+4.  Install dependencies:
+
+  ```shell
+  pip3 install --user -r official/requirements.txt
+  ```
+
+
+To make Official Models easier to use, we are planning to create a pip
+installable Official Models package. This is being tracked in
+[#917](https://github.com/tensorflow/models/issues/917).
+
+## Available models
+
+**NOTE: For Officially Supported TPU models please check [README-TPU](README-TPU.md).**
+
+**NOTE:** Please make sure to follow the steps in the
+[Requirements](#requirements) section.
+
+### Natural Language Processing
+
+*   [bert](nlp/bert): A powerful pre-trained language representation model:
+    BERT, which stands for Bidirectional Encoder Representations from
+    Transformers.
+*   [transformer](nlp/transformer): A transformer model to translate the WMT English
+    to German dataset.
+*   [xlnet](nlp/xlnet): XLNet: Generalized Autoregressive Pretraining for
+    Language Understanding.
+
+### Computer Vision
+
+*   [mnist](vision/image_classification): A basic model to classify digits from
+    the MNIST dataset.
+*   [resnet](vision/image_classification): A deep residual network that can be
+    used to classify both CIFAR-10 and ImageNet's dataset of 1000 classes.
+*   [retinanet](vision/detection): A fast and powerful object detector.
+
+### Others
+
+*   [ncf](recommendation): Neural Collaborative Filtering model for
+    recommendation tasks.
+
+Models that will not update to TensorFlow 2.x stay inside R1 directory:
+
+*   [boosted_trees](r1/boosted_trees): A Gradient Boosted Trees model to
+    classify higgs boson process from HIGGS Data Set.
+*   [wide_deep](r1/wide_deep): A model that combines a wide model and deep
+    network to classify census income data.
+
+## More models to come!
+
+We are in the process of revamping the official model garden with TensorFlow 2.0 and
+Keras. In the near future, we will bring:
+
+*   State-of-the-art language understanding models: XLNet, GPT2, and more
+    members in Transformer family.
+*   State-of-the-art image classification models: EfficientNet, MnasNet and
+    variants.
+*   A set of excellent object detection models.
+
+If you would like to make any fixes or improvements to the models, please
+[submit a pull request](https://github.com/tensorflow/models/compare).
+
+## New Models
+
+The team is actively working to add new models to the repository. Every model
+should follow the following guidelines, to uphold our objectives of
+readable, usable, and maintainable code.
+
+**General guidelines**
+
+* Code should be well documented and tested.
+* Runnable from a blank environment with relative ease.
+* Trainable on: single GPU/CPU (baseline), multiple GPUs, TPU
+* Compatible with Python 3 (using [six](https://pythonhosted.org/six/) when
+  being compatible with Python 2 is necessary)
+* Conform to [Google Python Style Guide](https://github.com/google/styleguide/blob/gh-pages/pyguide.md)
+
+**Implementation guidelines**
+
+These guidelines exist so the model implementations are consistent for better
+readability and maintainability.
+
+*   Use [common utility functions](utils)
+*   Export SavedModel at the end of training.
+*   Consistent flags and flag-parsing library
+    ([read more here](utils/flags/guidelines.md))
+*   Produce benchmarks and logs ([read more here](utils/logs/guidelines.md))
@@ -0,0 +1,56 @@
+[
+  {
+    "description": "The ID of the benchmark run, where this metric should tie to.",
+    "mode": "REQUIRED",
+    "name": "run_id",
+    "type": "STRING"
+  },
+  {
+    "description": "The name of the metric, which should be descriptive. E.g. training_loss, accuracy.",
+    "mode": "REQUIRED",
+    "name": "name",
+    "type": "STRING"
+  },
+  {
+    "description": "The unit of the metric. E.g. MB per sec.",
+    "mode": "NULLABLE",
+    "name": "unit",
+    "type": "STRING"
+  },
+  {
+    "description": "The value of the metric.",
+    "mode": "NULLABLE",
+    "name": "value",
+    "type": "FLOAT"
+  },
+  {
+    "description": "The timestamp when the metric is recorded.",
+    "mode": "REQUIRED",
+    "name": "timestamp",
+    "type": "TIMESTAMP"
+  },
+  {
+    "description": "The global step when this metric is recorded.",
+    "mode": "NULLABLE",
+    "name": "global_step",
+    "type": "INTEGER"
+  },
+  {
+    "description": "Free format metadata for the extra information about the metric.",
+    "mode": "REPEATED",
+    "name": "extras",
+    "type": "RECORD",
+    "fields": [
+      {
+        "mode": "NULLABLE",
+        "name": "name",
+        "type": "STRING"
+      },
+      {
+        "mode": "NULLABLE",
+        "name": "value",
+        "type": "STRING"
+      }
+    ]
+  }
+]
@@ -0,0 +1,368 @@
+[
+  {
+    "description": "The UUID of the run for the benchmark.",
+    "mode": "REQUIRED",
+    "name": "model_id",
+    "type": "STRING"
+  },
+  {
+    "description": "The name of the model, E.g ResNet50, LeNet-5 etc.",
+    "mode": "REQUIRED",
+    "name": "model_name",
+    "type": "STRING"
+  },
+  {
+    "description": "The date when the test of the model is started",
+    "mode": "REQUIRED",
+    "name": "run_date",
+    "type": "TIMESTAMP"
+  },
+  {
+    "description": "The unique name for a test by the combination of key parameters, eg batch size, num of GPU, etc. It is hardware independent.",
+    "mode": "NULLABLE",
+    "name": "test_id",
+    "type": "STRING"
+  },
+  {
+    "description": "The tensorflow version information.",
+    "fields": [
+      {
+        "description": "Version of the tensorflow. E.g. 1.7.0-rc0",
+        "mode": "REQUIRED",
+        "name": "version",
+        "type": "STRING"
+      },
+      {
+        "description": "Git Hash of the tensorflow",
+        "mode": "NULLABLE",
+        "name": "git_hash",
+        "type": "STRING"
+      },
+      {
+        "description": "The channel of the tensorflow binary, eg, nightly, RC, final, custom.",
+        "mode": "NULLABLE",
+        "name": "channel",
+        "type": "STRING"
+      },
+      {
+        "description": "Identify anything special about the build, eg CUDA 10, NCCL, MKL, etc.",
+        "mode": "NULLABLE",
+        "name": "build_type",
+        "type": "STRING"
+      }
+    ],
+    "mode": "REQUIRED",
+    "name": "tensorflow_version",
+    "type": "RECORD"
+  },
+  {
+    "description": "The arbitrary attribute of the model.",
+    "fields": [
+      {
+        "description": "The name of the attribute.",
+        "mode": "REQUIRED",
+        "name": "name",
+        "type": "STRING"
+      },
+      {
+        "description": "The value of the attribute.",
+        "mode": "NULLABLE",
+        "name": "value",
+        "type": "STRING"
+      }
+    ],
+    "mode": "REPEATED",
+    "name": "attribute",
+    "type": "RECORD"
+  },
+  {
+    "description": "Environment variables when the benchmark run is executed.",
+    "fields": [
+      {
+        "description": "The name of the variable.",
+        "mode": "REQUIRED",
+        "name": "name",
+        "type": "STRING"
+      },
+      {
+        "description": "The value of the variable.",
+        "mode": "NULLABLE",
+        "name": "value",
+        "type": "STRING"
+      }
+    ],
+    "mode": "REPEATED",
+    "name": "environment_variable",
+    "type": "RECORD"
+  },
+  {
+    "description": "TF Environment variables when the benchmark run is executed.",
+    "fields": [
+      {
+        "description": "The name of the variable.",
+        "mode": "REQUIRED",
+        "name": "name",
+        "type": "STRING"
+      },
+      {
+        "description": "The value of the variable.",
+        "mode": "NULLABLE",
+        "name": "value",
+        "type": "STRING"
+      }
+    ],
+    "mode": "REPEATED",
+    "name": "tensorflow_environment_variables",
+    "type": "RECORD"
+  },
+  {
+    "description": "The list of parameters run with the model. It could contain hyperparameters or others.",
+    "fields": [
+      {
+        "description": "The name of the parameter.",
+        "mode": "REQUIRED",
+        "name": "name",
+        "type": "STRING"
+      },
+      {
+        "description": "The string value of the parameter.",
+        "mode": "NULLABLE",
+        "name": "string_value",
+        "type": "STRING"
+      },
+      {
+        "description": "The bool value of the parameter.",
+        "mode": "NULLABLE",
+        "name": "bool_value",
+        "type": "STRING"
+      },
+      {
+        "description": "The int/long value of the parameter.",
+        "mode": "NULLABLE",
+        "name": "long_value",
+        "type": "INTEGER"
+      },
+      {
+        "description": "The double/float value of parameter.",
+        "mode": "NULLABLE",
+        "name": "float_value",
+        "type": "FLOAT"
+      }
+    ],
+    "mode": "REPEATED",
+    "name": "run_parameters",
+    "type": "RECORD"
+  },
+  {
+    "description": "The dataset that run with the benchmark.",
+    "mode": "NULLABLE",
+    "name": "dataset",
+    "type": "RECORD",
+    "fields": [
+      {
+        "description": "The name of the dataset that the model is trained/validated with. E.g ImageNet, mnist.",
+        "mode": "REQUIRED",
+        "name": "name",
+        "type": "STRING"
+      },
+      {
+        "description": "The arbitrary attribute of the dataset.",
+        "fields": [
+          {
+            "description": "The name of the attribute.",
+            "mode": "REQUIRED",
+            "name": "name",
+            "type": "STRING"
+          },
+          {
+            "description": "The value of the attribute.",
+            "mode": "NULLABLE",
+            "name": "value",
+            "type": "STRING"
+          }
+        ],
+        "mode": "REPEATED",
+        "name": "attribute",
+        "type": "RECORD"
+      }
+    ]
+  },
+  {
+    "description": "Used to differentiate from AWS, GCE or DGX-1 at a high level",
+    "mode": "NULLABLE",
+    "name": "test_environment",
+    "type": "STRING"
+  },
+  {
+    "description": "The machine configuration of the benchmark run.",
+    "mode": "NULLABLE",
+    "name": "machine_config",
+    "type": "RECORD",
+    "fields": [
+      {
+        "description": "The platform information of the benchmark run.",
+        "mode": "NULLABLE",
+        "name": "platform_info",
+        "type": "RECORD",
+        "fields": [
+          {
+            "description": "Eg: 64bit.",
+            "mode": "NULLABLE",
+            "name": "bits",
+            "type": "STRING"
+          },
+          {
+            "description": "Eg: ELF.",
+            "mode": "NULLABLE",
+            "name": "linkage",
+            "type": "STRING"
+          },
+          {
+            "description": "Eg: i386.",
+            "mode": "NULLABLE",
+            "name": "machine",
+            "type": "STRING"
+          },
+          {
+            "description": "Eg: 3.13.0-76-generic.",
+            "mode": "NULLABLE",
+            "name": "release",
+            "type": "STRING"
+          },
+          {
+            "description": "Eg: Linux.",
+            "mode": "NULLABLE",
+            "name": "system",
+            "type": "STRING"
+          },
+          {
+            "description": "Eg: #120-Ubuntu SMP Mon Jan 18 15:59:10 UTC 2016.",
+            "mode": "NULLABLE",
+            "name": "version",
+            "type": "STRING"
+          }
+        ]
+      },
+      {
+        "description": "The CPU information of the benchmark run.",
+        "mode": "NULLABLE",
+        "name": "cpu_info",
+        "type": "RECORD",
+        "fields": [
+          {
+            "mode": "NULLABLE",
+            "name": "num_cores",
+            "type": "INTEGER"
+          },
+          {
+            "mode": "NULLABLE",
+            "name": "num_cores_allowed",
+            "type": "INTEGER"
+          },
+          {
+            "description" : "How fast are those CPUs.",
+            "mode": "NULLABLE",
+            "name": "mhz_per_cpu",
+            "type": "FLOAT"
+          },
+          {
+            "description" : "Additional CPU info, Eg: Intel Ivybridge with HyperThreading (24 cores).",
+            "mode": "NULLABLE",
+            "name": "cpu_info",
+            "type": "STRING"
+          },
+          {
+            "description" : "What kind of cpu scaling is enabled on the host. Eg performance, ondemand, conservative, mixed.",
+            "mode": "NULLABLE",
+            "name": "cpu_governor",
+            "type": "STRING"
+          },
+          {
+            "description": "Cache size of the CPUs.",
+            "mode": "NULLABLE",
+            "name": "cache_size",
+            "type": "RECORD",
+            "fields": [
+              {
+                "mode": "NULLABLE",
+                "name": "level",
+                "type": "STRING"
+              },
+              {
+                "mode": "NULLABLE",
+                "name": "size",
+                "type": "INTEGER"
+              }
+            ]
+          }
+        ]
+      },
+      {
+        "mode": "NULLABLE",
+        "name": "gpu_info",
+        "type": "RECORD",
+        "fields": [
+          {
+            "mode": "NULLABLE",
+            "name": "count",
+            "type": "INTEGER"
+          },
+          {
+            "mode": "NULLABLE",
+            "name": "model",
+            "type": "STRING"
+          },
+          {
+            "mode": "NULLABLE",
+            "name": "cuda_version",
+            "type": "STRING"
+          }
+        ]
+      },
+      {
+        "description": "The cloud instance information if the benchmark run is executed on cloud",
+        "mode": "NULLABLE",
+        "name": "cloud_info",
+        "type": "RECORD",
+        "fields": [
+          {
+            "description": "The instance type, E.g. n1-standard-4.",
+            "mode": "NULLABLE",
+            "name": "instance_type",
+            "type": "STRING"
+          },
+          {
+            "description": "The arbitrary attribute of the cloud info.",
+            "fields": [
+              {
+                "description": "The name of the attribute.",
+                "mode": "REQUIRED",
+                "name": "name",
+                "type": "STRING"
+              },
+              {
+                "description": "The value of the attribute.",
+                "mode": "NULLABLE",
+                "name": "value",
+                "type": "STRING"
+              }
+            ],
+            "mode": "REPEATED",
+            "name": "attribute",
+            "type": "RECORD"
+          }
+        ]
+      },
+      {
+        "mode": "NULLABLE",
+        "name": "memory_total",
+        "type": "INTEGER"
+      },
+      {
+        "mode": "NULLABLE",
+        "name": "memory_available",
+        "type": "STRING"
+      }
+    ]
+  }
+]
@@ -0,0 +1,14 @@
+[
+  {
+    "description": "The UUID of the run for the benchmark.",
+    "mode": "REQUIRED",
+    "name": "run_id",
+    "type": "STRING"
+  },
+  {
+    "description": "The status of the run for the benchmark. E.g. running, failed, success",
+    "mode": "REQUIRED",
+    "name": "status",
+    "type": "STRING"
+  }
+]
@@ -0,0 +1,285 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Runs a ResNet model on the Cifar-10 dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import app
+from absl import flags
+import numpy as np
+import tensorflow as tf
+from official.benchmark.models import resnet_cifar_model
+from official.utils.flags import core as flags_core
+from official.utils.logs import logger
+from official.utils.misc import distribution_utils
+from official.utils.misc import keras_utils
+from official.vision.image_classification.resnet import cifar_preprocessing
+from official.vision.image_classification.resnet import common
+
+
+# Learning-rate decay table. It is consumed by learning_rate_schedule() and,
+# when --use_tensor_lr is set, converted into a PiecewiseConstantDecay in run().
+# Entries are listed in ascending order of their start epoch.
+LR_SCHEDULE = [  # (multiplier, epoch to start) tuples
+    (0.1, 91), (0.01, 136), (0.001, 182)
+]
+
+
+def learning_rate_schedule(current_epoch,
+                           current_batch,
+                           batches_per_epoch,
+                           batch_size):
+  """Handles linear scaling rule and LR decay.
+
+  Scale learning rate at epoch boundaries provided in LR_SCHEDULE by the
+  provided scaling factor.
+
+  Args:
+    current_epoch: integer, current epoch indexed from 0.
+    current_batch: integer, current batch in the current epoch, indexed from 0.
+    batches_per_epoch: integer, number of steps in an epoch.
+    batch_size: integer, total batch sized.
+
+  Returns:
+    Adjusted learning rate.
+  """
+  del current_batch, batches_per_epoch  # not used
+  initial_learning_rate = common.BASE_LEARNING_RATE * batch_size / 128
+  learning_rate = initial_learning_rate
+  for mult, start_epoch in LR_SCHEDULE:
+    if current_epoch >= start_epoch:
+      learning_rate = initial_learning_rate * mult
+    else:
+      break
+  return learning_rate
+
+
+class LearningRateBatchScheduler(tf.keras.callbacks.Callback):
+  """Callback to update learning rate on every batch (not epoch boundaries).
+
+  N.B. Only support Keras optimizers, not TF optimizers.
+
+  Attributes:
+      schedule: a function that takes an epoch index and a batch index as input
+          (both integer, indexed from 0) and returns a new learning rate as
+          output (float).
+  """
+
+  def __init__(self, schedule, batch_size, steps_per_epoch):
+    super(LearningRateBatchScheduler, self).__init__()
+    self.schedule = schedule
+    self.steps_per_epoch = steps_per_epoch
+    self.batch_size = batch_size
+    self.epochs = -1
+    self.prev_lr = -1
+
+  def on_epoch_begin(self, epoch, logs=None):
+    if not hasattr(self.model.optimizer, 'learning_rate'):
+      raise ValueError('Optimizer must have a "learning_rate" attribute.')
+    self.epochs += 1
+
+  def on_batch_begin(self, batch, logs=None):
+    """Executes before step begins."""
+    lr = self.schedule(self.epochs,
+                       batch,
+                       self.steps_per_epoch,
+                       self.batch_size)
+    if not isinstance(lr, (float, np.float32, np.float64)):
+      raise ValueError('The output of the "schedule" function should be float.')
+    if lr != self.prev_lr:
+      self.model.optimizer.learning_rate = lr  # lr should be a float here
+      self.prev_lr = lr
+      tf.compat.v1.logging.debug(
+          'Epoch %05d Batch %05d: LearningRateBatchScheduler '
+          'change learning rate to %s.', self.epochs, batch, lr)
+
+
+def run(flags_obj):
+  """Run ResNet Cifar-10 training and eval loop using native Keras APIs.
+
+  Args:
+    flags_obj: An object containing parsed flag values.
+
+  Raises:
+    ValueError: If fp16 is passed as it is not currently supported.
+
+  Returns:
+    Dictionary of training and eval stats.
+  """
+  keras_utils.set_session_config(
+      enable_eager=flags_obj.enable_eager,
+      enable_xla=flags_obj.enable_xla)
+
+  # Execute flag override logic for better model performance
+  if flags_obj.tf_gpu_thread_mode:
+    keras_utils.set_gpu_thread_mode_and_count(
+        per_gpu_thread_count=flags_obj.per_gpu_thread_count,
+        gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
+        num_gpus=flags_obj.num_gpus,
+        datasets_num_private_threads=flags_obj.datasets_num_private_threads)
+  common.set_cudnn_batchnorm_mode()
+
+  dtype = flags_core.get_tf_dtype(flags_obj)
+  if dtype == 'fp16':
+    raise ValueError('dtype fp16 is not supported in Keras. Use the default '
+                     'value(fp32).')
+
+  data_format = flags_obj.data_format
+  if data_format is None:
+    data_format = ('channels_first'
+                   if tf.test.is_built_with_cuda() else 'channels_last')
+  tf.keras.backend.set_image_data_format(data_format)
+
+  strategy = distribution_utils.get_distribution_strategy(
+      distribution_strategy=flags_obj.distribution_strategy,
+      num_gpus=flags_obj.num_gpus,
+      all_reduce_alg=flags_obj.all_reduce_alg,
+      num_packs=flags_obj.num_packs)
+
+  if strategy:
+    # flags_obj.enable_get_next_as_optional controls whether enabling
+    # get_next_as_optional behavior in DistributedIterator. If true, last
+    # partial batch can be supported.
+    strategy.extended.experimental_enable_get_next_as_optional = (
+        flags_obj.enable_get_next_as_optional
+    )
+
+  strategy_scope = distribution_utils.get_strategy_scope(strategy)
+
+  if flags_obj.use_synthetic_data:
+    distribution_utils.set_up_synthetic_data()
+    input_fn = common.get_synth_input_fn(
+        height=cifar_preprocessing.HEIGHT,
+        width=cifar_preprocessing.WIDTH,
+        num_channels=cifar_preprocessing.NUM_CHANNELS,
+        num_classes=cifar_preprocessing.NUM_CLASSES,
+        dtype=flags_core.get_tf_dtype(flags_obj),
+        drop_remainder=True)
+  else:
+    distribution_utils.undo_set_up_synthetic_data()
+    input_fn = cifar_preprocessing.input_fn
+
+  train_input_dataset = input_fn(
+      is_training=True,
+      data_dir=flags_obj.data_dir,
+      batch_size=flags_obj.batch_size,
+      parse_record_fn=cifar_preprocessing.parse_record,
+      datasets_num_private_threads=flags_obj.datasets_num_private_threads,
+      dtype=dtype,
+      # Setting drop_remainder to avoid the partial batch logic in normalization
+      # layer, which triggers tf.where and leads to extra memory copy of input
+      # sizes between host and GPU.
+      drop_remainder=(not flags_obj.enable_get_next_as_optional))
+
+  eval_input_dataset = None
+  if not flags_obj.skip_eval:
+    eval_input_dataset = input_fn(
+        is_training=False,
+        data_dir=flags_obj.data_dir,
+        batch_size=flags_obj.batch_size,
+        parse_record_fn=cifar_preprocessing.parse_record)
+
+  steps_per_epoch = (
+      cifar_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size)
+  lr_schedule = 0.1
+  if flags_obj.use_tensor_lr:
+    initial_learning_rate = common.BASE_LEARNING_RATE * flags_obj.batch_size / 128
+    lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
+        boundaries=list(p[1] * steps_per_epoch for p in LR_SCHEDULE),
+        values=[initial_learning_rate] +
+        list(p[0] * initial_learning_rate for p in LR_SCHEDULE))
+
+  with strategy_scope:
+    optimizer = common.get_optimizer(lr_schedule)
+    model = resnet_cifar_model.resnet56(classes=cifar_preprocessing.NUM_CLASSES)
+    model.compile(
+        loss='sparse_categorical_crossentropy',
+        optimizer=optimizer,
+        metrics=(['sparse_categorical_accuracy']
+                 if flags_obj.report_accuracy_metrics else None),
+        run_eagerly=flags_obj.run_eagerly)
+
+  train_epochs = flags_obj.train_epochs
+
+  callbacks = common.get_callbacks(steps_per_epoch)
+
+  if not flags_obj.use_tensor_lr:
+    lr_callback = LearningRateBatchScheduler(
+        schedule=learning_rate_schedule,
+        batch_size=flags_obj.batch_size,
+        steps_per_epoch=steps_per_epoch)
+    callbacks.append(lr_callback)
+
+  # if mutliple epochs, ignore the train_steps flag.
+  if train_epochs <= 1 and flags_obj.train_steps:
+    steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
+    train_epochs = 1
+
+  num_eval_steps = (cifar_preprocessing.NUM_IMAGES['validation'] //
+                    flags_obj.batch_size)
+
+  validation_data = eval_input_dataset
+  if flags_obj.skip_eval:
+    if flags_obj.set_learning_phase_to_train:
+      # TODO(haoyuzhang): Understand slowdown of setting learning phase when
+      # not using distribution strategy.
+      tf.keras.backend.set_learning_phase(1)
+    num_eval_steps = None
+    validation_data = None
+
+  if not strategy and flags_obj.explicit_gpu_placement:
+    # TODO(b/135607227): Add device scope automatically in Keras training loop
+    # when not using distribition strategy.
+    no_dist_strat_device = tf.device('/device:GPU:0')
+    no_dist_strat_device.__enter__()
+
+  history = model.fit(train_input_dataset,
+                      epochs=train_epochs,
+                      steps_per_epoch=steps_per_epoch,
+                      callbacks=callbacks,
+                      validation_steps=num_eval_steps,
+                      validation_data=validation_data,
+                      validation_freq=flags_obj.epochs_between_evals,
+                      verbose=2)
+  eval_output = None
+  if not flags_obj.skip_eval:
+    eval_output = model.evaluate(eval_input_dataset,
+                                 steps=num_eval_steps,
+                                 verbose=2)
+
+  if not strategy and flags_obj.explicit_gpu_placement:
+    no_dist_strat_device.__exit__()
+
+  stats = common.build_stats(history, eval_output, callbacks)
+  return stats
+
+
+def define_cifar_flags():
+  common.define_keras_flags(dynamic_loss_scale=False)
+
+  flags_core.set_defaults(data_dir='/tmp/cifar10_data/cifar-10-batches-bin',
+                          model_dir='/tmp/cifar10_model',
+                          epochs_between_evals=10,
+                          batch_size=128)
+
+
+def main(_):
+  with logger.benchmark_context(flags.FLAGS):
+    return run(flags.FLAGS)
+
+
+if __name__ == '__main__':
+  # Script entry point: set TF logging to INFO, register the Cifar flags, then
+  # hand control to absl, which parses the command line and calls main().
+  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
+  define_cifar_flags()
+  app.run(main)
@@ -0,0 +1,262 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ResNet56 model for Keras adapted from tf.keras.applications.ResNet50.
+
+# Reference:
+- [Deep Residual Learning for Image Recognition](
+    https://arxiv.org/abs/1512.03385)
+Adapted from code contributed by BigMoyan.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import tensorflow as tf
+from tensorflow.python.keras import backend
+from tensorflow.python.keras  import initializers
+from tensorflow.python.keras import layers
+from tensorflow.python.keras import regularizers
+
+
+BATCH_NORM_DECAY = 0.997
+BATCH_NORM_EPSILON = 1e-5
+L2_WEIGHT_DECAY = 2e-4
+
+
+def identity_building_block(input_tensor,
+                            kernel_size,
+                            filters,
+                            stage,
+                            block,
+                            training=None):
+  """The identity block is the block that has no conv layer at shortcut.
+
+  Arguments:
+    input_tensor: input tensor
+    kernel_size: default 3, the kernel size of
+        middle conv layer at main path
+    filters: list of integers, the filters of 3 conv layer at main path
+    stage: integer, current stage label, used for generating layer names
+    block: current block label, used for generating layer names
+    training: Only used if training keras model with Estimator.  In other
+      scenarios it is handled automatically.
+
+  Returns:
+    Output tensor for the block.
+  """
+  filters1, filters2 = filters
+  if backend.image_data_format() == 'channels_last':
+    bn_axis = 3
+  else:
+    bn_axis = 1
+  conv_name_base = 'res' + str(stage) + block + '_branch'
+  bn_name_base = 'bn' + str(stage) + block + '_branch'
+
+  x = layers.Conv2D(filters1, kernel_size,
+                    padding='same', use_bias=False,
+                    kernel_initializer='he_normal',
+                    kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
+                    name=conv_name_base + '2a')(input_tensor)
+  x = layers.BatchNormalization(
+      axis=bn_axis, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON,
+      name=bn_name_base + '2a')(x, training=training)
+  x = layers.Activation('relu')(x)
+
+  x = layers.Conv2D(filters2, kernel_size,
+                    padding='same', use_bias=False,
+                    kernel_initializer='he_normal',
+                    kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
+                    name=conv_name_base + '2b')(x)
+  x = layers.BatchNormalization(
+      axis=bn_axis, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON,
+      name=bn_name_base + '2b')(x, training=training)
+
+  x = layers.add([x, input_tensor])
+  x = layers.Activation('relu')(x)
+  return x
+
+
+def conv_building_block(input_tensor,
+                        kernel_size,
+                        filters,
+                        stage,
+                        block,
+                        strides=(2, 2),
+                        training=None):
+  """A block that has a conv layer at shortcut.
+
+  Arguments:
+    input_tensor: input tensor
+    kernel_size: default 3, the kernel size of
+        middle conv layer at main path
+    filters: list of integers, the filters of 3 conv layer at main path
+    stage: integer, current stage label, used for generating layer names
+    block: current block label, used for generating layer names
+    strides: Strides for the first conv layer in the block.
+    training: Only used if training keras model with Estimator.  In other
+      scenarios it is handled automatically.
+
+  Returns:
+    Output tensor for the block.
+
+  Note that from stage 3,
+  the first conv layer at main path is with strides=(2, 2)
+  And the shortcut should have strides=(2, 2) as well
+  """
+  filters1, filters2 = filters
+  if tf.keras.backend.image_data_format() == 'channels_last':
+    bn_axis = 3
+  else:
+    bn_axis = 1
+  conv_name_base = 'res' + str(stage) + block + '_branch'
+  bn_name_base = 'bn' + str(stage) + block + '_branch'
+
+  x = layers.Conv2D(filters1, kernel_size, strides=strides,
+                    padding='same', use_bias=False,
+                    kernel_initializer='he_normal',
+                    kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
+                    name=conv_name_base + '2a')(input_tensor)
+  x = layers.BatchNormalization(
+      axis=bn_axis, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON,
+      name=bn_name_base + '2a')(x, training=training)
+  x = layers.Activation('relu')(x)
+
+  x = layers.Conv2D(filters2, kernel_size, padding='same', use_bias=False,
+                    kernel_initializer='he_normal',
+                    kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
+                    name=conv_name_base + '2b')(x)
+  x = layers.BatchNormalization(
+      axis=bn_axis, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON,
+      name=bn_name_base + '2b')(x, training=training)
+
+  shortcut = layers.Conv2D(filters2, (1, 1), strides=strides, use_bias=False,
+                           kernel_initializer='he_normal',
+                           kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
+                           name=conv_name_base + '1')(input_tensor)
+  shortcut = layers.BatchNormalization(
+      axis=bn_axis, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON,
+      name=bn_name_base + '1')(shortcut, training=training)
+
+  x = layers.add([x, shortcut])
+  x = layers.Activation('relu')(x)
+  return x
+
+
+def resnet_block(input_tensor,
+                 size,
+                 kernel_size,
+                 filters,
+                 stage,
+                 conv_strides=(2, 2),
+                 training=None):
+  """A block which applies conv followed by multiple identity blocks.
+
+  Arguments:
+    input_tensor: input tensor
+    size: integer, number of constituent conv/identity building blocks.
+    A conv block is applied once, followed by (size - 1) identity blocks.
+    kernel_size: default 3, the kernel size of
+        middle conv layer at main path
+    filters: list of integers, the filters of 3 conv layer at main path
+    stage: integer, current stage label, used for generating layer names
+    conv_strides: Strides for the first conv layer in the block.
+    training: Only used if training keras model with Estimator.  In other
+      scenarios it is handled automatically.
+
+  Returns:
+    Output tensor after applying conv and identity blocks.
+  """
+
+  x = conv_building_block(input_tensor, kernel_size, filters, stage=stage,
+                          strides=conv_strides, block='block_0',
+                          training=training)
+  for i in range(size - 1):
+    x = identity_building_block(x, kernel_size, filters, stage=stage,
+                                block='block_%d' % (i + 1), training=training)
+  return x
+
+
+def resnet(num_blocks, classes=10, training=None):
+  """Instantiates the ResNet architecture.
+
+  Arguments:
+    num_blocks: integer, the number of conv/identity blocks in each block.
+      The ResNet contains 3 blocks with each block containing one conv block
+      followed by (layers_per_block - 1) number of idenity blocks. Each
+      conv/idenity block has 2 convolutional layers. With the input
+      convolutional layer and the pooling layer towards the end, this brings
+      the total size of the network to (6*num_blocks + 2)
+    classes: optional number of classes to classify images into
+    training: Only used if training keras model with Estimator.  In other
+    scenarios it is handled automatically.
+
+  Returns:
+    A Keras model instance.
+  """
+
+  input_shape = (32, 32, 3)
+  img_input = layers.Input(shape=input_shape)
+
+  if backend.image_data_format() == 'channels_first':
+    x = layers.Lambda(lambda x: backend.permute_dimensions(x, (0, 3, 1, 2)),
+                      name='transpose')(img_input)
+    bn_axis = 1
+  else:  # channel_last
+    x = img_input
+    bn_axis = 3
+
+  x = layers.ZeroPadding2D(padding=(1, 1), name='conv1_pad')(x)
+  x = layers.Conv2D(16, (3, 3),
+                    strides=(1, 1),
+                    padding='valid', use_bias=False,
+                    kernel_initializer='he_normal',
+                    kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
+                    name='conv1')(x)
+  x = layers.BatchNormalization(axis=bn_axis,
+                                momentum=BATCH_NORM_DECAY,
+                                epsilon=BATCH_NORM_EPSILON,
+                                name='bn_conv1',)(x, training=training)
+  x = layers.Activation('relu')(x)
+
+  x = resnet_block(x, size=num_blocks, kernel_size=3, filters=[16, 16],
+                   stage=2, conv_strides=(1, 1), training=training)
+
+  x = resnet_block(x, size=num_blocks, kernel_size=3, filters=[32, 32],
+                   stage=3, conv_strides=(2, 2), training=training)
+
+  x = resnet_block(x, size=num_blocks, kernel_size=3, filters=[64, 64],
+                   stage=4, conv_strides=(2, 2), training=training)
+
+  rm_axes = [1, 2] if backend.image_data_format() == 'channels_last' else [2, 3]
+  x = layers.Lambda(lambda x: backend.mean(x, rm_axes), name='reduce_mean')(x)
+  x = layers.Dense(classes,
+                   activation='softmax',
+                   kernel_initializer=initializers.RandomNormal(stddev=0.01),
+                   kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
+                   bias_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
+                   name='fc10')(x)
+
+  inputs = img_input
+  # Create model.
+  model = tf.keras.models.Model(inputs, x, name='resnet56')
+
+  return model
+
+
+resnet20 = functools.partial(resnet, num_blocks=3)
+resnet32 = functools.partial(resnet, num_blocks=5)
+resnet56 = functools.partial(resnet, num_blocks=9)
+resnet10 = functools.partial(resnet, num_blocks=110)
@@ -0,0 +1,187 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test the keras ResNet model with Cifar data."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tempfile
+
+import tensorflow as tf
+
+from tensorflow.python.eager import context
+from tensorflow.python.platform import googletest
+from official.benchmark.models import resnet_cifar_main
+from official.utils.misc import keras_utils
+from official.utils.testing import integration
+from official.vision.image_classification.resnet import cifar_preprocessing
+
+
+class KerasCifarTest(googletest.TestCase):
+  """Unit tests for Keras ResNet with Cifar."""
+
+  _extra_flags = [
+      "-batch_size", "4",
+      "-train_steps", "1",
+      "-use_synthetic_data", "true"
+  ]
+  _tempdir = None
+
+  def get_temp_dir(self):
+    if not self._tempdir:
+      self._tempdir = tempfile.mkdtemp(dir=googletest.GetTempDir())
+    return self._tempdir
+
+  @classmethod
+  def setUpClass(cls):  # pylint: disable=invalid-name
+    super(KerasCifarTest, cls).setUpClass()
+    resnet_cifar_main.define_cifar_flags()
+
+  def setUp(self):
+    super(KerasCifarTest, self).setUp()
+    cifar_preprocessing.NUM_IMAGES["validation"] = 4
+
+  def tearDown(self):
+    super(KerasCifarTest, self).tearDown()
+    tf.io.gfile.rmtree(self.get_temp_dir())
+
+  def test_end_to_end_no_dist_strat(self):
+    """Test Keras model with 1 GPU, no distribution strategy."""
+    config = keras_utils.get_config_proto_v1()
+    tf.compat.v1.enable_eager_execution(config=config)
+
+    extra_flags = [
+        "-distribution_strategy", "off",
+        "-model_dir", "keras_cifar_no_dist_strat",
+        "-data_format", "channels_last",
+    ]
+    extra_flags = extra_flags + self._extra_flags
+
+    integration.run_synthetic(
+        main=resnet_cifar_main.run,
+        tmp_root=self.get_temp_dir(),
+        extra_flags=extra_flags
+    )
+
+  def test_end_to_end_graph_no_dist_strat(self):
+    """Test Keras model in legacy graph mode with 1 GPU, no dist strat."""
+    extra_flags = [
+        "-enable_eager", "false",
+        "-distribution_strategy", "off",
+        "-model_dir", "keras_cifar_graph_no_dist_strat",
+        "-data_format", "channels_last",
+    ]
+    extra_flags = extra_flags + self._extra_flags
+
+    integration.run_synthetic(
+        main=resnet_cifar_main.run,
+        tmp_root=self.get_temp_dir(),
+        extra_flags=extra_flags
+    )
+
+  def test_end_to_end_1_gpu(self):
+    """Test Keras model with 1 GPU."""
+    config = keras_utils.get_config_proto_v1()
+    tf.compat.v1.enable_eager_execution(config=config)
+
+    if context.num_gpus() < 1:
+      self.skipTest(
+          "{} GPUs are not available for this test. {} GPUs are available".
+          format(1, context.num_gpus()))
+
+    extra_flags = [
+        "-num_gpus", "1",
+        "-distribution_strategy", "mirrored",
+        "-model_dir", "keras_cifar_1_gpu",
+        "-data_format", "channels_last",
+    ]
+    extra_flags = extra_flags + self._extra_flags
+
+    integration.run_synthetic(
+        main=resnet_cifar_main.run,
+        tmp_root=self.get_temp_dir(),
+        extra_flags=extra_flags
+    )
+
+  def test_end_to_end_graph_1_gpu(self):
+    """Test Keras model in legacy graph mode with 1 GPU."""
+    if context.num_gpus() < 1:
+      self.skipTest(
+          "{} GPUs are not available for this test. {} GPUs are available".
+          format(1, context.num_gpus()))
+
+    extra_flags = [
+        "-num_gpus", "1",
+        "-noenable_eager",
+        "-distribution_strategy", "mirrored",
+        "-model_dir", "keras_cifar_graph_1_gpu",
+        "-data_format", "channels_last",
+    ]
+    extra_flags = extra_flags + self._extra_flags
+
+    integration.run_synthetic(
+        main=resnet_cifar_main.run,
+        tmp_root=self.get_temp_dir(),
+        extra_flags=extra_flags
+    )
+
+  def test_end_to_end_2_gpu(self):
+    """Test Keras model with 2 GPUs."""
+    config = keras_utils.get_config_proto_v1()
+    tf.compat.v1.enable_eager_execution(config=config)
+
+    if context.num_gpus() < 2:
+      self.skipTest(
+          "{} GPUs are not available for this test. {} GPUs are available".
+          format(2, context.num_gpus()))
+
+    extra_flags = [
+        "-num_gpus", "2",
+        "-distribution_strategy", "mirrored",
+        "-model_dir", "keras_cifar_2_gpu",
+    ]
+    extra_flags = extra_flags + self._extra_flags
+
+    integration.run_synthetic(
+        main=resnet_cifar_main.run,
+        tmp_root=self.get_temp_dir(),
+        extra_flags=extra_flags
+    )
+
+  def test_end_to_end_graph_2_gpu(self):
+    """Test Keras model in legacy graph mode with 2 GPUs."""
+    if context.num_gpus() < 2:
+      self.skipTest(
+          "{} GPUs are not available for this test. {} GPUs are available".
+          format(2, context.num_gpus()))
+
+    extra_flags = [
+        "-num_gpus", "2",
+        "-enable_eager", "false",
+        "-distribution_strategy", "mirrored",
+        "-model_dir", "keras_cifar_graph_2_gpu",
+    ]
+    extra_flags = extra_flags + self._extra_flags
+
+    integration.run_synthetic(
+        main=resnet_cifar_main.run,
+        tmp_root=self.get_temp_dir(),
+        extra_flags=extra_flags
+    )
+
+
+if __name__ == "__main__":
+  # Run the tests in this module through TensorFlow's googletest runner.
+  googletest.main()
@@ -0,0 +1,259 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Executes CTL benchmarks and accuracy tests."""
+from __future__ import print_function
+
+import os
+import sys
+import time
+# import pydevd_pycharm
+# pydevd_pycharm.settrace('90.253.17.223', port=8008, stdoutToServer=True, stderrToServer=True, suspend=False)
+# pylint: disable=g-bad-import-order
+from absl import flags
+import tensorflow as tf
+
+#sys.path.append(r"/home/wx933135/0708/ResNet50/tensorflow/code")
+
+sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)),'../../../../../../utils/atlasboost'))
+
+from official.r1.resnet import imagenet_main
+from official.utils.testing.perfzero_benchmark import PerfZeroBenchmark
+from official.utils.testing import benchmark_wrappers
+from official.utils.flags import core as flags_core
+from benchmark_log import hwlog
+from benchmark_log.basic_utils import get_environment_info
+from benchmark_log.basic_utils import get_model_parameter
+
+
+# Passing window for top-1 accuracy: reported as min/max bounds by the
+# accuracy benchmark; the lower bound is also used as FLAGS.stop_threshold.
+MIN_TOP_1_ACCURACY = 0.76
+MAX_TOP_1_ACCURACY = 0.77
+
+# Extra integer flags defined by this module; the benchmark overwrites both
+# from the loaded config dict before running.
+flags.DEFINE_integer('iterations_per_loop', 1000,'iterations per loop')
+flags.DEFINE_integer('save_checkpoints_steps', 115200,'save checkpoints steps')
+FLAGS = flags.FLAGS
+# Put the relative `config` directory on the import path; presumably this is
+# where the resnet_config_*p_npu modules imported in __main__ live (confirm).
+sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)),'../../../config'))
+
+
+class CtlBenchmark(PerfZeroBenchmark):
+  """Base benchmark class with methods to simplify testing."""
+
+  def __init__(self, output_dir=None, default_flags=None, flag_methods=None):
+    self.output_dir = output_dir
+    self.default_flags = default_flags or {}
+    self.flag_methods = flag_methods or {}
+    super(CtlBenchmark, self).__init__(
+        output_dir=self.output_dir,
+        default_flags=self.default_flags,
+        flag_methods=self.flag_methods)
+
+  def _report_benchmark(self,
+                        stats,
+                        wall_time_sec,
+                        top_1_max=None,
+                        top_1_min=None,
+                        total_batch_size=None,
+                        log_steps=None,
+                        warmup=1):
+    """Report benchmark results by writing to local protobuf file.
+
+    Args:
+      stats: dict returned from keras models with known entries.
+      wall_time_sec: the during of the benchmark execution in seconds
+      top_1_max: highest passing level for top_1 accuracy.
+      top_1_min: lowest passing level for top_1 accuracy.
+      total_batch_size: Global batch-size.
+      log_steps: How often the log was created for stats['step_timestamp_log'].
+      warmup: number of entries in stats['step_timestamp_log'] to ignore.
+    """
+
+    metrics = []
+    if 'eval_acc' in stats:
+      metrics.append({
+          'name': 'accuracy_top_1',
+          'value': stats['eval_acc'],
+          'min_value': top_1_min,
+          'max_value': top_1_max
+      })
+      metrics.append({'name': 'eval_loss', 'value': stats['eval_loss']})
+
+      metrics.append({
+          'name': 'top_1_train_accuracy',
+          'value': stats['train_acc']
+      })
+      metrics.append({'name': 'train_loss', 'value': stats['train_loss']})
+
+    if (warmup and 'step_timestamp_log' in stats and
+        len(stats['step_timestamp_log']) > warmup + 1):
+      # first entry in the time_log is start of step 0. The rest of the
+      # entries are the end of each step recorded
+      time_log = stats['step_timestamp_log']
+      steps_elapsed = time_log[-1].batch_index - time_log[warmup].batch_index
+      time_elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
+      examples_per_sec = total_batch_size * (steps_elapsed / time_elapsed)
+      metrics.append({'name': 'exp_per_second', 'value': examples_per_sec})
+
+    if 'avg_exp_per_second' in stats:
+      metrics.append({
+          'name': 'avg_exp_per_second',
+          'value': stats['avg_exp_per_second']
+      })
+    print("start flags_core.get_nondefault_flags_as_str")
+    flags_str = flags_core.get_nondefault_flags_as_str()
+    self.report_benchmark(
+        iters=-1,
+        wall_time=wall_time_sec,
+        metrics=metrics,
+        extras={'flags': flags_str})
+
+
+class Resnet50CtlAccuracy(CtlBenchmark):
+  """Benchmark accuracy tests for ResNet50 in CTL."""
+
+  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
+    """A benchmark class.
+
+    Args:
+      output_dir: directory where to output e.g. log files
+      root_data_dir: directory under which to look for dataset
+      **kwargs: arbitrary named arguments. This is needed to make the
+        constructor forward compatible in case PerfZero provides more named
+        arguments before updating the constructor.
+    """
+
+    # flag_methods = [common.define_keras_flags]
+
+    self.data_dir = os.path.join(root_data_dir, 'imagenet')
+    super(Resnet50CtlAccuracy, self).__init__(
+        output_dir=output_dir, flag_methods=flags)
+
+
+  # @benchmark_wrappers.enable_runtime_flags
+  def _run_and_report_benchmark(self):
+    start_time_sec = time.time()
+    stats = imagenet_main.main(flags.FLAGS)
+    wall_time_sec = time.time() - start_time_sec
+
+    super(Resnet50CtlAccuracy, self)._report_benchmark(
+        stats,
+        wall_time_sec,
+        top_1_min=MIN_TOP_1_ACCURACY,
+        top_1_max=MAX_TOP_1_ACCURACY,
+        total_batch_size=FLAGS.batch_size,
+        log_steps=100)
+
+  def _get_model_dir(self, folder_name):
+    return os.path.join(self.output_dir, folder_name)
+
+
+class Resnet50CtlBenchmarkBase(CtlBenchmark):
+  """Resnet50 benchmarks."""
+
+  def __init__(self, output_dir=None, default_flags=None):
+
+    super(Resnet50CtlBenchmarkBase, self).__init__(
+        output_dir=output_dir,
+        flag_methods=flags,
+        default_flags=default_flags)
+
+  # @benchmark_wrappers.enable_runtime_flags
+  def _run_and_report_benchmark(self):
+    start_time_sec = time.time()
+    stats = imagenet_main.benchmark_main()
+    wall_time_sec = time.time() - start_time_sec
+
+    # Number of logged step time entries that are excluded in performance
+    # report. We keep results from last 100 batches in this case.
+    warmup = (FLAGS.train_steps - 100) // FLAGS.log_steps
+
+    super(Resnet50CtlBenchmarkBase, self)._report_benchmark(
+        stats,
+        wall_time_sec,
+        total_batch_size=FLAGS.batch_size,
+        log_steps=FLAGS.log_steps,
+        warmup=warmup)
+
+
+  def benchmark_1_npu_fp16(self, config_dict, cluster_device_id):
+    """Test v1 model with 1 NPU with tf mixed precision."""
+    print("start benchmark_1_npu_fp16")
+    FLAGS.resnet_size = 50
+    FLAGS.resnet_version = 1
+    # FLAGS.max_train_steps = 1000 # this is not global step , only the step per epoch. default is according to train images
+    FLAGS.max_train_steps = config_dict.get('max_train_steps')
+    FLAGS.hooks = ['examplespersecondhook']
+    #FLAGS.data_dir = '/home/w00563133/data/resnet/imagenet_TF'
+    FLAGS.data_dir = config_dict.get('data_dir')
+    FLAGS.model_dir = os.getenv('MODEL_CKPT_PATH')
+    FLAGS.train_epochs = config_dict.get('train_epochs')
+    FLAGS.batch_size = config_dict.get('batch_size')
+    # FLAGS.epochs_between_evals = 1
+    FLAGS.epochs_between_evals = config_dict.get('epochs_between_evals')
+    FLAGS.iterations_per_loop = config_dict.get('iterations_per_loop')
+    FLAGS.save_checkpoints_steps = config_dict.get('save_checkpoints_steps')
+    FLAGS.stop_threshold = MIN_TOP_1_ACCURACY
+    self._run_and_report_benchmark()
+
+
+class Resnet50CtlBenchmarkReal(Resnet50CtlBenchmarkBase):
+  """Resnet50 real data benchmark tests."""
+
+  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
+    def_flags = {}
+    # def_flags['skip_eval'] = True
+    # def_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
+    # def_flags['train_steps'] = 110
+    # def_flags['steps_per_loop'] = 20
+    # def_flags['log_steps'] = 10
+
+    super(Resnet50CtlBenchmarkReal, self).__init__(
+        output_dir=output_dir, default_flags=def_flags)
+
+
+if __name__ == '__main__':
+    hwlog.ROOT_DIR = os.path.split(os.path.abspath(__file__))[0]
+    cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("tensorflow")
+    config_info = get_model_parameter("tensorflow_config")
+    initinal_data = {"base_lr": 0.128, "dataset": "imagenet1024", "optimizer": "SGD", "loss_scale": 512}
+    hwlog.remark_print(key=hwlog.CPU_INFO, value=cpu_info)
+    hwlog.remark_print(key=hwlog.NPU_INFO, value=npu_info)
+    hwlog.remark_print(key=hwlog.OS_INFO, value=os_info)
+    hwlog.remark_print(key=hwlog.FRAMEWORK_INFO, value=framework_info)
+    hwlog.remark_print(key=hwlog.BENCHMARK_VERSION, value=benchmark_version)
+    hwlog.remark_print(key=hwlog.CONFIG_INFO, value=config_info)
+    hwlog.remark_print(key=hwlog.BASE_LR, value=initinal_data.get("base_lr"))
+    hwlog.remark_print(key=hwlog.DATASET, value=initinal_data.get("dataset"))
+    hwlog.remark_print(key=hwlog.OPT_NAME, value=initinal_data.get("optimizer"))
+    hwlog.remark_print(key=hwlog.LOSS_SCALE, value=initinal_data.get("loss_scale"))
+    hwlog.remark_print(key=hwlog.INPUT_BATCH_SIZE, value=initinal_data.get("batchsize"))
+    cluster_device_id = None
+    rank_count = sys.argv[1]
+    if rank_count == "1":
+        from resnet_config_1p_npu import resnet50_config
+    elif rank_count == "2":
+        from resnet_config_2p_npu import resnet50_config
+    elif rank_count == "4":
+        from resnet_config_4p_npu import resnet50_config
+    elif rank_count == "16":
+        from resnet_config_16p_npu import resnet50_config
+    elif rank_count == "32":
+        from resnet_config_32p_npu import resnet50_config
+    else:
+        from resnet_config_8p_npu import resnet50_config
+    config_dict = resnet50_config()
+    print("config dict info is {}".format(config_dict))
+    imagenet_main.benchmark_pre()
+    test=Resnet50CtlBenchmarkReal("./result","./result")
+    test.benchmark_1_npu_fp16(config_dict, cluster_device_id)
+
@@ -0,0 +1,19 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Activations package definition."""
+from official.modeling.activations.gelu import gelu
+from official.modeling.activations.swish import hard_swish
+from official.modeling.activations.swish import identity
+from official.modeling.activations.swish import simple_swish
@@ -0,0 +1,40 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Gaussian error linear unit."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+import tensorflow as tf
+
+
+@tf.keras.utils.register_keras_serializable(package='Text')
+def gelu(x):
+  """Gaussian Error Linear Unit.
+
+  This is a smoother version of the RELU.
+  Original paper: https://arxiv.org/abs/1606.08415
+  Args:
+    x: float Tensor to perform activation.
+
+  Returns:
+    `x` with the GELU activation applied.
+  """
+  cdf = 0.5 * (1.0 + tf.tanh(
+      (math.sqrt(2 / math.pi) * (x + 0.044715 * tf.pow(x, 3)))))
+  return x * cdf
@@ -0,0 +1,38 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the Gaussian error linear unit."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
+from official.modeling import activations
+
+
+@keras_parameterized.run_all_keras_modes
+class GeluTest(keras_parameterized.TestCase):
+
+  def test_gelu(self):
+    expected_data = [[0.14967535, 0., -0.10032465],
+                     [-0.15880796, -0.04540223, 2.9963627]]
+    gelu_data = activations.gelu([[.25, 0, -.25], [-1, -2, 3]])
+    self.assertAllClose(expected_data, gelu_data)
+
+
+# Run the GELU tests through tf.test when this file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
@@ -0,0 +1,75 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Customized Swish activation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+
+@tf.keras.utils.register_keras_serializable(package='Text')
+def simple_swish(features):
+  """Computes the Swish activation function.
+
+  The tf.nn.swish operation uses a custom gradient to reduce memory usage.
+  Since saving custom gradients in SavedModel is currently not supported, and
+  one would not be able to use an exported TF-Hub module for fine-tuning, we
+  provide this wrapper that can allow to select whether to use the native
+  TensorFlow swish operation, or whether to use a customized operation that
+  has uses default TensorFlow gradient computation.
+
+  Args:
+    features: A `Tensor` representing preactivation values.
+
+  Returns:
+    The activation value.
+  """
+  features = tf.convert_to_tensor(features)
+  return features * tf.nn.sigmoid(features)
+
+
+@tf.keras.utils.register_keras_serializable(package='Text')
+def hard_swish(features):
+  """Computes a hard version of the swish function.
+
+  This operation can be used to reduce computational cost and improve
+  quantization for edge devices.
+
+  Args:
+    features: A `Tensor` representing preactivation values.
+
+  Returns:
+    The activation value.
+  """
+  features = tf.convert_to_tensor(features)
+  return features * tf.nn.relu6(features + tf.constant(3.)) * (1. / 6.)
+
+
+@tf.keras.utils.register_keras_serializable(package='Text')
+def identity(features):
+  """Computes the identity function.
+
+  Useful for helping in quantization.
+
+  Args:
+    features: A `Tensor` representing preactivation values.
+
+  Returns:
+    The activation value.
+  """
+  features = tf.convert_to_tensor(features)
+  return tf.identity(features)
@@ -0,0 +1,49 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the customized Swish activation."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.python.keras import keras_parameterized  # pylint: disable=g-direct-tensorflow-import
+from official.modeling import activations
+
+
+@keras_parameterized.run_all_keras_modes
+class CustomizedSwishTest(keras_parameterized.TestCase):
+
+  def _hard_swish_np(self, x):
+    x = np.float32(x)
+    return x * np.clip(x + 3, 0, 6) / 6
+
+  def test_simple_swish(self):
+    features = [[.25, 0, -.25], [-1, -2, 3]]
+    customized_swish_data = activations.simple_swish(features)
+    swish_data = tf.nn.swish(features)
+    self.assertAllClose(customized_swish_data, swish_data)
+
+  def test_hard_swish(self):
+    features = [[.25, 0, -.25], [-1, -2, 3]]
+    customized_swish_data = activations.hard_swish(features)
+    swish_data = self._hard_swish_np(features)
+    self.assertAllClose(customized_swish_data, swish_data)
+
+
+# Run the swish tests through tf.test when this file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
@@ -0,0 +1,318 @@
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Base configurations to standardize experiments."""
+
+from __future__ import absolute_import
+from __future__ import division
+# from __future__ import google_type_annotations
+from __future__ import print_function
+
+import copy
+import functools
+from typing import Any, List, Mapping, Optional, Type
+
+import dataclasses
+import tensorflow as tf
+import yaml
+
+from official.modeling.hyperparams import params_dict
+
+
+@dataclasses.dataclass
+class Config(params_dict.ParamsDict):
+  """The base configuration class that supports YAML/JSON based overrides.
+
+  * It recursively enforces a whitelist of basic types and container types, so
+    it avoids surprises with copy and reuse caused by unanticipated types.
+  * It converts dict to Config even within sequences,
+    e.g. for config = Config({'key': [([{'a': 42}],)]),
+         type(config.key[0][0][0]) is Config rather than dict.
+  """
+
+  # It's safe to add bytes and other immutable types here.
+  IMMUTABLE_TYPES = (str, int, float, bool, type(None))
+  # It's safe to add set, frozenset and other collections here.
+  SEQUENCE_TYPES = (list, tuple)
+
+  default_params: dataclasses.InitVar[Optional[Mapping[str, Any]]] = None
+  restrictions: dataclasses.InitVar[Optional[List[str]]] = None
+
+  @classmethod
+  def _isvalidsequence(cls, v):
+    """Check if the input values are valid sequences.
+
+    Args:
+      v: Input sequence.
+
+    Returns:
+      True if the sequence is valid. Valid sequence includes the sequence
+      type in cls.SEQUENCE_TYPES and element type is in cls.IMMUTABLE_TYPES or
+      is dict or ParamsDict.
+    """
+    if not isinstance(v, cls.SEQUENCE_TYPES):
+      return False
+    return (all(isinstance(e, cls.IMMUTABLE_TYPES) for e in v) or
+            all(isinstance(e, dict) for e in v) or
+            all(isinstance(e, params_dict.ParamsDict) for e in v))
+
+  @classmethod
+  def _import_config(cls, v, subconfig_type):
+    """Returns v with dicts converted to Configs, recursively."""
+    if not issubclass(subconfig_type, params_dict.ParamsDict):
+      raise TypeError(
+          'Subconfig_type should be subclass of ParamsDict, found {!r}'.format(
+              subconfig_type))
+    if isinstance(v, cls.IMMUTABLE_TYPES):
+      return v
+    elif isinstance(v, cls.SEQUENCE_TYPES):
+      # Only support one layer of sequence.
+      if not cls._isvalidsequence(v):
+        raise TypeError(
+            'Invalid sequence: only supports single level {!r} of {!r} or '
+            'dict or ParamsDict found: {!r}'.format(cls.SEQUENCE_TYPES,
+                                                    cls.IMMUTABLE_TYPES, v))
+      import_fn = functools.partial(
+          cls._import_config, subconfig_type=subconfig_type)
+      return type(v)(map(import_fn, v))
+    elif isinstance(v, params_dict.ParamsDict):
+      # Deepcopy here is a temporary solution for preserving type in nested
+      # Config object.
+      return copy.deepcopy(v)
+    elif isinstance(v, dict):
+      return subconfig_type(v)
+    else:
+      raise TypeError('Unknown type: {!r}'.format(type(v)))
+
+  @classmethod
+  def _export_config(cls, v):
+    """Returns v with Configs converted to dicts, recursively."""
+    if isinstance(v, cls.IMMUTABLE_TYPES):
+      return v
+    elif isinstance(v, cls.SEQUENCE_TYPES):
+      return type(v)(map(cls._export_config, v))
+    elif isinstance(v, params_dict.ParamsDict):
+      return v.as_dict()
+    elif isinstance(v, dict):
+      raise TypeError('dict value not supported in converting.')
+    else:
+      raise TypeError('Unknown type: {!r}'.format(type(v)))
+
+  @classmethod
+  def _get_subconfig_type(cls, k) -> Type[params_dict.ParamsDict]:
+    """Get element type by the field name.
+
+    Args:
+      k: the key/name of the field.
+
+    Returns:
+      Config as default. If a type annotation is found for `k`,
+      1) returns the type of the annotation if it is subtype of ParamsDict;
+      2) returns the element type if the annotation of `k` is List[SubType]
+         or Tuple[SubType].
+    """
+    subconfig_type = Config
+    if k in cls.__annotations__:
+      # Directly Config subtype.
+      type_annotation = cls.__annotations__[k]
+      if (isinstance(type_annotation, type) and
+          issubclass(type_annotation, Config)):
+        subconfig_type = cls.__annotations__[k]
+      else:
+        # Check if the field is a sequence of subtypes.
+        field_type = getattr(type_annotation, '__origin__', type(None))
+        if (isinstance(field_type, type) and
+            issubclass(field_type, cls.SEQUENCE_TYPES)):
+          element_type = getattr(type_annotation, '__args__', [type(None)])[0]
+          subconfig_type = (
+              element_type if issubclass(element_type, params_dict.ParamsDict)
+              else subconfig_type)
+    return subconfig_type
+
+  def __post_init__(self, default_params, restrictions, *args, **kwargs):
+    super().__init__(default_params=default_params,
+                     restrictions=restrictions,
+                     *args,
+                     **kwargs)
+
+  def _set(self, k, v):
+    """Overrides same method in ParamsDict.
+
+    Also called by ParamsDict methods.
+
+    Args:
+      k: key to set.
+      v: value.
+
+    Raises:
+      RuntimeError
+    """
+    subconfig_type = self._get_subconfig_type(k)
+    if isinstance(v, dict):
+      if k not in self.__dict__ or not self.__dict__[k]:
+        # If the key not exist or the value is None, a new Config-family object
+        # sould be created for the key.
+        self.__dict__[k] = subconfig_type(v)
+      else:
+        self.__dict__[k].override(v)
+    else:
+      self.__dict__[k] = self._import_config(v, subconfig_type)
+
+  def __setattr__(self, k, v):
+    if k not in self.RESERVED_ATTR:
+      if getattr(self, '_locked', False):
+        raise ValueError('The Config has been locked. ' 'No change is allowed.')
+    self._set(k, v)
+
+  def _override(self, override_dict, is_strict=True):
+    """Overrides same method in ParamsDict.
+
+    Also called by ParamsDict methods.
+
+    Args:
+      override_dict: dictionary to write to .
+      is_strict: If True, not allows to add new keys.
+
+    Raises:
+      KeyError: overriding reserved keys or keys not exist (is_strict=True).
+    """
+    for k, v in sorted(override_dict.items()):
+      if k in self.RESERVED_ATTR:
+        raise KeyError('The key {!r} is internally reserved. '
+                       'Can not be overridden.'.format(k))
+      if k not in self.__dict__:
+        if is_strict:
+          raise KeyError('The key {!r} does not exist in {!r}. '
+                         'To extend the existing keys, use '
+                         '`override` with `is_strict` = False.'.format(
+                             k, type(self)))
+        else:
+          self._set(k, v)
+      else:
+        if isinstance(v, dict) and self.__dict__[k]:
+          self.__dict__[k]._override(v, is_strict)  # pylint: disable=protected-access
+        elif isinstance(v, params_dict.ParamsDict) and self.__dict__[k]:
+          self.__dict__[k]._override(v.as_dict(), is_strict)  # pylint: disable=protected-access
+        else:
+          self._set(k, v)
+
+  def as_dict(self):
+    """Returns a dict representation of params_dict.ParamsDict.
+
+    For the nested params_dict.ParamsDict, a nested dict will be returned.
+    """
+    return {
+        k: self._export_config(v)
+        for k, v in self.__dict__.items()
+        if k not in self.RESERVED_ATTR
+    }
+
+  def replace(self, **kwargs):
+    """Like `override`, but returns a copy with the current config unchanged."""
+    params = self.__class__(self)
+    params.override(kwargs, is_strict=True)
+    return params
+
+  @classmethod
+  def from_yaml(cls, file_path: str):
+    # Note: This only works if the Config has all default values.
+    with tf.io.gfile.GFile(file_path, 'r') as f:
+      loaded = yaml.load(f)
+      config = cls()
+      config.override(loaded)
+      return config
+
+  @classmethod
+  def from_json(cls, file_path: str):
+    """Wrapper for `from_yaml`."""
+    return cls.from_yaml(file_path)
+
+  @classmethod
+  def from_args(cls, *args, **kwargs):
+    """Builds a config from the given list of arguments."""
+    attributes = list(cls.__annotations__.keys())
+    default_params = {a: p for a, p in zip(attributes, args)}
+    default_params.update(kwargs)
+    return cls(default_params)
+
+
+@dataclasses.dataclass
+class RuntimeConfig(Config):
+  """High-level configurations for Runtime.
+
+  These include parameters that are not directly related to the experiment,
+  e.g. directories, accelerator type, etc.
+
+  Attributes:
+    distribution_strategy: e.g. 'mirrored', 'tpu', etc. Defaults to
+      'mirrored'.
+    enable_eager: Whether or not to enable eager mode. Defaults to False.
+    enable_xla: Whether or not to enable XLA. Defaults to False.
+    per_gpu_thread_count: thread count per GPU. Defaults to 0.
+    gpu_threads_enabled: Whether or not GPU threads are enabled. Defaults to
+      False.
+    gpu_thread_mode: Whether and how the GPU device uses its own threadpool.
+    dataset_num_private_threads: Number of threads for a private threadpool
+      created for all datasets computation.
+    tpu: The address of the TPU to use, if any.
+    num_gpus: The number of GPUs to use, if any. Defaults to 0.
+    worker_hosts: comma-separated list of worker ip:port pairs for running
+      multi-worker models with DistributionStrategy.
+    task_index: If multi-worker training, the task index of this worker.
+      Defaults to -1.
+    all_reduce_alg: Defines the algorithm for performing all-reduce.
+    num_packs: Sets `num_packs` in the cross device ops used in
+      MirroredStrategy.  For details, see tf.distribute.NcclAllReduce.
+      Defaults to 1.
+  """
+  # Field order matters: `Config.from_args` maps positional arguments onto
+  # the annotations in declaration order.
+  distribution_strategy: str = 'mirrored'
+  enable_eager: bool = False
+  enable_xla: bool = False
+  gpu_threads_enabled: bool = False
+  gpu_thread_mode: Optional[str] = None
+  dataset_num_private_threads: Optional[int] = None
+  per_gpu_thread_count: int = 0
+  tpu: Optional[str] = None
+  num_gpus: int = 0
+  worker_hosts: Optional[str] = None
+  task_index: int = -1
+  all_reduce_alg: Optional[str] = None
+  num_packs: int = 1
+
+
+@dataclasses.dataclass
+class TensorboardConfig(Config):
+  """Configuration for Tensorboard.
+
+  Attributes:
+    track_lr: Whether or not to track the learning rate in Tensorboard. Defaults
+      to True.
+    write_model_weights: Whether or not to write the model weights as
+      images in Tensorboard. Defaults to False.
+  """
+  track_lr: bool = True
+  write_model_weights: bool = False
+
+
+@dataclasses.dataclass
+class CallbacksConfig(Config):
+  """Configuration for Callbacks.
+
+  Attributes:
+    enable_checkpoint_and_export: Whether or not to enable checkpoints as a
+      Callback. Defaults to True.
+    enable_tensorboard: Whether or not to enable Tensorboard as a Callback.
+      Defaults to True.
+  """
+  enable_checkpoint_and_export: bool = True
+  enable_tensorboard: bool = True
@@ -0,0 +1,299 @@
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import pprint
+from typing import List, Tuple
+
+from absl.testing import parameterized
+import dataclasses
+import tensorflow as tf
+from official.modeling.hyperparams import base_config
+
+
+@dataclasses.dataclass
+class DumpConfig1(base_config.Config):
+  """Leaf test config holding one int field and one str field."""
+  a: int = 1
+  b: str = 'text'
+
+
+@dataclasses.dataclass
+class DumpConfig2(base_config.Config):
+  """Test config that nests a `DumpConfig1` instance in field `e`."""
+  c: int = 2
+  d: str = 'text'
+  # The default instance is created once, when the class body is evaluated.
+  e: DumpConfig1 = DumpConfig1()
+
+
+@dataclasses.dataclass
+class DumpConfig3(DumpConfig2):
+  f: int = 2
+  g: str = 'text'
+  h: List[DumpConfig1] = dataclasses.field(
+      default_factory=lambda: [DumpConfig1(), DumpConfig1()])
+  g: Tuple[DumpConfig1, ...] = (DumpConfig1(),)
+
+
+class BaseConfigTest(parameterized.TestCase, tf.test.TestCase):
+  """Tests for `base_config.Config` import, export and override behavior."""
+
+  def assertHasSameTypes(self, c, d, msg=''):
+    """Checks if a Config has the same structure as a given dict.
+
+    Recurses through `d`: immutable values are compared by their pprint
+    representation, sequences must have the same type and are compared element
+    by element, and dicts require `c` to be a Config whose attributes are
+    compared against the dict values.
+
+    Args:
+      c: the Config object (or primitive/sequence value) to be checked.
+      d: the reference dict object (or primitive/sequence value).
+      msg: The error message to show when type mismatched.
+
+    Raises:
+      TypeError: if `d` is of a type that is none of the handled kinds.
+    """
+    # Make sure d is not a Config. Assume d is either
+    # dictionary or primitive type and c is the Config or primitive types.
+    self.assertNotIsInstance(d, base_config.Config)
+    if isinstance(d, base_config.Config.IMMUTABLE_TYPES):
+      self.assertEqual(pprint.pformat(c), pprint.pformat(d), msg=msg)
+    elif isinstance(d, base_config.Config.SEQUENCE_TYPES):
+      self.assertEqual(type(c), type(d), msg=msg)
+      for i, v in enumerate(d):
+        self.assertHasSameTypes(c[i], v, msg='{}[{!r}]'.format(msg, i))
+    elif isinstance(d, dict):
+      self.assertIsInstance(c, base_config.Config, msg=msg)
+      for k, v in sorted(d.items()):
+        self.assertHasSameTypes(getattr(c, k), v, msg='{}[{!r}]'.format(msg, k))
+    else:
+      raise TypeError('Unknown type: %r' % type(d))
+
+  def assertImportExport(self, v):
+    """Asserts that `v` survives a Config import followed by `as_dict`.
+
+    Args:
+      v: the value stored under 'key' in a fresh Config.
+    """
+    config = base_config.Config({'key': v})
+    back = config.as_dict()['key']
+    self.assertEqual(pprint.pformat(back), pprint.pformat(v))
+    self.assertHasSameTypes(config.key, v, msg='=%s v' % pprint.pformat(v))
+
+  def test_invalid_keys(self):
+    """Reading an undefined key from an empty Config raises AttributeError."""
+    params = base_config.Config()
+    with self.assertRaises(AttributeError):
+      _ = params.a
+
+  def test_nested_config_types(self):
+    """Nested configs keep their declared type, also after `override`."""
+    config = DumpConfig3()
+    self.assertIsInstance(config.e, DumpConfig1)
+    self.assertIsInstance(config.h[0], DumpConfig1)
+    self.assertIsInstance(config.h[1], DumpConfig1)
+    self.assertIsInstance(config.g[0], DumpConfig1)
+
+    config.override({'e': {'a': 2, 'b': 'new text'}})
+    self.assertIsInstance(config.e, DumpConfig1)
+    self.assertEqual(config.e.a, 2)
+    self.assertEqual(config.e.b, 'new text')
+
+    # Overriding a list field with a shorter list replaces the whole list.
+    config.override({'h': [{'a': 3, 'b': 'new text 2'}]})
+    self.assertIsInstance(config.h[0], DumpConfig1)
+    self.assertLen(config.h, 1)
+    self.assertEqual(config.h[0].a, 3)
+    self.assertEqual(config.h[0].b, 'new text 2')
+
+    # The tuple-typed field `g` is overridden here from a list of dicts.
+    config.override({'g': [{'a': 4, 'b': 'new text 3'}]})
+    self.assertIsInstance(config.g[0], DumpConfig1)
+    self.assertLen(config.g, 1)
+    self.assertEqual(config.g[0].a, 4)
+    self.assertEqual(config.g[0].b, 'new text 3')
+
+  @parameterized.parameters(
+      ('_locked', "The key '_locked' is internally reserved."),
+      ('_restrictions', "The key '_restrictions' is internally reserved."),
+      ('aa', "The key 'aa' does not exist."),
+  )
+  def test_key_error(self, key, msg):
+    """Overriding a reserved or unknown key raises KeyError matching `msg`."""
+    params = base_config.Config()
+    with self.assertRaisesRegex(KeyError, msg):
+      params.override({key: True})
+
+  @parameterized.parameters(
+      ('str data',),
+      (123,),
+      (1.23,),
+      (None,),
+      (['str', 1, 2.3, None],),
+      (('str', 1, 2.3, None),),
+  )
+  def test_import_export_immutable_types(self, v):
+    """Primitive values and flat sequences of them round-trip unchanged."""
+    self.assertImportExport(v)
+    out = base_config.Config({'key': v})
+    self.assertEqual(pprint.pformat(v), pprint.pformat(out.key))
+
+  def test_override_is_strict_true(self):
+    """Strict override updates existing keys and rejects unknown ones."""
+    params = base_config.Config({
+        'a': 'aa',
+        'b': 2,
+        'c': {
+            'c1': 'cc',
+            'c2': 20
+        }
+    })
+    params.override({'a': 2, 'c': {'c1': 'ccc'}}, is_strict=True)
+    self.assertEqual(params.a, 2)
+    self.assertEqual(params.c.c1, 'ccc')
+    with self.assertRaises(KeyError):
+      params.override({'d': 'ddd'}, is_strict=True)
+    with self.assertRaises(KeyError):
+      params.override({'c': {'c3': 30}}, is_strict=True)
+
+    # Overriding a sequence replaces its elements instead of merging them:
+    # after the override the element has `b` but no longer has `a`.
+    config = base_config.Config({'key': [{'a': 42}]})
+    config.override({'key': [{'b': 43}]})
+    self.assertEqual(config.key[0].b, 43)
+    with self.assertRaisesRegex(AttributeError, 'The key `a` does not exist'):
+      _ = config.key[0].a
+
+  @parameterized.parameters(
+      (lambda x: x, 'Unknown type'),
+      (object(), 'Unknown type'),
+      (set(), 'Unknown type'),
+      (frozenset(), 'Unknown type'),
+  )
+  def test_import_unsupport_types(self, v, msg):
+    """Importing a value of an unsupported type raises TypeError."""
+    with self.assertRaisesRegex(TypeError, msg):
+      _ = base_config.Config({'key': v})
+
+  @parameterized.parameters(
+      ({
+          'a': [{
+              'b': 2,
+          }, {
+              'c': 3,
+          }]
+      },),
+      ({
+          'c': [{
+              'f': 1.1,
+          }, {
+              'h': [1, 2],
+          }]
+      },),
+      (({
+          'a': 'aa',
+          'b': 2,
+          'c': {
+              'c1': 10,
+              'c2': 20,
+          }
+      },),),
+  )
+  def test_import_export_nested_structure(self, d):
+    """Dicts containing sequences of dicts (and vice versa) round-trip."""
+    self.assertImportExport(d)
+
+  @parameterized.parameters(
+      ([{
+          'a': 42,
+          'b': 'hello',
+          'c': 1.2
+      }],),
+      (({
+          'a': 42,
+          'b': 'hello',
+          'c': 1.2
+      },),),
+  )
+  def test_import_export_nested_sequences(self, v):
+    """A single-level list or tuple of dicts round-trips."""
+    self.assertImportExport(v)
+
+  @parameterized.parameters(
+      ([([{}],)],),
+      ([['str', 1, 2.3, None]],),
+      ((('str', 1, 2.3, None),),),
+      ([
+          ('str', 1, 2.3, None),
+      ],),
+      ([
+          ('str', 1, 2.3, None),
+      ],),
+      ([[{
+          'a': 42,
+          'b': 'hello',
+          'c': 1.2
+      }]],),
+      ([[[{
+          'a': 42,
+          'b': 'hello',
+          'c': 1.2
+      }]]],),
+      ((({
+          'a': 42,
+          'b': 'hello',
+          'c': 1.2
+      },),),),
+      (((({
+          'a': 42,
+          'b': 'hello',
+          'c': 1.2
+      },),),),),
+      ([({
+          'a': 42,
+          'b': 'hello',
+          'c': 1.2
+      },)],),
+      (([{
+          'a': 42,
+          'b': 'hello',
+          'c': 1.2
+      }],),),
+  )
+  def test_import_export_unsupport_sequence(self, v):
+    """Sequences nested inside sequences are rejected with TypeError."""
+    with self.assertRaisesRegex(TypeError,
+                                'Invalid sequence: only supports single level'):
+      _ = base_config.Config({'key': v})
+
+  def test_construct_subtype(self):
+    """Placeholder; currently asserts nothing."""
+    pass
+
+  def test_import_config(self):
+    """Dicts inside a list (and dicts nested in those) become Config objects."""
+    params = base_config.Config({'a': [{'b': 2}, {'c': {'d': 3}}]})
+    self.assertLen(params.a, 2)
+    self.assertEqual(params.a[0].b, 2)
+    self.assertEqual(type(params.a[0]), base_config.Config)
+    self.assertEqual(pprint.pformat(params.a[0].b), '2')
+    self.assertEqual(type(params.a[1]), base_config.Config)
+    self.assertEqual(type(params.a[1].c), base_config.Config)
+    self.assertEqual(pprint.pformat(params.a[1].c.d), '3')
+
+  def test_override(self):
+    """Non-strict override of a list keeps list/Config types and new values."""
+    params = base_config.Config({'a': [{'b': 2}, {'c': {'d': 3}}]})
+    params.override({'a': [{'b': 4}, {'c': {'d': 5}}]}, is_strict=False)
+    self.assertEqual(type(params.a), list)
+    self.assertEqual(type(params.a[0]), base_config.Config)
+    self.assertEqual(pprint.pformat(params.a[0].b), '4')
+    self.assertEqual(type(params.a[1]), base_config.Config)
+    self.assertEqual(type(params.a[1].c), base_config.Config)
+    self.assertEqual(pprint.pformat(params.a[1].c.d), '5')
+
+  @parameterized.parameters(
+      ([{}],),
+      (({},),),
+  )
+  def test_config_vs_params_dict(self, v):
+    """Config converts dicts inside sequences; ParamsDict leaves them as dict."""
+    d = {'key': v}
+    self.assertEqual(type(base_config.Config(d).key[0]), base_config.Config)
+    self.assertEqual(type(base_config.params_dict.ParamsDict(d).key[0]), dict)
+
+  def test_ppformat(self):
+    """Pins the `pprint.pformat` output that the other assertions rely on."""
+    self.assertEqual(
+        pprint.pformat([
+            's', 1, 1.0, True, None, {}, [], (), {
+                (2,): (3, [4], {
+                    6: 7,
+                }),
+                8: 9,
+            }
+        ]),
+        "['s', 1, 1.0, True, None, {}, [], (), {8: 9, (2,): (3, [4], {6: 7})}]")
+
+
+# Run the test suite when this file is executed as a script.
+if __name__ == '__main__':
+  tf.test.main()
@@ -0,0 +1,410 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A parameter dictionary class which supports the nest structure."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import copy
+import re
+
+import six
+import tensorflow as tf
+import yaml
+
+# Regex pattern that matches one key-value pair in a comma-separated
+# key-value pair string. It splits each k-v pair on the = sign, and
+# matches values that are within single quotes, within double quotes, single
+# values (e.g. floats, ints, etc.), or lists within brackets. Each match also
+# consumes the trailing comma (plus optional whitespace) or the end of string.
+_PARAM_RE = re.compile(r"""
+  (?P<name>[a-zA-Z][\w\.]*)    # variable name: "var" or "x"
+  \s*=\s*
+  ((?P<val>\'(.*?)\'           # single quote
+  |
+  \"(.*?)\"                    # double quote
+  |
+  [^,\[]*                      # single value
+  |
+  \[[^\]]*\]))                 # list of values
+  ($|,\s*)""", re.VERBOSE)
+
+
+class ParamsDict(object):
+  """A hyperparameter container class."""
+
+  RESERVED_ATTR = ['_locked', '_restrictions']
+
+  def __init__(self, default_params=None, restrictions=None):
+    """Instantiate a ParamsDict.
+
+    Instantiate a ParamsDict given a set of default parameters and a list of
+    restrictions. Upon initialization, it validates itself by checking all the
+    defined restrictions, and raise error if it finds inconsistency.
+
+    Args:
+      default_params: a Python dict or another ParamsDict object including the
+        default parameters to initialize.
+      restrictions: a list of strings, which define a list of restrictions to
+        ensure the consistency of different parameters internally. Each
+        restriction string is defined as a binary relation with a set of
+        operators, including {'==', '!=',  '<', '<=', '>', '>='}.
+    """
+    self._locked = False
+    self._restrictions = []
+    if restrictions:
+      self._restrictions = restrictions
+    if default_params is None:
+      default_params = {}
+    self.override(default_params, is_strict=False)
+    self.validate()
+
+  def _set(self, k, v):
+    if isinstance(v, dict):
+      self.__dict__[k] = ParamsDict(v)
+    else:
+      self.__dict__[k] = copy.deepcopy(v)
+
+  def __setattr__(self, k, v):
+    """Sets the value of the existing key.
+
+    Note that this does not allow directly defining a new key. Use the
+    `override` method with `is_strict=False` instead.
+
+    Args:
+      k: the key string.
+      v: the value to be used to set the key `k`.
+
+    Raises:
+      KeyError: if k is not defined in the ParamsDict.
+    """
+    if k not in ParamsDict.RESERVED_ATTR:
+      if k not in self.__dict__.keys():
+        raise KeyError('The key `%{}` does not exist. '
+                       'To extend the existing keys, use '
+                       '`override` with `is_strict` = True.'.format(k))
+      if self._locked:
+        raise ValueError('The ParamsDict has been locked. '
+                         'No change is allowed.')
+    self._set(k, v)
+
+  def __getattr__(self, k):
+    """Gets the value of the existing key.
+
+    Args:
+      k: the key string.
+
+    Returns:
+      the value of the key.
+
+    Raises:
+      AttributeError: if k is not defined in the ParamsDict.
+    """
+    if k not in self.__dict__.keys():
+      raise AttributeError('The key `{}` does not exist. '.format(k))
+    return self.__dict__[k]
+
+  def __contains__(self, key):
+    """Implements the membership test operator."""
+    return key in self.__dict__
+
+  def get(self, key, value=None):
+    """Accesses through built-in dictionary get method."""
+    return self.__dict__.get(key, value)
+
+  def override(self, override_params, is_strict=True):
+    """Override the ParamsDict with a set of given params.
+
+    Args:
+      override_params: a dict or a ParamsDict specifying the parameters to
+        be overridden.
+      is_strict: a boolean specifying whether override is strict or not. If
+        True, keys in `override_params` must be present in the ParamsDict.
+        If False, keys in `override_params` can be different from what is
+        currently defined in the ParamsDict. In this case, the ParamsDict will
+        be extended to include the new keys.
+    """
+    if self._locked:
+      raise ValueError('The ParamsDict has been locked. No change is allowed.')
+    if isinstance(override_params, ParamsDict):
+      override_params = override_params.as_dict()
+    self._override(override_params, is_strict)  # pylint: disable=protected-access
+
+  def _override(self, override_dict, is_strict=True):
+    """The implementation of `override`."""
+    for k, v in six.iteritems(override_dict):
+      if k in ParamsDict.RESERVED_ATTR:
+        raise KeyError('The key `%{}` is internally reserved. '
+                       'Can not be overridden.')
+      if k not in self.__dict__.keys():
+        if is_strict:
+          raise KeyError('The key `{}` does not exist. '
+                         'To extend the existing keys, use '
+                         '`override` with `is_strict` = False.'.format(k))
+        else:
+          self._set(k, v)
+      else:
+        if isinstance(v, dict):
+          self.__dict__[k]._override(v, is_strict)  # pylint: disable=protected-access
+        elif isinstance(v, ParamsDict):
+          self.__dict__[k]._override(v.as_dict(), is_strict)  # pylint: disable=protected-access
+        else:
+          self.__dict__[k] = copy.deepcopy(v)
+
+  def lock(self):
+    """Makes the ParamsDict immutable."""
+    self._locked = True
+
+  def as_dict(self):
+    """Returns a dict representation of ParamsDict.
+
+    For the nested ParamsDict, a nested dict will be returned.
+    """
+    params_dict = {}
+    for k, v in six.iteritems(self.__dict__):
+      if k not in ParamsDict.RESERVED_ATTR:
+        if isinstance(v, ParamsDict):
+          params_dict[k] = v.as_dict()
+        else:
+          params_dict[k] = copy.deepcopy(v)
+    return params_dict
+
+  def validate(self):
+    """Validate the parameters consistency based on the restrictions.
+
+    This method validates the internal consistency using the pre-defined list of
+    restrictions. A restriction is defined as a string which specfiies a binary
+    operation. The supported binary operations are {'==', '!=', '<', '<=', '>',
+    '>='}. Note that the meaning of these operators are consistent with the
+    underlying Python immplementation. Users should make sure the define
+    restrictions on their type make sense.
+
+    For example, for a ParamsDict like the following
+    ```
+    a:
+      a1: 1
+      a2: 2
+    b:
+      bb:
+        bb1: 10
+        bb2: 20
+      ccc:
+        a1: 1
+        a3: 3
+    ```
+    one can define two restrictions like this
+    ['a.a1 == b.ccc.a1', 'a.a2 <= b.bb.bb2']
+
+    What it enforces are:
+     - a.a1 = 1 == b.ccc.a1 = 2
+     - a.a2 = 2 <= b.bb.bb2 = 20
+
+    Raises:
+      KeyError: if any of the following happens
+        (1) any of parameters in any of restrictions is not defined in
+            ParamsDict,
+        (2) any inconsistency violating the restriction is found.
+      ValueError: if the restriction defined in the string is not supported.
+    """
+    def _get_kv(dotted_string, params_dict):
+      tokenized_params = dotted_string.split('.')
+      v = params_dict
+      for t in tokenized_params:
+        v = v[t]
+      return tokenized_params[-1], v
+
+    def _get_kvs(tokens, params_dict):
+      if len(tokens) != 2:
+        raise ValueError('Only support binary relation in restriction.')
+      stripped_tokens = [t.strip() for t in tokens]
+      left_k, left_v = _get_kv(stripped_tokens[0], params_dict)
+      right_k, right_v = _get_kv(stripped_tokens[1], params_dict)
+      return left_k, left_v, right_k, right_v
+
+    params_dict = self.as_dict()
+    for restriction in self._restrictions:
+      if '==' in restriction:
+        tokens = restriction.split('==')
+        _, left_v, _, right_v = _get_kvs(tokens, params_dict)
+        if left_v != right_v:
+          raise KeyError('Found inconsistncy between key `{}` and key `{}`.'
+                         .format(tokens[0], tokens[1]))
+      elif '!=' in restriction:
+        tokens = restriction.split('!=')
+        _, left_v, _, right_v = _get_kvs(tokens, params_dict)
+        if left_v == right_v:
+          raise KeyError('Found inconsistncy between key `{}` and key `{}`.'
+                         .format(tokens[0], tokens[1]))
+      elif '<' in restriction:
+        tokens = restriction.split('<')
+        _, left_v, _, right_v = _get_kvs(tokens, params_dict)
+        if left_v >= right_v:
+          raise KeyError('Found inconsistncy between key `{}` and key `{}`.'
+                         .format(tokens[0], tokens[1]))
+      elif '<=' in restriction:
+        tokens = restriction.split('<=')
+        _, left_v, _, right_v = _get_kvs(tokens, params_dict)
+        if left_v > right_v:
+          raise KeyError('Found inconsistncy between key `{}` and key `{}`.'
+                         .format(tokens[0], tokens[1]))
+      elif '>' in restriction:
+        tokens = restriction.split('>')
+        _, left_v, _, right_v = _get_kvs(tokens, params_dict)
+        if left_v <= right_v:
+          raise KeyError('Found inconsistncy between key `{}` and key `{}`.'
+                         .format(tokens[0], tokens[1]))
+      elif '>=' in restriction:
+        tokens = restriction.split('>=')
+        _, left_v, _, right_v = _get_kvs(tokens, params_dict)
+        if left_v < right_v:
+          raise KeyError('Found inconsistncy between key `{}` and key `{}`.'
+                         .format(tokens[0], tokens[1]))
+      else:
+        raise ValueError('Unsupported relation in restriction.')
+
+
+def read_yaml_to_params_dict(file_path):
+  """Reads a YAML file to a ParamsDict."""
+  with tf.io.gfile.GFile(file_path, 'r') as f:
+    params_dict = yaml.load(f)
+    return ParamsDict(params_dict)
+
+
+def save_params_dict_to_yaml(params, file_path):
+  """Saves the input ParamsDict to a YAML file."""
+  with tf.io.gfile.GFile(file_path, 'w') as f:
+
+    def _my_list_rep(dumper, data):
+      # u'tag:yaml.org,2002:seq' is the YAML internal tag for sequence.
+      return dumper.represent_sequence(
+          u'tag:yaml.org,2002:seq', data, flow_style=True)
+    yaml.add_representer(list, _my_list_rep)
+    yaml.dump(params.as_dict(), f, default_flow_style=False)
+
+
+def nested_csv_str_to_json_str(csv_str):
+  """Converts a nested (using '.') comma-separated k=v string to a JSON string.
+
+  Converts a comma-separated string of key/value pairs that supports
+  nesting of keys to a JSON string. Nesting is implemented using
+  '.' between levels for a given key.
+
+  Spacing between commas and = is supported (e.g. there is no difference between
+  "a=1,b=2", "a = 1, b = 2", or "a=1, b=2") but there should be no spaces before
+  keys or after values (e.g. " a=1,b=2" and "a=1,b=2 " are not supported).
+
+  Note that this will only support values supported by CSV, meaning
+  values such as nested lists (e.g. "a=[[1,2,3],[4,5,6]]") are not
+  supported. Strings are supported as well, e.g. "a='hello'".
+
+  An example conversion would be:
+
+  "a=1, b=2, c.a=2, c.b=3, d.a.a=5"
+
+  to
+
+  "{ a: 1, b : 2, c: {a : 2, b : 3}, d: {a: {a : 5}}}"
+
+  Args:
+    csv_str: the comma separated string.
+
+  Returns:
+    the converted JSON string.
+
+  Raises:
+    ValueError: If csv_str is not in a comma separated string or
+      if the string is formatted incorrectly.
+  """
+  if not csv_str:
+    return ''
+
+  formatted_entries = []
+  nested_map = collections.defaultdict(list)
+  pos = 0
+  while pos < len(csv_str):
+    m = _PARAM_RE.match(csv_str, pos)
+    if not m:
+      raise ValueError('Malformed hyperparameter value while parsing '
+                       'CSV string: %s' % csv_str[pos:])
+    pos = m.end()
+    # Parse the values.
+    m_dict = m.groupdict()
+    name = m_dict['name']
+    v = m_dict['val']
+
+    # If a GCS path (e.g. gs://...) is provided, wrap this in quotes
+    # as yaml.load would otherwise throw an exception
+    if re.match(r'(?=[^\"\'])(?=[gs://])', v):
+      v = '\'{}\''.format(v)
+
+    name_nested = name.split('.')
+    if len(name_nested) > 1:
+      grouping = name_nested[0]
+      value = '.'.join(name_nested[1:]) + '=' + v
+      nested_map[grouping].append(value)
+    else:
+      formatted_entries.append('%s : %s' % (name, v))
+
+  for grouping, value in nested_map.items():
+    value = ','.join(value)
+    value = nested_csv_str_to_json_str(value)
+    formatted_entries.append('%s : %s' % (grouping, value))
+  return '{' + ', '.join(formatted_entries) + '}'
+
+
+def override_params_dict(params, dict_or_string_or_yaml_file, is_strict):
+  """Override a given ParamsDict using a dict, JSON/YAML/CSV string or YAML file.
+
+  The logic of the function is outlined below:
+  1. Test that the input is a dict. If not, proceed to 2.
+  2. Tests that the input is a string. If not, raise unknown ValueError
+  2.1. Test if the string is in a CSV format. If so, parse.
+  If not, proceed to 2.2.
+  2.2. Try loading the string as a YAML/JSON. If successful, parse to
+  dict and use it to override. If not, proceed to 2.3.
+  2.3. Try using the string as a file path and load the YAML file.
+
+  Args:
+    params: a ParamsDict object to be overridden.
+    dict_or_string_or_yaml_file: a Python dict, JSON/YAML/CSV string or
+      path to a YAML file specifying the parameters to be overridden.
+    is_strict: a boolean specifying whether override is strict or not.
+
+  Returns:
+    params: the overridden ParamsDict object.
+
+  Raises:
+    ValueError: if failed to override the parameters.
+  """
+  if not dict_or_string_or_yaml_file:
+    return params
+  if isinstance(dict_or_string_or_yaml_file, dict):
+    params.override(dict_or_string_or_yaml_file, is_strict)
+  elif isinstance(dict_or_string_or_yaml_file, six.string_types):
+    try:
+      dict_or_string_or_yaml_file = (
+          nested_csv_str_to_json_str(dict_or_string_or_yaml_file))
+    except ValueError:
+      pass
+    params_dict = yaml.load(dict_or_string_or_yaml_file)
+    if isinstance(params_dict, dict):
+      params.override(params_dict, is_strict)
+    else:
+      with tf.io.gfile.GFile(dict_or_string_or_yaml_file) as f:
+        params.override(yaml.load(f), is_strict)
+  else:
+    raise ValueError('Unknown input type to parse.')
+  return params
@@ -0,0 +1,322 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for official.modeling.hyperparams.params_dict.py."""
+
+import os
+
+import tensorflow as tf
+import yaml
+
+from official.modeling.hyperparams import params_dict
+
+
+class ParamsDictTest(tf.test.TestCase):
+
+  def test_init_from_an_empty_dict(self):
+    params = params_dict.ParamsDict()
+    with self.assertRaises(AttributeError):
+      _ = params.a
+
+    with self.assertRaises(KeyError):
+      params.a = 'aa'
+
+  def test_init_from_a_dict(self):
+    params = params_dict.ParamsDict({'a': 'aa', 'b': 2})
+    self.assertEqual(params.a, 'aa')
+    self.assertEqual(params.b, 2)
+
+  def test_init_from_a_param_dict(self):
+    params_init = params_dict.ParamsDict({'a': 'aa', 'b': 2})
+    params = params_dict.ParamsDict(params_init)
+    self.assertEqual(params.a, 'aa')
+    self.assertEqual(params.b, 2)
+
+  def test_lock(self):
+    params = params_dict.ParamsDict({'a': 1, 'b': 2})
+    params.lock()
+    with self.assertRaises(ValueError):
+      params.a = 10
+    with self.assertRaises(ValueError):
+      params.override({'b': 20})
+
+  def test_setattr(self):
+    params = params_dict.ParamsDict()
+    params.override(
+        {'a': 'aa', 'b': 2, 'c': None}, is_strict=False)
+    params.c = 'ccc'
+    self.assertEqual(params.a, 'aa')
+    self.assertEqual(params.b, 2)
+    self.assertEqual(params.c, 'ccc')
+
+  def test_getattr(self):
+    params = params_dict.ParamsDict()
+    params.override(
+        {'a': 'aa', 'b': 2, 'c': None}, is_strict=False)
+    self.assertEqual(params.a, 'aa')
+    self.assertEqual(params.b, 2)
+    self.assertEqual(params.c, None)
+
+  def test_contains(self):
+    params = params_dict.ParamsDict()
+    params.override(
+        {'a': 'aa'}, is_strict=False)
+    self.assertIn('a', params)
+    self.assertNotIn('b', params)
+
+  def test_get(self):
+    params = params_dict.ParamsDict()
+    params.override(
+        {'a': 'aa'}, is_strict=False)
+    self.assertEqual(params.get('a'), 'aa')
+    self.assertEqual(params.get('b', 2), 2)
+    self.assertEqual(params.get('b'), None)
+
+  def test_override_is_strict_true(self):
+    params = params_dict.ParamsDict(
+        {'a': 'aa', 'b': 2, 'c': {'c1': 'cc', 'c2': 20}})
+    params.override({'a': 2, 'c': {'c1': 'ccc'}}, is_strict=True)
+    self.assertEqual(params.a, 2)
+    self.assertEqual(params.c.c1, 'ccc')
+    with self.assertRaises(KeyError):
+      params.override({'d': 'ddd'}, is_strict=True)
+    with self.assertRaises(KeyError):
+      params.override({'c': {'c3': 30}}, is_strict=True)
+
+  def test_override_is_strict_false(self):
+    params = params_dict.ParamsDict(
+        {'a': 'aa', 'b': 2, 'c': {'c1': 10, 'c2': 20}})
+    params.override({'a': 2, 'c': {'c3': 3000}}, is_strict=False)
+    self.assertEqual(params.a, 2)
+    self.assertEqual(params.c.c3, 3000)
+    params.override({'d': 'ddd'}, is_strict=False)
+    self.assertEqual(params.d, 'ddd')
+    params.override({'c': {'c4': 4444}}, is_strict=False)
+    self.assertEqual(params.c.c4, 4444)
+
+  def test_as_dict(self):
+    params = params_dict.ParamsDict(
+        {'a': 'aa', 'b': 2, 'c': {'c1': 10, 'c2': 20}})
+    params_d = params.as_dict()
+    self.assertEqual(params_d['a'], 'aa')
+    self.assertEqual(params_d['b'], 2)
+    self.assertEqual(params_d['c']['c1'], 10)
+    self.assertEqual(params_d['c']['c2'], 20)
+
+  def test_validate(self):
+    # Raise error due to the unknown parameter.
+    with self.assertRaises(KeyError):
+      params = params_dict.ParamsDict(
+          {'a': 1, 'b': {'a': 11}}, ['a == c'])
+
+    # OK to check equality of two nested dicts.
+    params = params_dict.ParamsDict(
+        {'a': 1, 'b': {'a': 10}, 'c': {'a': 10}}, ['b == c'])
+
+    # Raise error due to inconsistency
+    with self.assertRaises(KeyError):
+      params = params_dict.ParamsDict(
+          {'a': 1, 'c': {'a': 10}}, ['a == c.a'])
+
+    # Valid rule.
+    params = params_dict.ParamsDict(
+        {'a': 1, 'c': {'a': 1}}, ['a == c.a'])
+
+    # Overridding violates the existing rule, raise error upon validate.
+    params.override({'a': 11})
+    with self.assertRaises(KeyError):
+      params.validate()
+
+
+class ParamsDictIOTest(tf.test.TestCase):
+
+  def write_temp_file(self, filename, text):
+    temp_file = os.path.join(self.get_temp_dir(), filename)
+    with tf.io.gfile.GFile(temp_file, 'w') as writer:
+      writer.write(text)
+    return temp_file
+
+  def test_save_params_dict_to_yaml(self):
+    params = params_dict.ParamsDict(
+        {'a': 'aa', 'b': 2, 'c': {'c1': 10, 'c2': 20}})
+    output_yaml_file = os.path.join(self.get_temp_dir(), 'params.yaml')
+    params_dict.save_params_dict_to_yaml(params, output_yaml_file)
+
+    with tf.io.gfile.GFile(output_yaml_file, 'r') as f:
+      params_d = yaml.load(f)
+      self.assertEqual(params.a, params_d['a'])
+      self.assertEqual(params.b, params_d['b'])
+      self.assertEqual(params.c.c1, params_d['c']['c1'])
+      self.assertEqual(params.c.c2, params_d['c']['c2'])
+
+  def test_read_yaml_to_params_dict(self):
+    input_yaml_file = self.write_temp_file(
+        'params.yaml', r"""
+        a: 'aa'
+        b: 2
+        c:
+          c1: 10
+          c2: 20
+    """)
+    params = params_dict.read_yaml_to_params_dict(input_yaml_file)
+
+    self.assertEqual(params.a, 'aa')
+    self.assertEqual(params.b, 2)
+    self.assertEqual(params.c.c1, 10)
+    self.assertEqual(params.c.c2, 20)
+
+  def test_override_params_dict_using_dict(self):
+    params = params_dict.ParamsDict({
+        'a': 1, 'b': 2.5, 'c': [3, 4], 'd': 'hello', 'e': False})
+    override_dict = {'b': 5.2, 'c': [30, 40]}
+    params = params_dict.override_params_dict(
+        params, override_dict, is_strict=True)
+    self.assertEqual(1, params.a)
+    self.assertEqual(5.2, params.b)
+    self.assertEqual([30, 40], params.c)
+    self.assertEqual('hello', params.d)
+    self.assertEqual(False, params.e)
+
+  def test_override_params_dict_using_yaml_string(self):
+    params = params_dict.ParamsDict({
+        'a': 1, 'b': 2.5, 'c': [3, 4], 'd': 'hello', 'e': False})
+    override_yaml_string = "'b': 5.2\n'c': [30, 40]"
+    params = params_dict.override_params_dict(
+        params, override_yaml_string, is_strict=True)
+    self.assertEqual(1, params.a)
+    self.assertEqual(5.2, params.b)
+    self.assertEqual([30, 40], params.c)
+    self.assertEqual('hello', params.d)
+    self.assertEqual(False, params.e)
+
+  def test_override_params_dict_using_json_string(self):
+    params = params_dict.ParamsDict({
+        'a': 1, 'b': {'b1': 2, 'b2': [2, 3],},
+        'd': {'d1': {'d2': 'hello'}}, 'e': False})
+    override_json_string = "{ b: { b2: [3, 4] }, d: { d1: { d2: 'hi' } } }"
+    params = params_dict.override_params_dict(
+        params, override_json_string, is_strict=True)
+    self.assertEqual(1, params.a)
+    self.assertEqual(2, params.b.b1)
+    self.assertEqual([3, 4], params.b.b2)
+    self.assertEqual('hi', params.d.d1.d2)
+    self.assertEqual(False, params.e)
+
+  def test_override_params_dict_using_csv_string(self):
+    params = params_dict.ParamsDict({
+        'a': 1, 'b': {'b1': 2, 'b2': [2, 3],},
+        'd': {'d1': {'d2': 'hello'}}, 'e': False})
+    override_csv_string = "b.b2=[3,4], d.d1.d2='hi, world', e=gs://test"
+    params = params_dict.override_params_dict(
+        params, override_csv_string, is_strict=True)
+    self.assertEqual(1, params.a)
+    self.assertEqual(2, params.b.b1)
+    self.assertEqual([3, 4], params.b.b2)
+    self.assertEqual('hi, world', params.d.d1.d2)
+    self.assertEqual('gs://test', params.e)
+
+  def test_override_params_dict_using_yaml_file(self):
+    params = params_dict.ParamsDict({
+        'a': 1, 'b': 2.5, 'c': [3, 4], 'd': 'hello', 'e': False})
+    override_yaml_file = self.write_temp_file(
+        'params.yaml', r"""
+        b: 5.2
+        c: [30, 40]
+        """)
+    params = params_dict.override_params_dict(
+        params, override_yaml_file, is_strict=True)
+    self.assertEqual(1, params.a)
+    self.assertEqual(5.2, params.b)
+    self.assertEqual([30, 40], params.c)
+    self.assertEqual('hello', params.d)
+    self.assertEqual(False, params.e)
+
+
+class IOTest(tf.test.TestCase):
+
+  def test_basic_csv_str_to_json_str(self):
+    csv_str = 'a=1,b=2,c=3'
+    json_str = '{a : 1, b : 2, c : 3}'
+    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
+    self.assertEqual(converted_csv_str, json_str)
+
+  def test_basic_csv_str_load(self):
+    csv_str = 'a=1,b=2,c=3'
+    expected_output = {'a': 1, 'b': 2, 'c': 3}
+    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
+    converted_dict = yaml.load(converted_csv_str)
+    self.assertDictEqual(converted_dict, expected_output)
+
+  def test_basic_nested_csv_str_to_json_str(self):
+    csv_str = 'a=1,b.b1=2'
+    json_str = '{a : 1, b : {b1 : 2}}'
+    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
+    self.assertEqual(converted_csv_str, json_str)
+
+  def test_basic_nested_csv_str_load(self):
+    csv_str = 'a=1,b.b1=2,c.c1=3'
+    expected_output = {'a': 1, 'b': {'b1': 2}, 'c': {'c1': 3}}
+    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
+    converted_dict = yaml.load(converted_csv_str)
+    self.assertDictEqual(converted_dict, expected_output)
+
+  def test_complex_nested_csv_str_to_json_str(self):
+    csv_str = 'a.aa.aaa.aaaaa.a=1'
+    json_str = '{a : {aa : {aaa : {aaaaa : {a : 1}}}}}'
+    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
+    self.assertEqual(converted_csv_str, json_str)
+
+  def test_complex_nested_csv_str_load(self):
+    csv_str = 'a.aa.aaa.aaaaa.a=1,a.a=2'
+    expected_output = {'a': {'aa': {'aaa': {'aaaaa': {'a': 1}}}, 'a': 2}}
+    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
+    converted_dict = yaml.load(converted_csv_str)
+    self.assertDictEqual(converted_dict, expected_output)
+
+  def test_csv_str_load_supported_datatypes(self):
+    csv_str = 'a=1,b=2.,c=[1,2,3],d=\'hello, there\',e=\"Hi.\"'
+    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
+    converted_dict = yaml.load(converted_csv_str)
+    self.assertEqual(converted_dict['a'], 1)
+    self.assertEqual(converted_dict['b'], 2.)
+    self.assertEqual(converted_dict['c'], [1, 2, 3])
+    self.assertEqual(converted_dict['d'], 'hello, there')
+    self.assertEqual(converted_dict['e'], 'Hi.')
+
+  def test_csv_str_load_unsupported_datatypes(self):
+    csv_str = 'a=[[1,2,3],[4,5,6]]'
+    self.assertRaises(ValueError,
+                      params_dict.nested_csv_str_to_json_str,
+                      csv_str)
+
+  def test_csv_str_to_json_str_spacing(self):
+    csv_str1 = 'a=1,b=2,c=3'
+    csv_str2 = 'a = 1, b = 2, c = 3'
+    json_str = '{a : 1, b : 2, c : 3}'
+    converted_csv_str1 = params_dict.nested_csv_str_to_json_str(csv_str1)
+    converted_csv_str2 = params_dict.nested_csv_str_to_json_str(csv_str2)
+    self.assertEqual(converted_csv_str1, converted_csv_str2)
+    self.assertEqual(converted_csv_str1, json_str)
+    self.assertEqual(converted_csv_str2, json_str)
+
+  def test_gcs_added_quotes(self):
+    csv_str = 'a=gs://abc, b=gs://def'
+    expected_output = '{a : \'gs://abc\', b : \'gs://def\'}'
+    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
+    self.assertEqual(converted_csv_str, expected_output)
+
+
+# When executed as a script, hand control to `tf.test.main()`.
+if __name__ == '__main__':
+  tf.test.main()
@@ -0,0 +1,491 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A light weight utilities to train NLP models."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import os
+import tempfile
+
+from absl import logging
+import tensorflow as tf
+from official.staging.training import grad_utils
+from official.utils.misc import distribution_utils
+
+# File name, inside the summary directory, that `write_txt_summary` writes to.
+_SUMMARY_TXT = 'training_summary.txt'
+# A train summary writer is only created when `steps_per_loop` is at least this
+# value.
+_MIN_SUMMARY_STEPS = 10
+
+
+def _should_export_checkpoint(strategy):
+  return (not strategy) or strategy.extended.should_checkpoint
+
+
+def _should_export_summary(strategy):
+  return (not strategy) or strategy.extended.should_save_summary
+
+
+def _save_checkpoint(strategy, checkpoint, model_dir, checkpoint_prefix):
+  """Saves model to with provided checkpoint prefix."""
+
+  if _should_export_checkpoint(strategy):
+    checkpoint_path = os.path.join(model_dir, checkpoint_prefix)
+    saved_path = checkpoint.save(checkpoint_path)
+    logging.info('Saving model as TF checkpoint: %s', saved_path)
+  else:
+    # In multi worker training we need every worker to save checkpoint, because
+    # variables can trigger synchronization on read and synchronization needs
+    # all workers to participate. To avoid workers overriding each other we save
+    # to a temporary directory on non-chief workers.
+    tmp_dir = tempfile.mkdtemp()
+    checkpoint.save(os.path.join(tmp_dir, 'ckpt'))
+    tf.io.gfile.rmtree(tmp_dir)
+  return
+
+
+def _get_input_iterator(input_fn, strategy):
+  """Returns distributed dataset iterator."""
+  # When training with TPU pods, datasets needs to be cloned across
+  # workers. Since Dataset instance cannot be cloned in eager mode, we instead
+  # pass callable that returns a dataset.
+  if not callable(input_fn):
+    raise ValueError('`input_fn` should be a closure that returns a dataset.')
+  iterator = iter(
+      strategy.experimental_distribute_datasets_from_function(input_fn))
+  return iterator
+
+
+def _float_metric_value(metric):
+  """Gets the value of a float-value keras metric."""
+  return metric.result().numpy().astype(float)
+
+
+def steps_to_run(current_step, steps_per_epoch, steps_per_loop):
+  """Calculates steps to run on device."""
+  if steps_per_loop <= 0:
+    raise ValueError('steps_per_loop should be positive integer.')
+  if steps_per_loop == 1:
+    return steps_per_loop
+  remainder_in_epoch = current_step % steps_per_epoch
+  if remainder_in_epoch != 0:
+    return min(steps_per_epoch - remainder_in_epoch, steps_per_loop)
+  else:
+    return steps_per_loop
+
+
+def write_txt_summary(training_summary, summary_dir):
+  """Writes a summary text file to record stats."""
+  summary_path = os.path.join(summary_dir, _SUMMARY_TXT)
+  with tf.io.gfile.GFile(summary_path, 'wb') as f:
+    logging.info('Training Summary: \n%s', str(training_summary))
+    f.write(json.dumps(training_summary, indent=4))
+
+
+def run_customized_training_loop(
+    # pylint: disable=invalid-name
+    _sentinel=None,
+    # pylint: enable=invalid-name
+    strategy=None,
+    model_fn=None,
+    loss_fn=None,
+    scale_loss=True,
+    model_dir=None,
+    train_input_fn=None,
+    steps_per_epoch=None,
+    steps_per_loop=1,
+    epochs=1,
+    eval_input_fn=None,
+    eval_steps=None,
+    metric_fn=None,
+    init_checkpoint=None,
+    custom_callbacks=None,
+    run_eagerly=False,
+    sub_model_export_name=None,
+    explicit_allreduce=False,
+    pre_allreduce_callbacks=None,
+    post_allreduce_callbacks=None):
+  """Run BERT pretrain model training using low-level API.
+
+  Arguments:
+      _sentinel: Used to prevent positional parameters. Internal, do not use.
+      strategy: Distribution strategy on which to run low level training loop.
+      model_fn: Function that returns a tuple (model, sub_model). Caller of this
+        function should add optimizer to the `model` via calling
+        `model.compile()` API or manually setting `model.optimizer` attribute.
+        Second element of the returned tuple(sub_model) is an optional sub model
+        to be used for initial checkpoint -- if provided.
+      loss_fn: Function with signature func(labels, logits) and returns a loss
+        tensor.
+      scale_loss: Whether to divide the raw loss by number of replicas before
+        gradients calculation.
+      model_dir: Model directory used during training for restoring/saving model
+        weights.
+      train_input_fn: Function that returns a tf.data.Dataset used for training.
+      steps_per_epoch: Number of steps to run per epoch. At the end of each
+        epoch, model checkpoint will be saved and evaluation will be conducted
+        if evaluation dataset is provided.
+      steps_per_loop: Number of steps per graph-mode loop. In order to reduce
+        communication in eager context, training logs are printed every
+        steps_per_loop.
+      epochs: Number of epochs to train.
+      eval_input_fn: Function that returns evaluation dataset. If none,
+        evaluation is skipped.
+      eval_steps: Number of steps to run evaluation. Required if `eval_input_fn`
+        is not none.
+      metric_fn: A metrics function that returns a Keras Metric object to record
+        evaluation result using evaluation dataset or with training dataset
+        after every epoch.
+      init_checkpoint: Optional checkpoint to load to `sub_model` returned by
+        `model_fn`.
+      custom_callbacks: A list of Keras Callbacks objects to run during
+        training. More specifically, `on_batch_begin()`, `on_batch_end()`,
+        methods are invoked during training.
+      run_eagerly: Whether to run model training in pure eager execution. This
+        should be disable for TPUStrategy.
+      sub_model_export_name: If not None, will export `sub_model` returned by
+        `model_fn` into checkpoint files. The name of intermediate checkpoint
+        file is {sub_model_export_name}_step_{step}.ckpt and the last
+        checkpint's name is {sub_model_export_name}.ckpt;
+        if None, `sub_model` will not be exported as checkpoint.
+      explicit_allreduce: Whether to explicitly perform gradient allreduce,
+        instead of relying on implicit allreduce in optimizer.apply_gradients().
+        default is False. For now, if training using FP16 mixed precision,
+        explicit allreduce will aggregate gradients in FP16 format. For TPU and
+        GPU training using FP32, explicit allreduce will aggregate gradients in
+        FP32 format.
+      pre_allreduce_callbacks: A list of callback functions that takes gradients
+        and model variables pairs as input, manipulate them, and returns a new
+        gradients and model variables paris. The callback functions will be
+        invoked in the list order and before gradients are allreduced.
+        With mixed precision training, the pre_allreduce_allbacks will be
+        applied on scaled_gradients. Default is no callbacks.
+        Only used when explicit_allreduce=True.
+      post_allreduce_callbacks: A list of callback functions that takes
+        gradients and model variables pairs as input, manipulate them, and
+        returns a new gradients and model variables paris. The callback
+        functions will be invoked in the list order and right before gradients
+        are applied to variables for updates. Default is no callbacks. Only used
+        when explicit_allreduce=True.
+
+  Returns:
+      Trained model.
+
+  Raises:
+      ValueError: (1) When model returned by `model_fn` does not have optimizer
+        attribute or when required parameters are set to none. (2) eval args are
+        not specified correctly. (3) metric_fn must be a callable if specified.
+        (4) sub_model_checkpoint_name is specified, but `sub_model` returned
+        by `model_fn` is None.
+  """
+
+  if _sentinel is not None:
+    raise ValueError('only call `run_customized_training_loop()` '
+                     'with named arguments.')
+
+  required_arguments = [
+      strategy, model_fn, loss_fn, model_dir, steps_per_epoch, train_input_fn
+  ]
+  if [arg for arg in required_arguments if arg is None]:
+    raise ValueError('`strategy`, `model_fn`, `loss_fn`, `model_dir`, '
+                     '`steps_per_loop` and `steps_per_epoch` are required '
+                     'parameters.')
+  if steps_per_loop > steps_per_epoch:
+    logging.error(
+        'steps_per_loop: %d is specified to be greater than '
+        ' steps_per_epoch: %d, we will use steps_per_epoch as'
+        ' steps_per_loop.', steps_per_loop, steps_per_epoch)
+    steps_per_loop = steps_per_epoch
+  assert tf.executing_eagerly()
+
+  if run_eagerly:
+    if isinstance(strategy, tf.distribute.experimental.TPUStrategy):
+      raise ValueError(
+          'TPUStrategy should not run eagerly as it heavily relies on graph'
+          ' optimization for the distributed system.')
+
+  if eval_input_fn and (eval_steps is None or metric_fn is None):
+    raise ValueError(
+        '`eval_step` and `metric_fn` are required when `eval_input_fn ` '
+        'is not none.')
+  if metric_fn and not callable(metric_fn):
+    raise ValueError(
+        'if `metric_fn` is specified, metric_fn must be a callable.')
+
+  total_training_steps = steps_per_epoch * epochs
+  train_iterator = _get_input_iterator(train_input_fn, strategy)
+
+  with distribution_utils.get_strategy_scope(strategy):
+    # To correctly place the model weights on accelerators,
+    # model and optimizer should be created in scope.
+    model, sub_model = model_fn()
+    if not hasattr(model, 'optimizer'):
+      raise ValueError('User should set optimizer attribute to model '
+                       'inside `model_fn`.')
+    if sub_model_export_name and sub_model is None:
+      raise ValueError('sub_model_export_name is specified as %s, but '
+                       'sub_model is None.' % sub_model_export_name)
+
+    optimizer = model.optimizer
+
+    if init_checkpoint:
+      logging.info(
+          'Checkpoint file %s found and restoring from '
+          'initial checkpoint for core model.', init_checkpoint)
+      checkpoint = tf.train.Checkpoint(model=sub_model)
+      checkpoint.restore(init_checkpoint).assert_existing_objects_matched()
+      logging.info('Loading from checkpoint file completed')
+
+    train_loss_metric = tf.keras.metrics.Mean(
+        'training_loss', dtype=tf.float32)
+    eval_metrics = [metric_fn()] if metric_fn else []
+    # If evaluation is required, make a copy of metric as it will be used by
+    # both train and evaluation.
+    train_metrics = [
+        metric.__class__.from_config(metric.get_config())
+        for metric in eval_metrics
+    ]
+
+    # Create summary writers
+    if _should_export_summary(strategy):
+      summary_dir = os.path.join(model_dir, 'summaries')
+    else:
+      # In multi worker training we need every worker to write summary, because
+      # variables can trigger synchronization on read and synchronization needs
+      # all workers to participate.
+      summary_dir = tempfile.mkdtemp()
+    eval_summary_writer = tf.summary.create_file_writer(
+        os.path.join(summary_dir, 'eval'))
+    if steps_per_loop >= _MIN_SUMMARY_STEPS:
+      # Only writes summary when the stats are collected sufficiently over
+      # enough steps.
+      train_summary_writer = tf.summary.create_file_writer(
+          os.path.join(summary_dir, 'train'))
+    else:
+      train_summary_writer = None
+
+    # Collects training variables.
+    training_vars = model.trainable_variables
+
+    def _replicated_step(inputs):
+      """Replicated training step."""
+
+      inputs, labels = inputs
+      with tf.GradientTape() as tape:
+        model_outputs = model(inputs, training=True)
+        loss = loss_fn(labels, model_outputs)
+        # Raw loss is used for reporting in metrics/logs.
+        raw_loss = loss
+        if scale_loss:
+          # Scales down the loss for gradients to be invariant from replicas.
+          loss = loss / strategy.num_replicas_in_sync
+
+      if explicit_allreduce:
+        grad_utils.minimize_using_explicit_allreduce(tape, optimizer, loss,
+                                                     training_vars,
+                                                     pre_allreduce_callbacks,
+                                                     post_allreduce_callbacks)
+      else:
+        if isinstance(optimizer,
+                      tf.keras.mixed_precision.experimental.LossScaleOptimizer):
+          with tape:
+            scaled_loss = optimizer.get_scaled_loss(loss)
+          scaled_grads = tape.gradient(scaled_loss, training_vars)
+          grads = optimizer.get_unscaled_gradients(scaled_grads)
+        else:
+          grads = tape.gradient(loss, training_vars)
+        optimizer.apply_gradients(zip(grads, training_vars))
+      # For reporting, the metric takes the mean of losses.
+      train_loss_metric.update_state(raw_loss)
+      for metric in train_metrics:
+        metric.update_state(labels, model_outputs)
+
+    @tf.function
+    def train_steps(iterator, steps):
+      """Performs distributed training steps in a loop.
+
+      Args:
+        iterator: the distributed iterator of training datasets.
+        steps: an tf.int32 integer tensor to specify number of steps to run
+          inside host training loop.
+
+      Raises:
+        ValueError: Any of the arguments or tensor shapes are invalid.
+      """
+      if not isinstance(steps, tf.Tensor):
+        raise ValueError('steps should be an Tensor. Python object may cause '
+                         'retracing.')
+
+      for _ in tf.range(steps):
+        strategy.run(_replicated_step, args=(next(iterator),))
+
+    def train_single_step(iterator):
+      """Performs a distributed training step.
+
+      Args:
+        iterator: the distributed iterator of training datasets.
+
+      Raises:
+        ValueError: Any of the arguments or tensor shapes are invalid.
+      """
+      strategy.run(_replicated_step, args=(next(iterator),))
+
+    def test_step(iterator):
+      """Calculates evaluation metrics on distributed devices."""
+
+      def _test_step_fn(inputs):
+        """Replicated accuracy calculation."""
+
+        inputs, labels = inputs
+        model_outputs = model(inputs, training=False)
+        for metric in eval_metrics:
+          metric.update_state(labels, model_outputs)
+
+      strategy.run(_test_step_fn, args=(next(iterator),))
+
+    if not run_eagerly:
+      train_single_step = tf.function(train_single_step)
+      test_step = tf.function(test_step)
+
+    def _run_evaluation(current_training_step, test_iterator):
+      """Runs validation steps and aggregate metrics."""
+      for _ in range(eval_steps):
+        test_step(test_iterator)
+
+      with eval_summary_writer.as_default():
+        for metric in eval_metrics + model.metrics:
+          metric_value = _float_metric_value(metric)
+          logging.info('Step: [%d] Validation %s = %f', current_training_step,
+                       metric.name, metric_value)
+          tf.summary.scalar(
+              metric.name, metric_value, step=current_training_step)
+        eval_summary_writer.flush()
+
+    def _run_callbacks_on_batch_begin(batch):
+      """Runs custom callbacks at the start of every step."""
+      if not custom_callbacks:
+        return
+      for callback in custom_callbacks:
+        callback.on_batch_begin(batch)
+
+    def _run_callbacks_on_batch_end(batch, logs):
+      """Runs custom callbacks at the end of every step."""
+      if not custom_callbacks:
+        return
+      for callback in custom_callbacks:
+        callback.on_batch_end(batch, logs)
+
+    # Training loop starts here.
+    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
+    sub_model_checkpoint = tf.train.Checkpoint(
+        model=sub_model) if sub_model_export_name else None
+
+    latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
+    if latest_checkpoint_file:
+      logging.info(
+          'Checkpoint file %s found and restoring from '
+          'checkpoint', latest_checkpoint_file)
+      checkpoint.restore(latest_checkpoint_file)
+      logging.info('Loading from checkpoint file completed')
+
+    current_step = optimizer.iterations.numpy()
+    checkpoint_name = 'ctl_step_{step}.ckpt'
+
+    while current_step < total_training_steps:
+      # Training loss/metric are taking average over steps inside micro
+      # training loop. We reset the their values before each round.
+      train_loss_metric.reset_states()
+      for metric in train_metrics + model.metrics:
+        metric.reset_states()
+
+      _run_callbacks_on_batch_begin(current_step)
+      # Runs several steps in the host while loop.
+      steps = steps_to_run(current_step, steps_per_epoch, steps_per_loop)
+
+      if tf.test.is_built_with_cuda():
+        # TODO(zongweiz): merge with train_steps once tf.while_loop
+        # GPU performance bugs are fixed.
+        for _ in range(steps):
+          train_single_step(train_iterator)
+      else:
+        # Converts steps to a Tensor to avoid tf.function retracing.
+        train_steps(train_iterator,
+                    tf.convert_to_tensor(steps, dtype=tf.int32))
+      train_loss = _float_metric_value(train_loss_metric)
+      current_step += steps
+      _run_callbacks_on_batch_end(current_step - 1, {'loss': train_loss})
+
+      # Updates training logging.
+      training_status = 'Train Step: %d/%d  / loss = %s' % (
+          current_step, total_training_steps, train_loss)
+
+      if train_summary_writer:
+        with train_summary_writer.as_default():
+          tf.summary.scalar(
+              train_loss_metric.name, train_loss, step=current_step)
+          for metric in train_metrics + model.metrics:
+            metric_value = _float_metric_value(metric)
+            training_status += '  %s = %f' % (metric.name, metric_value)
+            tf.summary.scalar(metric.name, metric_value, step=current_step)
+          train_summary_writer.flush()
+      logging.info(training_status)
+
+      # Saves model checkpoints and run validation steps at every epoch end.
+      if current_step % steps_per_epoch == 0:
+        # To avoid repeated model saving, we do not save after the last
+        # step of training.
+        if current_step < total_training_steps:
+          _save_checkpoint(strategy, checkpoint, model_dir,
+                           checkpoint_name.format(step=current_step))
+          if sub_model_export_name:
+            _save_checkpoint(
+                strategy, sub_model_checkpoint, model_dir,
+                '%s_step_%d.ckpt' % (sub_model_export_name, current_step))
+        if eval_input_fn:
+          logging.info('Running evaluation after step: %s.', current_step)
+          _run_evaluation(current_step,
+                          _get_input_iterator(eval_input_fn, strategy))
+          # Re-initialize evaluation metric.
+          for metric in eval_metrics + model.metrics:
+            metric.reset_states()
+
+    _save_checkpoint(strategy, checkpoint, model_dir,
+                     checkpoint_name.format(step=current_step))
+    if sub_model_export_name:
+      _save_checkpoint(strategy, sub_model_checkpoint, model_dir,
+                       '%s.ckpt' % sub_model_export_name)
+
+    if eval_input_fn:
+      logging.info('Running final evaluation after training is complete.')
+      _run_evaluation(current_step,
+                      _get_input_iterator(eval_input_fn, strategy))
+
+    training_summary = {
+        'total_training_steps': total_training_steps,
+        'train_loss': _float_metric_value(train_loss_metric),
+    }
+    if eval_metrics:
+      # TODO(hongkuny): Cleans up summary reporting in text.
+      training_summary['last_train_metrics'] = _float_metric_value(
+          train_metrics[0])
+      training_summary['eval_metrics'] = _float_metric_value(eval_metrics[0])
+
+    write_txt_summary(training_summary, summary_dir)
+
+    if not _should_export_summary(strategy):
+      tf.io.gfile.rmtree(summary_dir)
+
+    return model
@@ -0,0 +1,235 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for official.modeling.training.model_training_utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from absl.testing import parameterized
+from absl.testing.absltest import mock
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.python.distribute import combinations
+from tensorflow.python.distribute import strategy_combinations
+from official.modeling import model_training_utils
+
+
+def eager_strategy_combinations():
+  return combinations.combine(
+      distribution=[
+          strategy_combinations.default_strategy,
+          strategy_combinations.tpu_strategy,
+          strategy_combinations.one_device_strategy_gpu,
+          strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
+          strategy_combinations.mirrored_strategy_with_two_gpus,
+      ],
+      mode='eager',
+  )
+
+
+def eager_gpu_strategy_combinations():
+  return combinations.combine(
+      distribution=[
+          strategy_combinations.default_strategy,
+          strategy_combinations.one_device_strategy_gpu,
+          strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
+          strategy_combinations.mirrored_strategy_with_two_gpus,
+      ],
+      mode='eager',
+  )
+
+
+def create_fake_data_input_fn(batch_size, features_shape, num_classes):
+  """Creates a dummy input function with the given feature and label shapes.
+
+  Args:
+    batch_size: integer.
+    features_shape: list[int]. Feature shape for an individual example.
+    num_classes: integer. Number of labels.
+
+  Returns:
+    An input function that is usable in the executor.
+  """
+
+  def _dataset_fn(input_context=None):
+    """An input function for generating fake data."""
+    local_batch_size = input_context.get_per_replica_batch_size(batch_size)
+    features = np.random.rand(64, *features_shape)
+    labels = np.random.randint(2, size=[64, num_classes])
+    # Convert the inputs to a Dataset.
+    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
+    dataset = dataset.shard(input_context.num_input_pipelines,
+                            input_context.input_pipeline_id)
+
+    def _assign_dtype(features, labels):
+      features = tf.cast(features, tf.float32)
+      labels = tf.cast(labels, tf.float32)
+      return features, labels
+
+    # Shuffle, repeat, and batch the examples.
+    dataset = dataset.map(_assign_dtype)
+    dataset = dataset.shuffle(64).repeat()
+    dataset = dataset.batch(local_batch_size, drop_remainder=True)
+    dataset = dataset.prefetch(buffer_size=64)
+    return dataset
+
+  return _dataset_fn
+
+
+def create_model_fn(input_shape, num_classes, use_float16=False):
+
+  def _model_fn():
+    """A one-layer softmax model suitable for testing."""
+    input_layer = tf.keras.layers.Input(shape=input_shape)
+    x = tf.keras.layers.Dense(num_classes, activation='relu')(input_layer)
+    output_layer = tf.keras.layers.Dense(num_classes, activation='softmax')(x)
+    sub_model = tf.keras.models.Model(input_layer, x, name='sub_model')
+    model = tf.keras.models.Model(input_layer, output_layer, name='model')
+    model.add_metric(
+        tf.reduce_mean(input_layer), name='mean_input', aggregation='mean')
+    model.optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)
+    if use_float16:
+      model.optimizer = (
+          tf.keras.mixed_precision.experimental.LossScaleOptimizer(
+              model.optimizer, loss_scale='dynamic'))
+    return model, sub_model
+
+  return _model_fn
+
+
+def metric_fn():
+  """Gets a tf.keras metric object."""
+  return tf.keras.metrics.CategoricalAccuracy(name='accuracy', dtype=tf.float32)
+
+
+def summaries_with_matching_keyword(keyword, summary_dir):
+  """Yields summary protos matching given keyword from event file."""
+  event_paths = tf.io.gfile.glob(os.path.join(summary_dir, 'events*'))
+  for event in tf.compat.v1.train.summary_iterator(event_paths[-1]):
+    if event.summary is not None:
+      for value in event.summary.value:
+        if keyword in value.tag:
+          tf.compat.v1.logging.error(event)
+          yield event.summary
+
+
+def check_eventfile_for_keyword(keyword, summary_dir):
+  """Checks event files for the keyword."""
+  return any(summaries_with_matching_keyword(keyword, summary_dir))
+
+
+class ModelTrainingUtilsTest(tf.test.TestCase, parameterized.TestCase):
+
+  def setUp(self):
+    super(ModelTrainingUtilsTest, self).setUp()
+    self._model_fn = create_model_fn(input_shape=[128], num_classes=3)
+
+  def run_training(self, strategy, model_dir, steps_per_loop, run_eagerly):
+    input_fn = create_fake_data_input_fn(
+        batch_size=8, features_shape=[128], num_classes=3)
+    model_training_utils.run_customized_training_loop(
+        strategy=strategy,
+        model_fn=self._model_fn,
+        loss_fn=tf.keras.losses.categorical_crossentropy,
+        model_dir=model_dir,
+        steps_per_epoch=20,
+        steps_per_loop=steps_per_loop,
+        epochs=2,
+        train_input_fn=input_fn,
+        eval_input_fn=input_fn,
+        eval_steps=10,
+        init_checkpoint=None,
+        metric_fn=metric_fn,
+        custom_callbacks=None,
+        run_eagerly=run_eagerly)
+
+  @combinations.generate(eager_strategy_combinations())
+  def test_train_eager_single_step(self, distribution):
+    model_dir = self.get_temp_dir()
+    if isinstance(distribution, tf.distribute.experimental.TPUStrategy):
+      with self.assertRaises(ValueError):
+        self.run_training(
+            distribution, model_dir, steps_per_loop=1, run_eagerly=True)
+    else:
+      self.run_training(
+          distribution, model_dir, steps_per_loop=1, run_eagerly=True)
+
+  @combinations.generate(eager_gpu_strategy_combinations())
+  def test_train_eager_mixed_precision(self, distribution):
+    model_dir = self.get_temp_dir()
+    policy = tf.keras.mixed_precision.experimental.Policy('mixed_float16')
+    tf.keras.mixed_precision.experimental.set_policy(policy)
+    self._model_fn = create_model_fn(
+        input_shape=[128], num_classes=3, use_float16=True)
+    self.run_training(
+        distribution, model_dir, steps_per_loop=1, run_eagerly=True)
+
+  @combinations.generate(eager_strategy_combinations())
+  def test_train_check_artifacts(self, distribution):
+    model_dir = self.get_temp_dir()
+    self.run_training(
+        distribution, model_dir, steps_per_loop=10, run_eagerly=False)
+
+    # Two checkpoints should be saved after two epochs.
+    self.assertNotEmpty(tf.io.gfile.glob(os.path.join(model_dir, 'ctl_step_*')))
+    self.assertNotEmpty(
+        tf.io.gfile.glob(
+            os.path.join(model_dir, 'summaries/training_summary*')))
+
+    # Loss and accuracy values should be written into summaries.
+    self.assertTrue(
+        check_eventfile_for_keyword('loss',
+                                    os.path.join(model_dir, 'summaries/train')))
+    self.assertTrue(
+        check_eventfile_for_keyword('accuracy',
+                                    os.path.join(model_dir, 'summaries/train')))
+    self.assertTrue(
+        check_eventfile_for_keyword('mean_input',
+                                    os.path.join(model_dir, 'summaries/train')))
+    self.assertTrue(
+        check_eventfile_for_keyword('accuracy',
+                                    os.path.join(model_dir, 'summaries/eval')))
+    self.assertTrue(
+        check_eventfile_for_keyword('mean_input',
+                                    os.path.join(model_dir, 'summaries/eval')))
+
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              strategy_combinations.one_device_strategy_gpu,
+          ],
+          mode='eager',
+      ))
+  def test_train_check_artifacts_non_chief(self, distribution):
+    # We shouldn't export artifacts on non-chief workers. Since there's no easy
+    # way to test with real MultiWorkerMirroredStrategy, we patch the strategy
+    # to make it as if it's MultiWorkerMirroredStrategy on non-chief workers.
+    extended = distribution.extended
+    with mock.patch.object(extended.__class__, 'should_checkpoint',
+                           new_callable=mock.PropertyMock, return_value=False), \
+         mock.patch.object(extended.__class__, 'should_save_summary',
+                           new_callable=mock.PropertyMock, return_value=False):
+      model_dir = self.get_temp_dir()
+      self.run_training(
+          distribution, model_dir, steps_per_loop=10, run_eagerly=False)
+      self.assertEmpty(tf.io.gfile.listdir(model_dir))
+
+
+# When executed as a script, hand control to `tf.test.main()`.
+if __name__ == '__main__':
+  tf.test.main()
@@ -0,0 +1,56 @@
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions and classes related to training performance."""
+
+import tensorflow as tf
+
+
+def configure_optimizer(optimizer,
+                        use_float16=False,
+                        use_graph_rewrite=False,
+                        loss_scale="dynamic"):
+  """Configures optimizer object with performance options."""
+  if use_float16:
+    # Wraps optimizer with a LossScaleOptimizer. This is done automatically
+    # in compile() with the "mixed_float16" policy, but since we do not call
+    # compile(), we must wrap the optimizer manually.
+    optimizer = (
+        tf.keras.mixed_precision.experimental.LossScaleOptimizer(
+            optimizer, loss_scale=loss_scale))
+  if use_graph_rewrite:
+    # Note: the model dtype must be 'float32', which will ensure
+    # tf.ckeras.mixed_precision and
+    # tf.train.experimental.enable_mixed_precision_graph_rewrite do not double
+    # up.
+    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
+        optimizer)
+  return optimizer
+
+
+def set_mixed_precision_policy(dtype, loss_scale=None):
+  """Sets mix precision policy."""
+  if dtype == tf.float16:
+    policy = tf.keras.mixed_precision.experimental.Policy(
+        'mixed_float16', loss_scale=loss_scale)
+    tf.keras.mixed_precision.experimental.set_policy(policy)
+  elif dtype == tf.bfloat16:
+    policy = tf.keras.mixed_precision.experimental.Policy(
+        'mixed_bfloat16')
+    tf.keras.mixed_precision.experimental.set_policy(policy)
+  elif dtype == tf.float32:
+    tf.keras.mixed_precision.experimental.set_policy('float32')
+  else:
+    raise ValueError("Unexpected dtype: %s" % dtype)
@@ -0,0 +1,175 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Common TF utilities."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+import tensorflow as tf
+
+from tensorflow.python.util import deprecation
+from official.modeling import activations
+
+
+@deprecation.deprecated(
+    None,
+    "tf.keras.layers.Layer supports multiple positional args and kwargs as "
+    "input tensors. pack/unpack inputs to override __call__ is no longer "
+    "needed."
+)
+def pack_inputs(inputs):
+  """Pack a list of `inputs` tensors to a tuple.
+
+  Args:
+    inputs: a list of tensors.
+
+  Returns:
+    a tuple of tensors. if any input is None, replace it with a special constant
+    tensor.
+  """
+  inputs = tf.nest.flatten(inputs)
+  outputs = []
+  for x in inputs:
+    if x is None:
+      outputs.append(tf.constant(0, shape=[], dtype=tf.int32))
+    else:
+      outputs.append(x)
+  return tuple(outputs)
+
+
+@deprecation.deprecated(
+    None,
+    "tf.keras.layers.Layer supports multiple positional args and kwargs as "
+    "input tensors. pack/unpack inputs to override __call__ is no longer "
+    "needed."
+)
+def unpack_inputs(inputs):
+  """unpack a tuple of `inputs` tensors to a tuple.
+
+  Args:
+    inputs: a list of tensors.
+
+  Returns:
+    a tuple of tensors. if any input is a special constant tensor, replace it
+    with None.
+  """
+  inputs = tf.nest.flatten(inputs)
+  outputs = []
+  for x in inputs:
+    if is_special_none_tensor(x):
+      outputs.append(None)
+    else:
+      outputs.append(x)
+  x = tuple(outputs)
+
+  # To trick the very pointless 'unbalanced-tuple-unpacking' pylint check
+  # from triggering.
+  if len(x) == 1:
+    return x[0]
+  return tuple(outputs)
+
+
+def is_special_none_tensor(tensor):
+  """Checks if a tensor is a special None Tensor."""
+  return tensor.shape.ndims == 0 and tensor.dtype == tf.int32
+
+
+# TODO(hongkuny): consider moving custom string-map lookup to keras api.
+def get_activation(identifier):
+  """Maps a identifier to a Python function, e.g., "relu" => `tf.nn.relu`.
+
+  It checks string first and if it is one of customized activation not in TF,
+  the corresponding activation will be returned. For non-customized activation
+  names and callable identifiers, always fallback to tf.keras.activations.get.
+
+  Args:
+    identifier: String name of the activation function or callable.
+
+  Returns:
+    A Python function corresponding to the activation function.
+  """
+  if isinstance(identifier, six.string_types):
+    name_to_fn = {
+        "gelu": activations.gelu,
+        "simple_swish": activations.simple_swish,
+        "hard_swish": activations.hard_swish,
+        "identity": activations.identity,
+    }
+    identifier = str(identifier).lower()
+    if identifier in name_to_fn:
+      return tf.keras.activations.get(name_to_fn[identifier])
+  return tf.keras.activations.get(identifier)
+
+
+def get_shape_list(tensor, expected_rank=None, name=None):
+  """Returns a list of the shape of tensor, preferring static dimensions.
+
+  Args:
+    tensor: A tf.Tensor object to find the shape of.
+    expected_rank: (optional) int. The expected rank of `tensor`. If this is
+      specified and the `tensor` has a different rank, and exception will be
+      thrown.
+    name: Optional name of the tensor for the error message.
+
+  Returns:
+    A list of dimensions of the shape of tensor. All static dimensions will
+    be returned as python integers, and dynamic dimensions will be returned
+    as tf.Tensor scalars.
+  """
+  if expected_rank is not None:
+    assert_rank(tensor, expected_rank, name)
+
+  shape = tensor.shape.as_list()
+
+  non_static_indexes = []
+  for (index, dim) in enumerate(shape):
+    if dim is None:
+      non_static_indexes.append(index)
+
+  if not non_static_indexes:
+    return shape
+
+  dyn_shape = tf.shape(tensor)
+  for index in non_static_indexes:
+    shape[index] = dyn_shape[index]
+  return shape
+
+
+def assert_rank(tensor, expected_rank, name=None):
+  """Raises an exception if the tensor rank is not of the expected rank.
+
+  Args:
+    tensor: A tf.Tensor to check the rank of.
+    expected_rank: Python integer or list of integers, expected rank.
+    name: Optional name of the tensor for the error message.
+
+  Raises:
+    ValueError: If the expected shape doesn't match the actual shape.
+  """
+  expected_rank_dict = {}
+  if isinstance(expected_rank, six.integer_types):
+    expected_rank_dict[expected_rank] = True
+  else:
+    for x in expected_rank:
+      expected_rank_dict[x] = True
+
+  actual_rank = tensor.shape.ndims
+  if actual_rank not in expected_rank_dict:
+    raise ValueError(
+        "For the tensor `%s`, the actual tensor rank `%d` (shape = %s) is not "
+        "equal to the expected tensor rank `%s`" %
+        (name, actual_rank, str(tensor.shape), str(expected_rank)))
@@ -0,0 +1,735 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Custom training loop for running TensorFlow 2.0 models."""
+
+from __future__ import absolute_import
+from __future__ import division
+# from __future__ import google_type_annotations
+from __future__ import print_function
+
+import json
+import os
+
+from absl import flags
+from absl import logging
+
+import numpy as np
+import tensorflow as tf
+
+# pylint: disable=unused-import,g-import-not-at-top,redefined-outer-name,reimported
+from typing import Optional, Dict, List, Text, Callable, Union, Iterator, Any
+from official.modeling.hyperparams import params_dict
+from official.utils.misc import distribution_utils
+from official.utils import hyperparams_flags
+
+FLAGS = flags.FLAGS
+
+strategy_flags_dict = hyperparams_flags.strategy_flags_dict
+hparam_flags_dict = hyperparams_flags.hparam_flags_dict
+
+
+def _save_checkpoint(checkpoint, model_dir, checkpoint_prefix):
+  """Saves model to model_dir with provided checkpoint prefix."""
+
+  checkpoint_path = os.path.join(model_dir, checkpoint_prefix)
+  saved_path = checkpoint.save(checkpoint_path)
+  logging.info('Saving model as TF checkpoint: %s', saved_path)
+
+
+def _steps_to_run(current_step, total_steps, steps_per_loop):
+  """Calculates steps to run on device."""
+  if steps_per_loop <= 0:
+    raise ValueError('steps_per_loop should be positive integer.')
+  return min(total_steps - current_step, steps_per_loop)
+
+
+def _no_metric():
+  """Metric factory that produces no metric; always returns None."""
+  return None
+
+
+class SummaryWriter(object):
+  """Simple SummaryWriter for writing dictionary of metrics.
+
+  Attributes:
+    writer: The tf.SummaryWriter.
+  """
+
+  def __init__(self, model_dir: Text, name: Text):
+    """Inits SummaryWriter with paths.
+
+    Arguments:
+      model_dir: the model folder path.
+      name: the summary subfolder name.
+    """
+    self.writer = tf.summary.create_file_writer(os.path.join(model_dir, name))
+
+  def __call__(self, metrics: Union[Dict[Text, float], float], step: int):
+    """Write metrics to summary with the given writer.
+
+    Args:
+      metrics: a dictionary of metrics values. Prefer dictionary.
+      step: integer. The training step.
+    """
+    if not isinstance(metrics, dict):
+      # Support scalar metric without name.
+      logging.warning('Warning: summary writer prefer metrics as dictionary.')
+      metrics = {'metric': metrics}
+
+    with self.writer.as_default():
+      for k, v in metrics.items():
+        tf.summary.scalar(k, v, step=step)
+      self.writer.flush()
+
+
+class DistributedExecutor(object):
+  """Interface to train and eval models with tf.distribute.Strategy.
+
+  Arguments:
+    strategy: an instance of tf.distribute.Strategy.
+    params: Model configuration needed to run distribution strategy.
+    model_fn: Keras model function. Signature:
+      (params: ParamsDict) -> tf.keras.models.Model.
+    loss_fn: loss function. Signature:
+      (y_true: Tensor, y_pred: Tensor) -> Tensor
+    metric_fn: metric function. Signature: () -> tf.keras.metrics.Metric.
+    is_multi_host: Set to True when using multi hosts for training, like multi
+      worker GPU or TPU pod (slice). Otherwise, False.
+  """
+
+  def __init__(self,
+               strategy,
+               params,
+               model_fn,
+               loss_fn,
+               is_multi_host=False):
+
+    self._params = params
+    self._model_fn = model_fn
+    self._loss_fn = loss_fn
+    self._strategy = strategy
+    self._checkpoint_name = 'ctl_step_{step}.ckpt'
+    self._is_multi_host = is_multi_host
+    self.train_summary_writer = None
+    self.eval_summary_writer = None
+    self.global_train_step = None
+
+  @property
+  def checkpoint_name(self):
+    """Returns the checkpoint name template currently in use."""
+    return self._checkpoint_name
+
+  @checkpoint_name.setter
+  def checkpoint_name(self, name):
+    """Replaces the checkpoint name template with `name`."""
+    self._checkpoint_name = name
+
+  def loss_fn(self):
+    """Calls the stored `loss_fn` with no arguments and returns its result."""
+    return self._loss_fn()
+
+  def model_fn(self, params):
+    """Calls the stored `model_fn` with `params` and returns its result."""
+    return self._model_fn(params)
+
+  def _save_config(self, model_dir):
+    """Save parameters to config files if model_dir is defined."""
+
+    logging.info('Save config to model_dir %s.', model_dir)
+    if model_dir:
+      if not tf.io.gfile.exists(model_dir):
+        tf.io.gfile.makedirs(model_dir)
+      self._params.lock()
+      params_dict.save_params_dict_to_yaml(self._params,
+                                           model_dir + '/params.yaml')
+    else:
+      logging.warning('model_dir is empty, so skip the save config.')
+
+  def _get_input_iterator(
+      self, input_fn: Callable[..., tf.data.Dataset],
+      strategy: tf.distribute.Strategy) -> Optional[Iterator[Any]]:
+    """Returns distributed dataset iterator.
+
+    Args:
+      input_fn: (params: dict) -> tf.data.Dataset.
+      strategy: an instance of tf.distribute.Strategy.
+
+    Returns:
+      An iterator that yields input tensors.
+    """
+
+    if input_fn is None:
+      return None
+    # When training with multiple TPU workers, datasets needs to be cloned
+    # across workers. Since Dataset instance cannot be cloned in eager mode,
+    # we instead pass callable that returns a dataset.
+    if self._is_multi_host:
+      return iter(
+          strategy.experimental_distribute_datasets_from_function(input_fn))
+    else:
+      input_data = input_fn()
+      return iter(strategy.experimental_distribute_dataset(input_data))
+
+  def _create_replicated_step(self,
+                              strategy,
+                              model,
+                              loss_fn,
+                              optimizer,
+                              metric=None):
+
+    def _replicated_step(inputs):
+      """Replicated training step."""
+      inputs, labels = inputs
+
+      with tf.GradientTape() as tape:
+        outputs = model(inputs, training=True)
+        prediction_loss = loss_fn(labels, outputs)
+        loss = tf.reduce_mean(prediction_loss)
+        loss = loss / strategy.num_replicas_in_sync
+        if isinstance(metric, tf.keras.metrics.Metric):
+          metric.update_state(labels, outputs)
+        else:
+          logging.error('train metric is not an instance of '
+                        'tf.keras.metrics.Metric.')
+
+      grads = tape.gradient(loss, model.trainable_variables)
+      optimizer.apply_gradients(zip(grads, model.trainable_variables))
+      return loss
+
+    return _replicated_step
+
+  def _create_train_step(self,
+                         strategy,
+                         model,
+                         loss_fn,
+                         optimizer,
+                         metric=None):
+    """Creates a distributed training step.
+
+      Args:
+        strategy: an instance of tf.distribute.Strategy.
+        model: (Tensor, bool) -> Tensor. model function.
+        loss_fn: (y_true: Tensor, y_pred: Tensor) -> Tensor.
+        optimizer: tf.keras.optimizers.Optimizer.
+        metric: tf.keras.metrics.Metric subclass.
+
+      Returns:
+        The training step callable. It is called as
+        `train_step(iterator, num_steps)`.
+    """
+    _replicated_step = self._create_replicated_step(strategy, model, loss_fn,
+                                                    optimizer, metric)
+
+    @tf.function
+    def train_step(iterator, num_steps):
+      """Performs a distributed training step.
+
+      Args:
+        iterator: an iterator that yields input tensors.
+        num_steps: tf.Tensor holding the number of replicated steps to run in
+          this call.
+
+      Returns:
+        The loss tensor of the last replicated step, reduced across replicas.
+
+      Raises:
+        ValueError: if `num_steps` is not a tf.Tensor.
+      """
+      if not isinstance(num_steps, tf.Tensor):
+        raise ValueError('steps should be an Tensor. Python object may cause '
+                         'retracing.')
+
+      # The first step runs outside the loop, so `per_replica_losses` is
+      # already defined before the tf.range loop runs the remaining
+      # `num_steps - 1` steps. Each step overwrites the previous losses.
+      per_replica_losses = strategy.run(
+          _replicated_step, args=(next(iterator),))
+      for _ in tf.range(num_steps - 1):
+        per_replica_losses = strategy.run(
+            _replicated_step, args=(next(iterator),))
+
+      # For reporting, we return the mean of losses.
+      losses = tf.nest.map_structure(
+          lambda x: strategy.reduce(tf.distribute.ReduceOp.MEAN, x, axis=None),
+          per_replica_losses)
+      return losses
+
+    return train_step
+
+  def _create_test_step(self, strategy, model, metric):
+    """Creates a distributed test step.
+
+    Args:
+      strategy: an instance of tf.distribute.Strategy used to run the step.
+      model: callable invoked as `model(inputs, training=False)`.
+      metric: tf.keras.metrics.Metric updated with each batch's labels and
+        model outputs.
+
+    Returns:
+      The test step callable. It is called as `test_step(iterator)`.
+    """
+
+    @tf.function
+    def test_step(iterator):
+      """Calculates evaluation metrics on distributed devices.
+
+      Args:
+        iterator: an iterator that yields `(inputs, labels)` pairs.
+
+      Returns:
+        `(None, None)` when `metric` is falsy; otherwise the result of
+        `strategy.run` over the `(labels, model_outputs)` of one batch.
+
+      Raises:
+        ValueError: if `metric` is truthy but not a tf.keras.metrics.Metric.
+      """
+      if not metric:
+        logging.info('Skip test_step because metric is None (%s)', metric)
+        return None, None
+      if not isinstance(metric, tf.keras.metrics.Metric):
+        raise ValueError(
+            'Metric must be an instance of tf.keras.metrics.Metric '
+            'for running in test_step. Actual {}'.format(metric))
+
+      def _test_step_fn(inputs):
+        """Replicated accuracy calculation."""
+        inputs, labels = inputs
+        model_outputs = model(inputs, training=False)
+        metric.update_state(labels, model_outputs)
+        return labels, model_outputs
+
+      # Consumes exactly one element of `iterator` per call.
+      return strategy.run(_test_step_fn, args=(next(iterator),))
+
+    return test_step
+
+  def train(self,
+            train_input_fn: Callable[[params_dict.ParamsDict], tf.data.Dataset],
+            eval_input_fn: Callable[[params_dict.ParamsDict],
+                                    tf.data.Dataset] = None,
+            model_dir: Text = None,
+            total_steps: int = 1,
+            iterations_per_loop: int = 1,
+            train_metric_fn: Callable[[], Any] = None,
+            eval_metric_fn: Callable[[], Any] = None,
+            summary_writer_fn: Callable[[Text, Text],
+                                        SummaryWriter] = SummaryWriter,
+            init_checkpoint: Callable[[tf.keras.Model], Any] = None,
+            custom_callbacks: List[tf.keras.callbacks.Callback] = None,
+            save_config: bool = True):
+    """Runs distributed training.
+
+    Args:
+      train_input_fn: (params: dict) -> tf.data.Dataset training data input
+        function.
+      eval_input_fn: (Optional) same type as train_input_fn. If not None, will
+        trigger evaluating metric on eval data. If None, will not run eval step.
+      model_dir: the folder path for model checkpoints.
+      total_steps: total training steps.
+      iterations_per_loop: train steps per loop. After each loop, this job will
+        update metrics like loss and save checkpoint.
+      train_metric_fn: metric_fn for evaluation in train_step.
+      eval_metric_fn: metric_fn for evaluation in test_step.
+      summary_writer_fn: function to create summary writer.
+      init_checkpoint: function to load checkpoint. Only used when no
+        checkpoint is found in `model_dir`.
+      custom_callbacks: A list of Keras Callbacks objects to run during
+        training. More specifically, `on_batch_begin()`, `on_batch_end()`,
+        methods are invoked during training.
+      save_config: bool. Whether to save params to model_dir.
+
+    Returns:
+      A tuple `(train_loss, eval_metric_result)`: the training loss of the last
+      loop and the result of the last evaluation. Either is None when the
+      corresponding loop or evaluation never ran.
+
+    Raises:
+      ValueError: if a metric function is given but not callable, if the model
+        returned by `model_fn` has no `optimizer` attribute, or if the total
+        loss becomes NaN.
+    """
+    assert train_input_fn is not None
+    if train_metric_fn and not callable(train_metric_fn):
+      raise ValueError('if `train_metric_fn` is specified, '
+                       'train_metric_fn must be a callable.')
+    if eval_metric_fn and not callable(eval_metric_fn):
+      raise ValueError('if `eval_metric_fn` is specified, '
+                       'eval_metric_fn must be a callable.')
+    # Fall back to a factory that returns None when no metric_fn is given.
+    train_metric_fn = train_metric_fn or _no_metric
+    eval_metric_fn = eval_metric_fn or _no_metric
+
+    # Callbacks are invoked once per loop rather than once per step, so they
+    # only line up with real batches when a loop is a single iteration.
+    if custom_callbacks and iterations_per_loop != 1:
+      logging.error(
+          'It is sematically wrong to run callbacks when '
+          'iterations_per_loop is not one (%s)', iterations_per_loop)
+
+    def _run_callbacks_on_batch_begin(batch):
+      """Runs custom callbacks at the start of every step."""
+      if not custom_callbacks:
+        return
+      for callback in custom_callbacks:
+        if callback:
+          callback.on_batch_begin(batch)
+
+    def _run_callbacks_on_batch_end(batch):
+      """Runs custom callbacks at the end of every step."""
+      if not custom_callbacks:
+        return
+      for callback in custom_callbacks:
+        if callback:
+          callback.on_batch_end(batch)
+
+    if save_config:
+      self._save_config(model_dir)
+
+    # The checkpoint frequency flag, when set, overrides the per-loop default.
+    if FLAGS.save_checkpoint_freq:
+      save_freq = FLAGS.save_checkpoint_freq
+    else:
+      save_freq = iterations_per_loop
+
+    params = self._params
+    strategy = self._strategy
+    # To reduce unnecessary send/receive input pipeline operation, we place
+    # input pipeline ops in worker task.
+    train_iterator = self._get_input_iterator(train_input_fn, strategy)
+    train_loss = None
+    eval_metric_result = None
+    with strategy.scope():
+      # To correctly place the model weights on accelerators,
+      # model and optimizer should be created in scope.
+      model = self.model_fn(params.as_dict())
+      if not hasattr(model, 'optimizer'):
+        raise ValueError('User should set optimizer attribute to model '
+                         'inside `model_fn`.')
+      optimizer = model.optimizer
+
+      # Training loop starts here.
+      checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
+      latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
+      initial_step = 0
+      # A checkpoint already in model_dir takes precedence over
+      # `init_checkpoint`.
+      if latest_checkpoint_file:
+        logging.info(
+            'Checkpoint file %s found and restoring from '
+            'checkpoint', latest_checkpoint_file)
+        checkpoint.restore(latest_checkpoint_file)
+        initial_step = optimizer.iterations.numpy()
+        logging.info('Loading from checkpoint file completed. Init step %d',
+                     initial_step)
+      elif init_checkpoint:
+        logging.info('Restoring from init checkpoint function')
+        init_checkpoint(model)
+        logging.info('Loading from init checkpoint file completed')
+
+      # Resume counting from the optimizer's own iteration counter.
+      current_step = optimizer.iterations.numpy()
+      checkpoint_name = self.checkpoint_name
+
+      eval_metric = eval_metric_fn()
+      train_metric = train_metric_fn()
+      train_summary_writer = summary_writer_fn(model_dir, 'eval_train')
+      self.train_summary_writer = train_summary_writer.writer
+
+      test_summary_writer = summary_writer_fn(model_dir, 'eval_test')
+      self.eval_summary_writer = test_summary_writer.writer
+
+    # Continue training loop.
+    train_step = self._create_train_step(
+        strategy=strategy,
+        model=model,
+        loss_fn=self.loss_fn(),
+        optimizer=optimizer,
+        metric=train_metric)
+    # Evaluation runs only when both an eval input and an eval metric exist.
+    test_step = None
+    if eval_input_fn and eval_metric:
+      self.global_train_step = model.optimizer.iterations
+      test_step = self._create_test_step(strategy, model, metric=eval_metric)
+
+    logging.info('Training started')
+    last_save_checkpoint_step = current_step
+    while current_step < total_steps:
+
+      num_steps = _steps_to_run(current_step, total_steps, iterations_per_loop)
+      _run_callbacks_on_batch_begin(current_step)
+      train_loss = train_step(train_iterator,
+                              tf.convert_to_tensor(num_steps, dtype=tf.int32))
+      _run_callbacks_on_batch_end(current_step)
+      current_step += num_steps
+
+      # Convert the loss tensors into Python-side float values.
+      train_loss = tf.nest.map_structure(lambda x: x.numpy().astype(float),
+                                         train_loss)
+      if not isinstance(train_loss, dict):
+        train_loss = {'total_loss': train_loss}
+      if np.isnan(train_loss['total_loss']):
+        raise ValueError('total loss is NaN.')
+
+      if train_metric:
+        train_metric_result = train_metric.result()
+        if isinstance(train_metric, tf.keras.metrics.Metric):
+          train_metric_result = tf.nest.map_structure(
+              lambda x: x.numpy().astype(float), train_metric_result)
+        if not isinstance(train_metric_result, dict):
+          train_metric_result = {'metric': train_metric_result}
+        train_metric_result.update(train_loss)
+      else:
+        # Without a train metric this aliases `train_loss`, so the
+        # learning-rate entry added below also shows up in `train_loss`.
+        train_metric_result = train_loss
+      # `optimizer.lr` is either a callable evaluated at the current step or
+      # an object exposing `.numpy()`.
+      if callable(optimizer.lr):
+        train_metric_result.update(
+            {'learning_rate': optimizer.lr(current_step).numpy()})
+      else:
+        train_metric_result.update({'learning_rate': optimizer.lr.numpy()})
+      logging.info('Train Step: %d/%d  / loss = %s / training metric = %s',
+                   current_step, total_steps, train_loss,
+                   train_metric_result)
+
+      train_summary_writer(
+          metrics=train_metric_result, step=optimizer.iterations)
+
+      # Saves model checkpoints and run validation steps at every
+      # iterations_per_loop steps.
+      # To avoid repeated model saving, we do not save after the last
+      # step of training.
+      if save_freq > 0 and current_step < total_steps and (
+          current_step - last_save_checkpoint_step) >= save_freq:
+        _save_checkpoint(checkpoint, model_dir,
+                         checkpoint_name.format(step=current_step))
+        last_save_checkpoint_step = current_step
+
+      if test_step:
+        # A fresh eval iterator is created for every evaluation pass.
+        eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
+        eval_metric_result = self._run_evaluation(test_step, current_step,
+                                                  eval_metric, eval_iterator)
+        logging.info('Step: %s evalation metric = %s.', current_step,
+                     eval_metric_result)
+        test_summary_writer(
+            metrics=eval_metric_result, step=optimizer.iterations)
+
+      # Re-initialize evaluation metric, except the last step.
+      if eval_metric and current_step < total_steps:
+        eval_metric.reset_states()
+      if train_metric and current_step < total_steps:
+        train_metric.reset_states()
+
+    # Reaches the end of training and saves the last checkpoint.
+    if last_save_checkpoint_step < total_steps:
+      _save_checkpoint(checkpoint, model_dir,
+                       checkpoint_name.format(step=current_step))
+
+    if test_step:
+      logging.info('Running final evaluation after training is complete.')
+      eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
+      eval_metric_result = self._run_evaluation(test_step, current_step,
+                                                eval_metric, eval_iterator)
+      logging.info('Final evaluation metric = %s.', eval_metric_result)
+      test_summary_writer(
+          metrics=eval_metric_result, step=optimizer.iterations)
+
+    return train_loss, eval_metric_result
+
+  def _run_evaluation(self, test_step, current_training_step, metric,
+                      test_iterator):
+    """Runs validation steps and aggregate metrics."""
+    if not test_iterator or not metric:
+      logging.warning(
+          'Both test_iterator (%s) and metrics (%s) must not be None.',
+          test_iterator, metric)
+      return None
+    logging.info('Running evaluation after step: %s.', current_training_step)
+    while True:
+      try:
+        test_step(test_iterator)
+      except (StopIteration, tf.errors.OutOfRangeError):
+        break
+
+    metric_result = metric.result()
+    if isinstance(metric, tf.keras.metrics.Metric):
+      metric_result = metric_result.numpy().astype(float)
+    logging.info('Step: [%d] Validation metric = %f', current_training_step,
+                 metric_result)
+    return metric_result
+
+  def evaluate_from_model_dir(
+      self,
+      model_dir: Text,
+      eval_input_fn: Callable[[params_dict.ParamsDict], tf.data.Dataset],
+      eval_metric_fn: Callable[[], Any],
+      total_steps: int = -1,
+      eval_timeout: int = None,
+      min_eval_interval: int = 180,
+      summary_writer_fn: Callable[[Text, Text], SummaryWriter] = SummaryWriter):
+    """Runs distributed evaluation on model folder.
+
+    Args:
+      eval_input_fn: (Optional) same type as train_input_fn. If not None, will
+        trigger evaluting metric on eval data. If None, will not run eval step.
+      eval_metric_fn: metric_fn for evaluation in test_step.
+      model_dir: the folder for storing model checkpoints.
+      total_steps: total training steps. If the current step reaches the
+        total_steps, the evaluation loop will stop.
+      eval_timeout: The maximum number of seconds to wait between checkpoints.
+        If left as None, then the process will wait indefinitely. Used by
+        tf.train.checkpoints_iterator.
+      min_eval_interval: The minimum number of seconds between yielding
+        checkpoints. Used by tf.train.checkpoints_iterator.
+      summary_writer_fn: function to create summary writer.
+
+    Returns:
+      Eval metrics dictionary of the last checkpoint.
+    """
+
+    if not model_dir:
+      raise ValueError('model_dir must be set.')
+
+    def terminate_eval():
+      tf.logging.info('Terminating eval after %d seconds of no checkpoints' %
+                      eval_timeout)
+      return True
+
+    summary_writer = summary_writer_fn(model_dir, 'eval')
+    self.eval_summary_writer = summary_writer.writer
+
+    # Read checkpoints from the given model directory
+    # until `eval_timeout` seconds elapses.
+    for checkpoint_path in tf.train.checkpoints_iterator(
+        model_dir,
+        min_interval_secs=min_eval_interval,
+        timeout=eval_timeout,
+        timeout_fn=terminate_eval):
+      eval_metric_result, current_step = self.evaluate_checkpoint(
+          checkpoint_path=checkpoint_path,
+          eval_input_fn=eval_input_fn,
+          eval_metric_fn=eval_metric_fn,
+          summary_writer=summary_writer)
+      if total_steps > 0 and current_step >= total_steps:
+        logging.info('Evaluation finished after training step %d', current_step)
+        break
+    return eval_metric_result
+
+  def evaluate_checkpoint(self,
+                          checkpoint_path: Text,
+                          eval_input_fn: Callable[[params_dict.ParamsDict],
+                                                  tf.data.Dataset],
+                          eval_metric_fn: Callable[[], Any],
+                          summary_writer: SummaryWriter = None):
+    """Runs distributed evaluation on the one checkpoint.
+
+    Args:
+      eval_input_fn: (Optional) same type as train_input_fn. If not None, will
+        trigger evaluting metric on eval data. If None, will not run eval step.
+      eval_metric_fn: metric_fn for evaluation in test_step.
+      checkpoint_path: the checkpoint to evaluate.
+      summary_writer_fn: function to create summary writer.
+
+    Returns:
+      Eval metrics dictionary of the last checkpoint.
+    """
+    if not callable(eval_metric_fn):
+      raise ValueError('if `eval_metric_fn` is specified, '
+                       'eval_metric_fn must be a callable.')
+
+    params = self._params
+    strategy = self._strategy
+    # To reduce unnecessary send/receive input pipeline operation, we place
+    # input pipeline ops in worker task.
+    with strategy.scope():
+
+      # To correctly place the model weights on accelerators,
+      # model and optimizer should be created in scope.
+      model = self.model_fn(params.as_dict())
+      checkpoint = tf.train.Checkpoint(model=model)
+
+      eval_metric = eval_metric_fn()
+      assert eval_metric, 'eval_metric does not exist'
+      test_step = self._create_test_step(strategy, model, metric=eval_metric)
+
+      logging.info('Starting to evaluate.')
+      if not checkpoint_path:
+        raise ValueError('checkpoint path is empty')
+      reader = tf.compat.v1.train.NewCheckpointReader(checkpoint_path)
+      current_step = reader.get_tensor(
+          'optimizer/iter/.ATTRIBUTES/VARIABLE_VALUE')
+      logging.info(
+          'Checkpoint file %s found and restoring from '
+          'checkpoint', checkpoint_path)
+      checkpoint.restore(checkpoint_path)
+
+      self.global_train_step = model.optimizer.iterations
+      eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
+      eval_metric_result = self._run_evaluation(test_step, current_step,
+                                                eval_metric, eval_iterator)
+      logging.info('Step: %s evalation metric = %s.', current_step,
+                   eval_metric_result)
+      summary_writer(metrics=eval_metric_result, step=current_step)
+      eval_metric.reset_states()
+
+    return eval_metric_result, current_step
+
+  def predict(self):
+    return NotImplementedError('Unimplmented function.')
+
+
+class ExecutorBuilder(object):
+  """Builder of DistributedExecutor.
+
+  Example 1: Builds an executor with supported Strategy.
+    builder = ExecutorBuilder(
+        strategy_type='tpu',
+        strategy_config={'tpu': '/bns/xxx'})
+    dist_executor = builder.build_executor(
+        params=params,
+        model_fn=my_model_fn,
+        loss_fn=my_loss_fn,
+        metric_fn=my_metric_fn)
+
+  Example 2: Builds an executor with customized Strategy.
+    builder = ExecutorBuilder()
+    builder.strategy = <some customized Strategy>
+    dist_executor = builder.build_executor(
+        params=params,
+        model_fn=my_model_fn,
+        loss_fn=my_loss_fn,
+        metric_fn=my_metric_fn)
+
+  Example 3: Builds a customized executor with customized Strategy.
+    class MyDistributedExecutor(DistributedExecutor):
+      # implementation ...
+
+    builder = ExecutorBuilder()
+    builder.strategy = <some customized Strategy>
+    dist_executor = builder.build_executor(
+        class_ctor=MyDistributedExecutor,
+        params=params,
+        model_fn=my_model_fn,
+        loss_fn=my_loss_fn,
+        metric_fn=my_metric_fn)
+
+  Args:
+    strategy_type: string. One of 'tpu', 'mirrored', 'multi_worker_mirrored'. If
+      None. User is responsible to set the strategy before calling
+      build_executor(...).
+    strategy_config: necessary config for constructing the proper Strategy.
+      Check strategy_flags_dict() for examples of the structure.
+  """
+
+  def __init__(self, strategy_type=None, strategy_config=None):
+    _ = distribution_utils.configure_cluster(
+        strategy_config.worker_hosts, strategy_config.task_index)
+    self._strategy = distribution_utils.get_distribution_strategy(
+        distribution_strategy=strategy_type,
+        num_gpus=strategy_config.num_gpus,
+        all_reduce_alg=strategy_config.all_reduce_alg,
+        num_packs=strategy_config.num_packs,
+        tpu_address=strategy_config.tpu)
+
+  @property
+  def strategy(self):
+    """Returns default checkpoint name."""
+    return self._strategy
+
+  @strategy.setter
+  def strategy(self, new_strategy):
+    """Sets default summary writer for the current thread."""
+    self._strategy = new_strategy
+
+
+  def build_executor(self,
+                     class_ctor=DistributedExecutor,
+                     params=None,
+                     model_fn=None,
+                     loss_fn=None,
+                     **kwargs):
+    """Creates an executor according to strategy type.
+
+    See doc string of the DistributedExecutor.__init__ for more information of
+    the
+    input arguments.
+
+    Args:
+      class_ctor: A constructor of executor (default: DistributedExecutor).
+      params: ParamsDict, all the model parameters and runtime parameters.
+      model_fn: Keras model function.
+      loss_fn: loss function.
+      **kwargs: other arguments to the executor constructor.
+
+    Returns:
+      An instance of DistributedExecutor or its subclass.
+    """
+    if self._strategy is None:
+      raise ValueError('`strategy` should not be None. You need to specify '
+                       '`strategy_type` in the builder contructor or directly '
+                       'set the `strategy` property of the builder.')
+    return class_ctor(
+        strategy=self._strategy,
+        params=params,
+        model_fn=model_fn,
+        loss_fn=loss_fn,
+        **kwargs)
@@ -0,0 +1,88 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Sets up TensorFlow Official Models."""
+import datetime
+import os
+import sys
+
+from setuptools import find_packages
+from setuptools import setup
+
+version = '2.2.0'
+
+project_name = 'tf-models-official'
+
+long_description = """The TensorFlow official models are a collection of
+models that use TensorFlow's high-level APIs.
+They are intended to be well-maintained, tested, and kept up to date with the
+latest TensorFlow API. They should also be reasonably optimized for fast
+performance while still being easy to read."""
+
+if '--project_name' in sys.argv:
+  project_name_idx = sys.argv.index('--project_name')
+  project_name = sys.argv[project_name_idx + 1]
+  sys.argv.remove('--project_name')
+  sys.argv.pop(project_name_idx)
+
+
+def _get_requirements():
+  """Parses requirements.txt file."""
+  install_requires_tmp = []
+  dependency_links_tmp = []
+  with open(
+      os.path.join(os.path.dirname(__file__), '../requirements.txt'), 'r') as f:
+    for line in f:
+      package_name = line.strip()
+      if package_name.startswith('-e '):
+        dependency_links_tmp.append(package_name[3:].strip())
+      else:
+        install_requires_tmp.append(package_name)
+  return install_requires_tmp, dependency_links_tmp
+
+install_requires, dependency_links = _get_requirements()
+
+if project_name == 'tf-models-nightly':
+  version += '.dev' + datetime.datetime.now().strftime('%Y%m%d')
+  install_requires.append('tf-nightly')
+else:
+  install_requires.append('tensorflow>=2.1.0')
+
+print('install_requires: ', install_requires)
+print('dependency_links: ', dependency_links)
+
+setup(
+    name=project_name,
+    version=version,
+    description='TensorFlow Official Models',
+    long_description=long_description,
+    author='Google Inc.',
+    author_email='no-reply@google.com',
+    url='https://github.com/tensorflow/models',
+    license='Apache 2.0',
+    packages=find_packages(exclude=[
+        'research*',
+        'tutorials*',
+        'samples*',
+        'official.r1*',
+        'official.pip_package*',
+        'official.benchmark*',
+    ]),
+    exclude_package_data={
+        '': ['*_test.py',],
+    },
+    install_requires=install_requires,
+    dependency_links=dependency_links,
+    python_requires='>=3.6',
+)
@@ -0,0 +1,23 @@
+# Legacy Models Collection
+
+The R1 folder contains legacy model implementations and models that will not
+be updated to TensorFlow 2.x. They do not have solid performance tracking.
+
+**Note: models will be removed from the master branch by 2020/06.**
+
+After removal, you can still access these legacy models in the previously
+released tags, e.g. [v2.1.0](https://github.com/tensorflow/models/releases/tag/v2.1.0).
+
+
+## Legacy model implementation
+
+The Transformer and MNIST implementations use pure TF 1.x TF-Estimator.
+Users should follow the corresponding TF 2.x implementations inside the
+official model garden.
+
+## Models that will not update to TensorFlow 2.x
+
+*   [boosted_trees](boosted_trees): A Gradient Boosted Trees model to
+    classify higgs boson process from HIGGS Data Set.
+*   [wide_deep](wide_deep): A model that combines a wide model and deep
+    network to classify census income data.
@@ -0,0 +1,112 @@
+# Classifying Higgs boson processes in the HIGGS Data Set
+## Overview
+The [HIGGS Data Set](https://archive.ics.uci.edu/ml/datasets/HIGGS) contains 11 million samples with 28 features, and is for the classification problem to distinguish between a signal process which produces Higgs bosons and a background process which does not.
+
+We use the Gradient Boosted Trees algorithm to distinguish the two classes.
+
+---
+
+The code sample uses the high level `tf.estimator.Estimator` and `tf.data.Dataset`.  These APIs are great for fast iteration and quickly adapting models to your own datasets without major code overhauls.  It allows you to move from single-worker training to distributed training, and makes it easy to export model binaries for prediction.  Here, for further simplicity and faster execution, we use a utility function `tf.contrib.estimator.boosted_trees_classifier_train_in_memory`.  This utility function is especially effective when the input is provided as in-memory data sets like numpy arrays.
+
+An input function for the `Estimator` typically uses `tf.data.Dataset` API, which can handle various data control like streaming, batching, transform and shuffling. However `boosted_trees_classifier_train_in_memory()` utility function requires that the entire data is provided as a single batch (i.e. without using `batch()` API). Thus in this practice, simply `Dataset.from_tensors()` is used to convert numpy arrays into structured tensors, and `Dataset.zip()` is used to put features and label together.
+For further references of `Dataset`, [Read more here](https://www.tensorflow.org/guide/datasets).
+
+## Running the code
+First make sure you've [added the models folder to your Python path](/official/#running-the-models); otherwise you may encounter an error like `ImportError: No module named official.boosted_trees`.
+
+### Setup
+The [HIGGS Data Set](https://archive.ics.uci.edu/ml/datasets/HIGGS) that this sample uses for training is hosted by the [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/). We have provided a script that downloads and cleans the necessary files.
+
+```
+python data_download.py
+```
+
+This will download a file and store the processed file under the directory designated by `--data_dir` (defaults to `/tmp/higgs_data/`). To change the target directory, set the `--data_dir` flag. The directory could be network storages that Tensorflow supports (like Google Cloud Storage, `gs://<bucket>/<path>/`).
+The file downloaded to the local temporary folder is about 2.8 GB, and the processed file is about 0.8 GB, so there should be enough storage to handle them.
+
+
+### Training
+
+This example uses about 3 GB of RAM during training.
+You can run the code locally as follows:
+
+```
+python train_higgs.py
+```
+
+The model is by default saved to `/tmp/higgs_model`, which can be changed using the `--model_dir` flag.
+Note that the model_dir is cleaned up every time before training starts.
+
+Model parameters can be adjusted by flags, like `--n_trees`, `--max_depth`, `--learning_rate` and so on.  Check out the code for details.
+
+The final accuracy will be around 74% and loss will be around 0.516 over the eval set, when trained with the default parameters.
+
+By default, the first 1 million examples out of the 11 million are used for training, and the last 1 million examples are used for evaluation.
+The training/evaluation data can be selected as index ranges by flags `--train_start`, `--train_count`, `--eval_start`, `--eval_count`, etc.
+
+### TensorBoard
+
+Run TensorBoard to inspect the details about the graph and training progression.
+
+```
+tensorboard --logdir=/tmp/higgs_model  # set logdir as --model_dir set during training.
+```
+
+## Inference with SavedModel
+You can export the model into Tensorflow [SavedModel](https://www.tensorflow.org/guide/saved_model) format by using the argument `--export_dir`:
+
+```
+python train_higgs.py --export_dir /tmp/higgs_boosted_trees_saved_model
+```
+
+After the model finishes training, use [`saved_model_cli`](https://www.tensorflow.org/guide/saved_model#cli_to_inspect_and_execute_savedmodel) to inspect and execute the SavedModel.
+
+Try the following commands to inspect the SavedModel:
+
+**Replace `${TIMESTAMP}` with the folder produced (e.g. 1524249124)**
+```
+# List possible tag_sets. Only one metagraph is saved, so there will be one option.
+saved_model_cli show --dir /tmp/higgs_boosted_trees_saved_model/${TIMESTAMP}/
+
+# Show SignatureDefs for tag_set=serve. SignatureDefs define the outputs to show.
+saved_model_cli show --dir /tmp/higgs_boosted_trees_saved_model/${TIMESTAMP}/ \
+    --tag_set serve --all
+```
+
+### Inference
+Let's use the model to predict the class (signal or background process) of two examples.
+Note that this model exports SavedModel with the custom parsing module that accepts csv lines as features. (Each line is an example with 28 columns; be careful to not add a label column, unlike in the training data.)
+
+```
+saved_model_cli run --dir /tmp/higgs_boosted_trees_saved_model/${TIMESTAMP}/ \
+    --tag_set serve --signature_def="predict" \
+    --input_exprs='inputs=["0.869293,-0.635082,0.225690,0.327470,-0.689993,0.754202,-0.248573,-1.092064,0.0,1.374992,-0.653674,0.930349,1.107436,1.138904,-1.578198,-1.046985,0.0,0.657930,-0.010455,-0.045767,3.101961,1.353760,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678", "1.595839,-0.607811,0.007075,1.818450,-0.111906,0.847550,-0.566437,1.581239,2.173076,0.755421,0.643110,1.426367,0.0,0.921661,-1.190432,-1.615589,0.0,0.651114,-0.654227,-1.274345,3.101961,0.823761,0.938191,0.971758,0.789176,0.430553,0.961357,0.957818"]'
+```
+
+This will print out the predicted classes and class probabilities. Something like:
+
+```
+Result for output key class_ids:
+[[1]
+ [0]]
+Result for output key classes:
+[['1']
+ ['0']]
+Result for output key logistic:
+[[0.6440273 ]
+ [0.10902369]]
+Result for output key logits:
+[[ 0.59288704]
+ [-2.1007526 ]]
+Result for output key probabilities:
+[[0.3559727 0.6440273]
+ [0.8909763 0.1090237]]
+```
+
+Please note that "predict" signature_def gives out different (more detailed) results than "classification" or "serving_default".
+
+## Additional Links
+
+If you are interested in distributed training, take a look at [Distributed TensorFlow](https://www.tensorflow.org/deploy/distributed).
+
+You can also [train models on Cloud ML Engine](https://cloud.google.com/ml-engine/docs/getting-started-training-prediction), which provides [hyperparameter tuning](https://cloud.google.com/ml-engine/docs/getting-started-training-prediction#hyperparameter_tuning) to maximize your model's results and enables [deploying your model for prediction](https://cloud.google.com/ml-engine/docs/getting-started-training-prediction#deploy_a_model_to_support_prediction).
@@ -0,0 +1,97 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Downloads the UCI HIGGS Dataset and prepares train data.
+
+The details on the dataset are in https://archive.ics.uci.edu/ml/datasets/HIGGS
+
+It takes a while as it needs to download 2.8 GB over the network, process, then
+store it into the specified location as a compressed numpy file.
+
+Usage:
+$ python data_download.py --data_dir=/tmp/higgs_data
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gzip
+import os
+import tempfile
+
+# pylint: disable=g-bad-import-order
+import numpy as np
+import pandas as pd
+from six.moves import urllib
+from absl import app as absl_app
+from absl import flags
+import tensorflow as tf
+
+from official.utils.flags import core as flags_core
+
+# Base URL under which the dataset file is hosted; joined with INPUT_FILE to
+# form the download URL.
+URL_ROOT = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280"
+# Name of the gzipped csv file fetched from URL_ROOT.
+INPUT_FILE = "HIGGS.csv.gz"
+# Name of the processed output file written into data_dir.
+NPZ_FILE = "HIGGS.csv.gz.npz"  # numpy compressed file to contain "data" array.
+
+
+def _download_higgs_data_and_save_npz(data_dir):
+  """Download higgs data and store as a numpy compressed file."""
+  input_url = URL_ROOT + "/" + INPUT_FILE
+  np_filename = os.path.join(data_dir, NPZ_FILE)
+  if tf.gfile.Exists(np_filename):
+    raise ValueError("data_dir already has the processed data file: {}".format(
+        np_filename))
+  if not tf.gfile.Exists(data_dir):
+    tf.gfile.MkDir(data_dir)
+  # 2.8 GB to download.
+  try:
+    tf.logging.info("Data downloading...")
+    temp_filename, _ = urllib.request.urlretrieve(input_url)
+    # Reading and parsing 11 million csv lines takes 2~3 minutes.
+    tf.logging.info("Data processing... taking multiple minutes...")
+    with gzip.open(temp_filename, "rb") as csv_file:
+      data = pd.read_csv(
+          csv_file,
+          dtype=np.float32,
+          names=["c%02d" % i for i in range(29)]  # label + 28 features.
+      ).as_matrix()
+  finally:
+    tf.gfile.Remove(temp_filename)
+
+  # Writing to temporary location then copy to the data_dir (0.8 GB).
+  f = tempfile.NamedTemporaryFile()
+  np.savez_compressed(f, data=data)
+  tf.gfile.Copy(f.name, np_filename)
+  tf.logging.info("Data saved to: {}".format(np_filename))
+
+
+def main(unused_argv):
+  if not tf.gfile.Exists(FLAGS.data_dir):
+    tf.gfile.MkDir(FLAGS.data_dir)
+  _download_higgs_data_and_save_npz(FLAGS.data_dir)
+
+
+def define_data_download_flags():
+  """Add flags specifying data download arguments."""
+  flags.DEFINE_string(
+      name="data_dir", default="/tmp/higgs_data",
+      help=flags_core.help_wrap(
+          "Directory to download higgs dataset and store training/eval data."))
+
+
+if __name__ == "__main__":
+  tf.logging.set_verbosity(tf.logging.INFO)
+  define_data_download_flags()
+  FLAGS = flags.FLAGS
+  absl_app.run(main)
@@ -0,0 +1,297 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""A script that builds boosted trees over higgs data.
+
+If you haven't, please run data_download.py beforehand to prepare the data.
+
+For some more details on this example, please refer to README.md as well.
+
+Note that the model_dir is cleaned up before starting the training.
+
+Usage:
+$ python train_higgs.py --n_trees=100 --max_depth=6 --learning_rate=0.1 \
+    --model_dir=/tmp/higgs_model
+
+Note that BoostedTreesClassifier is available since Tensorflow 1.8.0.
+So you need to install recent enough version of Tensorflow to use this example.
+
+The training data is by default the first million examples out of 11M examples,
+and eval data is by default the last million examples.
+They are controlled by --train_start, --train_count, --eval_start, --eval_count.
+e.g. to train over the first 10 million examples instead of 1 million:
+$ python train_higgs.py --n_trees=100 --max_depth=6 --learning_rate=0.1 \
+    --model_dir=/tmp/higgs_model --train_count=10000000
+
+Training history and metrics can be inspected using tensorboard.
+Set --logdir as the --model_dir set by flag when training
+(or the default /tmp/higgs_model).
+$ tensorboard --logdir=/tmp/higgs_model
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+# pylint: disable=g-bad-import-order
+import numpy as np
+from absl import app as absl_app
+from absl import flags
+import tensorflow as tf
+# pylint: enable=g-bad-import-order
+
+from official.utils.flags import core as flags_core
+from official.utils.flags._conventions import help_wrap
+from official.utils.logs import logger
+
+# File name, looked up inside data_dir, that read_higgs_data() loads.
+NPZ_FILE = "HIGGS.csv.gz.npz"  # numpy compressed file containing "data" array
+
+
+def read_higgs_data(data_dir, train_start, train_count, eval_start, eval_count):
+  """Reads higgs data from csv and returns train and eval data.
+
+  Args:
+    data_dir: A string, the directory of higgs dataset.
+    train_start: An integer, the start index of train examples within the data.
+    train_count: An integer, the number of train examples within the data.
+    eval_start: An integer, the start index of eval examples within the data.
+    eval_count: An integer, the number of eval examples within the data.
+
+  Returns:
+    Numpy array of train data and eval data.
+  """
+  npz_filename = os.path.join(data_dir, NPZ_FILE)
+  try:
+    # gfile allows numpy to read data from network data sources as well.
+    with tf.gfile.Open(npz_filename, "rb") as npz_file:
+      with np.load(npz_file) as npz:
+        data = npz["data"]
+  except tf.errors.NotFoundError as e:
+    raise RuntimeError(
+        "Error loading data; use data_download.py to prepare the data.\n{}: {}"
+        .format(type(e).__name__, e))
+  return (data[train_start:train_start+train_count],
+          data[eval_start:eval_start+eval_count])
+
+
+# This showcases how to make input_fn when the input data is available in the
+# form of numpy arrays.
+def make_inputs_from_np_arrays(features_np, label_np):
+  """Makes and returns input_fn and feature_columns from numpy arrays.
+
+  The generated input_fn will return tf.data.Dataset of feature dictionary and a
+  label, and feature_columns will consist of the list of
+  tf.feature_column.BucketizedColumn.
+
+  Note, for in-memory training, tf.data.Dataset should contain the whole data
+  as a single tensor. Don't use batch.
+
+  Args:
+    features_np: A numpy ndarray (shape=[batch_size, num_features]) for
+        float32 features.
+    label_np: A numpy ndarray (shape=[batch_size, 1]) for labels.
+
+  Returns:
+    input_fn: A function returning a Dataset of feature dict and label.
+    feature_names: A list of feature names.
+    feature_column: A list of tf.feature_column.BucketizedColumn.
+  """
+  num_features = features_np.shape[1]
+  features_np_list = np.split(features_np, num_features, axis=1)
+  # 1-based feature names.
+  feature_names = ["feature_%02d" % (i + 1) for i in range(num_features)]
+
+  # Create source feature_columns and bucketized_columns.
+  def get_bucket_boundaries(feature):
+    """Returns bucket boundaries for feature by percentiles."""
+    return np.unique(np.percentile(feature, range(0, 100))).tolist()
+  source_columns = [
+      tf.feature_column.numeric_column(
+          feature_name, dtype=tf.float32,
+          # Although higgs data have no missing values, in general, default
+          # could be set as 0 or some reasonable value for missing values.
+          default_value=0.0)
+      for feature_name in feature_names
+  ]
+  bucketized_columns = [
+      tf.feature_column.bucketized_column(
+          source_columns[i],
+          boundaries=get_bucket_boundaries(features_np_list[i]))
+      for i in range(num_features)
+  ]
+
+  # Make an input_fn that extracts source features.
+  def input_fn():
+    """Returns features as a dictionary of numpy arrays, and a label."""
+    features = {
+        feature_name: tf.constant(features_np_list[i])
+        for i, feature_name in enumerate(feature_names)
+    }
+    return tf.data.Dataset.zip((tf.data.Dataset.from_tensors(features),
+                                tf.data.Dataset.from_tensors(label_np),))
+
+  return input_fn, feature_names, bucketized_columns
+
+
+def make_eval_inputs_from_np_arrays(features_np, label_np):
+  """Makes eval input as streaming batches."""
+  num_features = features_np.shape[1]
+  features_np_list = np.split(features_np, num_features, axis=1)
+  # 1-based feature names.
+  feature_names = ["feature_%02d" % (i + 1) for i in range(num_features)]
+
+  def input_fn():
+    features = {
+        feature_name: tf.constant(features_np_list[i])
+        for i, feature_name in enumerate(feature_names)
+    }
+    return tf.data.Dataset.zip((
+        tf.data.Dataset.from_tensor_slices(features),
+        tf.data.Dataset.from_tensor_slices(label_np),)).batch(1000)
+
+  return input_fn
+
+
+def _make_csv_serving_input_receiver_fn(column_names, column_defaults):
+  """Returns serving_input_receiver_fn for csv.
+
+  The input arguments are relevant to `tf.decode_csv()`.
+
+  Args:
+    column_names: a list of column names in the order within input csv.
+    column_defaults: a list of default values with the same size of
+        column_names. Each entity must be either a list of one scalar, or an
+        empty list to denote the corresponding column is required.
+        e.g. [[""], [2.5], []] indicates the third column is required while
+            the first column must be string and the second must be float/double.
+
+  Returns:
+    a serving_input_receiver_fn that handles csv for serving.
+  """
+  def serving_input_receiver_fn():
+    csv = tf.placeholder(dtype=tf.string, shape=[None], name="csv")
+    features = dict(zip(column_names, tf.decode_csv(csv, column_defaults)))
+    receiver_tensors = {"inputs": csv}
+    return tf.estimator.export.ServingInputReceiver(features, receiver_tensors)
+
+  return serving_input_receiver_fn
+
+
+def train_boosted_trees(flags_obj):
+  """Train boosted_trees estimator on HIGGS data.
+
+  Deletes any existing model directory, loads the train/eval slices, trains
+  the classifier in memory, evaluates it, logs the evaluation result through
+  the benchmark logger and optionally exports a SavedModel that parses csv.
+
+  Args:
+    flags_obj: An object containing parsed flag values.
+  """
+  # Clean up the model directory if present.
+  if tf.gfile.Exists(flags_obj.model_dir):
+    tf.gfile.DeleteRecursively(flags_obj.model_dir)
+  tf.logging.info("## Data loading...")
+  train_data, eval_data = read_higgs_data(
+      flags_obj.data_dir, flags_obj.train_start, flags_obj.train_count,
+      flags_obj.eval_start, flags_obj.eval_count)
+  tf.logging.info("## Data loaded; train: {}{}, eval: {}{}".format(
+      train_data.dtype, train_data.shape, eval_data.dtype, eval_data.shape))
+  # Data consists of one label column followed by 28 feature columns.
+  # Column 0 is sliced as 0:1 so the labels keep a second dimension.
+  train_input_fn, feature_names, feature_columns = make_inputs_from_np_arrays(
+      features_np=train_data[:, 1:], label_np=train_data[:, 0:1])
+  eval_input_fn = make_eval_inputs_from_np_arrays(
+      features_np=eval_data[:, 1:], label_np=eval_data[:, 0:1])
+  tf.logging.info("## Features prepared. Training starts...")
+
+  # Create benchmark logger to log info about the training and metric values
+  run_params = {
+      "train_start": flags_obj.train_start,
+      "train_count": flags_obj.train_count,
+      "eval_start": flags_obj.eval_start,
+      "eval_count": flags_obj.eval_count,
+      "n_trees": flags_obj.n_trees,
+      "max_depth": flags_obj.max_depth,
+  }
+  benchmark_logger = logger.config_benchmark_logger(flags_obj)
+  benchmark_logger.log_run_info(
+      model_name="boosted_trees",
+      dataset_name="higgs",
+      run_params=run_params,
+      test_id=flags_obj.benchmark_test_id)
+
+  # Though BoostedTreesClassifier is under tf.estimator, faster in-memory
+  # training is yet provided as a contrib library.
+  # The import is local so tensorflow.contrib is only needed when training.
+  from tensorflow.contrib import estimator as contrib_estimator  # pylint: disable=g-import-not-at-top
+  # An empty model_dir flag is passed on as None.
+  classifier = contrib_estimator.boosted_trees_classifier_train_in_memory(
+      train_input_fn,
+      feature_columns,
+      model_dir=flags_obj.model_dir or None,
+      n_trees=flags_obj.n_trees,
+      max_depth=flags_obj.max_depth,
+      learning_rate=flags_obj.learning_rate)
+
+  # Evaluation.
+  eval_results = classifier.evaluate(eval_input_fn)
+  # Benchmark the evaluation results
+  benchmark_logger.log_evaluation_result(eval_results)
+
+  # Exporting the savedmodel with csv parsing.
+  # Skipped when no export_dir flag was given.
+  if flags_obj.export_dir is not None:
+    classifier.export_savedmodel(
+        flags_obj.export_dir,
+        _make_csv_serving_input_receiver_fn(
+            column_names=feature_names,
+            # columns are all floats.
+            column_defaults=[[0.0]] * len(feature_names)),
+        strip_default_attrs=True)
+
+
+def main(_):
+  train_boosted_trees(flags.FLAGS)
+
+
+def define_train_higgs_flags():
+  """Add tree related flags as well as training/eval configuration."""
+  flags_core.define_base(clean=False, stop_threshold=False, batch_size=False,
+                         num_gpu=False, export_dir=True)
+  flags_core.define_benchmark()
+  flags.adopt_module_key_flags(flags_core)
+
+  flags.DEFINE_integer(
+      name="train_start", default=0,
+      help=help_wrap("Start index of train examples within the data."))
+  flags.DEFINE_integer(
+      name="train_count", default=1000000,
+      help=help_wrap("Number of train examples within the data."))
+  flags.DEFINE_integer(
+      name="eval_start", default=10000000,
+      help=help_wrap("Start index of eval examples within the data."))
+  flags.DEFINE_integer(
+      name="eval_count", default=1000000,
+      help=help_wrap("Number of eval examples within the data."))
+
+  flags.DEFINE_integer(
+      "n_trees", default=100, help=help_wrap("Number of trees to build."))
+  flags.DEFINE_integer(
+      "max_depth", default=6, help=help_wrap("Maximum depths of each tree."))
+  flags.DEFINE_float(
+      "learning_rate", default=0.1,
+      help=help_wrap("The learning rate."))
+
+  flags_core.set_defaults(data_dir="/tmp/higgs_data",
+                          model_dir="/tmp/higgs_model")
+
+
+if __name__ == "__main__":
+  # Training progress and eval results are shown as logging.INFO; so enables it.
+  tf.logging.set_verbosity(tf.logging.INFO)
+  define_train_higgs_flags()
+  absl_app.run(main)
@@ -0,0 +1,152 @@
+# ResNet in TensorFlow
+
+Deep residual networks, or ResNets for short, provided the breakthrough idea of
+identity mappings in order to enable training of very deep convolutional neural
+networks. This folder contains an implementation of ResNet for the ImageNet
+dataset written in TensorFlow.
+
+See the following papers for more background:
+
+[1] [Deep Residual Learning for Image Recognition](https://arxiv.org/pdf/1512.03385.pdf) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Dec 2015.
+
+[2] [Identity Mappings in Deep Residual Networks](https://arxiv.org/pdf/1603.05027.pdf) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Jul 2016.
+
+In code, v1 refers to the ResNet defined in [1] but where a stride 2 is used on
+the 3x3 conv rather than the first 1x1 in the bottleneck. This change results
+in higher and more stable accuracy with fewer epochs than the original v1 and has
+been shown to scale to higher batch sizes with minimal degradation in accuracy.
+There is no originating paper. The first mention we are aware of was in the
+torch version of [ResNetv1](https://github.com/facebook/fb.resnet.torch). Most
+popular v1 implementations are this implementation which we call ResNetv1.5.
+
+In testing we found v1.5 requires ~12% more compute to train and has 6% reduced
+throughput for inference compared to ResNetv1. CIFAR-10 ResNet does not use the
+bottleneck and is thus the same for v1 as v1.5.
+
+v2 refers to [2]. The principal difference between the two versions is that v1
+applies batch normalization and activation after convolution, while v2 applies
+batch normalization, then activation, and finally convolution. A schematic
+comparison is presented in Figure 1 (left) of [2].
+
+Please proceed according to which dataset you would like to train/evaluate on:
+
+
+## CIFAR-10
+
+### Setup
+
+You need to have the latest version of TensorFlow installed.
+First, make sure [the models folder is in your Python path](/official/#running-the-models); otherwise you may encounter `ImportError: No module named official.resnet`.
+
+Then, download and extract the CIFAR-10 data from Alex Krizhevsky's website, specifying the location with the `--data_dir` flag. Run the following:
+
+```bash
+python cifar10_download_and_extract.py --data_dir <DATA_DIR>
+```
+
+Then, to train the model:
+
+```bash
+python cifar10_main.py --data_dir <DATA_DIR>/cifar-10-batches-bin --model_dir <MODEL_DIR>
+```
+
+Use `--data_dir` to specify the location of the CIFAR-10 data used in the previous step. There are more flag options as described in `cifar10_main.py`.
+
+To export a `SavedModel` from the trained checkpoint:
+
+```bash
+python cifar10_main.py --data_dir <DATA_DIR>/cifar-10-batches-bin --model_dir <MODEL_DIR> --eval_only --export_dir <EXPORT_DIR>
+```
+
+Note: The `<EXPORT_DIR>` must be present. You might want to run `mkdir <EXPORT_DIR>` beforehand.
+
+The `SavedModel` can then be [loaded](https://www.tensorflow.org/guide/saved_model#loading_a_savedmodel_in_python) in order to use the ResNet for prediction.
+
+
+## ImageNet
+
+### Setup
+To begin, you will need to download the ImageNet dataset and convert it to
+TFRecord format. The following [script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py)
+and [README](https://github.com/tensorflow/tpu/tree/master/tools/datasets#imagenet_to_gcspy)
+provide a few options.
+
+Once your dataset is ready, you can begin training the model as follows:
+
+```bash
+python imagenet_main.py --data_dir=/path/to/imagenet
+```
+
+The model will begin training and will automatically evaluate itself on the
+validation data roughly once per epoch.
+
+Note that there are a number of other options you can specify, including
+`--model_dir` to choose where to store the model and `--resnet_size` to choose
+the model size (options include ResNet-18 through ResNet-200). See
+[`resnet_run_loop.py`](resnet_run_loop.py) for the full list of options.
+
+
+## Compute Devices
+Training is accomplished using the [DistributionStrategies API](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/distribute/README.md).
+
+The appropriate distribution strategy is chosen based on the `--num_gpus` flag.
+By default this flag is one if TensorFlow is compiled with CUDA, and zero
+otherwise.
+
+`num_gpus`:
+* 0:  Use OneDeviceStrategy and train on CPU.
+* 1:  Use OneDeviceStrategy and train on GPU.
+* 2+: Use MirroredStrategy (data parallelism) to distribute a batch between devices.
+
+### Pre-trained model
+You can download pre-trained versions of ResNet-50. Reported accuracies are top-1 single-crop accuracy for the ImageNet validation set.
+Models are provided as both checkpoints produced by Estimator during training, and as SavedModels which are more portable. Checkpoints are fragile,
+and these are not guaranteed to work with future versions of the code. Both ResNet v1
+and ResNet v2 have been trained in both fp16 and fp32 precision. (Here v1 refers to "v1.5". See the note above.) Furthermore, SavedModels
+are generated to accept either tensor or JPG inputs, and with channels_first (NCHW) and channels_last (NHWC) convolutions. NCHW is generally
+better for GPUs, while NHWC is generally better for CPUs. See the TensorFlow [performance guide](https://www.tensorflow.org/performance/performance_guide#data_formats)
+for more details.
+
+ResNet-50 v2 (fp32, Accuracy 76.47%):
+* [Checkpoint](http://download.tensorflow.org/models/official/20181001_resnet/checkpoints/resnet_imagenet_v2_fp32_20181001.tar.gz)
+* SavedModel [(NCHW)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v2_fp32_savedmodel_NCHW.tar.gz),
+[(NCHW, JPG)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v2_fp32_savedmodel_NCHW_jpg.tar.gz),
+[(NHWC)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v2_fp32_savedmodel_NHWC.tar.gz),
+[(NHWC, JPG)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v2_fp32_savedmodel_NHWC_jpg.tar.gz)
+
+ResNet-50 v2 (fp16, Accuracy 76.56%):
+* [Checkpoint](http://download.tensorflow.org/models/official/20181001_resnet/checkpoints/resnet_imagenet_v2_fp16_20180928.tar.gz)
+* SavedModel [(NCHW)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v2_fp16_savedmodel_NCHW.tar.gz),
+[(NCHW, JPG)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v2_fp16_savedmodel_NCHW_jpg.tar.gz),
+[(NHWC)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v2_fp16_savedmodel_NHWC.tar.gz),
+[(NHWC, JPG)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v2_fp16_savedmodel_NHWC_jpg.tar.gz)
+
+ResNet-50 v1 (fp32, Accuracy 76.53%):
+* [Checkpoint](http://download.tensorflow.org/models/official/20181001_resnet/checkpoints/resnet_imagenet_v1_fp32_20181001.tar.gz)
+* SavedModel [(NCHW)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v1_fp32_savedmodel_NCHW.tar.gz),
+[(NCHW, JPG)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v1_fp32_savedmodel_NCHW_jpg.tar.gz),
+[(NHWC)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v1_fp32_savedmodel_NHWC.tar.gz),
+[(NHWC, JPG)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v1_fp32_savedmodel_NHWC_jpg.tar.gz)
+
+ResNet-50 v1 (fp16, Accuracy 76.18%):
+* [Checkpoint](http://download.tensorflow.org/models/official/20181001_resnet/checkpoints/resnet_imagenet_v1_fp16_20181001.tar.gz)
+* SavedModel [(NCHW)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v1_fp16_savedmodel_NCHW.tar.gz),
+[(NCHW, JPG)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v1_fp16_savedmodel_NCHW_jpg.tar.gz),
+[(NHWC)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v1_fp16_savedmodel_NHWC.tar.gz),
+[(NHWC, JPG)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v1_fp16_savedmodel_NHWC_jpg.tar.gz)
+
+### Transfer Learning
+You can use a pretrained model to initialize a training process. In addition you are able to freeze all but the final fully connected layers to fine tune your model. Transfer Learning is useful when training on your own small datasets. For a brief look at transfer learning in the context of convolutional neural networks, we recommend reading these [short notes](http://cs231n.github.io/transfer-learning/).
+
+
+To fine tune a pretrained resnet you must make three changes to your training procedure:
+
+1) Build the exact same model as previously except we change the number of labels in the final classification layer.
+
+2) Restore all weights from the pre-trained resnet except for the final classification layer; this will get randomly initialized instead.
+
+3) Freeze earlier layers of the network.
+
+We can perform these three operations by specifying two flags: ```--pretrained_model_checkpoint_path``` and ```--fine_tune```. The first flag is a string that points to the path of a pre-trained resnet model. If this flag is specified, it will load all but the final classification layer. A key thing to note: if both ```--pretrained_model_checkpoint_path``` and a non empty ```model_dir``` directory are passed, the tensorflow estimator will load only the ```model_dir```. For more on this please see [WarmStartSettings](https://www.tensorflow.org/versions/master/api_docs/python/tf/estimator/WarmStartSettings) and [Estimators](https://www.tensorflow.org/guide/estimators).
+
+The second flag ```--fine_tune``` is a boolean that indicates whether earlier layers of the network should be frozen. You may set this flag to false if you wish to continue training a pre-trained model from a checkpoint. If you set this flag to true, you can train a new classification layer from scratch.
@@ -0,0 +1,499 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Executes Estimator benchmarks and accuracy tests."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+
+from absl import flags
+from absl.testing import flagsaver
+import tensorflow as tf  # pylint: disable=g-bad-import-order
+
+from official.r1.resnet import cifar10_main as cifar_main
+from official.r1.resnet import imagenet_main
+from official.utils.flags import core as flags_core
+from official.utils.logs import hooks
+
+IMAGENET_DATA_DIR_NAME = 'imagenet'
+CIFAR_DATA_DIR_NAME = 'cifar-10-batches-bin'
+FLAGS = flags.FLAGS
+
+
+class EstimatorBenchmark(tf.test.Benchmark):
+  """Base class to hold methods common to test classes in the module.
+
+     Code under test for Estimator models (ResNet50 and 56) report mostly the
+     same data and require the same FLAG setup.
+  """
+
+  local_flags = None
+
+  def __init__(self, output_dir=None, default_flags=None, flag_methods=None):
+    if not output_dir:
+      output_dir = '/tmp'
+    self.output_dir = output_dir
+    self.default_flags = default_flags or {}
+    self.flag_methods = flag_methods or {}
+
+  def _get_model_dir(self, folder_name):
+    """Returns directory to store info, e.g. saved model and event log."""
+    return os.path.join(self.output_dir, folder_name)
+
+  def _setup(self):
+    """Sets up and resets flags before each test."""
+    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
+    if EstimatorBenchmark.local_flags is None:
+      for flag_method in self.flag_methods:
+        flag_method()
+      # Loads flags to get defaults to then override. List cannot be empty.
+      flags.FLAGS(['foo'])
+      # Overrides flag values with defaults for the class of tests.
+      for k, v in self.default_flags.items():
+        setattr(FLAGS, k, v)
+      saved_flag_values = flagsaver.save_flag_values()
+      EstimatorBenchmark.local_flags = saved_flag_values
+    else:
+      flagsaver.restore_flag_values(EstimatorBenchmark.local_flags)
+
+  def _report_benchmark(self,
+                        stats,
+                        wall_time_sec,
+                        top_1_max=None,
+                        top_1_min=None):
+    """Report benchmark results by writing to local protobuf file.
+
+    Args:
+      stats: dict returned from estimator models with known entries.
+      wall_time_sec: the during of the benchmark execution in seconds
+      top_1_max: highest passing level for top_1 accuracy.
+      top_1_min: lowest passing level for top_1 accuracy.
+    """
+
+    examples_per_sec_hook = None
+    for hook in stats['train_hooks']:
+      if isinstance(hook, hooks.ExamplesPerSecondHook):
+        examples_per_sec_hook = hook
+        break
+
+    eval_results = stats['eval_results']
+    metrics = []
+    if 'accuracy' in eval_results:
+      metrics.append({'name': 'accuracy_top_1',
+                      'value': float(eval_results['accuracy']),
+                      'min_value': top_1_min,
+                      'max_value': top_1_max})
+    if 'accuracy_top_5' in eval_results:
+      metrics.append({'name': 'accuracy_top_5',
+                      'value': float(eval_results['accuracy_top_5'])})
+
+    if examples_per_sec_hook:
+      exp_per_second_list = examples_per_sec_hook.current_examples_per_sec_list
+      # ExamplesPerSecondHook skips the first 10 steps.
+      exp_per_sec = sum(exp_per_second_list) / (len(exp_per_second_list))
+      metrics.append({'name': 'exp_per_second',
+                      'value': exp_per_sec})
+    flags_str = flags_core.get_nondefault_flags_as_str()
+    self.report_benchmark(
+        iters=eval_results.get('global_step', None),
+        wall_time=wall_time_sec,
+        metrics=metrics,
+        extras={'flags': flags_str})
+
+
+class Resnet50EstimatorAccuracy(EstimatorBenchmark):
+  """Benchmark accuracy tests for ResNet50 w/ Estimator."""
+
+  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
+    """Benchmark accuracy tests for ResNet50 w/ Estimator.
+
+    Args:
+      output_dir: directory where to output e.g. log files
+      root_data_dir: directory under which to look for dataset
+      **kwargs: arbitrary named arguments. This is needed to make the
+                constructor forward compatible in case PerfZero provides more
+                named arguments before updating the constructor.
+    """
+    flag_methods = [imagenet_main.define_imagenet_flags]
+
+    self.data_dir = os.path.join(root_data_dir, IMAGENET_DATA_DIR_NAME)
+    super(Resnet50EstimatorAccuracy, self).__init__(
+        output_dir=output_dir, flag_methods=flag_methods)
+
+  def benchmark_graph_8_gpu(self):
+    """Test 8 GPUs graph mode."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.data_dir = self.data_dir
+    FLAGS.batch_size = 128 * 8
+    FLAGS.train_epochs = 90
+    FLAGS.epochs_between_evals = 10
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
+    FLAGS.dtype = 'fp32'
+    FLAGS.hooks = ['ExamplesPerSecondHook']
+    self._run_and_report_benchmark()
+
+  def benchmark_graph_fp16_8_gpu(self):
+    """Test FP16 8 GPUs graph mode."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.data_dir = self.data_dir
+    FLAGS.batch_size = 256 * 8
+    FLAGS.train_epochs = 90
+    FLAGS.epochs_between_evals = 10
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_8_gpu')
+    FLAGS.dtype = 'fp16'
+    FLAGS.hooks = ['ExamplesPerSecondHook']
+    self._run_and_report_benchmark()
+
+  def benchmark_graph_fp16_graph_rewrite_8_gpu(self):
+    """Test FP16 graph rewrite 8 GPUs graph mode."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.data_dir = self.data_dir
+    FLAGS.batch_size = 256 * 8
+    FLAGS.train_epochs = 90
+    FLAGS.epochs_between_evals = 10
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_graph_fp16_graph_rewrite_8_gpu')
+    FLAGS.dtype = 'fp16'
+    FLAGS.fp16_implementation = 'graph_rewrite'
+    FLAGS.hooks = ['ExamplesPerSecondHook']
+    self._run_and_report_benchmark()
+
+  def _run_and_report_benchmark(self):
+    start_time_sec = time.time()
+    stats = imagenet_main.run_imagenet(flags.FLAGS)
+    wall_time_sec = time.time() - start_time_sec
+    self._report_benchmark(stats,
+                           wall_time_sec,
+                           top_1_min=0.762,
+                           top_1_max=0.766)
+
+
+class Resnet50EstimatorBenchmarkBase(EstimatorBenchmark):
+  """Base class for benchmarks for ResNet50 using Estimator."""
+  local_flags = None
+
+  def __init__(self, output_dir=None, default_flags=None):
+    flag_methods = [imagenet_main.define_imagenet_flags]
+
+    super(Resnet50EstimatorBenchmarkBase, self).__init__(
+        output_dir=output_dir,
+        default_flags=default_flags,
+        flag_methods=flag_methods)
+
+  def _run_and_report_benchmark(self):
+    start_time_sec = time.time()
+    stats = imagenet_main.run_imagenet(FLAGS)
+    wall_time_sec = time.time() - start_time_sec
+    print(stats)
+    # Remove values to skip triggering accuracy check.
+    stats['eval_results'].pop('accuracy', None)
+    stats['eval_results'].pop('accuracy_top_5', None)
+
+    self._report_benchmark(stats, wall_time_sec)
+
+
+class Resnet50EstimatorBenchmark(Resnet50EstimatorBenchmarkBase):
+  """Benchmarks for ResNet50 using Estimator with 1 worker.
+
+  Each benchmark method resets the flags via `_setup`, assigns the flag values
+  for its configuration on the global FLAGS object, and then calls
+  `_run_and_report_benchmark`. The "tweaked" variants additionally set
+  `tf_gpu_thread_mode` and `intra_op_parallelism_threads`.
+  """
+
+  def __init__(self, output_dir=None, default_flags=None):
+    """Forwards both arguments unchanged to the parent constructor.
+
+    Args:
+      output_dir: directory where to output e.g. log files
+      default_flags: dict of flag values applied before each test's overrides
+    """
+    super(Resnet50EstimatorBenchmark, self).__init__(
+        output_dir=output_dir,
+        default_flags=default_flags)
+
+  def benchmark_graph_fp16_1_gpu(self):
+    """Benchmarks graph fp16 1 gpu."""
+    self._setup()
+
+    FLAGS.num_gpus = 1
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_1_gpu')
+    FLAGS.batch_size = 128
+    FLAGS.dtype = 'fp16'
+    FLAGS.hooks = ['ExamplesPerSecondHook']
+    self._run_and_report_benchmark()
+
+  def benchmark_graph_fp16_1_gpu_tweaked(self):
+    """Benchmarks graph fp16 1 gpu tweaked."""
+    self._setup()
+
+    FLAGS.num_gpus = 1
+    FLAGS.tf_gpu_thread_mode = 'gpu_private'
+    FLAGS.intra_op_parallelism_threads = 1
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_1_gpu_tweaked')
+    FLAGS.batch_size = 256
+    FLAGS.dtype = 'fp16'
+    FLAGS.hooks = ['ExamplesPerSecondHook']
+    self._run_and_report_benchmark()
+
+  def benchmark_graph_fp16_graph_rewrite_1_gpu_tweaked(self):
+    """Benchmarks graph fp16 graph rewrite 1 gpu tweaked."""
+    self._setup()
+
+    FLAGS.num_gpus = 1
+    FLAGS.tf_gpu_thread_mode = 'gpu_private'
+    FLAGS.intra_op_parallelism_threads = 1
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_graph_fp16_graph_rewrite_1_gpu_tweaked')
+    FLAGS.batch_size = 256
+    # NOTE(review): dtype is set before fp16_implementation in every test;
+    # keep that order in case a flag validator depends on it -- confirm.
+    FLAGS.dtype = 'fp16'
+    FLAGS.fp16_implementation = 'graph_rewrite'
+    FLAGS.hooks = ['ExamplesPerSecondHook']
+    self._run_and_report_benchmark()
+
+  def benchmark_graph_1_gpu(self):
+    """Benchmarks graph 1 gpu."""
+    self._setup()
+
+    FLAGS.num_gpus = 1
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
+    FLAGS.batch_size = 128
+    FLAGS.dtype = 'fp32'
+    FLAGS.hooks = ['ExamplesPerSecondHook']
+    self._run_and_report_benchmark()
+
+  def benchmark_graph_8_gpu(self):
+    """Benchmarks graph 8 gpus."""
+    self._setup()
+
+    FLAGS.num_gpus = 8
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
+    # Presumably 128 examples per GPU across the 8 GPUs -- TODO confirm how
+    # the run loop splits batch_size.
+    FLAGS.batch_size = 128*8
+    FLAGS.dtype = 'fp32'
+    FLAGS.hooks = ['ExamplesPerSecondHook']
+    self._run_and_report_benchmark()
+
+  def benchmark_graph_fp16_8_gpu(self):
+    """Benchmarks graph fp16 8 gpus."""
+    self._setup()
+
+    FLAGS.num_gpus = 8
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_8_gpu')
+    FLAGS.batch_size = 256*8
+    FLAGS.dtype = 'fp16'
+    FLAGS.hooks = ['ExamplesPerSecondHook']
+    self._run_and_report_benchmark()
+
+  def benchmark_graph_fp16_8_gpu_tweaked(self):
+    """Benchmarks graph fp16 8 gpus tweaked."""
+    self._setup()
+
+    FLAGS.num_gpus = 8
+    FLAGS.tf_gpu_thread_mode = 'gpu_private'
+    FLAGS.intra_op_parallelism_threads = 1
+    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_8_gpu_tweaked')
+    FLAGS.batch_size = 256*8
+    FLAGS.dtype = 'fp16'
+    FLAGS.hooks = ['ExamplesPerSecondHook']
+    self._run_and_report_benchmark()
+
+  def benchmark_graph_fp16_graph_rewrite_8_gpu_tweaked(self):
+    """Benchmarks graph fp16 graph rewrite 8 gpus tweaked."""
+    self._setup()
+
+    FLAGS.num_gpus = 8
+    FLAGS.tf_gpu_thread_mode = 'gpu_private'
+    FLAGS.intra_op_parallelism_threads = 1
+    FLAGS.model_dir = self._get_model_dir(
+        'benchmark_graph_fp16_graph_rewrite_8_gpu_tweaked')
+    FLAGS.batch_size = 256*8
+    FLAGS.dtype = 'fp16'
+    FLAGS.fp16_implementation = 'graph_rewrite'
+    FLAGS.hooks = ['ExamplesPerSecondHook']
+    self._run_and_report_benchmark()
+
+
+class Resnet50EstimatorBenchmarkSynth(Resnet50EstimatorBenchmark):
+  """Resnet50 synthetic benchmark tests."""
+
+  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
+    def_flags = {}
+    def_flags['use_synthetic_data'] = True
+    def_flags['max_train_steps'] = 110
+    def_flags['train_epochs'] = 1
+
+    super(Resnet50EstimatorBenchmarkSynth, self).__init__(
+        output_dir=output_dir, default_flags=def_flags)
+
+
+class Resnet50EstimatorBenchmarkReal(Resnet50EstimatorBenchmark):
+  """Resnet50 real data benchmark tests."""
+
+  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
+    def_flags = {}
+    def_flags['data_dir'] = os.path.join(root_data_dir, IMAGENET_DATA_DIR_NAME)
+    def_flags['max_train_steps'] = 110
+    def_flags['train_epochs'] = 1
+
+    super(Resnet50EstimatorBenchmarkReal, self).__init__(
+        output_dir=output_dir, default_flags=def_flags)
+
+
+class Resnet50MultiWorkerEstimatorBenchmark(Resnet50EstimatorBenchmarkBase):
+  """Benchmarks for ResNet50 using Estimator with multiple workers.
+
+  Both benchmarks select the 'multi_worker_mirrored' distribution strategy and
+  differ only in the `all_reduce_alg` value ('ring' vs 'nccl') and the model
+  directory name.
+  """
+
+  def __init__(self, output_dir=None, default_flags=None):
+    """Forwards both arguments unchanged to the parent constructor.
+
+    Args:
+      output_dir: directory where to output e.g. log files
+      default_flags: dict of flag values applied before each test's overrides
+    """
+    super(Resnet50MultiWorkerEstimatorBenchmark, self).__init__(
+        output_dir=output_dir,
+        default_flags=default_flags)
+
+  def benchmark_graph_fp16_8_gpu_ring_tweaked(self):
+    """Benchmarks graph fp16 8 gpus with ring collective tweaked."""
+    self._setup()
+
+    FLAGS.num_gpus = 8
+    FLAGS.distribution_strategy = 'multi_worker_mirrored'
+    FLAGS.all_reduce_alg = 'ring'
+    FLAGS.tf_gpu_thread_mode = 'gpu_private'
+    FLAGS.intra_op_parallelism_threads = 1
+    FLAGS.datasets_num_private_threads = 32
+    FLAGS.model_dir = self._get_model_dir(
+        folder_name='benchmark_graph_fp16_8_gpu_ring_tweaked')
+    FLAGS.batch_size = 256*8
+    FLAGS.dtype = 'fp16'
+    FLAGS.hooks = ['ExamplesPerSecondHook']
+    self._run_and_report_benchmark()
+
+  def benchmark_graph_fp16_8_gpu_nccl_tweaked(self):
+    """Benchmarks graph fp16 8 gpus with nccl collective tweaked."""
+    self._setup()
+
+    FLAGS.num_gpus = 8
+    FLAGS.distribution_strategy = 'multi_worker_mirrored'
+    FLAGS.all_reduce_alg = 'nccl'
+    FLAGS.tf_gpu_thread_mode = 'gpu_private'
+    FLAGS.intra_op_parallelism_threads = 1
+    FLAGS.datasets_num_private_threads = 32
+    FLAGS.model_dir = self._get_model_dir(
+        folder_name='benchmark_graph_fp16_8_gpu_nccl_tweaked')
+    FLAGS.batch_size = 256*8
+    FLAGS.dtype = 'fp16'
+    FLAGS.hooks = ['ExamplesPerSecondHook']
+    self._run_and_report_benchmark()
+
+
+class Resnet50MultiWorkerEstimatorBenchmarkSynth(
+    Resnet50MultiWorkerEstimatorBenchmark):
+  """ResNet50, multi-worker, Estimator, synthetic data."""
+
+  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
+    def_flags = {}
+    def_flags['use_synthetic_data'] = True
+    def_flags['max_train_steps'] = 110
+    def_flags['train_epochs'] = 1
+
+    super(Resnet50MultiWorkerEstimatorBenchmarkSynth, self).__init__(
+        output_dir=output_dir, default_flags=def_flags)
+
+
+class Resnet56EstimatorAccuracy(EstimatorBenchmark):
+  """Accuracy tests for Estimator ResNet56."""
+
+  local_flags = None
+
+  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
+    """A benchmark class.
+
+    Args:
+      output_dir: directory where to output e.g. log files
+      root_data_dir: directory under which to look for dataset
+      **kwargs: arbitrary named arguments. This is needed to make the
+                constructor forward compatible in case PerfZero provides more
+                named arguments before updating the constructor.
+    """
+    flag_methods = [cifar_main.define_cifar_flags]
+
+    self.data_dir = os.path.join(root_data_dir, CIFAR_DATA_DIR_NAME)
+    super(Resnet56EstimatorAccuracy, self).__init__(
+        output_dir=output_dir, flag_methods=flag_methods)
+
+  def benchmark_graph_1_gpu(self):
+    """Test layers model with Estimator and distribution strategies."""
+    self._setup()
+    flags.FLAGS.num_gpus = 1
+    flags.FLAGS.data_dir = self.data_dir
+    flags.FLAGS.batch_size = 128
+    flags.FLAGS.train_epochs = 182
+    flags.FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
+    flags.FLAGS.resnet_size = 56
+    flags.FLAGS.dtype = 'fp32'
+    flags.FLAGS.hooks = ['ExamplesPerSecondHook']
+    self._run_and_report_benchmark()
+
+  def benchmark_graph_fp16_1_gpu(self):
+    """Test layers FP16 model with Estimator and distribution strategies."""
+    self._setup()
+    flags.FLAGS.num_gpus = 1
+    flags.FLAGS.data_dir = self.data_dir
+    flags.FLAGS.batch_size = 128
+    flags.FLAGS.train_epochs = 182
+    flags.FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_1_gpu')
+    flags.FLAGS.resnet_size = 56
+    flags.FLAGS.dtype = 'fp16'
+    flags.FLAGS.hooks = ['ExamplesPerSecondHook']
+    self._run_and_report_benchmark()
+
+  def benchmark_graph_2_gpu(self):
+    """Test layers model with Estimator and dist_strat. 2 GPUs."""
+    self._setup()
+    flags.FLAGS.num_gpus = 2
+    flags.FLAGS.data_dir = self.data_dir
+    flags.FLAGS.batch_size = 128
+    flags.FLAGS.train_epochs = 182
+    flags.FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
+    flags.FLAGS.resnet_size = 56
+    flags.FLAGS.dtype = 'fp32'
+    flags.FLAGS.hooks = ['ExamplesPerSecondHook']
+    self._run_and_report_benchmark()
+
+  def benchmark_graph_fp16_2_gpu(self):
+    """Test layers FP16 model with Estimator and dist_strat. 2 GPUs."""
+    self._setup()
+    flags.FLAGS.num_gpus = 2
+    flags.FLAGS.data_dir = self.data_dir
+    flags.FLAGS.batch_size = 128
+    flags.FLAGS.train_epochs = 182
+    flags.FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_2_gpu')
+    flags.FLAGS.resnet_size = 56
+    flags.FLAGS.dtype = 'fp16'
+    flags.FLAGS.hooks = ['ExamplesPerSecondHook']
+    self._run_and_report_benchmark()
+
+  def unit_test(self):
+    """A lightweight test that can finish quickly."""
+    self._setup()
+    flags.FLAGS.num_gpus = 1
+    flags.FLAGS.data_dir = self.data_dir
+    flags.FLAGS.batch_size = 128
+    flags.FLAGS.train_epochs = 1
+    flags.FLAGS.model_dir = self._get_model_dir('unit_test')
+    flags.FLAGS.resnet_size = 8
+    flags.FLAGS.dtype = 'fp32'
+    flags.FLAGS.hooks = ['ExamplesPerSecondHook']
+    self._run_and_report_benchmark()
+
+  def _run_and_report_benchmark(self):
+    """Executes benchmark and reports result."""
+    start_time_sec = time.time()
+    stats = cifar_main.run_cifar(flags.FLAGS)
+    wall_time_sec = time.time() - start_time_sec
+
+    self._report_benchmark(stats,
+                           wall_time_sec,
+                           top_1_min=0.926,
+                           top_1_max=0.938)
@@ -0,0 +1,433 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Runs a ResNet model on the ImageNet dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+import datetime
+
+import time
+
+sys.path.append(os.path.dirname(os.path.realpath(__file__)) + '/../../../')
+
+# import pydevd_pycharm
+# pydevd_pycharm.settrace('10.174.181.209', port=8008, stdoutToServer=True, stderrToServer=True)
+
+
+from absl import app as absl_app
+from absl import flags
+from six.moves import range
+import tensorflow as tf
+
+from official.r1.resnet import imagenet_preprocessing
+from official.r1.resnet import resnet_model
+from official.r1.resnet import resnet_run_loop
+from official.utils.flags import core as flags_core
+from official.utils.logs import logger
+import logging
+############## npu modify begin #############
+from npu_bridge.estimator import npu_ops
+from hccl.manage.api import get_local_rank_id
+from hccl.manage.api import get_rank_size
+from hccl.manage.api import get_rank_id
+from tensorflow.core.protobuf import rewriter_config_pb2
+# Take the INFO constant from tf.compat.v1 as well, so both halves of the call
+# use the same namespace as the other set_verbosity calls in this file.
+tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
+############## npu modify end ###############
+
+# Height and width requested from the image preprocessing in parse_record().
+DEFAULT_IMAGE_SIZE = 224
+NUM_CHANNELS = 3
+# NOTE(review): presumably the 1000 ImageNet classes plus one background
+# class -- confirm against the dataset labels.
+NUM_CLASSES = 1001
+
+# Number of examples in each split.
+NUM_IMAGES = {
+    'train': 1281167,
+    'validation': 50000,
+}
+
+# Number of training shard files; get_filenames() enumerates this many names
+# and input_fn() uses it as the buffer size when shuffling the file list.
+_NUM_TRAIN_FILES = 1024
+# Passed as `shuffle_buffer` to resnet_run_loop.process_record_dataset.
+_SHUFFLE_BUFFER = 10000
+
+# Dataset name handed to resnet_run_loop.resnet_main.
+DATASET_NAME = 'ImageNet'
+#log_file1 = 'result/logger_resnet50.log'
+#log_file1 = os.path.join(os.path.abspath(os.path.dirname(__file__)),'../../../../result/logger_resnet50.log')
+
+
+###############################################################################
+# Data processing
+###############################################################################
+def get_filenames(is_training, data_dir):
+  """Lists the record-file paths that make up the requested split.
+
+  Args:
+    is_training: Truthy to list the training shards, falsy for validation.
+    data_dir: Directory that every shard name is joined onto.
+
+  Returns:
+    A list of `_NUM_TRAIN_FILES` 'train-XXXXX-of-01024' paths when training,
+    otherwise a list of 128 'validation-XXXXX-of-00128' paths.
+  """
+  if is_training:
+    name_template, shard_count = 'train-%05d-of-01024', _NUM_TRAIN_FILES
+  else:
+    name_template, shard_count = 'validation-%05d-of-00128', 128
+
+  shard_paths = []
+  for shard_index in range(shard_count):
+    shard_paths.append(os.path.join(data_dir, name_template % shard_index))
+  return shard_paths
+
+
+def _parse_example_proto(example_serialized):
+  """Parses an Example proto containing a training example of an image.
+
+  The output of the build_image_data.py image preprocessing script is a dataset
+  containing serialized Example protocol buffers. Each Example proto contains
+  the following fields (values are included as examples):
+
+    image/height: 462
+    image/width: 581
+    image/colorspace: 'RGB'
+    image/channels: 3
+    image/class/label: 615
+    image/class/synset: 'n03623198'
+    image/class/text: 'knee pad'
+    image/object/bbox/xmin: 0.1
+    image/object/bbox/xmax: 0.9
+    image/object/bbox/ymin: 0.2
+    image/object/bbox/ymax: 0.6
+    image/object/bbox/label: 615
+    image/format: 'JPEG'
+    image/filename: 'ILSVRC2012_val_00041207.JPEG'
+    image/encoded: <JPEG encoded string>
+
+  Only the encoded image, the class label, the class text and the four
+  bounding-box coordinate lists are actually requested from the proto here.
+
+  Args:
+    example_serialized: scalar Tensor tf.string containing a serialized
+      Example protocol buffer.
+
+  Returns:
+    image_buffer: Tensor tf.string containing the contents of a JPEG file.
+    label: Tensor tf.int32 containing the label.
+    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
+      where each coordinate is [0, 1) and the coordinates are arranged as
+      [ymin, xmin, ymax, xmax].
+  """
+  # Dense features in Example proto.
+  # Missing values fall back to '' for the strings and -1 for the label.
+  feature_map = {
+      'image/encoded': tf.io.FixedLenFeature([], dtype=tf.string,
+                                             default_value=''),
+      'image/class/label': tf.io.FixedLenFeature([], dtype=tf.int64,
+                                                 default_value=-1),
+      'image/class/text': tf.io.FixedLenFeature([], dtype=tf.string,
+                                                default_value=''),
+  }
+  sparse_float32 = tf.io.VarLenFeature(dtype=tf.float32)
+  # Sparse features in Example proto.
+  # All four coordinate keys share the same variable-length float spec.
+  feature_map.update(
+      {k: sparse_float32 for k in ['image/object/bbox/xmin',
+                                   'image/object/bbox/ymin',
+                                   'image/object/bbox/xmax',
+                                   'image/object/bbox/ymax']})
+
+  features = tf.io.parse_single_example(serialized=example_serialized,
+                                        features=feature_map)
+  # The label is stored as int64 in the proto and narrowed to int32 here.
+  label = tf.cast(features['image/class/label'], dtype=tf.int32)
+
+  # Give each coordinate list a leading axis so the four can be stacked as rows.
+  xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0)
+  ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0)
+  xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0)
+  ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0)
+
+  # Note that we impose an ordering of (y, x) just to make life difficult.
+  bbox = tf.concat([ymin, xmin, ymax, xmax], 0)
+
+  # Force the variable number of bounding boxes into the shape
+  # [1, num_boxes, coords].
+  bbox = tf.expand_dims(bbox, 0)
+  bbox = tf.transpose(a=bbox, perm=[0, 2, 1])
+
+  return features['image/encoded'], label, bbox
+
+
+def parse_record(raw_record, is_training, dtype):
+  """Parses a record containing a training example of an image.
+
+  The input record is parsed into a label and image, and the image is passed
+  through preprocessing steps (cropping, flipping, and so on).
+
+  Args:
+    raw_record: scalar Tensor tf.string containing a serialized
+      Example protocol buffer.
+    is_training: A boolean denoting whether the input is for training.
+    dtype: data type to use for images/features.
+
+  Returns:
+    Tuple with the processed image tensor, cast to `dtype`, and the int32
+    label tensor produced by `_parse_example_proto` (the label is not
+    one-hot encoded here).
+  """
+  image_buffer, label, bbox = _parse_example_proto(raw_record)
+  #work_num, root_dir, datatime, resnet_logger = hwlog.env(log_file1)
+  # add preprocessing
+  #resnet_logger.info("namespace:%s,time_ts:%s, event_type:pre_process_event" % (work_num, date_time))
+  #resnet_logger.info("namespace:%s,time_ts:%s,event_type:init_start" % (work_num, date_time))
+  # Images are always produced at DEFAULT_IMAGE_SIZE x DEFAULT_IMAGE_SIZE.
+  image = imagenet_preprocessing.preprocess_image(
+      image_buffer=image_buffer,
+      bbox=bbox,
+      output_height=DEFAULT_IMAGE_SIZE,
+      output_width=DEFAULT_IMAGE_SIZE,
+      num_channels=NUM_CHANNELS,
+      is_training=is_training)
+  # resnet_logger.info("namespace:%s,time_ts:%d,event_type:init_end, root_dir:%s" % (work_num, datatime, root_dir))
+  image = tf.cast(image, dtype)
+
+  return image, label
+
+
+def input_fn(is_training,
+             data_dir,
+             batch_size,
+             num_epochs=1,
+             dtype=tf.float32,
+             datasets_num_private_threads=None,
+             parse_record_fn=parse_record,
+             input_context=None,
+             drop_remainder=False,
+             tf_data_experimental_slack=False):
+  """Input function which provides batches for train or eval.
+
+  Args:
+    is_training: A boolean denoting whether the input is for training.
+    data_dir: The directory containing the input data.
+    batch_size: The number of samples per batch.
+    num_epochs: The number of epochs to repeat the dataset.
+    dtype: Data type to use for images/features
+    datasets_num_private_threads: Number of private threads for tf.data.
+    parse_record_fn: Function to use for parsing the records.
+    input_context: A `tf.distribute.InputContext` object passed in by
+      `tf.distribute.Strategy`. In this NPU variant it is only tested for
+      truthiness: when set, the file list is sharded by the rank size and
+      rank id reported by `hccl.manage.api`, not by fields of the context.
+    drop_remainder: A boolean indicates whether to drop the remainder of the
+      batches. If True, the batch dimension will be static.
+    tf_data_experimental_slack: Whether to enable tf.data's
+      `experimental_slack` option.
+
+  Returns:
+    A dataset that can be used for iteration.
+  """
+  filenames = get_filenames(is_training, data_dir)
+  dataset = tf.data.Dataset.from_tensor_slices(filenames)
+
+  if input_context:
+    ############## npu modify begin #############
+    # Sharding happens at file granularity, before any record is read.
+    dataset = dataset.shard(get_rank_size(),
+                              get_rank_id())
+    ############## npu modify end ###############
+
+  if is_training:
+    # Shuffle the input files
+    dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)
+
+  # Convert to individual records.
+  # cycle_length = 10 means that up to 10 files will be read and deserialized in
+  # parallel. You may want to increase this number if you have a large number of
+  # CPU cores.
+  dataset = dataset.interleave(
+      tf.data.TFRecordDataset,
+      cycle_length=10,
+      num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+  # Record-level shuffling, parsing, repeating and batching are delegated to
+  # the shared run loop helper.
+  return resnet_run_loop.process_record_dataset(
+      dataset=dataset,
+      is_training=is_training,
+      batch_size=batch_size,
+      shuffle_buffer=_SHUFFLE_BUFFER,
+      parse_record_fn=parse_record_fn,
+      num_epochs=num_epochs,
+      dtype=dtype,
+      datasets_num_private_threads=datasets_num_private_threads,
+      drop_remainder=drop_remainder,
+      tf_data_experimental_slack=tf_data_experimental_slack,
+  )
+
+
+def get_synth_input_fn(dtype):
+  """Builds a synthetic-data input function with this module's image settings.
+
+  Args:
+    dtype: Data type forwarded to `resnet_run_loop.get_synth_input_fn`.
+
+  Returns:
+    Whatever `resnet_run_loop.get_synth_input_fn` returns for the default
+    image height/width, channel count and class count defined in this module.
+  """
+  height = width = DEFAULT_IMAGE_SIZE
+  return resnet_run_loop.get_synth_input_fn(
+      height, width, NUM_CHANNELS, NUM_CLASSES, dtype=dtype)
+
+
+###############################################################################
+# Running the model
+###############################################################################
+class ImagenetModel(resnet_model.Model):
+  """ResNet model preconfigured with the settings used for ImageNet data."""
+
+  def __init__(self, resnet_size, data_format=None, num_classes=NUM_CLASSES,
+               resnet_version=resnet_model.DEFAULT_VERSION,
+               dtype=resnet_model.DEFAULT_DTYPE):
+    """Configures the base model with the ImageNet hyperparameters.
+
+    Args:
+      resnet_size: The number of convolutional layers needed in the model.
+      data_format: Either 'channels_first' or 'channels_last', specifying which
+        data format to use when setting up the model.
+      num_classes: The number of output classes needed from the model. This
+        enables users to extend the same model to their own datasets.
+      resnet_version: Integer representing which version of the ResNet network
+        to use. See README for details. Valid values: [1, 2]
+      dtype: The TensorFlow dtype to use for calculations.
+    """
+    # Models of size 50 and above are built from "bottleneck" blocks.
+    use_bottleneck = resnet_size >= 50
+
+    imagenet_config = dict(
+        resnet_size=resnet_size,
+        bottleneck=use_bottleneck,
+        num_classes=num_classes,
+        num_filters=64,
+        kernel_size=7,
+        conv_stride=2,
+        first_pool_size=3,
+        first_pool_stride=2,
+        block_sizes=_get_block_sizes(resnet_size),
+        block_strides=[1, 2, 2, 2],
+        resnet_version=resnet_version,
+        data_format=data_format,
+        dtype=dtype,
+    )
+    super(ImagenetModel, self).__init__(**imagenet_config)
+
+
+def _get_block_sizes(resnet_size):
+  """Retrieve the size of each block_layer in the ResNet model.
+
+  The number of block layers used for the Resnet model varies according
+  to the size of the model. This helper grabs the layer set we want, throwing
+  an error if a non-standard size has been selected.
+
+  Args:
+    resnet_size: The number of convolutional layers needed in the model.
+
+  Returns:
+    A list of block sizes to use in building the model.
+
+  Raises:
+    ValueError: if `resnet_size` is not one of the supported sizes.
+  """
+  choices = {
+      18: [2, 2, 2, 2],
+      34: [3, 4, 6, 3],
+      50: [3, 4, 6, 3],
+      101: [3, 4, 23, 3],
+      152: [3, 8, 36, 3],
+      200: [3, 24, 36, 3]
+  }
+
+  # Check membership up front so the ValueError is raised on its own instead of
+  # being chained to an internal KeyError.
+  if resnet_size not in choices:
+    # sorted() keeps the list of allowed sizes in the message deterministic on
+    # interpreters whose dicts do not preserve insertion order.
+    err = ('Could not find layers for selected Resnet size.\n'
+           'Size received: {}; sizes allowed: {}.'.format(
+               resnet_size, sorted(choices)))
+    raise ValueError(err)
+
+  return choices[resnet_size]
+
+
+def imagenet_model_fn(features, labels, mode, params):
+  """Estimator model_fn that builds the ImageNet ResNet.
+
+  Chooses the learning-rate schedule from `params` and delegates the model
+  construction to `resnet_run_loop.resnet_model_fn`. Weight decay and label
+  smoothing are read from the global absl flags rather than from `params`.
+
+  Args:
+    features: Passed through to `resnet_run_loop.resnet_model_fn`.
+    labels: Passed through to `resnet_run_loop.resnet_model_fn`.
+    mode: Passed through to `resnet_run_loop.resnet_model_fn`.
+    params: Dict read for the keys 'fine_tune', 'num_gpus', 'batch_size',
+      'resnet_size', 'data_format', 'resnet_version', 'loss_scale' and
+      'dtype'.
+
+  Returns:
+    The result of `resnet_run_loop.resnet_model_fn`.
+  """
+  # Warmup and the higher learning rate may not be valid for fine tuning with
+  # small batches and smaller numbers of training images.
+  fine_tuning = params['fine_tune']
+  use_warmup = not fine_tuning
+  base_lr = .1 if fine_tuning else .128
+
+  global_batch_size = params['num_gpus'] * params['batch_size']
+  learning_rate_fn = resnet_run_loop.learning_rate_with_decay(
+      batch_size=global_batch_size,
+      batch_denom=256,
+      num_images=NUM_IMAGES['train'],
+      boundary_epochs=[30, 60, 80, 90],
+      decay_rates=[1, 0.1, 0.01, 0.001, 1e-4],
+      warmup=use_warmup,
+      base_lr=base_lr)
+
+  model_fn_kwargs = dict(
+      features=features,
+      labels=labels,
+      mode=mode,
+      model_class=ImagenetModel,
+      resnet_size=params['resnet_size'],
+      weight_decay=flags.FLAGS.weight_decay,
+      learning_rate_fn=learning_rate_fn,
+      momentum=0.9,
+      data_format=params['data_format'],
+      resnet_version=params['resnet_version'],
+      loss_scale=params['loss_scale'],
+      loss_filter_fn=None,
+      dtype=params['dtype'],
+      fine_tune=fine_tuning,
+      label_smoothing=flags.FLAGS.label_smoothing,
+  )
+  return resnet_run_loop.resnet_model_fn(**model_fn_kwargs)
+
+
+def define_imagenet_flags():
+  """Registers the ResNet flags and applies the ImageNet default values.
+
+  Defines the shared ResNet flags with the ImageNet size choices, adopts the
+  key flags of `resnet_run_loop`, and then overrides the defaults for
+  `train_epochs` (90) and `loss_scale` ('512').
+  """
+  supported_sizes = ['18', '34', '50', '101', '152', '200']
+  resnet_run_loop.define_resnet_flags(
+      resnet_size_choices=supported_sizes,
+      dynamic_loss_scale=True,
+      fp16_implementation=True)
+  flags.adopt_module_key_flags(resnet_run_loop)
+
+  flags_core.set_defaults(train_epochs=90)
+  # Loss scale is used by default because the Davinci core supports mixed
+  # precision naturally.
+  flags_core.set_defaults(loss_scale='512')
+
+def run_imagenet(flags_obj):
+  """Run ResNet ImageNet training and eval loop.
+
+  Args:
+    flags_obj: An object containing parsed flag values.
+
+  Returns:
+    Dict of results of the run.  Contains the keys `eval_results` and
+      `train_hooks`. `eval_results` contains accuracy (top_1) and
+      accuracy_top_5. `train_hooks` is a list the instances of hooks used during
+      training.
+  """
+  # Read real data unless synthetic data was requested; a falsy synthetic
+  # input function also falls back to the real one.
+  input_function = input_fn
+  if flags_obj.use_synthetic_data:
+    synth_dtype = flags_core.get_tf_dtype(flags_obj)
+    input_function = get_synth_input_fn(synth_dtype) or input_fn
+
+  image_shape = [DEFAULT_IMAGE_SIZE, DEFAULT_IMAGE_SIZE, NUM_CHANNELS]
+  return resnet_run_loop.resnet_main(
+      flags_obj, imagenet_model_fn, input_function, DATASET_NAME, NUM_IMAGES,
+      shape=image_shape)
+def main(flags_obj):
+  """Initializes the NPU, then runs ImageNet training inside the benchmark logger.
+
+  Args:
+    flags_obj: Value supplied by `absl_app.run`. It is not used here; the run
+      is configured from the global `flags.FLAGS` instead.
+  """
+  ############## npu modify begin #############
+  # The NPU has to be initialized before the HCCL interface can be called.
+  npu_session, npu_init = resnet_run_loop.init_npu()
+  npu_session.run(npu_init)
+  ############## npu modify end ###############
+
+  parsed_flags = flags.FLAGS
+  with logger.benchmark_context(parsed_flags):
+    run_imagenet(parsed_flags)
+
+def benchmark_main():
+  """Runs `main` through `absl_app.run`."""
+  absl_app.run(main)
+
+def benchmark_pre():
+  """Sets TensorFlow logging to INFO and registers the ImageNet flags."""
+  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
+  define_imagenet_flags()
+
+if __name__ == '__main__':
+  # Set INFO logging and define the flags, then let absl run main().
+  benchmark_pre()
+  benchmark_main()
@@ -0,0 +1,262 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Provides utilities to preprocess images.
+
+Training images are sampled using the provided bounding boxes, and subsequently
+cropped to the sampled bounding box. Images are additionally flipped randomly,
+then resized to the target output size (without aspect-ratio preservation).
+
+Images used during evaluation are resized (with aspect-ratio preservation) and
+centrally cropped.
+
+All images undergo mean color subtraction.
+
+Note that these steps are colloquially referred to as "ResNet preprocessing,"
+and they differ from "VGG preprocessing," which does not use bounding boxes
+and instead does an aspect-preserving resize followed by random crop during
+training. (These both differ from "Inception preprocessing," which introduces
+color distortion steps.)
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+# Per-channel mean values, listed in R, G, B order, that preprocess_image()
+# hands to _mean_image_subtraction().
+_R_MEAN = 123.68
+_G_MEAN = 116.78
+_B_MEAN = 103.94
+_CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN]
+
+# The lower bound for the smallest side of the image for aspect-preserving
+# resizing. For example, if an image is 500 x 1000, it will be resized to
+# _RESIZE_MIN x (_RESIZE_MIN * 2).
+_RESIZE_MIN = 256
+
+
+def _decode_crop_and_flip(image_buffer, bbox, num_channels):
+  """Crops the given image to a random part of the image, and randomly flips.
+
+  We use the fused decode_and_crop op, which performs better than the two ops
+  used separately in series, but note that this requires that the image be
+  passed in as an un-decoded string Tensor.
+
+  Args:
+    image_buffer: scalar string Tensor representing the raw JPEG image buffer.
+    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
+      where each coordinate is [0, 1) and the coordinates are arranged as
+      [ymin, xmin, ymax, xmax].
+    num_channels: Integer depth of the image buffer for decoding.
+
+  Returns:
+    3-D tensor with cropped image.
+
+  """
+  # A large fraction of image datasets contain a human-annotated bounding box
+  # delineating the region of the image containing the object of interest.  We
+  # choose to create a new bounding box for the object which is a randomly
+  # distorted version of the human-annotated bounding box that obeys an
+  # allowed range of aspect ratios, sizes and overlap with the human-annotated
+  # bounding box. If no box is supplied, then we assume the bounding box is
+  # the entire image.
+  # The image shape is read from the JPEG header, without decoding the image.
+  sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
+      tf.image.extract_jpeg_shape(image_buffer),
+      bounding_boxes=bbox,
+      min_object_covered=0.1,
+      aspect_ratio_range=[0.75, 1.33],
+      area_range=[0.05, 1.0],
+      max_attempts=100,
+      use_image_if_no_bounding_boxes=True)
+  # Only the begin offsets and the size are needed; the third output is unused.
+  bbox_begin, bbox_size, _ = sample_distorted_bounding_box
+
+  # Reassemble the bounding box in the format the crop op requires.
+  # The third component of begin/size is dropped here.
+  offset_y, offset_x, _ = tf.unstack(bbox_begin)
+  target_height, target_width, _ = tf.unstack(bbox_size)
+  crop_window = tf.stack([offset_y, offset_x, target_height, target_width])
+
+  # Use the fused decode and crop op here, which is faster than each in series.
+  cropped = tf.image.decode_and_crop_jpeg(
+      image_buffer, crop_window, channels=num_channels)
+
+  # Flip to add a little more random distortion in.
+  cropped = tf.image.random_flip_left_right(cropped)
+  return cropped
+
+
+def _central_crop(image, crop_height, crop_width):
+  """Cuts the central `crop_height` x `crop_width` window out of one image.
+
+  Args:
+    image: a 3-D image tensor
+    crop_height: the height of the image following the crop.
+    crop_width: the width of the image following the crop.
+
+  Returns:
+    3-D tensor with cropped image.
+  """
+  image_shape = tf.shape(input=image)
+  image_height, image_width = image_shape[0], image_shape[1]
+
+  # Half of the surplus in each dimension is removed from the top/left edge.
+  crop_top = (image_height - crop_height) // 2
+  crop_left = (image_width - crop_width) // 2
+
+  begin = [crop_top, crop_left, 0]
+  size = [crop_height, crop_width, -1]
+  return tf.slice(image, begin, size)
+
+
+def _mean_image_subtraction(image, means, num_channels):
+  """Centers an image by subtracting a per-channel mean.
+
+  For example:
+    means = [123.68, 116.779, 103.939]
+    image = _mean_image_subtraction(image, means, num_channels=3)
+
+  Note that the rank of `image` must be known.
+
+  Args:
+    image: a tensor of size [height, width, C].
+    means: a C-vector of values to subtract from each channel.
+    num_channels: number of color channels in the image that will be distorted.
+
+  Returns:
+    the centered image.
+
+  Raises:
+    ValueError: If the static rank of `image` is not three (including when it
+      is unknown), or if `len(means)` differs from `num_channels`.
+  """
+  static_rank = image.get_shape().ndims
+  if static_rank != 3:
+    raise ValueError('Input must be of size [height, width, C>0]')
+
+  mean_count = len(means)
+  if mean_count != num_channels:
+    raise ValueError('len(means) must match the number of channels')
+
+  # We have a 1-D tensor of means; convert to 3-D.
+  # Note(b/130245863): we explicitly call `broadcast` instead of simply
+  # expanding dimensions for better performance.
+  broadcast_means = tf.broadcast_to(means, tf.shape(image))
+
+  return image - broadcast_means
+
+
+def _smallest_size_at_least(height, width, resize_min):
+  """Computes a new shape whose smallest side equals `resize_min`.
+
+  Both sides are scaled by the same ratio, so the original aspect ratio is
+  preserved up to the truncation of the integer cast.
+
+  Args:
+    height: an int32 scalar tensor indicating the current height.
+    width: an int32 scalar tensor indicating the current width.
+    resize_min: A python integer or scalar `Tensor` indicating the size of
+      the smallest side after resize.
+
+  Returns:
+    new_height: an int32 scalar tensor indicating the new height.
+    new_width: an int32 scalar tensor indicating the new width.
+  """
+  # All arithmetic is done in float32; the results are cast back at the end.
+  target_side = tf.cast(resize_min, tf.float32)
+  height_f = tf.cast(height, tf.float32)
+  width_f = tf.cast(width, tf.float32)
+
+  scale_ratio = target_side / tf.minimum(height_f, width_f)
+
+  # Convert back to ints to make heights and widths that TF ops will accept.
+  new_height = tf.cast(height_f * scale_ratio, tf.int32)
+  new_width = tf.cast(width_f * scale_ratio, tf.int32)
+  return new_height, new_width
+
+
+def _aspect_preserving_resize(image, resize_min):
+  """Resizes an image so its smallest side becomes `resize_min`.
+
+  Args:
+    image: A 3-D image `Tensor`.
+    resize_min: A python integer or scalar `Tensor` indicating the size of
+      the smallest side after resize.
+
+  Returns:
+    resized_image: A 3-D tensor containing the resized image.
+  """
+  image_shape = tf.shape(input=image)
+  target_height, target_width = _smallest_size_at_least(
+      image_shape[0], image_shape[1], resize_min)
+  return _resize_image(image, target_height, target_width)
+
+
+def _resize_image(image, height, width):
+  """Thin wrapper around `tf.compat.v1.image.resize`.
+
+  Keeps the resize method (bilinear) and `align_corners=False` identical at
+  every call site.
+
+  Args:
+    image: A 3-D image `Tensor`.
+    height: The target height for the resized image.
+    width: The target width for the resized image.
+
+  Returns:
+    resized_image: A 3-D tensor containing the resized image. The first two
+      dimensions have the shape [height, width].
+  """
+  target_size = [height, width]
+  return tf.compat.v1.image.resize(
+      image,
+      target_size,
+      method=tf.image.ResizeMethod.BILINEAR,
+      align_corners=False)
+
+
+def preprocess_image(image_buffer, bbox, output_height, output_width,
+                     num_channels, is_training=False):
+  """Decodes one JPEG and turns it into a mean-centered model input.
+
+  Training images get a random distorted crop and flip followed by a plain
+  resize; evaluation images are decoded, resized with their aspect ratio
+  preserved and then centrally cropped. Both paths end with mean subtraction.
+
+  Args:
+    image_buffer: scalar string Tensor representing the raw JPEG image buffer.
+    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
+      where each coordinate is [0, 1) and the coordinates are arranged as
+      [ymin, xmin, ymax, xmax].
+    output_height: The height of the image after preprocessing.
+    output_width: The width of the image after preprocessing.
+    num_channels: Integer depth of the image buffer for decoding.
+    is_training: `True` if we're preprocessing the image for training and
+      `False` otherwise.
+
+  Returns:
+    A preprocessed image.
+  """
+  if not is_training:
+    # For validation, we want to decode, resize, then just crop the middle.
+    decoded = tf.image.decode_jpeg(image_buffer, channels=num_channels)
+    resized = _aspect_preserving_resize(decoded, _RESIZE_MIN)
+    image = _central_crop(resized, output_height, output_width)
+  else:
+    # For training, we want to randomize some of the distortions.
+    cropped = _decode_crop_and_flip(image_buffer, bbox, num_channels)
+    image = _resize_image(cropped, output_height, output_width)
+
+  # Pin the static shape so downstream ops see fixed dimensions.
+  image.set_shape([output_height, output_width, num_channels])
+
+  return _mean_image_subtraction(image, _CHANNEL_MEANS, num_channels)
@@ -0,0 +1,326 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+
+import tensorflow as tf  # pylint: disable=g-bad-import-order
+
+from official.r1.resnet import imagenet_main
+from official.utils.misc import keras_utils
+from official.utils.testing import integration
+
+# Only log errors while this test module is loaded.
+tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
+
+_BATCH_SIZE = 32
+# Expected width of the model's final dense layer and output in the shape
+# tests below.
+_LABEL_CLASSES = 1001
+
+
+class BaseTest(tf.test.TestCase):
+
+  _num_validation_images = None
+
+  @classmethod
+  def setUpClass(cls):  # pylint: disable=invalid-name
+    super(BaseTest, cls).setUpClass()
+    imagenet_main.define_imagenet_flags()
+
+  def setUp(self):
+    super(BaseTest, self).setUp()
+    if keras_utils.is_v2_0:
+      tf.compat.v1.disable_eager_execution()
+    self._num_validation_images = imagenet_main.NUM_IMAGES['validation']
+    imagenet_main.NUM_IMAGES['validation'] = 4
+
+  def tearDown(self):
+    super(BaseTest, self).tearDown()
+    tf.io.gfile.rmtree(self.get_temp_dir())
+    imagenet_main.NUM_IMAGES['validation'] = self._num_validation_images
+
+  def _tensor_shapes_helper(self, resnet_size, resnet_version, dtype, with_gpu):
+    """Checks the tensor shapes after each phase of the ResNet model."""
+    def reshape(shape):
+      """Returns the expected dimensions depending on if a GPU is being used."""
+
+      # If a GPU is used for the test, the shape is returned (already in NCHW
+      # form). When GPU is not used, the shape is converted to NHWC.
+      if with_gpu:
+        return shape
+      return shape[0], shape[2], shape[3], shape[1]
+
+    graph = tf.Graph()
+
+    with graph.as_default(), self.test_session(
+        graph=graph, use_gpu=with_gpu, force_gpu=with_gpu):
+      model = imagenet_main.ImagenetModel(
+          resnet_size=resnet_size,
+          data_format='channels_first' if with_gpu else 'channels_last',
+          resnet_version=resnet_version,
+          dtype=dtype
+      )
+      inputs = tf.random.uniform([1, 224, 224, 3])
+      output = model(inputs, training=True)
+
+      initial_conv = graph.get_tensor_by_name('resnet_model/initial_conv:0')
+      max_pool = graph.get_tensor_by_name('resnet_model/initial_max_pool:0')
+      block_layer1 = graph.get_tensor_by_name('resnet_model/block_layer1:0')
+      block_layer2 = graph.get_tensor_by_name('resnet_model/block_layer2:0')
+      block_layer3 = graph.get_tensor_by_name('resnet_model/block_layer3:0')
+      block_layer4 = graph.get_tensor_by_name('resnet_model/block_layer4:0')
+      reduce_mean = graph.get_tensor_by_name('resnet_model/final_reduce_mean:0')
+      dense = graph.get_tensor_by_name('resnet_model/final_dense:0')
+
+      self.assertAllEqual(initial_conv.shape, reshape((1, 64, 112, 112)))
+      self.assertAllEqual(max_pool.shape, reshape((1, 64, 56, 56)))
+
+      # The number of channels after each block depends on whether we're
+      # using the building_block or the bottleneck_block.
+      if resnet_size < 50:
+        self.assertAllEqual(block_layer1.shape, reshape((1, 64, 56, 56)))
+        self.assertAllEqual(block_layer2.shape, reshape((1, 128, 28, 28)))
+        self.assertAllEqual(block_layer3.shape, reshape((1, 256, 14, 14)))
+        self.assertAllEqual(block_layer4.shape, reshape((1, 512, 7, 7)))
+        self.assertAllEqual(reduce_mean.shape, reshape((1, 512, 1, 1)))
+      else:
+        self.assertAllEqual(block_layer1.shape, reshape((1, 256, 56, 56)))
+        self.assertAllEqual(block_layer2.shape, reshape((1, 512, 28, 28)))
+        self.assertAllEqual(block_layer3.shape, reshape((1, 1024, 14, 14)))
+        self.assertAllEqual(block_layer4.shape, reshape((1, 2048, 7, 7)))
+        self.assertAllEqual(reduce_mean.shape, reshape((1, 2048, 1, 1)))
+
+      self.assertAllEqual(dense.shape, (1, _LABEL_CLASSES))
+      self.assertAllEqual(output.shape, (1, _LABEL_CLASSES))
+
+  def tensor_shapes_helper(self, resnet_size, resnet_version, with_gpu=False):
+    self._tensor_shapes_helper(resnet_size=resnet_size,
+                               resnet_version=resnet_version,
+                               dtype=tf.float32, with_gpu=with_gpu)
+    self._tensor_shapes_helper(resnet_size=resnet_size,
+                               resnet_version=resnet_version,
+                               dtype=tf.float16, with_gpu=with_gpu)
+
+  def test_tensor_shapes_resnet_18_v1(self):
+    self.tensor_shapes_helper(18, resnet_version=1)
+
+  def test_tensor_shapes_resnet_18_v2(self):
+    self.tensor_shapes_helper(18, resnet_version=2)
+
+  def test_tensor_shapes_resnet_34_v1(self):
+    self.tensor_shapes_helper(34, resnet_version=1)
+
+  def test_tensor_shapes_resnet_34_v2(self):
+    self.tensor_shapes_helper(34, resnet_version=2)
+
+  def test_tensor_shapes_resnet_50_v1(self):
+    self.tensor_shapes_helper(50, resnet_version=1)
+
+  def test_tensor_shapes_resnet_50_v2(self):
+    self.tensor_shapes_helper(50, resnet_version=2)
+
+  def test_tensor_shapes_resnet_101_v1(self):
+    self.tensor_shapes_helper(101, resnet_version=1)
+
+  def test_tensor_shapes_resnet_101_v2(self):
+    self.tensor_shapes_helper(101, resnet_version=2)
+
+  def test_tensor_shapes_resnet_152_v1(self):
+    self.tensor_shapes_helper(152, resnet_version=1)
+
+  def test_tensor_shapes_resnet_152_v2(self):
+    self.tensor_shapes_helper(152, resnet_version=2)
+
+  def test_tensor_shapes_resnet_200_v1(self):
+    self.tensor_shapes_helper(200, resnet_version=1)
+
+  def test_tensor_shapes_resnet_200_v2(self):
+    self.tensor_shapes_helper(200, resnet_version=2)
+
+  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
+  def test_tensor_shapes_resnet_18_with_gpu_v1(self):
+    self.tensor_shapes_helper(18, resnet_version=1, with_gpu=True)
+
+  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
+  def test_tensor_shapes_resnet_18_with_gpu_v2(self):
+    self.tensor_shapes_helper(18, resnet_version=2, with_gpu=True)
+
+  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
+  def test_tensor_shapes_resnet_34_with_gpu_v1(self):
+    self.tensor_shapes_helper(34, resnet_version=1, with_gpu=True)
+
+  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
+  def test_tensor_shapes_resnet_34_with_gpu_v2(self):
+    self.tensor_shapes_helper(34, resnet_version=2, with_gpu=True)
+
+  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
+  def test_tensor_shapes_resnet_50_with_gpu_v1(self):
+    self.tensor_shapes_helper(50, resnet_version=1, with_gpu=True)
+
+  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
+  def test_tensor_shapes_resnet_50_with_gpu_v2(self):
+    self.tensor_shapes_helper(50, resnet_version=2, with_gpu=True)
+
+  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
+  def test_tensor_shapes_resnet_101_with_gpu_v1(self):
+    self.tensor_shapes_helper(101, resnet_version=1, with_gpu=True)
+
+  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
+  def test_tensor_shapes_resnet_101_with_gpu_v2(self):
+    self.tensor_shapes_helper(101, resnet_version=2, with_gpu=True)
+
+  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
+  def test_tensor_shapes_resnet_152_with_gpu_v1(self):
+    self.tensor_shapes_helper(152, resnet_version=1, with_gpu=True)
+
+  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
+  def test_tensor_shapes_resnet_152_with_gpu_v2(self):
+    self.tensor_shapes_helper(152, resnet_version=2, with_gpu=True)
+
+  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
+  def test_tensor_shapes_resnet_200_with_gpu_v1(self):
+    self.tensor_shapes_helper(200, resnet_version=1, with_gpu=True)
+
+  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
+  def test_tensor_shapes_resnet_200_with_gpu_v2(self):
+    self.tensor_shapes_helper(200, resnet_version=2, with_gpu=True)
+
+  def resnet_model_fn_helper(self, mode, resnet_version, dtype):
+    """Tests that the EstimatorSpec is given the appropriate arguments."""
+    tf.compat.v1.train.create_global_step()
+
+    input_fn = imagenet_main.get_synth_input_fn(dtype)
+    dataset = input_fn(True, '', _BATCH_SIZE)
+    iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
+    features, labels = iterator.get_next()
+    spec = imagenet_main.imagenet_model_fn(
+        features, labels, mode, {
+            'dtype': dtype,
+            'resnet_size': 50,
+            'data_format': 'channels_last',
+            'batch_size': _BATCH_SIZE,
+            'resnet_version': resnet_version,
+            'loss_scale': 128 if dtype == tf.float16 else 1,
+            'fine_tune': False,
+        })
+
+    predictions = spec.predictions
+    self.assertAllEqual(predictions['probabilities'].shape,
+                        (_BATCH_SIZE, _LABEL_CLASSES))
+    self.assertEqual(predictions['probabilities'].dtype, tf.float32)
+    self.assertAllEqual(predictions['classes'].shape, (_BATCH_SIZE,))
+    self.assertEqual(predictions['classes'].dtype, tf.int64)
+
+    if mode != tf.estimator.ModeKeys.PREDICT:
+      loss = spec.loss
+      self.assertAllEqual(loss.shape, ())
+      self.assertEqual(loss.dtype, tf.float32)
+
+    if mode == tf.estimator.ModeKeys.EVAL:
+      eval_metric_ops = spec.eval_metric_ops
+      self.assertAllEqual(eval_metric_ops['accuracy'][0].shape, ())
+      self.assertAllEqual(eval_metric_ops['accuracy'][1].shape, ())
+      self.assertEqual(eval_metric_ops['accuracy'][0].dtype, tf.float32)
+      self.assertEqual(eval_metric_ops['accuracy'][1].dtype, tf.float32)
+
+  def test_resnet_model_fn_train_mode_v1(self):
+    self.resnet_model_fn_helper(tf.estimator.ModeKeys.TRAIN, resnet_version=1,
+                                dtype=tf.float32)
+
+  def test_resnet_model_fn_train_mode_v2(self):
+    self.resnet_model_fn_helper(tf.estimator.ModeKeys.TRAIN, resnet_version=2,
+                                dtype=tf.float32)
+
+  def test_resnet_model_fn_eval_mode_v1(self):
+    self.resnet_model_fn_helper(tf.estimator.ModeKeys.EVAL, resnet_version=1,
+                                dtype=tf.float32)
+
+  def test_resnet_model_fn_eval_mode_v2(self):
+    self.resnet_model_fn_helper(tf.estimator.ModeKeys.EVAL, resnet_version=2,
+                                dtype=tf.float32)
+
+  def test_resnet_model_fn_predict_mode_v1(self):
+    self.resnet_model_fn_helper(tf.estimator.ModeKeys.PREDICT, resnet_version=1,
+                                dtype=tf.float32)
+
+  def test_resnet_model_fn_predict_mode_v2(self):
+    self.resnet_model_fn_helper(tf.estimator.ModeKeys.PREDICT, resnet_version=2,
+                                dtype=tf.float32)
+
+  def _test_imagenetmodel_shape(self, resnet_version):
+    batch_size = 135
+    num_classes = 246
+
+    model = imagenet_main.ImagenetModel(
+        50, data_format='channels_last', num_classes=num_classes,
+        resnet_version=resnet_version)
+
+    fake_input = tf.random.uniform([batch_size, 224, 224, 3])
+    output = model(fake_input, training=True)
+
+    self.assertAllEqual(output.shape, (batch_size, num_classes))
+
+  def test_imagenetmodel_shape_v1(self):
+    self._test_imagenetmodel_shape(resnet_version=1)
+
+  def test_imagenetmodel_shape_v2(self):
+    self._test_imagenetmodel_shape(resnet_version=2)
+
+  def test_imagenet_end_to_end_synthetic_v1(self):
+    integration.run_synthetic(
+        main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
+        extra_flags=['-resnet_version', '1', '-batch_size', '4',
+                     '--max_train_steps', '1']
+    )
+
+  def test_imagenet_end_to_end_synthetic_v2(self):
+    integration.run_synthetic(
+        main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
+        extra_flags=['-resnet_version', '2', '-batch_size', '4',
+                     '--max_train_steps', '1']
+    )
+
+  def test_imagenet_end_to_end_synthetic_v1_tiny(self):
+    integration.run_synthetic(
+        main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
+        extra_flags=['-resnet_version', '1', '-batch_size', '4',
+                     '-resnet_size', '18', '--max_train_steps', '1']
+    )
+
+  def test_imagenet_end_to_end_synthetic_v2_tiny(self):
+    integration.run_synthetic(
+        main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
+        extra_flags=['-resnet_version', '2', '-batch_size', '4',
+                     '-resnet_size', '18', '--max_train_steps', '1']
+    )
+
+  def test_imagenet_end_to_end_synthetic_v1_huge(self):
+    integration.run_synthetic(
+        main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
+        extra_flags=['-resnet_version', '1', '-batch_size', '4',
+                     '-resnet_size', '200', '--max_train_steps', '1']
+    )
+
+  def test_imagenet_end_to_end_synthetic_v2_huge(self):
+    integration.run_synthetic(
+        main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
+        extra_flags=['-resnet_version', '2', '-batch_size', '4',
+                     '-resnet_size', '200', '--max_train_steps', '1']
+    )
+
+
+# Run the test suite through TensorFlow's test runner when executed directly.
+if __name__ == '__main__':
+  tf.test.main()
@@ -0,0 +1,552 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains definitions for Residual Networks.
+
+Residual networks ('v1' ResNets) were originally proposed in:
+[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
+    Deep Residual Learning for Image Recognition. arXiv:1512.03385
+
+The full preactivation 'v2' ResNet variant was introduced by:
+[2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
+    Identity Mappings in Deep Residual Networks. arXiv: 1603.05027
+
+The key difference of the full preactivation 'v2' variant compared to the
+'v1' variant in [1] is the use of batch normalization before every weight layer
+rather than after.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+# Momentum and epsilon handed to every batch-normalization layer in this file.
+_BATCH_NORM_DECAY = 0.997
+_BATCH_NORM_EPSILON = 1e-5
+# ResNet version used by Model when the caller does not pass one.
+DEFAULT_VERSION = 2
+# Default dtype for Model and for the custom variable getter.
+DEFAULT_DTYPE = tf.float32
+# Low-precision dtypes whose variables are created in fp32 and then cast.
+CASTABLE_TYPES = (tf.float16,)
+# Every dtype Model accepts; any other dtype raises ValueError.
+ALLOWED_TYPES = (DEFAULT_DTYPE,) + CASTABLE_TYPES
+
+
+################################################################################
+# Convenience functions for building the ResNet model.
+################################################################################
+def batch_norm(inputs, training, data_format):
+  """Performs a batch normalization using a standard set of parameters."""
+  # We set fused=True for a significant performance boost. See
+  # https://www.tensorflow.org/performance/performance_guide#common_fused_ops
+  return tf.compat.v1.layers.batch_normalization(
+      inputs=inputs, axis=1 if data_format == 'channels_first' else 3,
+      momentum=_BATCH_NORM_DECAY, epsilon=_BATCH_NORM_EPSILON, center=True,
+      scale=True, training=training, fused=True)
+
+
+def fixed_padding(inputs, kernel_size, data_format):
+  """Pads the input along the spatial dimensions independently of input size.
+
+  Args:
+    inputs: A tensor of size [batch, channels, height_in, width_in] or
+      [batch, height_in, width_in, channels] depending on data_format.
+    kernel_size: The kernel to be used in the conv2d or max_pool2d operation.
+                 Should be a positive integer.
+    data_format: The input format ('channels_last' or 'channels_first').
+
+  Returns:
+    A tensor with the same format as the input with the data either intact
+    (if kernel_size == 1) or padded (if kernel_size > 1).
+  """
+  pad_total = kernel_size - 1
+  pad_beg = pad_total // 2
+  pad_end = pad_total - pad_beg
+
+  if data_format == 'channels_first':
+    padded_inputs = tf.pad(tensor=inputs,
+                           paddings=[[0, 0], [0, 0], [pad_beg, pad_end],
+                                     [pad_beg, pad_end]])
+  else:
+    padded_inputs = tf.pad(tensor=inputs,
+                           paddings=[[0, 0], [pad_beg, pad_end],
+                                     [pad_beg, pad_end], [0, 0]])
+  return padded_inputs
+
+
+def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
+  """Strided 2-D convolution with explicit padding."""
+  # The padding is consistent and is based only on `kernel_size`, not on the
+  # dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
+  if strides > 1:
+    inputs = fixed_padding(inputs, kernel_size, data_format)
+
+  return tf.compat.v1.layers.conv2d(
+      inputs=inputs, filters=filters, kernel_size=kernel_size, strides=strides,
+      padding=('SAME' if strides == 1 else 'VALID'), use_bias=False,
+      kernel_initializer=tf.compat.v1.variance_scaling_initializer(),
+      data_format=data_format)
+
+
+################################################################################
+# ResNet block definitions.
+################################################################################
+def _building_block_v1(inputs, filters, training, projection_shortcut, strides,
+                       data_format):
+  """A single block for ResNet v1, without a bottleneck.
+
+  Convolution then batch normalization then ReLU as described by:
+    Deep Residual Learning for Image Recognition
+    https://arxiv.org/pdf/1512.03385.pdf
+    by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Dec 2015.
+
+  Args:
+    inputs: A tensor of size [batch, channels, height_in, width_in] or
+      [batch, height_in, width_in, channels] depending on data_format.
+    filters: The number of filters for the convolutions.
+    training: A Boolean for whether the model is in training or inference
+      mode. Needed for batch normalization.
+    projection_shortcut: The function to use for projection shortcuts
+      (typically a 1x1 convolution when downsampling the input).
+    strides: The block's stride. If greater than 1, this block will ultimately
+      downsample the input.
+    data_format: The input format ('channels_last' or 'channels_first').
+
+  Returns:
+    The output tensor of the block; shape should match inputs.
+  """
+  shortcut = inputs
+
+  if projection_shortcut is not None:
+    shortcut = projection_shortcut(inputs)
+    shortcut = batch_norm(inputs=shortcut, training=training,
+                          data_format=data_format)
+
+  inputs = conv2d_fixed_padding(
+      inputs=inputs, filters=filters, kernel_size=3, strides=strides,
+      data_format=data_format)
+  inputs = batch_norm(inputs, training, data_format)
+  inputs = tf.nn.relu(inputs)
+
+  inputs = conv2d_fixed_padding(
+      inputs=inputs, filters=filters, kernel_size=3, strides=1,
+      data_format=data_format)
+  inputs = batch_norm(inputs, training, data_format)
+  inputs += shortcut
+  inputs = tf.nn.relu(inputs)
+
+  return inputs
+
+
+def _building_block_v2(inputs, filters, training, projection_shortcut, strides,
+                       data_format):
+  """A single block for ResNet v2, without a bottleneck.
+
+  Batch normalization then ReLu then convolution as described by:
+    Identity Mappings in Deep Residual Networks
+    https://arxiv.org/pdf/1603.05027.pdf
+    by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Jul 2016.
+
+  Args:
+    inputs: A tensor of size [batch, channels, height_in, width_in] or
+      [batch, height_in, width_in, channels] depending on data_format.
+    filters: The number of filters for the convolutions.
+    training: A Boolean for whether the model is in training or inference
+      mode. Needed for batch normalization.
+    projection_shortcut: The function to use for projection shortcuts
+      (typically a 1x1 convolution when downsampling the input).
+    strides: The block's stride. If greater than 1, this block will ultimately
+      downsample the input.
+    data_format: The input format ('channels_last' or 'channels_first').
+
+  Returns:
+    The output tensor of the block; shape should match inputs.
+  """
+  shortcut = inputs
+  inputs = batch_norm(inputs, training, data_format)
+  inputs = tf.nn.relu(inputs)
+
+  # The projection shortcut should come after the first batch norm and ReLU
+  # since it performs a 1x1 convolution.
+  if projection_shortcut is not None:
+    shortcut = projection_shortcut(inputs)
+
+  inputs = conv2d_fixed_padding(
+      inputs=inputs, filters=filters, kernel_size=3, strides=strides,
+      data_format=data_format)
+
+  inputs = batch_norm(inputs, training, data_format)
+  inputs = tf.nn.relu(inputs)
+  inputs = conv2d_fixed_padding(
+      inputs=inputs, filters=filters, kernel_size=3, strides=1,
+      data_format=data_format)
+
+  return inputs + shortcut
+
+
+def _bottleneck_block_v1(inputs, filters, training, projection_shortcut,
+                         strides, data_format):
+  """A single block for ResNet v1, with a bottleneck.
+
+  Similar to _building_block_v1(), except using the "bottleneck" blocks
+  described in:
+    Convolution then batch normalization then ReLU as described by:
+      Deep Residual Learning for Image Recognition
+      https://arxiv.org/pdf/1512.03385.pdf
+      by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Dec 2015.
+
+  Args:
+    inputs: A tensor of size [batch, channels, height_in, width_in] or
+      [batch, height_in, width_in, channels] depending on data_format.
+    filters: The number of filters for the convolutions.
+    training: A Boolean for whether the model is in training or inference
+      mode. Needed for batch normalization.
+    projection_shortcut: The function to use for projection shortcuts
+      (typically a 1x1 convolution when downsampling the input).
+    strides: The block's stride. If greater than 1, this block will ultimately
+      downsample the input.
+    data_format: The input format ('channels_last' or 'channels_first').
+
+  Returns:
+    The output tensor of the block; shape should match inputs.
+  """
+  shortcut = inputs
+
+  if projection_shortcut is not None:
+    shortcut = projection_shortcut(inputs)
+    shortcut = batch_norm(inputs=shortcut, training=training,
+                          data_format=data_format)
+
+  inputs = conv2d_fixed_padding(
+      inputs=inputs, filters=filters, kernel_size=1, strides=1,
+      data_format=data_format)
+  inputs = batch_norm(inputs, training, data_format)
+  inputs = tf.nn.relu(inputs)
+
+  inputs = conv2d_fixed_padding(
+      inputs=inputs, filters=filters, kernel_size=3, strides=strides,
+      data_format=data_format)
+  inputs = batch_norm(inputs, training, data_format)
+  inputs = tf.nn.relu(inputs)
+
+  inputs = conv2d_fixed_padding(
+      inputs=inputs, filters=4 * filters, kernel_size=1, strides=1,
+      data_format=data_format)
+  inputs = batch_norm(inputs, training, data_format)
+  inputs += shortcut
+  inputs = tf.nn.relu(inputs)
+
+  return inputs
+
+
+def _bottleneck_block_v2(inputs, filters, training, projection_shortcut,
+                         strides, data_format):
+  """A single block for ResNet v2, with a bottleneck.
+
+  Similar to _building_block_v2(), except using the "bottleneck" blocks
+  described in:
+    Convolution then batch normalization then ReLU as described by:
+      Deep Residual Learning for Image Recognition
+      https://arxiv.org/pdf/1512.03385.pdf
+      by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Dec 2015.
+
+  Adapted to the ordering conventions of:
+    Batch normalization then ReLu then convolution as described by:
+      Identity Mappings in Deep Residual Networks
+      https://arxiv.org/pdf/1603.05027.pdf
+      by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Jul 2016.
+
+  Args:
+    inputs: A tensor of size [batch, channels, height_in, width_in] or
+      [batch, height_in, width_in, channels] depending on data_format.
+    filters: The number of filters for the convolutions.
+    training: A Boolean for whether the model is in training or inference
+      mode. Needed for batch normalization.
+    projection_shortcut: The function to use for projection shortcuts
+      (typically a 1x1 convolution when downsampling the input).
+    strides: The block's stride. If greater than 1, this block will ultimately
+      downsample the input.
+    data_format: The input format ('channels_last' or 'channels_first').
+
+  Returns:
+    The output tensor of the block; shape should match inputs.
+  """
+  shortcut = inputs
+  inputs = batch_norm(inputs, training, data_format)
+  inputs = tf.nn.relu(inputs)
+
+  # The projection shortcut should come after the first batch norm and ReLU
+  # since it performs a 1x1 convolution.
+  if projection_shortcut is not None:
+    shortcut = projection_shortcut(inputs)
+
+  inputs = conv2d_fixed_padding(
+      inputs=inputs, filters=filters, kernel_size=1, strides=1,
+      data_format=data_format)
+
+  inputs = batch_norm(inputs, training, data_format)
+  inputs = tf.nn.relu(inputs)
+  inputs = conv2d_fixed_padding(
+      inputs=inputs, filters=filters, kernel_size=3, strides=strides,
+      data_format=data_format)
+
+  inputs = batch_norm(inputs, training, data_format)
+  inputs = tf.nn.relu(inputs)
+  inputs = conv2d_fixed_padding(
+      inputs=inputs, filters=4 * filters, kernel_size=1, strides=1,
+      data_format=data_format)
+
+  return inputs + shortcut
+
+
+def block_layer(inputs, filters, bottleneck, block_fn, blocks, strides,
+                training, name, data_format):
+  """Creates one layer of blocks for the ResNet model.
+
+  Args:
+    inputs: A tensor of size [batch, channels, height_in, width_in] or
+      [batch, height_in, width_in, channels] depending on data_format.
+    filters: The number of filters for the first convolution of the layer.
+    bottleneck: Is the block created a bottleneck block.
+    block_fn: The block to use within the model, either `building_block` or
+      `bottleneck_block`.
+    blocks: The number of blocks contained in the layer.
+    strides: The stride to use for the first convolution of the layer. If
+      greater than 1, this layer will ultimately downsample the input.
+    training: Either True or False, whether we are currently training the
+      model. Needed for batch norm.
+    name: A string name for the tensor output of the block layer.
+    data_format: The input format ('channels_last' or 'channels_first').
+
+  Returns:
+    The output tensor of the block layer.
+  """
+
+  # Bottleneck blocks end with 4x the number of filters as they start with
+  filters_out = filters * 4 if bottleneck else filters
+
+  def projection_shortcut(inputs):
+    return conv2d_fixed_padding(
+        inputs=inputs, filters=filters_out, kernel_size=1, strides=strides,
+        data_format=data_format)
+
+  # Only the first block per block_layer uses projection_shortcut and strides
+  inputs = block_fn(inputs, filters, training, projection_shortcut, strides,
+                    data_format)
+
+  for _ in range(1, blocks):
+    inputs = block_fn(inputs, filters, training, None, 1, data_format)
+
+  return tf.identity(inputs, name)
+
+
+class Model(object):
+  """Base class for building the Resnet Model."""
+
+  def __init__(self, resnet_size, bottleneck, num_classes, num_filters,
+               kernel_size,
+               conv_stride, first_pool_size, first_pool_stride,
+               block_sizes, block_strides,
+               resnet_version=DEFAULT_VERSION, data_format=None,
+               dtype=DEFAULT_DTYPE):
+    """Creates a model for classifying an image.
+
+    Args:
+      resnet_size: A single integer for the size of the ResNet model.
+      bottleneck: Use regular blocks or bottleneck blocks.
+      num_classes: The number of classes used as labels.
+      num_filters: The number of filters to use for the first block layer
+        of the model. This number is then doubled for each subsequent block
+        layer.
+      kernel_size: The kernel size to use for convolution.
+      conv_stride: stride size for the initial convolutional layer
+      first_pool_size: Pool size to be used for the first pooling layer.
+        If none, the first pooling layer is skipped.
+      first_pool_stride: stride size for the first pooling layer. Not used
+        if first_pool_size is None.
+      block_sizes: A list containing n values, where n is the number of sets of
+        block layers desired. Each value should be the number of blocks in the
+        i-th set.
+      block_strides: List of integers representing the desired stride size for
+        each of the sets of block layers. Should be same length as block_sizes.
+      resnet_version: Integer representing which version of the ResNet network
+        to use. See README for details. Valid values: [1, 2]
+      data_format: Input format ('channels_last', 'channels_first', or None).
+        If set to None, the format is dependent on whether a GPU is available.
+      dtype: The TensorFlow dtype to use for calculations. If not specified
+        tf.float32 is used.
+
+    Raises:
+      ValueError: if invalid version is selected.
+    """
+    self.resnet_size = resnet_size
+
+    if not data_format:
+      data_format = (
+          'channels_first' if tf.test.is_built_with_cuda() else 'channels_last')
+
+    self.resnet_version = resnet_version
+    if resnet_version not in (1, 2):
+      raise ValueError(
+          'Resnet version should be 1 or 2. See README for citations.')
+
+    self.bottleneck = bottleneck
+    if bottleneck:
+      if resnet_version == 1:
+        self.block_fn = _bottleneck_block_v1
+      else:
+        self.block_fn = _bottleneck_block_v2
+    else:
+      if resnet_version == 1:
+        self.block_fn = _building_block_v1
+      else:
+        self.block_fn = _building_block_v2
+
+    if dtype not in ALLOWED_TYPES:
+      raise ValueError('dtype must be one of: {}'.format(ALLOWED_TYPES))
+
+    self.data_format = data_format
+    self.num_classes = num_classes
+    self.num_filters = num_filters
+    self.kernel_size = kernel_size
+    self.conv_stride = conv_stride
+    self.first_pool_size = first_pool_size
+    self.first_pool_stride = first_pool_stride
+    self.block_sizes = block_sizes
+    self.block_strides = block_strides
+    self.dtype = dtype
+    self.pre_activation = resnet_version == 2
+
+  def _custom_dtype_getter(self, getter, name, shape=None, dtype=DEFAULT_DTYPE,
+                           *args, **kwargs):
+    """Creates variables in fp32, then casts to fp16 if necessary.
+
+    This function is a custom getter. A custom getter is a function with the
+    same signature as tf.get_variable, except it has an additional getter
+    parameter. Custom getters can be passed as the `custom_getter` parameter of
+    tf.variable_scope. Then, tf.get_variable will call the custom getter,
+    instead of directly getting a variable itself. This can be used to change
+    the types of variables that are retrieved with tf.get_variable.
+    The `getter` parameter is the underlying variable getter, that would have
+    been called if no custom getter was used. Custom getters typically get a
+    variable with `getter`, then modify it in some way.
+
+    This custom getter will create an fp32 variable. If a low precision
+    (e.g. float16) variable was requested it will then cast the variable to the
+    requested dtype. The reason we do not directly create variables in low
+    precision dtypes is that applying small gradients to such variables may
+    cause the variable not to change.
+
+    Args:
+      getter: The underlying variable getter, that has the same signature as
+        tf.get_variable and returns a variable.
+      name: The name of the variable to get.
+      shape: The shape of the variable to get.
+      dtype: The dtype of the variable to get. Note that if this is a low
+        precision dtype, the variable will be created as a tf.float32 variable,
+        then cast to the appropriate dtype
+      *args: Additional arguments to pass unmodified to getter.
+      **kwargs: Additional keyword arguments to pass unmodified to getter.
+
+    Returns:
+      A variable which is cast to fp16 if necessary.
+    """
+
+    if dtype in CASTABLE_TYPES:
+      var = getter(name, shape, tf.float32, *args, **kwargs)
+      return tf.cast(var, dtype=dtype, name=name + '_cast')
+    else:
+      return getter(name, shape, dtype, *args, **kwargs)
+
+  def _model_variable_scope(self):
+    """Returns a variable scope that the model should be created under.
+
+    If self.dtype is a castable type, model variable will be created in fp32
+    then cast to self.dtype before being used.
+
+    Returns:
+      A variable scope for the model.
+    """
+
+    return tf.compat.v1.variable_scope('resnet_model',
+                                       custom_getter=self._custom_dtype_getter)
+
+  def __call__(self, inputs, training):
+    """Add operations to classify a batch of input images.
+
+    Args:
+      inputs: A Tensor representing a batch of input images.
+      training: A boolean. Set to True to add operations required only when
+        training the classifier.
+
+    Returns:
+      A logits Tensor with shape [<batch_size>, self.num_classes].
+    """
+
+    with self._model_variable_scope():
+      if self.data_format == 'channels_first':
+        # Convert the inputs from channels_last (NHWC) to channels_first (NCHW).
+        # This provides a large performance boost on GPU. See
+        # https://www.tensorflow.org/performance/performance_guide#data_formats
+        inputs = tf.transpose(a=inputs, perm=[0, 3, 1, 2])
+
+      inputs = conv2d_fixed_padding(
+          inputs=inputs, filters=self.num_filters, kernel_size=self.kernel_size,
+          strides=self.conv_stride, data_format=self.data_format)
+      inputs = tf.identity(inputs, 'initial_conv')
+
+      # We do not include batch normalization or activation functions in V2
+      # for the initial conv1 because the first ResNet unit will perform these
+      # for both the shortcut and non-shortcut paths as part of the first
+      # block's projection. Cf. Appendix of [2].
+      if self.resnet_version == 1:
+        inputs = batch_norm(inputs, training, self.data_format)
+        inputs = tf.nn.relu(inputs)
+
+      if self.first_pool_size:
+        ############## npu modify begin #############
+        #max_pooling2d is replaced by max_pool_with_argmax for better performance
+        inputs,argmax = tf.compat.v1.nn.max_pool_with_argmax(
+            input=inputs, ksize=(1,self.first_pool_size,self.first_pool_size,1),
+            strides=(1,self.first_pool_stride,self.first_pool_stride,1), padding='SAME',
+            data_format='NCHW' if self.data_format == 'channels_first' else 'NHWC')
+        ############## npu modify end ###############
+
+        inputs = tf.identity(inputs, 'initial_max_pool')
+
+      for i, num_blocks in enumerate(self.block_sizes):
+        num_filters = self.num_filters * (2**i)
+        inputs = block_layer(
+            inputs=inputs, filters=num_filters, bottleneck=self.bottleneck,
+            block_fn=self.block_fn, blocks=num_blocks,
+            strides=self.block_strides[i], training=training,
+            name='block_layer{}'.format(i + 1), data_format=self.data_format)
+
+      # Only apply the BN and ReLU for model that does pre_activation in each
+      # building/bottleneck block, eg resnet V2.
+      if self.pre_activation:
+        inputs = batch_norm(inputs, training, self.data_format)
+        inputs = tf.nn.relu(inputs)
+
+      # The current top layer has shape
+      # `batch_size x pool_size x pool_size x final_size`.
+      # ResNet does an Average Pooling layer over pool_size,
+      # but that is the same as doing a reduce_mean. We do a reduce_mean
+      # here because it performs better than AveragePooling2D.
+      axes = [2, 3] if self.data_format == 'channels_first' else [1, 2]
+      inputs = tf.reduce_mean(input_tensor=inputs, axis=axes, keepdims=True)
+      inputs = tf.identity(inputs, 'final_reduce_mean')
+
+      inputs = tf.squeeze(inputs, axes)
+      inputs = tf.compat.v1.layers.dense(inputs=inputs, units=self.num_classes)
+      inputs = tf.identity(inputs, 'final_dense')
+      return inputs
@@ -0,0 +1,979 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains utility and supporting functions for ResNet.
+
+  This module contains ResNet code which does not directly build layers. This
+includes dataset management, hyperparameter and optimizer code, and argument
+parsing. Code for defining the ResNet layers can be found in resnet_model.py.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import math
+import multiprocessing
+import os
+import datetime
+
+import time
+from absl import flags
+import tensorflow as tf
+
+import logging
+import sys
+
+
+############## npu modify begin #############
+from npu_bridge.estimator.npu.npu_config import NPURunConfig
+from npu_bridge.estimator.npu.npu_estimator  import NPUEstimator
+from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
+from npu_bridge.estimator import npu_ops
+from npu_bridge.hccl import hccl_ops
+from hccl.manage.api import get_local_rank_id
+from hccl.manage.api import get_rank_size
+from hccl.manage.api import get_rank_id
+from tensorflow.core.protobuf import rewriter_config_pb2
+############## npu modify end ###############
+
+from official.r1.resnet import imagenet_preprocessing
+from official.r1.resnet import resnet_model
+from official.r1.utils import export
+from official.utils.flags import core as flags_core
+from official.utils.logs import hooks_helper
+from official.utils.logs import logger
+from official.utils.misc import distribution_utils
+from official.utils.misc import model_helpers
+from benchmark_log import hwlog
+
+
+################################################################################
+# Functions for input processing.
+################################################################################
+def process_record_dataset(dataset,
+                           is_training,
+                           batch_size,
+                           shuffle_buffer,
+                           parse_record_fn,
+                           num_epochs=1,
+                           dtype=tf.float32,
+                           datasets_num_private_threads=None,
+                           drop_remainder=False,
+                           tf_data_experimental_slack=False):
+  """Given a Dataset with raw records, return an iterator over the records.
+
+  Args:
+    dataset: A Dataset representing raw records
+    is_training: A boolean denoting whether the input is for training.
+    batch_size: The number of samples per batch.
+    shuffle_buffer: The buffer size to use when shuffling records. A larger
+      value results in better randomness, but smaller values reduce startup
+      time and use less memory.
+    parse_record_fn: A function that takes a raw record and returns the
+      corresponding (image, label) pair.
+    num_epochs: The number of epochs to repeat the dataset.
+    dtype: Data type to use for images/features.
+    datasets_num_private_threads: Number of threads for a private
+      threadpool created for all datasets computation.
+    drop_remainder: A boolean indicates whether to drop the remainder of the
+      batches. If True, the batch dimension will be static.
+    tf_data_experimental_slack: Whether to enable tf.data's
+      `experimental_slack` option.
+
+  Returns:
+    Dataset of (image, label) pairs ready for iteration.
+  """
+  # Defines a specific size thread pool for tf.data operations.
+  if datasets_num_private_threads:
+    options = tf.data.Options()
+    options.experimental_threading.private_threadpool_size = (
+        datasets_num_private_threads)
+    dataset = dataset.with_options(options)
+    tf.compat.v1.logging.info('datasets_num_private_threads: %s',
+                              datasets_num_private_threads)
+
+  # Disable intra-op parallelism to optimize for throughput instead of latency.
+  options = tf.data.Options()
+  options.experimental_threading.max_intra_op_parallelism = 1
+  dataset = dataset.with_options(options)
+
+  # Prefetches a batch at a time to smooth out the time taken to load input
+  # files for shuffling and processing.
+  dataset = dataset.prefetch(buffer_size=batch_size)
+  if is_training:
+    # Shuffles records before repeating to respect epoch boundaries.
+    dataset = dataset.shuffle(buffer_size=shuffle_buffer)
+
+  # Repeats the dataset for the number of epochs to train.
+  #dataset = dataset.repeat(num_epochs)
+  dataset = dataset.repeat()
+  # Parses the raw records into images and labels.
+  dataset = dataset.map(
+      lambda value: parse_record_fn(value, is_training, dtype),
+      num_parallel_calls=tf.data.experimental.AUTOTUNE)
+  dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
+
+  # Operations between the final prefetch and the get_next call to the iterator
+  # will happen synchronously during run time. We prefetch here again to
+  # background all of the above processing work and keep it out of the
+  # critical training path. Setting buffer_size to tf.data.experimental.AUTOTUNE
+  # allows DistributionStrategies to adjust how many batches to fetch based
+  # on how many devices are present.
+  dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
+
+  if tf_data_experimental_slack:
+    options = tf.data.Options()
+    options.experimental_slack = True
+    dataset = dataset.with_options(options)
+
+  return dataset
+
+
+def get_synth_input_fn(height, width, num_channels, num_classes,
+                       dtype=tf.float32):
+  """Returns an input function that returns a dataset with random data.
+
+  This input_fn returns a data set that iterates over a set of random data and
+  bypasses all preprocessing, e.g. jpeg decode and copy. The host to device
+  copy is still included. This used to find the upper throughput bound when
+  tunning the full input pipeline.
+
+  Args:
+    height: Integer height that will be used to create a fake image tensor.
+    width: Integer width that will be used to create a fake image tensor.
+    num_channels: Integer depth that will be used to create a fake image tensor.
+    num_classes: Number of classes that should be represented in the fake labels
+      tensor
+    dtype: Data type for features/images.
+
+  Returns:
+    An input_fn that can be used in place of a real one to return a dataset
+    that can be used for iteration.
+  """
+  # pylint: disable=unused-argument
+  def input_fn(is_training, data_dir, batch_size, *args, **kwargs):
+    """Returns dataset filled with random data."""
+    # Synthetic input should be within [0, 255].
+    inputs = tf.random.truncated_normal(
+        [batch_size] + [height, width, num_channels],
+        dtype=dtype,
+        mean=127,
+        stddev=60,
+        name='synthetic_inputs')
+
+    labels = tf.random.uniform(
+        [batch_size],
+        minval=0,
+        maxval=num_classes - 1,
+        dtype=tf.int32,
+        name='synthetic_labels')
+    data = tf.data.Dataset.from_tensors((inputs, labels)).repeat()
+    data = data.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
+    return data
+
+  return input_fn
+
+
+def image_bytes_serving_input_fn(image_shape, dtype=tf.float32):
+  """Serving input fn for raw jpeg images."""
+
+  def _preprocess_image(image_bytes):
+    """Preprocess a single raw image."""
+    # Bounding box around the whole image.
+    bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=dtype, shape=[1, 1, 4])
+    height, width, num_channels = image_shape
+    image = imagenet_preprocessing.preprocess_image(
+        image_bytes, bbox, height, width, num_channels, is_training=False)
+    return image
+
+  image_bytes_list = tf.compat.v1.placeholder(
+      shape=[None], dtype=tf.string, name='input_tensor')
+  images = tf.map_fn(
+      _preprocess_image, image_bytes_list, back_prop=False, dtype=dtype)
+  return tf.estimator.export.TensorServingInputReceiver(
+      images, {'image_bytes': image_bytes_list})
+
+
+def override_flags_and_set_envars_for_gpu_thread_pool(flags_obj):
+  """Override flags and set env_vars for performance.
+
+  These settings exist to test the difference between using stock settings
+  and manual tuning. It also shows some of the ENV_VARS that can be tweaked to
+  squeeze a few extra examples per second.  These settings are defaulted to the
+  current platform of interest, which changes over time.
+
+  On systems with small numbers of cpu cores, e.g. under 8 logical cores,
+  setting up a gpu thread pool with `tf_gpu_thread_mode=gpu_private` may perform
+  poorly.
+
+  Args:
+    flags_obj: Current flags, which will be adjusted possibly overriding
+    what has been set by the user on the command-line.
+  """
+  cpu_count = multiprocessing.cpu_count()
+  tf.compat.v1.logging.info('Logical CPU cores: %s', cpu_count)
+
+  # Sets up thread pool for each GPU for op scheduling.
+  per_gpu_thread_count = 1
+  total_gpu_thread_count = per_gpu_thread_count * flags_obj.num_gpus
+  os.environ['TF_GPU_THREAD_MODE'] = flags_obj.tf_gpu_thread_mode
+  os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count)
+  tf.compat.v1.logging.info('TF_GPU_THREAD_COUNT: %s',
+                            os.environ['TF_GPU_THREAD_COUNT'])
+  tf.compat.v1.logging.info('TF_GPU_THREAD_MODE: %s',
+                            os.environ['TF_GPU_THREAD_MODE'])
+
+  # Reduces general thread pool by number of threads used for GPU pool.
+  main_thread_count = cpu_count - total_gpu_thread_count
+  flags_obj.inter_op_parallelism_threads = main_thread_count
+
+  # Sets thread count for tf.data. Logical cores minus threads assign to the
+  # private GPU pool along with 2 thread per GPU for event monitoring and
+  # sending / receiving tensors.
+  num_monitoring_threads = 2 * flags_obj.num_gpus
+  flags_obj.datasets_num_private_threads = (cpu_count - total_gpu_thread_count
+                                            - num_monitoring_threads)
+
+
+################################################################################
+# Functions for running training/eval/validation loops for the model.
+################################################################################
+def learning_rate_with_decay(
+    batch_size, batch_denom, num_images, boundary_epochs, decay_rates,
+    base_lr=0.1, warmup=False):
+  """Get a learning rate that decays step-wise as training progresses.
+
+  Args:
+    batch_size: the number of examples processed in each training batch.
+    batch_denom: this value will be used to scale the base learning rate.
+      `0.1 * batch size` is divided by this number, such that when
+      batch_denom == batch_size, the initial learning rate will be 0.1.
+    num_images: total number of images that will be used for training.
+    boundary_epochs: list of ints representing the epochs at which we
+      decay the learning rate.
+    decay_rates: list of floats representing the decay rates to be used
+      for scaling the learning rate. It should have one more element
+      than `boundary_epochs`, and all elements should have the same type.
+    base_lr: Initial learning rate scaled based on batch_denom.
+    warmup: Run a 5 epoch warmup to the initial lr.
+  Returns:
+    Returns a function that takes a single argument - the number of batches
+    trained so far (global_step)- and returns the learning rate to be used
+    for training the next batch.
+  """
+  initial_learning_rate = base_lr * batch_size / batch_denom
+  batches_per_epoch = num_images / batch_size
+
+  # Reduce the learning rate at certain epochs.
+  # CIFAR-10: divide by 10 at epoch 100, 150, and 200
+  # ImageNet: divide by 10 at epoch 30, 60, 80, and 90
+  boundaries = [int(batches_per_epoch * epoch) for epoch in boundary_epochs]
+  vals = [initial_learning_rate * decay for decay in decay_rates]
+
+  def learning_rate_fn(global_step):
+    """Builds scaled learning rate function with 5 epoch warm up."""
+
+    ############## npu modify begin #############
+    #Using int32 for better computing performance
+    global_step=tf.cast(global_step,tf.int32)
+    ############## npu modify end ###############
+
+    lr = tf.compat.v1.train.piecewise_constant(global_step, boundaries, vals)
+    if warmup:
+      warmup_steps = int(batches_per_epoch * 5)
+      warmup_lr = (
+          initial_learning_rate * tf.cast(global_step, tf.float32) / tf.cast(
+              warmup_steps, tf.float32))
+      return tf.cond(pred=global_step < warmup_steps,
+                     true_fn=lambda: warmup_lr,
+                     false_fn=lambda: lr)
+    return lr
+
+  def poly_rate_fn(global_step):
+    """Handles linear scaling rule, gradual warmup, and LR decay.
+
+    The learning rate starts at 0, then it increases linearly per step.  After
+    FLAGS.poly_warmup_epochs, we reach the base learning rate (scaled to account
+    for batch size). The learning rate is then decayed using a polynomial rate
+    decay schedule with power 2.0.
+
+    Args:
+      global_step: the current global_step
+
+    Returns:
+      returns the current learning rate
+    """
+
+    # Learning rate schedule for LARS polynomial schedule
+    if flags.FLAGS.batch_size < 8192:
+      plr = 5.0
+      w_epochs = 5
+    elif flags.FLAGS.batch_size < 16384:
+      plr = 10.0
+      w_epochs = 5
+    elif flags.FLAGS.batch_size < 32768:
+      plr = 25.0
+      w_epochs = 5
+    else:
+      plr = 32.0
+      w_epochs = 14
+
+    w_steps = int(w_epochs * batches_per_epoch)
+    wrate = (plr * tf.cast(global_step, tf.float32) / tf.cast(
+        w_steps, tf.float32))
+
+    # TODO(pkanwar): use a flag to help calc num_epochs.
+    num_epochs = 90
+    train_steps = batches_per_epoch * num_epochs
+
+    min_step = tf.constant(1, dtype=tf.int64)
+    decay_steps = tf.maximum(min_step, tf.subtract(global_step, w_steps))
+    poly_rate = tf.train.polynomial_decay(
+        plr,
+        decay_steps,
+        train_steps - w_steps + 1,
+        power=2.0)
+    return tf.where(global_step <= w_steps, wrate, poly_rate)
+
+  # For LARS we have a new learning rate schedule
+  if flags.FLAGS.enable_lars:
+    return poly_rate_fn
+
+  return learning_rate_fn
+
+
+def resnet_model_fn(features, labels, mode, model_class,
+                    resnet_size, weight_decay, learning_rate_fn, momentum,
+                    data_format, resnet_version, loss_scale,
+                    loss_filter_fn=None, dtype=resnet_model.DEFAULT_DTYPE,
+                    fine_tune=False, label_smoothing=0.0):
+  """Shared functionality for different resnet model_fns.
+
+  Initializes the ResnetModel representing the model layers
+  and uses that model to build the necessary EstimatorSpecs for
+  the `mode` in question. For training, this means building losses,
+  the optimizer, and the train op that get passed into the EstimatorSpec.
+  For evaluation and prediction, the EstimatorSpec is returned without
+  a train op, but with the necessary parameters for the given mode.
+
+  Args:
+    features: tensor representing input images
+    labels: tensor representing class labels for all input images
+    mode: current estimator mode; should be one of
+      `tf.estimator.ModeKeys.TRAIN`, `EVALUATE`, `PREDICT`
+    model_class: a class representing a TensorFlow model that has a __call__
+      function. We assume here that this is a subclass of ResnetModel.
+    resnet_size: A single integer for the size of the ResNet model.
+    weight_decay: weight decay loss rate used to regularize learned variables.
+    learning_rate_fn: function that returns the current learning rate given
+      the current global_step
+    momentum: momentum term used for optimization
+    data_format: Input format ('channels_last', 'channels_first', or None).
+      If set to None, the format is dependent on whether a GPU is available.
+    resnet_version: Integer representing which version of the ResNet network to
+      use. See README for details. Valid values: [1, 2]
+    loss_scale: The factor to scale the loss for numerical stability. A detailed
+      summary is present in the arg parser help text.
+    loss_filter_fn: function that takes a string variable name and returns
+      True if the var should be included in loss calculation, and False
+      otherwise. If None, batch_normalization variables will be excluded
+      from the loss.
+    dtype: the TensorFlow dtype to use for calculations.
+    fine_tune: If True only train the dense layers(final layers).
+    label_smoothing: If greater than 0 then smooth the labels.
+
+  Returns:
+    EstimatorSpec parameterized according to the input params and the
+    current mode.
+  """
+
+  # Generate a summary node for the images
+  tf.compat.v1.summary.image('images', features, max_outputs=6)
+
+  ############## npu modify begin #############
+  # Checks that features/images have same data type being used for calculations.
+  if features.dtype != dtype:
+    features=tf.cast(features,dtype)
+  ############## npu modify end ###############
+
+  model = model_class(resnet_size, data_format, resnet_version=resnet_version,
+                      dtype=dtype)
+
+  logits = model(features, mode == tf.estimator.ModeKeys.TRAIN)
+
+  # This acts as a no-op if the logits are already in fp32 (provided logits are
+  # not a SparseTensor). If dtype is is low precision, logits must be cast to
+  # fp32 for numerical stability.
+  logits = tf.cast(logits, tf.float32)
+
+  predictions = {
+      'classes': tf.argmax(input=logits, axis=1),
+      'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
+  }
+
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    # Return the predictions and the specification for serving a SavedModel
+    return tf.estimator.EstimatorSpec(
+        mode=mode,
+        predictions=predictions,
+        export_outputs={
+            'predict': tf.estimator.export.PredictOutput(predictions)
+        })
+
+  # Calculate loss, which includes softmax cross entropy and L2 regularization.
+  if label_smoothing != 0.0:
+    one_hot_labels = tf.one_hot(labels, 1001)
+    cross_entropy = tf.losses.softmax_cross_entropy(
+        logits=logits, onehot_labels=one_hot_labels,
+        label_smoothing=label_smoothing)
+  else:
+    cross_entropy = tf.compat.v1.losses.sparse_softmax_cross_entropy(
+        logits=logits, labels=labels)
+
+  # Create a tensor named cross_entropy for logging purposes.
+  tf.identity(cross_entropy, name='cross_entropy')
+  tf.compat.v1.summary.scalar('cross_entropy', cross_entropy)
+
+  # If no loss_filter_fn is passed, assume we want the default behavior,
+  # which is that batch_normalization variables are excluded from loss.
+  def exclude_batch_norm(name):
+    return 'batch_normalization' not in name
+  loss_filter_fn = loss_filter_fn or exclude_batch_norm
+
+  # Add weight decay to the loss.
+  l2_loss = weight_decay * tf.add_n(
+      # loss is computed using fp32 for numerical stability.
+      [
+          tf.nn.l2_loss(tf.cast(v, tf.float32))
+          for v in tf.compat.v1.trainable_variables()
+          if loss_filter_fn(v.name)
+      ])
+  tf.compat.v1.summary.scalar('l2_loss', l2_loss)
+  loss = cross_entropy + l2_loss
+
+  if mode == tf.estimator.ModeKeys.TRAIN:
+    global_step = tf.compat.v1.train.get_or_create_global_step()
+
+    learning_rate = learning_rate_fn(global_step)
+
+    # Create a tensor named learning_rate for logging purposes
+    tf.identity(learning_rate, name='learning_rate')
+    tf.compat.v1.summary.scalar('learning_rate', learning_rate)
+
+    if flags.FLAGS.enable_lars:
+      from tensorflow.contrib import opt as contrib_opt  # pylint: disable=g-import-not-at-top
+      optimizer = contrib_opt.LARSOptimizer(
+          learning_rate,
+          momentum=momentum,
+          weight_decay=weight_decay,
+          skip_list=['batch_normalization', 'bias'])
+    else:
+      optimizer = tf.compat.v1.train.MomentumOptimizer(
+          learning_rate=learning_rate,
+          momentum=momentum
+      )
+
+    ############## npu modify begin #############
+    optimizer = NPUDistributedOptimizer(optimizer)
+    ############## npu modify end ###############
+
+    fp16_implementation = getattr(flags.FLAGS, 'fp16_implementation', None)
+    if fp16_implementation == 'graph_rewrite':
+      optimizer = (
+          tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
+              optimizer, loss_scale=loss_scale))
+
+    def _dense_grad_filter(gvs):
+      """Only apply gradient updates to the final layer.
+
+      This function is used for fine tuning.
+
+      Args:
+        gvs: list of tuples with gradients and variable info
+      Returns:
+        filtered gradients so that only the dense layer remains
+      """
+      return [(g, v) for g, v in gvs if 'dense' in v.name]
+	#loss_scale = 512
+    if loss_scale != 1 and fp16_implementation != 'graph_rewrite':
+      # When computing fp16 gradients, often intermediate tensor values are
+      # so small, they underflow to 0. To avoid this, we multiply the loss by
+      # loss_scale to make these tensor values loss_scale times bigger.
+      scaled_grad_vars = optimizer.compute_gradients(loss * loss_scale)
+      print(">>>>>>>>>>>>>>>>>>>")
+      print(loss_scale)
+      print("<<<<<<<<<<<<<<<<<<")
+      if fine_tune:
+        scaled_grad_vars = _dense_grad_filter(scaled_grad_vars)
+
+      # Once the gradient computation is complete we can scale the gradients
+      # back to the correct scale before passing them to the optimizer.
+      unscaled_grad_vars = [(grad / loss_scale, var)
+                            for grad, var in scaled_grad_vars]
+      minimize_op = optimizer.apply_gradients(unscaled_grad_vars, global_step)
+    else:
+      grad_vars = optimizer.compute_gradients(loss)
+      if fine_tune:
+        grad_vars = _dense_grad_filter(grad_vars)
+      minimize_op = optimizer.apply_gradients(grad_vars, global_step)
+    update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
+    train_op = tf.group(minimize_op, update_ops)
+  else:
+    train_op = None
+
+  ############## npu modify begin #############
+  #Using float32 for better performance
+  accuracy = tf.compat.v1.metrics.accuracy(tf.cast(labels,tf.float32), predictions['classes'])
+  ############## npu modify end ###############
+
+  accuracy_top_5 = tf.compat.v1.metrics.mean(
+      tf.nn.in_top_k(predictions=logits, targets=labels, k=5, name='top_5_op'))
+
+  ############## npu modify begin #############
+  #Using for 8P
+  rank_size = int(os.getenv('RANK_SIZE'))
+  newaccuracy = (hccl_ops.allreduce(accuracy[0], "sum") / rank_size, accuracy[1])
+  newaccuracy_top_5 = (hccl_ops.allreduce(accuracy_top_5[0], "sum") / rank_size, accuracy_top_5[1])
+  ############## npu modify begin #############
+
+  metrics = {'accuracy': newaccuracy,
+             'accuracy_top_5': newaccuracy_top_5}
+
+  # Create a tensor named train_accuracy for logging purposes
+  tf.identity(accuracy[1], name='train_accuracy')
+  tf.identity(accuracy_top_5[1], name='train_accuracy_top_5')
+  tf.compat.v1.summary.scalar('train_accuracy', accuracy[1])
+  tf.compat.v1.summary.scalar('train_accuracy_top_5', accuracy_top_5[1])
+
+  return tf.estimator.EstimatorSpec(
+      mode=mode,
+      predictions=predictions,
+      loss=loss,
+      train_op=train_op,
+      eval_metric_ops=metrics)
+
+############## npu modify begin #############
+def init_npu():
+  """Initialize npu manually.
+  Returns:
+    `init_sess` npu  init session config.
+    `npu_init` npu  init ops.
+  """
+  npu_init = npu_ops.initialize_system()
+  config = tf.ConfigProto()
+
+  #npu mix precision attribute set to true when using mix precision
+  config.graph_options.rewrite_options.remapping = rewriter_config_pb2.RewriterConfig.OFF
+  custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
+  custom_op.name = "NpuOptimizer"
+  #custom_op.parameter_map["precision_mode"].b = True
+  custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision")
+  custom_op.parameter_map["use_off_line"].b = True
+
+  init_sess = tf.Session(config=config)
+  print("this is init sess config -------------  ",config)
+  print("this is npu_init -------------  ", npu_init)
+  # i=1
+  # while(1):
+  #   i+=1
+  return init_sess,npu_init
+############## npu modify end ###############
+
+def resnet_main(
+    flags_obj, model_function, input_function, dataset_name, num_images, shape=None):
+  """Shared main loop for ResNet Models.
+
+  Args:
+    flags_obj: An object containing parsed flags. See define_resnet_flags()
+      for details.
+    model_function: the function that instantiates the Model and builds the
+      ops for train/eval. This will be passed directly into the estimator.
+    input_function: the function that processes the dataset and returns a
+      dataset that the estimator can train on. This will be wrapped with
+      all the relevant flags for running and passed to estimator.
+    dataset_name: the name of the dataset for training and evaluation. This is
+      used for logging purpose.
+    shape: list of ints representing the shape of the images used for training.
+      This is only used if flags_obj.export_dir is passed.
+
+  Returns:
+     Dict of results of the run.  Contains the keys `eval_results` and
+    `train_hooks`. `eval_results` contains accuracy (top_1) and accuracy_top_5.
+    `train_hooks` is a list the instances of hooks used during training.
+  """
+  # Set other logger configurations
+  # work_num="work " + str(os.environ.get("DEVICE_INDEX"))
+  # hwlog.config(
+  #     default_namespace=work_num,
+  #     default_stack_offset=1,
+  #     default_clear_line=False,
+  #     root_dir=os.path.normpath(
+  #         os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..")))
+
+  # global logger1
+  # logger1 = get_logger('rizhi', log_file1)
+
+  # print("work_num is ", work_num)
+  # exit()
+  model_helpers.apply_clean(flags.FLAGS)
+
+  # Ensures flag override logic is only executed if explicitly triggered.
+  if flags_obj.tf_gpu_thread_mode:
+    override_flags_and_set_envars_for_gpu_thread_pool(flags_obj)
+
+  # Configures cluster spec for distribution strategy.
+  num_workers = distribution_utils.configure_cluster(flags_obj.worker_hosts,
+                                                     flags_obj.task_index)
+
+  # Creates session config. allow_soft_placement = True, is required for
+  # multi-GPU and is not harmful for other modes.
+  session_config = tf.compat.v1.ConfigProto(
+      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
+      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
+      allow_soft_placement=True)
+
+  distribution_strategy = distribution_utils.get_distribution_strategy(
+      distribution_strategy=flags_obj.distribution_strategy,
+      num_gpus=flags_core.get_num_gpus(flags_obj),
+      all_reduce_alg=flags_obj.all_reduce_alg,
+      num_packs=flags_obj.num_packs)
+
+  ############## npu modify begin #############
+  # Creates a `NPURunConfig` that checkpoints every 115200 steps
+  run_config = NPURunConfig(
+        model_dir=flags_obj.model_dir,
+        session_config=session_config,
+        keep_checkpoint_max=5,
+        save_summary_steps=0,
+        #save_checkpoints_steps=115200,
+        save_checkpoints_steps=flags_obj.save_checkpoints_steps,
+        enable_data_pre_proc=True,
+        #iterations_per_loop=100,
+        iterations_per_loop=flags_obj.iterations_per_loop,
+        #enable_auto_mix_precision=True,
+        precision_mode='allow_mix_precision',
+        hcom_parallel=True
+      )
+  ############## npu modify end ###############
+
+  # Initializes model with all but the dense layer from pretrained ResNet.
+  if flags_obj.pretrained_model_checkpoint_path is not None:
+    warm_start_settings = tf.estimator.WarmStartSettings(
+        flags_obj.pretrained_model_checkpoint_path,
+        vars_to_warm_start='^(?!.*dense)')
+  else:
+    warm_start_settings = None
+
+  ############## npu modify begin #############
+  # Creates a `NPUEstimator` instead of using tf.estimator.Estimator 
+  classifier = NPUEstimator(
+      model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config,
+      params={
+          'resnet_size': int(flags_obj.resnet_size),
+          'data_format': flags_obj.data_format,
+          'batch_size': flags_obj.batch_size,
+          'resnet_version': int(flags_obj.resnet_version),
+          'loss_scale': flags_core.get_loss_scale(flags_obj,
+                                                  default_for_fp16=128),
+          'dtype': flags_core.get_tf_dtype(flags_obj),
+          'fine_tune': flags_obj.fine_tune,
+          'num_workers': num_workers,
+          'num_gpus' : flags_core.get_num_gpus(flags_obj),
+      })
+  ############## npu modify end ###############
+
+  run_params = {
+      'batch_size': flags_obj.batch_size,
+      'dtype': flags_core.get_tf_dtype(flags_obj),
+      'resnet_size': flags_obj.resnet_size,
+      'resnet_version': flags_obj.resnet_version,
+      'synthetic_data': flags_obj.use_synthetic_data,
+      'train_epochs': flags_obj.train_epochs,
+      'num_workers': num_workers,
+  }
+  if flags_obj.use_synthetic_data:
+    dataset_name = dataset_name + '-synthetic'
+
+  benchmark_logger = logger.get_benchmark_logger()
+  benchmark_logger.log_run_info('resnet', dataset_name, run_params,
+                                test_id=flags_obj.benchmark_test_id)
+
+  train_hooks = hooks_helper.get_train_hooks(
+      flags_obj.hooks,
+      model_dir=flags_obj.model_dir,
+      batch_size=flags_obj.batch_size)
+
+  def input_fn_train(num_epochs, input_context=None):
+    ############## npu modify begin #############
+    # Using dtype=tf.float16 for higher data transmission performance
+    # drop_remainder currently only support true
+    # batch_size means single card batch instead of global batch size
+    return input_function(
+        is_training=True,
+        data_dir=flags_obj.data_dir,
+        batch_size=flags_obj.batch_size,
+        num_epochs=num_epochs,
+        dtype=tf.float16,
+        input_context=input_context,
+        drop_remainder=True)
+
+  def input_fn_eval():
+    # batch_size means single card batch instead of global batch size
+    # Using dtype=tf.float16 for higher data transmission performance
+    # drop_remainder currently only support true 
+    return input_function(
+        is_training=False,
+        data_dir=flags_obj.data_dir,
+        batch_size=flags_obj.batch_size,
+        num_epochs=1,
+        dtype=tf.float16,
+        input_context=True,
+        drop_remainder=True)
+    ############## npu modify end ###############
+
+  train_epochs = (0 if flags_obj.eval_only or not flags_obj.train_epochs else
+                  flags_obj.train_epochs)
+
+  use_train_and_evaluate = flags_obj.use_train_and_evaluate or num_workers > 1
+
+  ############## npu_kai modify end ###############
+  # init_sess, npu_init = init_npu()
+  # npu_shutdown = npu_ops.shutdown_system()
+  ############## npu_kai modify end ###############
+
+  if use_train_and_evaluate:
+    train_spec = tf.estimator.TrainSpec(
+        input_fn=lambda input_context=None: input_fn_train(
+            train_epochs, input_context=input_context),
+        hooks=train_hooks,
+        max_steps=flags_obj.max_train_steps)
+    eval_spec = tf.estimator.EvalSpec(input_fn=input_fn_eval)
+    tf.compat.v1.logging.info('Starting to train and evaluate.')
+    tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
+    # tf.estimator.train_and_evaluate doesn't return anything in the
+    # multi-worker case.
+    eval_results = {}
+  else:
+    if train_epochs == 0:
+      # If --eval_only is set, perform a single loop with zero train epochs.
+      schedule, n_loops = [0], 1
+    else:
+      # Compute the number of times to loop while training. All but the last
+      # pass will train for `epochs_between_evals` epochs, while the last will
+      # train for the number needed to reach `training_epochs`. For instance if
+      #   train_epochs = 25 and epochs_between_evals = 10
+      # schedule will be set to [10, 10, 5]. That is to say, the loop will:
+      #   Train for 10 epochs and then evaluate.
+      #   Train for another 10 epochs and then evaluate.
+      #   Train for a final 5 epochs (to reach 25 epochs) and then evaluate.
+      n_loops = math.ceil(train_epochs / flags_obj.epochs_between_evals)
+      schedule = [flags_obj.epochs_between_evals for _ in range(int(n_loops))]
+      schedule[-1] = train_epochs - sum(schedule[:-1])  # over counting.
+
+    current_max_steps = 0
+    ############## npu modify begin #############
+    #if flags_obj.max_train_steps is None:
+    #   flags_obj.max_train_steps = (num_images['train']/flags_obj.batch_size)/flags_core.get_num_gpus(flags_obj)
+    #   max_eval_steps = num_images['validation']/flags_obj.batch_size
+    # else:
+    #   max_eval_steps = flags_obj.max_train_steps
+    # for cycle_index, num_train_epochs in enumerate(schedule):
+    #     print(cycle_index)
+    #     print(num_train_epochs)
+    ############## npu modify end #############
+    for cycle_index, num_train_epochs in enumerate(schedule):
+      tf.compat.v1.logging.info('Starting cycle: %d/%d', cycle_index,
+                                int(n_loops))
+      ############## npu modify begin #############
+      if flags_obj.max_train_steps is None:
+          current_max_steps += (
+                               num_images['train'] / flags_obj.batch_size) * num_train_epochs / flags_core.get_num_gpus(
+              flags_obj)
+      else:
+          current_max_steps += flags_obj.max_train_steps
+      ############## npu modify end #############
+
+      # Added by zwx5326390: training start.
+      # hwlogger.event(key=hwlog.constants.GLOBAL_BATCH_SIZE, value=flags_obj.batch_size)
+      #work_num, root_dir, datatime, resnet_logger = hwlog.env(log_file1)
+      #date_time = hwlog.get_time()
+      #resnet_logger.info("namespace: %s,time_ts: %s, global_batch_size: %d, num_train_epochs: %d" %(\
+          #work_num, date_time, flags_obj.batch_size, num_train_epochs))
+      #remark_logger.info("ABK time_ts: %s, current_epoch: %d, batch_size: %d, file: %s, lineno: %s" % (date_time,
+      #      num_train_epochs, flags_obj.batch_size,file_name, sys._getframe().f_lineno))
+      hwlog.remark_print(key=hwlog.CURRENT_EPOCH, value=num_train_epochs)
+
+      if num_train_epochs:
+        # Since we are calling classifier.train immediately in each loop, the
+        # value of num_train_epochs in the lambda function will not be changed
+        # before it is used. So it is safe to ignore the pylint error here
+        # pylint: disable=cell-var-from-loop
+        # hwlogger.start(key=hwlog.constants.EPOCH_START)
+		
+        from hccl.split.api import set_split_strategy_by_idx
+        set_split_strategy_by_idx([86,160])
+        classifier.train(
+            input_fn=lambda input_context=True: input_fn_train(
+                num_train_epochs, input_context=input_context),
+            hooks=train_hooks,
+            max_steps=current_max_steps)
+
+        # hwlogger.end(key=hwlog.constants.EPOCH_STOP)
+      ############## npu modify begin #############
+      # NPU resources are destroyed when training finishes, so the NPU must be
+      # re-initialized before the hccl interface is used by the next step.
+      init_sess,npu_init=init_npu()
+      npu_shutdown = npu_ops.shutdown_system()
+      init_sess.run(npu_shutdown)
+      init_sess.run(npu_init)
+      ############## npu modify end ###############
+
+      # flags_obj.max_train_steps is generally associated with testing and
+      # profiling. As a result it is frequently called with synthetic data,
+      # which will iterate forever. Passing steps=flags_obj.max_train_steps
+      # allows the eval (which is generally unimportant in those circumstances)
+      # to terminate.  Note that eval will run for max_train_steps each loop,
+      # regardless of the global_step count.
+      tf.compat.v1.logging.info('Starting to evaluate.')
+      eval_results = classifier.evaluate(input_fn=input_fn_eval,
+                                         steps=num_images['validation']/flags_obj.batch_size)
+      benchmark_logger.log_evaluation_result(eval_results)
+
+      #date_time = hwlog.get_time()
+      #remark_logger.info("ABK time_ts: %s, accuracy: %f, accuracy_top_5: %f, file: %s, lineno: %s" % (date_time,
+      #  float(eval_results.get("accuracy")),float(eval_results.get("accuracy_top_5")), file_name,sys._getframe().f_lineno))
+      hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1, value=float(eval_results.get("accuracy")))
+      hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP5, value=float(eval_results.get("accuracy_top_5")))
+      if model_helpers.past_stop_threshold(
+          flags_obj.stop_threshold, eval_results['accuracy']):
+        break
+
+      ############## npu modify begin #############
+      # NPU resources are destroyed when evaluation finishes, so the NPU must
+      # be re-initialized before the hccl interface is used again.
+      if cycle_index < n_loops-1:
+          init_sess,npu_init=init_npu()
+          npu_shutdown = npu_ops.shutdown_system()
+          init_sess.run(npu_shutdown)
+          #from hccl.split.api import set_split_strategy_by_idx
+         # set_split_strategy_by_idx([86,160])
+          init_sess.run(npu_init)
+      ############## npu modify end ###############
+
+  if flags_obj.export_dir is not None:
+    # Exports a saved model for the given classifier.
+    export_dtype = flags_core.get_tf_dtype(flags_obj)
+    if flags_obj.image_bytes_as_serving_input:
+      input_receiver_fn = functools.partial(
+          image_bytes_serving_input_fn, shape, dtype=export_dtype)
+    else:
+      input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
+          shape, batch_size=flags_obj.batch_size, dtype=export_dtype)
+    classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn,
+                                 strip_default_attrs=True)
+
+  ############## npu modify begin #############
+  npu_shutdown = npu_ops.shutdown_system()
+  init_sess.run(npu_shutdown)
+  ############## npu modify end ###############
+
+  stats = {}
+  stats['eval_results'] = eval_results
+  stats['train_hooks'] = train_hooks
+
+  return stats
+
+
+def define_resnet_flags(resnet_size_choices=None, dynamic_loss_scale=False,
+                        fp16_implementation=False):
+  """Add flags and validators for ResNet."""
+  flags_core.define_base(clean=True, train_epochs=True,
+                         epochs_between_evals=True, stop_threshold=True,
+                         num_gpu=True, hooks=True, export_dir=True,
+                         distribution_strategy=True)
+  flags_core.define_performance(num_parallel_calls=False,
+                                inter_op=True,
+                                intra_op=True,
+                                synthetic_data=True,
+                                dtype=True,
+                                all_reduce_alg=True,
+                                num_packs=True,
+                                tf_gpu_thread_mode=True,
+                                datasets_num_private_threads=True,
+                                dynamic_loss_scale=dynamic_loss_scale,
+                                fp16_implementation=fp16_implementation,
+                                loss_scale=True,
+                                tf_data_experimental_slack=True,
+                                max_train_steps=True)
+  flags_core.define_image()
+  flags_core.define_benchmark()
+  flags_core.define_distribution()
+  flags.adopt_module_key_flags(flags_core)
+
+  flags.DEFINE_enum(
+      name='resnet_version', short_name='rv', default='1',
+      enum_values=['1', '2'],
+      help=flags_core.help_wrap(
+          'Version of ResNet. (1 or 2) See README.md for details.'))
+  flags.DEFINE_bool(
+      name='fine_tune', short_name='ft', default=False,
+      help=flags_core.help_wrap(
+          'If True do not train any parameters except for the final layer.'))
+  flags.DEFINE_string(
+      name='pretrained_model_checkpoint_path', short_name='pmcp', default=None,
+      help=flags_core.help_wrap(
+          'If not None initialize all the network except the final layer with '
+          'these values'))
+  flags.DEFINE_boolean(
+      name='eval_only', default=False,
+      help=flags_core.help_wrap('Skip training and only perform evaluation on '
+                                'the latest checkpoint.'))
+  flags.DEFINE_boolean(
+      name='image_bytes_as_serving_input', default=False,
+      help=flags_core.help_wrap(
+          'If True exports savedmodel with serving signature that accepts '
+          'JPEG image bytes instead of a fixed size [HxWxC] tensor that '
+          'represents the image. The former is easier to use for serving at '
+          'the expense of image resize/cropping being done as part of model '
+          'inference. Note, this flag only applies to ImageNet and cannot '
+          'be used for CIFAR.'))
+  flags.DEFINE_boolean(
+      name='use_train_and_evaluate', default=False,
+      help=flags_core.help_wrap(
+          'If True, uses `tf.estimator.train_and_evaluate` for the training '
+          'and evaluation loop, instead of separate calls to `classifier.train '
+          'and `classifier.evaluate`, which is the default behavior.'))
+  flags.DEFINE_bool(
+      name='enable_lars', default=False,
+      help=flags_core.help_wrap(
+          'Enable LARS optimizer for large batch training.'))
+  flags.DEFINE_float(
+      name='label_smoothing', default=0.0,
+      help=flags_core.help_wrap(
+          'Label smoothing parameter used in the softmax_cross_entropy'))
+  flags.DEFINE_float(
+      name='weight_decay', default=1e-4,
+      help=flags_core.help_wrap(
+          'Weight decay coefficiant for l2 regularization.'))
+
+  choice_kwargs = dict(
+      name='resnet_size', short_name='rs', default='50',
+      help=flags_core.help_wrap('The size of the ResNet model to use.'))
+
+  if resnet_size_choices is None:
+    flags.DEFINE_string(**choice_kwargs)
+  else:
+    flags.DEFINE_enum(enum_values=resnet_size_choices, **choice_kwargs)
@@ -0,0 +1,901 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains utility and supporting functions for ResNet.
+
+  This module contains ResNet code which does not directly build layers. This
+includes dataset management, hyperparameter and optimizer code, and argument
+parsing. Code for defining the ResNet layers can be found in resnet_model.py.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import math
+import multiprocessing
+import os
+
+from absl import flags
+import tensorflow as tf
+
+############## npu modify begin #############
+from npu_bridge.estimator.npu.npu_config import NPURunConfig
+from npu_bridge.estimator.npu.npu_estimator  import NPUEstimator
+from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
+from npu_bridge.estimator import npu_ops
+from hccl.manage.api import get_local_rank_id
+from hccl.manage.api import get_rank_size
+from hccl.manage.api import get_rank_id
+from tensorflow.core.protobuf import rewriter_config_pb2
+############## npu modify end ###############
+
+from official.r1.resnet import imagenet_preprocessing
+from official.r1.resnet import resnet_model
+from official.r1.utils import export
+from official.utils.flags import core as flags_core
+from official.utils.logs import hooks_helper
+from official.utils.logs import logger
+from official.utils.misc import distribution_utils
+from official.utils.misc import model_helpers
+
+
+################################################################################
+# Functions for input processing.
+################################################################################
+def process_record_dataset(dataset,
+                           is_training,
+                           batch_size,
+                           shuffle_buffer,
+                           parse_record_fn,
+                           num_epochs=1,
+                           dtype=tf.float32,
+                           datasets_num_private_threads=None,
+                           drop_remainder=False,
+                           tf_data_experimental_slack=False):
+  """Given a Dataset with raw records, return a batched, prefetched Dataset.
+
+  The pipeline is: (optional) private threadpool -> prefetch -> (training
+  only) shuffle -> repeat -> parse -> batch -> prefetch -> (optional) slack.
+
+  Args:
+    dataset: A Dataset representing raw records
+    is_training: A boolean denoting whether the input is for training. When
+      True the records are shuffled; the value is also passed through to
+      `parse_record_fn`.
+    batch_size: The number of samples per batch. Also used as the buffer size
+      of the first prefetch.
+    shuffle_buffer: The buffer size to use when shuffling records. A larger
+      value results in better randomness, but smaller values reduce startup
+      time and use less memory.
+    parse_record_fn: A function called as
+      `parse_record_fn(value, is_training, dtype)` that takes a raw record and
+      returns the corresponding (image, label) pair.
+    num_epochs: The number of epochs to repeat the dataset. NOTE: currently
+      unused. The dataset is repeated with `dataset.repeat()` (no count), so
+      the returned dataset does not end on its own.
+    dtype: Data type to use for images/features.
+    datasets_num_private_threads: Number of threads for a private
+      threadpool created for all datasets computation.
+    drop_remainder: A boolean indicates whether to drop the remainder of the
+      batches. If True, the batch dimension will be static.
+    tf_data_experimental_slack: Whether to enable tf.data's
+      `experimental_slack` option.
+
+  Returns:
+    Dataset of (image, label) pairs ready for iteration.
+  """
+  # Defines a specific size thread pool for tf.data operations.
+  if datasets_num_private_threads:
+    options = tf.data.Options()
+    options.experimental_threading.private_threadpool_size = (
+        datasets_num_private_threads)
+    dataset = dataset.with_options(options)
+    tf.compat.v1.logging.info('datasets_num_private_threads: %s',
+                              datasets_num_private_threads)
+
+  # Disable intra-op parallelism to optimize for throughput instead of latency.
+  options = tf.data.Options()
+  options.experimental_threading.max_intra_op_parallelism = 1
+  dataset = dataset.with_options(options)
+
+  # Prefetches a batch at a time to smooth out the time taken to load input
+  # files for shuffling and processing.
+  dataset = dataset.prefetch(buffer_size=batch_size)
+  if is_training:
+    # Shuffles records before repeating to respect epoch boundaries.
+    dataset = dataset.shuffle(buffer_size=shuffle_buffer)
+
+  # The bounded repeat below was replaced by an unbounded one, which is why
+  # `num_epochs` no longer has any effect on the pipeline:
+  #   dataset = dataset.repeat(num_epochs)
+  dataset = dataset.repeat()
+  # Parses the raw records into images and labels.
+  dataset = dataset.map(
+      lambda value: parse_record_fn(value, is_training, dtype),
+      num_parallel_calls=tf.data.experimental.AUTOTUNE)
+  dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
+
+  # Operations between the final prefetch and the get_next call to the iterator
+  # will happen synchronously during run time. We prefetch here again to
+  # background all of the above processing work and keep it out of the
+  # critical training path. Setting buffer_size to tf.data.experimental.AUTOTUNE
+  # allows DistributionStrategies to adjust how many batches to fetch based
+  # on how many devices are present.
+  dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
+
+  # Applied last so the slack option is attached to the final dataset.
+  if tf_data_experimental_slack:
+    options = tf.data.Options()
+    options.experimental_slack = True
+    dataset = dataset.with_options(options)
+
+  return dataset
+
+
+def get_synth_input_fn(height, width, num_channels, num_classes,
+                       dtype=tf.float32):
+  """Returns an input function that returns a dataset with random data.
+
+  This input_fn returns a data set that iterates over a set of random data and
+  bypasses all preprocessing, e.g. jpeg decode and copy. The host to device
+  copy is still included. This used to find the upper throughput bound when
+  tunning the full input pipeline.
+
+  Args:
+    height: Integer height that will be used to create a fake image tensor.
+    width: Integer width that will be used to create a fake image tensor.
+    num_channels: Integer depth that will be used to create a fake image tensor.
+    num_classes: Number of classes that should be represented in the fake labels
+      tensor
+    dtype: Data type for features/images.
+
+  Returns:
+    An input_fn that can be used in place of a real one to return a dataset
+    that can be used for iteration.
+  """
+  # pylint: disable=unused-argument
+  def input_fn(is_training, data_dir, batch_size, *args, **kwargs):
+    """Returns dataset filled with random data."""
+    # Synthetic input should be within [0, 255].
+    inputs = tf.random.truncated_normal(
+        [batch_size] + [height, width, num_channels],
+        dtype=dtype,
+        mean=127,
+        stddev=60,
+        name='synthetic_inputs')
+
+    labels = tf.random.uniform(
+        [batch_size],
+        minval=0,
+        maxval=num_classes - 1,
+        dtype=tf.int32,
+        name='synthetic_labels')
+    data = tf.data.Dataset.from_tensors((inputs, labels)).repeat()
+    data = data.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
+    return data
+
+  return input_fn
+
+
+def image_bytes_serving_input_fn(image_shape, dtype=tf.float32):
+  """Serving input fn for raw jpeg images."""
+
+  def _preprocess_image(image_bytes):
+    """Preprocess a single raw image."""
+    # Bounding box around the whole image.
+    bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=dtype, shape=[1, 1, 4])
+    height, width, num_channels = image_shape
+    image = imagenet_preprocessing.preprocess_image(
+        image_bytes, bbox, height, width, num_channels, is_training=False)
+    return image
+
+  image_bytes_list = tf.compat.v1.placeholder(
+      shape=[None], dtype=tf.string, name='input_tensor')
+  images = tf.map_fn(
+      _preprocess_image, image_bytes_list, back_prop=False, dtype=dtype)
+  return tf.estimator.export.TensorServingInputReceiver(
+      images, {'image_bytes': image_bytes_list})
+
+
+def override_flags_and_set_envars_for_gpu_thread_pool(flags_obj):
+  """Override flags and set env_vars for performance.
+
+  These settings exist to test the difference between using stock settings
+  and manual tuning. It also shows some of the ENV_VARS that can be tweaked to
+  squeeze a few extra examples per second.  These settings are defaulted to the
+  current platform of interest, which changes over time.
+
+  On systems with small numbers of cpu cores, e.g. under 8 logical cores,
+  setting up a gpu thread pool with `tf_gpu_thread_mode=gpu_private` may perform
+  poorly.
+
+  Args:
+    flags_obj: Current flags, which will be adjusted possibly overriding
+    what has been set by the user on the command-line.
+  """
+  cpu_count = multiprocessing.cpu_count()
+  tf.compat.v1.logging.info('Logical CPU cores: %s', cpu_count)
+
+  # Sets up thread pool for each GPU for op scheduling.
+  per_gpu_thread_count = 1
+  total_gpu_thread_count = per_gpu_thread_count * flags_obj.num_gpus
+  os.environ['TF_GPU_THREAD_MODE'] = flags_obj.tf_gpu_thread_mode
+  os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count)
+  tf.compat.v1.logging.info('TF_GPU_THREAD_COUNT: %s',
+                            os.environ['TF_GPU_THREAD_COUNT'])
+  tf.compat.v1.logging.info('TF_GPU_THREAD_MODE: %s',
+                            os.environ['TF_GPU_THREAD_MODE'])
+
+  # Reduces general thread pool by number of threads used for GPU pool.
+  main_thread_count = cpu_count - total_gpu_thread_count
+  flags_obj.inter_op_parallelism_threads = main_thread_count
+
+  # Sets thread count for tf.data. Logical cores minus threads assign to the
+  # private GPU pool along with 2 thread per GPU for event monitoring and
+  # sending / receiving tensors.
+  num_monitoring_threads = 2 * flags_obj.num_gpus
+  flags_obj.datasets_num_private_threads = (cpu_count - total_gpu_thread_count
+                                            - num_monitoring_threads)
+
+
+################################################################################
+# Functions for running training/eval/validation loops for the model.
+################################################################################
+def learning_rate_with_decay(
+    batch_size, batch_denom, num_images, boundary_epochs, decay_rates,
+    base_lr=0.1, warmup=False):
+  """Get a learning rate that decays step-wise as training progresses.
+
+  Args:
+    batch_size: the number of examples processed in each training batch.
+    batch_denom: this value will be used to scale the base learning rate.
+      `0.1 * batch size` is divided by this number, such that when
+      batch_denom == batch_size, the initial learning rate will be 0.1.
+    num_images: total number of images that will be used for training.
+    boundary_epochs: list of ints representing the epochs at which we
+      decay the learning rate.
+    decay_rates: list of floats representing the decay rates to be used
+      for scaling the learning rate. It should have one more element
+      than `boundary_epochs`, and all elements should have the same type.
+    base_lr: Initial learning rate scaled based on batch_denom.
+    warmup: Run a 5 epoch warmup to the initial lr.
+  Returns:
+    Returns a function that takes a single argument - the number of batches
+    trained so far (global_step)- and returns the learning rate to be used
+    for training the next batch.
+  """
+  initial_learning_rate = base_lr * batch_size / batch_denom
+  batches_per_epoch = num_images / batch_size
+
+  # Reduce the learning rate at certain epochs.
+  # CIFAR-10: divide by 10 at epoch 100, 150, and 200
+  # ImageNet: divide by 10 at epoch 30, 60, 80, and 90
+  boundaries = [int(batches_per_epoch * epoch) for epoch in boundary_epochs]
+  vals = [initial_learning_rate * decay for decay in decay_rates]
+
+  def learning_rate_fn(global_step):
+    """Builds scaled learning rate function with 5 epoch warm up."""
+
+    ############## npu modify begin #############
+    #Using int32 for better computing performance
+    global_step=tf.cast(global_step,tf.int32)
+    ############## npu modify end ###############
+
+    lr = tf.compat.v1.train.piecewise_constant(global_step, boundaries, vals)
+    if warmup:
+      warmup_steps = int(batches_per_epoch * 5)
+      warmup_lr = (
+          initial_learning_rate * tf.cast(global_step, tf.float32) / tf.cast(
+              warmup_steps, tf.float32))
+      return tf.cond(pred=global_step < warmup_steps,
+                     true_fn=lambda: warmup_lr,
+                     false_fn=lambda: lr)
+    return lr
+
+  def poly_rate_fn(global_step):
+    """Handles linear scaling rule, gradual warmup, and LR decay.
+
+    The learning rate starts at 0, then it increases linearly per step.  After
+    FLAGS.poly_warmup_epochs, we reach the base learning rate (scaled to account
+    for batch size). The learning rate is then decayed using a polynomial rate
+    decay schedule with power 2.0.
+
+    Args:
+      global_step: the current global_step
+
+    Returns:
+      returns the current learning rate
+    """
+
+    # Learning rate schedule for LARS polynomial schedule
+    if flags.FLAGS.batch_size < 8192:
+      plr = 5.0
+      w_epochs = 5
+    elif flags.FLAGS.batch_size < 16384:
+      plr = 10.0
+      w_epochs = 5
+    elif flags.FLAGS.batch_size < 32768:
+      plr = 25.0
+      w_epochs = 5
+    else:
+      plr = 32.0
+      w_epochs = 14
+
+    w_steps = int(w_epochs * batches_per_epoch)
+    wrate = (plr * tf.cast(global_step, tf.float32) / tf.cast(
+        w_steps, tf.float32))
+
+    # TODO(pkanwar): use a flag to help calc num_epochs.
+    num_epochs = 90
+    train_steps = batches_per_epoch * num_epochs
+
+    min_step = tf.constant(1, dtype=tf.int64)
+    decay_steps = tf.maximum(min_step, tf.subtract(global_step, w_steps))
+    poly_rate = tf.train.polynomial_decay(
+        plr,
+        decay_steps,
+        train_steps - w_steps + 1,
+        power=2.0)
+    return tf.where(global_step <= w_steps, wrate, poly_rate)
+
+  # For LARS we have a new learning rate schedule
+  if flags.FLAGS.enable_lars:
+    return poly_rate_fn
+
+  return learning_rate_fn
+
+
+def resnet_model_fn(features, labels, mode, model_class,
+                    resnet_size, weight_decay, learning_rate_fn, momentum,
+                    data_format, resnet_version, loss_scale,
+                    loss_filter_fn=None, dtype=resnet_model.DEFAULT_DTYPE,
+                    fine_tune=False, label_smoothing=0.0):
+  """Shared functionality for different resnet model_fns.
+
+  Initializes the ResnetModel representing the model layers
+  and uses that model to build the necessary EstimatorSpecs for
+  the `mode` in question. For training, this means building losses,
+  the optimizer, and the train op that get passed into the EstimatorSpec.
+  For evaluation and prediction, the EstimatorSpec is returned without
+  a train op, but with the necessary parameters for the given mode.
+
+  Args:
+    features: tensor representing input images
+    labels: tensor representing class labels for all input images
+    mode: current estimator mode; should be one of
+      `tf.estimator.ModeKeys.TRAIN`, `EVALUATE`, `PREDICT`
+    model_class: a class representing a TensorFlow model that has a __call__
+      function. We assume here that this is a subclass of ResnetModel.
+    resnet_size: A single integer for the size of the ResNet model.
+    weight_decay: weight decay loss rate used to regularize learned variables.
+    learning_rate_fn: function that returns the current learning rate given
+      the current global_step
+    momentum: momentum term used for optimization
+    data_format: Input format ('channels_last', 'channels_first', or None).
+      If set to None, the format is dependent on whether a GPU is available.
+    resnet_version: Integer representing which version of the ResNet network to
+      use. See README for details. Valid values: [1, 2]
+    loss_scale: The factor to scale the loss for numerical stability. A detailed
+      summary is present in the arg parser help text.
+    loss_filter_fn: function that takes a string variable name and returns
+      True if the var should be included in loss calculation, and False
+      otherwise. If None, batch_normalization variables will be excluded
+      from the loss.
+    dtype: the TensorFlow dtype to use for calculations.
+    fine_tune: If True only train the dense layers(final layers).
+    label_smoothing: If greater than 0 then smooth the labels.
+
+  Returns:
+    EstimatorSpec parameterized according to the input params and the
+    current mode.
+  """
+
+  # Generate a summary node for the images
+  tf.compat.v1.summary.image('images', features, max_outputs=6)
+
+  ############## npu modify begin #############
+  # Checks that features/images have same data type being used for calculations.
+  if features.dtype != dtype:
+    features=tf.cast(features,dtype)
+  ############## npu modify end ###############
+
+  model = model_class(resnet_size, data_format, resnet_version=resnet_version,
+                      dtype=dtype)
+
+  logits = model(features, mode == tf.estimator.ModeKeys.TRAIN)
+
+  # This acts as a no-op if the logits are already in fp32 (provided logits are
+  # not a SparseTensor). If dtype is is low precision, logits must be cast to
+  # fp32 for numerical stability.
+  logits = tf.cast(logits, tf.float32)
+
+  predictions = {
+      'classes': tf.argmax(input=logits, axis=1),
+      'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
+  }
+
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    # Return the predictions and the specification for serving a SavedModel
+    return tf.estimator.EstimatorSpec(
+        mode=mode,
+        predictions=predictions,
+        export_outputs={
+            'predict': tf.estimator.export.PredictOutput(predictions)
+        })
+
+  # Calculate loss, which includes softmax cross entropy and L2 regularization.
+  if label_smoothing != 0.0:
+    one_hot_labels = tf.one_hot(labels, 1001)
+    cross_entropy = tf.losses.softmax_cross_entropy(
+        logits=logits, onehot_labels=one_hot_labels,
+        label_smoothing=label_smoothing)
+  else:
+    cross_entropy = tf.compat.v1.losses.sparse_softmax_cross_entropy(
+        logits=logits, labels=labels)
+
+  # Create a tensor named cross_entropy for logging purposes.
+  tf.identity(cross_entropy, name='cross_entropy')
+  tf.compat.v1.summary.scalar('cross_entropy', cross_entropy)
+
+  # If no loss_filter_fn is passed, assume we want the default behavior,
+  # which is that batch_normalization variables are excluded from loss.
+  def exclude_batch_norm(name):
+    return 'batch_normalization' not in name
+  loss_filter_fn = loss_filter_fn or exclude_batch_norm
+
+  # Add weight decay to the loss.
+  l2_loss = weight_decay * tf.add_n(
+      # loss is computed using fp32 for numerical stability.
+      [
+          tf.nn.l2_loss(tf.cast(v, tf.float32))
+          for v in tf.compat.v1.trainable_variables()
+          if loss_filter_fn(v.name)
+      ])
+  tf.compat.v1.summary.scalar('l2_loss', l2_loss)
+  loss = cross_entropy + l2_loss
+
+  if mode == tf.estimator.ModeKeys.TRAIN:
+    global_step = tf.compat.v1.train.get_or_create_global_step()
+
+    learning_rate = learning_rate_fn(global_step)
+
+    # Create a tensor named learning_rate for logging purposes
+    tf.identity(learning_rate, name='learning_rate')
+    tf.compat.v1.summary.scalar('learning_rate', learning_rate)
+
+    if flags.FLAGS.enable_lars:
+      from tensorflow.contrib import opt as contrib_opt  # pylint: disable=g-import-not-at-top
+      optimizer = contrib_opt.LARSOptimizer(
+          learning_rate,
+          momentum=momentum,
+          weight_decay=weight_decay,
+          skip_list=['batch_normalization', 'bias'])
+    else:
+      optimizer = tf.compat.v1.train.MomentumOptimizer(
+          learning_rate=learning_rate,
+          momentum=momentum
+      )
+
+    ############## npu modify begin #############
+    optimizer = NPUDistributedOptimizer(optimizer)
+    ############## npu modify end ###############
+
+    fp16_implementation = getattr(flags.FLAGS, 'fp16_implementation', None)
+    if fp16_implementation == 'graph_rewrite':
+      optimizer = (
+          tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
+              optimizer, loss_scale=loss_scale))
+
+    def _dense_grad_filter(gvs):
+      """Only apply gradient updates to the final layer.
+
+      This function is used for fine tuning.
+
+      Args:
+        gvs: list of tuples with gradients and variable info
+      Returns:
+        filtered gradients so that only the dense layer remains
+      """
+      return [(g, v) for g, v in gvs if 'dense' in v.name]
+
+   # if loss_scale != 1 and fp16_implementation != 'graph_rewrite':
+    # When computing fp16 gradients, often intermediate tensor values are
+    # so small, they underflow to 0. To avoid this, we multiply the loss by
+    # loss_scale to make these tensor values loss_scale times bigger.
+    loss_scale = 512
+    scaled_grad_vars = optimizer.compute_gradients(loss * loss_scale)
+
+    if fine_tune:
+      scaled_grad_vars = _dense_grad_filter(scaled_grad_vars)
+
+    # Once the gradient computation is complete we can scale the gradients
+    # back to the correct scale before passing them to the optimizer.
+    unscaled_grad_vars = [(grad / loss_scale, var)
+                          for grad, var in scaled_grad_vars]
+    minimize_op = optimizer.apply_gradients(unscaled_grad_vars, global_step)
+    #else:
+    #  grad_vars = optimizer.compute_gradients(loss)
+    #  if fine_tune:
+    #    grad_vars = _dense_grad_filter(grad_vars)
+    #  minimize_op = optimizer.apply_gradients(grad_vars, global_step)
+
+    update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
+    train_op = tf.group(minimize_op, update_ops)
+  else:
+    train_op = None
+
+  ############## npu modify begin #############
+  #Using float32 for better performance
+  accuracy = tf.compat.v1.metrics.accuracy(tf.cast(labels,tf.float32), predictions['classes'])
+  ############## npu modify end ###############
+
+  accuracy_top_5 = tf.compat.v1.metrics.mean(
+      tf.nn.in_top_k(predictions=logits, targets=labels, k=5, name='top_5_op'))
+  metrics = {'accuracy': accuracy,
+             'accuracy_top_5': accuracy_top_5}
+
+  # Create a tensor named train_accuracy for logging purposes
+  tf.identity(accuracy[1], name='train_accuracy')
+  tf.identity(accuracy_top_5[1], name='train_accuracy_top_5')
+  tf.compat.v1.summary.scalar('train_accuracy', accuracy[1])
+  tf.compat.v1.summary.scalar('train_accuracy_top_5', accuracy_top_5[1])
+
+  return tf.estimator.EstimatorSpec(
+      mode=mode,
+      predictions=predictions,
+      loss=loss,
+      train_op=train_op,
+      eval_metric_ops=metrics)
+
+############## npu modify begin #############
+def init_npu():
+  """Initialize npu manually.
+  Returns:
+    `init_sess` npu  init session config.
+    `npu_init` npu  init ops.
+  """
+  npu_init = npu_ops.initialize_system()
+  config = tf.ConfigProto()
+
+  #npu mix precision attribute set to true when using mix precision
+  config.graph_options.rewrite_options.remapping = rewriter_config_pb2.RewriterConfig.OFF
+  custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
+  custom_op.name = "NpuOptimizer"
+  #custom_op.parameter_map["precision_mode"].b = True
+  custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision")
+  custom_op.parameter_map["use_off_line"].b = True
+
+  init_sess = tf.Session(config=config)
+  return init_sess,npu_init
+############## npu modify end ###############
+
+def resnet_main(
+    flags_obj, model_function, input_function, dataset_name, num_images, shape=None):
+  """Shared main loop for ResNet Models.
+
+  Args:
+    flags_obj: An object containing parsed flags. See define_resnet_flags()
+      for details.
+    model_function: the function that instantiates the Model and builds the
+      ops for train/eval. This will be passed directly into the estimator.
+    input_function: the function that processes the dataset and returns a
+      dataset that the estimator can train on. This will be wrapped with
+      all the relevant flags for running and passed to estimator.
+    dataset_name: the name of the dataset for training and evaluation. This is
+      used for logging purpose.
+    shape: list of ints representing the shape of the images used for training.
+      This is only used if flags_obj.export_dir is passed.
+
+  Returns:
+     Dict of results of the run.  Contains the keys `eval_results` and
+    `train_hooks`. `eval_results` contains accuracy (top_1) and accuracy_top_5.
+    `train_hooks` is a list the instances of hooks used during training.
+  """
+
+  model_helpers.apply_clean(flags.FLAGS)
+
+  # Ensures flag override logic is only executed if explicitly triggered.
+  if flags_obj.tf_gpu_thread_mode:
+    override_flags_and_set_envars_for_gpu_thread_pool(flags_obj)
+
+  # Configures cluster spec for distribution strategy.
+  num_workers = distribution_utils.configure_cluster(flags_obj.worker_hosts,
+                                                     flags_obj.task_index)
+
+  # Creates session config. allow_soft_placement = True, is required for
+  # multi-GPU and is not harmful for other modes.
+  session_config = tf.compat.v1.ConfigProto(
+      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
+      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
+      allow_soft_placement=True)
+
+  distribution_strategy = distribution_utils.get_distribution_strategy(
+      distribution_strategy=flags_obj.distribution_strategy,
+      num_gpus=flags_core.get_num_gpus(flags_obj),
+      all_reduce_alg=flags_obj.all_reduce_alg,
+      num_packs=flags_obj.num_packs)
+
+  ############## npu modify begin #############
+  # Creates a `NPURunConfig` that checkpoints every 115200 steps
+  run_config = NPURunConfig(
+        model_dir=flags_obj.model_dir,
+        session_config=session_config,
+        keep_checkpoint_max=5,
+        save_checkpoints_steps=115200,
+        enable_data_pre_proc=True,
+        iterations_per_loop=100,
+        #enable_auto_mix_precision=True,
+		precision_mode='allow_mix_precision',
+        hcom_parallel=True
+      )
+  ############## npu modify end ###############
+
+  # Initializes model with all but the dense layer from pretrained ResNet.
+  if flags_obj.pretrained_model_checkpoint_path is not None:
+    warm_start_settings = tf.estimator.WarmStartSettings(
+        flags_obj.pretrained_model_checkpoint_path,
+        vars_to_warm_start='^(?!.*dense)')
+  else:
+    warm_start_settings = None
+
+  ############## npu modify begin #############
+  # Creates a `NPUEstimator` instead of using tf.estimator.Estimator 
+  classifier = NPUEstimator(
+      model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config,
+      params={
+          'resnet_size': int(flags_obj.resnet_size),
+          'data_format': flags_obj.data_format,
+          'batch_size': flags_obj.batch_size,
+          'resnet_version': int(flags_obj.resnet_version),
+          'loss_scale': flags_core.get_loss_scale(flags_obj,
+                                                  default_for_fp16=128),
+          'dtype': flags_core.get_tf_dtype(flags_obj),
+          'fine_tune': flags_obj.fine_tune,
+          'num_workers': num_workers,
+          'num_gpus' : flags_core.get_num_gpus(flags_obj),
+      })
+  ############## npu modify end ###############
+
+  run_params = {
+      'batch_size': flags_obj.batch_size,
+      'dtype': flags_core.get_tf_dtype(flags_obj),
+      'resnet_size': flags_obj.resnet_size,
+      'resnet_version': flags_obj.resnet_version,
+      'synthetic_data': flags_obj.use_synthetic_data,
+      'train_epochs': flags_obj.train_epochs,
+      'num_workers': num_workers,
+  }
+  if flags_obj.use_synthetic_data:
+    dataset_name = dataset_name + '-synthetic'
+
+  benchmark_logger = logger.get_benchmark_logger()
+  benchmark_logger.log_run_info('resnet', dataset_name, run_params,
+                                test_id=flags_obj.benchmark_test_id)
+
+  train_hooks = hooks_helper.get_train_hooks(
+      flags_obj.hooks,
+      model_dir=flags_obj.model_dir,
+      batch_size=flags_obj.batch_size)
+
+  def input_fn_train(num_epochs, input_context=None):
+    ############## npu modify begin #############
+    # Using dtype=tf.float16 for higher data transmission performance
+    # drop_remainder currently only support true
+    # batch_size means single card batch instead of global batch size
+    return input_function(
+        is_training=True,
+        data_dir=flags_obj.data_dir,
+        batch_size=flags_obj.batch_size,
+        num_epochs=num_epochs,
+        dtype=tf.float16,
+        input_context=input_context,
+        drop_remainder=True)
+
+  def input_fn_eval():
+    # batch_size means single card batch instead of global batch size
+    # Using dtype=tf.float16 for higher data transmission performance
+    # drop_remainder currently only support true 
+    return input_function(
+        is_training=False,
+        data_dir=flags_obj.data_dir,
+        batch_size=flags_obj.batch_size,
+        num_epochs=1,
+        dtype=tf.float16,
+        drop_remainder=True)
+    ############## npu modify end ###############
+
+  train_epochs = (0 if flags_obj.eval_only or not flags_obj.train_epochs else
+                  flags_obj.train_epochs)
+
+  use_train_and_evaluate = flags_obj.use_train_and_evaluate or num_workers > 1
+  if use_train_and_evaluate:
+    train_spec = tf.estimator.TrainSpec(
+        input_fn=lambda input_context=None: input_fn_train(
+            train_epochs, input_context=input_context),
+        hooks=train_hooks,
+        max_steps=flags_obj.max_train_steps)
+    eval_spec = tf.estimator.EvalSpec(input_fn=input_fn_eval)
+    tf.compat.v1.logging.info('Starting to train and evaluate.')
+    tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
+    # tf.estimator.train_and_evalute doesn't return anything in multi-worker
+    # case.
+    eval_results = {}
+  else:
+    if train_epochs == 0:
+      # If --eval_only is set, perform a single loop with zero train epochs.
+      schedule, n_loops = [0], 1
+    else:
+      # Compute the number of times to loop while training. All but the last
+      # pass will train for `epochs_between_evals` epochs, while the last will
+      # train for the number needed to reach `training_epochs`. For instance if
+      #   train_epochs = 25 and epochs_between_evals = 10
+      # schedule will be set to [10, 10, 5]. That is to say, the loop will:
+      #   Train for 10 epochs and then evaluate.
+      #   Train for another 10 epochs and then evaluate.
+      #   Train for a final 5 epochs (to reach 25 epochs) and then evaluate.
+      n_loops = math.ceil(train_epochs / flags_obj.epochs_between_evals)
+      schedule = [flags_obj.epochs_between_evals for _ in range(int(n_loops))]
+      schedule[-1] = train_epochs - sum(schedule[:-1])  # over counting.
+
+    ############## npu modify begin #############
+    if flags_obj.max_train_steps is None:
+      flags_obj.max_train_steps = (num_images['train']/flags_obj.batch_size)/flags_core.get_num_gpus(flags_obj)
+      max_eval_steps = num_images['validation']/flags_obj.batch_size
+    else:
+      max_eval_steps = flags_obj.max_train_steps
+    ############## npu modify end #############
+    for cycle_index, num_train_epochs in enumerate(schedule):
+      tf.compat.v1.logging.info('Starting cycle: %d/%d', cycle_index,
+                                int(n_loops))
+
+      if num_train_epochs:
+        # Since we are calling classifier.train immediately in each loop, the
+        # value of num_train_epochs in the lambda function will not be changed
+        # before it is used. So it is safe to ignore the pylint error here
+        # pylint: disable=cell-var-from-loop
+        classifier.train(
+            input_fn=lambda input_context=True: input_fn_train(
+                num_train_epochs, input_context=input_context),
+            hooks=train_hooks,
+            max_steps=flags_obj.max_train_steps*(cycle_index+1))
+
+      ############## npu modify begin #############
+      # npu resorce will be destoryed When the training is over
+      # Reinitialize is needed if using hccl interface before next process
+      init_sess,npu_init=init_npu()
+      npu_shutdown = npu_ops.shutdown_system()
+      init_sess.run(npu_shutdown)
+      init_sess.run(npu_init)
+      ############## npu modify end ###############
+
+      # flags_obj.max_train_steps is generally associated with testing and
+      # profiling. As a result it is frequently called with synthetic data,
+      # which will iterate forever. Passing steps=flags_obj.max_train_steps
+      # allows the eval (which is generally unimportant in those circumstances)
+      # to terminate.  Note that eval will run for max_train_steps each loop,
+      # regardless of the global_step count.
+      tf.compat.v1.logging.info('Starting to evaluate.')
+      eval_results = classifier.evaluate(input_fn=input_fn_eval,
+                                         steps=max_eval_steps)
+      benchmark_logger.log_evaluation_result(eval_results)
+
+
+      if model_helpers.past_stop_threshold(
+          flags_obj.stop_threshold, eval_results['accuracy']):
+        break
+
+      ############## npu modify begin #############
+      # npu resorce will be destoryed when evaluate finish
+      # Reinitialize is needed before using hccl interface
+      init_sess,npu_init=init_npu()
+      npu_shutdown = npu_ops.shutdown_system()
+      init_sess.run(npu_shutdown)
+      init_sess.run(npu_init)
+      ############## npu modify end ###############
+
+  if flags_obj.export_dir is not None:
+    # Exports a saved model for the given classifier.
+    export_dtype = flags_core.get_tf_dtype(flags_obj)
+    if flags_obj.image_bytes_as_serving_input:
+      input_receiver_fn = functools.partial(
+          image_bytes_serving_input_fn, shape, dtype=export_dtype)
+    else:
+      input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
+          shape, batch_size=flags_obj.batch_size, dtype=export_dtype)
+    classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn,
+                                 strip_default_attrs=True)
+
+  ############## npu modify begin #############
+  npu_shutdown = npu_ops.shutdown_system()
+  init_sess.run(npu_shutdown)
+  ############## npu modify end ###############
+
+  stats = {}
+  stats['eval_results'] = eval_results
+  stats['train_hooks'] = train_hooks
+
+  return stats
+
+
+def define_resnet_flags(resnet_size_choices=None, dynamic_loss_scale=False,
+                        fp16_implementation=False):
+  """Add flags and validators for ResNet."""
+  flags_core.define_base(clean=True, train_epochs=True,
+                         epochs_between_evals=True, stop_threshold=True,
+                         num_gpu=True, hooks=True, export_dir=True,
+                         distribution_strategy=True)
+  flags_core.define_performance(num_parallel_calls=False,
+                                inter_op=True,
+                                intra_op=True,
+                                synthetic_data=True,
+                                dtype=True,
+                                all_reduce_alg=True,
+                                num_packs=True,
+                                tf_gpu_thread_mode=True,
+                                datasets_num_private_threads=True,
+                                dynamic_loss_scale=dynamic_loss_scale,
+                                fp16_implementation=fp16_implementation,
+                                loss_scale=True,
+                                tf_data_experimental_slack=True,
+                                max_train_steps=True)
+  flags_core.define_image()
+  flags_core.define_benchmark()
+  flags_core.define_distribution()
+  flags.adopt_module_key_flags(flags_core)
+
+  flags.DEFINE_enum(
+      name='resnet_version', short_name='rv', default='1',
+      enum_values=['1', '2'],
+      help=flags_core.help_wrap(
+          'Version of ResNet. (1 or 2) See README.md for details.'))
+  flags.DEFINE_bool(
+      name='fine_tune', short_name='ft', default=False,
+      help=flags_core.help_wrap(
+          'If True do not train any parameters except for the final layer.'))
+  flags.DEFINE_string(
+      name='pretrained_model_checkpoint_path', short_name='pmcp', default=None,
+      help=flags_core.help_wrap(
+          'If not None initialize all the network except the final layer with '
+          'these values'))
+  flags.DEFINE_boolean(
+      name='eval_only', default=False,
+      help=flags_core.help_wrap('Skip training and only perform evaluation on '
+                                'the latest checkpoint.'))
+  flags.DEFINE_boolean(
+      name='image_bytes_as_serving_input', default=False,
+      help=flags_core.help_wrap(
+          'If True exports savedmodel with serving signature that accepts '
+          'JPEG image bytes instead of a fixed size [HxWxC] tensor that '
+          'represents the image. The former is easier to use for serving at '
+          'the expense of image resize/cropping being done as part of model '
+          'inference. Note, this flag only applies to ImageNet and cannot '
+          'be used for CIFAR.'))
+  flags.DEFINE_boolean(
+      name='use_train_and_evaluate', default=False,
+      help=flags_core.help_wrap(
+          'If True, uses `tf.estimator.train_and_evaluate` for the training '
+          'and evaluation loop, instead of separate calls to `classifier.train '
+          'and `classifier.evaluate`, which is the default behavior.'))
+  flags.DEFINE_bool(
+      name='enable_lars', default=False,
+      help=flags_core.help_wrap(
+          'Enable LARS optimizer for large batch training.'))
+  flags.DEFINE_float(
+      name='label_smoothing', default=0.0,
+      help=flags_core.help_wrap(
+          'Label smoothing parameter used in the softmax_cross_entropy'))
+  flags.DEFINE_float(
+      name='weight_decay', default=1e-4,
+      help=flags_core.help_wrap(
+          'Weight decay coefficiant for l2 regularization.'))
+
+  choice_kwargs = dict(
+      name='resnet_size', short_name='rs', default='50',
+      help=flags_core.help_wrap('The size of the ResNet model to use.'))
+
+  if resnet_size_choices is None:
+    flags.DEFINE_string(**choice_kwargs)
+  else:
+    flags.DEFINE_enum(enum_values=resnet_size_choices, **choice_kwargs)
@@ -0,0 +1,581 @@
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:20.962.831 [tdt/device/../common/src/log.cpp:158][TSDaemon] begin to send heartbeat to appmon,[tdt/device/src/tsd/tsdaemon.cpp:1580:SendHeartBeatToAppMon]8462 Msg: running ok
+[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:20.963.000 [tdt/device/../common/src/log.cpp:149][TsdEVENT] send heartbeat to appmon success,[tdt/device/src/tsd/tsdaemon.cpp:1587:SendHeartBeatToAppMon]8462
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.243.682 [tdt/device/../common/src/log.cpp:158][HdcSever] drv accept an session,[tdt/device/../common/src/hdc_server.cpp:330:AcceptHdcSession]8454 Msg: running ok
+[INFO] HDC(8380,tsdaemon):2020-05-12-11:05:22.243.730 [hardware/dev_plat/../dev_plat/devhdc/hdc_core.c:1609][drvHdcSetSessionReference:1609] >>> session 55, pid 8380
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.243.752 [tdt/device/../common/src/log.cpp:158][HdcSever] drvHdcSetSessionReference success,[tdt/device/../common/src/hdc_server.cpp:342:AcceptHdcSession]8454 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.243.772 [tdt/device/../common/src/log.cpp:158][HdcSever] drv accept an session and drvHdcSetSessionReference success, sessionId=1,[tdt/device/../common/src/hdc_server.cpp:351:AcceptHdcSession]8454 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.243.788 [tdt/device/../common/src/log.cpp:158][HdcSever] accept an session sessionId=1, open recv thread,[tdt/device/../common/src/hdc_server.cpp:279:Accept]8454 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.243.823 [tdt/device/../common/src/log.cpp:158]HdcServer::AcceptConnection Start,[tdt/device/../common/src/hdc_server.cpp:310:AcceptHdcSession]8454 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.243.854 [tdt/device/../common/src/log.cpp:158]HdcServer::RecvData thread = 281470605762992,[tdt/device/../common/src/hdc_server.cpp:154:RecvData]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.243.945 [tdt/device/../common/src/log.cpp:158]tsdaemon get process sign successfully, procpid:40927 signSize:48,[tdt/device/src/tsd/tsdaemon.cpp:901:FmkToTsdMsg]30221 Msg: running ok
+[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:22.243.963 [tdt/device/../common/src/log.cpp:149][TsdEVENT]FmkToTsdMsg dev[0] msg[6] sessionId[1] realDev[0] fmkSignPid[40927] profilingMode[0] rankSize[1],[tdt/device/src/tsd/tsdaemon.cpp:905:FmkToTsdMsg]30221
+[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:22.243.982 [tdt/device/../common/src/log.cpp:149][TsdEVENT]From FMK Start >>>>>>>>>> TSD dev[0] sessionId[1] realDev[0] fmkPid[40927] rankSize[1],[tdt/device/src/tsd/tsdaemon.cpp:853:FmkToTsdMsgProc]30221
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.000 [tdt/device/../common/src/log.cpp:158][TSDaemon] isAllLastRcvThreadClean_ value:0,[tdt/device/src/tsd/tsdaemon.cpp:819:CleanAllLastRcvThreads]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.013 [tdt/device/../common/src/log.cpp:158][TSDPPCSER] JoinAllPPCRcvThreads() enter [threadSize=1]!,[tdt/device/src/tsd/ppc_server.cpp:96:JoinAllPPCRcvThreads]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.039 [tdt/device/../common/src/log.cpp:158][TSDPPCSER] JoinAllPPCRcvThreads() [ppc tid=281470588977584] [threadSize=1] [freeThreadSize=1].,[tdt/device/src/tsd/ppc_server.cpp:105:JoinAllPPCRcvThreads]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.056 [tdt/device/../common/src/log.cpp:158][TSDPPCSER] JoinAllPPCRcvThreads() [free tid=281470588977584].,[tdt/device/src/tsd/ppc_server.cpp:111:JoinAllPPCRcvThreads]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.071 [tdt/device/../common/src/log.cpp:158][TSDPPCSER] JoinAllPPCRcvThreads() Find free tid and joinable.,[tdt/device/src/tsd/ppc_server.cpp:114:JoinAllPPCRcvThreads]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.086 [tdt/device/../common/src/log.cpp:158][TSDPPCSER] JoinAllPPCRcvThreads() exit [threadSize=0] [freeThreadSize=0].,[tdt/device/src/tsd/ppc_server.cpp:129:JoinAllPPCRcvThreads]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.101 [tdt/device/../common/src/log.cpp:158][TSDaemon] CleanTsdRcvHdcThreads() enter [threadSize=2]!,[tdt/device/src/tsd/tsdaemon.cpp:333:CleanTsdRcvHdcThreads]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.120 [tdt/device/../common/src/log.cpp:158][TSDaemon] CleanTsdRcvHdcThreads() [tid=281470597370288] [threadSize=2]!,[tdt/device/src/tsd/tsdaemon.cpp:341:CleanTsdRcvHdcThreads]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.139 [tdt/device/../common/src/log.cpp:158][TSDaemon] CleanTsdRcvHdcThreads() [tid=281470572192176] [threadSize=2]!,[tdt/device/src/tsd/tsdaemon.cpp:341:CleanTsdRcvHdcThreads]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.152 [tdt/device/../common/src/log.cpp:158][TSDaemon] CleanTsdRcvHdcThreads() exit [threadSize=0]!,[tdt/device/src/tsd/tsdaemon.cpp:346:CleanTsdRcvHdcThreads]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.167 [tdt/device/../common/src/log.cpp:158][TSDaemon] CleanTsdRcvPPCThreads() enter [threadSize=2]!,[tdt/device/src/tsd/tsdaemon.cpp:356:CleanTsdRcvPPCThreads]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.181 [tdt/device/../common/src/log.cpp:158][TSDaemon] CleanTsdRcvPPCThreads() [tid=281470580584880] [threadSize=2]!,[tdt/device/src/tsd/tsdaemon.cpp:364:CleanTsdRcvPPCThreads]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.197 [tdt/device/../common/src/log.cpp:158][TSDaemon] CleanTsdRcvPPCThreads() [tid=281470563799472] [threadSize=2]!,[tdt/device/src/tsd/tsdaemon.cpp:364:CleanTsdRcvPPCThreads]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.241 [tdt/device/../common/src/log.cpp:158][TSDaemon] CleanTsdRcvPPCThreads() exit [threadSize=0]!,[tdt/device/src/tsd/tsdaemon.cpp:369:CleanTsdRcvPPCThreads]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.259 [tdt/device/../common/src/log.cpp:158][TSDaemon] StartSubProcess deviceId: 0, fmkPid: 40927, sessionId: 1, state: 0,[tdt/device/src/tsd/tsdaemon.cpp:630:StartSubProcess]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.273 [tdt/device/../common/src/log.cpp:158][TSDaemon] StartSubProcess rankSize: 1,,[tdt/device/src/tsd/tsdaemon.cpp:635:StartSubProcess]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.285 [tdt/device/../common/src/log.cpp:158][TSDaemon] Process HCCP is abandoned to start, the rank size is 1,[tdt/device/src/tsd/tsdaemon.cpp:651:StartSubProcess]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.337 [tdt/device/../common/src/log.cpp:158][TSDaemon] start delete file, direct is /home/HwHiAiUser/hdcd/device0/,[tdt/device/src/tsd/tsdaemon.cpp:1878:DeleteFileByPath]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.367 [tdt/device/../common/src/log.cpp:158][TSDaemon] start scan file, ent name is .,[tdt/device/src/tsd/tsdaemon.cpp:1887:DeleteFileByPath]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.382 [tdt/device/../common/src/log.cpp:158][TSDaemon] start scan file, ent name is ..,[tdt/device/src/tsd/tsdaemon.cpp:1887:DeleteFileByPath]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.398 [tdt/device/../common/src/log.cpp:158][TSDaemon] start scan file, ent name is etc,[tdt/device/src/tsd/tsdaemon.cpp:1887:DeleteFileByPath]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.412 [tdt/device/../common/src/log.cpp:158][TSDaemon] start scan file, ent name is upgrade,[tdt/device/src/tsd/tsdaemon.cpp:1887:DeleteFileByPath]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.457 [tdt/device/../common/src/log.cpp:158][TSDaemon] ExecuteStart() [tid=281470563799472]!,[tdt/device/src/tsd/tsdaemon.cpp:477:ExecuteStart]30222 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.477 [tdt/device/../common/src/log.cpp:158][TSDaemon] check pathName[/var/aicpu_scheduler], pathLen[20] procName[aicpu_scheduler], len[15], MAX_LEN[256] ,[tdt/device/src/tsd/tsdaemon.cpp:1514:CheckProcessInputParam]30222 Msg: running ok
+[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:22.245.278 [tdt/device/../common/src/log.cpp:149][TsdEVENT]#### Start TSD->SubProcess[PROC] Start Msg Device[0] Proc[aicpu_scheduler] fmkPid[40927] fatherPid[8380] subPid[30223] #### profilingMode is[0],[tdt/device/src/tsd/tsdaemon.cpp:498:ExecuteStart]30222
+[OPLOG] TDT(8380,tsdaemon):2020-05-12-11:05:22.245.328 [tdt/device/../common/src/log.cpp:151][tdt/device/src/tsd/tsdaemon.cpp:499:ExecuteStart]30222 alloc resource {devOS:[30223]} for {dev:0}
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.245.390 [tdt/device/../common/src/log.cpp:158][TSDaemon] SetTsdToFmkMsg:deviceId[0], sessionId[1], subProcPid[30223],[tdt/device/src/tsd/tsdaemon.cpp:774:SetTsdToFmkMsg]30222 Msg: running ok
+[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:22.279.681 [hardware/dev_plat/../dev_plat/devhdc/hdc_cfg_parse.c:190][CfgFileOpen:190] >>> /etc/hdcBasic.cfg not exist
+[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:22.279.712 [hardware/dev_plat/../dev_plat/devhdc/hdc_core.c:554][hdcInit:554] >>> HDC pcie init,use default segment(524288)
+[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:22.279.765 [hardware/dev_plat/../dev_plat/devhdc/hdc_core.c:539][hdcPcieInit:539] >>> after init hdc segment 524224, socket segment 0
+[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:22.279.780 [hardware/dev_plat/../dev_plat/devhdc/hdc_core.c:574][hdcInit:574] >>> HDC init success.
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.281.768 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.646658]  [hdcdrv] [hdcdrv_config 2288] <aicpu_scheduler:30223> pid 30223 use segment 524224.
+[WARNING] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.284.329 [tdt/device/../common/src/log.cpp:143]Register data type failed: hiaiSerializeFunc is existed,[tdt/common/common_inc/data_type_reg.h:192:Register]30223 Msg: func has already existed
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.284.723 [aicpu/aicpu_device/aicpu_schedule/compute_process/main.cc:185][AICPUFW] [main 185] Compute process(cloud) compile time is 02:13:48 Apr 26 2020
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.284.751 [aicpu/aicpu_device/aicpu_schedule/compute_process/main.cc:168][AICPUFW] [ParseArgs 168] Parse args success. deviceId=0, pid=40927, pidSign=e9b203cc443d80564c8c88a0d111cb95145fae36b00d1ec1, profilingMode=0.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.284.960 [hardware/dev_plat/../dev_core/devdrv/devdrv_container.c:193][devdrv] [devdrv_do_container 193] para.num(4).
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.284.983 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:83][AICPUFW] [Start 83] Aicpu_scheduler will start, hostpid=40927, deviceId=0, hostDeviceId = 0.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.284.999 [hardware/dev_plat/aicpufw/aicpufw_api.c:125][AICPUFW] [drvDevBindPid 125] drvDevBindPid enter: chip_id = 0, hostpid=40927, mode=0.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.285.015 [hardware/dev_plat/aicpufw/aicpufw_dev.c:348][AICPUFW] [aicpufw_dev_bind_pid 348] chip0 aicpu bind pid (40927).
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.285.805 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.651711]  [devdrv] [devdrv_manager_container_init_devlist_ns 1139] <aicpu_scheduler:30223> num(0), dev(0)
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.285.825 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.651716]  [devdrv] [devdrv_manager_container_init_devlist_ns 1139] <aicpu_scheduler:30223> num(1), dev(1)
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.285.836 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.651719]  [devdrv] [devdrv_manager_container_init_devlist_ns 1139] <aicpu_scheduler:30223> num(2), dev(2)
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.285.846 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.651722]  [devdrv] [devdrv_manager_container_init_devlist_ns 1139] <aicpu_scheduler:30223> num(3), dev(3)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.370.966 [hardware/dev_plat/aicpufw/aicpufw_dev.c:76][AICPUFW] [aicpufw_dev_open 76] chip_id:0 is opened success, fd=18.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.371.155 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:283][AICPUFW] [InitFd 283] InitFd begin, deviceId=0.
+[INFO] DP(30223,aicpu_scheduler):2020-05-12-11:05:22.371.186 [datapreprocess/src/task_queue.cc:58][DP_PREPROCESS] [I] [datapreprocess/src/task_queue.cc:58] Begin create task queue eventfd.
+[INFO] DP(30223,aicpu_scheduler):2020-05-12-11:05:22.371.209 [datapreprocess/src/task_queue.cc:70][DP_PREPROCESS] [I] [datapreprocess/src/task_queue.cc:70] End create task queue eventfd 19.
+[INFO] DP(30223,aicpu_scheduler):2020-05-12-11:05:22.371.224 [datapreprocess/src/task_queue.cc:58][DP_PREPROCESS] [I] [datapreprocess/src/task_queue.cc:58] Begin create task queue eventfd.
+[INFO] DP(30223,aicpu_scheduler):2020-05-12-11:05:22.371.241 [datapreprocess/src/task_queue.cc:70][DP_PREPROCESS] [I] [datapreprocess/src/task_queue.cc:70] End create task queue eventfd 20.
+[TRACE] DP(30223,aicpu_scheduler):2020-05-12-11:05:22.371.256 [status:START] [datapreprocess/src/task_queue.cc:262]DP_PREPROCESS module has been initialized
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.371.275 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:315][AICPUFW] [InitFd 315] InitFd end, deviceId=0.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.371.288 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:100][AICPUFW] [Start 100] Begin start aicpu work task, deviceId=0, hostpid=40927.
+[INFO] DEVMM(30223,aicpu_scheduler):2020-05-12-11:05:22.371.300 [hardware/dev_plat/../dev_plat/devmm/agentmm/agentmm_svm.c:137][drvMemInitSvmDevice 137] <curpid:30223,0xfe8dc010> init svm start pid=40927.
+[INFO] DEVMM(30223,aicpu_scheduler):2020-05-12-11:05:22.372.570 [hardware/dev_plat/../dev_plat/devmm/agentmm/agentmm_svm.c:120][devmm_init_svm_device 120] <curpid:30223,0xfe8dc010> init svm (hpid:40927) succ.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.593 [hardware/dev_plat/aicpufw/aicpufw_api.c:89][AICPUFW] [drvCreateAicpuWorkTasks 89] drvCreateAicpuWorkTasks enter: chip_id = 0, pid=40927, mode=0.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.610 [hardware/dev_plat/aicpufw/aicpufw_api.c:103][AICPUFW] [drvCreateAicpuWorkTasks 103] chip[0] start load kernel serve, pid=40927.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.739 [hardware/dev_plat/aicpufw/aicpufw_dev.c:226][AICPUFW] [aicpufw_dev_register_pid 226] chip0 register pid (40927).
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.839 [hardware/dev_plat/../dev_core/devdrv/devdrv_manager.c:838][devdrv] [drvGetCpuInfo 838] Dev[1] cpu info:1 14 1 4 0 
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.875 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65531)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.891 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65515)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.902 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65530)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.914 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65514)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.923 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65529)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.935 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65513)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.943 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65528)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.955 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65512)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.964 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65527)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.976 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65511)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.985 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65526)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.998 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65510)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.007 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65525)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.019 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65509)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.029 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65524)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.041 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65508)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.050 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65523)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.061 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65507)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.070 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65522)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.081 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65506)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.090 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65521)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.101 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65505)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.110 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65520)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.122 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65504)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.130 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65519)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.142 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65503)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.150 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65518)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.162 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65502)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.170 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(0)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.182 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(0)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.191 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(0)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.202 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(0)
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.215 [hardware/dev_plat/aicpufw/aicpufw_dev.c:142][AICPUFW] [aicpufw_dev_mmap 142] mmap opened fd=18.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.227 [hardware/dev_plat/aicpufw/aicpufw_dev.c:152][AICPUFW] [aicpufw_dev_mmap 152] start mmap, fd=18, offset: 4096, size: 258048.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.255 [hardware/dev_plat/aicpufw/aicpufw_dev.c:155][AICPUFW] [aicpufw_dev_mmap 155] finish mmap, fd=18, offset: 4096, size: 258048, addr: 0x0xfffefe877000.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.270 [hardware/dev_plat/aicpufw/aicpufw_thread.c:121][AICPUFW] [aicpufw_thread_data_init 121] chip[0] ts[0] finish mmap sram_offset[4096] sram_size[258048], ret[0]
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.287 [hardware/dev_plat/aicpufw/aicpufw_dev.c:142][AICPUFW] [aicpufw_dev_mmap 142] mmap opened fd=18.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.297 [hardware/dev_plat/aicpufw/aicpufw_dev.c:152][AICPUFW] [aicpufw_dev_mmap 152] start mmap, fd=18, offset: 262144, size: 1048576.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.323 [hardware/dev_plat/aicpufw/aicpufw_dev.c:155][AICPUFW] [aicpufw_dev_mmap 155] finish mmap, fd=18, offset: 262144, size: 1048576, addr: 0x0xfffefe777000.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.338 [hardware/dev_plat/aicpufw/aicpufw_thread.c:142][AICPUFW] [aicpufw_thread_data_init 142] chip[0] chip_info.chip_id is 0x6528.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.354 [hardware/dev_plat/aicpufw/aicpufw_dev.c:142][AICPUFW] [aicpufw_dev_mmap 142] mmap opened fd=18.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.368 [hardware/dev_plat/aicpufw/aicpufw_dev.c:152][AICPUFW] [aicpufw_dev_mmap 152] start mmap, fd=18, offset: 1310720, size: 4096.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.385 [hardware/dev_plat/aicpufw/aicpufw_dev.c:155][AICPUFW] [aicpufw_dev_mmap 155] finish mmap, fd=18, offset: 1310720, size: 4096, addr: 0x0xfffeffffa000.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.400 [hardware/dev_plat/aicpufw/aicpufw_thread.c:165][AICPUFW] [aicpufw_thread_data_init 165] finish mmap chip[0] ts[0] sram[0x0xfffefe877000]
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.420 [hardware/dev_plat/aicpufw/aicpufw_thread.c:710][AICPUFW] [aicpufw_thread_create 710] chip0 aicpu num: 14, first_aicpu: 2.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.441 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=0 begin
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.454 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:0, ts_ind:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.852 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:0, ret:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.872 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[0] id 281470656012688
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.874 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.737826]  [aicpufw-drv] [aicpufw_init_dfx 467] <aicpu_scheduler:30223>there are 2 processes open,current tgid: 30223
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.881 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=0 end ret=0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.892 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=1 begin
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.896 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.739661]  [aicpufw-drv] [aicpufw_drv_add_match_info_check 1240] <aicpu_scheduler:30223>register pid(40927) ts_index(0) monitor_is_running(1).  
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.900 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:1, ts_ind:0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.910 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.739665]  [aicpufw-drv] [aicpufw_drv_add_match_info 1282] <aicpu_scheduler:30223>register pid(40927) ts_index(0) monitor_is_running(1).  
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.921 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.739680]  [aicpufw-drv] [aicpufw_drv_get_moniter_info 1583] <aicpu_m_ioctl:8314>aicpufw event happened. dev_id:0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.930 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.739733]  [devdrv] [devdrv_manager_get_cpu_info 1927] <aicpu_scheduler:30223> aicpu_num=14, ccpu_num=1
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.941 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.740140]  [aicpufw-drv] [aicpufw_drv_mmap 1123] <aicpu_scheduler:30223>mmap_sram,ts_index=0, offset=4096, ts_size=4096.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.940 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:1, ret:0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.950 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.740143]  [aicpufw-drv] [aicpufw_drv_mmap_sram 974] <aicpu_scheduler:30223>sram status mem: virt_addr = 0xfffefe877000, tgid = 30223, size = 258048,offset = 4096, numa node = 0, ts = 0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.952 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[1] id 281470647619984
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.962 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=1 end ret=0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.964 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.740150]  [aicpufw-drv] [aicpufw_drv_mmap_sram 995] <aicpu_scheduler:30223>finish sram status mem: virt_addr = 0xfffefe877000, tgid = 30223, size = 258048,offset = 4096, numa node = 0, ts = 0, ret = 0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.973 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=2 begin
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.976 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.740210]  [aicpufw-drv] [aicpufw_drv_mmap 1127] <aicpu_scheduler:30223>mmap_gicd,ts_index=0, offset=262144, ts_size=4096, sram_size=258048.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.983 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:2, ts_ind:0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.988 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.740212]  [aicpufw-drv] [aicpufw_drv_mmap_gicd 926] <aicpu_scheduler:30223>gicd status mem: virt_addr = 0xfffefe777000,  tgid = 30223, size = 1048576,offset = 262144, numa node = 0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.999 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.740278]  [aicpufw-drv] [aicpufw_drv_mmap 1132] <aicpu_scheduler:30223>mmap_gicr,ts_index=0, offset=1310720, ts_size=4096, sram_size=258048, gicd_size=1048576.
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.374.008 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.740280]  [aicpufw-drv] [aicpufw_drv_mmap_gicr 892] <aicpu_scheduler:30223>ts gicr mem: virt_addr = 0xfffeffffa000,  tgid = 30223, size = 4096,offset = 1310720, numa node = 0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.011 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:2, ret:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.023 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[2] id 281470639227280
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.033 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=2 end ret=0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.042 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=3 begin
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.051 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:3, ts_ind:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.079 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:3, ret:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.090 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[3] id 281470630834576
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.099 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=3 end ret=0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.109 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=4 begin
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.118 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:4, ts_ind:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.154 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:4, ret:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.168 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[4] id 281470622441872
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.178 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=4 end ret=0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.190 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=5 begin
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.199 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:5, ts_ind:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.230 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:5, ret:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.243 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[5] id 281470614049168
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.253 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=5 end ret=0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.265 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=6 begin
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.275 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:6, ts_ind:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.307 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:6, ret:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.321 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[6] id 281470605656464
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.330 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=6 end ret=0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.342 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=7 begin
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.351 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:7, ts_ind:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.381 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:7, ret:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.395 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[7] id 281470597263760
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.404 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=7 end ret=0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.415 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=8 begin
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.425 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:8, ts_ind:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.455 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:8, ret:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.469 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[8] id 281470588871056
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.478 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=8 end ret=0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.490 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=9 begin
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.499 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:9, ts_ind:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.528 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:9, ret:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.541 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[9] id 281470580478352
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.550 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=9 end ret=0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.561 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=10 begin
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.571 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:10, ts_ind:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.599 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:10, ret:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.612 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[10] id 281470572085648
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.621 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=10 end ret=0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.633 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=11 begin
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.642 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:11, ts_ind:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.676 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:11, ret:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.690 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[11] id 281470563692944
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.699 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=11 end ret=0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.711 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=12 begin
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.720 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:12, ts_ind:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.748 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:12, ret:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.762 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[12] id 281470555300240
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.771 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=12 end ret=0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.783 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=13 begin
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.792 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:13, ts_ind:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.820 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:13, ret:0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.833 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[13] id 281470546907536
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.842 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=13 end ret=0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.854 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.864 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.876 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index1
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.885 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index1
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.897 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index2
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.907 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index2
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.918 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index3
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.928 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index3
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.940 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index4
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.949 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index4
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.961 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index5
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.970 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index5
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.982 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index6
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.992 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index6
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.003 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index7
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.012 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index7
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.024 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index8
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.033 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index8
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.045 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index9
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.054 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index9
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.065 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index10
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.075 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index10
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.086 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index11
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.095 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index11
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.107 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index12
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.117 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index12
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.129 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index13
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.138 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index13
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.150 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.217 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[ 5] begin, tid: 30230.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.329 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[ 6] begin, tid: 30231.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.366 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 7, chip 0, ts 0, aicpu_index 5, thread id 30230
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.411 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 5
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.443 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[ 3] begin, tid: 30228.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.503 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[ 8] begin, tid: 30233.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.550 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[ 9] begin, tid: 30234.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.602 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[ 4] begin, tid: 30229.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.634 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 8, chip 0, ts 0, aicpu_index 6, thread id 30231
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.673 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 6
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.690 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 6
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.706 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 6
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.721 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 6
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.741 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 5
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.762 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 5
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.776 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 5
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.799 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 11, chip 0, ts 0, aicpu_index 9, thread id 30234
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.824 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 9
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.841 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 9
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.855 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 9
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.870 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 9
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.900 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[ 0] begin, tid: 30225.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.963 [hardware/dev_plat/../dev_core/devdrv/devdrv_aicpu.c:449][devdrv] [devdrv_load_kernel_serve_thread 449] thread for load kernel start.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.027 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[11] begin, tid: 30236.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.081 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[12] begin, tid: 30237.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.127 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[ 7] begin, tid: 30232.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.179 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[13] begin, tid: 30238.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.204 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 5, chip 0, ts 0, aicpu_index 3, thread id 30228
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.231 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 3
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.247 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 3
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.259 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 3
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.273 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 3
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.291 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 6, chip 0, ts 0, aicpu_index 4, thread id 30229
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.316 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 4
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.332 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 4
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.346 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 4
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.362 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 4
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.391 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 15, chip 0, ts 0, aicpu_index 13, thread id 30238
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.431 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 13
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.447 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 13
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.462 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 13
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.477 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 13
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.500 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 2, chip 0, ts 0, aicpu_index 0, thread id 30225
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.528 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.545 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.559 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.578 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.604 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[10] begin, tid: 30235.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.657 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[ 2] begin, tid: 30227.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.706 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[ 1] begin, tid: 30226.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.731 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 13, chip 0, ts 0, aicpu_index 11, thread id 30236
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.760 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 3, chip 0, ts 0, aicpu_index 1, thread id 30226
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.785 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 1
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.801 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 1
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.815 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 1
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.830 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 1
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.851 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 4, chip 0, ts 0, aicpu_index 2, thread id 30227
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.878 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 2
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.894 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 2
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.908 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 2
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.925 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 2
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.945 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index0
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.963 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index1
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.978 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index1
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.993 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index2
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.008 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index2
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.023 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index3
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.039 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index3
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.054 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index4
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.070 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index4
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.085 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index5
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.099 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index5
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.114 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index6
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.130 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index6
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.145 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index7
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.168 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 9, chip 0, ts 0, aicpu_index 7, thread id 30232
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.195 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 7
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.211 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 7
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.225 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 7
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.242 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 7
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.261 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 10, chip 0, ts 0, aicpu_index 8, thread id 30233
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.284 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 8
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.300 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 8
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.315 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 8
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.330 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 8
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.351 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 12, chip 0, ts 0, aicpu_index 10, thread id 30235
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.377 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 10
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.393 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 10
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.407 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 10
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.422 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 10
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.441 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 14, chip 0, ts 0, aicpu_index 12, thread id 30237
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.465 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 12
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.481 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 12
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.496 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 12
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.510 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 12
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.527 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index7
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.543 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index8
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.558 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index8
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.572 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index9
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.587 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index9
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.602 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index10
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.618 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index10
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.633 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index11
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.652 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 11
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.668 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 11
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.692 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 11
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.703 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 11
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.713 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index11
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.725 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index12
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.734 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index12
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.742 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index13
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.749 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index13
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.761 [hardware/dev_plat/aicpufw/aicpufw_timer.c:133][AICPUFW] [aicpufw_timer_init 133] aicpufw_timer_init end, AICPU_TASK_TIMEOUT=30.
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.377.775 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.743120]  [devdrv] [devdrv_manager_get_kernel_lib_process 198] <devdrv_load_ser:30223> begin to get kernel lib, device pid: 30223.
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.377.790 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.743123]  [devdrv] [devdrv_manager_get_kernel_lib_process 211] <devdrv_load_ser:30223> host_pid: 40927.
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.377.799 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.743131]  [devdrv] [devdrv_manager_get_kernel_lib_process 247] <devdrv_load_ser:30223> begin to wait, host pid: 40927, device pid: 30223.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.833 [hardware/dev_plat/aicpufw/aicpufw_supervisor_thread.c:173][AICPUFW] [aicpufw_sup_thread_proc 173] supervisor thread begin, current tid:30239 thread
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.854 [hardware/dev_plat/aicpufw/aicpufw_thread.c:797][AICPUFW] [aicpufw_thread_init 797] chip_id 0 thread init end.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.866 [hardware/dev_plat/aicpufw/aicpufw_api.c:77][AICPUFW] [aicpufw_create_work_tasks 77] drvCreateAicpuWorkTasks exit: chip_id = 0, pid=40927, mode=0.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.892 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:233][AICPUFW] [StartTdtServer 233] Start tdt server, deviceId=0.
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.377.932 [tdt/device/../common/src/log.cpp:158]BindCpu init and bindCoreList size is 1,[tdt/device/src/hdc/bind_cpu.cpp:38:Init]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.034 [tdt/device/../common/src/log.cpp:158]BindCpu cpu core num = 64,[tdt/device/src/hdc/bind_cpu.cpp:40:Init]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.051 [tdt/device/../common/src/log.cpp:158]BindCoreList member is 1,[tdt/device/src/hdc/bind_cpu.cpp:42:Init]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.075 [tdt/device/../common/src/log.cpp:158]Begin to Init tdtserver [devicID_=0] and [newInputDeviceId=0],[tdt/device/src/hdc/tdt_server_impl.cpp:664:Init]30223 Msg: running ok
+[WARNING] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.110 [tdt/device/../common/src/log.cpp:143]{"Start":"TDT_RECV"},[tdt/device/../common/src/statistic.cpp:113:PeriodStatisticManager]30223 Msg: warnging
+[WARNING] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.128 [tdt/device/../common/src/log.cpp:143]{"Start":"DP_ENQUEUE"},[tdt/device/../common/src/statistic.cpp:114:PeriodStatisticManager]30223 Msg: warnging
+[WARNING] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.147 [tdt/device/../common/src/log.cpp:143]{"Start":"RECV_ENLARGE"},[tdt/device/../common/src/statistic.cpp:115:PeriodStatisticManager]30223 Msg: warnging
+[WARNING] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.167 [tdt/device/../common/src/log.cpp:143]{"Start":"RECV_REDUCE"},[tdt/device/../common/src/statistic.cpp:116:PeriodStatisticManager]30223 Msg: warnging
+[WARNING] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.185 [tdt/device/../common/src/log.cpp:143]{"Start":"RELEASE_DATA"},[tdt/device/../common/src/statistic.cpp:117:PeriodStatisticManager]30223 Msg: warnging
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.285 [tdt/device/../common/src/log.cpp:158]HdcServer::Init Start,[tdt/device/../common/src/hdc_server.cpp:104:Init]30223 Msg: running ok
+[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:22.378.335 [hardware/dev_plat/../dev_plat/devhdc/hdc_server.c:287][drvHdcServerCreate:287] >>> create server (listen device: 0) success
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.356 [tdt/device/../common/src/log.cpp:158]HdcCommon::InitMsgSize Start,[tdt/device/../common/src/hdc_common.cpp:28:InitMsgSize]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.374 [tdt/device/../common/src/log.cpp:158]msgMaxSize_ = 524224, msgShortHeadDataMaxSize_ = 524212 msgLongHeadDataMaxSize_ = 524200,[tdt/device/../common/src/hdc_common.cpp:42:InitMsgSize]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.410 [tdt/device/../common/src/log.cpp:158]hdcserver in tdtserver is already initialed,[tdt/device/src/hdc/tdt_server_impl.cpp:652:InitDirectly]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.428 [tdt/device/../common/src/log.cpp:158]Begin to init device's hdc channel of tdt.,[tdt/device/src/hdc/tdt_server.cpp:32:TDTServerInit]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.448 [tdt/device/../common/src/log.cpp:158]begin tdt device init, [deviceId=0],[tdt/device/src/hdc/tdt_device_impl.cpp:65:Init]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.466 [tdt/device/../common/src/log.cpp:158]TuningDataTransfer is initialized with deviceID:0,[tdt/device/src/hdc/tuning_data_transfer.cpp:71:Init]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.640 [tdt/device/../common/src/log.cpp:158]HdcCommon::InitMsgSize Start,[tdt/device/../common/src/hdc_common.cpp:28:InitMsgSize]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.657 [tdt/device/../common/src/log.cpp:158]msgMaxSize_ = 524224, msgShortHeadDataMaxSize_ = 524212 msgLongHeadDataMaxSize_ = 524200,[tdt/device/../common/src/hdc_common.cpp:42:InitMsgSize]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.673 [tdt/device/../common/src/log.cpp:158]capacity.maxSegment: 524224,[tdt/device/../common/src/hdc_client.cpp:152:Init]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.690 [tdt/device/../common/src/log.cpp:158]HdcClient::CreateHdcSession Start,[tdt/device/../common/src/hdc_client.cpp:284:CreateHdcSession]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.809 [tdt/device/../common/src/log.cpp:158]HdcServer::Accept thread = 281470521594256,[tdt/device/../common/src/hdc_server.cpp:259:Accept]30242 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.878 [tdt/device/../common/src/log.cpp:158][HdcClient] deviceId: 0 connect session,[tdt/device/../common/src/hdc_client.cpp:306:CreateHdcSession]30223 Msg: running ok
+[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:22.378.908 [hardware/dev_plat/../dev_plat/devhdc/hdc_core.c:1609][drvHdcSetSessionReference:1609] >>> session 56, pid 30223
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.926 [tdt/device/../common/src/log.cpp:158][HdcClient] drvHdcSetSessionReference success,[tdt/device/../common/src/hdc_client.cpp:317:CreateHdcSession]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.945 [tdt/device/../common/src/log.cpp:158][HdcClient] deviceId: 0 connect session and drvHdcSetSessionReference success, sessionId=1,[tdt/device/../common/src/hdc_client.cpp:327:CreateHdcSession]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.006 [tdt/device/../common/src/log.cpp:158][HdcClient] SendPidMsg sessionId=1, tdt_main_pid=30223,[tdt/device/../common/src/hdc_client.cpp:346:sendPidMsg]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.073 [tdt/device/../common/src/log.cpp:158]TuningDataTransfer tdt client wait for send data begin,[tdt/device/src/hdc/tuning_data_transfer.cpp:173:CreateSingleSession]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.101 [tdt/device/../common/src/log.cpp:158]BindCPUCore Start,[tdt/device/src/hdc/bind_cpu.cpp:58:BindCPUCore]30242 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.213 [tdt/device/../common/src/log.cpp:158]BindCpu thread=281470521594256 CPU_ISSET successfully on processor[1],[tdt/device/src/hdc/bind_cpu.cpp:103:BindCPUCore]30242 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.239 [tdt/device/../common/src/log.cpp:158]Thread[281470521594256] bindCpu setaffinity successfully,[tdt/device/src/hdc/bind_cpu.cpp:109:BindCPUCore]30242 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.282 [tdt/device/../common/src/log.cpp:158]Free Tdt thread ID = 281470529986960,[tdt/device/src/hdc/tdt_server_impl.cpp:555:FreeTdtMemoryThread]30241 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.304 [tdt/device/../common/src/log.cpp:158]BindCPUCore Start,[tdt/device/src/hdc/bind_cpu.cpp:58:BindCPUCore]30241 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.359 [tdt/device/../common/src/log.cpp:158]TimerFunc thread = 281470538379664,[tdt/device/../common/src/statistic.cpp:140:TimerFunc]30240 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.377 [tdt/device/../common/src/log.cpp:158]HdcServer::AcceptConnection Start,[tdt/device/../common/src/hdc_server.cpp:310:AcceptHdcSession]30242 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.403 [tdt/device/../common/src/log.cpp:158]BindCpu thread=281470529986960 CPU_ISSET successfully on processor[1],[tdt/device/src/hdc/bind_cpu.cpp:103:BindCPUCore]30241 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.421 [tdt/device/../common/src/log.cpp:158]BindCPUCore Start,[tdt/device/src/hdc/bind_cpu.cpp:58:BindCPUCore]30240 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.476 [tdt/device/../common/src/log.cpp:158]BindCPUCore Start,[tdt/device/src/hdc/bind_cpu.cpp:58:BindCPUCore]30243 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.496 [tdt/device/../common/src/log.cpp:158]Thread[281470529986960] bindCpu setaffinity successfully,[tdt/device/src/hdc/bind_cpu.cpp:109:BindCPUCore]30241 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.516 [tdt/device/../common/src/log.cpp:158][deviceId=0] Free Tdt thread is success to bind cpu core,[tdt/device/src/hdc/tdt_server_impl.cpp:559:FreeTdtMemoryThread]30241 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.534 [tdt/device/../common/src/log.cpp:158]BindCpu thread=281470538379664 CPU_ISSET successfully on processor[1],[tdt/device/src/hdc/bind_cpu.cpp:103:BindCPUCore]30240 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.550 [tdt/device/../common/src/log.cpp:158]Thread[281470538379664] bindCpu setaffinity successfully,[tdt/device/src/hdc/bind_cpu.cpp:109:BindCPUCore]30240 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.576 [tdt/device/../common/src/log.cpp:158]BindCpu thread=281470513033616 CPU_ISSET successfully on processor[1],[tdt/device/src/hdc/bind_cpu.cpp:103:BindCPUCore]30243 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.591 [tdt/device/../common/src/log.cpp:158]Thread[281470513033616] bindCpu setaffinity successfully,[tdt/device/src/hdc/bind_cpu.cpp:109:BindCPUCore]30243 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.637 [tdt/device/../common/src/log.cpp:158]TuningDataTransfer find channel OK, sessionID: 1,[tdt/device/src/hdc/tuning_data_transfer.cpp:126:SetSendFlagBySessionID]30243 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.664 [tdt/device/../common/src/log.cpp:158]TuningDataTransfer tdt client wait for send data end,[tdt/device/src/hdc/tuning_data_transfer.cpp:178:CreateSingleSession]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.688 [tdt/device/../common/src/log.cpp:158]Begin to start send thread.,[tdt/device/src/hdc/tdt_device_impl.cpp:179:StartSendThread]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.732 [tdt/device/../common/src/log.cpp:158]Success start send thread.,[tdt/device/src/hdc/tdt_device_impl.cpp:181:StartSendThread]30223 Msg: running ok
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.379.746 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:265][AICPUFW] [StartTdtServer 265] TDT server init success.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.379.758 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:135][AICPUFW] [Start 135] Aicpu_scheduler has started, deviceId=0, hostpid=40927.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.379.771 [aicpu/aicpu_device/aicpu_schedule/compute_process/main.cc:206][AICPUFW] [main 206] Compute process start success.
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.793 [tdt/device/../common/src/log.cpp:158][TSDPPCCLI] TsdWaitForShutdownProc() deviceId: 0 waitType:1(0:HCCP,1:COMPUTE) is running,[tdt/device/src/tsd/ppc_client.cpp:220:TsdWaitForShutdownProc]30223 Msg: running ok
+[EVENT] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.810 [tdt/device/../common/src/log.cpp:149][PPCCLIEVENT] TsdWaitForShutdown!,[tdt/device/src/tsd/ppc_client.cpp:225:TsdWaitForShutdownProc]30223
+[EVENT] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.826 [tdt/device/../common/src/log.cpp:149][PPCCLIEVENT] TsdWaitForShutdown Start Rsp device[0] procType:1(0:HCCP,1:COMPUTE), subProcPid[30223] ,[tdt/device/src/tsd/ppc_client.cpp:244:TsdWaitForShutdownProc]30223
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.846 [tdt/device/../common/src/log.cpp:158][TSDPPCCLI] GetPpcSession() will Create session use serverId: 19280,[tdt/device/src/tsd/ppc_client.cpp:110:GetPpcSession]30223 Msg: running ok
+[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:22.379.899 [hardware/dev_plat/../dev_plat/devhdc/hdc_ppc.c:129][drvPpcSessionConnect:129] >>> Ppc connect session 26, pid 30223
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.917 [tdt/device/../common/src/log.cpp:158][TSDPPCCLI] [serverId=19280] GetPpcSession() receive thread has been started,[tdt/device/src/tsd/ppc_client.cpp:118:GetPpcSession]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.935 [tdt/device/../common/src/log.cpp:158]PpcInterface::SendMsg,size=10, subpid=30223,[tdt/device/src/tsd/ppc_interface.cpp:39:SendMsg]30223 Msg: running ok
+[INFO] HDC(8380,tsdaemon):2020-05-12-11:05:22.379.941 [hardware/dev_plat/../dev_plat/devhdc/hdc_ppc.c:268][drvPpcSessionAccept:268] >>> Ppc Accept Session 16, Server fd 12, pid 8380
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.380.006 [tdt/device/../common/src/log.cpp:158]tdt device begin to send to host.,[tdt/device/src/hdc/tdt_device_impl.cpp:275:TdtDeviceSendImpl]30244 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.380.025 [tdt/device/../common/src/log.cpp:158]begin to check queueDataSize_.,[tdt/device/src/hdc/tdt_device_impl.cpp:286:TdtDeviceSendImpl]30244 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.091 [tdt/device/../common/src/log.cpp:158]PpcInterface::RecvMsg, size=10, subpid=30223,[tdt/device/src/tsd/ppc_interface.cpp:122:RecvMsg]30245 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.114 [tdt/device/../common/src/log.cpp:158][PpcServer] SetPpcSession, deviceId:0, subProcPid:30223,[tdt/device/src/tsd/ppc_server.cpp:338:SetPpcSession]30245 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.139 [tdt/device/../common/src/log.cpp:158][TSDaemon]PPCSerToTsdMsg deviceId[0], subProcPid[30223]msgType[0](0:START RSP,2:SHUTDOWN,1:SHUTDOWN RSP,3:SOCKET CLOSE), procType[1](0:HCCP,1:COMPUTE) state[6],[tdt/device/src/tsd/tsdaemon.cpp:1441:PPCSerToTsdMsg]30245 Msg: running ok
+[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.160 [tdt/device/../common/src/log.cpp:149][TsdEVENT] #### PPCSer->TSD RspMsg device[0] Start Rsp procType[1] ####,[tdt/device/src/tsd/tsdaemon.cpp:1406:PPCSerToTsdProc]30245
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.205 [tdt/device/../common/src/log.cpp:158][TSDPPCSER] ##### [threadName:ppc_srv_recv_0] RecvData [ret=16846848] Save [notifyDeviceId:0] [notifyProcType:1]####,[tdt/device/src/tsd/ppc_server.cpp:242:RecvData]30245 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.256 [tdt/device/../common/src/log.cpp:158][TSDaemon] StartRspProc() [dev=0][subProcPid=30223][procType=1(0:HCCP,1:COMPUTE)][tid=281470572192176]!,[tdt/device/src/tsd/tsdaemon.cpp:1190:StartRspProc]30246 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.276 [tdt/device/../common/src/log.cpp:158][TSDaemon] StartRspProc curState is: 6,[tdt/device/src/tsd/tsdaemon.cpp:1193:StartRspProc]30246 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.295 [tdt/device/../common/src/log.cpp:158][TSDaemon]  GetTsdToFmkMsg deviceId[0] subProcPid[30223],[tdt/device/src/tsd/tsdaemon.cpp:959:GetTsdToFmkMsg]30246 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.314 [tdt/device/../common/src/log.cpp:158][TSDaemon] tsdToFmkSessionIdMap size = 1,[tdt/device/src/tsd/tsdaemon.cpp:964:GetTsdToFmkMsg]30246 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.331 [tdt/device/../common/src/log.cpp:158][TSDaemon] tsdToFmkSessionIdMap [deviceId] size = 1,[tdt/device/src/tsd/tsdaemon.cpp:967:GetTsdToFmkMsg]30246 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.349 [tdt/device/../common/src/log.cpp:158][TSDaemon] SendRspMsgToFmk deviceId: 0, subProcPid: 30223, sessionID: 1,[tdt/device/src/tsd/tsdaemon.cpp:1016:SendRspMsgToFmk]30246 Msg: running ok
+[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.364 [tdt/device/../common/src/log.cpp:149][TsdEVENT] #### Start Rsp TSD->FMK device[0] sessionID[1] realDeviceId[0]####,[tdt/device/src/tsd/tsdaemon.cpp:1018:SendRspMsgToFmk]30246
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.402 [tdt/device/../common/src/log.cpp:158][TSDaemon] StartRspProc subRunState is: 3,[tdt/device/src/tsd/tsdaemon.cpp:1173:TsdWaitRspProcForStar]30246 Msg: running ok
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.381.752 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.745228]  [hdcdrv] [hdcdrv_server_create 2330] <aicpu_scheduler:30223> dev_id 0 service_type service_TDT server create
+[EVENT] TDT(30223,aicpu_scheduler):2020-05-12-11:05:23.379.610 [tdt/device/../common/src/log.cpp:149],[tdt/common/common_inc/queue_manager.h:683:ShowSizeForEveryChannel]30240
+[EVENT] TDT(30223,aicpu_scheduler):2020-05-12-11:05:23.379.656 [tdt/device/../common/src/log.cpp:149]"DeviceSendPool: " "DeviceRecvPool: " "HostRecvPool: " "DeviceCtrlPool: {SendPool: 0, FreePool: 0}, {RecvPool: 0, FreePool: 0}",[tdt/device/../common/src/memory_pool.cpp:707:GetDevicePoolStatus]30240
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:23.379.673 [tdt/device/../common/src/log.cpp:158]DeviceRecNormalData: Device receive normal message number:0,[tdt/device/../common/src/memory_pool.cpp:709:GetDevicePoolStatus]30240 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.322 [tdt/device/../common/src/log.cpp:158]tsdaemon get process sign successfully, procpid:0 signSize:0,[tdt/device/src/tsd/tsdaemon.cpp:901:FmkToTsdMsg]30221 Msg: running ok
+[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.360 [tdt/device/../common/src/log.cpp:149][TsdEVENT]FmkToTsdMsg dev[0] msg[7] sessionId[1] realDev[0] fmkSignPid[0] profilingMode[0] rankSize[1],[tdt/device/src/tsd/tsdaemon.cpp:905:FmkToTsdMsg]30221
+[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.375 [tdt/device/../common/src/log.cpp:149][TsdEVENT]From FMK Close <<<<<<<<<<< TSDdev[0] sessionId[1] realDev[0] fmkPid[0],[tdt/device/src/tsd/tsdaemon.cpp:861:FmkToTsdMsgProc]30221
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.397 [tdt/device/../common/src/log.cpp:158][TSDaemon] Begin closeSubProcess deviceId:0, sessionId:1, cpProcPid:30223, hccpProcPid:0, subProcState:3,[tdt/device/src/tsd/tsdaemon.cpp:729:CloseSubProcess]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.414 [tdt/device/../common/src/log.cpp:158][TSDaemon] SetTsdToFmkMsg:deviceId[0], sessionId[1], subProcPid[30223],[tdt/device/src/tsd/tsdaemon.cpp:774:SetTsdToFmkMsg]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.429 [tdt/device/../common/src/log.cpp:158][TSDaemon] Process HCCP is abandoned to close, the rank size is 1,[tdt/device/src/tsd/tsdaemon.cpp:747:CloseSubProcess]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.473 [tdt/device/../common/src/log.cpp:158][TSDaemon] start delete file, direct is /home/HwHiAiUser/hdcd/device0/,[tdt/device/src/tsd/tsdaemon.cpp:1878:DeleteFileByPath]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.509 [tdt/device/../common/src/log.cpp:158][TSDaemon] start scan file, ent name is .,[tdt/device/src/tsd/tsdaemon.cpp:1887:DeleteFileByPath]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.524 [tdt/device/../common/src/log.cpp:158][TSDaemon] start scan file, ent name is ..,[tdt/device/src/tsd/tsdaemon.cpp:1887:DeleteFileByPath]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.541 [tdt/device/../common/src/log.cpp:158][TSDaemon] start scan file, ent name is etc,[tdt/device/src/tsd/tsdaemon.cpp:1887:DeleteFileByPath]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.560 [tdt/device/../common/src/log.cpp:158][TSDaemon] start scan file, ent name is upgrade,[tdt/device/src/tsd/tsdaemon.cpp:1887:DeleteFileByPath]30221 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.624 [tdt/device/../common/src/log.cpp:158][PpcServer] GetPpcSession, deviceId:0, subProcPid:30223,[tdt/device/src/tsd/ppc_server.cpp:324:GetPpcSession]30255 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.648 [tdt/device/../common/src/log.cpp:158]PpcInterface::SendMsg,size=12, subpid=30223,[tdt/device/src/tsd/ppc_interface.cpp:39:SendMsg]30255 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.686 [tdt/device/../common/src/log.cpp:158][TSDaemon] Begin ExecuteClose deviceId:0, subProcPid:30223,[tdt/device/src/tsd/tsdaemon.cpp:703:ExecuteClose]30255 Msg: running ok
+[OPLOG] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.704 [tdt/device/../common/src/log.cpp:151][tdt/device/src/tsd/tsdaemon.cpp:705:ExecuteClose]30255 free resource {devOS:[30223]} for {dev:0}
+[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.731 [tdt/device/../common/src/log.cpp:149][TsdEVENT] #### Send TSD->SubProcess[PPCSer] Close Msg Device[0] proType[1] [tid=281470597370288]####,[tdt/device/src/tsd/tsdaemon.cpp:709:ExecuteClose]30255
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.063.728 [tdt/device/../common/src/log.cpp:158]PpcInterface::RecvMsg, size=12, subpid=30223,[tdt/device/src/tsd/ppc_interface.cpp:122:RecvMsg]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.063.755 [tdt/device/../common/src/log.cpp:158]PpcClient::RecvMsgProc, size=12, subpid=30223,[tdt/device/src/tsd/ppc_client.cpp:150:RecvMsgProc]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.063.773 [tdt/device/../common/src/log.cpp:158]PpcClient::RecvMsgProc, subpid1=30223, subpid2=30223,[tdt/device/src/tsd/ppc_client.cpp:160:RecvMsgProc]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.063.789 [tdt/device/../common/src/log.cpp:158]PpcInterface::SendMsg,size=12, subpid=30223,[tdt/device/src/tsd/ppc_interface.cpp:39:SendMsg]30223 Msg: running ok
+[EVENT] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.063.816 [tdt/device/../common/src/log.cpp:149][PPCCLIEVENT] #### PPCClient->TSD Close Rsp send OK device[0] procType:1 ####,[tdt/device/src/tsd/ppc_client.cpp:166:RecvMsgProc]30223
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.063.832 [tdt/device/../common/src/log.cpp:158][PPCClient]Process Exit DevId:0 procType:1(0:HCCP,1:COMPUTE),[tdt/device/src/tsd/ppc_client.cpp:200:RecvData]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.063.848 [tdt/device/../common/src/log.cpp:158]TsdWaitForShutdown exit,[tdt/device/src/tsd/ppc_client.cpp:274:TsdWaitForShutdown]30223 Msg: running ok
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:24.063.861 [aicpu/aicpu_device/aicpu_schedule/compute_process/main.cc:214][AICPUFW] [main 214] Tsd wait for shut down success.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:24.063.872 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:272][AICPUFW] [StopTdtServer 272] Stop tdt server, deviceId=0.
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.987 [tdt/device/../common/src/log.cpp:158]PpcInterface::RecvMsg, size=12, subpid=30223,[tdt/device/src/tsd/ppc_interface.cpp:122:RecvMsg]30245 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.025 [tdt/device/../common/src/log.cpp:158][PpcServer] SetPpcSession, deviceId:0, subProcPid:30223,[tdt/device/src/tsd/ppc_server.cpp:338:SetPpcSession]30245 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.057 [tdt/device/../common/src/log.cpp:158][TSDaemon]PPCSerToTsdMsg deviceId[0], subProcPid[30223]msgType[1](0:START RSP,2:SHUTDOWN,1:SHUTDOWN RSP,3:SOCKET CLOSE), procType[1](0:HCCP,1:COMPUTE) state[6],[tdt/device/src/tsd/tsdaemon.cpp:1441:PPCSerToTsdMsg]30245 Msg: running ok
+[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.078 [tdt/device/../common/src/log.cpp:149][TsdEVENT] #### PPCSer->TSD RspMsg device[0] Close Rsp procType[1] ####,[tdt/device/src/tsd/tsdaemon.cpp:1415:PPCSerToTsdProc]30245
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.195 [tdt/device/../common/src/log.cpp:158][TSDPPCSER] ##### [threadName:ppc_srv_recv_0] RecvData [ret=16846848] Save [notifyDeviceId:0] [notifyProcType:1]####,[tdt/device/src/tsd/ppc_server.cpp:242:RecvData]30245 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.064.227 [tdt/device/../common/src/log.cpp:158]tdtserver is destroying, [devicID_=0],[tdt/device/src/hdc/tdt_server_impl.cpp:709:Destroy]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.064.248 [tdt/device/../common/src/log.cpp:158]Stop QueueManager success,[tdt/device/src/hdc/tdt_server_impl.cpp:716:Destroy]30223 Msg: running ok
+[INFO] DP(30223,aicpu_scheduler):2020-05-12-11:05:24.064.263 [datapreprocess/src/dp_interface.cc:288][DP_PREPROCESS] [I] [datapreprocess/src/dp_interface.cc:288] Release blocked TDT threads.
+[INFO] DP(30223,aicpu_scheduler):2020-05-12-11:05:24.064.280 [datapreprocess/src/dp_interface.cc:406][DP_PREPROCESS] [I] [datapreprocess/src/dp_interface.cc:406] Begin write queue blocking eventfd of source().
+[INFO] DP(30223,aicpu_scheduler):2020-05-12-11:05:24.064.294 [datapreprocess/src/dp_interface.cc:409][DP_PREPROCESS] [I] [datapreprocess/src/dp_interface.cc:409] Got empty source name, start write all blocking eventfd.
+[INFO] DP(30223,aicpu_scheduler):2020-05-12-11:05:24.064.305 [datapreprocess/src/dp_interface.cc:291][DP_PREPROCESS] [I] [datapreprocess/src/dp_interface.cc:291] All TDT threads mark info memory have released.
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.064.320 [tdt/device/../common/src/log.cpp:158]DataPreprocess exited,[tdt/device/src/hdc/tdt_server_impl.cpp:720:Destroy]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.064.338 [tdt/device/../common/src/log.cpp:158]Enqueue Thread exited,[tdt/device/src/hdc/tdt_server_impl.cpp:722:Destroy]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.064.368 [tdt/device/../common/src/log.cpp:158][deviceId=0] Free Tdt thread is exist,total free number = 0,enqueue number is = 0,[tdt/device/src/hdc/tdt_server_impl.cpp:570:FreeTdtMemoryThread]30241 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.064.412 [tdt/device/../common/src/log.cpp:158]Free Thread exited,[tdt/device/src/hdc/tdt_server_impl.cpp:725:Destroy]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.064.428 [tdt/device/../common/src/log.cpp:158]enter HdcServer::Destroy() function,[tdt/device/../common/src/hdc_server.cpp:582:Destroy]30223 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.462 [tdt/device/../common/src/log.cpp:158][TSDaemon] CloseRspProc() [dev=0][subPid=30223][procType=1(0:HCCP,1:COMPUTE)][tid=281470588977584]!,[tdt/device/src/tsd/tsdaemon.cpp:1267:CloseRspProc]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.485 [tdt/device/../common/src/log.cpp:158][TSDaemon] CloseRspProc curState is: 6,[tdt/device/src/tsd/tsdaemon.cpp:1271:CloseRspProc]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.508 [tdt/device/../common/src/log.cpp:158][TSDaemon]  GetTsdToFmkMsg deviceId[0] subProcPid[30223],[tdt/device/src/tsd/tsdaemon.cpp:959:GetTsdToFmkMsg]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.527 [tdt/device/../common/src/log.cpp:158][TSDaemon] tsdToFmkSessionIdMap size = 1,[tdt/device/src/tsd/tsdaemon.cpp:964:GetTsdToFmkMsg]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.544 [tdt/device/../common/src/log.cpp:158][TSDaemon] tsdToFmkSessionIdMap [deviceId] size = 1,[tdt/device/src/tsd/tsdaemon.cpp:967:GetTsdToFmkMsg]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.568 [tdt/device/../common/src/log.cpp:158][TSDaemon] subProcessPid is 30223 on device[0], procType[1](0:HCCP PROC,1:COMPUTE PROC),[tdt/device/src/tsd/tsdaemon.cpp:1085:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.588 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:24.074.535 [hardware/dev_plat/../dev_plat/devhdc/hdc_server.c:367][drvHdcServerDestroy:367] >>> destroy server success, deviceId 0, serviceType 10
+[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:24.074.556 [hardware/dev_plat/../dev_plat/devhdc/hdc_server.c:236][drvHdcPcieSessionAccept:236] >>> device:0 server 10 is destroyed, ret:18
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.575 [tdt/device/../common/src/log.cpp:158]drv accept exception, because acceptSwitch has been set false, ret=18,[tdt/device/../common/src/hdc_server.cpp:324:AcceptHdcSession]30242 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.592 [tdt/device/../common/src/log.cpp:158]acceptSwitch has been set false,[tdt/device/../common/src/hdc_server.cpp:269:Accept]30242 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.634 [tdt/device/../common/src/log.cpp:158][HdcServer] SessionIdPidMsg enter into ClearSessionIdPid,[tdt/device/../common/src/hdc_server.cpp:495:ClearSessionIdPid]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.658 [tdt/device/../common/src/log.cpp:158]Begin StopMemoryPool,[tdt/device/../common/src/memory_pool.cpp:3389:StopMemoryPool]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.685 [tdt/device/../common/src/log.cpp:158]DestroyMemoryPool, memoryEnd = 2,[tdt/device/../common/src/memory_pool.cpp:1323:DestroyMemoryPool]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.703 [tdt/device/../common/src/log.cpp:158]FreeMemoryByMap, memoryEnd = 2, memoryType = 1, devId_ = 0,[tdt/device/../common/src/memory_pool.cpp:1131:FreeMemoryByMap]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.721 [tdt/device/../common/src/log.cpp:158]FreeMemoryByMap, memoryEnd = 2, memoryType = 1, devId_ = 0,[tdt/device/../common/src/memory_pool.cpp:1131:FreeMemoryByMap]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.743 [tdt/device/../common/src/log.cpp:158]hdcServer_ destroyed,[tdt/device/src/hdc/tdt_server_impl.cpp:735:Destroy]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.759 [tdt/device/../common/src/log.cpp:158]Begin to destroy device's hdc channel of tdt.,[tdt/device/src/hdc/tdt_server.cpp:48:TDTServerStop]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.774 [tdt/device/../common/src/log.cpp:158]begin to destroy.,[tdt/device/src/hdc/tdt_device_impl.cpp:367:Destroy]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.793 [tdt/device/../common/src/log.cpp:158]Stop QueueManager success,[tdt/device/src/hdc/tdt_device_impl.cpp:381:Destroy]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.808 [tdt/device/../common/src/log.cpp:158]Begin to stop send thread.,[tdt/device/src/hdc/tdt_device_impl.cpp:192:StopSendThread]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.866 [tdt/device/../common/src/log.cpp:158]Success stop send thread.,[tdt/device/src/hdc/tdt_device_impl.cpp:196:StopSendThread]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.885 [tdt/device/../common/src/log.cpp:158]TuningDataTransfer destory hdc client,[tdt/device/src/hdc/tuning_data_transfer.cpp:456:Destroy]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.900 [tdt/device/../common/src/log.cpp:158]enter HdcClient::Destroy() function,[tdt/device/../common/src/hdc_client.cpp:468:Destroy]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.917 [tdt/device/../common/src/log.cpp:158]begin drvHdcSessionClose,[tdt/device/../common/src/hdc_client.cpp:415:ClearAllSession]30223 Msg: running ok
+[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:24.074.930 [hardware/dev_plat/../dev_plat/devhdc/hdc_client.c:413][drvHdcClientSessionClose:413] >>> destroy client session(sock: 56)
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.960 [tdt/device/../common/src/log.cpp:158]end drvHdcSessionClose,[tdt/device/../common/src/hdc_client.cpp:420:ClearAllSession]30223 Msg: running ok
+[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:24.074.976 [hardware/dev_plat/../dev_plat/devhdc/hdc_core.c:1300][drvHdcRecvMsgLen:1300] >>> the session 56 local or remote was closed
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.996 [tdt/device/../common/src/log.cpp:158]begin HdcClient::JoinAllRecvThread,[tdt/device/../common/src/hdc_client.cpp:383:JoinAllRecvThread]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.016 [tdt/device/../common/src/log.cpp:158]drvHdcRecv() return 25,[tdt/device/../common/src/hdc_common.cpp:494:RecvHdcDefaultMsg]30243 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.075.018 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.038 [tdt/device/../common/src/log.cpp:158]Receive() return 17903651, which means : hdc service or client socket closed,[tdt/device/../common/src/hdc_common.cpp:438:RecvMsg]30243 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.057 [tdt/device/../common/src/log.cpp:158][deviceId=0][sessionId=1] recv runswitch has been set false,[tdt/device/../common/src/hdc_client.cpp:175:RecvData]30243 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.072 [tdt/device/../common/src/log.cpp:158][deviceId=0] the recv data pthread exit,[tdt/device/../common/src/hdc_client.cpp:190:RecvData]30243 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.138 [tdt/device/../common/src/log.cpp:158]end HdcClient::JoinAllRecvThread,[tdt/device/../common/src/hdc_client.cpp:392:JoinAllRecvThread]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.156 [tdt/device/../common/src/log.cpp:158]begin drvHdcClientDestroy,[tdt/device/../common/src/hdc_client.cpp:450:DestroyClient]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.236 [tdt/device/../common/src/log.cpp:158]end drvHdcClientDestroy,[tdt/device/../common/src/hdc_client.cpp:455:DestroyClient]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.252 [tdt/device/../common/src/log.cpp:158]begin HdcClient::ClearClientPtr,[tdt/device/../common/src/hdc_client.cpp:432:ClearClientPtr]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.269 [tdt/device/../common/src/log.cpp:158]end HdcClient::ClearClientPtr,[tdt/device/../common/src/hdc_client.cpp:440:ClearClientPtr]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.284 [tdt/device/../common/src/log.cpp:158]begin HdcClient::ClearAll,[tdt/device/../common/src/hdc_client.cpp:401:ClearAll]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.299 [tdt/device/../common/src/log.cpp:158]end HdcClient::ClearAll,[tdt/device/../common/src/hdc_client.cpp:404:ClearAll]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.314 [tdt/device/../common/src/log.cpp:158]end HdcClient::Destroy() function,[tdt/device/../common/src/hdc_client.cpp:476:Destroy]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.330 [tdt/device/../common/src/log.cpp:158]success destroy.,[tdt/device/src/hdc/tdt_device_impl.cpp:388:Destroy]30223 Msg: running ok
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:24.075.343 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:277][AICPUFW] [StopTdtServer 277] TDT server stop success.
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:24.075.355 [hardware/dev_plat/aicpufw/aicpufw_dev.c:94][AICPUFW] [aicpufw_dev_close 94] chip_id:0 will be closed, fd=18.
+[TRACE] DP(30223,aicpu_scheduler):2020-05-12-11:05:24.075.373 [status:STOP] [datapreprocess/src/task_queue.cc:292]DP_PREPROCESS module has been closed
+[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:24.075.389 [aicpu/aicpu_device/aicpu_schedule/compute_process/main.cc:219][AICPUFW] [main 219] Compute process stopped.
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.408 [tdt/device/../common/src/log.cpp:158]PpcClient::~PpcClient() destructor function called,[tdt/device/src/tsd/ppc_client.cpp:23:~PpcClient]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.422 [tdt/device/../common/src/log.cpp:158][TSDPPCCLI]  Destroy() enter,[tdt/device/src/tsd/ppc_client.cpp:44:Destroy]30223 Msg: running ok
+[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:24.075.437 [hardware/dev_plat/../dev_plat/devhdc/hdc_ppc.c:151][drvPpcSessionDestroy:151] >>> Ppc Destroy session 26, pid 30223
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.473 [tdt/device/../common/src/log.cpp:158][TSDPPCCLI] Destroy() call drvPpcSessionDestroy func return [drvRet:0],[tdt/device/src/tsd/ppc_client.cpp:54:Destroy]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.490 [tdt/device/../common/src/log.cpp:158][TSDPPCCLI]  Destroy() exit,[tdt/device/src/tsd/ppc_client.cpp:73:Destroy]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.508 [tdt/device/../common/src/log.cpp:158]TdtDeviceImpl::~TdtDeviceImpl() destructor function called.,[tdt/device/src/hdc/tdt_device_impl.cpp:54:~TdtDeviceImpl]30223 Msg: running ok
+[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.532 [tdt/device/../common/src/log.cpp:158]TdtServerImpl::~TdtServerImpl() destructor function called,[tdt/device/src/hdc/tdt_server_impl.cpp:71:~TdtServerImpl]30223 Msg: running ok
+[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:24.076.267 [hardware/dev_plat/../dev_plat/devhdc/hdc_core.c:2002][drv_hdc_exit:2002] >>> HDC uninit success
+[INFO] HDC(8380,tsdaemon):2020-05-12-11:05:24.077.353 [hardware/dev_plat/../dev_plat/devhdc/hdc_core.c:626][hdcSocketRecvPeek:626] >>> client connection closed: Success(errno: 0)(sock: 16)
+[WARNING] TDT(8380,tsdaemon):2020-05-12-11:05:24.077.393 [tdt/device/../common/src/log.cpp:143][TSDPPCIF] drvPpcRecv fail 25,[tdt/device/src/tsd/ppc_interface.cpp:92:RecvMsg]30245 Msg: warnging
+[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:24.077.412 [tdt/device/../common/src/log.cpp:149][TSDPPCSER] 0 SOCKET_CLOSED notify dev[0] procType[1] ret=[17379389]?[17379389]TSD to clean,[tdt/device/src/tsd/ppc_server.cpp:199:RecvData]30245
+[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:24.077.429 [tdt/device/../common/src/log.cpp:149][TSDPPCSER] 1 SOCKET_CLOSED notify dev[0] procType[1] ret[17379389] TSD to clean,[tdt/device/src/tsd/ppc_server.cpp:204:RecvData]30245
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.077.804 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.441417]  [hdcdrv] [hdcdrv_server_destroy 2415] <aicpu_scheduler:30223> dev_id 0 service_type service_TDT server destroy
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.077.829 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.441427]  [hdcdrv] [hdcdrv_accept 2470] <Accept:30223> dev_id 0 service_type service_TDT accept wait dev 0 quit, dev status 1, listen status 0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.077.841 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.441858]  [hdcdrv] [hdcdrv_recv_peek 2899] <RecvData:30223> dev 0 session 56 local or remote close, local_close_state closed_by_user, remote_close_state closed_by_user,local_session_fd 56, remote_session_fd 227.
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.077.850 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.444353]  [devdrv] [devdrv_manager_get_kernel_lib_process 251] <devdrv_load_ser:30223> wait_event_interruptible return: -512.
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.077.860 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.444495]  [devmm] [devmm_notifier_release_private 1225] <aicpufw_sup:30223,30239> device wait ts exit hostpid(40927) exit(0).
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.085.102 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.095.180 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.105.257 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.115.334 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.125.408 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.133.771 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.496591]  [devmm] [devmm_notifier_release_private 1231] <aicpufw_sup:30223,30239> ts exited,device hostpid(40927) begin recover resource.
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.137.750 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.147.847 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.157.927 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.168.004 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.178.083 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.188.161 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.198.238 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.208.316 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.218.391 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.228.469 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.238.544 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.248.619 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.258.697 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.268.776 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] DRV(8314,aicpufw_monitor):2020-05-12-11:05:24.275.875 [hardware/dev_plat/aicpufw/aicpufw_thread.c:931][AICPUFW] [aicpufw_monitor_recycle_so 931] recycle so pid_dir=/home/HwHiAiUser/tmp/30223/.
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.856 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642610]  [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>0 :rev_int_cnt 0  rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.878 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642614]  [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>1 :rev_int_cnt 0  rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.889 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642616]  [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>2 :rev_int_cnt 0  rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.899 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642618]  [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>3 :rev_int_cnt 0  rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.915 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642620]  [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>4 :rev_int_cnt 0  rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.924 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642622]  [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>5 :rev_int_cnt 0  rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.933 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642624]  [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>6 :rev_int_cnt 0  rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.942 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642627]  [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>7 :rev_int_cnt 0  rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.950 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642628]  [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>8 :rev_int_cnt 0  rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.959 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642630]  [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>9 :rev_int_cnt 0  rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.967 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642632]  [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>10 :rev_int_cnt 0  rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.975 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642634]  [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>11 :rev_int_cnt 0  rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.984 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642636]  [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>12 :rev_int_cnt 0  rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.994 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642638]  [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>13 :rev_int_cnt 0  rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.278.003 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642641]  [aicpufw-drv] [aicpufw_drv_delete_context 568] <aicpufw_sup:30223>delete match-pid(40927). dev_id:0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.278.011 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642642]  [aicpufw-drv] [aicpufw_drv_release 690] <aicpufw_sup:30223>processes(1),process pid(30223) released.current tgid: 30223 numa node:0. dev_id:0
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.278.019 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642657]  [aicpufw-drv] [aicpufw_drv_get_moniter_info 1583] <aicpu_m_ioctl:8314>aicpufw event happened. dev_id:0
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.278.860 [tdt/device/../common/src/log.cpp:158]The current kill result is [-1],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.278.880 [tdt/device/../common/src/log.cpp:158][TSDaemon] Computer Process stop success.,[tdt/device/src/tsd/tsdaemon.cpp:1096:CheckSubProcessExitByType]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.278.892 [tdt/device/../common/src/log.cpp:158][TSDaemon] SubProcesses exit success on device[0], tryTimes is 21,[tdt/device/src/tsd/tsdaemon.cpp:1098:CheckSubProcessExitByType]30256 Msg: running ok
+[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:24.278.908 [tdt/device/../common/src/log.cpp:149][TsdEVENT] #### Close Rsp TSD->FMK device[0] sessionID[1] realDeviceId[0]####,[tdt/device/src/tsd/tsdaemon.cpp:1023:SendRspMsgToFmk]30256
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.278.950 [tdt/device/../common/src/log.cpp:158][TSDaemon] CloseRspProc subRunState is: 0,[tdt/device/src/tsd/tsdaemon.cpp:1248:TsdWaitRspProcForClose]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.278.966 [tdt/device/../common/src/log.cpp:158][TSDaemon] EraseTsdToFmkMsg:deviceId[0], subProcPid[30223],[tdt/device/src/tsd/tsdaemon.cpp:789:EraseTsdToFmkMsg]30256 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.279.027 [tdt/device/../common/src/log.cpp:158][TSDaemon]###### PPCSerToTsdAbnormalMsg deviceId[0] msgType[3](0:START RSP,1:SHUTDOWN,2:SHUTDOWN RSP,3:SOCKET CLOSE), procType[1](0:HCCP,1:COMPUTE), state[0],[tdt/device/src/tsd/tsdaemon.cpp:1471:PPCSerToTsdAbnormalMsg]30245 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.279.057 [tdt/device/../common/src/log.cpp:158][TSDaemon] Begin to ClearPPCThreadCleanFlag,[tdt/device/src/tsd/tsdaemon.cpp:1473:PPCSerToTsdAbnormalMsg]30245 Msg: running ok
+[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:24.279.081 [tdt/device/../common/src/log.cpp:149][TSDPPCSER] 2 SOCKET_CLOSED notify dev[0] procType[1] ret[17379389] TSD to clean,[tdt/device/src/tsd/ppc_server.cpp:211:RecvData]30245
+[INFO] HDC(8380,tsdaemon):2020-05-12-11:05:24.279.117 [hardware/dev_plat/../dev_plat/devhdc/hdc_ppc.c:304][drvPpcSessionClose:304] >>> Ppc Close session fd 16, pid 8380 session 0xfffee0000d30
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.279.139 [tdt/device/../common/src/log.cpp:158][TSDPPCSER] CloseSomeSession enter [sessionSize:1],[tdt/device/src/tsd/ppc_server.cpp:80:RemoveFromPpcSessionList]30245 Msg: running ok
+[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.279.157 [tdt/device/../common/src/log.cpp:158][TSDPPCSER] CloseSomeSession exit [sessionSize:0],[tdt/device/src/tsd/ppc_server.cpp:85:RemoveFromPpcSessionList]30245 Msg: running ok
+[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:24.279.190 [tdt/device/../common/src/log.cpp:149][TSDPPCSER] [threadName:ppc_srv_recv_0] RecvData [ret=17379389] [tid=281470580584880]thread exit,[tdt/device/src/tsd/ppc_server.cpp:218:RecvData]30245
+[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.609.779 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.973818]  [devmm] [devmm_chan_close_device_h2d 1124] <kworker/0:0:55678,55678> device process exited, hostpid=40927, devpid=30223, devid=0.
@@ -0,0 +1,207 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Convenience functions for managing dataset file buffers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import atexit
+import multiprocessing
+import multiprocessing.dummy
+import os
+import tempfile
+import uuid
+
+import numpy as np
+import six
+
+import tensorflow as tf
+
+
+class _GarbageCollector(object):
+  """Deletes temporary buffer files at exit.
+
+  Certain tasks (such as NCF Recommendation) require writing buffers to
+  temporary files. (Which may be local or distributed.) It is not generally safe
+  to delete these files during operation, but they should be cleaned up. This
+  class keeps track of temporary files created, and deletes them at exit.
+  """
+  def __init__(self):
+    self.temp_buffers = []
+
+  def register(self, filepath):
+    self.temp_buffers.append(filepath)
+
+  def purge(self):
+    try:
+      for i in self.temp_buffers:
+        if tf.io.gfile.exists(i):
+          tf.io.gfile.remove(i)
+          tf.compat.v1.logging.info("Buffer file {} removed".format(i))
+    except Exception as e:
+      tf.compat.v1.logging.error("Failed to cleanup buffer files: {}".format(e))
+
+
+# Module-wide collector instance. Its purge method is registered with atexit so
+# that it runs when the interpreter exits.
+_GARBAGE_COLLECTOR = _GarbageCollector()
+atexit.register(_GARBAGE_COLLECTOR.purge)
+
+# Value passed as rows_per_core when write_to_buffer shards a dataframe.
+_ROWS_PER_CORE = 50000
+
+
+def write_to_temp_buffer(dataframe, buffer_folder, columns):
+  if buffer_folder is None:
+    _, buffer_path = tempfile.mkstemp()
+  else:
+    tf.io.gfile.makedirs(buffer_folder)
+    buffer_path = os.path.join(buffer_folder, str(uuid.uuid4()))
+  _GARBAGE_COLLECTOR.register(buffer_path)
+
+  return write_to_buffer(dataframe, buffer_path, columns)
+
+
+def iter_shard_dataframe(df, rows_per_core=1000):
+  """Two way shard of a dataframe.
+
+  This function evenly shards a dataframe so that it can be mapped efficiently.
+  It yields a list of dataframes with length equal to the number of CPU cores,
+  with each dataframe having rows_per_core rows. (Except for the last batch
+  which may have fewer rows in the dataframes.) Passing vectorized inputs to
+  a pool is more effecient than iterating through a dataframe in serial and
+  passing a list of inputs to the pool.
+
+  Args:
+    df: Pandas dataframe to be sharded.
+    rows_per_core: Number of rows in each shard.
+
+  Returns:
+    A list of dataframe shards.
+  """
+  n = len(df)
+  num_cores = min([multiprocessing.cpu_count(), n])
+
+  num_blocks = int(np.ceil(n / num_cores / rows_per_core))
+  max_batch_size = num_cores * rows_per_core
+  for i in range(num_blocks):
+    min_index = i * max_batch_size
+    max_index = min([(i + 1) * max_batch_size, n])
+    df_shard = df[min_index:max_index]
+    n_shard = len(df_shard)
+    boundaries = np.linspace(0, n_shard, num_cores + 1, dtype=np.int64)
+    yield [df_shard[boundaries[j]:boundaries[j+1]] for j in range(num_cores)]
+
+
+def _shard_dict_to_examples(shard_dict):
+  """Converts a dict of arrays into a list of example bytes."""
+  n = [i for i in shard_dict.values()][0].shape[0]
+  feature_list = [{} for _ in range(n)]
+  for column, values in shard_dict.items():
+    if len(values.shape) == 1:
+      values = np.reshape(values, values.shape + (1,))
+
+    if values.dtype.kind == "i":
+      feature_map = lambda x: tf.train.Feature(
+          int64_list=tf.train.Int64List(value=x))
+    elif values.dtype.kind == "f":
+      feature_map = lambda x: tf.train.Feature(
+          float_list=tf.train.FloatList(value=x))
+    else:
+      raise ValueError("Invalid dtype")
+    for i in range(n):
+      feature_list[i][column] = feature_map(values[i])
+  examples = [
+      tf.train.Example(features=tf.train.Features(feature=example_features))
+      for example_features in feature_list
+  ]
+
+  return [e.SerializeToString() for e in examples]
+
+
+def _serialize_shards(df_shards, columns, pool, writer):
+  """Map sharded dataframes to bytes, and write them to a buffer.
+
+  Args:
+    df_shards: A list of pandas dataframes. (Should be of similar size)
+    columns: The dataframe columns to be serialized.
+    pool: A pool to serialize in parallel.
+    writer: A TFRecordWriter to write the serialized shards.
+  """
+  # Pandas does not store columns of arrays as nd arrays. stack remedies this.
+  map_inputs = [{c: np.stack(shard[c].values, axis=0) for c in columns}
+                for shard in df_shards]
+
+  # Failure within pools is very irksome. Thus, it is better to thoroughly check
+  # inputs in the main process.
+  for inp in map_inputs:
+    # Check that all fields have the same number of rows.
+    assert len(set([v.shape[0] for v in inp.values()])) == 1
+    for val in inp.values():
+      assert hasattr(val, "dtype")
+      assert hasattr(val.dtype, "kind")
+      assert val.dtype.kind in ("i", "f")
+      assert len(val.shape) in (1, 2)
+  shard_bytes = pool.map(_shard_dict_to_examples, map_inputs)
+  for s in shard_bytes:
+    for example in s:
+      writer.write(example)
+
+
+def write_to_buffer(dataframe, buffer_path, columns, expected_size=None):
+  """Write a dataframe to a binary file for a dataset to consume.
+
+  Args:
+    dataframe: The pandas dataframe to be serialized.
+    buffer_path: The path where the serialized results will be written.
+    columns: The dataframe columns to be serialized.
+    expected_size: The size in bytes of the serialized results. This is used to
+      lazily construct the buffer.
+
+  Returns:
+    The path of the buffer.
+  """
+  if (tf.io.gfile.exists(buffer_path) and
+      tf.io.gfile.stat(buffer_path).length > 0):
+    actual_size = tf.io.gfile.stat(buffer_path).length
+    if expected_size == actual_size:
+      return buffer_path
+    tf.compat.v1.logging.warning(
+        "Existing buffer {} has size {}. Expected size {}. Deleting and "
+        "rebuilding buffer.".format(buffer_path, actual_size, expected_size))
+    tf.io.gfile.remove(buffer_path)
+
+  if dataframe is None:
+    raise ValueError(
+        "dataframe was None but a valid existing buffer was not found.")
+
+  tf.io.gfile.makedirs(os.path.split(buffer_path)[0])
+
+  tf.compat.v1.logging.info("Constructing TFRecordDataset buffer: {}"
+                            .format(buffer_path))
+
+  count = 0
+  pool = multiprocessing.dummy.Pool(multiprocessing.cpu_count())
+  try:
+    with tf.io.TFRecordWriter(buffer_path) as writer:
+      for df_shards in iter_shard_dataframe(df=dataframe,
+                                            rows_per_core=_ROWS_PER_CORE):
+        _serialize_shards(df_shards, columns, pool, writer)
+        count += sum([len(s) for s in df_shards])
+        tf.compat.v1.logging.info("{}/{} examples written."
+                                  .format(str(count).ljust(8), len(dataframe)))
+  finally:
+    pool.terminate()
+
+  tf.compat.v1.logging.info("Buffer write complete.")
+  return buffer_path
@@ -0,0 +1,199 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for binary data file utilities."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import multiprocessing
+
+# pylint: disable=wrong-import-order
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+# pylint: enable=wrong-import-order
+
+from official.r1.utils.data import file_io
+from official.utils.misc import keras_utils
+
+
+# Column names used by the dataframes and feature map in these tests.
+_RAW_ROW = "raw_row"
+_DUMMY_COL = "column_0"
+_DUMMY_VEC_COL = "column_1"
+# Number of float values stored in each _DUMMY_VEC_COL entry.
+_DUMMY_VEC_LEN = 4
+
+# Value passed as the rows-per-core argument of file_io.iter_shard_dataframe.
+_ROWS_PER_CORE = 4
+# Keyword arguments for BaseTest._test_sharding. `expected` is nested three
+# deep: one list per item yielded by the shard iterator, each holding one list
+# per shard, each holding that shard's _DUMMY_COL values.
+_TEST_CASES = [
+    # One batch of one
+    dict(row_count=1, cpu_count=1, expected=[
+        [[0]]
+    ]),
+
+    dict(row_count=10, cpu_count=1, expected=[
+        [[0, 1, 2, 3]], [[4, 5, 6, 7]], [[8, 9]]
+    ]),
+
+    dict(row_count=21, cpu_count=1, expected=[
+        [[0, 1, 2, 3]], [[4, 5, 6, 7]], [[8, 9, 10, 11]],
+        [[12, 13, 14, 15]], [[16, 17, 18, 19]], [[20]]
+    ]),
+
+    dict(row_count=1, cpu_count=4, expected=[
+        [[0]]
+    ]),
+
+    dict(row_count=10, cpu_count=4, expected=[
+        [[0, 1], [2, 3, 4], [5, 6], [7, 8, 9]]
+    ]),
+
+    dict(row_count=21, cpu_count=4, expected=[
+        [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]],
+        [[16], [17], [18], [19, 20]]
+    ]),
+
+    dict(row_count=10, cpu_count=8, expected=[
+        [[0], [1], [2], [3, 4], [5], [6], [7], [8, 9]]
+    ]),
+
+    dict(row_count=40, cpu_count=8, expected=[
+        [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15],
+         [16, 17, 18, 19], [20, 21, 22, 23], [24, 25, 26, 27],
+         [28, 29, 30, 31]],
+        [[32], [33], [34], [35], [36], [37], [38], [39]]
+    ]),
+]
+
+# Parsing spec handed to tf.io.parse_example when reading the buffer back in
+# BaseTest._serialize_deserialize.
+_FEATURE_MAP = {
+    _RAW_ROW: tf.io.FixedLenFeature([1], dtype=tf.int64),
+    _DUMMY_COL: tf.io.FixedLenFeature([1], dtype=tf.int64),
+    _DUMMY_VEC_COL: tf.io.FixedLenFeature([_DUMMY_VEC_LEN], dtype=tf.float32)
+}
+
+
+@contextlib.contextmanager
+def fixed_core_count(cpu_count):
+  """Override CPU count.
+
+  file_io.py uses the cpu_count function to scale to the size of the instance.
+  However, this is not desirable for testing because it can make the test flaky.
+  Instead, this context manager fixes the count for more robust testing.
+
+  Args:
+    cpu_count: How many cores multiprocessing claims to have.
+
+  Yields:
+    Nothing. (for context manager only)
+  """
+  old_count_fn = multiprocessing.cpu_count
+  multiprocessing.cpu_count = lambda: cpu_count
+  yield
+  multiprocessing.cpu_count = old_count_fn
+
+
+class BaseTest(tf.test.TestCase):
+
+  def setUp(self):
+    super(BaseTest, self).setUp()
+    if keras_utils.is_v2_0:
+      tf.compat.v1.disable_eager_execution()
+
+  def _test_sharding(self, row_count, cpu_count, expected):
+    df = pd.DataFrame({_DUMMY_COL: list(range(row_count))})
+    with fixed_core_count(cpu_count):
+      shards = list(file_io.iter_shard_dataframe(df, _ROWS_PER_CORE))
+    result = [[j[_DUMMY_COL].tolist() for j in i] for i in shards]
+    self.assertAllEqual(expected, result)
+
+  def test_tiny_rows_low_core(self):
+    self._test_sharding(**_TEST_CASES[0])
+
+  def test_small_rows_low_core(self):
+    self._test_sharding(**_TEST_CASES[1])
+
+  def test_large_rows_low_core(self):
+    self._test_sharding(**_TEST_CASES[2])
+
+  def test_tiny_rows_medium_core(self):
+    self._test_sharding(**_TEST_CASES[3])
+
+  def test_small_rows_medium_core(self):
+    self._test_sharding(**_TEST_CASES[4])
+
+  def test_large_rows_medium_core(self):
+    self._test_sharding(**_TEST_CASES[5])
+
+  def test_small_rows_large_core(self):
+    self._test_sharding(**_TEST_CASES[6])
+
+  def test_large_rows_large_core(self):
+    self._test_sharding(**_TEST_CASES[7])
+
+  def _serialize_deserialize(self, num_cores=1, num_rows=20):
+    np.random.seed(1)
+    df = pd.DataFrame({
+        # Serialization order is only deterministic for num_cores=1. raw_row is
+        # used in validation after the deserialization.
+        _RAW_ROW: np.array(range(num_rows), dtype=np.int64),
+        _DUMMY_COL: np.random.randint(0, 35, size=(num_rows,)),
+        _DUMMY_VEC_COL: [
+            np.array([np.random.random() for _ in range(_DUMMY_VEC_LEN)])
+            for i in range(num_rows)  # pylint: disable=unused-variable
+        ]
+    })
+
+    with fixed_core_count(num_cores):
+      buffer_path = file_io.write_to_temp_buffer(
+          df, self.get_temp_dir(), [_RAW_ROW, _DUMMY_COL, _DUMMY_VEC_COL])
+
+    with self.session(graph=tf.Graph()) as sess:
+      dataset = tf.data.TFRecordDataset(buffer_path)
+      dataset = dataset.batch(1).map(
+          lambda x: tf.io.parse_example(serialized=x, features=_FEATURE_MAP))
+
+      data_iter = tf.compat.v1.data.make_one_shot_iterator(dataset)
+      seen_rows = set()
+      for i in range(num_rows+5):
+        row = data_iter.get_next()
+        try:
+          row_id, val_0, val_1 = sess.run(
+              [row[_RAW_ROW], row[_DUMMY_COL], row[_DUMMY_VEC_COL]])
+          row_id, val_0, val_1 = row_id[0][0], val_0[0][0], val_1[0]
+          assert row_id not in seen_rows
+          seen_rows.add(row_id)
+
+          self.assertEqual(val_0, df[_DUMMY_COL][row_id])
+          self.assertAllClose(val_1, df[_DUMMY_VEC_COL][row_id])
+
+          self.assertLess(i, num_rows, msg="Too many rows.")
+        except tf.errors.OutOfRangeError:
+          self.assertGreaterEqual(i, num_rows, msg="Too few rows.")
+
+    file_io._GARBAGE_COLLECTOR.purge()
+    assert not tf.io.gfile.exists(buffer_path)
+
+  def test_serialize_deserialize_0(self):
+    self._serialize_deserialize(num_cores=1)
+
+  def test_serialize_deserialize_1(self):
+    self._serialize_deserialize(num_cores=2)
+
+  def test_serialize_deserialize_2(self):
+    self._serialize_deserialize(num_cores=8)
+
+
+# Hand control to the TensorFlow test runner when this file is run directly.
+if __name__ == "__main__":
+  tf.test.main()
@@ -0,0 +1,49 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Convenience functions for exporting models as SavedModels or other types."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+
+def build_tensor_serving_input_receiver_fn(shape, dtype=tf.float32,
+                                           batch_size=1):
+  """Returns a input_receiver_fn that can be used during serving.
+
+  This expects examples to come through as float tensors, and simply
+  wraps them as TensorServingInputReceivers.
+
+  Arguably, this should live in tf.estimator.export. Testing here first.
+
+  Args:
+    shape: list representing target size of a single example.
+    dtype: the expected datatype for the input example
+    batch_size: number of input tensors that will be passed for prediction
+
+  Returns:
+    A function that itself returns a TensorServingInputReceiver.
+  """
+  def serving_input_receiver_fn():
+    # Prep a placeholder where the input example will be fed in
+    features = tf.compat.v1.placeholder(
+        dtype=dtype, shape=[batch_size] + shape, name='input_tensor')
+
+    return tf.estimator.export.TensorServingInputReceiver(
+        features=features, receiver_tensors=features)
+
+  return serving_input_receiver_fn
@@ -0,0 +1,63 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for exporting utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf  # pylint: disable=g-bad-import-order
+
+from official.r1.utils import export
+
+
+class ExportUtilsTest(tf.test.TestCase):
+  """Tests for the ExportUtils."""
+
+  def test_build_tensor_serving_input_receiver_fn(self):
+    receiver_fn = export.build_tensor_serving_input_receiver_fn(shape=[4, 5])
+    with tf.Graph().as_default():
+      receiver = receiver_fn()
+      self.assertIsInstance(
+          receiver, tf.estimator.export.TensorServingInputReceiver)
+
+      self.assertIsInstance(receiver.features, tf.Tensor)
+      self.assertEqual(receiver.features.shape, tf.TensorShape([1, 4, 5]))
+      self.assertEqual(receiver.features.dtype, tf.float32)
+      self.assertIsInstance(receiver.receiver_tensors, dict)
+      # Note that Python 3 can no longer index .values() directly; cast to list.
+      self.assertEqual(list(receiver.receiver_tensors.values())[0].shape,
+                       tf.TensorShape([1, 4, 5]))
+
+  def test_build_tensor_serving_input_receiver_fn_batch_dtype(self):
+    receiver_fn = export.build_tensor_serving_input_receiver_fn(
+        shape=[4, 5], dtype=tf.int8, batch_size=10)
+
+    with tf.Graph().as_default():
+      receiver = receiver_fn()
+      self.assertIsInstance(
+          receiver, tf.estimator.export.TensorServingInputReceiver)
+
+      self.assertIsInstance(receiver.features, tf.Tensor)
+      self.assertEqual(receiver.features.shape, tf.TensorShape([10, 4, 5]))
+      self.assertEqual(receiver.features.dtype, tf.int8)
+      self.assertIsInstance(receiver.receiver_tensors, dict)
+      # Note that Python 3 can no longer index .values() directly; cast to list.
+      self.assertEqual(list(receiver.receiver_tensors.values())[0].shape,
+                       tf.TensorShape([10, 4, 5]))
+
+
+# Hand control to the TensorFlow test runner when this file is run directly.
+if __name__ == "__main__":
+  tf.test.main()
@@ -0,0 +1,116 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions specific to running TensorFlow on TPUs."""
+
+import tensorflow as tf
+
+
+# "local" is a magic word in the TPU cluster resolver; it informs the resolver
+# to use the local CPU as the compute device. This is useful for testing and
+# debugging; the code flow is ostensibly identical, but without the need to
+# actually have a TPU on the other end.
+# The constant is not referenced by the functions below; it is exported for
+# callers of this module.
+LOCAL = "local"
+
+
+def construct_scalar_host_call(metric_dict, model_dir, prefix=""):
+  """Construct a host call to log scalars when training on TPU.
+
+  Args:
+    metric_dict: A dict of the tensors to be logged.
+    model_dir: The location to write the summary.
+    prefix: The prefix (if any) to prepend to the metric names.
+
+  Returns:
+    A tuple of (function, args_to_be_passed_to_said_function)
+  """
+  # type: (dict, str, str) -> (function, list)
+  # Freeze the key order once: the same list drives both the order of the
+  # tensors returned below and the index -> name mapping inside host_call_fn.
+  metric_names = list(metric_dict.keys())
+
+  def host_call_fn(global_step, *args):
+    """Training host call. Creates scalar summaries for training metrics.
+
+    This function is executed on the CPU and should not directly reference
+    any Tensors in the rest of the `model_fn`. To pass Tensors from the
+    model to the `metric_fn`, provide as part of the `host_call`. See
+    https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
+    for more information.
+
+    Arguments should match the list of `Tensor` objects passed as the second
+    element in the tuple passed to `host_call`.
+
+    Args:
+      global_step: `Tensor` with shape `[batch]` for the global_step
+      *args: Remaining tensors to log.
+
+    Returns:
+      List of summary ops to run on the CPU host.
+    """
+    # Only the first element of each batched tensor is logged.
+    step = global_step[0]
+    # NOTE(review): confirm that create_file_writer, always_record_summaries
+    # and the `step=` argument of scalar exist under tf.compat.v1.summary in
+    # the TensorFlow version this code targets.
+    with tf.compat.v1.summary.create_file_writer(
+        logdir=model_dir, filename_suffix=".host_call").as_default():
+      with tf.compat.v1.summary.always_record_summaries():
+        for i, name in enumerate(metric_names):
+          tf.compat.v1.summary.scalar(prefix + name, args[i][0], step=step)
+
+        return tf.compat.v1.summary.all_summary_ops()
+
+  # To log the current learning rate, and gradient norm for Tensorboard, the
+  # summary op needs to be run on the host CPU via host_call. host_call
+  # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
+  # dimension. These Tensors are implicitly concatenated to
+  # [params['batch_size']].
+  global_step_tensor = tf.reshape(
+      tf.compat.v1.train.get_or_create_global_step(), [1])
+  other_tensors = [tf.reshape(metric_dict[key], [1]) for key in metric_names]
+
+  # global_step comes first to match host_call_fn's positional signature.
+  return host_call_fn, [global_step_tensor] + other_tensors
+
+
+def embedding_matmul(embedding_table, values, mask, name="embedding_matmul"):
+  """Performs embedding lookup via a matmul.
+
+  The matrix to be multiplied by the embedding table Tensor is constructed
+  via an implementation of scatter based on broadcasting embedding indices
+  and performing an equality comparison against a broadcasted
+  range(num_embedding_table_rows). All masked positions will produce an
+  embedding vector of zeros.
+
+  Args:
+    embedding_table: Tensor of embedding table.
+      Rank 2 (table_size x embedding dim)
+    values: Tensor of embedding indices. Rank 2 (batch x n_indices)
+    mask: Tensor of mask / weights. Rank 2 (batch x n_indices)
+    name: Optional name scope for created ops
+
+  Returns:
+    Rank 3 tensor of embedding vectors.
+  """
+
+  with tf.name_scope(name):
+    n_embeddings = embedding_table.get_shape().as_list()[0]
+    batch_size, padded_size = values.shape.as_list()
+
+    emb_idcs = tf.tile(
+        tf.reshape(values, (batch_size, padded_size, 1)), (1, 1, n_embeddings))
+    emb_weights = tf.tile(
+        tf.reshape(mask, (batch_size, padded_size, 1)), (1, 1, n_embeddings))
+    col_idcs = tf.tile(
+        tf.reshape(tf.range(n_embeddings), (1, 1, n_embeddings)),
+        (batch_size, padded_size, 1))
+    one_hot = tf.where(
+        tf.equal(emb_idcs, col_idcs), emb_weights,
+        tf.zeros((batch_size, padded_size, n_embeddings)))
+
+    return tf.tensordot(one_hot, embedding_table, 1)
@@ -0,0 +1,108 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Test TPU optimized matmul embedding."""
+
+import numpy as np
+import tensorflow as tf
+
+from official.r1.utils import tpu as tpu_utils
+
+
+# Keyword arguments unpacked into TPUBaseTester._test_embedding and
+# TPUBaseTester._test_masking; `seed` is passed to np.random.seed.
+TEST_CASES = [
+    dict(embedding_dim=256, vocab_size=1000, sequence_length=64,
+         batch_size=32, seed=54131),
+    dict(embedding_dim=8, vocab_size=15, sequence_length=12,
+         batch_size=256, seed=536413),
+    dict(embedding_dim=2048, vocab_size=512, sequence_length=50,
+         batch_size=8, seed=35124)
+]
+
+
+class TPUBaseTester(tf.test.TestCase):
+  def construct_embedding_and_values(self, embedding_dim, vocab_size,
+                                     sequence_length, batch_size, seed):
+    np.random.seed(seed)
+
+    embeddings = np.random.random(size=(vocab_size, embedding_dim))
+    embedding_table = tf.convert_to_tensor(value=embeddings, dtype=tf.float32)
+
+    tokens = np.random.randint(low=1, high=vocab_size-1,
+                               size=(batch_size, sequence_length))
+    for i in range(batch_size):
+      tokens[i, np.random.randint(low=0, high=sequence_length-1):] = 0
+    values = tf.convert_to_tensor(value=tokens, dtype=tf.int32)
+    mask = tf.cast(tf.not_equal(values, 0), dtype=tf.float32)
+    return embedding_table, values, mask
+
+  def _test_embedding(self, embedding_dim, vocab_size,
+                      sequence_length, batch_size, seed):
+    """Test that matmul embedding matches embedding lookup (gather)."""
+
+    with self.test_session():
+      embedding_table, values, mask = self.construct_embedding_and_values(
+          embedding_dim=embedding_dim,
+          vocab_size=vocab_size,
+          sequence_length=sequence_length,
+          batch_size=batch_size,
+          seed=seed
+      )
+
+      embedding = (tf.nn.embedding_lookup(params=embedding_table, ids=values) *
+                   tf.expand_dims(mask, -1))
+
+      matmul_embedding = tpu_utils.embedding_matmul(
+          embedding_table=embedding_table, values=values, mask=mask)
+
+      self.assertAllClose(embedding, matmul_embedding)
+
+  def _test_masking(self, embedding_dim, vocab_size,
+                    sequence_length, batch_size, seed):
+    """Test that matmul embedding properly zeros masked positions."""
+    with self.test_session():
+      embedding_table, values, mask = self.construct_embedding_and_values(
+          embedding_dim=embedding_dim,
+          vocab_size=vocab_size,
+          sequence_length=sequence_length,
+          batch_size=batch_size,
+          seed=seed
+      )
+
+      matmul_embedding = tpu_utils.embedding_matmul(
+          embedding_table=embedding_table, values=values, mask=mask)
+
+      self.assertAllClose(matmul_embedding,
+                          matmul_embedding * tf.expand_dims(mask, -1))
+
+  def test_embedding_0(self):
+    self._test_embedding(**TEST_CASES[0])
+
+  def test_embedding_1(self):
+    self._test_embedding(**TEST_CASES[1])
+
+  def test_embedding_2(self):
+    self._test_embedding(**TEST_CASES[2])
+
+  def test_masking_0(self):
+    self._test_masking(**TEST_CASES[0])
+
+  def test_masking_1(self):
+    self._test_masking(**TEST_CASES[1])
+
+  def test_masking_2(self):
+    self._test_masking(**TEST_CASES[2])
+
+
+# Hand control to the TensorFlow test runner when this file is run directly.
+if __name__ == "__main__":
+  tf.test.main()
@@ -0,0 +1,24 @@
+six
+google-api-python-client>=1.6.7
+google-cloud-bigquery>=0.31.0
+kaggle>=1.3.9
+mlperf_compliance==0.0.10
+numpy>=1.15.4
+oauth2client>=4.1.2
+pandas>=0.22.0
+psutil>=5.4.3
+py-cpuinfo>=3.3.0
+scipy>=0.19.1
+tensorflow-hub>=0.6.0
+tensorflow-model-optimization>=0.2.1
+tensorflow_datasets
+dataclasses
+gin-config
+typing
+sentencepiece
+Cython
+matplotlib
+opencv-python-headless
+pyyaml
+Pillow
+-e git+https://github.com/cocodataset/cocoapi#egg=pycocotools&subdirectory=PythonAPI
@@ -0,0 +1,97 @@
+# Adding Abseil (absl) flags quickstart
+## Defining a flag
+absl flag definitions are similar to argparse, although they are defined on a global namespace.
+
+For instance defining a string flag looks like:
+```python
+from absl import flags
+flags.DEFINE_string(
+    name="my_flag",
+    default="a_sensible_default",
+    help="Here is what this flag does."
+)
+```
+
+All three arguments are required, but default may be `None`. A common optional argument is
+`short_name` for defining abbreviations. Certain `DEFINE_*` methods will have other required arguments.
+For instance `DEFINE_enum` requires the `enum_values` argument to be specified.
+
+## Key Flags
+absl has the concept of a key flag. Any flag defined in `__main__` is considered a key flag by
+default. Key flags are displayed in `--help`, others only appear in `--helpfull`. In order to
+handle key flags that are defined outside the module in question, absl provides the
+`flags.adopt_module_key_flags()` method. This adds the key flags of a different module to one's own
+key flags. For example:
+```python
+File: flag_source.py
+---------------------------------------
+
+from absl import flags
+flags.DEFINE_string(name="my_flag", default="abc", help="a flag.")
+```
+
+```python
+File: my_module.py
+---------------------------------------
+
+from absl import app as absl_app
+from absl import flags
+
+import flag_source
+
+flags.adopt_module_key_flags(flag_source)
+
+def main(_):
+  pass
+
+absl_app.run(main, [__file__, "-h"])
+```
+
+When `my_module.py` is run it will show the help text for `my_flag`. Because not all flags defined
+in a file are equally important, `official/utils/flags/core.py` (generally imported as flags_core)
+provides an abstraction for handling key flag declaration in an easy way through the
+`register_key_flags_in_core()` function, which allows a module to make a single
+`flags.adopt_module_key_flags(flags_core)` call when using the util flag declaration functions.
+
+## Validators
+Often the constraints on a flag are complicated. absl provides the validator decorator to allow
+one to mark a function as a flag validation function. Suppose we want users to provide a flag
+which is a palindrome.
+
+```python
+from absl import flags
+
+flags.DEFINE_string(name="pal_flag", short_name="pf", default="", help="Give me a palindrome")
+
+@flags.validator("pal_flag")
+def _check_pal(provided_pal_flag):
+  return provided_pal_flag == provided_pal_flag[::-1]
+
+```
+
+Validators take the form that returning True (truthy) passes, and all others 
+(False, None, exception) fail.
+
+## Testing
+To test using absl, simply declare flags in the `setUpClass` method of TensorFlow's TestCase.
+
+```python
+from absl import flags
+import tensorflow as tf
+
+def define_flags():
+  flags.DEFINE_string(name="test_flag", default="abc", help="an example flag")
+
+
+class BaseTester(tf.test.TestCase):
+
+  @classmethod
+  def setUpClass(cls):
+    super(BaseTester, cls).setUpClass()
+    define_flags()
+    
+  def test_trivial(self):
+    flags_core.parse_flags([__file__, "--test_flag", "def"])
+    self.assertEqual(flags.FLAGS.test_flag, "def")
+    
+```
@@ -0,0 +1,163 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Flags which will be nearly universal across models."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import flags
+import tensorflow as tf
+
+from official.utils.flags._conventions import help_wrap
+from official.utils.logs import hooks_helper
+
+############## npu modify begin #############
+from hccl.manage.api import get_rank_size
+from hccl.manage.api import get_rank_id
+############## npu modify end ###############
+
+def define_base(data_dir=True, model_dir=True, clean=False, train_epochs=False,
+                epochs_between_evals=False, stop_threshold=False,
+                batch_size=True, num_gpu=False, hooks=False, export_dir=False,
+                distribution_strategy=False, run_eagerly=False):
+  """Register base flags.
+
+  Each argument is a boolean switch: when truthy, the corresponding absl flag
+  is defined on the global flag namespace. Only some of the flags are also
+  added to the returned key-flag list (see the notes in the body).
+
+  Args:
+    data_dir: Create a flag for specifying the input data directory.
+    model_dir: Create a flag for specifying the model file directory.
+    clean: Create a flag for removing the model_dir.
+    train_epochs: Create a flag to specify the number of training epochs.
+    epochs_between_evals: Create a flag to specify the frequency of testing.
+    stop_threshold: Create a flag to specify a threshold accuracy or other
+      eval metric which should trigger the end of training.
+    batch_size: Create a flag to specify the batch size.
+    num_gpu: Create a flag to specify the number of GPUs used.
+    hooks: Create a flag to specify hooks for logging.
+    export_dir: Create a flag to specify where a SavedModel should be exported.
+    distribution_strategy: Create a flag to specify which Distribution Strategy
+      to use.
+    run_eagerly: Create a flag to specify to run eagerly op by op.
+  Returns:
+    A list of flags for core.py to mark as key flags.
+  """
+  # Names of the flags defined below that are reported back as key flags.
+  key_flags = []
+
+  if data_dir:
+    flags.DEFINE_string(
+        name="data_dir", short_name="dd", default="/tmp",
+        help=help_wrap("The location of the input data."))
+    key_flags.append("data_dir")
+
+  if model_dir:
+    flags.DEFINE_string(
+        name="model_dir", short_name="md", default="/tmp",
+        help=help_wrap("The location of the model checkpoint files."))
+    key_flags.append("model_dir")
+
+  if clean:
+    flags.DEFINE_boolean(
+        name="clean", default=False,
+        help=help_wrap("If set, model_dir will be removed if it exists."))
+    key_flags.append("clean")
+
+  if train_epochs:
+    flags.DEFINE_integer(
+        name="train_epochs", short_name="te", default=1,
+        help=help_wrap("The number of epochs used to train."))
+    key_flags.append("train_epochs")
+
+  if epochs_between_evals:
+    flags.DEFINE_integer(
+        name="epochs_between_evals", short_name="ebe", default=1,
+        help=help_wrap("The number of training epochs to run between "
+                       "evaluations."))
+    key_flags.append("epochs_between_evals")
+
+  # stop_threshold is defined but not added to key_flags.
+  if stop_threshold:
+    flags.DEFINE_float(
+        name="stop_threshold", short_name="st",
+        default=None,
+        help=help_wrap("If passed, training will stop at the earlier of "
+                       "train_epochs and when the evaluation metric is  "
+                       "greater than or equal to stop_threshold."))
+
+  if batch_size:
+    flags.DEFINE_integer(
+        name="batch_size", short_name="bs", default=32,
+        help=help_wrap("Batch size for training and evaluation. When using "
+                       "multiple gpus, this is the global batch size for "
+                       "all devices. For example, if the batch size is 32 "
+                       "and there are 4 GPUs, each GPU will get 8 examples on "
+                       "each step."))
+    key_flags.append("batch_size")
+
+  # The switch is `num_gpu` but the flag it defines is named `num_gpus`; the
+  # flag is not added to key_flags.
+  if num_gpu:
+    flags.DEFINE_integer(
+        name="num_gpus", short_name="ng",
+        default=1,
+        help=help_wrap(
+            "How many GPUs to use at each worker with the "
+            "DistributionStrategies API. The default is 1."))
+
+  # run_eagerly passes its help text without help_wrap and is not added to
+  # key_flags.
+  if run_eagerly:
+    flags.DEFINE_boolean(
+        name="run_eagerly", default=False,
+        help="Run the model op by op without building a model function.")
+
+  if hooks:
+    # Construct a pretty summary of hooks.
+    # NOTE(review): the u"\ufeff" prefixes presumably protect the leading
+    # indentation from being stripped by help_wrap -- confirm in _conventions.
+    hook_list_str = (
+        u"\ufeff  Hook:\n" + u"\n".join([u"\ufeff    {}".format(key) for key
+                                         in hooks_helper.HOOKS]))
+    flags.DEFINE_list(
+        name="hooks", short_name="hk", default="LoggingTensorHook",
+        help=help_wrap(
+            u"A list of (case insensitive) strings to specify the names of "
+            u"training hooks.\n{}\n\ufeff  Example: `--hooks ProfilerHook,"
+            u"ExamplesPerSecondHook`\n See official.utils.logs.hooks_helper "
+            u"for details.".format(hook_list_str))
+    )
+    key_flags.append("hooks")
+
+  if export_dir:
+    flags.DEFINE_string(
+        name="export_dir", short_name="ed", default=None,
+        help=help_wrap("If set, a SavedModel serialization of the model will "
+                       "be exported to this directory at the end of training. "
+                       "See the README for more details and relevant links.")
+    )
+    key_flags.append("export_dir")
+
+  # distribution_strategy is defined but not added to key_flags.
+  # NOTE(review): the help text explains 'default' although it is not in the
+  # list of accepted values quoted just before it -- confirm which is right.
+  if distribution_strategy:
+    flags.DEFINE_string(
+        name="distribution_strategy", short_name="ds", default="mirrored",
+        help=help_wrap("The Distribution Strategy to use for training. "
+                       "Accepted values are 'off', 'one_device', "
+                       "'mirrored', 'parameter_server', 'collective', "
+                       "case insensitive. 'off' means not to use "
+                       "Distribution Strategy; 'default' means to choose "
+                       "from `MirroredStrategy` or `OneDeviceStrategy` "
+                       "according to the number of GPUs.")
+    )
+
+
+  return key_flags
+
+
+def get_num_gpus(flags_obj):
+  """get the num npus using hccl api"""
+  ############## npu modify begin #############
+  return get_rank_size()
+  ############## npu modify end ###############
@@ -0,0 +1,109 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Flags for benchmarking models."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import flags
+
+from official.utils.flags._conventions import help_wrap
+
+
+def define_log_steps():
+  """Register the `--log_steps` flag.
+
+  Returns:
+    An empty list: `log_steps` is not reported as a key flag.
+  """
+  # The help text goes through help_wrap like every other flag definition so
+  # the help body is formatted the same way in --help output.
+  flags.DEFINE_integer(
+      name="log_steps", default=100,
+      help=help_wrap(
+          "Frequency with which to log timing information with TimeHistory."))
+
+  return []
+
+
+def define_benchmark(benchmark_log_dir=True, bigquery_uploader=True):
+  """Register benchmarking flags.
+
+  `benchmark_logger_type`, `benchmark_test_id` and `log_steps` are always
+  registered; the remaining flags depend on the arguments.
+
+  Args:
+    benchmark_log_dir: Create a flag to specify location for benchmark logging.
+      The check that BenchmarkFileLogger has a log directory is registered
+      only when this flag exists.
+    bigquery_uploader: Create flags for uploading results to BigQuery.
+
+  Returns:
+    A list of flags for core.py to mark as key flags. Nothing is appended to
+    it here, so it is always empty.
+  """
+
+  key_flags = []
+
+  flags.DEFINE_enum(
+      name="benchmark_logger_type", default="BaseBenchmarkLogger",
+      enum_values=["BaseBenchmarkLogger", "BenchmarkFileLogger",
+                   "BenchmarkBigQueryLogger"],
+      help=help_wrap("The type of benchmark logger to use. Defaults to using "
+                     "BaseBenchmarkLogger which logs to STDOUT. Different "
+                     "loggers will require other flags to be able to work."))
+  flags.DEFINE_string(
+      name="benchmark_test_id", short_name="bti", default=None,
+      help=help_wrap("The unique test ID of the benchmark run. It could be the "
+                     "combination of key parameters. It is hardware "
+                     "independent and could be used to compare the performance "
+                     "between different test runs. This flag is designed for "
+                     "human consumption, and does not have any impact within "
+                     "the system."))
+
+  define_log_steps()
+
+  if benchmark_log_dir:
+    flags.DEFINE_string(
+        name="benchmark_log_dir", short_name="bld", default=None,
+        help=help_wrap("The location of the benchmark logging.")
+    )
+
+    # The validator names --benchmark_log_dir, so it can only be registered
+    # when that flag has been defined; registering it against an undefined
+    # flag fails at definition time.
+    @flags.multi_flags_validator(
+        ["benchmark_logger_type", "benchmark_log_dir"],
+        message="--benchmark_logger_type=BenchmarkFileLogger will require "
+                "--benchmark_log_dir being set")
+    def _check_benchmark_log_dir(flags_dict):  # pylint: disable=unused-variable
+      """Require a log directory when the file logger is selected."""
+      benchmark_logger_type = flags_dict["benchmark_logger_type"]
+      if benchmark_logger_type == "BenchmarkFileLogger":
+        return bool(flags_dict["benchmark_log_dir"])
+      return True
+
+  if bigquery_uploader:
+    flags.DEFINE_string(
+        name="gcp_project", short_name="gp", default=None,
+        help=help_wrap(
+            "The GCP project name where the benchmark will be uploaded."))
+
+    flags.DEFINE_string(
+        name="bigquery_data_set", short_name="bds", default="test_benchmark",
+        help=help_wrap(
+            "The Bigquery dataset name where the benchmark will be uploaded."))
+
+    flags.DEFINE_string(
+        name="bigquery_run_table", short_name="brt", default="benchmark_run",
+        help=help_wrap("The Bigquery table name where the benchmark run "
+                       "information will be uploaded."))
+
+    flags.DEFINE_string(
+        name="bigquery_run_status_table", short_name="brst",
+        default="benchmark_run_status",
+        help=help_wrap("The Bigquery table name where the benchmark run "
+                       "status information will be uploaded."))
+
+    flags.DEFINE_string(
+        name="bigquery_metric_table", short_name="bmt",
+        default="benchmark_metric",
+        help=help_wrap("The Bigquery table name where the benchmark metric "
+                       "information will be uploaded."))
+
+  return key_flags
@@ -0,0 +1,54 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Central location for shared argparse convention definitions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+import codecs
+import functools
+
+from absl import app as absl_app
+from absl import flags
+
+
+# This codifies help string conventions and makes it easy to update them if
+# necessary. Currently the only major effect is that help bodies start on the
+# line after flags are listed. All flag definitions should wrap the text bodies
+# with help wrap when calling DEFINE_*.
+# The partial pre-binds flags.text_wrap with length=80 and an empty indent;
+# firstline_indent="\n" is what moves the help body onto its own line.
+_help_wrap = functools.partial(flags.text_wrap, length=80, indent="",
+                               firstline_indent="\n")
+
+
+# Pretty formatting causes issues when utf-8 is not installed on a system.
+def _stdout_utf8():
+  """Report whether stdout is known to use the UTF-8 encoding.
+
+  Returns:
+    True when the utf-8 codec is available and `sys.stdout.encoding` names it
+    under any spelling the codec registry accepts ("UTF-8", "utf-8", "utf8",
+    ...). False when the codec is missing, when stdout has no usable
+    `encoding` attribute, or when the encoding is anything else.
+  """
+  try:
+    codecs.lookup("utf-8")
+  except LookupError:
+    return False
+
+  # stdout may have been replaced by an object without an encoding, or the
+  # attribute may be None (for example when output is redirected).
+  encoding = getattr(sys.stdout, "encoding", None)
+  if not encoding:
+    return False
+
+  # Compare canonical codec names so the result does not depend on the case
+  # or alias the platform happens to report.
+  try:
+    return codecs.lookup(encoding).name == "utf-8"
+  except LookupError:
+    return False
+
+
+if not _stdout_utf8():
+  def help_wrap(text, *args, **kwargs):
+    """Wrap `text` with `_help_wrap`, then drop every U+FEFF character."""
+    wrapped = _help_wrap(text, *args, **kwargs)
+    return wrapped.replace(u"\ufeff", u"")
+else:
+  # stdout can render the marker characters, so use the plain wrapper.
+  help_wrap = _help_wrap
+
+
+# The short help flag's short name is replaced (None -> "h") so that -h is
+# accepted as well.
+absl_app.HelpshortFlag.SHORT_NAME = "h"
@@ -0,0 +1,85 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Flags for managing compute devices. Currently only contains TPU flags."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import flags
+import tensorflow as tf
+
+from official.utils.flags._conventions import help_wrap
+
+
+def require_cloud_storage(flag_names):
+  """Register a validator to check directory flags.
+
+  The validator passes when --tpu is unset. When --tpu is set, every flag in
+  `flag_names` must hold a string starting with "gs://"; each offending flag
+  is logged as an error and validation fails.
+
+  Args:
+    flag_names: An iterable of strings containing the names of flags to be
+      checked.
+  """
+  # Materialize once: any iterable (tuple, generator, ...) is accepted, and the
+  # names are needed three times (message, registration, validation).
+  flag_names = list(flag_names)
+  msg = "TPU requires GCS path for {}".format(", ".join(flag_names))
+
+  @flags.multi_flags_validator(["tpu"] + flag_names, message=msg)
+  def _path_check(flag_values):  # pylint: disable=missing-docstring
+    if flag_values["tpu"] is None:
+      return True
+
+    valid_flags = True
+    for key in flag_names:
+      value = flag_values[key]
+      # An unset flag (None) is reported as invalid instead of raising
+      # AttributeError from None.startswith.
+      if value is None or not value.startswith("gs://"):
+        tf.compat.v1.logging.error("{} must be a GCS path.".format(key))
+        valid_flags = False
+
+    return valid_flags
+
+
+def define_device(tpu=True):
+  """Register device specific flags.
+
+  Args:
+    tpu: Create flags to specify TPU operation (`tpu`, `tpu_zone`,
+      `tpu_gcp_project` and `num_tpu_shards`).
+
+  Returns:
+    A list of flags for core.py to mark as key flags: ["tpu"] when `tpu` is
+    true, otherwise an empty list.
+  """
+
+  key_flags = []
+
+  if tpu:
+    flags.DEFINE_string(
+        name="tpu", default=None,
+        help=help_wrap(
+            "The Cloud TPU to use for training. This should be either the name "
+            "used when creating the Cloud TPU, or a "
+            "grpc://ip.address.of.tpu:8470 url. Passing `local` will use the "
+            "CPU of the local instance instead. (Good for debugging.)"))
+    key_flags.append("tpu")
+
+    flags.DEFINE_string(
+        name="tpu_zone", default=None,
+        help=help_wrap(
+            "[Optional] GCE zone where the Cloud TPU is located in. If not "
+            "specified, we will attempt to automatically detect the GCE "
+            "project from metadata."))
+
+    flags.DEFINE_string(
+        name="tpu_gcp_project", default=None,
+        help=help_wrap(
+            "[Optional] Project name for the Cloud TPU-enabled project. If not "
+            "specified, we will attempt to automatically detect the GCE "
+            "project from metadata."))
+
+    flags.DEFINE_integer(name="num_tpu_shards", default=8,
+                         help=help_wrap("Number of shards (TPU chips)."))
+
+  return key_flags
@@ -0,0 +1,54 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Flags related to distributed execution."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import flags
+import tensorflow as tf
+
+from official.utils.flags._conventions import help_wrap
+
+
+def define_distribution(worker_hosts=True, task_index=True):
+  """Register the flags used for multi-worker (distributed) execution.
+
+  Args:
+    worker_hosts: When true, define `--worker_hosts`, a comma-separated list
+      of worker ip:port pairs (default None).
+    task_index: When true, define `--task_index`, the index of this worker
+      (default -1).
+
+  Returns:
+    A list of flags for core.py to mark as key flags. Nothing is appended
+    here, so the list is always empty.
+  """
+  key_flags = []
+
+  if worker_hosts:
+    worker_hosts_help = help_wrap(
+        'Comma-separated list of worker ip:port pairs for running '
+        'multi-worker models with DistributionStrategy.  The user would '
+        'start the program on each host with identical value for this '
+        'flag.')
+    flags.DEFINE_string(
+        name='worker_hosts', default=None, help=worker_hosts_help)
+
+  if task_index:
+    task_index_help = help_wrap(
+        'If multi-worker training, the task_index of this '
+        'worker.')
+    flags.DEFINE_integer(name='task_index', default=-1, help=task_index_help)
+
+  return key_flags
@@ -0,0 +1,50 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Misc flags."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import flags
+
+from official.utils.flags._conventions import help_wrap
+
+
+def define_image(data_format=True):
+  """Register the image-related flags.
+
+  Args:
+    data_format: When true, define `--data_format` (short name `df`), an enum
+      flag accepting "channels_first" or "channels_last" with default None.
+
+  Returns:
+    A list of flags for core.py to mark as key flags: ["data_format"] when the
+    flag was defined, otherwise an empty list.
+  """
+  if not data_format:
+    return []
+
+  data_format_help = help_wrap(
+      "A flag to override the data format used in the model. "
+      "channels_first provides a performance boost on GPU but is not "
+      "always compatible with CPU. If left unspecified, the data format "
+      "will be chosen automatically based on whether TensorFlow was "
+      "built for CPU or GPU.")
+  flags.DEFINE_enum(
+      name="data_format", short_name="df", default=None,
+      enum_values=["channels_first", "channels_last"],
+      help=data_format_help)
+
+  return ["data_format"]
@@ -0,0 +1,289 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Register flags for optimizing performance."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import multiprocessing
+
+from absl import flags    # pylint: disable=g-bad-import-order
+import tensorflow as tf   # pylint: disable=g-bad-import-order
+
+from official.utils.flags._conventions import help_wrap
+
+
+# Lookup table from the accepted --dtype strings to TensorFlow dtypes.
+DTYPE_MAP = dict(
+    fp16=tf.float16,
+    bf16=tf.bfloat16,
+    fp32=tf.float32,
+)
+
+
+def get_tf_dtype(flags_obj):
+  """Translate the dtype-related flags into a TensorFlow dtype.
+
+  Args:
+    flags_obj: Object exposing a `dtype` attribute (a key of DTYPE_MAP) and,
+      optionally, an `fp16_implementation` attribute.
+
+  Returns:
+    tf.float32 when `fp16_implementation` is "graph_rewrite"; otherwise the
+    DTYPE_MAP entry for `flags_obj.dtype`.
+  """
+  uses_graph_rewrite = (
+      getattr(flags_obj, "fp16_implementation", None) == "graph_rewrite")
+  # With the graph rewrite the graph is built in fp32 and the rewrite pass
+  # changes ops to fp16 afterwards.
+  return tf.float32 if uses_graph_rewrite else DTYPE_MAP[flags_obj.dtype]
+
+
+def get_loss_scale(flags_obj, default_for_fp16):
+  """Resolve the loss scale to use from the parsed flags.
+
+  Args:
+    flags_obj: Object exposing `loss_scale` plus the attributes read by
+      `get_tf_dtype`.
+    default_for_fp16: Value returned for fp16 when `loss_scale` is unset.
+
+  Returns:
+    The string "dynamic" when that was requested; the flag converted to float
+    when it is set to anything else; 1 for fp32/bf16 when it is unset; and
+    `default_for_fp16` for fp16 when it is unset.
+  """
+  dtype = get_tf_dtype(flags_obj)
+  requested = flags_obj.loss_scale
+
+  if requested == "dynamic":
+    return requested
+  if requested is not None:
+    return float(requested)
+
+  if dtype != tf.float32 and dtype != tf.bfloat16:
+    assert dtype == tf.float16
+    return default_for_fp16
+  return 1  # No loss scaling is needed for fp32 (or bf16).
+
+
+def define_performance(num_parallel_calls=False, inter_op=False, intra_op=False,
+                       synthetic_data=False, max_train_steps=False, dtype=False,
+                       all_reduce_alg=False, num_packs=False,
+                       tf_gpu_thread_mode=False,
+                       datasets_num_private_threads=False,
+                       datasets_num_parallel_batches=False,
+                       dynamic_loss_scale=False, fp16_implementation=False,
+                       loss_scale=False,
+                       tf_data_experimental_slack=False, enable_xla=False,
+                       training_dataset_cache=False):
+  """Register flags for specifying performance tuning arguments.
+
+  Every argument is a switch that decides whether the corresponding flag(s)
+  get defined. `loss_scale`, `dynamic_loss_scale` and `fp16_implementation`
+  are only honoured when `dtype` is also true.
+
+  Args:
+    num_parallel_calls: Create a flag to specify parallelism of data loading.
+    inter_op: Create a flag to allow specification of inter op threads.
+    intra_op: Create a flag to allow specification of intra op threads.
+    synthetic_data: Create a flag to allow the use of synthetic data.
+    max_train_steps: Create a flag to allow specification of maximum number
+      of training steps.
+    dtype: Create flags for specifying dtype.
+    all_reduce_alg: If set forces a specific algorithm for multi-gpu.
+    num_packs: If set provides number of packs for MirroredStrategy's cross
+      device ops.
+    tf_gpu_thread_mode: gpu_private triggers use of private thread pool.
+    datasets_num_private_threads: Number of private threads for datasets.
+    datasets_num_parallel_batches: Determines how many batches to process in
+      parallel when using map and batch from tf.data.
+    dynamic_loss_scale: Allow the "loss_scale" flag to take on the value
+      "dynamic". Only valid if `dtype` is True.
+    fp16_implementation: Create fp16_implementation flag.
+    loss_scale: Controls the loss scaling, normally for mixed-precision
+      training. Can only be turned on if dtype is also True.
+    tf_data_experimental_slack: Determines whether to enable tf.data's
+      `experimental_slack` option.
+    enable_xla: Determines if XLA (auto clustering) is turned on.
+    training_dataset_cache: Whether to cache the training dataset on workers.
+      Typically used to improve training performance when training data is in
+      remote storage and can fit into worker memory.
+
+  Returns:
+    A list of flags for core.py to mark as key flags. Nothing is appended to
+    it here, so it is always empty.
+  """
+
+  key_flags = []
+  if num_parallel_calls:
+    flags.DEFINE_integer(
+        name="num_parallel_calls", short_name="npc",
+        default=multiprocessing.cpu_count(),
+        help=help_wrap("The number of records that are processed in parallel "
+                       "during input processing. This can be optimized per "
+                       "data set but for generally homogeneous data sets, "
+                       "should be approximately the number of available CPU "
+                       "cores. (default behavior)"))
+
+  if inter_op:
+    flags.DEFINE_integer(
+        name="inter_op_parallelism_threads", short_name="inter", default=0,
+        help=help_wrap("Number of inter_op_parallelism_threads to use for CPU. "
+                       "See TensorFlow config.proto for details.")
+    )
+
+  if intra_op:
+    flags.DEFINE_integer(
+        name="intra_op_parallelism_threads", short_name="intra", default=0,
+        help=help_wrap("Number of intra_op_parallelism_threads to use for CPU. "
+                       "See TensorFlow config.proto for details."))
+
+  if synthetic_data:
+    flags.DEFINE_bool(
+        name="use_synthetic_data", short_name="synth", default=False,
+        help=help_wrap(
+            "If set, use fake data (zeroes) instead of a real dataset. "
+            "This mode is useful for performance debugging, as it removes "
+            "input processing steps, but will not learn anything."))
+
+  if max_train_steps:
+    flags.DEFINE_integer(
+        name="max_train_steps", short_name="mts", default=None, help=help_wrap(
+            "The model will stop training if the global_step reaches this "
+            "value. If not set, training will run until the specified number "
+            "of epochs have run as usual. It is generally recommended to set "
+            "--train_epochs=1 when using this flag."
+        ))
+
+  if dtype:
+    flags.DEFINE_enum(
+        name="dtype", short_name="dt", default="fp32",
+        enum_values=DTYPE_MAP.keys(),
+        help=help_wrap("The TensorFlow datatype used for calculations. "
+                       "Variables may be cast to a higher precision on a "
+                       "case-by-case basis for numerical stability."))
+
+    # The two "{}" slots are filled below depending on whether the string
+    # "dynamic" is an accepted value.
+    loss_scale_help_text = (
+        "The amount to scale the loss by when the model is run. {}. Before "
+        "gradients are computed, the loss is multiplied by the loss scale, "
+        "making all gradients loss_scale times larger. To adjust for this, "
+        "gradients are divided by the loss scale before being applied to "
+        "variables. This is mathematically equivalent to training without "
+        "a loss scale, but the loss scale helps avoid some intermediate "
+        "gradients from underflowing to zero. If not provided the default "
+        "for fp16 is 128 and 1 for all other dtypes.{}"
+    )
+    if dynamic_loss_scale:
+      loss_scale_help_text = loss_scale_help_text.format(
+          "This can be an int/float or the string 'dynamic'",
+          " The string 'dynamic' can be used to dynamically determine the "
+          "optimal loss scale during training, but currently this "
+          "significantly slows down performance")
+      loss_scale_validation_msg = ("loss_scale should be a positive int/float "
+                                   "or the string 'dynamic'.")
+    else:
+      loss_scale_help_text = loss_scale_help_text.format(
+          "This must be an int/float", "")
+      loss_scale_validation_msg = "loss_scale should be a positive int/float."
+    if loss_scale:
+      flags.DEFINE_string(
+          name="loss_scale", short_name="ls", default=None,
+          help=help_wrap(loss_scale_help_text))
+
+      @flags.validator(flag_name="loss_scale",
+                       message=loss_scale_validation_msg)
+      def _check_loss_scale(loss_scale):  # pylint: disable=unused-variable
+        """Validator to check the loss scale flag is valid."""
+        if loss_scale is None:
+          return True  # null case is handled in get_loss_scale()
+
+        if loss_scale == "dynamic" and dynamic_loss_scale:
+          return True
+
+        try:
+          loss_scale = float(loss_scale)
+        except ValueError:
+          return False
+
+        return loss_scale > 0
+
+    if fp16_implementation:
+      # enum_values must be a sequence of separate choices; a single string
+      # such as "keras', 'graph_rewrite" is not a list of two options.
+      flags.DEFINE_enum(
+          name="fp16_implementation", default="keras",
+          enum_values=("keras", "graph_rewrite"),
+          help=help_wrap(
+              "When --dtype=fp16, how fp16 should be implemented. This has no "
+              "impact on correctness. 'keras' uses the "
+              "tf.keras.mixed_precision API. 'graph_rewrite' uses the "
+              "tf.train.experimental.enable_mixed_precision_graph_rewrite "
+              "API."))
+
+      # Only the flags the check actually reads are listed, so registration
+      # does not depend on --loss_scale having been defined.
+      @flags.multi_flags_validator(["fp16_implementation", "dtype"])
+      def _check_fp16_implementation(flags_dict):  # pylint: disable=unused-variable
+        """Validator to check fp16_implementation flag is valid."""
+        if (flags_dict["fp16_implementation"] == "graph_rewrite" and
+            flags_dict["dtype"] != "fp16"):
+          raise flags.ValidationError("--fp16_implementation should not be "
+                                      "specified unless --dtype=fp16")
+        return True
+
+  if all_reduce_alg:
+    flags.DEFINE_string(
+        name="all_reduce_alg", short_name="ara", default=None,
+        help=help_wrap("Defines the algorithm to use for performing all-reduce. "
+                       "When specified with MirroredStrategy for single "
+                       "worker, this controls "
+                       "tf.contrib.distribute.AllReduceCrossTowerOps.  When "
+                       "specified with MultiWorkerMirroredStrategy, this "
+                       "controls "
+                       "tf.distribute.experimental.CollectiveCommunication; "
+                       "valid options are `ring` and `nccl`."))
+
+  if num_packs:
+    flags.DEFINE_integer(
+        name="num_packs", default=1,
+        help=help_wrap("Sets `num_packs` in the cross device ops used in "
+                       "MirroredStrategy.  For details, see "
+                       "tf.distribute.NcclAllReduce."))
+
+  if tf_gpu_thread_mode:
+    flags.DEFINE_string(
+        name="tf_gpu_thread_mode", short_name="gt_mode", default=None,
+        help=help_wrap(
+            "Whether and how the GPU device uses its own threadpool.")
+    )
+
+    flags.DEFINE_integer(
+        name="per_gpu_thread_count", short_name="pgtc", default=0,
+        help=help_wrap(
+            "The number of threads to use for GPU. Only valid when "
+            "tf_gpu_thread_mode is not global.")
+    )
+
+  if datasets_num_private_threads:
+    flags.DEFINE_integer(
+        name="datasets_num_private_threads",
+        default=None,
+        help=help_wrap(
+            "Number of threads for a private threadpool created for all "
+            "datasets computation.")
+    )
+
+  if datasets_num_parallel_batches:
+    flags.DEFINE_integer(
+        name="datasets_num_parallel_batches",
+        default=None,
+        help=help_wrap(
+            "Determines how many batches to process in parallel when using "
+            "map and batch from tf.data.")
+    )
+
+  if training_dataset_cache:
+    flags.DEFINE_boolean(
+        name="training_dataset_cache",
+        default=False,
+        help=help_wrap(
+            "Determines whether to cache the training dataset on workers. "
+            "Typically used to improve training performance when training "
+            "data is in remote storage and can fit into worker memory.")
+    )
+
+  if tf_data_experimental_slack:
+    flags.DEFINE_boolean(
+        name="tf_data_experimental_slack",
+        default=False,
+        help=help_wrap(
+            "Whether to enable tf.data's `experimental_slack` option.")
+    )
+
+  if enable_xla:
+    flags.DEFINE_boolean(
+        name="enable_xla", default=False,
+        help=help_wrap("Whether to enable XLA auto jit compilation"))
+
+  return key_flags
@@ -0,0 +1,133 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Public interface for flag definition.
+
+See _example.py for detailed instructions on defining flags.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+from six.moves import shlex_quote
+
+from absl import app as absl_app
+from absl import flags
+
+from official.utils.flags import _base
+from official.utils.flags import _benchmark
+from official.utils.flags import _conventions
+from official.utils.flags import _device
+from official.utils.flags import _distribution
+from official.utils.flags import _misc
+from official.utils.flags import _performance
+
+
+def set_defaults(**kwargs):
+  """Override the default value of flags.
+
+  Args:
+    **kwargs: Mapping of flag name to the new default; each pair is passed to
+      `flags.FLAGS.set_default`.
+  """
+  for flag_name in kwargs:
+    flags.FLAGS.set_default(name=flag_name, value=kwargs[flag_name])
+
+
+def parse_flags(argv=None):
+  """Reset flags and reparse. Currently only used in testing.
+
+  Args:
+    argv: Argument list to parse. When it is None or empty, `sys.argv` is
+      parsed instead.
+  """
+  flags.FLAGS.unparse_flags()
+  arguments = argv if argv else sys.argv
+  absl_app.parse_flags_with_usage(arguments)
+
+
+def register_key_flags_in_core(f):
+  """Wrap a flag-defining function so its key flags are declared in core.py.
+
+  absl uses the location of a flags.declare_key_flag() call to decide for
+  which module a flag is key. Declaring every key flag from this module lets
+  model main functions call flags.adopt_module_key_flags() on core and chain
+  key flags correctly.
+
+  Args:
+    f: The function to be wrapped. It must return an iterable of flag names.
+
+  Returns:
+    The "core-defined" version of the input function. It forwards all
+    arguments to `f`, declares each returned name as a key flag, and itself
+    returns None.
+  """
+
+  def core_fn(*args, **kwargs):
+    for flag_name in f(*args, **kwargs):
+      flags.declare_key_flag(flag_name)
+
+  return core_fn
+
+
+# Public flag-definition entry points. Each one wraps the private
+# implementation so that the key flags it returns are declared from this
+# module (see register_key_flags_in_core).
+define_base = register_key_flags_in_core(_base.define_base)
+# We have define_base_eager for compatibility, since it used to be a separate
+# function from define_base.
+define_base_eager = define_base
+define_log_steps = register_key_flags_in_core(_benchmark.define_log_steps)
+define_benchmark = register_key_flags_in_core(_benchmark.define_benchmark)
+define_device = register_key_flags_in_core(_device.define_device)
+define_image = register_key_flags_in_core(_misc.define_image)
+define_performance = register_key_flags_in_core(_performance.define_performance)
+define_distribution = register_key_flags_in_core(
+    _distribution.define_distribution)
+
+
+# Re-exported so callers can wrap help text with the shared convention.
+help_wrap = _conventions.help_wrap
+
+
+# Helpers and constants re-exported unchanged from the private modules.
+get_num_gpus = _base.get_num_gpus
+get_tf_dtype = _performance.get_tf_dtype
+get_loss_scale = _performance.get_loss_scale
+DTYPE_MAP = _performance.DTYPE_MAP
+require_cloud_storage = _device.require_cloud_storage
+
+def _get_nondefault_flags_as_dict():
+  """Collect the flags whose current value differs from their default.
+
+  Returns:
+    A dict from flag name to current value. Entries reached through a flag's
+    short name are left out.
+  """
+  changed = {}
+  for flag_name in flags.FLAGS:
+    current_value = getattr(flags.FLAGS, flag_name)
+    flag = flags.FLAGS[flag_name]
+    if flag_name == flag.short_name:
+      continue  # Entry reached through the flag's short name.
+    if current_value != flag.default:
+      changed[flag_name] = current_value
+  return changed
+
+
+def get_nondefault_flags_as_str():
+  """Render the nondefault flags as a command line fragment.
+
+  For example, the following returns "--batch_size=256 --use_synthetic_data":
+
+  ```
+  flags.FLAGS.batch_size = 256
+  flags.FLAGS.use_synthetic_data = True
+  print(get_nondefault_flags_as_str())
+  ```
+
+  Flags still at their default are omitted, since passing them on the command
+  line would have no effect. Flags are emitted sorted by name; booleans become
+  `--name` / `--noname`, lists are comma-joined, and each resulting argument
+  is shell-quoted.
+
+  Returns:
+    A string with the flags, that can be passed as command line arguments to a
+    program to use the flags.
+  """
+
+  def _render(name, value):
+    """Format one flag as a single, not yet quoted, command line argument."""
+    if isinstance(value, bool):
+      if value:
+        return '--{}'.format(name)
+      return '--no{}'.format(name)
+    if isinstance(value, list):
+      return '--{}={}'.format(name, ','.join(value))
+    return '--{}={}'.format(name, value)
+
+  ordered_items = sorted(_get_nondefault_flags_as_dict().items())
+  rendered = [_render(name, value) for name, value in ordered_items]
+  return ' '.join(shlex_quote(argument) for argument in rendered)
@@ -0,0 +1,162 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import unittest
+
+from absl import flags
+import tensorflow as tf
+
+from official.utils.flags import core as flags_core  # pylint: disable=g-bad-import-order
+
+
+def define_flags():
+  flags_core.define_base(clean=True, num_gpu=False, stop_threshold=True,
+                         hooks=True, train_epochs=True,
+                         epochs_between_evals=True)
+  flags_core.define_performance(
+      num_parallel_calls=True, inter_op=True,  intra_op=True,
+      dynamic_loss_scale=True, loss_scale=True, synthetic_data=True,
+      dtype=True)
+  flags_core.define_image()
+  flags_core.define_benchmark()
+
+
+class BaseTester(unittest.TestCase):
+
+  @classmethod
+  def setUpClass(cls):
+    super(BaseTester, cls).setUpClass()
+    define_flags()
+
+  def test_default_setting(self):
+    """Test to ensure fields exist and defaults can be set.
+    """
+
+    defaults = dict(
+        data_dir="dfgasf",
+        model_dir="dfsdkjgbs",
+        train_epochs=534,
+        epochs_between_evals=15,
+        batch_size=256,
+        hooks=["LoggingTensorHook"],
+        num_parallel_calls=18,
+        inter_op_parallelism_threads=5,
+        intra_op_parallelism_threads=10,
+        data_format="channels_first"
+    )
+
+    flags_core.set_defaults(**defaults)
+    flags_core.parse_flags()
+
+    for key, value in defaults.items():
+      assert flags.FLAGS.get_flag_value(name=key, default=None) == value
+
+  def test_benchmark_setting(self):
+    defaults = dict(
+        hooks=["LoggingMetricHook"],
+        benchmark_log_dir="/tmp/12345",
+        gcp_project="project_abc",
+    )
+
+    flags_core.set_defaults(**defaults)
+    flags_core.parse_flags()
+
+    for key, value in defaults.items():
+      assert flags.FLAGS.get_flag_value(name=key, default=None) == value
+
+  def test_booleans(self):
+    """Test to ensure boolean flags trigger as expected.
+    """
+
+    flags_core.parse_flags([__file__, "--use_synthetic_data"])
+
+    assert flags.FLAGS.use_synthetic_data
+
+  def test_parse_dtype_info(self):
+    flags_core.parse_flags([__file__, "--dtype", "fp16"])
+    self.assertEqual(flags_core.get_tf_dtype(flags.FLAGS), tf.float16)
+    self.assertEqual(flags_core.get_loss_scale(flags.FLAGS,
+                                               default_for_fp16=2), 2)
+
+    flags_core.parse_flags(
+        [__file__, "--dtype", "fp16", "--loss_scale", "5"])
+    self.assertEqual(flags_core.get_loss_scale(flags.FLAGS,
+                                               default_for_fp16=2), 5)
+
+    flags_core.parse_flags(
+        [__file__, "--dtype", "fp16", "--loss_scale", "dynamic"])
+    self.assertEqual(flags_core.get_loss_scale(flags.FLAGS,
+                                               default_for_fp16=2), "dynamic")
+
+    flags_core.parse_flags([__file__, "--dtype", "fp32"])
+    self.assertEqual(flags_core.get_tf_dtype(flags.FLAGS), tf.float32)
+    self.assertEqual(flags_core.get_loss_scale(flags.FLAGS,
+                                               default_for_fp16=2), 1)
+
+    flags_core.parse_flags([__file__, "--dtype", "fp32", "--loss_scale", "5"])
+    self.assertEqual(flags_core.get_loss_scale(flags.FLAGS,
+                                               default_for_fp16=2), 5)
+
+
+    with self.assertRaises(SystemExit):
+      flags_core.parse_flags([__file__, "--dtype", "int8"])
+
+    with self.assertRaises(SystemExit):
+      flags_core.parse_flags([__file__, "--dtype", "fp16",
+                              "--loss_scale", "abc"])
+
+  def test_get_nondefault_flags_as_str(self):
+    defaults = dict(
+        clean=True,
+        data_dir="abc",
+        hooks=["LoggingTensorHook"],
+        stop_threshold=1.5,
+        use_synthetic_data=False
+    )
+    flags_core.set_defaults(**defaults)
+    flags_core.parse_flags()
+
+    expected_flags = ""
+    self.assertEqual(flags_core.get_nondefault_flags_as_str(), expected_flags)
+
+    flags.FLAGS.clean = False
+    expected_flags += "--noclean"
+    self.assertEqual(flags_core.get_nondefault_flags_as_str(), expected_flags)
+
+    flags.FLAGS.data_dir = "xyz"
+    expected_flags += " --data_dir=xyz"
+    self.assertEqual(flags_core.get_nondefault_flags_as_str(), expected_flags)
+
+    flags.FLAGS.hooks = ["aaa", "bbb", "ccc"]
+    expected_flags += " --hooks=aaa,bbb,ccc"
+    self.assertEqual(flags_core.get_nondefault_flags_as_str(), expected_flags)
+
+    flags.FLAGS.stop_threshold = 3.
+    expected_flags += " --stop_threshold=3.0"
+    self.assertEqual(flags_core.get_nondefault_flags_as_str(), expected_flags)
+
+    flags.FLAGS.use_synthetic_data = True
+    expected_flags += " --use_synthetic_data"
+    self.assertEqual(flags_core.get_nondefault_flags_as_str(), expected_flags)
+
+    # Assert that explicit setting a flag to its default value does not cause it
+    # to appear in the string
+    flags.FLAGS.use_synthetic_data = False
+    expected_flags = expected_flags[:-len(" --use_synthetic_data")]
+    self.assertEqual(flags_core.get_nondefault_flags_as_str(), expected_flags)
+
+
+if __name__ == "__main__":
+  unittest.main()
@@ -0,0 +1,65 @@
+# Using flags in official models
+
+1. **All common flags must be incorporated in the models.**
+
+   Common flags (i.e. batch_size, model_dir, etc.) are provided by various flag definition functions,
+   and channeled through `official.utils.flags.core`. For instance to define common supervised
+   learning parameters one could use the following code:
+
+   ```python
+   from absl import app as absl_app
+   from absl import flags
+
+   from official.utils.flags import core as flags_core
+
+
+   def define_flags():
+     flags_core.define_base()
+     flags.adopt_key_flags(flags_core)
+
+
+   def main(_):
+     flags_obj = flags.FLAGS
+     print(flags_obj)
+
+
+   if __name__ == "__main__":
+     absl_app.run(main)
+   ```
+2. **Validate flag values.**
+
+   See the [Validators](#validators) section for implementation details.
+
+   Validators in the official model repo should not access the file system, such as verifying
+   that files exist, due to the strict ordering requirements.
+
+3. **Flag values should not be mutated.**
+
+   Instead of mutating flag values, use getter functions to return the desired values. An example
+   getter function is `get_tf_dtype` function below:
+
+   ```
+   # Map string to TensorFlow dtype
+   DTYPE_MAP = {
+       "fp16": tf.float16,
+       "fp32": tf.float32,
+   }
+
+   def get_tf_dtype(flags_obj):
+     if getattr(flags_obj, "fp16_implementation", None) == "graph_rewrite":
+       # If the graph_rewrite is used, we build the graph with fp32, and let the
+       # graph rewrite change ops to fp16.
+       return tf.float32
+     return DTYPE_MAP[flags_obj.dtype]
+
+
+   def main(_):
+     flags_obj = flags.FLAGS
+
+     # Do not mutate flags_obj
+     # if flags_obj.fp16_implementation == "graph_rewrite":
+     #   flags_obj.dtype = "float32" # Don't do this
+
+     print(get_tf_dtype(flags_obj))
+     ...
+   ```
@@ -0,0 +1,119 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Common flags for importing hyperparameters."""
+
+from __future__ import absolute_import
+from __future__ import division
+# from __future__ import google_type_annotations
+from __future__ import print_function
+
+from absl import flags
+from official.utils.flags import core as flags_core
+
+FLAGS = flags.FLAGS
+
+
+def define_common_hparams_flags():
+  """Define the common flags across models."""
+
+  flags.DEFINE_string(
+      'model_dir',
+      default=None,
+      help=('The directory where the model and training/evaluation summaries'
+            'are stored.'))
+
+  flags.DEFINE_integer(
+      'train_batch_size', default=None, help='Batch size for training.')
+
+  flags.DEFINE_integer(
+      'eval_batch_size', default=None, help='Batch size for evaluation.')
+
+  flags.DEFINE_string(
+      'precision',
+      default=None,
+      help=('Precision to use; one of: {bfloat16, float32}'))
+
+  flags.DEFINE_string(
+      'config_file',
+      default=None,
+      help=('A YAML file which specifies overrides. Note that this file can be '
+            'used as an override template to override the default parameters '
+            'specified in Python. If the same parameter is specified in both '
+            '`--config_file` and `--params_override`, the one in '
+            '`--params_override` will be used finally.'))
+
+  flags.DEFINE_string(
+      'params_override',
+      default=None,
+      help=('a YAML/JSON string or a YAML file which specifies additional '
+            'overrides over the default parameters and those specified in '
+            '`--config_file`. Note that this is supposed to be used only to '
+            'override the model parameters, but not the parameters like TPU '
+            'specific flags. One canonical use case of `--config_file` and '
+            '`--params_override` is users first define a template config file '
+            'using `--config_file`, then use `--params_override` to adjust the '
+            'minimal set of tuning parameters, for example setting up different'
+            ' `train_batch_size`. '
+            'The final override order of parameters: default_model_params --> '
+            'params from config_file --> params in params_override.'
+            'See also the help message of `--config_file`.'))
+  flags.DEFINE_integer('save_checkpoint_freq', None,
+                       'Number of steps to save checkpoint.')
+
+
+def initialize_common_flags():
+  """Define the common flags across models."""
+  define_common_hparams_flags()
+
+  flags_core.define_device(tpu=True)
+  flags_core.define_base(
+      num_gpu=True, model_dir=False, data_dir=False, batch_size=False)
+  flags_core.define_distribution(worker_hosts=True, task_index=True)
+  flags_core.define_performance(all_reduce_alg=True, num_packs=True)
+
+  # Reset the default value of num_gpus to zero.
+  FLAGS.num_gpus = 0
+
+  flags.DEFINE_string(
+      'strategy_type', 'mirrored', 'Type of distribute strategy.'
+      'One of mirrored, tpu and multiworker.')
+
+
+def strategy_flags_dict():
+  """Returns TPU and/or GPU related flags in a dictionary."""
+  return {
+      # TPUStrategy related flags.
+      'tpu': FLAGS.tpu,
+      # MultiWorkerMirroredStrategy related flags.
+      'all_reduce_alg': FLAGS.all_reduce_alg,
+      'worker_hosts': FLAGS.worker_hosts,
+      'task_index': FLAGS.task_index,
+      # MirroredStrategy and OneDeviceStrategy
+      'num_gpus': FLAGS.num_gpus,
+      'num_packs': FLAGS.num_packs,
+  }
+
+
+def hparam_flags_dict():
+  """Returns model params related flags in a dictionary."""
+  return {
+      'data_dir': FLAGS.data_dir,
+      'model_dir': FLAGS.model_dir,
+      'train_batch_size': FLAGS.train_batch_size,
+      'eval_batch_size': FLAGS.eval_batch_size,
+      'precision': FLAGS.precision,
+      'config_file': FLAGS.config_file,
+      'params_override': FLAGS.params_override,
+  }
@@ -0,0 +1,34 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Utilities that interact with cloud service.
+"""
+
+import requests
+
+GCP_METADATA_URL = "http://metadata/computeMetadata/v1/instance/hostname"
+GCP_METADATA_HEADER = {"Metadata-Flavor": "Google"}
+
+
+def on_gcp():
+  """Detect whether the current running environment is on GCP."""
+  try:
+    # Timeout in 5 seconds, in case the test environment has connectivity issue.
+    # There is not default timeout, which means it might block forever.
+    response = requests.get(
+        GCP_METADATA_URL, headers=GCP_METADATA_HEADER, timeout=5)
+    return response.status_code == 200
+  except requests.exceptions.RequestException:
+    return False
@@ -0,0 +1,48 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for cloud_lib."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+
+import mock
+import requests
+
+from official.utils.logs import cloud_lib
+
+
+class CloudLibTest(unittest.TestCase):
+
+  @mock.patch("requests.get")
+  def test_on_gcp(self, mock_requests_get):
+    mock_response = mock.MagicMock()
+    mock_requests_get.return_value = mock_response
+    mock_response.status_code = 200
+
+    self.assertEqual(cloud_lib.on_gcp(), True)
+
+  @mock.patch("requests.get")
+  def test_not_on_gcp(self, mock_requests_get):
+    mock_requests_get.side_effect = requests.exceptions.ConnectionError()
+
+    self.assertEqual(cloud_lib.on_gcp(), False)
+
+
+if __name__ == "__main__":
+  unittest.main()
@@ -0,0 +1,58 @@
+# Logging in official models
+
+This library adds logging functions that print or save tensor values. Official models should define all common hooks
+(using hooks helper) and a benchmark logger.
+
+1. **Training Hooks**
+
+   Hooks are a TensorFlow concept that define specific actions at certain points of the execution. We use them to obtain and log
+   tensor values during training.
+
+   hooks_helper.py provides an easy way to create common hooks. The following hooks are currently defined:
+   * LoggingTensorHook: Logs tensor values
+   * ProfilerHook: Writes a timeline json that can be loaded into chrome://tracing.
+   * ExamplesPerSecondHook: Logs the number of examples processed per second.
+   * LoggingMetricHook: Similar to LoggingTensorHook, except that the tensors are logged in a format defined by our data
+     analysis pipeline.
+
+
+2. **Benchmarks**
+
+   The benchmark logger provides useful functions for logging environment information, and evaluation results.
+   The module also contains a context which is used to update the status of the run.
+
+Example usage:
+
+```
+from absl import app as absl_app
+from absl import flags
+
+from official.utils.logs import hooks_helper
+from official.utils.logs import logger
+
+def model_main(flags_obj):
+  estimator = ...
+
+  benchmark_logger = logger.get_benchmark_logger()
+  benchmark_logger.log_run_info(...)
+
+  train_hooks = hooks_helper.get_train_hooks(...)
+
+  for epoch in range(10):
+    estimator.train(..., hooks=train_hooks)
+    eval_results = estimator.evaluate(...)
+
+    # Log a dictionary of metrics
+    benchmark_logger.log_evaluation_result(eval_results)
+
+    # Log an individual metric
+    benchmark_logger.log_metric(...)
+
+
+def main(_):
+  with logger.benchmark_context(flags.FLAGS):
+    model_main(flags.FLAGS)
+
+if __name__ == "__main__":
+  # define flags
+  absl_app.run(main)
+```
@@ -0,0 +1,146 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Hook that counts examples per second every N steps or seconds."""
+
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from hccl.manage.api import get_rank_size
+import tensorflow as tf  # pylint: disable=g-bad-import-order
+from official.utils.logs import logger
+from benchmark_log import hwlog
+import time
+import sys
+
+
+class ExamplesPerSecondHook(tf.estimator.SessionRunHook):
+  """Hook to print out examples per second.
+
+  Total time is tracked and then divided by the total number of steps
+  to get the average step time and then batch_size is used to determine
+  the running average of examples per second. The examples per second for the
+  most recent interval is also logged.
+  """
+
+  def __init__(self,
+               batch_size,
+               every_n_steps=None,
+               every_n_secs=None,
+               warm_steps=0,
+               metric_logger=None):
+    """Initializer for ExamplesPerSecondHook.
+
+    Args:
+      batch_size: Total batch size across all workers used to calculate
+        examples/second from global time.
+      every_n_steps: Log stats every n steps.
+      every_n_secs: Log stats every n seconds. Exactly one of the
+        `every_n_steps` or `every_n_secs` should be set.
+      warm_steps: The number of steps to be skipped before logging and running
+        average calculation. warm_steps steps refers to global steps across all
+        workers, not on each worker
+      metric_logger: instance of `BenchmarkLogger`, the benchmark logger that
+          hook should use to write the log. If None, BaseBenchmarkLogger will
+          be used.
+
+    Raises:
+      ValueError: if neither `every_n_steps` or `every_n_secs` is set, or
+      both are set.
+    """
+    if (every_n_steps is None) == (every_n_secs is None):
+      raise ValueError("exactly one of every_n_steps"
+                       " and every_n_secs should be provided.")
+
+    self._logger = metric_logger or logger.BaseBenchmarkLogger()
+
+    self._timer = tf.estimator.SecondOrStepTimer(
+        every_steps=every_n_steps, every_secs=every_n_secs)
+
+    self._step_train_time = 0
+    self._total_steps = 0
+    self._batch_size = batch_size
+    self._warm_steps = warm_steps
+    # List of examples per second logged every_n_steps.
+    self.current_examples_per_sec_list = []
+
+  def begin(self):
+    """Called once before using the session to check global step."""
+    tf.compat.v1.logging.warning("##########ExamplesPerSecondHook begin")
+    self._global_step_tensor = tf.compat.v1.train.get_global_step()
+    if self._global_step_tensor is None:
+      raise RuntimeError(
+          "Global step should be created to use StepCounterHook.")
+
+  def before_run(self, run_context):  # pylint: disable=unused-argument
+    """Called before each call to run().
+
+    Args:
+      run_context: A SessionRunContext object.
+
+    Returns:
+      A SessionRunArgs object or None if never triggered.
+    """
+    self.t0 = time.time()
+    tf.compat.v1.logging.warning("##########ExamplesPerSecondHook before")
+    return tf.estimator.SessionRunArgs(self._global_step_tensor)
+
+  def after_run(self, run_context, run_values):  # pylint: disable=unused-argument
+    """Called after each call to run().
+
+    Args:
+      run_context: A SessionRunContext object.
+      run_values: A SessionRunValues object.
+    """
+    tf.compat.v1.logging.warning("##########ExamplesPerSecondHook after_run")
+    global_step = run_values.results
+
+    #if self._timer.should_trigger_for_step(
+        #global_step) and global_step > self._warm_steps:
+    elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(
+        global_step)
+    batch_time = time.time() - self.t0
+    ips = self._batch_size/batch_time
+    if elapsed_time is not None:
+      self._step_train_time += elapsed_time
+      self._total_steps += elapsed_steps
+
+      # average examples per second is based on the total (accumulative)
+      # training steps and training time so far
+      average_examples_per_sec = self._batch_size * (
+          self._total_steps / self._step_train_time)
+      # current examples per second is based on the elapsed training steps
+      # and training time per batch
+      current_examples_per_sec = self._batch_size * get_rank_size() * (
+          elapsed_steps / elapsed_time)
+      # Logs entries to be read from hook during or after run.
+      self.current_examples_per_sec_list.append(current_examples_per_sec)
+      self._logger.log_metric(
+          "average_examples_per_sec", average_examples_per_sec,
+          global_step=global_step)
+
+      self._logger.log_metric(
+          "current_examples_per_sec", current_examples_per_sec,
+          global_step=global_step)
+      tf.compat.v1.logging.warning(
+            "steps: %s,elapsed_steps:%d,batch:%d,FPS:%f,ips:%f,batch_time:%f", int(self._total_steps),
+            int(elapsed_steps),int(self._batch_size),float(current_examples_per_sec),float(ips),
+            float(batch_time))
+      # get FPS info, add by wx933135
+      #date_time = hwlog.get_time()
+      #remark_logger.info("ABK time_ts: %s, fps: %f, steps: %s, file: %s, lineno: %s" % (date_time,
+      # float(current_examples_per_sec), int(self._total_steps), file_name, sys._getframe().f_lineno))
+      hwlog.remark_print(key=hwlog.FPS, value=float(current_examples_per_sec))
@@ -0,0 +1,172 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Hooks helper to return a list of TensorFlow hooks for training by name.
+
+More hooks can be added to this set. To add a new hook, 1) add the new hook to
+the registry in HOOKS, 2) add a corresponding function that parses out necessary
+parameters.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf  # pylint: disable=g-bad-import-order
+
+from official.utils.logs import hooks
+from official.utils.logs import logger
+from official.utils.logs import metric_hook
+
+_TENSORS_TO_LOG = dict((x, x) for x in ['learning_rate',
+                                        'cross_entropy',
+                                        'train_accuracy'])
+
+
+def get_train_hooks(name_list, use_tpu=False, **kwargs):
+  """Factory for getting a list of TensorFlow hooks for training by name.
+
+  Args:
+    name_list: a list of strings to name desired hook classes. Allowed:
+      LoggingTensorHook, ProfilerHook, ExamplesPerSecondHook, which are defined
+      as keys in HOOKS
+    use_tpu: Boolean of whether computation occurs on a TPU. This will disable
+      hooks altogether.
+    **kwargs: a dictionary of arguments to the hooks.
+
+  Returns:
+    list of instantiated hooks, ready to be used in a classifier.train call.
+
+  Raises:
+    ValueError: if an unrecognized name is passed.
+  """
+
+  if not name_list:
+    return []
+
+  if use_tpu:
+    tf.compat.v1.logging.warning('hooks_helper received name_list `{}`, but a '
+                                 'TPU is specified. No hooks will be used.'
+                                 .format(name_list))
+    return []
+
+  train_hooks = []
+  for name in name_list:
+    hook_name = HOOKS.get(name.strip().lower())
+    if hook_name is None:
+      raise ValueError('Unrecognized training hook requested: {}'.format(name))
+    else:
+      train_hooks.append(hook_name(**kwargs))
+
+  return train_hooks
+
+
+def get_logging_tensor_hook(every_n_iter=100, tensors_to_log=None, **kwargs):  # pylint: disable=unused-argument
+  """Function to get LoggingTensorHook.
+
+  Args:
+    every_n_iter: `int`, print the values of `tensors` once every N local
+      steps taken on the current worker.
+    tensors_to_log: List of tensor names or dictionary mapping labels to tensor
+      names. If not set, log _TENSORS_TO_LOG by default.
+    **kwargs: a dictionary of arguments to LoggingTensorHook.
+
+  Returns:
+    Returns a LoggingTensorHook with a standard set of tensors that will be
+    printed to stdout.
+  """
+  if tensors_to_log is None:
+    tensors_to_log = _TENSORS_TO_LOG
+
+  return tf.estimator.LoggingTensorHook(
+      tensors=tensors_to_log,
+      every_n_iter=every_n_iter)
+
+
+def get_profiler_hook(model_dir, save_steps=1000, **kwargs):  # pylint: disable=unused-argument
+  """Function to get ProfilerHook.
+
+  Args:
+    model_dir: The directory to save the profile traces to.
+    save_steps: `int`, print profile traces every N steps.
+    **kwargs: a dictionary of arguments to ProfilerHook.
+
+  Returns:
+    Returns a ProfilerHook that writes out timelines that can be loaded into
+    profiling tools like chrome://tracing.
+  """
+  return tf.estimator.ProfilerHook(save_steps=save_steps, output_dir=model_dir)
+
+
+def get_examples_per_second_hook(every_n_steps=100,
+                                 batch_size=128,
+                                 warm_steps=5,
+                                 **kwargs):  # pylint: disable=unused-argument
+  """Function to get ExamplesPerSecondHook.
+
+  Args:
+    every_n_steps: `int`, print current and average examples per second every
+      N steps.
+    batch_size: `int`, total batch size used to calculate examples/second from
+      global time.
+    warm_steps: skip this number of steps before logging and running average.
+    **kwargs: a dictionary of arguments to ExamplesPerSecondHook.
+
+  Returns:
+    Returns a ProfilerHook that writes out timelines that can be loaded into
+    profiling tools like chrome://tracing.
+  """
+  return hooks.ExamplesPerSecondHook(
+      batch_size=batch_size, every_n_steps=every_n_steps,
+      warm_steps=warm_steps, metric_logger=logger.get_benchmark_logger())
+
+
+def get_logging_metric_hook(tensors_to_log=None,
+                            every_n_secs=600,
+                            **kwargs):  # pylint: disable=unused-argument
+  """Function to get LoggingMetricHook.
+
+  Args:
+    tensors_to_log: List of tensor names or dictionary mapping labels to tensor
+      names. If not set, log _TENSORS_TO_LOG by default.
+    every_n_secs: `int`, the frequency for logging the metric. Default to every
+      10 mins.
+    **kwargs: a dictionary of arguments.
+
+  Returns:
+    Returns a LoggingMetricHook that saves tensor values in a JSON format.
+  """
+  if tensors_to_log is None:
+    tensors_to_log = _TENSORS_TO_LOG
+  return metric_hook.LoggingMetricHook(
+      tensors=tensors_to_log,
+      metric_logger=logger.get_benchmark_logger(),
+      every_n_secs=every_n_secs)
+
+
+def get_step_counter_hook(**kwargs):
+  """Function to get StepCounterHook.
+
+  Args:
+    **kwargs: accepted so all hook factories share one signature; discarded.
+
+  Returns:
+    A tf.estimator.StepCounterHook constructed with its default arguments.
+  """
+  del kwargs
+  return tf.estimator.StepCounterHook()
+
+
+# Registry mapping a lower-cased hook name to the factory function that builds
+# it. get_train_hooks strips and lower-cases the requested names before the
+# lookup, so keys here must be all lower case.
+HOOKS = {
+    'loggingtensorhook': get_logging_tensor_hook,
+    'profilerhook': get_profiler_hook,
+    'examplespersecondhook': get_examples_per_second_hook,
+    'loggingmetrichook': get_logging_metric_hook,
+    'stepcounterhook': get_step_counter_hook
+}
@@ -0,0 +1,73 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for hooks_helper."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+
+import tensorflow as tf  # pylint: disable=g-bad-import-order
+
+from official.utils.logs import hooks_helper
+from official.utils.misc import keras_utils
+
+
+class BaseTest(unittest.TestCase):
+  """Tests for hooks_helper.get_train_hooks."""
+
+  def setUp(self):
+    """Disables eager execution before each test when the v2 check is truthy."""
+    super(BaseTest, self).setUp()
+    # NOTE(review): `is_v2_0` is referenced without being called; if it is a
+    # function this condition is always true -- confirm against keras_utils.
+    if keras_utils.is_v2_0:
+      tf.compat.v1.disable_eager_execution()
+
+  def test_raise_in_non_list_names(self):
+    """A single comma separated string instead of a list raises ValueError."""
+    with self.assertRaises(ValueError):
+      hooks_helper.get_train_hooks(
+          'LoggingTensorHook, ProfilerHook', model_dir="", batch_size=256)
+
+  def test_raise_in_invalid_names(self):
+    """A list containing an unknown hook name raises ValueError."""
+    # NOTE(review): presumably only 'StopAtStepHook' triggers the error here;
+    # verify whether 'StepCounterHook' is still meant to count as invalid.
+    invalid_names = ['StepCounterHook', 'StopAtStepHook']
+    with self.assertRaises(ValueError):
+      hooks_helper.get_train_hooks(invalid_names, model_dir="", batch_size=256)
+
+  def validate_train_hook_name(self,
+                               test_hook_name,
+                               expected_hook_name,
+                               **kwargs):
+    """Checks that one requested name yields one hook of the expected class.
+
+    Args:
+      test_hook_name: the hook name passed to get_train_hooks.
+      expected_hook_name: the expected lower-cased class name of the hook.
+      **kwargs: extra keyword arguments forwarded to get_train_hooks.
+    """
+    returned_hook = hooks_helper.get_train_hooks(
+        [test_hook_name], model_dir="", **kwargs)
+    self.assertEqual(len(returned_hook), 1)
+    self.assertIsInstance(returned_hook[0], tf.estimator.SessionRunHook)
+    self.assertEqual(returned_hook[0].__class__.__name__.lower(),
+                     expected_hook_name)
+
+  def test_get_train_hooks_logging_tensor_hook(self):
+    """'LoggingTensorHook' resolves to a loggingtensorhook instance."""
+    self.validate_train_hook_name('LoggingTensorHook', 'loggingtensorhook')
+
+  def test_get_train_hooks_profiler_hook(self):
+    """'ProfilerHook' resolves to a profilerhook instance."""
+    self.validate_train_hook_name('ProfilerHook', 'profilerhook')
+
+  def test_get_train_hooks_examples_per_second_hook(self):
+    """'ExamplesPerSecondHook' resolves to an examplespersecondhook instance."""
+    self.validate_train_hook_name('ExamplesPerSecondHook',
+                                  'examplespersecondhook')
+
+  def test_get_logging_metric_hook(self):
+    """'LoggingMetricHook' resolves to a loggingmetrichook instance."""
+    test_hook_name = 'LoggingMetricHook'
+    self.validate_train_hook_name(test_hook_name, 'loggingmetrichook')
+
+if __name__ == '__main__':
+  tf.test.main()
@@ -0,0 +1,158 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for hooks."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import tensorflow as tf  # pylint: disable=g-bad-import-order
+
+from official.utils.logs import hooks
+from official.utils.testing import mock_lib
+
+tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.DEBUG)
+
+
+class ExamplesPerSecondHookTest(tf.test.TestCase):
+  """Tests for the ExamplesPerSecondHook.
+
+  In the test, we explicitly run global_step tensor after train_op in order to
+  keep the global_step value and the train_op (which increase the glboal_step
+  by 1) consistent. This is to correct the discrepancies in reported global_step
+  value when running on GPUs.
+  """
+
+  def setUp(self):
+    """Mock out logging calls to verify if correct info is being monitored."""
+    self._logger = mock_lib.MockBenchmarkLogger()
+
+    self.graph = tf.Graph()
+    with self.graph.as_default():
+      tf.compat.v1.train.create_global_step()
+      self.train_op = tf.compat.v1.assign_add(
+          tf.compat.v1.train.get_global_step(), 1)
+      self.global_step = tf.compat.v1.train.get_global_step()
+
+  def test_raise_in_both_secs_and_steps(self):
+    with self.assertRaises(ValueError):
+      hooks.ExamplesPerSecondHook(
+          batch_size=256,
+          every_n_steps=10,
+          every_n_secs=20,
+          metric_logger=self._logger)
+
+  def test_raise_in_none_secs_and_steps(self):
+    with self.assertRaises(ValueError):
+      hooks.ExamplesPerSecondHook(
+          batch_size=256,
+          every_n_steps=None,
+          every_n_secs=None,
+          metric_logger=self._logger)
+
+  def _validate_log_every_n_steps(self, every_n_steps, warm_steps):
+    hook = hooks.ExamplesPerSecondHook(
+        batch_size=256,
+        every_n_steps=every_n_steps,
+        warm_steps=warm_steps,
+        metric_logger=self._logger)
+
+    with tf.compat.v1.train.MonitoredSession(
+        tf.compat.v1.train.ChiefSessionCreator(), [hook]) as mon_sess:
+      for _ in range(every_n_steps):
+        # Explicitly run global_step after train_op to get the accurate
+        # global_step value
+        mon_sess.run(self.train_op)
+        mon_sess.run(self.global_step)
+        # Nothing should be in the list yet
+        self.assertFalse(self._logger.logged_metric)
+
+      mon_sess.run(self.train_op)
+      global_step_val = mon_sess.run(self.global_step)
+
+      if global_step_val > warm_steps:
+        self._assert_metrics()
+      else:
+        # Nothing should be in the list yet
+        self.assertFalse(self._logger.logged_metric)
+
+      # Add additional run to verify proper reset when called multiple times.
+      prev_log_len = len(self._logger.logged_metric)
+      mon_sess.run(self.train_op)
+      global_step_val = mon_sess.run(self.global_step)
+
+      if every_n_steps == 1 and global_step_val > warm_steps:
+        # Each time, we log two additional metrics. Did exactly 2 get added?
+        self.assertEqual(len(self._logger.logged_metric), prev_log_len + 2)
+      else:
+        # No change in the size of the metric list.
+        self.assertEqual(len(self._logger.logged_metric), prev_log_len)
+
+  def test_examples_per_sec_every_1_steps(self):
+    with self.graph.as_default():
+      self._validate_log_every_n_steps(1, 0)
+
+  def test_examples_per_sec_every_5_steps(self):
+    with self.graph.as_default():
+      self._validate_log_every_n_steps(5, 0)
+
+  def test_examples_per_sec_every_1_steps_with_warm_steps(self):
+    with self.graph.as_default():
+      self._validate_log_every_n_steps(1, 10)
+
+  def test_examples_per_sec_every_5_steps_with_warm_steps(self):
+    with self.graph.as_default():
+      self._validate_log_every_n_steps(5, 10)
+
+  def _validate_log_every_n_secs(self, every_n_secs):
+    hook = hooks.ExamplesPerSecondHook(
+        batch_size=256,
+        every_n_steps=None,
+        every_n_secs=every_n_secs,
+        metric_logger=self._logger)
+
+    with tf.compat.v1.train.MonitoredSession(
+        tf.compat.v1.train.ChiefSessionCreator(), [hook]) as mon_sess:
+      # Explicitly run global_step after train_op to get the accurate
+      # global_step value
+      mon_sess.run(self.train_op)
+      mon_sess.run(self.global_step)
+      # Nothing should be in the list yet
+      self.assertFalse(self._logger.logged_metric)
+      time.sleep(every_n_secs)
+
+      mon_sess.run(self.train_op)
+      mon_sess.run(self.global_step)
+      self._assert_metrics()
+
+  def test_examples_per_sec_every_1_secs(self):
+    with self.graph.as_default():
+      self._validate_log_every_n_secs(1)
+
+  def test_examples_per_sec_every_5_secs(self):
+    with self.graph.as_default():
+      self._validate_log_every_n_secs(5)
+
+  def _assert_metrics(self):
+    metrics = self._logger.logged_metric
+    self.assertEqual(metrics[-2]["name"], "average_examples_per_sec")
+    self.assertEqual(metrics[-1]["name"], "current_examples_per_sec")
+
+
+# Run this module's tests when the file is executed as a script.
+if __name__ == "__main__":
+  tf.test.main()
@@ -0,0 +1,423 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Logging utilities for benchmark.
+
+For collecting local environment metrics like CPU and memory, certain Python
+packages need to be installed. See README for details.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import datetime
+import json
+import multiprocessing
+import numbers
+import os
+import threading
+import uuid
+
+from six.moves import _thread as thread
+from absl import flags
+import tensorflow as tf
+from tensorflow.python.client import device_lib
+
+from official.utils.logs import cloud_lib
+
+# File names that BenchmarkFileLogger writes inside its logging directory.
+METRIC_LOG_FILE_NAME = "metric.log"
+BENCHMARK_RUN_LOG_FILE_NAME = "benchmark_run.log"
+# strftime pattern used for the "run_date" and "timestamp" fields.
+_DATE_TIME_FORMAT_PATTERN = "%Y-%m-%dT%H:%M:%S.%fZ"
+# Value stored under "test_environment" when cloud_lib.on_gcp() is true.
+GCP_TEST_ENV = "GCP"
+# Run status values; benchmark_context passes success/failure to on_finish()
+# and BenchmarkBigQueryLogger.log_run_info records the running status.
+RUN_STATUS_SUCCESS = "success"
+RUN_STATUS_FAILURE = "failure"
+RUN_STATUS_RUNNING = "running"
+
+
+# absl flag registry used when config_benchmark_logger gets no flag object.
+FLAGS = flags.FLAGS
+
+# Don't use it directly. Use get_benchmark_logger to access a logger.
+_benchmark_logger = None
+# Held by config_benchmark_logger while it replaces _benchmark_logger.
+_logger_lock = threading.Lock()
+
+
+def config_benchmark_logger(flag_obj=None):
+  """Config the global benchmark logger."""
+  _logger_lock.acquire()
+  try:
+    global _benchmark_logger
+    if not flag_obj:
+      flag_obj = FLAGS
+
+    if (not hasattr(flag_obj, "benchmark_logger_type") or
+        flag_obj.benchmark_logger_type == "BaseBenchmarkLogger"):
+      _benchmark_logger = BaseBenchmarkLogger()
+    elif flag_obj.benchmark_logger_type == "BenchmarkFileLogger":
+      _benchmark_logger = BenchmarkFileLogger(flag_obj.benchmark_log_dir)
+    elif flag_obj.benchmark_logger_type == "BenchmarkBigQueryLogger":
+      from official.benchmark import benchmark_uploader as bu  # pylint: disable=g-import-not-at-top
+      bq_uploader = bu.BigQueryUploader(gcp_project=flag_obj.gcp_project)
+      _benchmark_logger = BenchmarkBigQueryLogger(
+          bigquery_uploader=bq_uploader,
+          bigquery_data_set=flag_obj.bigquery_data_set,
+          bigquery_run_table=flag_obj.bigquery_run_table,
+          bigquery_run_status_table=flag_obj.bigquery_run_status_table,
+          bigquery_metric_table=flag_obj.bigquery_metric_table,
+          run_id=str(uuid.uuid4()))
+    else:
+      raise ValueError("Unrecognized benchmark_logger_type: %s"
+                       % flag_obj.benchmark_logger_type)
+
+  finally:
+    _logger_lock.release()
+  return _benchmark_logger
+
+
+def get_benchmark_logger():
+  if not _benchmark_logger:
+    config_benchmark_logger()
+  return _benchmark_logger
+
+
+@contextlib.contextmanager
+def benchmark_context(flag_obj):
+  """Context of benchmark, which will update status of the run accordingly."""
+  benchmark_logger = config_benchmark_logger(flag_obj)
+  try:
+    yield
+    benchmark_logger.on_finish(RUN_STATUS_SUCCESS)
+  except Exception:  # pylint: disable=broad-except
+    # Catch all the exception, update the run status to be failure, and re-raise
+    benchmark_logger.on_finish(RUN_STATUS_FAILURE)
+    raise
+
+
+class BaseBenchmarkLogger(object):
+  """Class to log the benchmark information to STDOUT."""
+
+  def log_evaluation_result(self, eval_results):
+    """Log the evaluation result.
+
+    The evaluate result is a dictionary that contains metrics defined in
+    model_fn. It also contains a entry for global_step which contains the value
+    of the global step when evaluation was performed.
+
+    Args:
+      eval_results: dict, the result of evaluate.
+    """
+    if not isinstance(eval_results, dict):
+      tf.compat.v1.logging.warning(
+          "eval_results should be dictionary for logging. Got %s",
+          type(eval_results))
+      return
+    global_step = eval_results[tf.compat.v1.GraphKeys.GLOBAL_STEP]
+    for key in sorted(eval_results):
+      if key != tf.compat.v1.GraphKeys.GLOBAL_STEP:
+        self.log_metric(key, eval_results[key], global_step=global_step)
+
+  def log_metric(self, name, value, unit=None, global_step=None, extras=None):
+    """Log the benchmark metric information to local file.
+
+    Currently the logging is done in a synchronized way. This should be updated
+    to log asynchronously.
+
+    Args:
+      name: string, the name of the metric to log.
+      value: number, the value of the metric. The value will not be logged if it
+        is not a number type.
+      unit: string, the unit of the metric, E.g "image per second".
+      global_step: int, the global_step when the metric is logged.
+      extras: map of string:string, the extra information about the metric.
+    """
+    metric = _process_metric_to_json(name, value, unit, global_step, extras)
+    if metric:
+      tf.compat.v1.logging.info("Benchmark metric: %s", metric)
+
+  def log_run_info(self, model_name, dataset_name, run_params, test_id=None):
+    tf.compat.v1.logging.info(
+        "Benchmark run: %s", _gather_run_info(model_name, dataset_name,
+                                              run_params, test_id))
+
+  def on_finish(self, status):
+    pass
+
+
+class BenchmarkFileLogger(BaseBenchmarkLogger):
+  """Class to log the benchmark information to local disk."""
+
+  def __init__(self, logging_dir):
+    super(BenchmarkFileLogger, self).__init__()
+    self._logging_dir = logging_dir
+    if not tf.io.gfile.isdir(self._logging_dir):
+      tf.io.gfile.makedirs(self._logging_dir)
+    self._metric_file_handler = tf.io.gfile.GFile(
+        os.path.join(self._logging_dir, METRIC_LOG_FILE_NAME), "a")
+
+  def log_metric(self, name, value, unit=None, global_step=None, extras=None):
+    """Log the benchmark metric information to local file.
+
+    Currently the logging is done in a synchronized way. This should be updated
+    to log asynchronously.
+
+    Args:
+      name: string, the name of the metric to log.
+      value: number, the value of the metric. The value will not be logged if it
+        is not a number type.
+      unit: string, the unit of the metric, E.g "image per second".
+      global_step: int, the global_step when the metric is logged.
+      extras: map of string:string, the extra information about the metric.
+    """
+    metric = _process_metric_to_json(name, value, unit, global_step, extras)
+    if metric:
+      try:
+        json.dump(metric, self._metric_file_handler)
+        self._metric_file_handler.write("\n")
+        self._metric_file_handler.flush()
+      except (TypeError, ValueError) as e:
+        tf.compat.v1.logging.warning(
+            "Failed to dump metric to log file: name %s, value %s, error %s",
+            name, value, e)
+
+  def log_run_info(self, model_name, dataset_name, run_params, test_id=None):
+    """Collect most of the TF runtime information for the local env.
+
+    The schema of the run info follows official/benchmark/datastore/schema.
+
+    Args:
+      model_name: string, the name of the model.
+      dataset_name: string, the name of dataset for training and evaluation.
+      run_params: dict, the dictionary of parameters for the run, it could
+        include hyperparameters or other params that are important for the run.
+      test_id: string, the unique name of the test run by the combination of key
+        parameters, eg batch size, num of GPU. It is hardware independent.
+    """
+    run_info = _gather_run_info(model_name, dataset_name, run_params, test_id)
+
+    with tf.io.gfile.GFile(os.path.join(
+        self._logging_dir, BENCHMARK_RUN_LOG_FILE_NAME), "w") as f:
+      try:
+        json.dump(run_info, f)
+        f.write("\n")
+      except (TypeError, ValueError) as e:
+        tf.compat.v1.logging.warning(
+            "Failed to dump benchmark run info to log file: %s", e)
+
+  def on_finish(self, status):
+    self._metric_file_handler.flush()
+    self._metric_file_handler.close()
+
+
+class BenchmarkBigQueryLogger(BaseBenchmarkLogger):
+  """Class to log the benchmark information to BigQuery data store."""
+
+  def __init__(self,
+               bigquery_uploader,
+               bigquery_data_set,
+               bigquery_run_table,
+               bigquery_run_status_table,
+               bigquery_metric_table,
+               run_id):
+    super(BenchmarkBigQueryLogger, self).__init__()
+    self._bigquery_uploader = bigquery_uploader
+    self._bigquery_data_set = bigquery_data_set
+    self._bigquery_run_table = bigquery_run_table
+    self._bigquery_run_status_table = bigquery_run_status_table
+    self._bigquery_metric_table = bigquery_metric_table
+    self._run_id = run_id
+
+  def log_metric(self, name, value, unit=None, global_step=None, extras=None):
+    """Log the benchmark metric information to bigquery.
+
+    Args:
+      name: string, the name of the metric to log.
+      value: number, the value of the metric. The value will not be logged if it
+        is not a number type.
+      unit: string, the unit of the metric, E.g "image per second".
+      global_step: int, the global_step when the metric is logged.
+      extras: map of string:string, the extra information about the metric.
+    """
+    metric = _process_metric_to_json(name, value, unit, global_step, extras)
+    if metric:
+      # Starting new thread for bigquery upload in case it might take long time
+      # and impact the benchmark and performance measurement. Starting a new
+      # thread might have potential performance impact for model that run on
+      # CPU.
+      thread.start_new_thread(
+          self._bigquery_uploader.upload_benchmark_metric_json,
+          (self._bigquery_data_set,
+           self._bigquery_metric_table,
+           self._run_id,
+           [metric]))
+
+  def log_run_info(self, model_name, dataset_name, run_params, test_id=None):
+    """Collect most of the TF runtime information for the local env.
+
+    The schema of the run info follows official/benchmark/datastore/schema.
+
+    Args:
+      model_name: string, the name of the model.
+      dataset_name: string, the name of dataset for training and evaluation.
+      run_params: dict, the dictionary of parameters for the run, it could
+        include hyperparameters or other params that are important for the run.
+      test_id: string, the unique name of the test run by the combination of key
+        parameters, eg batch size, num of GPU. It is hardware independent.
+    """
+    run_info = _gather_run_info(model_name, dataset_name, run_params, test_id)
+    # Starting new thread for bigquery upload in case it might take long time
+    # and impact the benchmark and performance measurement. Starting a new
+    # thread might have potential performance impact for model that run on CPU.
+    thread.start_new_thread(
+        self._bigquery_uploader.upload_benchmark_run_json,
+        (self._bigquery_data_set,
+         self._bigquery_run_table,
+         self._run_id,
+         run_info))
+    thread.start_new_thread(
+        self._bigquery_uploader.insert_run_status,
+        (self._bigquery_data_set,
+         self._bigquery_run_status_table,
+         self._run_id,
+         RUN_STATUS_RUNNING))
+
+  def on_finish(self, status):
+    self._bigquery_uploader.update_run_status(
+        self._bigquery_data_set,
+        self._bigquery_run_status_table,
+        self._run_id,
+        status)
+
+
+def _gather_run_info(model_name, dataset_name, run_params, test_id):
+  """Collect the benchmark run information for the local environment."""
+  run_info = {
+      "model_name": model_name,
+      "dataset": {"name": dataset_name},
+      "machine_config": {},
+      "test_id": test_id,
+      "run_date": datetime.datetime.utcnow().strftime(
+          _DATE_TIME_FORMAT_PATTERN)}
+  _collect_tensorflow_info(run_info)
+  _collect_tensorflow_environment_variables(run_info)
+  _collect_run_params(run_info, run_params)
+  _collect_cpu_info(run_info)
+  _collect_memory_info(run_info)
+  _collect_test_environment(run_info)
+  return run_info
+
+
+def _process_metric_to_json(
+    name, value, unit=None, global_step=None, extras=None):
+  """Validate the metric data and generate JSON for insert."""
+  if not isinstance(value, numbers.Number):
+    tf.compat.v1.logging.warning(
+        "Metric value to log should be a number. Got %s", type(value))
+    return None
+
+  extras = _convert_to_json_dict(extras)
+  return {
+      "name": name,
+      "value": float(value),
+      "unit": unit,
+      "global_step": global_step,
+      "timestamp": datetime.datetime.utcnow().strftime(
+          _DATE_TIME_FORMAT_PATTERN),
+      "extras": extras}
+
+
+def _collect_tensorflow_info(run_info):
+  run_info["tensorflow_version"] = {
+      "version": tf.version.VERSION, "git_hash": tf.version.GIT_VERSION}
+
+
+def _collect_run_params(run_info, run_params):
+  """Log the parameter information for the benchmark run."""
+  def process_param(name, value):
+    type_check = {
+        str: {"name": name, "string_value": value},
+        int: {"name": name, "long_value": value},
+        bool: {"name": name, "bool_value": str(value)},
+        float: {"name": name, "float_value": value},
+    }
+    return type_check.get(type(value),
+                          {"name": name, "string_value": str(value)})
+  if run_params:
+    run_info["run_parameters"] = [
+        process_param(k, v) for k, v in sorted(run_params.items())]
+
+
+def _collect_tensorflow_environment_variables(run_info):
+  run_info["tensorflow_environment_variables"] = [
+      {"name": k, "value": v}
+      for k, v in sorted(os.environ.items()) if k.startswith("TF_")]
+
+
+# The following code is mirrored from tensorflow/tools/test/system_info_lib
+# which is not exposed for import.
+def _collect_cpu_info(run_info):
+  """Collect the CPU information for the local environment."""
+  cpu_info = {}
+
+  cpu_info["num_cores"] = multiprocessing.cpu_count()
+
+  try:
+    # Note: cpuinfo is not installed in the TensorFlow OSS tree.
+    # It is installable via pip.
+    import cpuinfo    # pylint: disable=g-import-not-at-top
+
+    info = cpuinfo.get_cpu_info()
+    cpu_info["cpu_info"] = info["brand"]
+    cpu_info["mhz_per_cpu"] = info["hz_advertised_raw"][0] / 1.0e6
+
+    run_info["machine_config"]["cpu_info"] = cpu_info
+  except ImportError:
+    tf.compat.v1.logging.warn(
+        "'cpuinfo' not imported. CPU info will not be logged.")
+
+
+def _collect_memory_info(run_info):
+  try:
+    # Note: psutil is not installed in the TensorFlow OSS tree.
+    # It is installable via pip.
+    import psutil   # pylint: disable=g-import-not-at-top
+    vmem = psutil.virtual_memory()
+    run_info["machine_config"]["memory_total"] = vmem.total
+    run_info["machine_config"]["memory_available"] = vmem.available
+  except ImportError:
+    tf.compat.v1.logging.warn(
+        "'psutil' not imported. Memory info will not be logged.")
+
+
+def _collect_test_environment(run_info):
+  """Detect the local environment, eg GCE, AWS or DGX, etc."""
+  if cloud_lib.on_gcp():
+    run_info["test_environment"] = GCP_TEST_ENV
+  # TODO(scottzhu): Add more testing env detection for other platform
+
+
+def _parse_gpu_model(physical_device_desc):
+  # Assume all the GPU connected are same model
+  for kv in physical_device_desc.split(","):
+    k, _, v = kv.partition(":")
+    if k.strip() == "name":
+      return v.strip()
+  return None
+
+
+def _convert_to_json_dict(input_dict):
+  if input_dict:
+    return [{"name": k, "value": v} for k, v in sorted(input_dict.items())]
+  else:
+    return []
@@ -0,0 +1,365 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for benchmark logger."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import os
+import tempfile
+import time
+import unittest
+
+import mock
+from absl.testing import flagsaver
+import tensorflow as tf  # pylint: disable=g-bad-import-order
+
+try:
+  from google.cloud import bigquery
+except ImportError:
+  bigquery = None
+
+from official.utils.misc import keras_utils
+from official.utils.flags import core as flags_core
+from official.utils.logs import logger
+
+
+class BenchmarkLoggerTest(tf.test.TestCase):
+
+  @classmethod
+  def setUpClass(cls):  # pylint: disable=invalid-name
+    super(BenchmarkLoggerTest, cls).setUpClass()
+    flags_core.define_benchmark()
+
+  def test_get_default_benchmark_logger(self):
+    with flagsaver.flagsaver(benchmark_logger_type="foo"):
+      self.assertIsInstance(logger.get_benchmark_logger(),
+                            logger.BaseBenchmarkLogger)
+
+  def test_config_base_benchmark_logger(self):
+    with flagsaver.flagsaver(benchmark_logger_type="BaseBenchmarkLogger"):
+      logger.config_benchmark_logger()
+      self.assertIsInstance(logger.get_benchmark_logger(),
+                            logger.BaseBenchmarkLogger)
+
+  def test_config_benchmark_file_logger(self):
+    # Set the benchmark_log_dir first since the benchmark_logger_type will need
+    # the value to be set when it does the validation.
+    with flagsaver.flagsaver(benchmark_log_dir="/tmp"):
+      with flagsaver.flagsaver(benchmark_logger_type="BenchmarkFileLogger"):
+        logger.config_benchmark_logger()
+        self.assertIsInstance(logger.get_benchmark_logger(),
+                              logger.BenchmarkFileLogger)
+
+  @unittest.skipIf(bigquery is None, "Bigquery dependency is not installed.")
+  @mock.patch.object(bigquery, "Client")
+  def test_config_benchmark_bigquery_logger(self, mock_bigquery_client):
+    with flagsaver.flagsaver(benchmark_logger_type="BenchmarkBigQueryLogger"):
+      logger.config_benchmark_logger()
+      self.assertIsInstance(logger.get_benchmark_logger(),
+                            logger.BenchmarkBigQueryLogger)
+
+  @mock.patch("official.utils.logs.logger.config_benchmark_logger")
+  def test_benchmark_context(self, mock_config_benchmark_logger):
+    mock_logger = mock.MagicMock()
+    mock_config_benchmark_logger.return_value = mock_logger
+    with logger.benchmark_context(None):
+      tf.compat.v1.logging.info("start benchmarking")
+    mock_logger.on_finish.assert_called_once_with(logger.RUN_STATUS_SUCCESS)
+
+  @mock.patch("official.utils.logs.logger.config_benchmark_logger")
+  def test_benchmark_context_failure(self, mock_config_benchmark_logger):
+    mock_logger = mock.MagicMock()
+    mock_config_benchmark_logger.return_value = mock_logger
+    with self.assertRaises(RuntimeError):
+      with logger.benchmark_context(None):
+        raise RuntimeError("training error")
+    mock_logger.on_finish.assert_called_once_with(logger.RUN_STATUS_FAILURE)
+
+
+class BaseBenchmarkLoggerTest(tf.test.TestCase):
+
+  def setUp(self):
+    super(BaseBenchmarkLoggerTest, self).setUp()
+    self._actual_log = tf.compat.v1.logging.info
+    self.logged_message = None
+
+    def mock_log(*args, **kwargs):
+      self.logged_message = args
+      self._actual_log(*args, **kwargs)
+
+    tf.compat.v1.logging.info = mock_log
+
+  def tearDown(self):
+    super(BaseBenchmarkLoggerTest, self).tearDown()
+    tf.compat.v1.logging.info = self._actual_log
+
+  def test_log_metric(self):
+    log = logger.BaseBenchmarkLogger()
+    log.log_metric("accuracy", 0.999, global_step=1e4, extras={"name": "value"})
+
+    expected_log_prefix = "Benchmark metric:"
+    self.assertRegexpMatches(str(self.logged_message), expected_log_prefix)
+
+
+class BenchmarkFileLoggerTest(tf.test.TestCase):
+
+  def setUp(self):
+    super(BenchmarkFileLoggerTest, self).setUp()
+    # Avoid pulling extra env vars from test environment which affects the test
+    # result, eg. Kokoro test has a TF_PKG env which affect the test case
+    # test_collect_tensorflow_environment_variables()
+    self.original_environ = dict(os.environ)
+    os.environ.clear()
+
+  def tearDown(self):
+    super(BenchmarkFileLoggerTest, self).tearDown()
+    tf.io.gfile.rmtree(self.get_temp_dir())
+    os.environ.clear()
+    os.environ.update(self.original_environ)
+
+  def test_create_logging_dir(self):
+    non_exist_temp_dir = os.path.join(self.get_temp_dir(), "unknown_dir")
+    self.assertFalse(tf.io.gfile.isdir(non_exist_temp_dir))
+
+    logger.BenchmarkFileLogger(non_exist_temp_dir)
+    self.assertTrue(tf.io.gfile.isdir(non_exist_temp_dir))
+
+  def test_log_metric(self):
+    log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
+    log = logger.BenchmarkFileLogger(log_dir)
+    log.log_metric("accuracy", 0.999, global_step=1e4, extras={"name": "value"})
+
+    metric_log = os.path.join(log_dir, "metric.log")
+    self.assertTrue(tf.io.gfile.exists(metric_log))
+    with tf.io.gfile.GFile(metric_log) as f:
+      metric = json.loads(f.readline())
+      self.assertEqual(metric["name"], "accuracy")
+      self.assertEqual(metric["value"], 0.999)
+      self.assertEqual(metric["unit"], None)
+      self.assertEqual(metric["global_step"], 1e4)
+      self.assertEqual(metric["extras"], [{"name": "name", "value": "value"}])
+
+  def test_log_multiple_metrics(self):
+    log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
+    log = logger.BenchmarkFileLogger(log_dir)
+    log.log_metric("accuracy", 0.999, global_step=1e4, extras={"name": "value"})
+    log.log_metric("loss", 0.02, global_step=1e4)
+
+    metric_log = os.path.join(log_dir, "metric.log")
+    self.assertTrue(tf.io.gfile.exists(metric_log))
+    with tf.io.gfile.GFile(metric_log) as f:
+      accuracy = json.loads(f.readline())
+      self.assertEqual(accuracy["name"], "accuracy")
+      self.assertEqual(accuracy["value"], 0.999)
+      self.assertEqual(accuracy["unit"], None)
+      self.assertEqual(accuracy["global_step"], 1e4)
+      self.assertEqual(accuracy["extras"], [{"name": "name", "value": "value"}])
+
+      loss = json.loads(f.readline())
+      self.assertEqual(loss["name"], "loss")
+      self.assertEqual(loss["value"], 0.02)
+      self.assertEqual(loss["unit"], None)
+      self.assertEqual(loss["global_step"], 1e4)
+      self.assertEqual(loss["extras"], [])
+
+  def test_log_non_number_value(self):
+    log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
+    log = logger.BenchmarkFileLogger(log_dir)
+    const = tf.constant(1)
+    log.log_metric("accuracy", const)
+
+    metric_log = os.path.join(log_dir, "metric.log")
+    self.assertFalse(tf.io.gfile.exists(metric_log))
+
+  def test_log_evaluation_result(self):
+    eval_result = {"loss": 0.46237424,
+                   "global_step": 207082,
+                   "accuracy": 0.9285}
+    log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
+    log = logger.BenchmarkFileLogger(log_dir)
+    log.log_evaluation_result(eval_result)
+
+    metric_log = os.path.join(log_dir, "metric.log")
+    self.assertTrue(tf.io.gfile.exists(metric_log))
+    with tf.io.gfile.GFile(metric_log) as f:
+      accuracy = json.loads(f.readline())
+      self.assertEqual(accuracy["name"], "accuracy")
+      self.assertEqual(accuracy["value"], 0.9285)
+      self.assertEqual(accuracy["unit"], None)
+      self.assertEqual(accuracy["global_step"], 207082)
+
+      loss = json.loads(f.readline())
+      self.assertEqual(loss["name"], "loss")
+      self.assertEqual(loss["value"], 0.46237424)
+      self.assertEqual(loss["unit"], None)
+      self.assertEqual(loss["global_step"], 207082)
+
+  def test_log_evaluation_result_with_invalid_type(self):
+    eval_result = "{'loss': 0.46237424, 'global_step': 207082}"
+    log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
+    log = logger.BenchmarkFileLogger(log_dir)
+    log.log_evaluation_result(eval_result)
+
+    metric_log = os.path.join(log_dir, "metric.log")
+    self.assertFalse(tf.io.gfile.exists(metric_log))
+
+  @mock.patch("official.utils.logs.logger._gather_run_info")
+  def test_log_run_info(self, mock_gather_run_info):
+    log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
+    log = logger.BenchmarkFileLogger(log_dir)
+    run_info = {"model_name": "model_name",
+                "dataset": "dataset_name",
+                "run_info": "run_value"}
+    mock_gather_run_info.return_value = run_info
+    log.log_run_info("model_name", "dataset_name", {})
+
+    run_log = os.path.join(log_dir, "benchmark_run.log")
+    self.assertTrue(tf.io.gfile.exists(run_log))
+    with tf.io.gfile.GFile(run_log) as f:
+      run_info = json.loads(f.readline())
+      self.assertEqual(run_info["model_name"], "model_name")
+      self.assertEqual(run_info["dataset"], "dataset_name")
+      self.assertEqual(run_info["run_info"], "run_value")
+
+  def test_collect_tensorflow_info(self):
+    run_info = {}
+    logger._collect_tensorflow_info(run_info)
+    self.assertNotEqual(run_info["tensorflow_version"], {})
+    self.assertEqual(run_info["tensorflow_version"]["version"],
+                     tf.version.VERSION)
+    self.assertEqual(run_info["tensorflow_version"]["git_hash"],
+                     tf.version.GIT_VERSION)
+
+  def test_collect_run_params(self):
+    run_info = {}
+    run_parameters = {
+        "batch_size": 32,
+        "synthetic_data": True,
+        "train_epochs": 100.00,
+        "dtype": "fp16",
+        "resnet_size": 50,
+        "random_tensor": tf.constant(2.0)
+    }
+    logger._collect_run_params(run_info, run_parameters)
+    self.assertEqual(len(run_info["run_parameters"]), 6)
+    self.assertEqual(run_info["run_parameters"][0],
+                     {"name": "batch_size", "long_value": 32})
+    self.assertEqual(run_info["run_parameters"][1],
+                     {"name": "dtype", "string_value": "fp16"})
+    v1_tensor = {"name": "random_tensor", "string_value":
+                     "Tensor(\"Const:0\", shape=(), dtype=float32)"}
+    v2_tensor = {"name": "random_tensor", "string_value":
+                     "tf.Tensor(2.0, shape=(), dtype=float32)"}
+    self.assertIn(run_info["run_parameters"][2], [v1_tensor, v2_tensor])
+
+
+    self.assertEqual(run_info["run_parameters"][3],
+                     {"name": "resnet_size", "long_value": 50})
+    self.assertEqual(run_info["run_parameters"][4],
+                     {"name": "synthetic_data", "bool_value": "True"})
+    self.assertEqual(run_info["run_parameters"][5],
+                     {"name": "train_epochs", "float_value": 100.00})
+
+  def test_collect_tensorflow_environment_variables(self):
+    os.environ["TF_ENABLE_WINOGRAD_NONFUSED"] = "1"
+    os.environ["TF_OTHER"] = "2"
+    os.environ["OTHER"] = "3"
+
+    run_info = {}
+    logger._collect_tensorflow_environment_variables(run_info)
+    self.assertIsNotNone(run_info["tensorflow_environment_variables"])
+    expected_tf_envs = [
+        {"name": "TF_ENABLE_WINOGRAD_NONFUSED", "value": "1"},
+        {"name": "TF_OTHER", "value": "2"},
+    ]
+    self.assertEqual(run_info["tensorflow_environment_variables"],
+                     expected_tf_envs)
+
+  def test_collect_memory_info(self):
+    run_info = {"machine_config": {}}
+    logger._collect_memory_info(run_info)
+    self.assertIsNotNone(run_info["machine_config"]["memory_total"])
+    self.assertIsNotNone(run_info["machine_config"]["memory_available"])
+
+
+@unittest.skipIf(bigquery is None, "Bigquery dependency is not installed.")
+class BenchmarkBigQueryLoggerTest(tf.test.TestCase):
+  """Tests for BenchmarkBigQueryLogger using a mocked BigQuery uploader."""
+
+  def setUp(self):
+    """Clears os.environ and builds a logger around a MagicMock uploader."""
+    super(BenchmarkBigQueryLoggerTest, self).setUp()
+    # Avoid pulling extra env vars from test environment which affects the test
+    # result, eg. Kokoro test has a TF_PKG env which affect the test case
+    # test_collect_tensorflow_environment_variables()
+    self.original_environ = dict(os.environ)
+    os.environ.clear()
+
+    self.mock_bq_uploader = mock.MagicMock()
+    self.logger = logger.BenchmarkBigQueryLogger(
+        self.mock_bq_uploader, "dataset", "run_table", "run_status_table",
+        "metric_table", "run_id")
+
+  def tearDown(self):
+    """Removes the temp dir and restores the environment saved in setUp."""
+    super(BenchmarkBigQueryLoggerTest, self).tearDown()
+    tf.io.gfile.rmtree(self.get_temp_dir())
+    os.environ.clear()
+    os.environ.update(self.original_environ)
+
+  def test_log_metric(self):
+    """log_metric uploads one metric record with the expected fields."""
+    self.logger.log_metric(
+        "accuracy", 0.999, global_step=1e4, extras={"name": "value"})
+    expected_metric_json = [{
+        "name": "accuracy",
+        "value": 0.999,
+        "unit": None,
+        "global_step": 1e4,
+        "timestamp": mock.ANY,
+        "extras": [{"name": "name", "value": "value"}]
+    }]
+    # log_metric will call upload_benchmark_metric_json in a separate thread.
+    # Give it some grace period for the new thread before assert.
+    time.sleep(1)
+    self.mock_bq_uploader.upload_benchmark_metric_json.assert_called_once_with(
+        "dataset", "metric_table", "run_id", expected_metric_json)
+
+  @mock.patch("official.utils.logs.logger._gather_run_info")
+  def test_log_run_info(self, mock_gather_run_info):
+    """log_run_info uploads the run info and inserts a "running" status."""
+    run_info = {"model_name": "model_name",
+                "dataset": "dataset_name",
+                "run_info": "run_value"}
+    mock_gather_run_info.return_value = run_info
+    self.logger.log_run_info("model_name", "dataset_name", {})
+    # log_run_info is expected to call upload_benchmark_run_json and
+    # insert_run_status, presumably in a separate thread like log_metric.
+    # Give it some grace period for the new thread before assert.
+    time.sleep(1)
+    self.mock_bq_uploader.upload_benchmark_run_json.assert_called_once_with(
+        "dataset", "run_table", "run_id", run_info)
+    self.mock_bq_uploader.insert_run_status.assert_called_once_with(
+        "dataset", "run_status_table", "run_id", "running")
+
+  def test_on_finish(self):
+    """on_finish forwards the final status to update_run_status."""
+    self.logger.on_finish(logger.RUN_STATUS_SUCCESS)
+    # on_finish is expected to call update_run_status, presumably in a
+    # separate thread like log_metric.
+    # Give it some grace period for the new thread before assert.
+    time.sleep(1)
+    self.mock_bq_uploader.update_run_status.assert_called_once_with(
+        "dataset", "run_status_table", "run_id", logger.RUN_STATUS_SUCCESS)
+
+
+# Run the tests in this file when it is executed directly.
+if __name__ == "__main__":
+  tf.test.main()
@@ -0,0 +1,97 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Session hook for logging benchmark metric."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf  # pylint: disable=g-bad-import-order
+
+
+class LoggingMetricHook(tf.estimator.LoggingTensorHook):
+  """Hook to log benchmark metric information.
+
+  This hook is very similar to its base class tf.estimator.LoggingTensorHook,
+  which logs given tensors every N local steps, every N seconds, or at the
+  end. Here each tensor value is passed to `metric_logger.log_metric` together
+  with the current global step, so it can be consumed by a data analysis
+  pipeline later.
+
+  Note that if `at_end` is True, `tensors` should not include any tensor
+  whose evaluation produces a side effect such as consuming additional inputs.
+  """
+
+  def __init__(self, tensors, metric_logger=None,
+               every_n_iter=None, every_n_secs=None, at_end=False):
+    """Initializer for LoggingMetricHook.
+
+    Args:
+      tensors: `dict` that maps string-valued tags to tensors/tensor names,
+          or `iterable` of tensors/tensor names.
+      metric_logger: instance of `BenchmarkLogger`, the benchmark logger that
+          hook should use to write the log. Required despite the None default.
+      every_n_iter: `int`, log the values of `tensors` once every N local
+          steps taken on the current worker.
+      every_n_secs: `int` or `float`, log the values of `tensors` once every N
+          seconds. Exactly one of `every_n_iter` and `every_n_secs` should be
+          provided.
+      at_end: `bool` specifying whether to log the values of `tensors` at the
+          end of the run.
+
+    Raises:
+      ValueError:
+        1. `every_n_iter` is non-positive, or
+        2. Exactly one of every_n_iter and every_n_secs should be provided.
+        3. `metric_logger` is None.
+    """
+    # The base class is initialized first, so its argument errors are raised
+    # before the metric_logger check below.
+    super(LoggingMetricHook, self).__init__(
+        tensors=tensors,
+        every_n_iter=every_n_iter,
+        every_n_secs=every_n_secs,
+        at_end=at_end)
+
+    if metric_logger is None:
+      raise ValueError("metric_logger should be provided.")
+    self._logger = metric_logger
+
+  def begin(self):
+    """Adds the global step tensor to the set of tensors being fetched.
+
+    Raises:
+      RuntimeError: if no global step tensor exists.
+    """
+    super(LoggingMetricHook, self).begin()
+    self._global_step_tensor = tf.compat.v1.train.get_global_step()
+    if self._global_step_tensor is None:
+      raise RuntimeError(
+          "Global step should be created to use LoggingMetricHook.")
+    # The global step is fetched alongside the user tensors so every logged
+    # metric can be tagged with it.
+    if self._global_step_tensor.name not in self._current_tensors:
+      self._current_tensors[self._global_step_tensor.name] = (
+          self._global_step_tensor)
+
+  def after_run(self, unused_run_context, run_values):
+    """Logs the fetched values if this step triggered, then counts the step."""
+    # should_trigger is a internal state that populated at before_run, and it is
+    # using self_timer to determine whether it should trigger.
+    if self._should_trigger:
+      self._log_metric(run_values.results)
+
+    self._iter_count += 1
+
+  def end(self, session):
+    """Evaluates and logs the tensors one final time when `at_end` was set."""
+    if self._log_at_end:
+      values = session.run(self._current_tensors)
+      self._log_metric(values)
+
+  def _log_metric(self, tensor_values):
+    """Sends every tagged value in `tensor_values` to the metric logger.
+
+    Args:
+      tensor_values: mapping indexed by tag and by the global step tensor
+        name, holding the evaluated values.
+    """
+    self._timer.update_last_triggered_step(self._iter_count)
+    global_step = tensor_values[self._global_step_tensor.name]
+    # self._tag_order is populated during the init of LoggingTensorHook
+    for tag in self._tag_order:
+      self._logger.log_metric(tag, tensor_values[tag], global_step=global_step)
@@ -0,0 +1,217 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for metric_hook."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tempfile
+import time
+
+import tensorflow as tf  # pylint: disable=g-bad-import-order
+from tensorflow.python.training import monitored_session  # pylint: disable=g-bad-import-order
+
+from official.utils.logs import metric_hook
+from official.utils.testing import mock_lib
+
+
+class LoggingMetricHookTest(tf.test.TestCase):
+  """Tests for LoggingMetricHook."""
+
+  def setUp(self):
+    """Creates a temp log dir and a mock benchmark logger for each test."""
+    super(LoggingMetricHookTest, self).setUp()
+
+    self._log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
+    self._logger = mock_lib.MockBenchmarkLogger()
+
+  def tearDown(self):
+    """Removes the temp directory tree used by the test."""
+    super(LoggingMetricHookTest, self).tearDown()
+    tf.io.gfile.rmtree(self.get_temp_dir())
+
+  def test_illegal_args(self):
+    """Invalid constructor argument combinations raise ValueError."""
+    with self.assertRaisesRegexp(ValueError, "nvalid every_n_iter"):
+      metric_hook.LoggingMetricHook(tensors=["t"], every_n_iter=0)
+    with self.assertRaisesRegexp(ValueError, "nvalid every_n_iter"):
+      metric_hook.LoggingMetricHook(tensors=["t"], every_n_iter=-10)
+    with self.assertRaisesRegexp(ValueError, "xactly one of"):
+      metric_hook.LoggingMetricHook(
+          tensors=["t"], every_n_iter=5, every_n_secs=5)
+    with self.assertRaisesRegexp(ValueError, "xactly one of"):
+      metric_hook.LoggingMetricHook(tensors=["t"])
+    # With valid timing arguments, the missing metric_logger is reported.
+    with self.assertRaisesRegexp(ValueError, "metric_logger"):
+      metric_hook.LoggingMetricHook(tensors=["t"], every_n_iter=5)
+
+  def test_print_at_end_only(self):
+    """With only at_end=True, nothing is logged until hook.end() is called."""
+    with tf.Graph().as_default(), tf.compat.v1.Session() as sess:
+      tf.compat.v1.train.get_or_create_global_step()
+      t = tf.constant(42.0, name="foo")
+      train_op = tf.constant(3)
+      hook = metric_hook.LoggingMetricHook(
+          tensors=[t.name], at_end=True, metric_logger=self._logger)
+      hook.begin()
+      mon_sess = monitored_session._HookedSession(sess, [hook])  # pylint: disable=protected-access
+      sess.run(tf.compat.v1.global_variables_initializer())
+
+      for _ in range(3):
+        mon_sess.run(train_op)
+        self.assertEqual(self._logger.logged_metric, [])
+
+      hook.end(sess)
+      self.assertEqual(len(self._logger.logged_metric), 1)
+      metric = self._logger.logged_metric[0]
+      self.assertRegexpMatches(metric["name"], "foo")
+      self.assertEqual(metric["value"], 42.0)
+      self.assertEqual(metric["unit"], None)
+      self.assertEqual(metric["global_step"], 0)
+
+  def test_global_step_not_found(self):
+    """begin() raises RuntimeError when the graph has no global step."""
+    with tf.Graph().as_default():
+      t = tf.constant(42.0, name="foo")
+      hook = metric_hook.LoggingMetricHook(
+          tensors=[t.name], at_end=True, metric_logger=self._logger)
+
+      with self.assertRaisesRegexp(
+          RuntimeError, "should be created to use LoggingMetricHook."):
+        hook.begin()
+
+  def test_log_tensors(self):
+    """Tensors passed as objects are each logged once, in the given order."""
+    with tf.Graph().as_default(), tf.compat.v1.Session() as sess:
+      tf.compat.v1.train.get_or_create_global_step()
+      t1 = tf.constant(42.0, name="foo")
+      t2 = tf.constant(43.0, name="bar")
+      train_op = tf.constant(3)
+      hook = metric_hook.LoggingMetricHook(
+          tensors=[t1, t2], at_end=True, metric_logger=self._logger)
+      hook.begin()
+      mon_sess = monitored_session._HookedSession(sess, [hook])  # pylint: disable=protected-access
+      sess.run(tf.compat.v1.global_variables_initializer())
+
+      for _ in range(3):
+        mon_sess.run(train_op)
+        self.assertEqual(self._logger.logged_metric, [])
+
+      hook.end(sess)
+      self.assertEqual(len(self._logger.logged_metric), 2)
+      metric1 = self._logger.logged_metric[0]
+      self.assertRegexpMatches(str(metric1["name"]), "foo")
+      self.assertEqual(metric1["value"], 42.0)
+      self.assertEqual(metric1["unit"], None)
+      self.assertEqual(metric1["global_step"], 0)
+
+      metric2 = self._logger.logged_metric[1]
+      self.assertRegexpMatches(str(metric2["name"]), "bar")
+      self.assertEqual(metric2["value"], 43.0)
+      self.assertEqual(metric2["unit"], None)
+      self.assertEqual(metric2["global_step"], 0)
+
+  def _validate_print_every_n_steps(self, sess, at_end):
+    """Checks that a hook with every_n_iter=10 logs on every tenth run.
+
+    Args:
+      sess: session used to run the hooked train op.
+      at_end: value passed to the hook's `at_end` argument; also decides
+        whether a final log is expected from hook.end().
+    """
+    t = tf.constant(42.0, name="foo")
+
+    train_op = tf.constant(3)
+    hook = metric_hook.LoggingMetricHook(
+        tensors=[t.name], every_n_iter=10, at_end=at_end,
+        metric_logger=self._logger)
+    hook.begin()
+    mon_sess = monitored_session._HookedSession(sess, [hook])  # pylint: disable=protected-access
+    sess.run(tf.compat.v1.global_variables_initializer())
+    # The very first run is expected to log.
+    mon_sess.run(train_op)
+    self.assertRegexpMatches(str(self._logger.logged_metric), t.name)
+    for _ in range(3):
+      self._logger.logged_metric = []
+      for _ in range(9):
+        mon_sess.run(train_op)
+        # assertNotRegexpMatches is not supported by python 3.1 and later
+        self.assertEqual(str(self._logger.logged_metric).find(t.name), -1)
+      mon_sess.run(train_op)
+      self.assertRegexpMatches(str(self._logger.logged_metric), t.name)
+
+    # Add additional run to verify proper reset when called multiple times.
+    self._logger.logged_metric = []
+    mon_sess.run(train_op)
+    # assertNotRegexpMatches is not supported by python 3.1 and later
+    self.assertEqual(str(self._logger.logged_metric).find(t.name), -1)
+
+    self._logger.logged_metric = []
+    hook.end(sess)
+    if at_end:
+      self.assertRegexpMatches(str(self._logger.logged_metric), t.name)
+    else:
+      # assertNotRegexpMatches is not supported by python 3.1 and later
+      self.assertEqual(str(self._logger.logged_metric).find(t.name), -1)
+
+  def test_print_every_n_steps(self):
+    """Step-based logging without a final log at end."""
+    with tf.Graph().as_default(), tf.compat.v1.Session() as sess:
+      tf.compat.v1.train.get_or_create_global_step()
+      self._validate_print_every_n_steps(sess, at_end=False)
+      # Verify proper reset.
+      self._validate_print_every_n_steps(sess, at_end=False)
+
+  def test_print_every_n_steps_and_end(self):
+    """Step-based logging with an additional final log at end."""
+    with tf.Graph().as_default(), tf.compat.v1.Session() as sess:
+      tf.compat.v1.train.get_or_create_global_step()
+      self._validate_print_every_n_steps(sess, at_end=True)
+      # Verify proper reset.
+      self._validate_print_every_n_steps(sess, at_end=True)
+
+  def _validate_print_every_n_secs(self, sess, at_end):
+    """Checks that a hook with every_n_secs=1.0 logs at most once per second.
+
+    Args:
+      sess: session used to run the hooked train op.
+      at_end: value passed to the hook's `at_end` argument; also decides
+        whether a final log is expected from hook.end().
+    """
+    t = tf.constant(42.0, name="foo")
+    train_op = tf.constant(3)
+
+    hook = metric_hook.LoggingMetricHook(
+        tensors=[t.name], every_n_secs=1.0, at_end=at_end,
+        metric_logger=self._logger)
+    hook.begin()
+    mon_sess = monitored_session._HookedSession(sess, [hook])  # pylint: disable=protected-access
+    sess.run(tf.compat.v1.global_variables_initializer())
+
+    mon_sess.run(train_op)
+    self.assertRegexpMatches(str(self._logger.logged_metric), t.name)
+
+    # assertNotRegexpMatches is not supported by python 3.1 and later
+    self._logger.logged_metric = []
+    mon_sess.run(train_op)
+    self.assertEqual(str(self._logger.logged_metric).find(t.name), -1)
+    # Wait out the one-second interval so the next run logs again.
+    time.sleep(1.0)
+
+    self._logger.logged_metric = []
+    mon_sess.run(train_op)
+    self.assertRegexpMatches(str(self._logger.logged_metric), t.name)
+
+    self._logger.logged_metric = []
+    hook.end(sess)
+    if at_end:
+      self.assertRegexpMatches(str(self._logger.logged_metric), t.name)
+    else:
+      # assertNotRegexpMatches is not supported by python 3.1 and later
+      self.assertEqual(str(self._logger.logged_metric).find(t.name), -1)
+
+  def test_print_every_n_secs(self):
+    """Time-based logging without a final log at end."""
+    with tf.Graph().as_default(), tf.compat.v1.Session() as sess:
+      tf.compat.v1.train.get_or_create_global_step()
+      self._validate_print_every_n_secs(sess, at_end=False)
+      # Verify proper reset.
+      self._validate_print_every_n_secs(sess, at_end=False)
+
+  def test_print_every_n_secs_and_end(self):
+    """Time-based logging with an additional final log at end."""
+    with tf.Graph().as_default(), tf.compat.v1.Session() as sess:
+      tf.compat.v1.train.get_or_create_global_step()
+      self._validate_print_every_n_secs(sess, at_end=True)
+      # Verify proper reset.
+      self._validate_print_every_n_secs(sess, at_end=True)
+
+
+# Run the tests in this file when it is executed directly.
+if __name__ == "__main__":
+  tf.test.main()
@@ -0,0 +1,192 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Wrapper for the mlperf logging utils.
+
+MLPerf compliance logging is only desired under a limited set of circumstances.
+This module is intended to keep users from needing to consider logging (or
+install the module) unless they are performing mlperf runs.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import namedtuple
+import json
+import os
+import re
+import subprocess
+import sys
+import typing
+
+import tensorflow as tf
+
+_MIN_VERSION = (0, 0, 10)
+_STACK_OFFSET = 2
+
+SUDO = "sudo" if os.geteuid() else ""
+
+# This indirection is used in docker.
+DROP_CACHE_LOC = os.getenv("DROP_CACHE_LOC", "/proc/sys/vm/drop_caches")
+
+_NCF_PREFIX = "NCF_RAW_"
+
+# TODO(robieta): move line parsing to mlperf util
+_PREFIX = r"(?:{})?:::MLPv([0-9]+).([0-9]+).([0-9]+)".format(_NCF_PREFIX)
+_BENCHMARK = r"([a-zA-Z0-9_]+)"
+_TIMESTAMP = r"([0-9]+\.[0-9]+)"
+_CALLSITE = r"\((.+):([0-9]+)\)"
+_TAG = r"([a-zA-Z0-9_]+)"
+_VALUE = r"(.*)"
+
+ParsedLine = namedtuple("ParsedLine", ["version", "benchmark", "timestamp",
+                                       "callsite", "tag", "value"])
+
+LINE_PATTERN = re.compile(
+    "^{prefix} {benchmark} {timestamp} {callsite} {tag}(: |$){value}?$".format(
+        prefix=_PREFIX, benchmark=_BENCHMARK, timestamp=_TIMESTAMP,
+        callsite=_CALLSITE, tag=_TAG, value=_VALUE))
+
+
+def parse_line(line): # type: (str) -> typing.Optional[ParsedLine]
+  match = LINE_PATTERN.match(line.strip())
+  if not match:
+    return
+
+  major, minor, micro, benchmark, timestamp = match.groups()[:5]
+  call_file, call_line, tag, _, value = match.groups()[5:]
+
+  return ParsedLine(version=(int(major), int(minor), int(micro)),
+                    benchmark=benchmark, timestamp=timestamp,
+                    callsite=(call_file, call_line), tag=tag, value=value)
+
+
+def unparse_line(parsed_line): # type: (ParsedLine) -> str
+  version_str = "{}.{}.{}".format(*parsed_line.version)
+  callsite_str = "({}:{})".format(*parsed_line.callsite)
+  value_str = ": {}".format(parsed_line.value) if parsed_line.value else ""
+  return ":::MLPv{} {} {} {} {} {}".format(
+      version_str, parsed_line.benchmark, parsed_line.timestamp, callsite_str,
+      parsed_line.tag, value_str)
+
+
+def get_mlperf_log():
+  """Shielded import of mlperf_log module."""
+  try:
+    import mlperf_compliance
+
+    def test_mlperf_log_pip_version():
+      """Check that mlperf_compliance is up to date."""
+      import pkg_resources
+      version = pkg_resources.get_distribution("mlperf_compliance")
+      version = tuple(int(i) for i in version.version.split("."))
+      if version < _MIN_VERSION:
+        tf.compat.v1.logging.warning(
+            "mlperf_compliance is version {}, must be >= {}".format(
+                ".".join([str(i) for i in version]),
+                ".".join([str(i) for i in _MIN_VERSION])))
+        raise ImportError
+      return mlperf_compliance.mlperf_log
+
+    mlperf_log = test_mlperf_log_pip_version()
+
+  except ImportError:
+    mlperf_log = None
+
+  return mlperf_log
+
+
+class Logger(object):
+  """MLPerf logger indirection class.
+
+  This logger only logs for MLPerf runs, and prevents various errors associated
+  with not having the mlperf_compliance package installed.
+  """
+  class Tags(object):
+    def __init__(self, mlperf_log):
+      self._enabled = False
+      self._mlperf_log = mlperf_log
+
+    def __getattr__(self, item):
+      if self._mlperf_log is None or not self._enabled:
+        return
+      return getattr(self._mlperf_log, item)
+
+  def __init__(self):
+    self._enabled = False
+    self._mlperf_log = get_mlperf_log()
+    self.tags = self.Tags(self._mlperf_log)
+
+  def __call__(self, enable=False):
+    if enable and self._mlperf_log is None:
+      raise ImportError("MLPerf logging was requested, but mlperf_compliance "
+                        "module could not be loaded.")
+
+    self._enabled = enable
+    self.tags._enabled = enable
+    return self
+
+  def __enter__(self):
+    pass
+
+  def __exit__(self, exc_type, exc_val, exc_tb):
+    self._enabled = False
+    self.tags._enabled = False
+
+  @property
+  def log_file(self):
+    if self._mlperf_log is None:
+      return
+    return self._mlperf_log.LOG_FILE
+
+  @property
+  def enabled(self):
+    return self._enabled
+
+  def ncf_print(self, key, value=None, stack_offset=_STACK_OFFSET,
+                deferred=False, extra_print=False, prefix=_NCF_PREFIX):
+    if self._mlperf_log is None or not self.enabled:
+      return
+    self._mlperf_log.ncf_print(key=key, value=value, stack_offset=stack_offset,
+                               deferred=deferred, extra_print=extra_print,
+                               prefix=prefix)
+
+  def set_ncf_root(self, path):
+    if self._mlperf_log is None:
+      return
+    self._mlperf_log.ROOT_DIR_NCF = path
+
+
+# Module-level Logger instance. `ncf_print`, `set_ncf_root` and `TAGS` are
+# aliases bound to it so callers can use them as plain module attributes.
+LOGGER = Logger()
+ncf_print, set_ncf_root = LOGGER.ncf_print, LOGGER.set_ncf_root
+TAGS = LOGGER.tags
+
+
+def clear_system_caches():
+  if not LOGGER.enabled:
+    return
+  ret_code = subprocess.call(
+      ["sync && echo 3 | {} tee {}".format(SUDO, DROP_CACHE_LOC)],
+      shell=True)
+
+  if ret_code:
+    raise ValueError("Failed to clear caches")
+
+
+# Manual smoke check: enable MLPerf logging and emit a single RUN_START entry.
+if __name__ == "__main__":
+  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
+  with LOGGER(True):
+    ncf_print(key=TAGS.RUN_START)
@@ -0,0 +1,62 @@
+"""A simple Python callstack sampler."""
+
+import contextlib
+import datetime
+import signal
+import traceback
+
+
+class CallstackSampler(object):
+  """A simple signal-based Python callstack sampler.
+  """
+
+  def __init__(self, interval=None):
+    self.stacks = []
+    self.interval = 0.001 if interval is None else interval
+
+  def _sample(self, signum, frame):
+    """Samples the current stack."""
+    del signum
+    stack = traceback.extract_stack(frame)
+    formatted_stack = []
+    formatted_stack.append(datetime.datetime.utcnow())
+    for filename, lineno, function_name, text in stack:
+      formatted_frame = '{}:{}({})({})'.format(filename, lineno, function_name,
+                                               text)
+      formatted_stack.append(formatted_frame)
+    self.stacks.append(formatted_stack)
+    signal.setitimer(signal.ITIMER_VIRTUAL, self.interval, 0)
+
+  @contextlib.contextmanager
+  def profile(self):
+    signal.signal(signal.SIGVTALRM, self._sample)
+    signal.setitimer(signal.ITIMER_VIRTUAL, self.interval, 0)
+    try:
+      yield
+    finally:
+      signal.setitimer(signal.ITIMER_VIRTUAL, 0)
+
+  def save(self, fname):
+    with open(fname, 'w') as f:
+      for s in self.stacks:
+        for l in s:
+          f.write('%s\n' % l)
+        f.write('\n')
+
+
+@contextlib.contextmanager
+def callstack_sampling(filename, interval=None):
+  """Periodically samples the Python callstack.
+
+  Samples are collected while the `with` body runs and written to `filename`
+  afterwards. If the body raises, the exception propagates and the samples
+  are not written, because the save happens after the inner `with` block
+  rather than in a `finally`.
+
+  Args:
+    filename: path of the file the collected samples are written to.
+    interval: the sampling interval, in seconds. Defaults to 0.001.
+
+  Yields:
+   nothing
+  """
+  sampler = CallstackSampler(interval=interval)
+  with sampler.profile():
+    yield
+  # Only reached when the profiled body completed without raising.
+  sampler.save(filename)
+
@@ -0,0 +1,338 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Helper functions for running models in a distributed setting."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import os
+import random
+import string
+import tensorflow.compat.v2 as tf
+
+from official.utils.misc import tpu_lib
+
+
+def _collective_communication(all_reduce_alg):
+  """Return a CollectiveCommunication based on all_reduce_alg.
+
+  Args:
+    all_reduce_alg: a string specifying which collective communication to pick,
+      or None.
+
+  Returns:
+    tf.distribute.experimental.CollectiveCommunication object
+
+  Raises:
+    ValueError: if `all_reduce_alg` not in [None, 'ring', 'nccl']
+  """
+  collective_communication_options = {
+      None: tf.distribute.experimental.CollectiveCommunication.AUTO,
+      "ring": tf.distribute.experimental.CollectiveCommunication.RING,
+      "nccl": tf.distribute.experimental.CollectiveCommunication.NCCL
+  }
+  if all_reduce_alg not in collective_communication_options:
+    raise ValueError(
+        "When used with `multi_worker_mirrored`, valid values for "
+        "all_reduce_alg are ['ring', 'nccl'].  Supplied value: {}".format(
+            all_reduce_alg))
+  return collective_communication_options[all_reduce_alg]
+
+
+def _mirrored_cross_device_ops(all_reduce_alg, num_packs):
+  """Return a CrossDeviceOps based on all_reduce_alg and num_packs.
+
+  Args:
+    all_reduce_alg: a string specifying which cross device op to pick, or None.
+    num_packs: an integer specifying number of packs for the cross device op.
+
+  Returns:
+    tf.distribute.CrossDeviceOps object or None.
+
+  Raises:
+    ValueError: if `all_reduce_alg` not in [None, 'nccl', 'hierarchical_copy'].
+  """
+  if all_reduce_alg is None:
+    return None
+  mirrored_all_reduce_options = {
+      "nccl": tf.distribute.NcclAllReduce,
+      "hierarchical_copy": tf.distribute.HierarchicalCopyAllReduce
+  }
+  if all_reduce_alg not in mirrored_all_reduce_options:
+    raise ValueError(
+        "When used with `mirrored`, valid values for all_reduce_alg are "
+        "['nccl', 'hierarchical_copy'].  Supplied value: {}".format(
+            all_reduce_alg))
+  cross_device_ops_class = mirrored_all_reduce_options[all_reduce_alg]
+  return cross_device_ops_class(num_packs=num_packs)
+
+
+def get_distribution_strategy(distribution_strategy="mirrored",
+                              num_gpus=0,
+                              all_reduce_alg=None,
+                              num_packs=1,
+                              tpu_address=None):
+  """Return a DistributionStrategy for running the model.
+
+  Args:
+    distribution_strategy: a string specifying which distribution strategy to
+      use. Accepted values are 'off', 'one_device', 'mirrored',
+      'parameter_server', 'multi_worker_mirrored', and 'tpu' -- case insensitive.
+      'off' means not to use Distribution Strategy; 'tpu' means to use
+      TPUStrategy using `tpu_address`.
+    num_gpus: Number of GPUs to run this model. Only consulted for the 'off',
+      'one_device' and 'mirrored' strategies (after the non-negative check).
+    all_reduce_alg: Optional. Specifies which algorithm to use when performing
+      all-reduce. For `MirroredStrategy`, valid values are "nccl" and
+      "hierarchical_copy". For `MultiWorkerMirroredStrategy`, valid values are
+      "ring" and "nccl".  If None, DistributionStrategy will choose based on
+      device topology.
+    num_packs: Optional.  Sets the `num_packs` in `tf.distribute.NcclAllReduce`
+      or `tf.distribute.HierarchicalCopyAllReduce` for `MirroredStrategy`.
+    tpu_address: Optional. String that represents TPU to connect to. It is
+      passed to `tpu_lib.tpu_initialize` unchecked when `distribution_strategy`
+      is `tpu`.
+  Returns:
+    tf.distribute.DistributionStrategy object, or None when
+    `distribution_strategy` is 'off'.
+  Raises:
+    ValueError: if `num_gpus` is negative; if `distribution_strategy` is 'off'
+      or 'one_device' and `num_gpus` is larger than 1; or if
+      `distribution_strategy` is not one of the accepted values. A missing
+      `tpu_address` is not checked in this function -- presumably
+      `tpu_lib.tpu_initialize` handles it; TODO confirm.
+  """
+  if num_gpus < 0:
+    raise ValueError("`num_gpus` can not be negative.")
+
+  # Matching below is done on the lower-cased name.
+  distribution_strategy = distribution_strategy.lower()
+  if distribution_strategy == "off":
+    if num_gpus > 1:
+      raise ValueError(
+          "When {} GPUs are specified, distribution_strategy "
+          "flag cannot be set to 'off'.".format(num_gpus))
+    return None
+
+  if distribution_strategy == "tpu":
+    # When tpu_address is an empty string, we communicate with local TPUs.
+    cluster_resolver = tpu_lib.tpu_initialize(tpu_address)
+    return tf.distribute.experimental.TPUStrategy(cluster_resolver)
+
+  if distribution_strategy == "multi_worker_mirrored":
+    return tf.distribute.experimental.MultiWorkerMirroredStrategy(
+        communication=_collective_communication(all_reduce_alg))
+
+  if distribution_strategy == "one_device":
+    # Zero GPUs means run on the CPU; exactly one GPU uses GPU:0.
+    if num_gpus == 0:
+      return tf.distribute.OneDeviceStrategy("device:CPU:0")
+    if num_gpus > 1:
+      raise ValueError("`OneDeviceStrategy` can not be used for more than "
+                       "one device.")
+    return tf.distribute.OneDeviceStrategy("device:GPU:0")
+
+  if distribution_strategy == "mirrored":
+    # With no GPUs the mirrored strategy is built over the single CPU device.
+    if num_gpus == 0:
+      devices = ["device:CPU:0"]
+    else:
+      devices = ["device:GPU:%d" % i for i in range(num_gpus)]
+    return tf.distribute.MirroredStrategy(
+        devices=devices,
+        cross_device_ops=_mirrored_cross_device_ops(all_reduce_alg, num_packs))
+
+  if distribution_strategy == "parameter_server":
+    return tf.distribute.experimental.ParameterServerStrategy()
+
+  raise ValueError(
+      "Unrecognized Distribution Strategy: %r" % distribution_strategy)
+
+
+def per_replica_batch_size(batch_size, num_gpus):
+  """Returns the per-device batch size for a given global batch size.
+
+  For multi-gpu, batch-size must be a multiple of the number of GPUs.
+
+  Note that distribution strategy handles this automatically when used with
+  Keras. For using with Estimator, we need to get per GPU batch.
+
+  Args:
+    batch_size: Global batch size to be divided among devices. This should be
+      equal to num_gpus times the single-GPU batch_size for multi-gpu training.
+    num_gpus: How many GPUs are used with DistributionStrategies.
+
+  Returns:
+    Batch size per device. When `num_gpus` is 0 or 1 the global batch size is
+    returned unchanged.
+
+  Raises:
+    ValueError: if batch_size is not divisible by number of devices
+  """
+  if num_gpus <= 1:
+    return batch_size
+
+  remainder = batch_size % num_gpus
+  if remainder:
+    err = ('When running with multiple GPUs, batch size '
+           'must be a multiple of the number of available GPUs. Found {} '
+           'GPUs with a batch size of {}; try --batch_size={} instead.'
+          ).format(num_gpus, batch_size, batch_size - remainder)
+    raise ValueError(err)
+  # Floor division is exact here because the remainder is zero. The previous
+  # `int(batch_size / num_gpus)` went through a float and lost precision for
+  # very large integers. `int(...)` keeps the integer return type if a float
+  # batch size is passed.
+  return int(batch_size // num_gpus)
+
+
+# The `SyntheticDataset` is a temporary solution for generating synthetic data
+# directly on devices. It is only useful for Keras with Distribution
+# Strategies. We will have better support in `tf.data` or Distribution Strategy
+# later.
+class SyntheticDataset(object):
+  """Dataset stand-in that serves one cached element from local variables."""
+
+  def __init__(self, dataset, split_by=1):
+    """Reads one element of `dataset` and stores it in local variables.
+
+    Args:
+      dataset: dataset from which a single element is taken.
+      split_by: number of equal splits along axis 0; only the first split of
+        each tensor is kept.
+    """
+    # dataset.take(1) doesn't have GPU kernel.
+    with tf.device('device:CPU:0'):
+      element = tf.data.experimental.get_single_element(dataset.take(1))
+
+    local_variables = []
+    variable_initializers = []
+    for component in tf.nest.flatten(element):
+      first_split = tf.split(
+          component, num_or_size_splits=split_by, axis=0)[0]
+      assert first_split.shape.is_fully_defined(), first_split.shape
+      variable = tf.compat.v1.get_local_variable(
+          self._random_name(), initializer=first_split)
+      local_variables.append(variable)
+      variable_initializers.append(variable.initializer)
+
+    # Rebuild the original (possibly nested) structure around the variables.
+    packed = tf.nest.pack_sequence_as(element, local_variables)
+    self._iterator = SyntheticIterator(packed, variable_initializers)
+
+  def _random_name(self, size=10, chars=string.ascii_uppercase + string.digits):
+    """Returns `size` characters drawn at random from `chars`."""
+    picked = [random.choice(chars) for _ in range(size)]
+    return ''.join(picked)
+
+  def __iter__(self):
+    return self._iterator
+
+  def make_one_shot_iterator(self):
+    """Returns the iterator created in `__init__`."""
+    return self._iterator
+
+  def make_initializable_iterator(self):
+    """Returns the iterator created in `__init__`."""
+    return self._iterator
+
+
+class SyntheticIterator(object):
+  """An iterator that returns the same stored input data on every step."""
+
+  def __init__(self, input_data, initializers):
+    self._initializers = initializers
+    self._input_data = input_data
+
+  def get_next(self):
+    """Returns the stored input data."""
+    return self._input_data
+
+  def __next__(self):
+    try:
+      batch = self.get_next()
+    except tf.errors.OutOfRangeError:
+      raise StopIteration
+    return batch
+
+  def next(self):
+    # Python 2 spelling of the iterator protocol; defers to `__next__`.
+    return self.__next__()
+
+  def initialize(self):
+    """Returns the initializers in graph mode, or a no-op when eager."""
+    if not tf.executing_eagerly():
+      return self._initializers
+    return tf.no_op()
+
+
+def _monkey_patch_dataset_method(strategy):
+  """Replaces `strategy`'s dataset entry points with synthetic-data versions.
+
+  The previous `make_dataset_iterator` and `experimental_distribute_dataset`
+  attributes are kept on `strategy` as `orig_make_dataset_iterator` and
+  `orig_distribute_dataset`.
+
+  Args:
+    strategy: object whose dataset methods are replaced.
+  """
+  def make_dataset(self, dataset):
+    tf.compat.v1.logging.info('Using pure synthetic data.')
+    with self.scope():
+      # Without a global batch size the element is used whole; otherwise it
+      # is split by the number of replicas in sync.
+      if not self.extended._global_batch_size:  # pylint: disable=protected-access
+        return SyntheticDataset(dataset)
+      return SyntheticDataset(dataset, self.num_replicas_in_sync)
+
+  def make_iterator(self, dataset):
+    return iter(make_dataset(self, dataset))
+
+  # Each original is stashed immediately before its replacement is installed.
+  strategy.orig_make_dataset_iterator = strategy.make_dataset_iterator
+  strategy.make_dataset_iterator = make_iterator
+  strategy.orig_distribute_dataset = strategy.experimental_distribute_dataset
+  strategy.experimental_distribute_dataset = make_dataset
+
+
+def _undo_monkey_patch_dataset_method(strategy):
+  """Restores the dataset methods saved by `_monkey_patch_dataset_method`.
+
+  Each saved original is put back only if it is present, so calling this on a
+  strategy that was never patched is a no-op.
+
+  Args:
+    strategy: object whose `make_dataset_iterator` and
+      `experimental_distribute_dataset` attributes are restored.
+  """
+  if hasattr(strategy, 'orig_make_dataset_iterator'):
+    strategy.make_dataset_iterator = strategy.orig_make_dataset_iterator
+  if hasattr(strategy, 'orig_distribute_dataset'):
+    # This original was saved from `experimental_distribute_dataset`, so it
+    # must be restored there. It was previously written over
+    # `make_dataset_iterator`, which left the patch in place.
+    strategy.experimental_distribute_dataset = strategy.orig_distribute_dataset
+
+
+def set_up_synthetic_data():
+  """Patches three distribution strategy classes to serve synthetic data."""
+  one_device = tf.distribute.OneDeviceStrategy
+  _monkey_patch_dataset_method(one_device)
+  mirrored = tf.distribute.MirroredStrategy
+  _monkey_patch_dataset_method(mirrored)
+  multi_worker = tf.distribute.experimental.MultiWorkerMirroredStrategy
+  _monkey_patch_dataset_method(multi_worker)
+
+
+def undo_set_up_synthetic_data():
+  """Runs the undo helper on the three strategy classes patched at set-up."""
+  one_device = tf.distribute.OneDeviceStrategy
+  _undo_monkey_patch_dataset_method(one_device)
+  mirrored = tf.distribute.MirroredStrategy
+  _undo_monkey_patch_dataset_method(mirrored)
+  multi_worker = tf.distribute.experimental.MultiWorkerMirroredStrategy
+  _undo_monkey_patch_dataset_method(multi_worker)
+
+
+def configure_cluster(worker_hosts=None, task_index=-1):
+  """Set multi-worker cluster spec in TF_CONFIG environment variable.
+
+  A non-empty TF_CONFIG already present in the environment takes precedence:
+  it is only read, and `worker_hosts` is ignored. Otherwise TF_CONFIG is
+  written from `worker_hosts` when that is given.
+
+  Args:
+    worker_hosts: comma-separated list of worker ip:port pairs.
+    task_index: index of this worker within `worker_hosts`. Forced to 0 when
+      there is exactly one worker.
+
+  Returns:
+    Number of workers in the cluster.
+
+  Raises:
+    ValueError: if more than one worker host is given and `task_index` is
+      negative.
+  """
+  existing_config = json.loads(os.environ.get('TF_CONFIG', '{}'))
+  if existing_config:
+    cluster_spec = existing_config['cluster']
+    chief_count = len(cluster_spec.get('chief', []))
+    worker_count = len(cluster_spec.get('worker', []))
+    return chief_count + worker_count
+
+  if not worker_hosts:
+    return 1
+
+  workers = worker_hosts.split(',')
+  num_workers = len(workers)
+  if num_workers > 1 and task_index < 0:
+    raise ValueError('Must specify task_index when number of workers > 1')
+  if num_workers == 1:
+    task_index = 0
+  cluster_config = {
+      'cluster': {
+          'worker': workers
+      },
+      'task': {'type': 'worker', 'index': task_index}
+  }
+  os.environ['TF_CONFIG'] = json.dumps(cluster_config)
+  return num_workers
+
+
+def get_strategy_scope(strategy):
+  """Returns `strategy.scope()`, or a do-nothing context if `strategy` is falsy."""
+  return strategy.scope() if strategy else DummyContextManager()
+
+
+class DummyContextManager(object):
+  """Context manager that does nothing on entry or exit."""
+
+  def __enter__(self):
+    return None
+
+  def __exit__(self, *args):
+    # Returning None (falsy) lets any exception from the body propagate.
+    return None
@@ -0,0 +1,65 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+""" Tests for distribution util functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow.compat.v2 as tf
+
+from official.utils.misc import distribution_utils
+
+
+class GetDistributionStrategyTest(tf.test.TestCase):
+  """Tests for get_distribution_strategy."""
+
+  # `assertEqual` is used throughout: the `assertEquals` alias is deprecated
+  # and was removed from `unittest` in Python 3.12.
+
+  def test_one_device_strategy_cpu(self):
+    """With no GPUs there is a single replica placed on the CPU."""
+    ds = distribution_utils.get_distribution_strategy(num_gpus=0)
+    self.assertEqual(ds.num_replicas_in_sync, 1)
+    self.assertEqual(len(ds.extended.worker_devices), 1)
+    self.assertIn('CPU', ds.extended.worker_devices[0])
+
+  def test_one_device_strategy_gpu(self):
+    """With one GPU there is a single replica placed on that GPU."""
+    ds = distribution_utils.get_distribution_strategy(num_gpus=1)
+    self.assertEqual(ds.num_replicas_in_sync, 1)
+    self.assertEqual(len(ds.extended.worker_devices), 1)
+    self.assertIn('GPU', ds.extended.worker_devices[0])
+
+  def test_mirrored_strategy(self):
+    """With five GPUs there is one replica per GPU."""
+    ds = distribution_utils.get_distribution_strategy(num_gpus=5)
+    self.assertEqual(ds.num_replicas_in_sync, 5)
+    self.assertEqual(len(ds.extended.worker_devices), 5)
+    for device in ds.extended.worker_devices:
+      self.assertIn('GPU', device)
+
+
+class PerReplicaBatchSizeTest(tf.test.TestCase):
+  """Tests for per_replica_batch_size."""
+
+  def test_batch_size(self):
+    """Zero or one GPU keeps the batch size; seven GPUs divide it evenly."""
+    # `assertEqual` replaces the deprecated `assertEquals` alias, which was
+    # removed from `unittest` in Python 3.12.
+    self.assertEqual(
+        distribution_utils.per_replica_batch_size(147, num_gpus=0), 147)
+    self.assertEqual(
+        distribution_utils.per_replica_batch_size(147, num_gpus=1), 147)
+    self.assertEqual(
+        distribution_utils.per_replica_batch_size(147, num_gpus=7), 21)
+
+  def test_batch_size_with_remainder(self):
+    """A batch size that is not a multiple of the GPU count is rejected."""
+    with self.assertRaises(ValueError):
+      distribution_utils.per_replica_batch_size(147, num_gpus=5)
+
+
+# Run the test cases above through `tf.test.main()` when executed as a script.
+if __name__ == "__main__":
+  tf.test.main()
@@ -0,0 +1,262 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Helper functions for the Keras implementations of models."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import multiprocessing
+import os
+import time
+
+from absl import logging
+import tensorflow.compat.v2 as tf
+from tensorflow.python import tf2
+from tensorflow.python.profiler import profiler as profiler
+
+
+class BatchTimestamp(object):
+  """Pairs a batch index with the timestamp recorded for it."""
+
+  def __init__(self, batch_index, timestamp):
+    self.batch_index, self.timestamp = batch_index, timestamp
+
+  def __repr__(self):
+    template = "'BatchTimestamp<batch_index: {}, timestamp: {}>'"
+    return template.format(self.batch_index, self.timestamp)
+
+
+class TimeHistory(tf.keras.callbacks.Callback):
+  """Keras callback that logs step throughput and per-epoch run time."""
+
+  def __init__(self, batch_size, log_steps, logdir=None):
+    """Sets up the timing state.
+
+    Args:
+      batch_size: Total batch size.
+      log_steps: Interval of steps between logging of batch level stats.
+      logdir: Optional directory to write TensorBoard summaries.
+    """
+    # TODO(wcromar): remove this parameter and rely on `logs` parameter of
+    # on_train_batch_end()
+    self.batch_size = batch_size
+    super(TimeHistory, self).__init__()
+    self.log_steps = log_steps
+    self.last_log_step = 0
+    self.steps_before_epoch = 0
+    self.steps_in_epoch = 0
+    self.start_time = None
+
+    # Summaries are only written when a log directory was supplied.
+    self.summary_writer = (
+        tf.summary.create_file_writer(logdir) if logdir else None)
+
+    # Logs start of step 1 then end of each step based on log_steps interval.
+    self.timestamp_log = []
+
+    # Records the time each epoch takes to run from start to finish of epoch.
+    self.epoch_runtime_log = []
+
+  @property
+  def global_steps(self):
+    """The current 1-indexed global step."""
+    return self.steps_before_epoch + self.steps_in_epoch
+
+  @property
+  def average_steps_per_second(self):
+    """The average training steps per second across all epochs."""
+    steps_so_far = self.global_steps
+    total_epoch_runtime = sum(self.epoch_runtime_log)
+    return steps_so_far / total_epoch_runtime
+
+  @property
+  def average_examples_per_second(self):
+    """The average number of training examples per second across all epochs."""
+    steps_per_second = self.average_steps_per_second
+    return steps_per_second * self.batch_size
+
+  def on_train_end(self, logs=None):
+    self.train_finish_time = time.time()
+
+    if not self.summary_writer:
+      return
+    self.summary_writer.flush()
+
+  def on_epoch_begin(self, epoch, logs=None):
+    self.epoch_start = time.time()
+
+  def on_batch_begin(self, batch, logs=None):
+    # The interval clock is (re)started whenever it is unset; it is cleared
+    # again after each log line in `on_batch_end`.
+    if not self.start_time:
+      self.start_time = time.time()
+
+    # Record the timestamp of the first global step
+    if self.timestamp_log:
+      return
+    first_stamp = BatchTimestamp(self.global_steps, self.start_time)
+    self.timestamp_log.append(first_stamp)
+
+  def on_batch_end(self, batch, logs=None):
+    """Records elapse time of the batch and calculates examples per second."""
+    self.steps_in_epoch = batch + 1
+    steps_since_last_log = self.global_steps - self.last_log_step
+    if steps_since_last_log < self.log_steps:
+      # Not enough steps since the previous log line yet.
+      return
+
+    now = time.time()
+    elapsed_time = now - self.start_time
+    steps_per_second = steps_since_last_log / elapsed_time
+    examples_per_second = steps_per_second * self.batch_size
+
+    self.timestamp_log.append(BatchTimestamp(self.global_steps, now))
+    logging.info(
+        'TimeHistory: %.2f seconds, %.2f examples/second between steps %d '
+        'and %d', elapsed_time, examples_per_second, self.last_log_step,
+        self.global_steps)
+
+    if self.summary_writer:
+      with self.summary_writer.as_default():
+        tf.summary.scalar('global_step/sec', steps_per_second,
+                          self.global_steps)
+        tf.summary.scalar('examples/sec', examples_per_second,
+                          self.global_steps)
+
+    self.last_log_step = self.global_steps
+    # Clearing the clock makes the next `on_batch_begin` start a new interval.
+    self.start_time = None
+
+  def on_epoch_end(self, epoch, logs=None):
+    finished_at = time.time()
+    self.epoch_runtime_log.append(finished_at - self.epoch_start)
+
+    # Fold this epoch's steps into the running total before the next epoch.
+    self.steps_before_epoch += self.steps_in_epoch
+    self.steps_in_epoch = 0
+
+
+def get_profiler_callback(model_dir, profile_steps, enable_tensorboard,
+                          steps_per_epoch):
+  """Parses and checks `profile_steps`, then builds a `ProfilerCallback`.
+
+  Args:
+    model_dir: directory passed to the callback as its log directory.
+    profile_steps: string of the form "start,stop" naming the first and last
+      steps to profile.
+    enable_tensorboard: when truthy, a warning about overlapping profiled
+      steps is logged.
+    steps_per_epoch: number of steps per epoch, forwarded to the callback.
+
+  Returns:
+    A `ProfilerCallback` for the requested step range.
+
+  Raises:
+    ValueError: if `profile_steps` is not two comma separated integers with
+      0 <= start <= stop.
+  """
+  profile_steps_error_message = (
+      'profile_steps must be a comma separated pair of positive integers, '
+      'specifying the first and last steps to be profiled.'
+  )
+  try:
+    step_bounds = list(map(int, profile_steps.split(',')))
+  except ValueError:
+    raise ValueError(profile_steps_error_message)
+  if len(step_bounds) != 2:
+    raise ValueError(profile_steps_error_message)
+  start_step, stop_step = step_bounds
+  if not 0 <= start_step <= stop_step:
+    raise ValueError(profile_steps_error_message)
+  if enable_tensorboard:
+    logging.warning(
+        'Both TensorBoard and profiler callbacks are used. Note that the '
+        'TensorBoard callback profiles the 2nd step (unless otherwise '
+        'specified). Please make sure the steps profiled by the two callbacks '
+        'do not overlap.')
+  return ProfilerCallback(model_dir, start_step, stop_step, steps_per_epoch)
+
+
+class ProfilerCallback(tf.keras.callbacks.Callback):
+  """Save profiles in specified step range to log directory."""
+
+  def __init__(self, log_dir, start_step, stop_step, steps_per_epoch):
+    """Translates the global step range into (epoch, step-in-epoch) pairs.
+
+    Args:
+      log_dir: directory handed to the profiler when it starts.
+      start_step: global step at which profiling starts.
+      stop_step: global step at which profiling stops.
+      steps_per_epoch: number of steps in one epoch.
+    """
+    super(ProfilerCallback, self).__init__()
+    self.log_dir = log_dir
+    self.start_step = start_step
+    self.stop_step = stop_step
+    # divmod yields (step // steps_per_epoch, step % steps_per_epoch).
+    self.start_epoch, self.start_step_in_epoch = divmod(
+        start_step, steps_per_epoch)
+    self.stop_epoch, self.stop_step_in_epoch = divmod(
+        stop_step, steps_per_epoch)
+    self.should_start = False
+    self.should_stop = False
+
+  def on_epoch_begin(self, epoch, logs=None):
+    # Arm the start/stop triggers once the matching epoch is reached; the
+    # batch hooks disarm them again after firing.
+    if epoch == self.start_epoch:
+      self.should_start = True
+    if epoch == self.stop_epoch:
+      self.should_stop = True
+
+  def on_batch_begin(self, batch, logs=None):
+    if not (batch == self.start_step_in_epoch and self.should_start):
+      return
+    self.should_start = False
+    profiler.start(self.log_dir)
+    logging.info('Profiler started at Step %s', self.start_step)
+
+  def on_batch_end(self, batch, logs=None):
+    if not (batch == self.stop_step_in_epoch and self.should_stop):
+      return
+    self.should_stop = False
+    profiler.stop()
+    logging.info('Profiler saved profiles for steps between %s and %s to %s',
+                 self.start_step, self.stop_step, self.log_dir)
+
+
+def set_session_config(enable_eager=False,
+                       enable_xla=False):
+  """Sets the session config.
+
+  Args:
+    enable_eager: when not on the TF 2.0 path, enable eager execution instead
+      of installing a v1 session as the Keras backend session.
+    enable_xla: forwarded to the v1 or v2 config helper.
+  """
+  if is_v2_0():
+    set_config_v2(enable_xla=enable_xla)
+    return
+
+  config = get_config_proto_v1(enable_xla=enable_xla)
+  if enable_eager:
+    tf.compat.v1.enable_eager_execution(config=config)
+    return
+
+  session = tf.compat.v1.Session(config=config)
+  tf.compat.v1.keras.backend.set_session(session)
+
+
+def get_config_proto_v1(enable_xla=False):
+  """Return config proto according to flag settings, or None to use default."""
+  if not enable_xla:
+    return None
+
+  xla_config = tf.compat.v1.ConfigProto()
+  optimizer_options = xla_config.graph_options.optimizer_options
+  optimizer_options.global_jit_level = tf.OptimizerOptions.ON_2
+  return xla_config
+
+
+def set_config_v2(enable_xla=False):
+  """Turns on JIT via `tf.config.optimizer.set_jit` when `enable_xla` is set."""
+  if not enable_xla:
+    return
+  tf.config.optimizer.set_jit(True)
+
+
+def is_v2_0():
+  """Returns the result of `tf2.enabled()`, i.e. whether TF 2.0 is in use."""
+  v2_enabled = tf2.enabled()
+  return v2_enabled
+
+
+def set_gpu_thread_mode_and_count(gpu_thread_mode,
+                                  datasets_num_private_threads,
+                                  num_gpus, per_gpu_thread_count):
+  """Set GPU thread mode and count, and adjust dataset threads count.
+
+  Exports TF_GPU_THREAD_MODE and TF_GPU_THREAD_COUNT into the process
+  environment and, when no dataset thread count was supplied, derives one
+  from the logical CPU count.
+
+  Args:
+    gpu_thread_mode: value written to the TF_GPU_THREAD_MODE variable.
+    datasets_num_private_threads: requested dataset thread count. When falsy,
+      a value is derived.
+    num_gpus: number of GPUs in use.
+    per_gpu_thread_count: threads per GPU. Falls back to 2 when falsy.
+
+  Returns:
+    The dataset private thread count: the caller's value when one was given,
+    otherwise the derived value. (Previously the derived value was only
+    logged and the function returned None.)
+  """
+  cpu_count = multiprocessing.cpu_count()
+  logging.info('Logical CPU cores: %s', cpu_count)
+
+  # Allocate private thread pool for each GPU to schedule and launch kernels
+  per_gpu_thread_count = per_gpu_thread_count or 2
+  os.environ['TF_GPU_THREAD_MODE'] = gpu_thread_mode
+  os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count)
+  logging.info('TF_GPU_THREAD_COUNT: %s',
+               os.environ['TF_GPU_THREAD_COUNT'])
+  logging.info('TF_GPU_THREAD_MODE: %s',
+               os.environ['TF_GPU_THREAD_MODE'])
+
+  # Limit data preprocessing threadpool to CPU cores minus number of total GPU
+  # private threads and memory copy threads.
+  total_gpu_thread_count = per_gpu_thread_count * num_gpus
+  num_runtime_threads = num_gpus
+  if not datasets_num_private_threads:
+    datasets_num_private_threads = min(
+        cpu_count - total_gpu_thread_count - num_runtime_threads,
+        num_gpus * 8)
+    logging.info('Set datasets_num_private_threads to %s',
+                 datasets_num_private_threads)
+  # Hand the value back so the caller can apply it; without this the derived
+  # count was computed and then discarded.
+  return datasets_num_private_threads
@@ -0,0 +1,93 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Miscellaneous functions that can be called by models."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numbers
+
+import tensorflow as tf
+from tensorflow.python.util import nest
+
+
+def past_stop_threshold(stop_threshold, eval_metric):
+  """Decides whether training should stop based on an evaluation metric.
+
+  Args:
+    stop_threshold: float, the threshold above which a model should stop
+      training. None disables the check.
+    eval_metric: float, the current value of the relevant metric to check.
+
+  Returns:
+    True if `eval_metric` is greater than or equal to `stop_threshold`,
+    False otherwise (including when `stop_threshold` is None).
+
+  Raises:
+    ValueError: if either stop_threshold or eval_metric is not a number
+  """
+  if stop_threshold is None:
+    return False
+
+  threshold_is_number = isinstance(stop_threshold, numbers.Number)
+  if not threshold_is_number:
+    raise ValueError("Threshold for checking stop conditions must be a number.")
+  metric_is_number = isinstance(eval_metric, numbers.Number)
+  if not metric_is_number:
+    raise ValueError("Eval metric being checked against stop conditions "
+                     "must be a number.")
+
+  # The comparison is kept as `>=` rather than inverted so that a NaN metric
+  # still reads as "not reached".
+  threshold_reached = eval_metric >= stop_threshold
+  if not threshold_reached:
+    return False
+
+  tf.compat.v1.logging.info(
+      "Stop threshold of {} was passed with metric value {}.".format(
+          stop_threshold, eval_metric))
+  return True
+
+
+def generate_synthetic_data(
+    input_shape, input_value=0, input_dtype=None, label_shape=None,
+    label_value=0, label_dtype=None):
+  """Create a repeating dataset with constant values.
+
+  Args:
+    input_shape: a tf.TensorShape object or nested tf.TensorShapes. The shape of
+      the input data.
+    input_value: Value of each input element.
+    input_dtype: Input dtype. If None, will be inferred by the input value.
+    label_shape: a tf.TensorShape object or nested tf.TensorShapes. The shape of
+      the label data.
+    label_value: Value of each label element.
+    label_dtype: Label dtype. If None, will be inferred by the label value.
+
+  Returns:
+    Dataset of tensors or tuples of tensors (if label_shape is set).
+  """
+  # TODO(kathywu): Replace with SyntheticDataset once it is in contrib.
+  def _constant_filler(value, dtype):
+    # Builds the per-shape callback applied to every leaf of the structure.
+    return lambda shape: tf.constant(value, dtype, shape)
+
+  input_element = nest.map_structure(
+      _constant_filler(input_value, input_dtype), input_shape)
+
+  if not label_shape:
+    element = input_element
+  else:
+    label_element = nest.map_structure(
+        _constant_filler(label_value, label_dtype), label_shape)
+    element = (input_element, label_element)
+
+  constant_dataset = tf.data.Dataset.from_tensors(element)
+  return constant_dataset.repeat()
+
+
+def apply_clean(flags_obj):
+  """Removes `flags_obj.model_dir` when `flags_obj.clean` is set and it exists."""
+  if not flags_obj.clean:
+    return
+  if not tf.io.gfile.exists(flags_obj.model_dir):
+    return
+  removal_notice = ("--clean flag set. Removing existing model dir:"
+                    " {}".format(flags_obj.model_dir))
+  tf.compat.v1.logging.info(removal_notice)
+  tf.io.gfile.rmtree(flags_obj.model_dir)
@@ -0,0 +1,127 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Model Helper functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf  # pylint: disable=g-bad-import-order
+
+from official.utils.misc import keras_utils
+from official.utils.misc import model_helpers
+
+
+class PastStopThresholdTest(tf.test.TestCase):
+  """Tests for past_stop_threshold."""
+
+  def setUp(self):
+    super(PastStopThresholdTest, self).setUp()
+    # `is_v2_0` must be called: the bare function object is always truthy,
+    # so the previous check disabled eager execution unconditionally.
+    if keras_utils.is_v2_0():
+      tf.compat.v1.disable_eager_execution()
+
+  def test_past_stop_threshold(self):
+    """Tests for normal operating conditions."""
+    self.assertTrue(model_helpers.past_stop_threshold(0.54, 1))
+    self.assertTrue(model_helpers.past_stop_threshold(54, 100))
+    self.assertFalse(model_helpers.past_stop_threshold(0.54, 0.1))
+    self.assertFalse(model_helpers.past_stop_threshold(-0.54, -1.5))
+    self.assertTrue(model_helpers.past_stop_threshold(-0.54, 0))
+    self.assertTrue(model_helpers.past_stop_threshold(0, 0))
+    self.assertTrue(model_helpers.past_stop_threshold(0.54, 0.54))
+
+  def test_past_stop_threshold_none_false(self):
+    """Tests that check None returns false."""
+    self.assertFalse(model_helpers.past_stop_threshold(None, -1.5))
+    self.assertFalse(model_helpers.past_stop_threshold(None, None))
+    self.assertFalse(model_helpers.past_stop_threshold(None, 1.5))
+    # Zero should be okay, though.
+    self.assertTrue(model_helpers.past_stop_threshold(0, 1.5))
+
+  def test_past_stop_threshold_not_number(self):
+    """Tests for error conditions."""
+    with self.assertRaises(ValueError):
+      model_helpers.past_stop_threshold("str", 1)
+
+    with self.assertRaises(ValueError):
+      model_helpers.past_stop_threshold("str", tf.constant(5))
+
+    with self.assertRaises(ValueError):
+      model_helpers.past_stop_threshold("str", "another")
+
+    with self.assertRaises(ValueError):
+      model_helpers.past_stop_threshold(0, None)
+
+    with self.assertRaises(ValueError):
+      model_helpers.past_stop_threshold(0.7, "str")
+
+    with self.assertRaises(ValueError):
+      model_helpers.past_stop_threshold(tf.constant(4), None)
+
+
+class SyntheticDataTest(tf.test.TestCase):
+  """Tests for generate_synthetic_data."""
+
+  # `assertEqual` is used throughout: the `assertEquals` alias is deprecated
+  # and was removed from `unittest` in Python 3.12.
+
+  def test_generate_synethetic_data(self):
+    """Input and label tensors carry the requested constant values."""
+    input_element, label_element = tf.compat.v1.data.make_one_shot_iterator(
+        model_helpers.generate_synthetic_data(input_shape=tf.TensorShape([5]),
+                                              input_value=123,
+                                              input_dtype=tf.float32,
+                                              label_shape=tf.TensorShape([]),
+                                              label_value=456,
+                                              label_dtype=tf.int32)).get_next()
+
+    with self.session() as sess:
+      # Pull several elements to confirm that the dataset repeats.
+      for _ in range(5):
+        inp, lab = sess.run((input_element, label_element))
+        self.assertAllClose(inp, [123., 123., 123., 123., 123.])
+        self.assertEqual(lab, 456)
+
+  def test_generate_only_input_data(self):
+    """Without a label shape the element is a bare tensor, not a tuple."""
+    d = model_helpers.generate_synthetic_data(
+        input_shape=tf.TensorShape([4]),
+        input_value=43.5,
+        input_dtype=tf.float32)
+
+    element = tf.compat.v1.data.make_one_shot_iterator(d).get_next()
+    self.assertFalse(isinstance(element, tuple))
+
+    with self.session() as sess:
+      inp = sess.run(element)
+      self.assertAllClose(inp, [43.5, 43.5, 43.5, 43.5])
+
+  def test_generate_nested_data(self):
+    """Nested input shapes produce an identically nested element."""
+    d = model_helpers.generate_synthetic_data(
+        input_shape={'a': tf.TensorShape([2]),
+                     'b': {'c': tf.TensorShape([3]), 'd': tf.TensorShape([])}},
+        input_value=1.1)
+
+    element = tf.compat.v1.data.make_one_shot_iterator(d).get_next()
+    self.assertIn('a', element)
+    self.assertIn('b', element)
+    self.assertEqual(len(element['b']), 2)
+    self.assertIn('c', element['b'])
+    self.assertIn('d', element['b'])
+    self.assertNotIn('c', element)
+
+    with self.session() as sess:
+      inp = sess.run(element)
+      self.assertAllClose(inp['a'], [1.1, 1.1])
+      self.assertAllClose(inp['b']['c'], [1.1, 1.1, 1.1])
+      self.assertAllClose(inp['b']['d'], 1.1)
+
+
+# Run the test cases above through `tf.test.main()` when executed as a script.
+if __name__ == "__main__":
+  tf.test.main()
@@ -0,0 +1,34 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Initializes TPU system for TF 2.0."""
+
+import tensorflow as tf
+
+
+def tpu_initialize(tpu_address):
+  """Initializes TPU for TF 2.0 training.
+
+  Args:
+    tpu_address: string, bns address of master TPU worker.
+
+  Returns:
+    A TPUClusterResolver.
+  """
+  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
+      tpu=tpu_address)
+  # The connect step is skipped for the '' and 'local' addresses.
+  needs_cluster_connection = tpu_address not in ('', 'local')
+  if needs_cluster_connection:
+    tf.config.experimental_connect_to_cluster(resolver)
+  tf.tpu.experimental.initialize_tpu_system(resolver)
+  return resolver
@@ -0,0 +1,83 @@
+# Lint as: python3
+"""Utils to annotate and trace benchmarks."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import flags
+from absl import logging
+from absl.testing import flagsaver
+
+# Shared absl flag registry; `enable_runtime_flags` below reads the
+# `benchmark_method_flags` value from it and writes overrides back to it.
+FLAGS = flags.FLAGS
+
+# Repeatable flag carrying `key=value` overrides for other flags.
+flags.DEFINE_multi_string(
+    'benchmark_method_flags', None,
+    'Optional list of runtime flags of the form key=value. Specify '
+    'multiple times to specify different flags. These will override the FLAGS '
+    'object directly after hardcoded settings in individual benchmark methods '
+    'before they call _run_and_report benchmark. Example if we set '
+    '--benchmark_method_flags=train_steps=10 and a benchmark method hardcodes '
+    'FLAGS.train_steps=10000 and later calls _run_and_report_benchmark, '
+    'it\'ll only run for 10 steps. This is useful for '
+    'debugging/profiling workflows.')
+
+
+def enable_runtime_flags(decorated_func):
+  """Sets attributes from --benchmark_method_flags for method execution.
+
+  @enable_runtime_flags decorator temporarily adds flags passed in via
+  --benchmark_method_flags and runs the decorated function in that context.
+
+  A user can set --benchmark_method_flags=train_steps=5 to run the benchmark
+  method in the snippet below with FLAGS.train_steps=5 for debugging (without
+  modifying the benchmark code).
+
+  class ModelBenchmark():
+
+    @benchmark_wrappers.enable_runtime_flags
+    def _run_and_report_benchmark(self):
+      # run benchmark ...
+      # report benchmark results ...
+
+    def benchmark_method(self):
+      FLAGS.train_steps = 1000
+      ...
+      self._run_and_report_benchmark()
+
+  Each override value is converted to an int when it parses as a whole
+  number, to a float when it parses as another number, and is otherwise kept
+  as the raw string.
+
+  Args:
+    decorated_func: The method that runs the benchmark after previous setup
+      execution that set some flags.
+
+  Returns:
+    new_func: The same method which executes in a temporary context where flag
+      overrides from --benchmark_method_flags are active.
+  """
+  # Imported locally so this block does not depend on a file-level import.
+  import functools
+
+  @functools.wraps(decorated_func)  # Keep the wrapped name and docstring.
+  def runner(*args, **kwargs):
+    """Creates a temporary context to activate --benchmark_method_flags."""
+    saved_flag_values = None
+    # The overrides are applied inside the `try` so that a failure part-way
+    # through (for example an unknown flag name) still restores the flags
+    # that were already changed.
+    try:
+      if FLAGS.benchmark_method_flags:
+        saved_flag_values = flagsaver.save_flag_values()
+        for key_value in FLAGS.benchmark_method_flags:
+          key, value = key_value.split('=', 1)
+          try:
+            numeric_float = float(value)
+            numeric_int = int(numeric_float)
+            if abs(numeric_int) == abs(numeric_float):
+              flag_value = numeric_int
+            else:
+              flag_value = numeric_float
+          except (ValueError, OverflowError):
+            # ValueError: not numeric (or NaN). OverflowError: 'inf', which
+            # parses as a float but cannot be converted to an int.
+            flag_value = value
+          logging.info('Setting --%s=%s', key, flag_value)
+          setattr(FLAGS, key, flag_value)
+      return decorated_func(*args, **kwargs)
+    finally:
+      if saved_flag_values is not None:
+        flagsaver.restore_flag_values(saved_flag_values)
+
+  return runner
--- a/Show More
+++ b/Show More