[add]上传训练benchmark by z00560161

This commit is contained in:
liang_chaoming@huawei.com
2020-10-19 20:22:23 +08:00
parent 22b83024f5
commit 82522e2f61
1225 changed files with 345421 additions and 0 deletions
@@ -0,0 +1,36 @@
# GPU image used to run the ResNet benchmark from the official models tree.
FROM nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04
WORKDIR /research
# Refresh the package index in the same layer as the install so that a cached,
# stale index layer is never combined with a newer install step.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    build-essential \
    git \
    python \
    python-pip
ENV HOME /research
ENV PYENV_ROOT $HOME/.pyenv
ENV PATH $PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH
# Recommended packages are kept for this group (no --no-install-recommends),
# matching the original install behaviour for python3-pip and friends.
RUN apt-get update && apt-get install -y \
    python-setuptools \
    python3-pip \
    virtualenv \
    htop
# "sklearn" is only a deprecated alias on PyPI; the real project is scikit-learn.
RUN pip3 install --upgrade numpy scipy scikit-learn tf-nightly-gpu
# Copy the build context into the image (this is a copy, not a mount).
COPY . /research/resnet
WORKDIR /research/resnet
RUN pip3 install -r official/requirements.txt
ENTRYPOINT ["/bin/bash"]
@@ -0,0 +1,47 @@
#!/bin/sh
# Entry script: starts main.sh for the device ids given on the command line,
# either inside a docker container (exec_type = docker) or on the host as
# user HwHiAiUser.
currentDir=$(cd "$(dirname "$0")"; pwd)
cd "${currentDir}" || exit 1
# Every command-line argument is a device id; they are forwarded to main.sh.
DEVICE_LIST="$*"
# {MODE} is a template placeholder (presumably substituted by the job launcher
# before this script runs -- confirm). The value "docker" selects the
# in-container branch below.
export exec_type={MODE}

# Termination handler: in docker mode, run the driver's stop script.
prog_exit()
{
    if [ x"${exec_type}" = xdocker ];
    then
        # stop slogd process
        bash /usr/local/Ascend/driver/tools/docker_stop_post_sys.sh
    fi
}

# register prog_exit
# POSIX sh names the condition TERM; the SIG-prefixed spelling is an extension
# that not every /bin/sh implementation accepts.
trap "prog_exit" TERM

if [ x"${exec_type}" = xdocker ];
then
    # set env
    . "${currentDir}/npu_set_env.sh"
    # start slogd process
    mkdir -p /var/log/npu/slog/slogd
    /usr/local/Ascend/driver/tools/docker/slogd &
    # start main.sh (DEVICE_LIST is deliberately unquoted: one argument per id)
    "${currentDir}/main.sh" ${DEVICE_LIST} &
    # poll every 5 seconds until no train.sh process is listed any more,
    # appending a process snapshot to ps.log on each pass
    flag=1
    while [ $flag -ne 0 ];
    do
        sleep 5;
        flag=$(ps -ef | grep train.sh | grep -v grep | wc -l)
        ps -ef >> "${currentDir}/ps.log"
        echo "" >> "${currentDir}/ps.log"
    done
else
    # start main.sh as HwHiAiUser with the NPU environment loaded first
    su - HwHiAiUser -c ". ${currentDir}/npu_set_env.sh;${currentDir}/main.sh ${DEVICE_LIST}" &
    wait
fi
@@ -0,0 +1,13 @@
{
"group_count": "1",
"group_list": [
{
"group_name": "worker",
"device_count": "{device_count}",
"instance_count": "{instance_count}",
"instance_list": [{instance_list}]
}
],
"status": "completed"
}
@@ -0,0 +1,18 @@
#!/bin/sh
# Start one background train.sh per device id given on the command line,
# wait for all of them to finish, and record start/finish lines in main.log.
currentDir=$(cd "$(dirname "$0")"; pwd)
cd ${currentDir}
device_group=$@
device_num=$#
main_log=${currentDir}/main.log

# Append one timestamped INFO line ($1 is the message text) to main.log.
log_info()
{
    echo "[$(date +%Y%m%d-%H:%M:%S)] [INFO] $1" >> ${main_log}
}

touch ${main_log}
for device_phy_id in ${device_group}
do
    log_info "start: train.sh ${device_phy_id} & "
    ${currentDir}/train.sh ${device_phy_id} &
done
# Block until every background train.sh has exited.
wait
log_info "all train.sh exit "
@@ -0,0 +1,28 @@
# Environment setup for Ascend NPU training jobs. This file is meant to be
# sourced (". npu_set_env.sh"), not executed, so the exports reach the caller.
#
# NOTE(review): the brace-delimited tokens such as {JOB_ID} look like template
# placeholders that a launcher substitutes before this file is sourced --
# confirm with the tool that generates it.

# main env
# LD_LIBRARY_PATH is overwritten here (any previous value is discarded),
# whereas PYTHONPATH and PATH below are appended to.
export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/opp/op_impl/built-in/ai_core/tbe:/code
export PATH=$PATH:/usr/local/Ascend/fwkacllib/ccec_compiler/bin
export ASCEND_OPP_PATH=/usr/local/Ascend/opp
export DDK_VERSION_FLAG=1.60.T17.B830
# presumably a timeout in seconds -- TODO confirm against the HCCL documentation
export HCCL_CONNECT_TIMEOUT=600
# user env (per-job values, filled in from the placeholders)
export JOB_ID={JOB_ID}
export RANK_TABLE_FILE={RANK_TABLE_FILE}
export RANK_SIZE={RANK_SIZE}
export RANK_INDEX={RANK_INDEX}
export RANK_ID={RANK_ID}
# profiling env (per-job values, filled in from the placeholders)
export PROFILING_MODE={PROFILING_MODE}
export AICPU_PROFILING_MODE={AICPU_PROFILING_MODE}
export PROFILING_OPTIONS={PROFILING_OPTIONS}
export FP_POINT={FP_POINT}
export BP_POINT={BP_POINT}
# debug env: optional switches, disabled by default; uncomment to enable
#export DUMP_GE_GRAPH=2
#export DUMP_OP=1
#export DUMP_OP_LESS=1
#export PRINT_MODEL=1
#export TE_PARALLEL_COMPILER=0
@@ -0,0 +1,33 @@
#!/bin/sh
# Run one training process bound to a single device.
#
# Usage: train.sh <device_id>
#
# Output is written to train_<device_id>.log next to this script, followed by
# a final "turing train success" / "turing train fail" marker line. The exit
# status is non-zero when the device id is missing, the working directory
# cannot be entered, or the training command fails.
currentDir=$(cd "$(dirname "$0")"; pwd)
cd "${currentDir}" || exit 1
device_id=$1
if [ -z "${device_id}" ]
then
    echo "turing train fail" >> "${currentDir}/train_${device_id}.log"
    # A bare "exit" would return the status of the echo above (0) and hide
    # the failure from the caller.
    exit 1
fi
export DEVICE_ID=${device_id}

# RANK_INDEX is read from the environment (exported by npu_set_env.sh).
# NOTE(review): 8 is presumably the number of devices per server -- confirm.
DEVICE_INDEX=$(( DEVICE_ID + RANK_INDEX * 8 ))
export DEVICE_INDEX=${DEVICE_INDEX}

# Snapshot the environment for later debugging.
env > "${currentDir}/env_${device_id}.log"

#mkdir exec path
workDir="${currentDir}/${device_id}"
mkdir -p "${workDir}"
# ${var:?} aborts instead of expanding to "/*" if a variable is ever empty.
rm -rf "${currentDir:?}/${device_id:?}"/*
if ! cd "${workDir}"
then
    echo "turing train fail" >> "${currentDir}/train_${device_id}.log"
    exit 1
fi

#start exec
# {RUN_ALGORITHM_CMD} and {CHECKPOINT_DIR} are template placeholders
# (presumably substituted by the launcher before this script runs -- confirm).
python3.7 {RUN_ALGORITHM_CMD} {CHECKPOINT_DIR} > ${currentDir}/train_${device_id}.log 2>&1
status=$?
if [ ${status} -eq 0 ] ;
then
    echo "turing train success" >> "${currentDir}/train_${device_id}.log"
else
    echo "turing train fail" >> "${currentDir}/train_${device_id}.log"
fi
# Propagate the training result so callers can detect failures.
exit ${status}
@@ -0,0 +1,203 @@
Copyright 2015 The TensorFlow Authors. All rights reserved.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2015, The TensorFlow Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
@@ -0,0 +1,20 @@
# Officially Supported TensorFlow 2.1 Models on Cloud TPU
## Natural Language Processing
* [bert](nlp/bert): A powerful pre-trained language representation model:
BERT, which stands for Bidirectional Encoder Representations from
Transformers.
[BERT FineTuning with Cloud TPU](https://cloud.google.com/tpu/docs/tutorials/bert-2.x) provides step-by-step instructions on Cloud TPU training. You can look at [Bert MNLI Tensorboard.dev metrics](https://tensorboard.dev/experiment/mIah5lppTASvrHqWrdr6NA) for the MNLI fine-tuning task.
* [transformer](nlp/transformer): A transformer model to translate the WMT
English to German dataset.
See [Training transformer on Cloud TPU](https://cloud.google.com/tpu/docs/tutorials/transformer-2.x) for step-by-step instructions on Cloud TPU training.
## Computer Vision
* [mnist](vision/image_classification): A basic model to classify digits
from the MNIST dataset. See [Running MNIST on Cloud TPU](https://cloud.google.com/tpu/docs/tutorials/mnist-2.x) tutorial and [Tensorboard.dev metrics](https://tensorboard.dev/experiment/mIah5lppTASvrHqWrdr6NA).
* [resnet](vision/image_classification): A deep residual network that can
be used to classify ImageNet's dataset of 1000 classes.
See [Training ResNet on Cloud TPU](https://cloud.google.com/tpu/docs/tutorials/resnet-2.x) tutorial and [Tensorboard.dev metrics](https://tensorboard.dev/experiment/CxlDK8YMRrSpYEGtBRpOhg).
* [retinanet](vision/detection): A fast and powerful object detector. See [Tensorboard.dev training metrics](https://tensorboard.dev/experiment/b8NRnWU3TqG6Rw0UxueU6Q).
@@ -0,0 +1,149 @@
# TensorFlow Official Models
The TensorFlow official models are a collection of models that use
TensorFlow's high-level APIs. They are intended to be well-maintained, tested,
and kept up to date with the latest TensorFlow API. They should also be
reasonably optimized for fast performance while still being easy to read.
These models are used as end-to-end tests, ensuring that the models run with the
same or improved speed and performance with each new TensorFlow build.
## Tensorflow releases
The master branch of the models is **in development** with TensorFlow 2.x, and
they target the
[nightly binaries](https://github.com/tensorflow/tensorflow#installation) built
from the
[master branch of TensorFlow](https://github.com/tensorflow/tensorflow/tree/master).
You may start from installing with pip:
```shell
pip3 install tf-nightly
```
**Stable versions** of the official models targeting releases of TensorFlow are
available as tagged branches or
[downloadable releases](https://github.com/tensorflow/models/releases). Model
repository version numbers match the target TensorFlow release, such that
[release v2.1.0](https://github.com/tensorflow/models/releases/tag/v2.1.0) are
compatible with
[TensorFlow v2.1.0](https://github.com/tensorflow/tensorflow/releases/tag/v2.1.0).
If you are on a version of TensorFlow earlier than 1.4, please
[update your installation](https://www.tensorflow.org/install/).
## Requirements
Please follow the below steps before running models in this repo:
1. TensorFlow
[nightly binaries](https://github.com/tensorflow/tensorflow#installation)
2. If users would like to clone this repo but do not care about change history,
please consider:
```shell
export repo_version="master"
git clone -b ${repo_version} https://github.com/tensorflow/models.git --depth=1
```
3. Add the top-level ***/models*** folder to the Python path with the command:
```shell
export PYTHONPATH=$PYTHONPATH:/path/to/models
```
Using Colab:
```python
import os
os.environ['PYTHONPATH'] += ":/path/to/models"
```
4. Install dependencies:
```shell
pip3 install --user -r official/requirements.txt
```
To make Official Models easier to use, we are planning to create a pip
installable Official Models package. This is being tracked in
[#917](https://github.com/tensorflow/models/issues/917).
## Available models
**NOTE: For Officially Supported TPU models please check [README-TPU](README-TPU.md).**
**NOTE:** Please make sure to follow the steps in the
[Requirements](#requirements) section.
### Natural Language Processing
* [bert](nlp/bert): A powerful pre-trained language representation model:
BERT, which stands for Bidirectional Encoder Representations from
Transformers.
* [transformer](nlp/transformer): A transformer model to translate the WMT English
to German dataset.
* [xlnet](nlp/xlnet): XLNet: Generalized Autoregressive Pretraining for
Language Understanding.
### Computer Vision
* [mnist](vision/image_classification): A basic model to classify digits from
the MNIST dataset.
* [resnet](vision/image_classification): A deep residual network that can be
used to classify both CIFAR-10 and ImageNet's dataset of 1000 classes.
* [retinanet](vision/detection): A fast and powerful object detector.
### Others
* [ncf](recommendation): Neural Collaborative Filtering model for
recommendation tasks.
Models that will not update to TensorFlow 2.x stay inside R1 directory:
* [boosted_trees](r1/boosted_trees): A Gradient Boosted Trees model to
classify higgs boson process from HIGGS Data Set.
* [wide_deep](r1/wide_deep): A model that combines a wide model and deep
network to classify census income data.
## More models to come!
We are in the process of revamping the official model garden with TensorFlow 2.0 and
Keras. In the near future, we will bring:
* State-of-the-art language understanding models: XLNet, GPT2, and more
members in Transformer family.
* State-of-the-art image classification models: EfficientNet, MnasNet and
variants.
* A set of excellent object detection models.
If you would like to make any fixes or improvements to the models, please
[submit a pull request](https://github.com/tensorflow/models/compare).
## New Models
The team is actively working to add new models to the repository. Every model
should follow the guidelines below to uphold our objectives of
readable, usable, and maintainable code.
**General guidelines**
* Code should be well documented and tested.
* Runnable from a blank environment with relative ease.
* Trainable on: single GPU/CPU (baseline), multiple GPUs, TPU
* Compatible with Python 3 (using [six](https://pythonhosted.org/six/) when
being compatible with Python 2 is necessary)
* Conform to [Google Python Style Guide](https://github.com/google/styleguide/blob/gh-pages/pyguide.md)
**Implementation guidelines**
These guidelines exist so the model implementations are consistent for better
readability and maintainability.
* Use [common utility functions](utils)
* Export SavedModel at the end of training.
* Consistent flags and flag-parsing library
([read more here](utils/flags/guidelines.md))
* Produce benchmarks and logs ([read more here](utils/logs/guidelines.md))
@@ -0,0 +1,56 @@
[
{
"description": "The ID of the benchmark run, where this metric should tie to.",
"mode": "REQUIRED",
"name": "run_id",
"type": "STRING"
},
{
"description": "The name of the metric, which should be descriptive. E.g. training_loss, accuracy.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The unit of the metric. E.g. MB per sec.",
"mode": "NULLABLE",
"name": "unit",
"type": "STRING"
},
{
"description": "The value of the metric.",
"mode": "NULLABLE",
"name": "value",
"type": "FLOAT"
},
{
"description": "The timestamp when the metric is recorded.",
"mode": "REQUIRED",
"name": "timestamp",
"type": "TIMESTAMP"
},
{
"description": "The global step when this metric is recorded.",
"mode": "NULLABLE",
"name": "global_step",
"type": "INTEGER"
},
{
"description": "Free format metadata for the extra information about the metric.",
"mode": "REPEATED",
"name": "extras",
"type": "RECORD",
"fields": [
{
"mode": "NULLABLE",
"name": "name",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
]
}
]
@@ -0,0 +1,368 @@
[
{
"description": "The UUID of the run for the benchmark.",
"mode": "REQUIRED",
"name": "model_id",
"type": "STRING"
},
{
"description": "The name of the model, E.g ResNet50, LeNet-5 etc.",
"mode": "REQUIRED",
"name": "model_name",
"type": "STRING"
},
{
"description": "The date when the test of the model is started",
"mode": "REQUIRED",
"name": "run_date",
"type": "TIMESTAMP"
},
{
"description": "The unique name for a test by the combination of key parameters, eg batch size, num of GPU, etc. It is hardware independent.",
"mode": "NULLABLE",
"name": "test_id",
"type": "STRING"
},
{
"description": "The tensorflow version information.",
"fields": [
{
"description": "Version of the tensorflow. E.g. 1.7.0-rc0",
"mode": "REQUIRED",
"name": "version",
"type": "STRING"
},
{
"description": "Git Hash of the tensorflow",
"mode": "NULLABLE",
"name": "git_hash",
"type": "STRING"
},
{
"description": "The channel of the tensorflow binary, eg, nightly, RC, final, custom.",
"mode": "NULLABLE",
"name": "channel",
"type": "STRING"
},
{
"description": "Identify anything special about the build, eg CUDA 10, NCCL, MKL, etc.",
"mode": "NULLABLE",
"name": "build_type",
"type": "STRING"
}
],
"mode": "REQUIRED",
"name": "tensorflow_version",
"type": "RECORD"
},
{
"description": "The arbitrary attribute of the model.",
"fields": [
{
"description": "The name of the attribute.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the attribute.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "attribute",
"type": "RECORD"
},
{
"description": "Environment variables when the benchmark run is executed.",
"fields": [
{
"description": "The name of the variable.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the variable.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "environment_variable",
"type": "RECORD"
},
{
"description": "TF Environment variables when the benchmark run is executed.",
"fields": [
{
"description": "The name of the variable.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the variable.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "tensorflow_environment_variables",
"type": "RECORD"
},
{
"description": "The list of parameters run with the model. It could contain hyperparameters or others.",
"fields": [
{
"description": "The name of the parameter.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The string value of the parameter.",
"mode": "NULLABLE",
"name": "string_value",
"type": "STRING"
},
{
"description": "The bool value of the parameter.",
"mode": "NULLABLE",
"name": "bool_value",
"type": "STRING"
},
{
"description": "The int/long value of the parameter.",
"mode": "NULLABLE",
"name": "long_value",
"type": "INTEGER"
},
{
"description": "The double/float value of parameter.",
"mode": "NULLABLE",
"name": "float_value",
"type": "FLOAT"
}
],
"mode": "REPEATED",
"name": "run_parameters",
"type": "RECORD"
},
{
"description": "The dataset that run with the benchmark.",
"mode": "NULLABLE",
"name": "dataset",
"type": "RECORD",
"fields": [
{
"description": "The name of the dataset that the model is trained/validated with. E.g ImageNet, mnist.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The arbitrary attribute of the dataset.",
"fields": [
{
"description": "The name of the attribute.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the attribute.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "attribute",
"type": "RECORD"
}
]
},
{
"description": "Used to differentiate from AWS, GCE or DGX-1 at a high level",
"mode": "NULLABLE",
"name": "test_environment",
"type": "STRING"
},
{
"description": "The machine configuration of the benchmark run.",
"mode": "NULLABLE",
"name": "machine_config",
"type": "RECORD",
"fields": [
{
"description": "The platform information of the benchmark run.",
"mode": "NULLABLE",
"name": "platform_info",
"type": "RECORD",
"fields": [
{
"description": "Eg: 64bit.",
"mode": "NULLABLE",
"name": "bits",
"type": "STRING"
},
{
"description": "Eg: ELF.",
"mode": "NULLABLE",
"name": "linkage",
"type": "STRING"
},
{
"description": "Eg: i386.",
"mode": "NULLABLE",
"name": "machine",
"type": "STRING"
},
{
"description": "Eg: 3.13.0-76-generic.",
"mode": "NULLABLE",
"name": "release",
"type": "STRING"
},
{
"description": "Eg: Linux.",
"mode": "NULLABLE",
"name": "system",
"type": "STRING"
},
{
"description": "Eg: #120-Ubuntu SMP Mon Jan 18 15:59:10 UTC 2016.",
"mode": "NULLABLE",
"name": "version",
"type": "STRING"
}
]
},
{
"description": "The CPU information of the benchmark run.",
"mode": "NULLABLE",
"name": "cpu_info",
"type": "RECORD",
"fields": [
{
"mode": "NULLABLE",
"name": "num_cores",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "num_cores_allowed",
"type": "INTEGER"
},
{
"description" : "How fast are those CPUs.",
"mode": "NULLABLE",
"name": "mhz_per_cpu",
"type": "FLOAT"
},
{
"description" : "Additional CPU info, Eg: Intel Ivybridge with HyperThreading (24 cores).",
"mode": "NULLABLE",
"name": "cpu_info",
"type": "STRING"
},
{
"description" : "What kind of cpu scaling is enabled on the host. Eg performance, ondemand, conservative, mixed.",
"mode": "NULLABLE",
"name": "cpu_governor",
"type": "STRING"
},
{
"description": "Cache size of the CPUs.",
"mode": "NULLABLE",
"name": "cache_size",
"type": "RECORD",
"fields": [
{
"mode": "NULLABLE",
"name": "level",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "size",
"type": "INTEGER"
}
]
}
]
},
{
"mode": "NULLABLE",
"name": "gpu_info",
"type": "RECORD",
"fields": [
{
"mode": "NULLABLE",
"name": "count",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "model",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "cuda_version",
"type": "STRING"
}
]
},
{
"description": "The cloud instance information if the benchmark run is executed on cloud",
"mode": "NULLABLE",
"name": "cloud_info",
"type": "RECORD",
"fields": [
{
"description": "The instance type, E.g. n1-standard-4.",
"mode": "NULLABLE",
"name": "instance_type",
"type": "STRING"
},
{
"description": "The arbitrary attribute of the cloud info.",
"fields": [
{
"description": "The name of the attribute.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the attribute.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "attribute",
"type": "RECORD"
}
]
},
{
"mode": "NULLABLE",
"name": "memory_total",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "memory_available",
"type": "STRING"
}
]
}
]
@@ -0,0 +1,14 @@
[
{
"description": "The UUID of the run for the benchmark.",
"mode": "REQUIRED",
"name": "run_id",
"type": "STRING"
},
{
"description": "The status of the run for the benchmark. Eg, running, failed, success",
"mode": "REQUIRED",
"name": "status",
"type": "STRING"
}
]
@@ -0,0 +1,285 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs a ResNet model on the Cifar-10 dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import app
from absl import flags
import numpy as np
import tensorflow as tf
from official.benchmark.models import resnet_cifar_model
from official.utils.flags import core as flags_core
from official.utils.logs import logger
from official.utils.misc import distribution_utils
from official.utils.misc import keras_utils
from official.vision.image_classification.resnet import cifar_preprocessing
from official.vision.image_classification.resnet import common
LR_SCHEDULE = [  # (multiplier, epoch to start) tuples
    (0.1, 91), (0.01, 136), (0.001, 182)
]


def learning_rate_schedule(current_epoch,
                           current_batch,
                           batches_per_epoch,
                           batch_size):
  """Returns the learning rate to use for the given epoch.

  The base rate is `common.BASE_LEARNING_RATE` scaled linearly by
  `batch_size / 128`. LR_SCHEDULE is walked in order; each entry whose start
  epoch has been reached replaces the rate with `base * multiplier`, and the
  walk stops at the first entry whose start epoch lies in the future.

  Args:
    current_epoch: integer, current epoch indexed from 0.
    current_batch: integer, batch index within the epoch. Ignored.
    batches_per_epoch: integer, number of steps in an epoch. Ignored.
    batch_size: integer, total batch size.

  Returns:
    The learning rate for `current_epoch`.
  """
  del current_batch, batches_per_epoch  # not used
  base_rate = common.BASE_LEARNING_RATE * batch_size / 128
  rate = base_rate
  for multiplier, boundary_epoch in LR_SCHEDULE:
    if current_epoch < boundary_epoch:
      # Entries are visited in order; stop at the first boundary not reached.
      break
    rate = base_rate * multiplier
  return rate
class LearningRateBatchScheduler(tf.keras.callbacks.Callback):
  """Keras callback that re-evaluates the learning rate before every batch.

  N.B. Only supports Keras optimizers, not TF optimizers.

  Attributes:
    schedule: callable invoked as
      `schedule(epoch, batch, steps_per_epoch, batch_size)`; must return a
      float (or np.float32 / np.float64) learning rate.
    batch_size: value forwarded to `schedule` as its fourth argument.
    steps_per_epoch: value forwarded to `schedule` as its third argument.
    epochs: index of the current epoch, counted from the number of
      `on_epoch_begin` calls (starts at -1 before the first epoch).
    prev_lr: the learning rate most recently written to the optimizer
      (-1 until the first write).
  """

  def __init__(self, schedule, batch_size, steps_per_epoch):
    super(LearningRateBatchScheduler, self).__init__()
    self.schedule = schedule
    self.batch_size = batch_size
    self.steps_per_epoch = steps_per_epoch
    # Both start at -1: the first on_epoch_begin moves `epochs` to 0, and
    # -1 serves as the "nothing applied yet" marker for `prev_lr`.
    self.epochs = -1
    self.prev_lr = -1

  def on_epoch_begin(self, epoch, logs=None):
    """Checks the optimizer and advances the internal epoch counter."""
    optimizer = self.model.optimizer
    if hasattr(optimizer, 'learning_rate'):
      self.epochs += 1
      return
    raise ValueError('Optimizer must have a "learning_rate" attribute.')

  def on_batch_begin(self, batch, logs=None):
    """Executes before step begins."""
    new_lr = self.schedule(
        self.epochs, batch, self.steps_per_epoch, self.batch_size)
    if not isinstance(new_lr, (float, np.float32, np.float64)):
      raise ValueError('The output of the "schedule" function should be float.')
    if new_lr == self.prev_lr:
      # Unchanged rate: leave the optimizer alone and log nothing.
      return
    self.model.optimizer.learning_rate = new_lr  # validated as float above
    self.prev_lr = new_lr
    tf.compat.v1.logging.debug(
        'Epoch %05d Batch %05d: LearningRateBatchScheduler '
        'change learning rate to %s.', self.epochs, batch, new_lr)
def run(flags_obj):
  """Run ResNet Cifar-10 training and eval loop using native Keras APIs.

  Args:
    flags_obj: An object containing parsed flag values.

  Raises:
    ValueError: If fp16 is passed as it is not currently supported.

  Returns:
    Dictionary of training and eval stats.
  """
  keras_utils.set_session_config(
      enable_eager=flags_obj.enable_eager,
      enable_xla=flags_obj.enable_xla)

  # Execute flag override logic for better model performance
  if flags_obj.tf_gpu_thread_mode:
    keras_utils.set_gpu_thread_mode_and_count(
        per_gpu_thread_count=flags_obj.per_gpu_thread_count,
        gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
        num_gpus=flags_obj.num_gpus,
        datasets_num_private_threads=flags_obj.datasets_num_private_threads)
  common.set_cudnn_batchnorm_mode()

  dtype = flags_core.get_tf_dtype(flags_obj)
  # NOTE(review): this compares the result of flags_core.get_tf_dtype with the
  # string 'fp16'; confirm that helper can return a string, otherwise this
  # guard never fires.
  if dtype == 'fp16':
    raise ValueError('dtype fp16 is not supported in Keras. Use the default '
                     'value(fp32).')

  data_format = flags_obj.data_format
  if data_format is None:
    data_format = ('channels_first'
                   if tf.test.is_built_with_cuda() else 'channels_last')
  tf.keras.backend.set_image_data_format(data_format)

  strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=flags_obj.distribution_strategy,
      num_gpus=flags_obj.num_gpus,
      all_reduce_alg=flags_obj.all_reduce_alg,
      num_packs=flags_obj.num_packs)

  if strategy:
    # flags_obj.enable_get_next_as_optional controls whether enabling
    # get_next_as_optional behavior in DistributedIterator. If true, last
    # partial batch can be supported.
    strategy.extended.experimental_enable_get_next_as_optional = (
        flags_obj.enable_get_next_as_optional
    )

  strategy_scope = distribution_utils.get_strategy_scope(strategy)

  if flags_obj.use_synthetic_data:
    distribution_utils.set_up_synthetic_data()
    input_fn = common.get_synth_input_fn(
        height=cifar_preprocessing.HEIGHT,
        width=cifar_preprocessing.WIDTH,
        num_channels=cifar_preprocessing.NUM_CHANNELS,
        num_classes=cifar_preprocessing.NUM_CLASSES,
        dtype=flags_core.get_tf_dtype(flags_obj),
        drop_remainder=True)
  else:
    distribution_utils.undo_set_up_synthetic_data()
    input_fn = cifar_preprocessing.input_fn

  train_input_dataset = input_fn(
      is_training=True,
      data_dir=flags_obj.data_dir,
      batch_size=flags_obj.batch_size,
      parse_record_fn=cifar_preprocessing.parse_record,
      datasets_num_private_threads=flags_obj.datasets_num_private_threads,
      dtype=dtype,
      # Setting drop_remainder to avoid the partial batch logic in normalization
      # layer, which triggers tf.where and leads to extra memory copy of input
      # sizes between host and GPU.
      drop_remainder=(not flags_obj.enable_get_next_as_optional))

  eval_input_dataset = None
  if not flags_obj.skip_eval:
    eval_input_dataset = input_fn(
        is_training=False,
        data_dir=flags_obj.data_dir,
        batch_size=flags_obj.batch_size,
        parse_record_fn=cifar_preprocessing.parse_record)

  steps_per_epoch = (
      cifar_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size)
  # Placeholder rate used when the LearningRateBatchScheduler callback (added
  # below) drives the learning rate instead of a tensor schedule.
  lr_schedule = 0.1
  if flags_obj.use_tensor_lr:
    initial_learning_rate = common.BASE_LEARNING_RATE * flags_obj.batch_size / 128
    lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
        boundaries=list(p[1] * steps_per_epoch for p in LR_SCHEDULE),
        values=[initial_learning_rate] +
        list(p[0] * initial_learning_rate for p in LR_SCHEDULE))

  with strategy_scope:
    optimizer = common.get_optimizer(lr_schedule)
    model = resnet_cifar_model.resnet56(classes=cifar_preprocessing.NUM_CLASSES)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=optimizer,
        metrics=(['sparse_categorical_accuracy']
                 if flags_obj.report_accuracy_metrics else None),
        run_eagerly=flags_obj.run_eagerly)

  train_epochs = flags_obj.train_epochs

  callbacks = common.get_callbacks(steps_per_epoch)

  if not flags_obj.use_tensor_lr:
    lr_callback = LearningRateBatchScheduler(
        schedule=learning_rate_schedule,
        batch_size=flags_obj.batch_size,
        steps_per_epoch=steps_per_epoch)
    callbacks.append(lr_callback)

  # if multiple epochs, ignore the train_steps flag.
  if train_epochs <= 1 and flags_obj.train_steps:
    steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
    train_epochs = 1

  num_eval_steps = (cifar_preprocessing.NUM_IMAGES['validation'] //
                    flags_obj.batch_size)

  validation_data = eval_input_dataset
  if flags_obj.skip_eval:
    if flags_obj.set_learning_phase_to_train:
      # TODO(haoyuzhang): Understand slowdown of setting learning phase when
      # not using distribution strategy.
      tf.keras.backend.set_learning_phase(1)
    num_eval_steps = None
    validation_data = None

  no_dist_strat_device = None
  if not strategy and flags_obj.explicit_gpu_placement:
    # TODO(b/135607227): Add device scope automatically in Keras training loop
    # when not using distribution strategy.
    no_dist_strat_device = tf.device('/device:GPU:0')
    no_dist_strat_device.__enter__()

  try:
    history = model.fit(train_input_dataset,
                        epochs=train_epochs,
                        steps_per_epoch=steps_per_epoch,
                        callbacks=callbacks,
                        validation_steps=num_eval_steps,
                        validation_data=validation_data,
                        validation_freq=flags_obj.epochs_between_evals,
                        verbose=2)
    eval_output = None
    if not flags_obj.skip_eval:
      eval_output = model.evaluate(eval_input_dataset,
                                   steps=num_eval_steps,
                                   verbose=2)
  finally:
    # Always leave the manually entered device scope, even when training or
    # evaluation raises. The context-manager protocol requires the three
    # exception arguments; calling __exit__() with none raises TypeError for
    # generator-based context managers.
    if no_dist_strat_device is not None:
      no_dist_strat_device.__exit__(None, None, None)

  stats = common.build_stats(history, eval_output, callbacks)
  return stats
def define_cifar_flags():
  """Defines the Keras flags and installs the Cifar-10 default flag values."""
  common.define_keras_flags(dynamic_loss_scale=False)

  cifar_defaults = dict(
      data_dir='/tmp/cifar10_data/cifar-10-batches-bin',
      model_dir='/tmp/cifar10_model',
      epochs_between_evals=10,
      batch_size=128)
  flags_core.set_defaults(**cifar_defaults)
def main(_):
  """Runs `run` on the parsed absl flags inside the benchmark logger context.

  Args:
    _: unused positional argument supplied by `app.run`.

  Returns:
    Whatever `run` returns.
  """
  parsed_flags = flags.FLAGS
  with logger.benchmark_context(parsed_flags):
    return run(parsed_flags)
if __name__ == '__main__':
  # Script entry point: raise log verbosity, register flags, hand off to absl.
  v1_logging = tf.compat.v1.logging
  v1_logging.set_verbosity(v1_logging.INFO)
  define_cifar_flags()
  app.run(main)
@@ -0,0 +1,262 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ResNet56 model for Keras adapted from tf.keras.applications.ResNet50.
# Reference:
- [Deep Residual Learning for Image Recognition](
https://arxiv.org/abs/1512.03385)
Adapted from code contributed by BigMoyan.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import tensorflow as tf
from tensorflow.python.keras import backend
from tensorflow.python.keras import initializers
from tensorflow.python.keras import layers
from tensorflow.python.keras import regularizers
# Passed as `momentum` to every BatchNormalization layer built in this file.
BATCH_NORM_DECAY = 0.997
# Passed as `epsilon` to every BatchNormalization layer built in this file.
BATCH_NORM_EPSILON = 1e-5
# L2 regularization factor applied to every conv kernel and to the kernel and
# bias of the final dense layer.
L2_WEIGHT_DECAY = 2e-4
def identity_building_block(input_tensor,
                            kernel_size,
                            filters,
                            stage,
                            block,
                            training=None):
  """Residual block whose shortcut is the unmodified input (no conv layer).

  Main path: conv -> batch norm -> relu -> conv -> batch norm. The result is
  added to `input_tensor` and passed through a final relu.

  Arguments:
    input_tensor: input tensor
    kernel_size: kernel size of both conv layers on the main path
    filters: pair of integers, the filter counts of the two conv layers on the
      main path
    stage: integer, current stage label, used for generating layer names
    block: current block label, used for generating layer names
    training: Only used if training keras model with Estimator.  In other
      scenarios it is handled automatically.

  Returns:
    Output tensor for the block.
  """
  first_filters, second_filters = filters
  bn_axis = 3 if backend.image_data_format() == 'channels_last' else 1
  name_suffix = str(stage) + block + '_branch'
  conv_prefix = 'res' + name_suffix
  bn_prefix = 'bn' + name_suffix

  def conv(num_filters, tag, tensor):
    # Same-padded, bias-free conv layer with L2-regularized kernel.
    return layers.Conv2D(
        num_filters,
        kernel_size,
        padding='same',
        use_bias=False,
        kernel_initializer='he_normal',
        kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
        name=conv_prefix + tag)(tensor)

  def batch_norm(tag, tensor):
    return layers.BatchNormalization(
        axis=bn_axis,
        momentum=BATCH_NORM_DECAY,
        epsilon=BATCH_NORM_EPSILON,
        name=bn_prefix + tag)(tensor, training=training)

  # Layers are created in the same order as the data flows through them.
  main_path = conv(first_filters, '2a', input_tensor)
  main_path = batch_norm('2a', main_path)
  main_path = layers.Activation('relu')(main_path)

  main_path = conv(second_filters, '2b', main_path)
  main_path = batch_norm('2b', main_path)

  merged = layers.add([main_path, input_tensor])
  return layers.Activation('relu')(merged)
def conv_building_block(input_tensor,
                        kernel_size,
                        filters,
                        stage,
                        block,
                        strides=(2, 2),
                        training=None):
  """Residual block whose shortcut is a 1x1 conv followed by batch norm.

  Main path: strided conv -> batch norm -> relu -> conv -> batch norm. The
  shortcut applies a (1, 1) conv with the same `strides` plus batch norm to
  `input_tensor`; both paths are added and passed through a final relu.

  Arguments:
    input_tensor: input tensor
    kernel_size: kernel size of both conv layers on the main path
    filters: pair of integers, the filter counts of the two conv layers on the
      main path; the second one is also used for the shortcut conv
    stage: integer, current stage label, used for generating layer names
    block: current block label, used for generating layer names
    strides: Strides for the first conv layer in the block and for the
      shortcut conv.
    training: Only used if training keras model with Estimator.  In other
      scenarios it is handled automatically.

  Returns:
    Output tensor for the block.
  """
  first_filters, second_filters = filters
  bn_axis = 3 if tf.keras.backend.image_data_format() == 'channels_last' else 1
  name_suffix = str(stage) + block + '_branch'
  conv_prefix = 'res' + name_suffix
  bn_prefix = 'bn' + name_suffix

  def batch_norm(tag, tensor):
    return layers.BatchNormalization(
        axis=bn_axis,
        momentum=BATCH_NORM_DECAY,
        epsilon=BATCH_NORM_EPSILON,
        name=bn_prefix + tag)(tensor, training=training)

  # Main path, first (strided) convolution.
  main_path = layers.Conv2D(
      first_filters,
      kernel_size,
      strides=strides,
      padding='same',
      use_bias=False,
      kernel_initializer='he_normal',
      kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
      name=conv_prefix + '2a')(input_tensor)
  main_path = batch_norm('2a', main_path)
  main_path = layers.Activation('relu')(main_path)

  # Main path, second convolution.
  main_path = layers.Conv2D(
      second_filters,
      kernel_size,
      padding='same',
      use_bias=False,
      kernel_initializer='he_normal',
      kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
      name=conv_prefix + '2b')(main_path)
  main_path = batch_norm('2b', main_path)

  # Projection shortcut so its output shape matches the main path.
  shortcut = layers.Conv2D(
      second_filters,
      (1, 1),
      strides=strides,
      use_bias=False,
      kernel_initializer='he_normal',
      kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
      name=conv_prefix + '1')(input_tensor)
  shortcut = batch_norm('1', shortcut)

  merged = layers.add([main_path, shortcut])
  return layers.Activation('relu')(merged)
def resnet_block(input_tensor,
                 size,
                 kernel_size,
                 filters,
                 stage,
                 conv_strides=(2, 2),
                 training=None):
  """Applies one conv building block followed by identity building blocks.

  Arguments:
    input_tensor: input tensor
    size: integer, number of constituent conv/identity building blocks.
      A conv block is applied once, followed by (size - 1) identity blocks.
    kernel_size: kernel size of the conv layers on the main path of each
      building block
    filters: pair of integers, the filter counts of the two conv layers on the
      main path of each building block
    stage: integer, current stage label, used for generating layer names
    conv_strides: Strides for the first conv layer in the block.
    training: Only used if training keras model with Estimator.  In other
      scenarios it is handled automatically.

  Returns:
    Output tensor after applying conv and identity blocks.
  """
  output = conv_building_block(
      input_tensor,
      kernel_size,
      filters,
      stage=stage,
      strides=conv_strides,
      block='block_0',
      training=training)
  # Identity blocks are labelled block_1 .. block_(size - 1).
  for block_index in range(1, size):
    output = identity_building_block(
        output,
        kernel_size,
        filters,
        stage=stage,
        block='block_%d' % block_index,
        training=training)
  return output
def resnet(num_blocks, classes=10, training=None):
  """Instantiates the ResNet architecture.

  Arguments:
    num_blocks: integer, the number of conv/identity blocks in each stage.
      The ResNet contains 3 stages, each made of one conv block followed by
      (num_blocks - 1) identity blocks. Each conv/identity block has 2
      convolutional layers on its main path. Together with the input
      convolutional layer and the final dense layer this brings the total
      size of the network to (6*num_blocks + 2)
    classes: optional number of classes to classify images into
    training: Only used if training keras model with Estimator.  In other
      scenarios it is handled automatically.

  Returns:
    A Keras model instance.
  """
  input_shape = (32, 32, 3)
  img_input = layers.Input(shape=input_shape)
  data_format = backend.image_data_format()

  if data_format == 'channels_first':
    # The model input is declared channels-last; move channels to axis 1.
    net = layers.Lambda(lambda x: backend.permute_dimensions(x, (0, 3, 1, 2)),
                        name='transpose')(img_input)
    bn_axis = 1
  else:  # channel_last
    net = img_input
    bn_axis = 3

  # Stem: explicit 1-pixel zero padding followed by a 'valid' 3x3 conv.
  net = layers.ZeroPadding2D(padding=(1, 1), name='conv1_pad')(net)
  net = layers.Conv2D(
      16, (3, 3),
      strides=(1, 1),
      padding='valid',
      use_bias=False,
      kernel_initializer='he_normal',
      kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
      name='conv1')(net)
  net = layers.BatchNormalization(
      axis=bn_axis,
      momentum=BATCH_NORM_DECAY,
      epsilon=BATCH_NORM_EPSILON,
      name='bn_conv1')(net, training=training)
  net = layers.Activation('relu')(net)

  # (stage label, filters, strides of the stage's first conv block)
  stage_specs = (
      (2, [16, 16], (1, 1)),
      (3, [32, 32], (2, 2)),
      (4, [64, 64], (2, 2)),
  )
  for stage, stage_filters, stage_strides in stage_specs:
    net = resnet_block(net, size=num_blocks, kernel_size=3,
                       filters=stage_filters, stage=stage,
                       conv_strides=stage_strides, training=training)

  # Average over the two spatial axes of the active data format.
  rm_axes = [1, 2] if data_format == 'channels_last' else [2, 3]
  net = layers.Lambda(lambda x: backend.mean(x, rm_axes), name='reduce_mean')(net)
  net = layers.Dense(
      classes,
      activation='softmax',
      kernel_initializer=initializers.RandomNormal(stddev=0.01),
      kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
      bias_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
      name='fc10')(net)

  # Create model.
  # NOTE(review): the model is named 'resnet56' regardless of num_blocks.
  return tf.keras.models.Model(img_input, net, name='resnet56')
# Convenience constructors: `resnet` with `num_blocks` pre-bound. Per the
# `resnet` docstring the resulting network size is 6 * num_blocks + 2.
resnet20 = functools.partial(resnet, num_blocks=3)
resnet32 = functools.partial(resnet, num_blocks=5)
resnet56 = functools.partial(resnet, num_blocks=9)
# NOTE(review): the name `resnet10` does not match num_blocks=110, which by
# the formula above yields 662 layers; a 110-layer network would need
# num_blocks=18. Confirm the intent before relying on this alias.
resnet10 = functools.partial(resnet, num_blocks=110)
@@ -0,0 +1,187 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Test the keras ResNet model with Cifar data."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tempfile
import tensorflow as tf
from tensorflow.python.eager import context
from tensorflow.python.platform import googletest
from official.benchmark.models import resnet_cifar_main
from official.utils.misc import keras_utils
from official.utils.testing import integration
from official.vision.image_classification.resnet import cifar_preprocessing
class KerasCifarTest(googletest.TestCase):
  """Unit tests for Keras ResNet with Cifar."""

  # Flags shared by every test: tiny batch, a single step, synthetic input.
  _extra_flags = [
      "-batch_size", "4",
      "-train_steps", "1",
      "-use_synthetic_data", "true"
  ]
  _tempdir = None

  def get_temp_dir(self):
    """Returns this test's temp directory, creating it on first use."""
    if self._tempdir:
      return self._tempdir
    self._tempdir = tempfile.mkdtemp(dir=googletest.GetTempDir())
    return self._tempdir

  @classmethod
  def setUpClass(cls):  # pylint: disable=invalid-name
    super(KerasCifarTest, cls).setUpClass()
    resnet_cifar_main.define_cifar_flags()

  def setUp(self):
    super(KerasCifarTest, self).setUp()
    # Shrink the validation set so evaluation needs a single batch of 4.
    cifar_preprocessing.NUM_IMAGES["validation"] = 4

  def tearDown(self):
    super(KerasCifarTest, self).tearDown()
    tf.io.gfile.rmtree(self.get_temp_dir())

  def _enable_eager(self):
    """Turns on eager execution with the v1 config proto from keras_utils."""
    config = keras_utils.get_config_proto_v1()
    tf.compat.v1.enable_eager_execution(config=config)

  def _skip_unless_gpus(self, required):
    """Skips the current test when fewer than `required` GPUs are visible."""
    if context.num_gpus() < required:
      self.skipTest(
          "{} GPUs are not available for this test. {} GPUs are available".
          format(required, context.num_gpus()))

  def _run_end_to_end(self, test_flags):
    """Runs the Cifar main loop with `test_flags` plus the shared flags."""
    integration.run_synthetic(
        main=resnet_cifar_main.run,
        tmp_root=self.get_temp_dir(),
        extra_flags=test_flags + self._extra_flags
    )

  def test_end_to_end_no_dist_strat(self):
    """Test Keras model with 1 GPU, no distribution strategy."""
    self._enable_eager()
    self._run_end_to_end([
        "-distribution_strategy", "off",
        "-model_dir", "keras_cifar_no_dist_strat",
        "-data_format", "channels_last",
    ])

  def test_end_to_end_graph_no_dist_strat(self):
    """Test Keras model in legacy graph mode with 1 GPU, no dist strat."""
    self._run_end_to_end([
        "-enable_eager", "false",
        "-distribution_strategy", "off",
        "-model_dir", "keras_cifar_graph_no_dist_strat",
        "-data_format", "channels_last",
    ])

  def test_end_to_end_1_gpu(self):
    """Test Keras model with 1 GPU."""
    self._enable_eager()
    self._skip_unless_gpus(1)
    self._run_end_to_end([
        "-num_gpus", "1",
        "-distribution_strategy", "mirrored",
        "-model_dir", "keras_cifar_1_gpu",
        "-data_format", "channels_last",
    ])

  def test_end_to_end_graph_1_gpu(self):
    """Test Keras model in legacy graph mode with 1 GPU."""
    self._skip_unless_gpus(1)
    self._run_end_to_end([
        "-num_gpus", "1",
        "-noenable_eager",
        "-distribution_strategy", "mirrored",
        "-model_dir", "keras_cifar_graph_1_gpu",
        "-data_format", "channels_last",
    ])

  def test_end_to_end_2_gpu(self):
    """Test Keras model with 2 GPUs."""
    self._enable_eager()
    self._skip_unless_gpus(2)
    self._run_end_to_end([
        "-num_gpus", "2",
        "-distribution_strategy", "mirrored",
        "-model_dir", "keras_cifar_2_gpu",
    ])

  def test_end_to_end_graph_2_gpu(self):
    """Test Keras model in legacy graph mode with 2 GPUs."""
    self._skip_unless_gpus(2)
    self._run_end_to_end([
        "-num_gpus", "2",
        "-enable_eager", "false",
        "-distribution_strategy", "mirrored",
        "-model_dir", "keras_cifar_graph_2_gpu",
    ])
if __name__ == "__main__":
  # Run every test in this module through the TensorFlow googletest runner.
  googletest.main()
@@ -0,0 +1,259 @@
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes CTL benchmarks and accuracy tests."""
from __future__ import print_function
import os
import sys
import time
# import pydevd_pycharm
# pydevd_pycharm.settrace('90.253.17.223', port=8008, stdoutToServer=True, stderrToServer=True, suspend=False)
# pylint: disable=g-bad-import-order
from absl import flags
import tensorflow as tf
#sys.path.append(r"/home/wx933135/0708/ResNet50/tensorflow/code")
sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)),'../../../../../../utils/atlasboost'))
from official.r1.resnet import imagenet_main
from official.utils.testing.perfzero_benchmark import PerfZeroBenchmark
from official.utils.testing import benchmark_wrappers
from official.utils.flags import core as flags_core
from benchmark_log import hwlog
from benchmark_log.basic_utils import get_environment_info
from benchmark_log.basic_utils import get_model_parameter
# Passing window for top-1 accuracy handed to _report_benchmark;
# MIN_TOP_1_ACCURACY is also assigned to FLAGS.stop_threshold in
# benchmark_1_npu_fp16.
MIN_TOP_1_ACCURACY = 0.76
MAX_TOP_1_ACCURACY = 0.77
# Flags defined by this script; benchmark_1_npu_fp16 overwrites both with the
# values from the per-rank config dict.
flags.DEFINE_integer('iterations_per_loop', 1000,'iterations per loop')
flags.DEFINE_integer('save_checkpoints_steps', 115200,'save checkpoints steps')
FLAGS = flags.FLAGS
# Extend the import path with the config directory — presumably where the
# resnet_config_*p_npu modules imported in the __main__ section live; verify.
sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)),'../../../config'))
class CtlBenchmark(PerfZeroBenchmark):
  """Base benchmark class with methods to simplify testing."""

  def __init__(self, output_dir=None, default_flags=None, flag_methods=None):
    """Stores the benchmark settings and forwards them to PerfZeroBenchmark.

    Args:
      output_dir: directory where to output e.g. log files.
      default_flags: flag defaults for the benchmark; `{}` when falsy.
      flag_methods: passed through to the base class; `{}` when falsy.
    """
    self.output_dir = output_dir
    self.default_flags = default_flags or {}
    self.flag_methods = flag_methods or {}
    super(CtlBenchmark, self).__init__(
        output_dir=self.output_dir,
        default_flags=self.default_flags,
        flag_methods=self.flag_methods)

  def _report_benchmark(self,
                        stats,
                        wall_time_sec,
                        top_1_max=None,
                        top_1_min=None,
                        total_batch_size=None,
                        log_steps=None,
                        warmup=1):
    """Report benchmark results by writing to local protobuf file.

    Args:
      stats: dict returned from keras models with known entries.
      wall_time_sec: the duration of the benchmark execution in seconds
      top_1_max: highest passing level for top_1 accuracy.
      top_1_min: lowest passing level for top_1 accuracy.
      total_batch_size: Global batch-size. When None, the 'exp_per_second'
        metric is not reported.
      log_steps: How often the log was created for stats['step_timestamp_log'].
        Currently unused by this method.
      warmup: number of entries in stats['step_timestamp_log'] to ignore.
    """
    metrics = []
    if 'eval_acc' in stats:
      metrics.append({
          'name': 'accuracy_top_1',
          'value': stats['eval_acc'],
          'min_value': top_1_min,
          'max_value': top_1_max
      })
      metrics.append({'name': 'eval_loss', 'value': stats['eval_loss']})

      metrics.append({
          'name': 'top_1_train_accuracy',
          'value': stats['train_acc']
      })
      metrics.append({'name': 'train_loss', 'value': stats['train_loss']})

    if (warmup and 'step_timestamp_log' in stats and
        len(stats['step_timestamp_log']) > warmup + 1):
      # first entry in the time_log is start of step 0. The rest of the
      # entries are the end of each step recorded
      time_log = stats['step_timestamp_log']
      steps_elapsed = time_log[-1].batch_index - time_log[warmup].batch_index
      time_elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
      # Throughput needs a batch size and a positive time span. Without this
      # guard the default total_batch_size=None raises TypeError and a zero
      # span raises ZeroDivisionError, which would discard every other metric
      # of the finished run.
      if total_batch_size is not None and time_elapsed > 0:
        examples_per_sec = total_batch_size * (steps_elapsed / time_elapsed)
        metrics.append({'name': 'exp_per_second', 'value': examples_per_sec})
      else:
        print("skip exp_per_second: total_batch_size={}, time_elapsed={}".format(
            total_batch_size, time_elapsed))

    if 'avg_exp_per_second' in stats:
      metrics.append({
          'name': 'avg_exp_per_second',
          'value': stats['avg_exp_per_second']
      })

    print("start flags_core.get_nondefault_flags_as_str")
    flags_str = flags_core.get_nondefault_flags_as_str()
    self.report_benchmark(
        iters=-1,
        wall_time=wall_time_sec,
        metrics=metrics,
        extras={'flags': flags_str})
class Resnet50CtlAccuracy(CtlBenchmark):
  """Benchmark accuracy tests for ResNet50 in CTL."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    """A benchmark class.

    Args:
      output_dir: directory where to output e.g. log files
      root_data_dir: directory under which to look for dataset
      **kwargs: arbitrary named arguments. This is needed to make the
        constructor forward compatible in case PerfZero provides more named
        arguments before updating the constructor.
    """
    # NOTE(review): os.path.join raises TypeError when root_data_dir is left
    # at its default of None — callers must pass a real path.
    self.data_dir = os.path.join(root_data_dir, 'imagenet')

    # The absl `flags` module itself is handed over as `flag_methods`.
    super(Resnet50CtlAccuracy, self).__init__(
        output_dir=output_dir, flag_methods=flags)

  def _run_and_report_benchmark(self):
    """Runs imagenet_main.main, times it and reports against the accuracy window."""
    started_at = time.time()
    stats = imagenet_main.main(flags.FLAGS)
    elapsed_sec = time.time() - started_at

    report_kwargs = dict(
        top_1_min=MIN_TOP_1_ACCURACY,
        top_1_max=MAX_TOP_1_ACCURACY,
        total_batch_size=FLAGS.batch_size,
        log_steps=100)
    super(Resnet50CtlAccuracy, self)._report_benchmark(
        stats, elapsed_sec, **report_kwargs)

  def _get_model_dir(self, folder_name):
    """Returns `folder_name` joined onto this benchmark's output directory."""
    model_dir = os.path.join(self.output_dir, folder_name)
    return model_dir
class Resnet50CtlBenchmarkBase(CtlBenchmark):
  """Resnet50 benchmarks."""

  def __init__(self, output_dir=None, default_flags=None):
    # The absl `flags` module itself is handed over as `flag_methods`.
    super(Resnet50CtlBenchmarkBase, self).__init__(
        output_dir=output_dir,
        flag_methods=flags,
        default_flags=default_flags)

  def _run_and_report_benchmark(self):
    """Runs imagenet_main.benchmark_main, times it and reports the stats."""
    started_at = time.time()
    stats = imagenet_main.benchmark_main()
    elapsed_sec = time.time() - started_at

    # Number of logged step time entries that are excluded in performance
    # report. We keep results from last 100 batches in this case.
    warmup_entries = (FLAGS.train_steps - 100) // FLAGS.log_steps

    report_kwargs = dict(
        total_batch_size=FLAGS.batch_size,
        log_steps=FLAGS.log_steps,
        warmup=warmup_entries)
    super(Resnet50CtlBenchmarkBase, self)._report_benchmark(
        stats, elapsed_sec, **report_kwargs)

  def benchmark_1_npu_fp16(self, config_dict, cluster_device_id):
    """Sets FLAGS for a ResNet-50 v1 run from `config_dict` and runs it.

    Args:
      config_dict: dict providing 'max_train_steps', 'data_dir',
        'train_epochs', 'batch_size', 'epochs_between_evals',
        'iterations_per_loop' and 'save_checkpoints_steps'; a missing key
        sets the corresponding flag to None.
      cluster_device_id: unused.
    """
    print("start benchmark_1_npu_fp16")
    FLAGS.resnet_size = 50
    FLAGS.resnet_version = 1
    # Per the original author's note, max_train_steps is the number of steps
    # per epoch, not the global step; its default follows the number of train
    # images.
    FLAGS.max_train_steps = config_dict.get('max_train_steps')
    FLAGS.hooks = ['examplespersecondhook']
    FLAGS.data_dir = config_dict.get('data_dir')
    # Checkpoints go wherever the MODEL_CKPT_PATH environment variable points.
    FLAGS.model_dir = os.getenv('MODEL_CKPT_PATH')

    # Flags whose name equals their key in config_dict.
    config_driven_flags = (
        'train_epochs',
        'batch_size',
        'epochs_between_evals',
        'iterations_per_loop',
        'save_checkpoints_steps',
    )
    for flag_name in config_driven_flags:
      setattr(FLAGS, flag_name, config_dict.get(flag_name))

    FLAGS.stop_threshold = MIN_TOP_1_ACCURACY
    self._run_and_report_benchmark()
class Resnet50CtlBenchmarkReal(Resnet50CtlBenchmarkBase):
  """Resnet50 real data benchmark tests."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    """Builds the benchmark with an empty set of default flags.

    Args:
      output_dir: directory where to output e.g. log files.
      root_data_dir: accepted for interface compatibility; unused here.
      **kwargs: accepted for interface compatibility; unused here.
    """
    # No flag defaults are set here; earlier overrides (skip_eval, data_dir,
    # train_steps, steps_per_loop, log_steps) were disabled by the authors.
    empty_defaults = dict()
    super(Resnet50CtlBenchmarkReal, self).__init__(
        output_dir=output_dir, default_flags=empty_defaults)
if __name__ == '__main__':
  # Directory containing this script.
  hwlog.ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

  (cpu_info, npu_info, framework_info, os_info,
   benchmark_version) = get_environment_info("tensorflow")
  config_info = get_model_parameter("tensorflow_config")
  initial_data = {"base_lr": 0.128, "dataset": "imagenet1024", "optimizer": "SGD", "loss_scale": 512}

  # Environment and hyper-parameter remarks, emitted in this exact order.
  # NOTE(review): initial_data has no "batchsize" key, so INPUT_BATCH_SIZE is
  # logged as None — confirm which value was intended.
  remarks = [
      (hwlog.CPU_INFO, cpu_info),
      (hwlog.NPU_INFO, npu_info),
      (hwlog.OS_INFO, os_info),
      (hwlog.FRAMEWORK_INFO, framework_info),
      (hwlog.BENCHMARK_VERSION, benchmark_version),
      (hwlog.CONFIG_INFO, config_info),
      (hwlog.BASE_LR, initial_data.get("base_lr")),
      (hwlog.DATASET, initial_data.get("dataset")),
      (hwlog.OPT_NAME, initial_data.get("optimizer")),
      (hwlog.LOSS_SCALE, initial_data.get("loss_scale")),
      (hwlog.INPUT_BATCH_SIZE, initial_data.get("batchsize")),
  ]
  for remark_key, remark_value in remarks:
    hwlog.remark_print(key=remark_key, value=remark_value)

  cluster_device_id = None
  # First command-line argument selects the per-rank config module; any value
  # not listed below falls back to the 8p configuration.
  rank_count = sys.argv[1]
  config_modules = {
      "1": "resnet_config_1p_npu",
      "2": "resnet_config_2p_npu",
      "4": "resnet_config_4p_npu",
      "16": "resnet_config_16p_npu",
      "32": "resnet_config_32p_npu",
  }
  config_module_name = config_modules.get(rank_count, "resnet_config_8p_npu")
  resnet50_config = __import__(config_module_name).resnet50_config

  config_dict = resnet50_config()
  print("config dict info is {}".format(config_dict))

  imagenet_main.benchmark_pre()
  test = Resnet50CtlBenchmarkReal("./result", "./result")
  test.benchmark_1_npu_fp16(config_dict, cluster_device_id)
@@ -0,0 +1,19 @@
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Activations package definition."""
from official.modeling.activations.gelu import gelu
from official.modeling.activations.swish import hard_swish
from official.modeling.activations.swish import identity
from official.modeling.activations.swish import simple_swish
@@ -0,0 +1,40 @@
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Gaussian error linear unit."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import tensorflow as tf
@tf.keras.utils.register_keras_serializable(package='Text')
def gelu(x):
  """Gaussian Error Linear Unit.

  This is a smoother version of the RELU, computed here with the tanh-based
  formula `0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))`.
  Original paper: https://arxiv.org/abs/1606.08415

  Args:
    x: float Tensor to perform activation.

  Returns:
    `x` with the GELU activation applied.
  """
  tanh_scale = math.sqrt(2 / math.pi)
  inner = x + 0.044715 * tf.pow(x, 3)
  cdf = 0.5 * (1.0 + tf.tanh(tanh_scale * inner))
  return x * cdf
@@ -0,0 +1,38 @@
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for the Gaussian error linear unit."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.modeling import activations
@keras_parameterized.run_all_keras_modes
class GeluTest(keras_parameterized.TestCase):
  """Checks `activations.gelu` against precomputed reference values."""

  def test_gelu(self):
    """GELU of a small 2x3 input matches the stored expectations."""
    inputs = [[.25, 0, -.25], [-1, -2, 3]]
    expected_data = [[0.14967535, 0., -0.10032465],
                     [-0.15880796, -0.04540223, 2.9963627]]
    actual_data = activations.gelu(inputs)
    self.assertAllClose(expected_data, actual_data)
# Run the test cases of this module when it is executed as a script.
if __name__ == '__main__':
  tf.test.main()
@@ -0,0 +1,75 @@
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Customized Swish activation."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
@tf.keras.utils.register_keras_serializable(package='Text')
def simple_swish(features):
  """Computes the Swish activation function, `x * sigmoid(x)`.

  The tf.nn.swish operation uses a custom gradient to reduce memory usage.
  Since saving custom gradients in SavedModel is currently not supported, and
  one would not be able to use an exported TF-Hub module for fine-tuning, this
  wrapper offers a customized operation that uses the default TensorFlow
  gradient computation instead of the native TensorFlow swish operation.

  Args:
    features: A `Tensor` representing preactivation values.

  Returns:
    The activation value.
  """
  preactivations = tf.convert_to_tensor(features)
  gate = tf.nn.sigmoid(preactivations)
  return preactivations * gate
@tf.keras.utils.register_keras_serializable(package='Text')
def hard_swish(features):
  """Computes a hard version of the swish function.

  The result is `x * relu6(x + 3) / 6`. This operation can be used to reduce
  computational cost and improve quantization for edge devices.

  Args:
    features: A `Tensor` representing preactivation values.

  Returns:
    The activation value, with the same dtype as the converted input.
  """
  features = tf.convert_to_tensor(features)
  # Build the offset in the input's own dtype: a plain `tf.constant(3.)` is
  # float32 and cannot be added to e.g. a float16 or float64 tensor.
  offset = tf.cast(3., features.dtype)
  return features * tf.nn.relu6(features + offset) * (1. / 6.)
@tf.keras.utils.register_keras_serializable(package='Text')
def identity(features):
  """Computes the identity function.

  The input is converted to a tensor and passed through `tf.identity`.
  Useful for helping in quantization.

  Args:
    features: A `Tensor` representing preactivation values.

  Returns:
    The activation value.
  """
  return tf.identity(tf.convert_to_tensor(features))
@@ -0,0 +1,49 @@
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for the customized Swish activation."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import
from official.modeling import activations
@keras_parameterized.run_all_keras_modes
class CustomizedSwishTest(keras_parameterized.TestCase):
  """Compares the customized swish activations with reference results."""

  def _hard_swish_np(self, x):
    """NumPy reference for hard swish: `x * clip(x + 3, 0, 6) / 6`."""
    values = np.float32(x)
    gate = np.clip(values + 3, 0, 6)
    return values * gate / 6

  def test_simple_swish(self):
    """`simple_swish` agrees with the native `tf.nn.swish`."""
    features = [[.25, 0, -.25], [-1, -2, 3]]
    actual = activations.simple_swish(features)
    reference = tf.nn.swish(features)
    self.assertAllClose(actual, reference)

  def test_hard_swish(self):
    """`hard_swish` agrees with the NumPy reference implementation."""
    features = [[.25, 0, -.25], [-1, -2, 3]]
    actual = activations.hard_swish(features)
    reference = self._hard_swish_np(features)
    self.assertAllClose(actual, reference)
# Run the test cases of this module when it is executed as a script.
if __name__ == '__main__':
  tf.test.main()
@@ -0,0 +1,318 @@
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Base configurations to standardize experiments."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import copy
import functools
from typing import Any, List, Mapping, Optional, Type
import dataclasses
import tensorflow as tf
import yaml
from official.modeling.hyperparams import params_dict
@dataclasses.dataclass
class Config(params_dict.ParamsDict):
  """The base configuration class that supports YAML/JSON based overrides.

  * It recursively enforces a whitelist of basic types and container types, so
    it avoids surprises with copy and reuse caused by unanticipated types.
  * It converts dict to Config within a (single-level) sequence,
    e.g. for config = Config({'key': [{'a': 42}]}),
         type(config.key[0]) is Config rather than dict.
  * Sequences nested inside sequences are rejected with a TypeError.
  """
  # It's safe to add bytes and other immutable types here.
  IMMUTABLE_TYPES = (str, int, float, bool, type(None))
  # It's safe to add set, frozenset and other collections here.
  SEQUENCE_TYPES = (list, tuple)

  # Init-only arguments; they are forwarded to ParamsDict.__init__ by
  # __post_init__ and are not stored as dataclass fields.
  default_params: dataclasses.InitVar[Optional[Mapping[str, Any]]] = None
  restrictions: dataclasses.InitVar[Optional[List[str]]] = None

  @classmethod
  def _isvalidsequence(cls, v):
    """Check if the input values are valid sequences.

    Args:
      v: Input sequence.

    Returns:
      True if the sequence is valid. Valid sequence includes the sequence
      type in cls.SEQUENCE_TYPES and element type is in cls.IMMUTABLE_TYPES or
      is dict or ParamsDict. The elements must all fall in the same one of
      these three categories.
    """
    if not isinstance(v, cls.SEQUENCE_TYPES):
      return False
    return (all(isinstance(e, cls.IMMUTABLE_TYPES) for e in v) or
            all(isinstance(e, dict) for e in v) or
            all(isinstance(e, params_dict.ParamsDict) for e in v))

  @classmethod
  def _import_config(cls, v, subconfig_type):
    """Returns v with dicts converted to Configs, recursively.

    Args:
      v: the value to import.
      subconfig_type: the ParamsDict subclass used to wrap dict values.

    Returns:
      The imported value: immutable values unchanged, sequences rebuilt with
      imported elements, ParamsDict values deep-copied, dicts wrapped in
      `subconfig_type`.

    Raises:
      TypeError: if `subconfig_type` is not a ParamsDict subclass, if `v` is an
        invalid sequence, or if `v` has an unsupported type.
    """
    if not issubclass(subconfig_type, params_dict.ParamsDict):
      raise TypeError(
          'Subconfig_type should be subclass of ParamsDict, found {!r}'.format(
              subconfig_type))
    if isinstance(v, cls.IMMUTABLE_TYPES):
      return v
    elif isinstance(v, cls.SEQUENCE_TYPES):
      # Only support one layer of sequence.
      if not cls._isvalidsequence(v):
        raise TypeError(
            'Invalid sequence: only supports single level {!r} of {!r} or '
            'dict or ParamsDict found: {!r}'.format(cls.SEQUENCE_TYPES,
                                                    cls.IMMUTABLE_TYPES, v))
      import_fn = functools.partial(
          cls._import_config, subconfig_type=subconfig_type)
      return type(v)(map(import_fn, v))
    elif isinstance(v, params_dict.ParamsDict):
      # Deepcopy here is a temporary solution for preserving type in nested
      # Config object.
      return copy.deepcopy(v)
    elif isinstance(v, dict):
      return subconfig_type(v)
    else:
      raise TypeError('Unknown type: {!r}'.format(type(v)))

  @classmethod
  def _export_config(cls, v):
    """Returns v with Configs converted to dicts, recursively.

    Raises:
      TypeError: if `v` is a plain dict or has an unsupported type.
    """
    if isinstance(v, cls.IMMUTABLE_TYPES):
      return v
    elif isinstance(v, cls.SEQUENCE_TYPES):
      return type(v)(map(cls._export_config, v))
    elif isinstance(v, params_dict.ParamsDict):
      return v.as_dict()
    elif isinstance(v, dict):
      raise TypeError('dict value not supported in converting.')
    else:
      raise TypeError('Unknown type: {!r}'.format(type(v)))

  @classmethod
  def _get_subconfig_type(cls, k) -> Type[params_dict.ParamsDict]:
    """Get element type by the field name.

    Args:
      k: the key/name of the field.

    Returns:
      Config as default. If a type annotation is found for `k`,
      1) returns the type of the annotation if it is subtype of Config;
      2) returns the element type if the annotation of `k` is List[SubType]
         or Tuple[SubType] and SubType is a subtype of ParamsDict.
    """
    subconfig_type = Config
    if k in cls.__annotations__:
      type_annotation = cls.__annotations__[k]
      if (isinstance(type_annotation, type) and
          issubclass(type_annotation, Config)):
        # Directly Config subtype.
        subconfig_type = type_annotation
      else:
        # Check if the field is a sequence of subtypes.
        field_type = getattr(type_annotation, '__origin__', type(None))
        if (isinstance(field_type, type) and
            issubclass(field_type, cls.SEQUENCE_TYPES)):
          # `__args__` may be missing, None or empty for an unparameterized
          # annotation; fall back to NoneType so the default is kept.
          type_args = (
              getattr(type_annotation, '__args__', None) or (type(None),))
          element_type = type_args[0]
          # The element annotation may be a typing construct (Optional[...],
          # a TypeVar, a forward reference) rather than a class; issubclass
          # would raise TypeError on those, so check that it is a class first.
          if (isinstance(element_type, type) and
              issubclass(element_type, params_dict.ParamsDict)):
            subconfig_type = element_type
    return subconfig_type

  def __post_init__(self, default_params, restrictions, *args, **kwargs):
    """Forwards the init-only arguments to `ParamsDict.__init__`."""
    super().__init__(default_params=default_params,
                     restrictions=restrictions,
                     *args,
                     **kwargs)

  def _set(self, k, v):
    """Overrides same method in ParamsDict.

    Also called by ParamsDict methods.

    Args:
      k: key to set.
      v: value.

    Raises:
      TypeError: if `v` cannot be imported (see `_import_config`).
    """
    subconfig_type = self._get_subconfig_type(k)
    if isinstance(v, dict):
      if k not in self.__dict__ or not self.__dict__[k]:
        # If the key does not exist or the value is None (or otherwise falsy),
        # a new Config-family object should be created for the key.
        self.__dict__[k] = subconfig_type(v)
      else:
        self.__dict__[k].override(v)
    else:
      self.__dict__[k] = self._import_config(v, subconfig_type)

  def __setattr__(self, k, v):
    """Sets `k` to `v`, refusing changes once the Config is locked.

    Raises:
      ValueError: if the Config is locked and `k` is not a reserved attribute.
    """
    if k not in self.RESERVED_ATTR:
      # `_locked` may not be set yet while the dataclass-generated __init__
      # assigns field defaults, hence the getattr default.
      if getattr(self, '_locked', False):
        raise ValueError('The Config has been locked. ' 'No change is allowed.')
    self._set(k, v)

  def _override(self, override_dict, is_strict=True):
    """Overrides same method in ParamsDict.

    Also called by ParamsDict methods.

    Args:
      override_dict: dictionary to write to .
      is_strict: If True, not allows to add new keys.

    Raises:
      KeyError: overriding reserved keys or keys not exist (is_strict=True).
    """
    for k, v in sorted(override_dict.items()):
      if k in self.RESERVED_ATTR:
        raise KeyError('The key {!r} is internally reserved. '
                       'Can not be overridden.'.format(k))
      if k not in self.__dict__:
        if is_strict:
          raise KeyError('The key {!r} does not exist in {!r}. '
                         'To extend the existing keys, use '
                         '`override` with `is_strict` = False.'.format(
                             k, type(self)))
        else:
          self._set(k, v)
      else:
        # Recurse only into an existing, non-empty nested value; otherwise the
        # new value replaces whatever is stored.
        if isinstance(v, dict) and self.__dict__[k]:
          self.__dict__[k]._override(v, is_strict)  # pylint: disable=protected-access
        elif isinstance(v, params_dict.ParamsDict) and self.__dict__[k]:
          self.__dict__[k]._override(v.as_dict(), is_strict)  # pylint: disable=protected-access
        else:
          self._set(k, v)

  def as_dict(self):
    """Returns a dict representation of params_dict.ParamsDict.

    For the nested params_dict.ParamsDict, a nested dict will be returned.
    Reserved attributes are left out.
    """
    return {
        k: self._export_config(v)
        for k, v in self.__dict__.items()
        if k not in self.RESERVED_ATTR
    }

  def replace(self, **kwargs):
    """Like `override`, but returns a copy with the current config unchanged."""
    params = self.__class__(self)
    params.override(kwargs, is_strict=True)
    return params

  @classmethod
  def from_yaml(cls, file_path: str):
    """Builds a config with default values and overrides it from a YAML file.

    Note: This only works if the Config has all default values.

    Args:
      file_path: path of the YAML (or JSON) file, opened with tf.io.gfile.

    Returns:
      A new instance of `cls` overridden (strictly) with the file's content.
    """
    # PyYAML >= 6 requires an explicit Loader. FullLoader (PyYAML >= 5.1) is
    # the loader a bare `yaml.load(f)` used on 5.x; older releases only ship
    # `yaml.Loader`, their former default.
    # NOTE(review): neither loader is meant for untrusted files; consider
    # `yaml.safe_load` if configs can come from untrusted sources.
    loader = getattr(yaml, 'FullLoader', yaml.Loader)
    with tf.io.gfile.GFile(file_path, 'r') as f:
      loaded = yaml.load(f, Loader=loader)
      config = cls()
      # An empty file loads as None; treat it as "no overrides".
      config.override(loaded or {})
      return config

  @classmethod
  def from_json(cls, file_path: str):
    """Wrapper for `from_yaml`."""
    return cls.from_yaml(file_path)

  @classmethod
  def from_args(cls, *args, **kwargs):
    """Builds a config from the given list of arguments.

    Positional arguments are matched, in order, with the names annotated on
    `cls`; keyword arguments are then applied on top.
    """
    attributes = list(cls.__annotations__.keys())
    default_params = dict(zip(attributes, args))
    default_params.update(kwargs)
    return cls(default_params)
@dataclasses.dataclass
class RuntimeConfig(Config):
  """High-level configurations for Runtime.

  These include parameters that are not directly related to the experiment,
  e.g. directories, accelerator type, etc. Every field declares a default.

  Attributes:
    distribution_strategy: e.g. 'mirrored', 'tpu', etc.
    enable_eager: Whether or not to enable eager mode.
    enable_xla: Whether or not to enable XLA.
    gpu_threads_enabled: Whether or not GPU threads are enabled.
    gpu_thread_mode: Whether and how the GPU device uses its own threadpool.
    dataset_num_private_threads: Number of threads for a private threadpool
      created for all datasets computation.
    per_gpu_thread_count: thread count per GPU.
    tpu: The address of the TPU to use, if any.
    num_gpus: The number of GPUs to use, if any.
    worker_hosts: comma-separated list of worker ip:port pairs for running
      multi-worker models with DistributionStrategy.
    task_index: If multi-worker training, the task index of this worker.
    all_reduce_alg: Defines the algorithm for performing all-reduce.
    num_packs: Sets `num_packs` in the cross device ops used in
      MirroredStrategy.  For details, see tf.distribute.NcclAllReduce.
  """
  distribution_strategy: str = 'mirrored'
  enable_eager: bool = False
  enable_xla: bool = False
  gpu_threads_enabled: bool = False
  gpu_thread_mode: Optional[str] = None
  dataset_num_private_threads: Optional[int] = None
  per_gpu_thread_count: int = 0
  tpu: Optional[str] = None
  num_gpus: int = 0
  worker_hosts: Optional[str] = None
  task_index: int = -1
  all_reduce_alg: Optional[str] = None
  num_packs: int = 1
@dataclasses.dataclass
class TensorboardConfig(Config):
  """Configuration for Tensorboard.

  Attributes:
    track_lr: Whether or not to track the learning rate in Tensorboard. Defaults
      to True.
    write_model_weights: Whether or not to write the model weights as
      images in Tensorboard. Defaults to False.
  """
  track_lr: bool = True
  write_model_weights: bool = False
@dataclasses.dataclass
class CallbacksConfig(Config):
  """Configuration for Callbacks.

  Attributes:
    enable_checkpoint_and_export: Whether or not to enable checkpoints as a
      Callback. Defaults to True.
    enable_tensorboard: Whether or not to enable Tensorboard as a Callback.
      Defaults to True.
  """
  enable_checkpoint_and_export: bool = True
  enable_tensorboard: bool = True
@@ -0,0 +1,299 @@
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import pprint
from typing import List, Tuple
from absl.testing import parameterized
import dataclasses
import tensorflow as tf
from official.modeling.hyperparams import base_config
@dataclasses.dataclass
class DumpConfig1(base_config.Config):
  """Minimal test config with two primitive fields."""
  a: int = 1
  b: str = 'text'
@dataclasses.dataclass
class DumpConfig2(base_config.Config):
  """Test config that nests a `DumpConfig1` under field `e`."""
  c: int = 2
  d: str = 'text'
  # A dataclass instance is unhashable, and Python 3.11+ rejects unhashable
  # field defaults with a ValueError at class creation. A default_factory
  # builds a fresh DumpConfig1 per instance and works on every version.
  e: DumpConfig1 = dataclasses.field(default_factory=DumpConfig1)
@dataclasses.dataclass
class DumpConfig3(DumpConfig2):
  """Test config extending DumpConfig2 with list and tuple Config fields."""
  f: int = 2
  # NOTE: `g` is annotated again at the end of the class body; that later
  # `Tuple[DumpConfig1, ...]` annotation and default replace this one.
  g: str = 'text'
  h: List[DumpConfig1] = dataclasses.field(
      default_factory=lambda: [DumpConfig1(), DumpConfig1()])
  g: Tuple[DumpConfig1, ...] = (DumpConfig1(),)
class BaseConfigTest(parameterized.TestCase, tf.test.TestCase):
  """Tests for importing, exporting and overriding `base_config.Config`."""

  def assertHasSameTypes(self, c, d, msg=''):
    """Checks if a Config has the same structure as a given dict.

    Args:
      c: the Config object to be check.
      d: the reference dict object.
      msg: The error message to show when type mismatched.
    """
    # The reference must never be a Config itself: it is either a dict, a
    # sequence or a primitive, while `c` is the Config (or primitive) side.
    self.assertNotIsInstance(d, base_config.Config)
    if isinstance(d, base_config.Config.IMMUTABLE_TYPES):
      self.assertEqual(pprint.pformat(c), pprint.pformat(d), msg=msg)
      return
    if isinstance(d, base_config.Config.SEQUENCE_TYPES):
      self.assertEqual(type(c), type(d), msg=msg)
      for index, expected in enumerate(d):
        self.assertHasSameTypes(
            c[index], expected, msg='{}[{!r}]'.format(msg, index))
      return
    if isinstance(d, dict):
      self.assertIsInstance(c, base_config.Config, msg=msg)
      for name, expected in sorted(d.items()):
        self.assertHasSameTypes(
            getattr(c, name), expected, msg='{}[{!r}]'.format(msg, name))
      return
    raise TypeError('Unknown type: %r' % type(d))

  def assertImportExport(self, v):
    """Asserts that `v` survives a round trip through a Config unchanged."""
    config = base_config.Config({'key': v})
    exported = config.as_dict()['key']
    self.assertEqual(pprint.pformat(exported), pprint.pformat(v))
    self.assertHasSameTypes(config.key, v, msg='=%s v' % pprint.pformat(v))

  def test_invalid_keys(self):
    """Reading an undefined key raises AttributeError."""
    empty_config = base_config.Config()
    with self.assertRaises(AttributeError):
      _ = empty_config.a

  def test_nested_config_types(self):
    """Nested Config subtypes are preserved by defaults and by overrides."""
    config = DumpConfig3()
    self.assertIsInstance(config.e, DumpConfig1)
    self.assertIsInstance(config.h[0], DumpConfig1)
    self.assertIsInstance(config.h[1], DumpConfig1)
    self.assertIsInstance(config.g[0], DumpConfig1)

    # Override a directly nested config.
    config.override({'e': {'a': 2, 'b': 'new text'}})
    self.assertIsInstance(config.e, DumpConfig1)
    self.assertEqual(config.e.a, 2)
    self.assertEqual(config.e.b, 'new text')

    # Override a list-typed field.
    config.override({'h': [{'a': 3, 'b': 'new text 2'}]})
    self.assertIsInstance(config.h[0], DumpConfig1)
    self.assertLen(config.h, 1)
    self.assertEqual(config.h[0].a, 3)
    self.assertEqual(config.h[0].b, 'new text 2')

    # Override a tuple-typed field.
    config.override({'g': [{'a': 4, 'b': 'new text 3'}]})
    self.assertIsInstance(config.g[0], DumpConfig1)
    self.assertLen(config.g, 1)
    self.assertEqual(config.g[0].a, 4)
    self.assertEqual(config.g[0].b, 'new text 3')

  @parameterized.parameters(
      ('_locked', "The key '_locked' is internally reserved."),
      ('_restrictions', "The key '_restrictions' is internally reserved."),
      ('aa', "The key 'aa' does not exist."),
  )
  def test_key_error(self, key, msg):
    """Overriding reserved or unknown keys raises a descriptive KeyError."""
    config = base_config.Config()
    with self.assertRaisesRegex(KeyError, msg):
      config.override({key: True})

  @parameterized.parameters(
      ('str data',),
      (123,),
      (1.23,),
      (None,),
      (['str', 1, 2.3, None],),
      (('str', 1, 2.3, None),),
  )
  def test_import_export_immutable_types(self, v):
    """Primitives and flat sequences of primitives round-trip unchanged."""
    self.assertImportExport(v)
    config = base_config.Config({'key': v})
    self.assertEqual(pprint.pformat(v), pprint.pformat(config.key))

  def test_override_is_strict_true(self):
    """Strict override updates existing keys and rejects new ones."""
    params = base_config.Config({
        'a': 'aa',
        'b': 2,
        'c': {'c1': 'cc', 'c2': 20},
    })
    params.override({'a': 2, 'c': {'c1': 'ccc'}}, is_strict=True)
    self.assertEqual(params.a, 2)
    self.assertEqual(params.c.c1, 'ccc')
    with self.assertRaises(KeyError):
      params.override({'d': 'ddd'}, is_strict=True)
    with self.assertRaises(KeyError):
      params.override({'c': {'c3': 30}}, is_strict=True)

    # A sequence value is replaced as a whole, so its old keys disappear.
    config = base_config.Config({'key': [{'a': 42}]})
    config.override({'key': [{'b': 43}]})
    self.assertEqual(config.key[0].b, 43)
    with self.assertRaisesRegex(AttributeError, 'The key `a` does not exist'):
      _ = config.key[0].a

  @parameterized.parameters(
      (lambda x: x, 'Unknown type'),
      (object(), 'Unknown type'),
      (set(), 'Unknown type'),
      (frozenset(), 'Unknown type'),
  )
  def test_import_unsupport_types(self, v, msg):
    """Values outside the supported types are rejected with TypeError."""
    with self.assertRaisesRegex(TypeError, msg):
      _ = base_config.Config({'key': v})

  @parameterized.parameters(
      ({'a': [{'b': 2}, {'c': 3}]},),
      ({'c': [{'f': 1.1}, {'h': [1, 2]}]},),
      (({'a': 'aa', 'b': 2, 'c': {'c1': 10, 'c2': 20}},),),
  )
  def test_import_export_nested_structure(self, d):
    """Nested dict structures round-trip through a Config."""
    self.assertImportExport(d)

  @parameterized.parameters(
      ([{'a': 42, 'b': 'hello', 'c': 1.2}],),
      (({'a': 42, 'b': 'hello', 'c': 1.2},),),
  )
  def test_import_export_nested_sequences(self, v):
    """A single-level sequence of dicts round-trips through a Config."""
    self.assertImportExport(v)

  @parameterized.parameters(
      ([([{}],)],),
      ([['str', 1, 2.3, None]],),
      ((('str', 1, 2.3, None),),),
      ([('str', 1, 2.3, None)],),
      ([('str', 1, 2.3, None)],),
      ([[{'a': 42, 'b': 'hello', 'c': 1.2}]],),
      ([[[{'a': 42, 'b': 'hello', 'c': 1.2}]]],),
      ((({'a': 42, 'b': 'hello', 'c': 1.2},),),),
      (((({'a': 42, 'b': 'hello', 'c': 1.2},),),),),
      ([({'a': 42, 'b': 'hello', 'c': 1.2},)],),
      (([{'a': 42, 'b': 'hello', 'c': 1.2}],),),
  )
  def test_import_export_unsupport_sequence(self, v):
    """Sequences nested inside sequences are rejected with TypeError."""
    with self.assertRaisesRegex(TypeError,
                                'Invalid sequence: only supports single level'):
      _ = base_config.Config({'key': v})

  def test_construct_subtype(self):
    """Placeholder test; it currently asserts nothing."""
    pass

  def test_import_config(self):
    """Dicts inside a list become Config objects, recursively."""
    params = base_config.Config({'a': [{'b': 2}, {'c': {'d': 3}}]})
    self.assertLen(params.a, 2)
    first, second = params.a[0], params.a[1]
    self.assertEqual(first.b, 2)
    self.assertEqual(type(first), base_config.Config)
    self.assertEqual(pprint.pformat(first.b), '2')
    self.assertEqual(type(second), base_config.Config)
    self.assertEqual(type(second.c), base_config.Config)
    self.assertEqual(pprint.pformat(second.c.d), '3')

  def test_override(self):
    """Non-strict override of a list keeps container and element types."""
    params = base_config.Config({'a': [{'b': 2}, {'c': {'d': 3}}]})
    params.override({'a': [{'b': 4}, {'c': {'d': 5}}]}, is_strict=False)
    self.assertEqual(type(params.a), list)
    first, second = params.a[0], params.a[1]
    self.assertEqual(type(first), base_config.Config)
    self.assertEqual(pprint.pformat(first.b), '4')
    self.assertEqual(type(second), base_config.Config)
    self.assertEqual(type(second.c), base_config.Config)
    self.assertEqual(pprint.pformat(second.c.d), '5')

  @parameterized.parameters(
      ([{}],),
      (({},),),
  )
  def test_config_vs_params_dict(self, v):
    """Config converts dicts inside sequences; ParamsDict leaves them."""
    wrapped = {'key': v}
    self.assertEqual(
        type(base_config.Config(wrapped).key[0]), base_config.Config)
    self.assertEqual(
        type(base_config.params_dict.ParamsDict(wrapped).key[0]), dict)

  def test_ppformat(self):
    """Pins the `pprint.pformat` output the other assertions rely on."""
    sample = [
        's', 1, 1.0, True, None, {}, [], (),
        {(2,): (3, [4], {6: 7}), 8: 9},
    ]
    self.assertEqual(
        pprint.pformat(sample),
        "['s', 1, 1.0, True, None, {}, [], (), {8: 9, (2,): (3, [4], {6: 7})}]")
# Run the test cases of this module when it is executed as a script.
if __name__ == '__main__':
  tf.test.main()
@@ -0,0 +1,410 @@
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A parameter dictionary class which supports the nest structure."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import copy
import re
import six
import tensorflow as tf
import yaml
# Regex pattern that matches one key-value pair of a comma-separated
# key-value pair string. Each pair is split on the `=` sign. The key starts
# with a letter and may contain word characters and dots. The value may be a
# single-quoted string, a double-quoted string, a single bare value (e.g. a
# float or an int), or a list of values within brackets. The match also
# consumes the trailing comma separator, or ends at the end of the string.
_PARAM_RE = re.compile(r"""
(?P<name>[a-zA-Z][\w\.]*) # variable name: "var" or "x"
\s*=\s*
((?P<val>\'(.*?)\' # single quote
|
\"(.*?)\" # double quote
|
[^,\[]* # single value
|
\[[^\]]*\])) # list of values
($|,\s*)""", re.VERBOSE)
class ParamsDict(object):
  """A hyperparameter container class.

  Parameters are stored as instance attributes; nested dicts become nested
  ParamsDict objects. The names in `RESERVED_ATTR` hold internal state and can
  not be used as parameter names.
  """

  RESERVED_ATTR = ['_locked', '_restrictions']

  def __init__(self, default_params=None, restrictions=None):
    """Instantiate a ParamsDict.

    Instantiate a ParamsDict given a set of default parameters and a list of
    restrictions. Upon initialization, it validates itself by checking all the
    defined restrictions, and raise error if it finds inconsistency.

    Args:
      default_params: a Python dict or another ParamsDict object including the
        default parameters to initialize.
      restrictions: a list of strings, which define a list of restrictions to
        ensure the consistency of different parameters internally. Each
        restriction string is defined as a binary relation with a set of
        operators, including {'==', '!=',  '<', '<=', '>', '>='}.
    """
    self._locked = False
    self._restrictions = []
    if restrictions:
      self._restrictions = restrictions
    if default_params is None:
      default_params = {}
    self.override(default_params, is_strict=False)
    self.validate()

  def _set(self, k, v):
    """Stores `v` under `k`: dicts become ParamsDict, others are deep-copied."""
    if isinstance(v, dict):
      self.__dict__[k] = ParamsDict(v)
    else:
      self.__dict__[k] = copy.deepcopy(v)

  def __setattr__(self, k, v):
    """Sets the value of the existing key.

    Note that this does not allow directly defining a new key. Use the
    `override` method with `is_strict=False` instead.

    Args:
      k: the key string.
      v: the value to be used to set the key `k`.

    Raises:
      KeyError: if k is not defined in the ParamsDict.
      ValueError: if the ParamsDict has been locked.
    """
    if k not in ParamsDict.RESERVED_ATTR:
      if k not in self.__dict__:
        raise KeyError('The key `{}` does not exist. '
                       'To extend the existing keys, use '
                       '`override` with `is_strict` = False.'.format(k))
      if self._locked:
        raise ValueError('The ParamsDict has been locked. '
                         'No change is allowed.')
    self._set(k, v)

  def __getattr__(self, k):
    """Gets the value of the existing key.

    Args:
      k: the key string.

    Returns:
      the value of the key.

    Raises:
      AttributeError: if k is not defined in the ParamsDict.
    """
    if k not in self.__dict__:
      raise AttributeError('The key `{}` does not exist. '.format(k))
    return self.__dict__[k]

  def __contains__(self, key):
    """Implements the membership test operator."""
    return key in self.__dict__

  def get(self, key, value=None):
    """Accesses through built-in dictionary get method."""
    return self.__dict__.get(key, value)

  def override(self, override_params, is_strict=True):
    """Override the ParamsDict with a set of given params.

    Args:
      override_params: a dict or a ParamsDict specifying the parameters to
        be overridden.
      is_strict: a boolean specifying whether override is strict or not. If
        True, keys in `override_params` must be present in the ParamsDict.
        If False, keys in `override_params` can be different from what is
        currently defined in the ParamsDict. In this case, the ParamsDict will
        be extended to include the new keys.

    Raises:
      ValueError: if the ParamsDict has been locked.
      KeyError: if a reserved key is overridden, or an unknown key is given
        while `is_strict` is True.
    """
    if self._locked:
      raise ValueError('The ParamsDict has been locked. No change is allowed.')
    if isinstance(override_params, ParamsDict):
      override_params = override_params.as_dict()
    self._override(override_params, is_strict)  # pylint: disable=protected-access

  def _override(self, override_dict, is_strict=True):
    """The implementation of `override`."""
    for k, v in override_dict.items():
      if k in ParamsDict.RESERVED_ATTR:
        raise KeyError('The key `{}` is internally reserved. '
                       'Can not be overridden.'.format(k))
      if k not in self.__dict__:
        if is_strict:
          raise KeyError('The key `{}` does not exist. '
                         'To extend the existing keys, use '
                         '`override` with `is_strict` = False.'.format(k))
        self._set(k, v)
        continue
      current = self.__dict__[k]
      is_nested_value = isinstance(v, (dict, ParamsDict))
      if is_nested_value and not isinstance(current, ParamsDict):
        # The stored value (e.g. a None placeholder) has no `_override`, so
        # there is nothing to merge into; install the new nested value.
        self._set(k, v)
      elif isinstance(v, dict):
        current._override(v, is_strict)  # pylint: disable=protected-access
      elif isinstance(v, ParamsDict):
        current._override(v.as_dict(), is_strict)  # pylint: disable=protected-access
      else:
        self.__dict__[k] = copy.deepcopy(v)

  def lock(self):
    """Makes the ParamsDict immutable."""
    self._locked = True

  def as_dict(self):
    """Returns a dict representation of ParamsDict.

    For the nested ParamsDict, a nested dict will be returned. Reserved
    attributes are left out and other values are deep-copied.
    """
    params_dict = {}
    for k, v in self.__dict__.items():
      if k not in ParamsDict.RESERVED_ATTR:
        if isinstance(v, ParamsDict):
          params_dict[k] = v.as_dict()
        else:
          params_dict[k] = copy.deepcopy(v)
    return params_dict

  def validate(self):
    """Validate the parameters consistency based on the restrictions.

    This method validates the internal consistency using the pre-defined list of
    restrictions. A restriction is defined as a string which specifies a binary
    operation. The supported binary operations are {'==', '!=', '<', '<=', '>',
    '>='}. Note that the meaning of these operators are consistent with the
    underlying Python implementation. Users should make sure the restrictions
    they define make sense for the types of their values.

    For example, for a ParamsDict like the following
    ```
    a:
      a1: 1
      a2: 2
    b:
      bb:
        bb1: 10
        bb2: 20
      ccc:
        a1: 1
        a3: 3
    ```
    one can define two restrictions like this
    ['a.a1 == b.ccc.a1', 'a.a2 <= b.bb.bb2']

    What it enforces are:
     - a.a1 = 1 == b.ccc.a1 = 1
     - a.a2 = 2 <= b.bb.bb2 = 20

    Raises:
      KeyError: if any of the following happens
        (1) any of parameters in any of restrictions is not defined in
            ParamsDict,
        (2) any inconsistency violating the restriction is found.
      ValueError: if the restriction defined in the string is not supported.
    """

    def _get_value(dotted_string, params_dict):
      """Looks up a dotted key path such as `a.b.c` in a nested dict."""
      v = params_dict
      for t in dotted_string.split('.'):
        v = v[t]
      return v

    def _get_values(tokens, params_dict):
      """Resolves the two operands of a binary restriction."""
      if len(tokens) != 2:
        raise ValueError('Only support binary relation in restriction.')
      left, right = [t.strip() for t in tokens]
      return _get_value(left, params_dict), _get_value(right, params_dict)

    # Each entry pairs an operator with the predicate that detects a
    # *violation* of it. The two-character operators '<=' and '>=' must be
    # tried before '<' and '>', which are substrings of them; otherwise the
    # restriction would be split on the wrong symbol.
    relations = (
        ('==', lambda left, right: left != right),
        ('!=', lambda left, right: left == right),
        ('<=', lambda left, right: left > right),
        ('>=', lambda left, right: left < right),
        ('<', lambda left, right: left >= right),
        ('>', lambda left, right: left <= right),
    )

    params_dict = self.as_dict()
    for restriction in self._restrictions:
      for symbol, is_violated in relations:
        if symbol not in restriction:
          continue
        tokens = restriction.split(symbol)
        left_v, right_v = _get_values(tokens, params_dict)
        if is_violated(left_v, right_v):
          raise KeyError('Found inconsistncy between key `{}` and key `{}`.'
                         .format(tokens[0], tokens[1]))
        break
      else:
        raise ValueError('Unsupported relation in restriction.')
def read_yaml_to_params_dict(file_path):
  """Reads a YAML file to a ParamsDict.

  Args:
    file_path: path of the YAML file, opened with tf.io.gfile.

  Returns:
    A ParamsDict built from the content of the file.
  """
  # PyYAML >= 6 requires an explicit Loader. FullLoader (PyYAML >= 5.1) is the
  # loader a bare `yaml.load(f)` used on 5.x; older releases only ship
  # `yaml.Loader`, their former default.
  # NOTE(review): neither loader is meant for untrusted files; consider
  # `yaml.safe_load` if parameter files can come from untrusted sources.
  loader = getattr(yaml, 'FullLoader', yaml.Loader)
  with tf.io.gfile.GFile(file_path, 'r') as f:
    params_dict = yaml.load(f, Loader=loader)
    return ParamsDict(params_dict)
def save_params_dict_to_yaml(params, file_path):
  """Saves the input ParamsDict to a YAML file.

  Lists are written in YAML flow style (`[a, b, c]`); everything else uses
  block style. The list representer is registered globally on `yaml`.

  Args:
    params: the ParamsDict to save; its `as_dict()` output is dumped.
    file_path: path of the output file, opened with tf.io.gfile.
  """
  with tf.io.gfile.GFile(file_path, 'w') as yaml_file:

    def _represent_list_inline(dumper, data):
      # u'tag:yaml.org,2002:seq' is the YAML internal tag for sequence.
      return dumper.represent_sequence(
          u'tag:yaml.org,2002:seq', data, flow_style=True)

    yaml.add_representer(list, _represent_list_inline)
    serializable = params.as_dict()
    yaml.dump(serializable, yaml_file, default_flow_style=False)
def nested_csv_str_to_json_str(csv_str):
  """Turns a comma-separated `k=v` string with dotted keys into a JSON string.

  Keys may be nested by separating levels with '.'; every distinct first
  level becomes a nested JSON object that is built recursively.

  Whitespace around commas and '=' is accepted ("a=1,b=2", "a = 1, b = 2" and
  "a=1, b=2" are equivalent), but leading whitespace before the first key or
  trailing whitespace after the last value (" a=1,b=2", "a=1,b=2 ") is not
  supported.

  Only values expressible in CSV are supported, so nested lists such as
  "a=[[1,2,3],[4,5,6]]" are rejected. Quoted strings such as "a='hello'" are
  supported.

  Example:
    "a=1, b=2, c.a=2, c.b=3, d.a.a=5"
  becomes
    "{ a: 1, b : 2, c: {a : 2, b : 3}, d: {a: {a : 5}}}"

  Args:
    csv_str: the comma separated string.

  Returns:
    The converted JSON string, or '' when `csv_str` is empty.

  Raises:
    ValueError: if `csv_str` is not a comma separated string or is formatted
      incorrectly.
  """
  if not csv_str:
    return ''

  entries = []
  children_by_group = collections.defaultdict(list)
  cursor = 0
  while cursor < len(csv_str):
    match = _PARAM_RE.match(csv_str, cursor)
    if not match:
      raise ValueError('Malformed hyperparameter value while parsing '
                       'CSV string: %s' % csv_str[cursor:])
    cursor = match.end()

    # Pull out the key and its raw value.
    name = match.group('name')
    raw_value = match.group('val')

    # If a GCS path (e.g. gs://...) is provided, wrap this in quotes
    # as yaml.load would otherwise throw an exception.
    # NOTE(review): `[gs://]` is a character class, so any unquoted value
    # starting with 'g', 's', ':' or '/' is quoted, not just "gs://" paths.
    if re.match(r'(?=[^\"\'])(?=[gs://])', raw_value):
      raw_value = '\'{}\''.format(raw_value)

    group, dot, remainder = name.partition('.')
    if dot:
      # Defer nested keys; they are converted per group further down.
      children_by_group[group].append(remainder + '=' + raw_value)
    else:
      entries.append('%s : %s' % (name, raw_value))

  for group, members in children_by_group.items():
    nested_json = nested_csv_str_to_json_str(','.join(members))
    entries.append('%s : %s' % (group, nested_json))
  return '{' + ', '.join(entries) + '}'
def override_params_dict(params, dict_or_string_or_yaml_file, is_strict):
  """Overrides a ParamsDict using a dict, JSON/YAML/CSV string or YAML file.

  The logic of the function is outlined below:
  1. Test whether the input is a dict. If so, use it to override. If not,
     proceed to 2.
  2. Test whether the input is a string. If not, raise ValueError.
  2.1. Try converting the string from the nested CSV format to JSON. If that
       fails, keep the string unchanged.
  2.2. Load the (possibly converted) string as YAML/JSON. If the result is a
       dict, use it to override. If not, proceed to 2.3.
  2.3. Use the string as a file path and load that YAML file.

  Args:
    params: a ParamsDict object to be overridden.
    dict_or_string_or_yaml_file: a Python dict, JSON/YAML/CSV string or
      path to a YAML file specifying the parameters to be overridden. A falsy
      value (None, '', {}) leaves `params` untouched.
    is_strict: a boolean specifying whether override is strict or not.

  Returns:
    params: the overridden ParamsDict object.

  Raises:
    ValueError: if the input is neither a dict nor a string.
  """
  if not dict_or_string_or_yaml_file:
    return params
  if isinstance(dict_or_string_or_yaml_file, dict):
    params.override(dict_or_string_or_yaml_file, is_strict)
  elif isinstance(dict_or_string_or_yaml_file, six.string_types):
    try:
      dict_or_string_or_yaml_file = (
          nested_csv_str_to_json_str(dict_or_string_or_yaml_file))
    except ValueError:
      # Not a CSV string: deliberately fall through and treat the input as a
      # YAML/JSON string or a file path.
      pass
    # `yaml.load` without an explicit Loader is deprecated since PyYAML 5.1
    # and raises TypeError since PyYAML 6.0. `safe_load` is available in every
    # release and only builds plain Python data.
    params_dict = yaml.safe_load(dict_or_string_or_yaml_file)
    if isinstance(params_dict, dict):
      params.override(params_dict, is_strict)
    else:
      # The string did not parse to a mapping, so interpret it as a path.
      with tf.io.gfile.GFile(dict_or_string_or_yaml_file) as f:
        params.override(yaml.safe_load(f), is_strict)
  else:
    raise ValueError('Unknown input type to parse.')
  return params
@@ -0,0 +1,322 @@
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for official.modeling.hyperparams.params_dict.py."""
import os
import tensorflow as tf
import yaml
from official.modeling.hyperparams import params_dict
class ParamsDictTest(tf.test.TestCase):
  """Tests construction, attribute access, override and validation."""

  def test_init_from_an_empty_dict(self):
    """An empty ParamsDict rejects reads and writes of unknown keys."""
    empty = params_dict.ParamsDict()
    with self.assertRaises(AttributeError):
      _ = empty.a
    with self.assertRaises(KeyError):
      empty.a = 'aa'

  def test_init_from_a_dict(self):
    """Keys of the initial dict are exposed as attributes."""
    initial = {'a': 'aa', 'b': 2}
    params = params_dict.ParamsDict(initial)
    self.assertEqual(params.a, 'aa')
    self.assertEqual(params.b, 2)

  def test_init_from_a_param_dict(self):
    """A ParamsDict can be built from another ParamsDict."""
    source = params_dict.ParamsDict({'a': 'aa', 'b': 2})
    copied = params_dict.ParamsDict(source)
    self.assertEqual(copied.a, 'aa')
    self.assertEqual(copied.b, 2)

  def test_lock(self):
    """After lock(), attribute assignment and override raise ValueError."""
    locked = params_dict.ParamsDict({'a': 1, 'b': 2})
    locked.lock()
    with self.assertRaises(ValueError):
      locked.a = 10
    with self.assertRaises(ValueError):
      locked.override({'b': 20})

  def test_setattr(self):
    """An existing key can be reassigned through attribute syntax."""
    params = params_dict.ParamsDict()
    overrides = {'a': 'aa', 'b': 2, 'c': None}
    params.override(overrides, is_strict=False)
    params.c = 'ccc'
    self.assertEqual(params.a, 'aa')
    self.assertEqual(params.b, 2)
    self.assertEqual(params.c, 'ccc')

  def test_getattr(self):
    """Overridden keys, including None values, are readable as attributes."""
    params = params_dict.ParamsDict()
    overrides = {'a': 'aa', 'b': 2, 'c': None}
    params.override(overrides, is_strict=False)
    self.assertEqual(params.a, 'aa')
    self.assertEqual(params.b, 2)
    self.assertEqual(params.c, None)

  def test_contains(self):
    """The `in` operator reports which keys are present."""
    params = params_dict.ParamsDict()
    params.override({'a': 'aa'}, is_strict=False)
    self.assertIn('a', params)
    self.assertNotIn('b', params)

  def test_get(self):
    """get() returns the value, the given default, or None."""
    params = params_dict.ParamsDict()
    params.override({'a': 'aa'}, is_strict=False)
    self.assertEqual(params.get('a'), 'aa')
    self.assertEqual(params.get('b', 2), 2)
    self.assertEqual(params.get('b'), None)

  def test_override_is_strict_true(self):
    """Strict override updates known keys and raises KeyError on new ones."""
    initial = {'a': 'aa', 'b': 2, 'c': {'c1': 'cc', 'c2': 20}}
    params = params_dict.ParamsDict(initial)
    params.override({'a': 2, 'c': {'c1': 'ccc'}}, is_strict=True)
    self.assertEqual(params.a, 2)
    self.assertEqual(params.c.c1, 'ccc')
    with self.assertRaises(KeyError):
      params.override({'d': 'ddd'}, is_strict=True)
    with self.assertRaises(KeyError):
      params.override({'c': {'c3': 30}}, is_strict=True)

  def test_override_is_strict_false(self):
    """Non-strict override may add top-level and nested keys."""
    initial = {'a': 'aa', 'b': 2, 'c': {'c1': 10, 'c2': 20}}
    params = params_dict.ParamsDict(initial)
    params.override({'a': 2, 'c': {'c3': 3000}}, is_strict=False)
    self.assertEqual(params.a, 2)
    self.assertEqual(params.c.c3, 3000)

    params.override({'d': 'ddd'}, is_strict=False)
    self.assertEqual(params.d, 'ddd')

    params.override({'c': {'c4': 4444}}, is_strict=False)
    self.assertEqual(params.c.c4, 4444)

  def test_as_dict(self):
    """as_dict() returns nested plain-dict access to the stored values."""
    initial = {'a': 'aa', 'b': 2, 'c': {'c1': 10, 'c2': 20}}
    as_dict = params_dict.ParamsDict(initial).as_dict()
    self.assertEqual(as_dict['a'], 'aa')
    self.assertEqual(as_dict['b'], 2)
    self.assertEqual(as_dict['c']['c1'], 10)
    self.assertEqual(as_dict['c']['c2'], 20)

  def test_validate(self):
    """Restrictions are checked at construction and by validate()."""
    # Raise error due to the unknown parameter.
    with self.assertRaises(KeyError):
      params_dict.ParamsDict({'a': 1, 'b': {'a': 11}}, ['a == c'])

    # OK to check equality of two nested dicts.
    params_dict.ParamsDict(
        {'a': 1, 'b': {'a': 10}, 'c': {'a': 10}}, ['b == c'])

    # Raise error due to inconsistency.
    with self.assertRaises(KeyError):
      params_dict.ParamsDict({'a': 1, 'c': {'a': 10}}, ['a == c.a'])

    # Valid rule.
    consistent = params_dict.ParamsDict(
        {'a': 1, 'c': {'a': 1}}, ['a == c.a'])

    # Overriding violates the existing rule, raise error upon validate.
    consistent.override({'a': 11})
    with self.assertRaises(KeyError):
      consistent.validate()
class ParamsDictIOTest(tf.test.TestCase):
  """Tests YAML save/load and the different override input formats."""

  def write_temp_file(self, filename, text):
    """Writes `text` to `filename` in the test temp dir; returns its path."""
    temp_file = os.path.join(self.get_temp_dir(), filename)
    with tf.io.gfile.GFile(temp_file, 'w') as writer:
      writer.write(text)
    return temp_file

  def test_save_params_dict_to_yaml(self):
    """A saved ParamsDict can be read back with the same values."""
    params = params_dict.ParamsDict(
        {'a': 'aa', 'b': 2, 'c': {'c1': 10, 'c2': 20}})
    output_yaml_file = os.path.join(self.get_temp_dir(), 'params.yaml')
    params_dict.save_params_dict_to_yaml(params, output_yaml_file)

    with tf.io.gfile.GFile(output_yaml_file, 'r') as f:
      # `yaml.load` needs an explicit Loader on PyYAML >= 6; the saved data
      # is plain, so `safe_load` is sufficient.
      params_d = yaml.safe_load(f)
      self.assertEqual(params.a, params_d['a'])
      self.assertEqual(params.b, params_d['b'])
      self.assertEqual(params.c.c1, params_d['c']['c1'])
      self.assertEqual(params.c.c2, params_d['c']['c2'])

  def test_read_yaml_to_params_dict(self):
    """A nested YAML file is loaded into nested attributes."""
    # `c1` and `c2` must be indented deeper than `c` to be its children.
    input_yaml_file = self.write_temp_file(
        'params.yaml', r"""
      a: 'aa'
      b: 2
      c:
        c1: 10
        c2: 20
    """)
    params = params_dict.read_yaml_to_params_dict(input_yaml_file)

    self.assertEqual(params.a, 'aa')
    self.assertEqual(params.b, 2)
    self.assertEqual(params.c.c1, 10)
    self.assertEqual(params.c.c2, 20)

  def test_override_params_dict_using_dict(self):
    """A Python dict overrides only the keys it names."""
    params = params_dict.ParamsDict({
        'a': 1, 'b': 2.5, 'c': [3, 4], 'd': 'hello', 'e': False})
    override_dict = {'b': 5.2, 'c': [30, 40]}
    params = params_dict.override_params_dict(
        params, override_dict, is_strict=True)
    self.assertEqual(1, params.a)
    self.assertEqual(5.2, params.b)
    self.assertEqual([30, 40], params.c)
    self.assertEqual('hello', params.d)
    self.assertEqual(False, params.e)

  def test_override_params_dict_using_yaml_string(self):
    """A YAML string overrides only the keys it names."""
    params = params_dict.ParamsDict({
        'a': 1, 'b': 2.5, 'c': [3, 4], 'd': 'hello', 'e': False})
    override_yaml_string = "'b': 5.2\n'c': [30, 40]"
    params = params_dict.override_params_dict(
        params, override_yaml_string, is_strict=True)
    self.assertEqual(1, params.a)
    self.assertEqual(5.2, params.b)
    self.assertEqual([30, 40], params.c)
    self.assertEqual('hello', params.d)
    self.assertEqual(False, params.e)

  def test_override_params_dict_using_json_string(self):
    """A JSON-like string overrides nested keys."""
    params = params_dict.ParamsDict({
        'a': 1, 'b': {'b1': 2, 'b2': [2, 3],},
        'd': {'d1': {'d2': 'hello'}}, 'e': False})
    override_json_string = "{ b: { b2: [3, 4] }, d: { d1: { d2: 'hi' } } }"
    params = params_dict.override_params_dict(
        params, override_json_string, is_strict=True)
    self.assertEqual(1, params.a)
    self.assertEqual(2, params.b.b1)
    self.assertEqual([3, 4], params.b.b2)
    self.assertEqual('hi', params.d.d1.d2)
    self.assertEqual(False, params.e)

  def test_override_params_dict_using_csv_string(self):
    """A nested CSV string overrides nested keys, including a GCS path."""
    params = params_dict.ParamsDict({
        'a': 1, 'b': {'b1': 2, 'b2': [2, 3],},
        'd': {'d1': {'d2': 'hello'}}, 'e': False})
    override_csv_string = "b.b2=[3,4], d.d1.d2='hi, world', e=gs://test"
    params = params_dict.override_params_dict(
        params, override_csv_string, is_strict=True)
    self.assertEqual(1, params.a)
    self.assertEqual(2, params.b.b1)
    self.assertEqual([3, 4], params.b.b2)
    self.assertEqual('hi, world', params.d.d1.d2)
    self.assertEqual('gs://test', params.e)

  def test_override_params_dict_using_yaml_file(self):
    """A path to a YAML file overrides only the keys the file names."""
    params = params_dict.ParamsDict({
        'a': 1, 'b': 2.5, 'c': [3, 4], 'd': 'hello', 'e': False})
    override_yaml_file = self.write_temp_file(
        'params.yaml', r"""
      b: 5.2
      c: [30, 40]
    """)
    params = params_dict.override_params_dict(
        params, override_yaml_file, is_strict=True)
    self.assertEqual(1, params.a)
    self.assertEqual(5.2, params.b)
    self.assertEqual([30, 40], params.c)
    self.assertEqual('hello', params.d)
    self.assertEqual(False, params.e)
class IOTest(tf.test.TestCase):
  """Tests for `params_dict.nested_csv_str_to_json_str`.

  Converted strings are parsed with `yaml.safe_load`: `yaml.load` without an
  explicit Loader raises TypeError on PyYAML >= 6, and the converted strings
  only contain plain data.
  """

  def test_basic_csv_str_to_json_str(self):
    """Flat keys are converted to a flat JSON object string."""
    csv_str = 'a=1,b=2,c=3'
    json_str = '{a : 1, b : 2, c : 3}'
    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
    self.assertEqual(converted_csv_str, json_str)

  def test_basic_csv_str_load(self):
    """The converted flat string parses to the expected dict."""
    csv_str = 'a=1,b=2,c=3'
    expected_output = {'a': 1, 'b': 2, 'c': 3}
    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
    converted_dict = yaml.safe_load(converted_csv_str)
    self.assertDictEqual(converted_dict, expected_output)

  def test_basic_nested_csv_str_to_json_str(self):
    """A dotted key is converted to a nested JSON object string."""
    csv_str = 'a=1,b.b1=2'
    json_str = '{a : 1, b : {b1 : 2}}'
    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
    self.assertEqual(converted_csv_str, json_str)

  def test_basic_nested_csv_str_load(self):
    """The converted nested string parses to the expected nested dict."""
    csv_str = 'a=1,b.b1=2,c.c1=3'
    expected_output = {'a': 1, 'b': {'b1': 2}, 'c': {'c1': 3}}
    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
    converted_dict = yaml.safe_load(converted_csv_str)
    self.assertDictEqual(converted_dict, expected_output)

  def test_complex_nested_csv_str_to_json_str(self):
    """Several nesting levels in one key are converted recursively."""
    csv_str = 'a.aa.aaa.aaaaa.a=1'
    json_str = '{a : {aa : {aaa : {aaaaa : {a : 1}}}}}'
    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
    self.assertEqual(converted_csv_str, json_str)

  def test_complex_nested_csv_str_load(self):
    """Keys sharing a first level are merged into the same nested dict."""
    csv_str = 'a.aa.aaa.aaaaa.a=1,a.a=2'
    expected_output = {'a': {'aa': {'aaa': {'aaaaa': {'a': 1}}}, 'a': 2}}
    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
    converted_dict = yaml.safe_load(converted_csv_str)
    self.assertDictEqual(converted_dict, expected_output)

  def test_csv_str_load_supported_datatypes(self):
    """Ints, floats, flat lists and quoted strings survive the conversion."""
    csv_str = 'a=1,b=2.,c=[1,2,3],d=\'hello, there\',e=\"Hi.\"'
    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
    converted_dict = yaml.safe_load(converted_csv_str)
    self.assertEqual(converted_dict['a'], 1)
    self.assertEqual(converted_dict['b'], 2.)
    self.assertEqual(converted_dict['c'], [1, 2, 3])
    self.assertEqual(converted_dict['d'], 'hello, there')
    self.assertEqual(converted_dict['e'], 'Hi.')

  def test_csv_str_load_unsupported_datatypes(self):
    """Nested lists are rejected with ValueError."""
    csv_str = 'a=[[1,2,3],[4,5,6]]'
    self.assertRaises(ValueError,
                      params_dict.nested_csv_str_to_json_str,
                      csv_str)

  def test_csv_str_to_json_str_spacing(self):
    """Whitespace around ',' and '=' does not change the result."""
    csv_str1 = 'a=1,b=2,c=3'
    csv_str2 = 'a = 1, b = 2, c = 3'
    json_str = '{a : 1, b : 2, c : 3}'
    converted_csv_str1 = params_dict.nested_csv_str_to_json_str(csv_str1)
    converted_csv_str2 = params_dict.nested_csv_str_to_json_str(csv_str2)
    self.assertEqual(converted_csv_str1, converted_csv_str2)
    self.assertEqual(converted_csv_str1, json_str)
    self.assertEqual(converted_csv_str2, json_str)

  def test_gcs_added_quotes(self):
    """Unquoted gs:// values are wrapped in single quotes."""
    csv_str = 'a=gs://abc, b=gs://def'
    expected_output = '{a : \'gs://abc\', b : \'gs://def\'}'
    converted_csv_str = params_dict.nested_csv_str_to_json_str(csv_str)
    self.assertEqual(converted_csv_str, expected_output)
# Run the test cases of this module via tf.test.main() when the file is
# executed as a script.
if __name__ == '__main__':
tf.test.main()
@@ -0,0 +1,491 @@
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Lightweight utilities to train NLP models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import os
import tempfile
from absl import logging
import tensorflow as tf
from official.staging.training import grad_utils
from official.utils.misc import distribution_utils
# File name, inside the summary directory, of the JSON training summary
# written by `write_txt_summary`.
_SUMMARY_TXT = 'training_summary.txt'
# `run_customized_training_loop` only creates a training summary writer when
# `steps_per_loop` is at least this value.
_MIN_SUMMARY_STEPS = 10
def _should_export_checkpoint(strategy):
return (not strategy) or strategy.extended.should_checkpoint
def _should_export_summary(strategy):
return (not strategy) or strategy.extended.should_save_summary
def _save_checkpoint(strategy, checkpoint, model_dir, checkpoint_prefix):
  """Saves the model using the provided checkpoint prefix.

  Args:
    strategy: distribution strategy, passed to `_should_export_checkpoint` to
      decide whether the checkpoint is kept.
    checkpoint: object exposing `save(path)`.
    model_dir: directory that receives the checkpoint when it is kept.
    checkpoint_prefix: file name prefix of the checkpoint inside `model_dir`.
  """
  if _should_export_checkpoint(strategy):
    checkpoint_path = os.path.join(model_dir, checkpoint_prefix)
    saved_path = checkpoint.save(checkpoint_path)
    logging.info('Saving model as TF checkpoint: %s', saved_path)
    return

  # In multi worker training we need every worker to save checkpoint, because
  # variables can trigger synchronization on read and synchronization needs
  # all workers to participate. To avoid workers overriding each other we save
  # to a temporary directory on non-chief workers.
  tmp_dir = tempfile.mkdtemp()
  try:
    checkpoint.save(os.path.join(tmp_dir, 'ckpt'))
  finally:
    # Always remove the throw-away directory, even when saving fails, so a
    # failed save does not leave it behind.
    tf.io.gfile.rmtree(tmp_dir)
def _get_input_iterator(input_fn, strategy):
"""Returns distributed dataset iterator."""
# When training with TPU pods, datasets needs to be cloned across
# workers. Since Dataset instance cannot be cloned in eager mode, we instead
# pass callable that returns a dataset.
if not callable(input_fn):
raise ValueError('`input_fn` should be a closure that returns a dataset.')
iterator = iter(
strategy.experimental_distribute_datasets_from_function(input_fn))
return iterator
def _float_metric_value(metric):
"""Gets the value of a float-value keras metric."""
return metric.result().numpy().astype(float)
def steps_to_run(current_step, steps_per_epoch, steps_per_loop):
  """Computes how many steps the next on-device loop should execute.

  Args:
    current_step: number of steps already completed.
    steps_per_epoch: number of steps in one epoch.
    steps_per_loop: requested number of steps per loop.

  Returns:
    `steps_per_loop`, reduced when needed so that a loop started in the
    middle of an epoch stops at the end of that epoch.

  Raises:
    ValueError: if `steps_per_loop` is not positive.
  """
  if steps_per_loop <= 0:
    raise ValueError('steps_per_loop should be positive integer.')
  if steps_per_loop == 1:
    return steps_per_loop

  steps_into_epoch = current_step % steps_per_epoch
  if steps_into_epoch == 0:
    # At an epoch boundary a full loop can be run.
    return steps_per_loop
  steps_left_in_epoch = steps_per_epoch - steps_into_epoch
  return min(steps_left_in_epoch, steps_per_loop)
def write_txt_summary(training_summary, summary_dir):
  """Logs the training summary and writes it as JSON into `summary_dir`.

  Args:
    training_summary: JSON-serializable object holding the training stats.
    summary_dir: directory in which the `_SUMMARY_TXT` file is written.
  """
  output_path = os.path.join(summary_dir, _SUMMARY_TXT)
  with tf.io.gfile.GFile(output_path, 'wb') as summary_file:
    logging.info('Training Summary: \n%s', str(training_summary))
    serialized = json.dumps(training_summary, indent=4)
    summary_file.write(serialized)
def run_customized_training_loop(
    # pylint: disable=invalid-name
    _sentinel=None,
    # pylint: enable=invalid-name
    strategy=None,
    model_fn=None,
    loss_fn=None,
    scale_loss=True,
    model_dir=None,
    train_input_fn=None,
    steps_per_epoch=None,
    steps_per_loop=1,
    epochs=1,
    eval_input_fn=None,
    eval_steps=None,
    metric_fn=None,
    init_checkpoint=None,
    custom_callbacks=None,
    run_eagerly=False,
    sub_model_export_name=None,
    explicit_allreduce=False,
    pre_allreduce_callbacks=None,
    post_allreduce_callbacks=None):
  """Runs BERT pretrain model training using the low-level API.

  Args:
    _sentinel: Used to prevent positional parameters. Internal, do not use.
    strategy: Distribution strategy on which to run low level training loop.
    model_fn: Function that returns a tuple (model, sub_model). Caller of this
      function should add optimizer to the `model` via calling
      `model.compile()` API or manually setting `model.optimizer` attribute.
      Second element of the returned tuple (sub_model) is an optional sub
      model to be used for initial checkpoint -- if provided.
    loss_fn: Function with signature func(labels, logits) and returns a loss
      tensor.
    scale_loss: Whether to divide the raw loss by number of replicas before
      gradients calculation.
    model_dir: Model directory used during training for restoring/saving model
      weights.
    train_input_fn: Function that returns a tf.data.Dataset used for training.
    steps_per_epoch: Number of steps to run per epoch. At the end of each
      epoch, model checkpoint will be saved and evaluation will be conducted
      if evaluation dataset is provided.
    steps_per_loop: Number of steps per graph-mode loop. In order to reduce
      communication in eager context, training logs are printed every
      steps_per_loop.
    epochs: Number of epochs to train.
    eval_input_fn: Function that returns evaluation dataset. If none,
      evaluation is skipped.
    eval_steps: Number of steps to run evaluation. Required if `eval_input_fn`
      is not none.
    metric_fn: A metrics function that returns a Keras Metric object to record
      evaluation result using evaluation dataset or with training dataset
      after every epoch.
    init_checkpoint: Optional checkpoint to load to `sub_model` returned by
      `model_fn`.
    custom_callbacks: A list of Keras Callbacks objects to run during
      training. More specifically, `on_batch_begin()`, `on_batch_end()`,
      methods are invoked during training.
    run_eagerly: Whether to run model training in pure eager execution. This
      should be disabled for TPUStrategy.
    sub_model_export_name: If not None, will export `sub_model` returned by
      `model_fn` into checkpoint files. The name of intermediate checkpoint
      file is {sub_model_export_name}_step_{step}.ckpt and the last
      checkpoint's name is {sub_model_export_name}.ckpt;
      if None, `sub_model` will not be exported as checkpoint.
    explicit_allreduce: Whether to explicitly perform gradient allreduce,
      instead of relying on implicit allreduce in optimizer.apply_gradients().
      Default is False. For now, if training using FP16 mixed precision,
      explicit allreduce will aggregate gradients in FP16 format. For TPU and
      GPU training using FP32, explicit allreduce will aggregate gradients in
      FP32 format.
    pre_allreduce_callbacks: A list of callback functions that takes gradients
      and model variables pairs as input, manipulate them, and returns a new
      gradients and model variables pairs. The callback functions will be
      invoked in the list order and before gradients are allreduced.
      With mixed precision training, the pre_allreduce_callbacks will be
      applied on scaled_gradients. Default is no callbacks.
      Only used when explicit_allreduce=True.
    post_allreduce_callbacks: A list of callback functions that takes
      gradients and model variables pairs as input, manipulate them, and
      returns a new gradients and model variables pairs. The callback
      functions will be invoked in the list order and right before gradients
      are applied to variables for updates. Default is no callbacks. Only used
      when explicit_allreduce=True.

  Returns:
    Trained model.

  Raises:
    ValueError: (1) When model returned by `model_fn` does not have optimizer
      attribute or when required parameters are set to none. (2) eval args are
      not specified correctly. (3) metric_fn must be a callable if specified.
      (4) sub_model_export_name is specified, but `sub_model` returned
      by `model_fn` is None.
  """
  if _sentinel is not None:
    raise ValueError('only call `run_customized_training_loop()` '
                     'with named arguments.')

  required_arguments = [
      strategy, model_fn, loss_fn, model_dir, steps_per_epoch, train_input_fn
  ]
  if any(arg is None for arg in required_arguments):
    # The message lists exactly the arguments checked above.
    raise ValueError('`strategy`, `model_fn`, `loss_fn`, `model_dir`, '
                     '`steps_per_epoch` and `train_input_fn` are required '
                     'parameters.')
  if steps_per_loop > steps_per_epoch:
    logging.error(
        'steps_per_loop: %d is specified to be greater than '
        ' steps_per_epoch: %d, we will use steps_per_epoch as'
        ' steps_per_loop.', steps_per_loop, steps_per_epoch)
    steps_per_loop = steps_per_epoch
  assert tf.executing_eagerly()

  if run_eagerly:
    if isinstance(strategy, tf.distribute.experimental.TPUStrategy):
      raise ValueError(
          'TPUStrategy should not run eagerly as it heavily relies on graph'
          ' optimization for the distributed system.')

  if eval_input_fn and (eval_steps is None or metric_fn is None):
    raise ValueError(
        '`eval_step` and `metric_fn` are required when `eval_input_fn ` '
        'is not none.')
  if metric_fn and not callable(metric_fn):
    raise ValueError(
        'if `metric_fn` is specified, metric_fn must be a callable.')

  total_training_steps = steps_per_epoch * epochs
  train_iterator = _get_input_iterator(train_input_fn, strategy)

  with distribution_utils.get_strategy_scope(strategy):
    # To correctly place the model weights on accelerators,
    # model and optimizer should be created in scope.
    model, sub_model = model_fn()
    if not hasattr(model, 'optimizer'):
      raise ValueError('User should set optimizer attribute to model '
                       'inside `model_fn`.')
    if sub_model_export_name and sub_model is None:
      raise ValueError('sub_model_export_name is specified as %s, but '
                       'sub_model is None.' % sub_model_export_name)

    optimizer = model.optimizer

    if init_checkpoint:
      logging.info(
          'Checkpoint file %s found and restoring from '
          'initial checkpoint for core model.', init_checkpoint)
      checkpoint = tf.train.Checkpoint(model=sub_model)
      checkpoint.restore(init_checkpoint).assert_existing_objects_matched()
      logging.info('Loading from checkpoint file completed')

    train_loss_metric = tf.keras.metrics.Mean(
        'training_loss', dtype=tf.float32)
    eval_metrics = [metric_fn()] if metric_fn else []
    # If evaluation is required, make a copy of metric as it will be used by
    # both train and evaluation.
    train_metrics = [
        metric.__class__.from_config(metric.get_config())
        for metric in eval_metrics
    ]

    # Create summary writers
    if _should_export_summary(strategy):
      summary_dir = os.path.join(model_dir, 'summaries')
    else:
      # In multi worker training we need every worker to write summary, because
      # variables can trigger synchronization on read and synchronization needs
      # all workers to participate.
      summary_dir = tempfile.mkdtemp()
    eval_summary_writer = tf.summary.create_file_writer(
        os.path.join(summary_dir, 'eval'))
    if steps_per_loop >= _MIN_SUMMARY_STEPS:
      # Only writes summary when the stats are collected sufficiently over
      # enough steps.
      train_summary_writer = tf.summary.create_file_writer(
          os.path.join(summary_dir, 'train'))
    else:
      train_summary_writer = None

    # Collects training variables.
    training_vars = model.trainable_variables

    def _replicated_step(inputs):
      """Replicated training step."""

      inputs, labels = inputs
      with tf.GradientTape() as tape:
        model_outputs = model(inputs, training=True)
        loss = loss_fn(labels, model_outputs)
        # Raw loss is used for reporting in metrics/logs.
        raw_loss = loss
        if scale_loss:
          # Scales down the loss for gradients to be invariant from replicas.
          loss = loss / strategy.num_replicas_in_sync

      if explicit_allreduce:
        grad_utils.minimize_using_explicit_allreduce(tape, optimizer, loss,
                                                     training_vars,
                                                     pre_allreduce_callbacks,
                                                     post_allreduce_callbacks)
      else:
        if isinstance(optimizer,
                      tf.keras.mixed_precision.experimental.LossScaleOptimizer):
          # Re-enter the tape so the loss scaling is recorded as well.
          with tape:
            scaled_loss = optimizer.get_scaled_loss(loss)
          scaled_grads = tape.gradient(scaled_loss, training_vars)
          grads = optimizer.get_unscaled_gradients(scaled_grads)
        else:
          grads = tape.gradient(loss, training_vars)
        optimizer.apply_gradients(zip(grads, training_vars))
      # For reporting, the metric takes the mean of losses.
      train_loss_metric.update_state(raw_loss)
      for metric in train_metrics:
        metric.update_state(labels, model_outputs)

    @tf.function
    def train_steps(iterator, steps):
      """Performs distributed training steps in a loop.

      Args:
        iterator: the distributed iterator of training datasets.
        steps: an tf.int32 integer tensor to specify number of steps to run
          inside host training loop.

      Raises:
        ValueError: Any of the arguments or tensor shapes are invalid.
      """
      if not isinstance(steps, tf.Tensor):
        raise ValueError('steps should be an Tensor. Python object may cause '
                         'retracing.')

      for _ in tf.range(steps):
        strategy.run(_replicated_step, args=(next(iterator),))

    def train_single_step(iterator):
      """Performs a distributed training step.

      Args:
        iterator: the distributed iterator of training datasets.

      Raises:
        ValueError: Any of the arguments or tensor shapes are invalid.
      """
      strategy.run(_replicated_step, args=(next(iterator),))

    def test_step(iterator):
      """Calculates evaluation metrics on distributed devices."""

      def _test_step_fn(inputs):
        """Replicated accuracy calculation."""

        inputs, labels = inputs
        model_outputs = model(inputs, training=False)
        for metric in eval_metrics:
          metric.update_state(labels, model_outputs)

      strategy.run(_test_step_fn, args=(next(iterator),))

    if not run_eagerly:
      train_single_step = tf.function(train_single_step)
      test_step = tf.function(test_step)

    def _run_evaluation(current_training_step, test_iterator):
      """Runs validation steps and aggregate metrics."""
      for _ in range(eval_steps):
        test_step(test_iterator)

      with eval_summary_writer.as_default():
        for metric in eval_metrics + model.metrics:
          metric_value = _float_metric_value(metric)
          logging.info('Step: [%d] Validation %s = %f', current_training_step,
                       metric.name, metric_value)
          tf.summary.scalar(
              metric.name, metric_value, step=current_training_step)
        eval_summary_writer.flush()

    def _run_callbacks_on_batch_begin(batch):
      """Runs custom callbacks at the start of every step."""
      if not custom_callbacks:
        return
      for callback in custom_callbacks:
        callback.on_batch_begin(batch)

    def _run_callbacks_on_batch_end(batch, logs):
      """Runs custom callbacks at the end of every step."""
      if not custom_callbacks:
        return
      for callback in custom_callbacks:
        callback.on_batch_end(batch, logs)

    # Training loop starts here.
    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
    sub_model_checkpoint = tf.train.Checkpoint(
        model=sub_model) if sub_model_export_name else None

    latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
    if latest_checkpoint_file:
      logging.info(
          'Checkpoint file %s found and restoring from '
          'checkpoint', latest_checkpoint_file)
      checkpoint.restore(latest_checkpoint_file)
      logging.info('Loading from checkpoint file completed')

    # The optimizer's iteration counter is the starting step, so a restored
    # checkpoint resumes from where it stopped.
    current_step = optimizer.iterations.numpy()
    checkpoint_name = 'ctl_step_{step}.ckpt'

    while current_step < total_training_steps:
      # Training loss/metric are taking average over steps inside micro
      # training loop. We reset the their values before each round.
      train_loss_metric.reset_states()
      for metric in train_metrics + model.metrics:
        metric.reset_states()

      _run_callbacks_on_batch_begin(current_step)
      # Runs several steps in the host while loop.
      steps = steps_to_run(current_step, steps_per_epoch, steps_per_loop)

      if tf.test.is_built_with_cuda():
        # TODO(zongweiz): merge with train_steps once tf.while_loop
        # GPU performance bugs are fixed.
        for _ in range(steps):
          train_single_step(train_iterator)
      else:
        # Converts steps to a Tensor to avoid tf.function retracing.
        train_steps(train_iterator,
                    tf.convert_to_tensor(steps, dtype=tf.int32))
      train_loss = _float_metric_value(train_loss_metric)
      current_step += steps
      _run_callbacks_on_batch_end(current_step - 1, {'loss': train_loss})

      # Updates training logging.
      training_status = 'Train Step: %d/%d / loss = %s' % (
          current_step, total_training_steps, train_loss)

      if train_summary_writer:
        with train_summary_writer.as_default():
          tf.summary.scalar(
              train_loss_metric.name, train_loss, step=current_step)
          for metric in train_metrics + model.metrics:
            metric_value = _float_metric_value(metric)
            training_status += ' %s = %f' % (metric.name, metric_value)
            tf.summary.scalar(metric.name, metric_value, step=current_step)
          train_summary_writer.flush()
      logging.info(training_status)

      # Saves model checkpoints and run validation steps at every epoch end.
      if current_step % steps_per_epoch == 0:
        # To avoid repeated model saving, we do not save after the last
        # step of training.
        if current_step < total_training_steps:
          _save_checkpoint(strategy, checkpoint, model_dir,
                           checkpoint_name.format(step=current_step))
          if sub_model_export_name:
            _save_checkpoint(
                strategy, sub_model_checkpoint, model_dir,
                '%s_step_%d.ckpt' % (sub_model_export_name, current_step))
        if eval_input_fn:
          logging.info('Running evaluation after step: %s.', current_step)
          _run_evaluation(current_step,
                          _get_input_iterator(eval_input_fn, strategy))
          # Re-initialize evaluation metric.
          for metric in eval_metrics + model.metrics:
            metric.reset_states()

    _save_checkpoint(strategy, checkpoint, model_dir,
                     checkpoint_name.format(step=current_step))
    if sub_model_export_name:
      _save_checkpoint(strategy, sub_model_checkpoint, model_dir,
                       '%s.ckpt' % sub_model_export_name)

    if eval_input_fn:
      logging.info('Running final evaluation after training is complete.')
      _run_evaluation(current_step,
                      _get_input_iterator(eval_input_fn, strategy))

    training_summary = {
        'total_training_steps': total_training_steps,
        'train_loss': _float_metric_value(train_loss_metric),
    }
    if eval_metrics:
      # TODO(hongkuny): Cleans up summary reporting in text.
      training_summary['last_train_metrics'] = _float_metric_value(
          train_metrics[0])
      training_summary['eval_metrics'] = _float_metric_value(eval_metrics[0])

    write_txt_summary(training_summary, summary_dir)

    # When summaries are not exported, `summary_dir` is the temporary
    # directory created above; remove it.
    if not _should_export_summary(strategy):
      tf.io.gfile.rmtree(summary_dir)

    return model
@@ -0,0 +1,235 @@
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for official.modeling.training.model_training_utils."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl.testing import parameterized
from absl.testing.absltest import mock
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.modeling import model_training_utils
def eager_strategy_combinations():
  """Builds the eager-mode test combinations, TPU strategy included.

  Returns:
    The result of `combinations.combine` over the default, TPU, one-device
    GPU and both mirrored strategies, all in 'eager' mode.
  """
  distributions = [
      strategy_combinations.default_strategy,
      strategy_combinations.tpu_strategy,
      strategy_combinations.one_device_strategy_gpu,
      strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
      strategy_combinations.mirrored_strategy_with_two_gpus,
  ]
  return combinations.combine(distribution=distributions, mode='eager')
def eager_gpu_strategy_combinations():
  """Builds the eager-mode test combinations without the TPU strategy.

  Returns:
    The result of `combinations.combine` over the default, one-device GPU and
    both mirrored strategies, all in 'eager' mode.
  """
  distributions = [
      strategy_combinations.default_strategy,
      strategy_combinations.one_device_strategy_gpu,
      strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
      strategy_combinations.mirrored_strategy_with_two_gpus,
  ]
  return combinations.combine(distribution=distributions, mode='eager')
def create_fake_data_input_fn(batch_size, features_shape, num_classes):
  """Creates a dummy input function with the given feature and label shapes.

  Args:
    batch_size: integer. Batch size handed to
      `input_context.get_per_replica_batch_size` when a context is supplied,
      and used as-is otherwise.
    features_shape: list[int]. Feature shape for an individual example.
    num_classes: integer. Number of labels.

  Returns:
    An input function that is usable in the executor. It accepts an optional
    `input_context`; when the context is omitted the dataset is neither
    sharded nor re-batched per replica.
  """

  def _assign_dtype(features, labels):
    """Casts both features and labels to float32."""
    features = tf.cast(features, tf.float32)
    labels = tf.cast(labels, tf.float32)
    return features, labels

  def _dataset_fn(input_context=None):
    """An input function for generating fake data."""
    # `input_context` defaults to None, so it must not be dereferenced
    # unconditionally: fall back to the caller's batch size when it is absent.
    if input_context is None:
      local_batch_size = batch_size
    else:
      local_batch_size = input_context.get_per_replica_batch_size(batch_size)
    features = np.random.rand(64, *features_shape)
    labels = np.random.randint(2, size=[64, num_classes])

    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    if input_context is not None:
      # Each input pipeline only reads its own shard of the 64 examples.
      dataset = dataset.shard(input_context.num_input_pipelines,
                              input_context.input_pipeline_id)

    # Shuffle, repeat, and batch the examples.
    dataset = dataset.map(_assign_dtype)
    dataset = dataset.shuffle(64).repeat()
    dataset = dataset.batch(local_batch_size, drop_remainder=True)
    dataset = dataset.prefetch(buffer_size=64)
    return dataset

  return _dataset_fn
def create_model_fn(input_shape, num_classes, use_float16=False):
  """Returns a factory that builds a small Keras model plus a sub-model.

  Args:
    input_shape: shape passed to the Keras `Input` layer.
    num_classes: integer. Width of both dense layers.
    use_float16: bool. When True the SGD optimizer is wrapped in a
      `LossScaleOptimizer` with dynamic loss scaling.

  Returns:
    A zero-argument callable returning `(model, sub_model)`.
  """

  def _model_fn():
    """Builds the relu + softmax test model and attaches its optimizer."""
    inputs = tf.keras.layers.Input(shape=input_shape)
    hidden = tf.keras.layers.Dense(num_classes, activation='relu')(inputs)
    probabilities = tf.keras.layers.Dense(
        num_classes, activation='softmax')(hidden)

    # The sub-model stops at the hidden layer; the full model adds softmax.
    sub_model = tf.keras.models.Model(inputs, hidden, name='sub_model')
    model = tf.keras.models.Model(inputs, probabilities, name='model')
    model.add_metric(
        tf.reduce_mean(inputs), name='mean_input', aggregation='mean')

    # The optimizer is attached as a plain attribute on the model.
    model.optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9)
    if use_float16:
      model.optimizer = (
          tf.keras.mixed_precision.experimental.LossScaleOptimizer(
              model.optimizer, loss_scale='dynamic'))
    return model, sub_model

  return _model_fn
def metric_fn():
  """Returns a fresh float32 `CategoricalAccuracy` metric named 'accuracy'."""
  accuracy = tf.keras.metrics.CategoricalAccuracy(
      name='accuracy', dtype=tf.float32)
  return accuracy
def summaries_with_matching_keyword(keyword, summary_dir):
  """Yields summary protos matching given keyword from event file.

  Only the last path returned by globbing `events*` in `summary_dir` is read.
  A summary is yielded once for every value whose tag contains `keyword`, and
  the enclosing event is logged each time.
  """
  pattern = os.path.join(summary_dir, 'events*')
  last_event_file = tf.io.gfile.glob(pattern)[-1]
  for event in tf.compat.v1.train.summary_iterator(last_event_file):
    if event.summary is None:
      continue
    for value in event.summary.value:
      if keyword not in value.tag:
        continue
      tf.compat.v1.logging.error(event)
      yield event.summary
def check_eventfile_for_keyword(keyword, summary_dir):
  """Reports whether any summary read from `summary_dir` matches `keyword`."""
  matching_summaries = summaries_with_matching_keyword(keyword, summary_dir)
  return any(matching_summaries)
class ModelTrainingUtilsTest(tf.test.TestCase, parameterized.TestCase):
  """Runs `run_customized_training_loop` on a tiny model fed with fake data."""

  def setUp(self):
    super(ModelTrainingUtilsTest, self).setUp()
    # Default model factory; individual tests may replace it.
    self._model_fn = create_model_fn(input_shape=[128], num_classes=3)

  def run_training(self, strategy, model_dir, steps_per_loop, run_eagerly):
    """Runs two 20-step epochs of the customized training loop."""
    fake_input_fn = create_fake_data_input_fn(
        batch_size=8, features_shape=[128], num_classes=3)
    loop_kwargs = dict(
        strategy=strategy,
        model_fn=self._model_fn,
        loss_fn=tf.keras.losses.categorical_crossentropy,
        model_dir=model_dir,
        steps_per_epoch=20,
        steps_per_loop=steps_per_loop,
        epochs=2,
        train_input_fn=fake_input_fn,
        eval_input_fn=fake_input_fn,
        eval_steps=10,
        init_checkpoint=None,
        metric_fn=metric_fn,
        custom_callbacks=None,
        run_eagerly=run_eagerly)
    model_training_utils.run_customized_training_loop(**loop_kwargs)

  @combinations.generate(eager_strategy_combinations())
  def test_train_eager_single_step(self, distribution):
    model_dir = self.get_temp_dir()
    on_tpu = isinstance(distribution, tf.distribute.experimental.TPUStrategy)
    if not on_tpu:
      self.run_training(
          distribution, model_dir, steps_per_loop=1, run_eagerly=True)
      return
    # With a TPU strategy the eager run is expected to raise ValueError.
    with self.assertRaises(ValueError):
      self.run_training(
          distribution, model_dir, steps_per_loop=1, run_eagerly=True)

  @combinations.generate(eager_gpu_strategy_combinations())
  def test_train_eager_mixed_precision(self, distribution):
    model_dir = self.get_temp_dir()
    tf.keras.mixed_precision.experimental.set_policy(
        tf.keras.mixed_precision.experimental.Policy('mixed_float16'))
    self._model_fn = create_model_fn(
        input_shape=[128], num_classes=3, use_float16=True)
    self.run_training(
        distribution, model_dir, steps_per_loop=1, run_eagerly=True)

  @combinations.generate(eager_strategy_combinations())
  def test_train_check_artifacts(self, distribution):
    model_dir = self.get_temp_dir()
    self.run_training(
        distribution, model_dir, steps_per_loop=10, run_eagerly=False)

    # Checkpoint files and the text training summary must exist.
    checkpoint_files = tf.io.gfile.glob(os.path.join(model_dir, 'ctl_step_*'))
    self.assertNotEmpty(checkpoint_files)
    summary_files = tf.io.gfile.glob(
        os.path.join(model_dir, 'summaries/training_summary*'))
    self.assertNotEmpty(summary_files)

    # Loss, accuracy and the model-level metric should be written into the
    # train summaries; accuracy and the model-level metric into the eval ones.
    expected_keywords = (
        ('loss', 'summaries/train'),
        ('accuracy', 'summaries/train'),
        ('mean_input', 'summaries/train'),
        ('accuracy', 'summaries/eval'),
        ('mean_input', 'summaries/eval'),
    )
    for keyword, summary_subdir in expected_keywords:
      self.assertTrue(
          check_eventfile_for_keyword(keyword,
                                      os.path.join(model_dir, summary_subdir)))

  @combinations.generate(
      combinations.combine(
          distribution=[
              strategy_combinations.one_device_strategy_gpu,
          ],
          mode='eager',
      ))
  def test_train_check_artifacts_non_chief(self, distribution):
    # Non-chief workers must not export artifacts. A real
    # MultiWorkerMirroredStrategy is hard to set up in a test, so the
    # strategy's `should_checkpoint` / `should_save_summary` properties are
    # patched to report False instead.
    extended_cls = distribution.extended.__class__
    with mock.patch.object(extended_cls, 'should_checkpoint',
                           new_callable=mock.PropertyMock,
                           return_value=False):
      with mock.patch.object(extended_cls, 'should_save_summary',
                             new_callable=mock.PropertyMock,
                             return_value=False):
        model_dir = self.get_temp_dir()
        self.run_training(
            distribution, model_dir, steps_per_loop=10, run_eagerly=False)
        self.assertEmpty(tf.io.gfile.listdir(model_dir))
if __name__ == '__main__':
tf.test.main()
@@ -0,0 +1,56 @@
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions and classes related to training performance."""
import tensorflow as tf
def configure_optimizer(optimizer,
                        use_float16=False,
                        use_graph_rewrite=False,
                        loss_scale="dynamic"):
  """Configures optimizer object with performance options.

  Args:
    optimizer: the optimizer to configure.
    use_float16: bool. Wrap the optimizer in a `LossScaleOptimizer`.
    use_graph_rewrite: bool. Enable the mixed precision graph rewrite on the
      (possibly already wrapped) optimizer.
    loss_scale: loss scale forwarded to `LossScaleOptimizer`.

  Returns:
    The configured optimizer; the input object itself when both options are
    off.
  """
  configured = optimizer
  if use_float16:
    # compile() would add this wrapper automatically under the
    # "mixed_float16" policy, but compile() is never called here, so the
    # optimizer is wrapped by hand.
    configured = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
        configured, loss_scale=loss_scale)
  if use_graph_rewrite:
    # Note: the model dtype must be 'float32' so that
    # tf.keras.mixed_precision and the graph rewrite do not double up.
    configured = (
        tf.train.experimental.enable_mixed_precision_graph_rewrite(configured))
  return configured
def set_mixed_precision_policy(dtype, loss_scale=None):
  """Sets the global Keras mixed precision policy for `dtype`.

  Args:
    dtype: one of tf.float16, tf.bfloat16 or tf.float32.
    loss_scale: loss scale forwarded to the 'mixed_float16' policy; it is not
      used for the other dtypes.

  Raises:
    ValueError: if `dtype` is none of the three supported dtypes.
  """
  if dtype == tf.float16:
    policy = tf.keras.mixed_precision.experimental.Policy(
        'mixed_float16', loss_scale=loss_scale)
  elif dtype == tf.bfloat16:
    policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
  elif dtype == tf.float32:
    # The plain float32 policy is selected by name.
    policy = 'float32'
  else:
    raise ValueError("Unexpected dtype: %s" % dtype)
  tf.keras.mixed_precision.experimental.set_policy(policy)
@@ -0,0 +1,175 @@
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Common TF utilities."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six
import tensorflow as tf
from tensorflow.python.util import deprecation
from official.modeling import activations
@deprecation.deprecated(
    None,
    "tf.keras.layers.Layer supports multiple positional args and kwargs as "
    "input tensors. pack/unpack inputs to override __call__ is no longer "
    "needed."
)
def pack_inputs(inputs):
  """Flattens `inputs` into a tuple, substituting a marker for each None.

  Args:
    inputs: a (possibly nested) structure of tensors.

  Returns:
    A tuple of tensors in flattened order. Every None entry is replaced by a
    scalar int32 constant 0 that acts as the "None" marker.
  """
  flat_inputs = tf.nest.flatten(inputs)
  return tuple(
      tf.constant(0, shape=[], dtype=tf.int32) if item is None else item
      for item in flat_inputs)
@deprecation.deprecated(
    None,
    "tf.keras.layers.Layer supports multiple positional args and kwargs as "
    "input tensors. pack/unpack inputs to override __call__ is no longer "
    "needed."
)
def unpack_inputs(inputs):
  """Flattens `inputs`, turning special "None" marker tensors back into None.

  Args:
    inputs: a (possibly nested) structure of tensors.

  Returns:
    The single element itself when the flattened structure has exactly one
    entry, otherwise a tuple of the entries. Entries recognised by
    `is_special_none_tensor` are replaced with None.
  """
  flat_inputs = tf.nest.flatten(inputs)
  unpacked = tuple(
      None if is_special_none_tensor(item) else item for item in flat_inputs)
  # A lone element is returned bare rather than as a 1-tuple; this also keeps
  # pylint's 'unbalanced-tuple-unpacking' check quiet at call sites.
  if len(unpacked) == 1:
    return unpacked[0]
  return unpacked
def is_special_none_tensor(tensor):
  """Checks whether `tensor` is a rank-0 int32 tensor (the "None" marker)."""
  if tensor.shape.ndims != 0:
    return False
  return tensor.dtype == tf.int32
# TODO(hongkuny): consider moving custom string-map lookup to keras api.
def get_activation(identifier):
  """Maps an identifier to a Python function, e.g., "relu" => `tf.nn.relu`.

  String identifiers are lower-cased and first looked up among the customized
  activations that are not part of TF. Everything else, including unknown
  names and callables, is resolved by `tf.keras.activations.get`.

  Args:
    identifier: String name of the activation function or callable.

  Returns:
    A Python function corresponding to the activation function.
  """
  if not isinstance(identifier, six.string_types):
    return tf.keras.activations.get(identifier)

  custom_activations = {
      "gelu": activations.gelu,
      "simple_swish": activations.simple_swish,
      "hard_swish": activations.hard_swish,
      "identity": activations.identity,
  }
  lowered = str(identifier).lower()
  # Unknown names fall through to Keras in their lower-cased form.
  return tf.keras.activations.get(custom_activations.get(lowered, lowered))
def get_shape_list(tensor, expected_rank=None, name=None):
  """Returns a list of the shape of tensor, preferring static dimensions.

  Args:
    tensor: A tf.Tensor object to find the shape of.
    expected_rank: (optional) int. The expected rank of `tensor`. If this is
      specified and the `tensor` has a different rank, an exception will be
      thrown.
    name: Optional name of the tensor for the error message.

  Returns:
    A list of dimensions of the shape of tensor. All static dimensions will
    be returned as python integers, and dynamic dimensions will be returned
    as tf.Tensor scalars.
  """
  if expected_rank is not None:
    assert_rank(tensor, expected_rank, name)

  dims = tensor.shape.as_list()
  dynamic_axes = [axis for axis, size in enumerate(dims) if size is None]
  if dynamic_axes:
    # Only build the dynamic shape op when a dimension is actually unknown.
    runtime_shape = tf.shape(tensor)
    for axis in dynamic_axes:
      dims[axis] = runtime_shape[axis]
  return dims
def assert_rank(tensor, expected_rank, name=None):
  """Raises an exception if the tensor rank is not of the expected rank.

  Args:
    tensor: A tf.Tensor to check the rank of.
    expected_rank: Python integer or list of integers, expected rank.
    name: Optional name of the tensor for the error message.

  Raises:
    ValueError: If the expected shape doesn't match the actual shape. This
      includes the case where `tensor.shape.ndims` is None.
  """
  # Accept either a single rank or an iterable of ranks. Trying the iteration
  # first also lets int-like scalars that are not Python ints through.
  try:
    allowed_ranks = set(expected_rank)
  except TypeError:
    allowed_ranks = {expected_rank}

  actual_rank = tensor.shape.ndims
  if actual_rank not in allowed_ranks:
    # `%s` rather than `%d` for the actual rank: `ndims` may be None, and `%d`
    # would then raise TypeError instead of the documented ValueError.
    raise ValueError(
        "For the tensor `%s`, the actual tensor rank `%s` (shape = %s) is not "
        "equal to the expected tensor rank `%s`" %
        (name, actual_rank, str(tensor.shape), str(expected_rank)))
@@ -0,0 +1,735 @@
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Custom training loop for running TensorFlow 2.0 models."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import json
import os
from absl import flags
from absl import logging
import numpy as np
import tensorflow as tf
# pylint: disable=unused-import,g-import-not-at-top,redefined-outer-name,reimported
from typing import Optional, Dict, List, Text, Callable, Union, Iterator, Any
from official.modeling.hyperparams import params_dict
from official.utils.misc import distribution_utils
from official.utils import hyperparams_flags
# Module-level handle on the absl flag values.
FLAGS = flags.FLAGS
# Aliases for the flag dictionaries provided by `hyperparams_flags`.
strategy_flags_dict = hyperparams_flags.strategy_flags_dict
hparam_flags_dict = hyperparams_flags.hparam_flags_dict
def _save_checkpoint(checkpoint, model_dir, checkpoint_prefix):
"""Saves model to model_dir with provided checkpoint prefix."""
checkpoint_path = os.path.join(model_dir, checkpoint_prefix)
saved_path = checkpoint.save(checkpoint_path)
logging.info('Saving model as TF checkpoint: %s', saved_path)
def _steps_to_run(current_step, total_steps, steps_per_loop):
"""Calculates steps to run on device."""
if steps_per_loop <= 0:
raise ValueError('steps_per_loop should be positive integer.')
return min(total_steps - current_step, steps_per_loop)
def _no_metric():
return None
class SummaryWriter(object):
  """Simple SummaryWriter for writing dictionary of metrics.

  Attributes:
    writer: The tf.SummaryWriter.
  """

  def __init__(self, model_dir: Text, name: Text):
    """Creates the file writer under `model_dir/name`.

    Arguments:
      model_dir: the model folder path.
      name: the summary subfolder name.
    """
    summary_path = os.path.join(model_dir, name)
    self.writer = tf.summary.create_file_writer(summary_path)

  def __call__(self, metrics: Union[Dict[Text, float], float], step: int):
    """Writes every metric as a scalar summary and flushes the writer.

    Args:
      metrics: a dictionary of metrics values. Prefer dictionary; a bare value
        is accepted and recorded under the tag 'metric'.
      step: integer. The training step.
    """
    if isinstance(metrics, dict):
      named_metrics = metrics
    else:
      # Support scalar metric without name.
      logging.warning('Warning: summary writer prefer metrics as dictionary.')
      named_metrics = {'metric': metrics}

    with self.writer.as_default():
      for tag, value in named_metrics.items():
        tf.summary.scalar(tag, value, step=step)
      self.writer.flush()
class DistributedExecutor(object):
"""Interface to train and eval models with tf.distribute.Strategy.
Arguments:
strategy: an instance of tf.distribute.Strategy.
params: Model configuration needed to run distribution strategy.
model_fn: Keras model function. Signature:
(params: ParamsDict) -> tf.keras.models.Model.
loss_fn: loss function. Signature:
(y_true: Tensor, y_pred: Tensor) -> Tensor
metric_fn: metric function. Signature: () -> tf.keras.metrics.Metric.
is_multi_host: Set to True when using multi hosts for training, like multi
worker GPU or TPU pod (slice). Otherwise, False.
"""
def __init__(self,
strategy,
params,
model_fn,
loss_fn,
is_multi_host=False):
self._params = params
self._model_fn = model_fn
self._loss_fn = loss_fn
self._strategy = strategy
self._checkpoint_name = 'ctl_step_{step}.ckpt'
self._is_multi_host = is_multi_host
self.train_summary_writer = None
self.eval_summary_writer = None
self.global_train_step = None
@property
def checkpoint_name(self):
"""Returns default checkpoint name."""
return self._checkpoint_name
@checkpoint_name.setter
def checkpoint_name(self, name):
"""Sets default summary writer for the current thread."""
self._checkpoint_name = name
def loss_fn(self):
return self._loss_fn()
def model_fn(self, params):
return self._model_fn(params)
def _save_config(self, model_dir):
"""Save parameters to config files if model_dir is defined."""
logging.info('Save config to model_dir %s.', model_dir)
if model_dir:
if not tf.io.gfile.exists(model_dir):
tf.io.gfile.makedirs(model_dir)
self._params.lock()
params_dict.save_params_dict_to_yaml(self._params,
model_dir + '/params.yaml')
else:
logging.warning('model_dir is empty, so skip the save config.')
def _get_input_iterator(
    self, input_fn: Callable[..., tf.data.Dataset],
    strategy: tf.distribute.Strategy) -> Optional[Iterator[Any]]:
  """Returns distributed dataset iterator.

  Args:
    input_fn: (params: dict) -> tf.data.Dataset.
    strategy: an instance of tf.distribute.Strategy.

  Returns:
    An iterator that yields input tensors, or None when `input_fn` is None.
  """
  if input_fn is None:
    return None

  if self._is_multi_host:
    # When training with multiple TPU workers, datasets needs to be cloned
    # across workers. Since Dataset instance cannot be cloned in eager mode,
    # the callable that builds the dataset is handed over instead.
    distributed = strategy.experimental_distribute_datasets_from_function(
        input_fn)
  else:
    distributed = strategy.experimental_distribute_dataset(input_fn())
  return iter(distributed)
def _create_replicated_step(self,
strategy,
model,
loss_fn,
optimizer,
metric=None):
def _replicated_step(inputs):
"""Replicated training step."""
inputs, labels = inputs
with tf.GradientTape() as tape:
outputs = model(inputs, training=True)
prediction_loss = loss_fn(labels, outputs)
loss = tf.reduce_mean(prediction_loss)
loss = loss / strategy.num_replicas_in_sync
if isinstance(metric, tf.keras.metrics.Metric):
metric.update_state(labels, outputs)
else:
logging.error('train metric is not an instance of '
'tf.keras.metrics.Metric.')
grads = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
return loss
return _replicated_step
def _create_train_step(self,
                       strategy,
                       model,
                       loss_fn,
                       optimizer,
                       metric=None):
  """Creates a distributed training step.

  Args:
    strategy: an instance of tf.distribute.Strategy.
    model: (Tensor, bool) -> Tensor. model function.
    loss_fn: (y_true: Tensor, y_pred: Tensor) -> Tensor.
    optimizer: tf.keras.optimizers.Optimizer.
    metric: tf.keras.metrics.Metric subclass.

  Returns:
    The training step callable.
  """
  replicated_step_fn = self._create_replicated_step(strategy, model, loss_fn,
                                                    optimizer, metric)

  @tf.function
  def train_step(iterator, num_steps):
    """Performs `num_steps` distributed training steps.

    Args:
      iterator: an iterator that yields input tensors.
      num_steps: a Tensor holding the number of steps to run.

    Returns:
      The loss tensor.
    """
    if not isinstance(num_steps, tf.Tensor):
      raise ValueError('steps should be an Tensor. Python object may cause '
                       'retracing.')

    # The first step runs unconditionally; the loop covers the remaining ones.
    replica_losses = strategy.run(
        replicated_step_fn, args=(next(iterator),))
    for _ in tf.range(num_steps - 1):
      replica_losses = strategy.run(
          replicated_step_fn, args=(next(iterator),))

    def _mean_across_replicas(per_replica_value):
      return strategy.reduce(
          tf.distribute.ReduceOp.MEAN, per_replica_value, axis=None)

    # For reporting, the mean of the per-replica losses is returned.
    return tf.nest.map_structure(_mean_across_replicas, replica_losses)

  return train_step
def _create_test_step(self, strategy, model, metric):
  """Creates a distributed test step.

  Args:
    strategy: an instance of tf.distribute.Strategy.
    model: the model called as `model(inputs, training=False)`.
    metric: tf.keras.metrics.Metric updated with every evaluated batch.

  Returns:
    The test step callable, taking an iterator of `(inputs, labels)` pairs.
  """

  @tf.function
  def test_step(iterator):
    """Calculates evaluation metrics on distributed devices."""
    if not metric:
      logging.info('Skip test_step because metric is None (%s)', metric)
      return None, None
    if not isinstance(metric, tf.keras.metrics.Metric):
      raise ValueError(
          'Metric must be an instance of tf.keras.metrics.Metric '
          'for running in test_step. Actual {}'.format(metric))

    def _test_step_fn(batch):
      """Replicated accuracy calculation."""
      features, labels = batch
      model_outputs = model(features, training=False)
      metric.update_state(labels, model_outputs)
      return labels, model_outputs

    return strategy.run(_test_step_fn, args=(next(iterator),))

  return test_step
def train(self,
          train_input_fn: Callable[[params_dict.ParamsDict], tf.data.Dataset],
          eval_input_fn: Callable[[params_dict.ParamsDict],
                                  tf.data.Dataset] = None,
          model_dir: Text = None,
          total_steps: int = 1,
          iterations_per_loop: int = 1,
          train_metric_fn: Callable[[], Any] = None,
          eval_metric_fn: Callable[[], Any] = None,
          summary_writer_fn: Callable[[Text, Text],
                                      SummaryWriter] = SummaryWriter,
          init_checkpoint: Callable[[tf.keras.Model], Any] = None,
          custom_callbacks: List[tf.keras.callbacks.Callback] = None,
          save_config: bool = True):
  """Runs distributed training.

  Args:
    train_input_fn: (params: dict) -> tf.data.Dataset training data input
      function.
    eval_input_fn: (Optional) same type as train_input_fn. If not None, will
      trigger evaluating metric on eval data. If None, will not run eval step.
    model_dir: the folder path for model checkpoints.
    total_steps: total training steps.
    iterations_per_loop: train steps per loop. After each loop, this job will
      update metrics like loss and save checkpoint.
    train_metric_fn: metric_fn for evaluation in train_step.
    eval_metric_fn: metric_fn for evaluation in test_step.
    summary_writer_fn: function to create summary writer.
    init_checkpoint: function to load checkpoint. It is only used when no
      checkpoint is found in `model_dir`.
    custom_callbacks: A list of Keras Callbacks objects to run during
      training. More specifically, `on_batch_begin()`, `on_batch_end()`,
      methods are invoked during training.
    save_config: bool. Whether to save params to model_dir.

  Returns:
    The training loss and eval metrics.

  Raises:
    ValueError: if a metric function is given but is not callable, if the
      model built by `model_fn` has no `optimizer` attribute, or if the total
      loss becomes NaN.
  """
  assert train_input_fn is not None
  if train_metric_fn and not callable(train_metric_fn):
    raise ValueError('if `train_metric_fn` is specified, '
                     'train_metric_fn must be a callable.')
  if eval_metric_fn and not callable(eval_metric_fn):
    raise ValueError('if `eval_metric_fn` is specified, '
                     'eval_metric_fn must be a callable.')
  train_metric_fn = train_metric_fn or _no_metric
  eval_metric_fn = eval_metric_fn or _no_metric

  if custom_callbacks and iterations_per_loop != 1:
    logging.error(
        'It is sematically wrong to run callbacks when '
        'iterations_per_loop is not one (%s)', iterations_per_loop)

  def _run_callbacks_on_batch_begin(batch):
    """Runs custom callbacks at the start of every step."""
    for callback in custom_callbacks or ():
      if callback:
        callback.on_batch_begin(batch)

  def _run_callbacks_on_batch_end(batch):
    """Runs custom callbacks at the end of every step."""
    for callback in custom_callbacks or ():
      if callback:
        callback.on_batch_end(batch)

  def _to_float(tensor):
    """Converts a tensor to a Python-side float value."""
    return tensor.numpy().astype(float)

  if save_config:
    self._save_config(model_dir)

  # The flag wins when it is set; otherwise save once per loop.
  save_freq = FLAGS.save_checkpoint_freq or iterations_per_loop

  params = self._params
  strategy = self._strategy
  # To reduce unnecessary send/receive input pipeline operation, we place
  # input pipeline ops in worker task.
  train_iterator = self._get_input_iterator(train_input_fn, strategy)
  train_loss = None
  eval_metric_result = None
  with strategy.scope():
    # To correctly place the model weights on accelerators,
    # model and optimizer should be created in scope.
    model = self.model_fn(params.as_dict())
    if not hasattr(model, 'optimizer'):
      raise ValueError('User should set optimizer attribute to model '
                       'inside `model_fn`.')
    optimizer = model.optimizer

    # Resume from the newest checkpoint in model_dir when there is one;
    # otherwise fall back to the optional init_checkpoint function.
    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
    latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
    initial_step = 0
    if latest_checkpoint_file:
      logging.info(
          'Checkpoint file %s found and restoring from '
          'checkpoint', latest_checkpoint_file)
      checkpoint.restore(latest_checkpoint_file)
      initial_step = optimizer.iterations.numpy()
      logging.info('Loading from checkpoint file completed. Init step %d',
                   initial_step)
    elif init_checkpoint:
      logging.info('Restoring from init checkpoint function')
      init_checkpoint(model)
      logging.info('Loading from init checkpoint file completed')

    current_step = optimizer.iterations.numpy()
    checkpoint_name = self.checkpoint_name

    eval_metric = eval_metric_fn()
    train_metric = train_metric_fn()
    train_summary_writer = summary_writer_fn(model_dir, 'eval_train')
    self.train_summary_writer = train_summary_writer.writer

    test_summary_writer = summary_writer_fn(model_dir, 'eval_test')
    self.eval_summary_writer = test_summary_writer.writer

    # Continue training loop.
    train_step = self._create_train_step(
        strategy=strategy,
        model=model,
        loss_fn=self.loss_fn(),
        optimizer=optimizer,
        metric=train_metric)
    test_step = None
    if eval_input_fn and eval_metric:
      self.global_train_step = model.optimizer.iterations
      test_step = self._create_test_step(strategy, model, metric=eval_metric)

    logging.info('Training started')
    last_save_checkpoint_step = current_step
    while current_step < total_steps:
      num_steps = _steps_to_run(current_step, total_steps, iterations_per_loop)
      _run_callbacks_on_batch_begin(current_step)
      train_loss = train_step(train_iterator,
                              tf.convert_to_tensor(num_steps, dtype=tf.int32))
      _run_callbacks_on_batch_end(current_step)
      current_step += num_steps

      train_loss = tf.nest.map_structure(_to_float, train_loss)
      if not isinstance(train_loss, dict):
        train_loss = {'total_loss': train_loss}
      if np.isnan(train_loss['total_loss']):
        raise ValueError('total loss is NaN.')

      if train_metric:
        train_metric_result = train_metric.result()
        if isinstance(train_metric, tf.keras.metrics.Metric):
          train_metric_result = tf.nest.map_structure(_to_float,
                                                      train_metric_result)
        if not isinstance(train_metric_result, dict):
          train_metric_result = {'metric': train_metric_result}
        train_metric_result.update(train_loss)
      else:
        train_metric_result = train_loss

      # The learning rate may be a schedule (callable) or a plain variable.
      if callable(optimizer.lr):
        learning_rate = optimizer.lr(current_step).numpy()
      else:
        learning_rate = optimizer.lr.numpy()
      train_metric_result.update({'learning_rate': learning_rate})

      logging.info('Train Step: %d/%d / loss = %s / training metric = %s',
                   current_step, total_steps, train_loss,
                   train_metric_result)
      train_summary_writer(
          metrics=train_metric_result, step=optimizer.iterations)

      # Saves model checkpoints and run validation steps at every
      # iterations_per_loop steps.
      # To avoid repeated model saving, we do not save after the last
      # step of training.
      steps_since_save = current_step - last_save_checkpoint_step
      if save_freq > 0 and current_step < total_steps and (
          steps_since_save >= save_freq):
        _save_checkpoint(checkpoint, model_dir,
                         checkpoint_name.format(step=current_step))
        last_save_checkpoint_step = current_step

      if test_step:
        eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
        eval_metric_result = self._run_evaluation(test_step, current_step,
                                                  eval_metric, eval_iterator)
        logging.info('Step: %s evalation metric = %s.', current_step,
                     eval_metric_result)
        test_summary_writer(
            metrics=eval_metric_result, step=optimizer.iterations)

      # Re-initialize evaluation metric, except the last step.
      if eval_metric and current_step < total_steps:
        eval_metric.reset_states()
      if train_metric and current_step < total_steps:
        train_metric.reset_states()

    # Reaches the end of training and saves the last checkpoint.
    if last_save_checkpoint_step < total_steps:
      _save_checkpoint(checkpoint, model_dir,
                       checkpoint_name.format(step=current_step))

    if test_step:
      logging.info('Running final evaluation after training is complete.')
      eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
      eval_metric_result = self._run_evaluation(test_step, current_step,
                                                eval_metric, eval_iterator)
      logging.info('Final evaluation metric = %s.', eval_metric_result)
      test_summary_writer(
          metrics=eval_metric_result, step=optimizer.iterations)

  return train_loss, eval_metric_result
def _run_evaluation(self, test_step, current_training_step, metric,
test_iterator):
"""Runs validation steps and aggregate metrics."""
if not test_iterator or not metric:
logging.warning(
'Both test_iterator (%s) and metrics (%s) must not be None.',
test_iterator, metric)
return None
logging.info('Running evaluation after step: %s.', current_training_step)
while True:
try:
test_step(test_iterator)
except (StopIteration, tf.errors.OutOfRangeError):
break
metric_result = metric.result()
if isinstance(metric, tf.keras.metrics.Metric):
metric_result = metric_result.numpy().astype(float)
logging.info('Step: [%d] Validation metric = %f', current_training_step,
metric_result)
return metric_result
def evaluate_from_model_dir(
    self,
    model_dir: Text,
    eval_input_fn: Callable[[params_dict.ParamsDict], tf.data.Dataset],
    eval_metric_fn: Callable[[], Any],
    total_steps: int = -1,
    eval_timeout: int = None,
    min_eval_interval: int = 180,
    summary_writer_fn: Callable[[Text, Text], SummaryWriter] = SummaryWriter):
  """Runs distributed evaluation on model folder.

  Every checkpoint yielded by `tf.train.checkpoints_iterator` for `model_dir`
  is evaluated with `self.evaluate_checkpoint`.

  Args:
    model_dir: the folder for storing model checkpoints.
    eval_input_fn: same type as train_input_fn; forwarded to
      `self.evaluate_checkpoint`.
    eval_metric_fn: metric_fn for evaluation in test_step.
    total_steps: total training steps. If the current step reaches the
      total_steps, the evaluation loop will stop. Values <= 0 disable this
      stop condition.
    eval_timeout: The maximum number of seconds to wait between checkpoints.
      If left as None, then the process will wait indefinitely. Used by
      tf.train.checkpoints_iterator.
    min_eval_interval: The minimum number of seconds between yielding
      checkpoints. Used by tf.train.checkpoints_iterator.
    summary_writer_fn: function to create summary writer; called as
      `summary_writer_fn(model_dir, 'eval')`.

  Returns:
    Eval metrics of the last evaluated checkpoint, or None when the
    checkpoint iterator ended without yielding any checkpoint.

  Raises:
    ValueError: if `model_dir` is empty.
  """
  if not model_dir:
    raise ValueError('model_dir must be set.')

  def terminate_eval():
    # Use the same `logging` module as the rest of this class; `tf.logging`
    # is not available in TF 2.x and would turn a timeout into an
    # AttributeError.
    logging.info('Terminating eval after %d seconds of no checkpoints',
                 eval_timeout)
    return True

  summary_writer = summary_writer_fn(model_dir, 'eval')
  self.eval_summary_writer = summary_writer.writer

  # Stays None if the iterator times out before the first checkpoint, so the
  # final return cannot hit an unbound local.
  eval_metric_result = None

  # Read checkpoints from the given model directory
  # until `eval_timeout` seconds elapses.
  for checkpoint_path in tf.train.checkpoints_iterator(
      model_dir,
      min_interval_secs=min_eval_interval,
      timeout=eval_timeout,
      timeout_fn=terminate_eval):
    eval_metric_result, current_step = self.evaluate_checkpoint(
        checkpoint_path=checkpoint_path,
        eval_input_fn=eval_input_fn,
        eval_metric_fn=eval_metric_fn,
        summary_writer=summary_writer)
    if total_steps > 0 and current_step >= total_steps:
      logging.info('Evaluation finished after training step %d', current_step)
      break
  return eval_metric_result
def evaluate_checkpoint(self,
                        checkpoint_path: Text,
                        eval_input_fn: Callable[[params_dict.ParamsDict],
                                                tf.data.Dataset],
                        eval_metric_fn: Callable[[], Any],
                        summary_writer: SummaryWriter = None):
  """Runs distributed evaluation on the one checkpoint.

  Args:
    checkpoint_path: the checkpoint to evaluate.
    eval_input_fn: same type as train_input_fn; passed to
      `self._get_input_iterator` to build the evaluation iterator.
    eval_metric_fn: metric_fn for evaluation in test_step.
    summary_writer: (Optional) callable invoked as
      `summary_writer(metrics=..., step=...)` with the evaluation result.
      If None, no summary is written.

  Returns:
    A tuple `(eval_metric_result, current_step)` where `current_step` is the
    optimizer iteration count read from the checkpoint.

  Raises:
    ValueError: if `eval_metric_fn` is not callable or `checkpoint_path` is
      empty.
  """
  if not callable(eval_metric_fn):
    raise ValueError('if `eval_metric_fn` is specified, '
                     'eval_metric_fn must be a callable.')

  params = self._params
  strategy = self._strategy
  # To reduce unnecessary send/receive input pipeline operation, we place
  # input pipeline ops in worker task.
  with strategy.scope():
    # To correctly place the model weights on accelerators,
    # model and optimizer should be created in scope.
    model = self.model_fn(params.as_dict())
    checkpoint = tf.train.Checkpoint(model=model)

    eval_metric = eval_metric_fn()
    assert eval_metric, 'eval_metric does not exist'
    test_step = self._create_test_step(strategy, model, metric=eval_metric)

    logging.info('Starting to evaluate.')
    if not checkpoint_path:
      raise ValueError('checkpoint path is empty')
    # The training step is read straight from the checkpoint file.
    reader = tf.compat.v1.train.NewCheckpointReader(checkpoint_path)
    current_step = reader.get_tensor(
        'optimizer/iter/.ATTRIBUTES/VARIABLE_VALUE')
    logging.info(
        'Checkpoint file %s found and restoring from '
        'checkpoint', checkpoint_path)
    checkpoint.restore(checkpoint_path)

    self.global_train_step = model.optimizer.iterations
    eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
    eval_metric_result = self._run_evaluation(test_step, current_step,
                                              eval_metric, eval_iterator)
    logging.info('Step: %s evalation metric = %s.', current_step,
                 eval_metric_result)
    # `summary_writer` defaults to None; calling it unconditionally would
    # raise TypeError for callers relying on the default.
    if summary_writer is not None:
      summary_writer(metrics=eval_metric_result, step=current_step)
    eval_metric.reset_states()

  return eval_metric_result, current_step
def predict(self):
  """Placeholder for prediction; not implemented.

  Raises:
    NotImplementedError: always.
  """
  # The exception must be raised, not returned: returning it hands callers an
  # exception instance as if it were a valid prediction result.
  raise NotImplementedError('Unimplmented function.')
class ExecutorBuilder(object):
  """Builder of DistributedExecutor.

  Example 1: Builds an executor with supported Strategy.
    builder = ExecutorBuilder(
        strategy_type='tpu',
        strategy_config={'tpu': '/bns/xxx'})
    dist_executor = builder.build_executor(
        params=params,
        model_fn=my_model_fn,
        loss_fn=my_loss_fn,
        metric_fn=my_metric_fn)

  Example 2: Builds an executor with customized Strategy.
    builder = ExecutorBuilder()
    builder.strategy = <some customized Strategy>
    dist_executor = builder.build_executor(
        params=params,
        model_fn=my_model_fn,
        loss_fn=my_loss_fn,
        metric_fn=my_metric_fn)

  Example 3: Builds a customized executor with customized Strategy.
    class MyDistributedExecutor(DistributedExecutor):
      # implementation ...

    builder = ExecutorBuilder()
    builder.strategy = <some customized Strategy>
    dist_executor = builder.build_executor(
        class_ctor=MyDistributedExecutor,
        params=params,
        model_fn=my_model_fn,
        loss_fn=my_loss_fn,
        metric_fn=my_metric_fn)

  Args:
    strategy_type: string. One of 'tpu', 'mirrored', 'multi_worker_mirrored'.
      If None, the user is responsible for setting the strategy before
      calling build_executor(...).
    strategy_config: necessary config for constructing the proper Strategy.
      It is read through attribute access (`worker_hosts`, `task_index`,
      `num_gpus`, `all_reduce_alg`, `num_packs`, `tpu`). Check
      strategy_flags_dict() for examples of the structure. May be None only
      when `strategy_type` is None as well.

  Raises:
    ValueError: if `strategy_type` is given without a `strategy_config`.
  """

  def __init__(self, strategy_type=None, strategy_config=None):
    if strategy_config is None:
      # Without a config there is nothing to configure the cluster or the
      # strategy from. A bare `ExecutorBuilder()` is a documented use case
      # (the strategy is assigned afterwards), so it must not crash here.
      if strategy_type is not None:
        raise ValueError('`strategy_config` must be provided when '
                         '`strategy_type` is specified.')
      self._strategy = None
      return

    _ = distribution_utils.configure_cluster(
        strategy_config.worker_hosts, strategy_config.task_index)
    self._strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=strategy_type,
        num_gpus=strategy_config.num_gpus,
        all_reduce_alg=strategy_config.all_reduce_alg,
        num_packs=strategy_config.num_packs,
        tpu_address=strategy_config.tpu)

  @property
  def strategy(self):
    """Returns the distribution strategy used to build executors."""
    return self._strategy

  @strategy.setter
  def strategy(self, new_strategy):
    """Sets the distribution strategy used to build executors."""
    self._strategy = new_strategy

  def build_executor(self,
                     class_ctor=DistributedExecutor,
                     params=None,
                     model_fn=None,
                     loss_fn=None,
                     **kwargs):
    """Creates an executor according to strategy type.

    See doc string of the DistributedExecutor.__init__ for more information of
    the input arguments.

    Args:
      class_ctor: A constructor of executor (default: DistributedExecutor).
      params: ParamsDict, all the model parameters and runtime parameters.
      model_fn: Keras model function.
      loss_fn: loss function.
      **kwargs: other arguments to the executor constructor.

    Returns:
      An instance of DistributedExecutor or its subclass.

    Raises:
      ValueError: if no strategy has been set on the builder.
    """
    if self._strategy is None:
      raise ValueError('`strategy` should not be None. You need to specify '
                       '`strategy_type` in the builder contructor or directly '
                       'set the `strategy` property of the builder.')
    return class_ctor(
        strategy=self._strategy,
        params=params,
        model_fn=model_fn,
        loss_fn=loss_fn,
        **kwargs)
@@ -0,0 +1,88 @@
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Sets up TensorFlow Official Models."""
import datetime
import os
import sys
from setuptools import find_packages
from setuptools import setup
# Default package identity; the name may be overridden from the command line.
project_name = 'tf-models-official'
version = '2.2.0'

long_description = """The TensorFlow official models are a collection of
models that use TensorFlow's high-level APIs.
They are intended to be well-maintained, tested, and kept up to date with the
latest TensorFlow API. They should also be reasonably optimized for fast
performance while still being easy to read."""

# `--project_name <name>` is a custom option: read the value, then strip both
# the flag and its value from sys.argv.
if '--project_name' in sys.argv:
  flag_position = sys.argv.index('--project_name')
  project_name = sys.argv[flag_position + 1]
  del sys.argv[flag_position:flag_position + 2]
def _get_requirements():
  """Parses requirements.txt file.

  Reads `../requirements.txt` relative to this file. A line starting with
  `-e ` contributes the text after that prefix as a dependency link; every
  other non-blank, non-comment line is an install requirement.

  Returns:
    A tuple `(install_requires, dependency_links)` of lists of strings.
  """
  install_requires_tmp = []
  dependency_links_tmp = []
  requirements_path = os.path.join(
      os.path.dirname(__file__), '../requirements.txt')
  with open(requirements_path, 'r') as f:
    for line in f:
      package_name = line.strip()
      # Blank lines and `#` comments are not requirements; forwarding them
      # to setup() would add empty or invalid entries to install_requires.
      if not package_name or package_name.startswith('#'):
        continue
      if package_name.startswith('-e '):
        dependency_links_tmp.append(package_name[3:].strip())
      else:
        install_requires_tmp.append(package_name)
  return install_requires_tmp, dependency_links_tmp
install_requires, dependency_links = _get_requirements()

# Nightly packages carry a date-stamped dev version and depend on tf-nightly;
# every other package name depends on a released TensorFlow.
is_nightly = project_name == 'tf-models-nightly'
if is_nightly:
  version = version + '.dev' + datetime.datetime.now().strftime('%Y%m%d')
tensorflow_requirement = 'tf-nightly' if is_nightly else 'tensorflow>=2.1.0'
install_requires.append(tensorflow_requirement)

print('install_requires: ', install_requires)
print('dependency_links: ', dependency_links)

# Package patterns left out of the distribution.
excluded_packages = [
    'research*',
    'tutorials*',
    'samples*',
    'official.r1*',
    'official.pip_package*',
    'official.benchmark*',
]

setup(
    name=project_name,
    version=version,
    description='TensorFlow Official Models',
    long_description=long_description,
    author='Google Inc.',
    author_email='no-reply@google.com',
    url='https://github.com/tensorflow/models',
    license='Apache 2.0',
    packages=find_packages(exclude=excluded_packages),
    exclude_package_data={
        '': ['*_test.py',],
    },
    install_requires=install_requires,
    dependency_links=dependency_links,
    python_requires='>=3.6',
)
@@ -0,0 +1,23 @@
# Legacy Models Collection
The R1 folder contains legacy model implementations and models that will not
be updated to TensorFlow 2.x. They do not have solid performance tracking.
**Note: models will be removed from the master branch by 2020/06.**
After removal, you can still access these legacy models in previously
released tags, e.g. [v2.1.0](https://github.com/tensorflow/models/releases/tag/v2.1.0).
## Legacy model implementation
The Transformer and MNIST implementations use the pure TF 1.x TF-Estimator API.
Users should follow the corresponding TF 2.x implementations inside the
official model garden.
## Models that will not update to TensorFlow 2.x
* [boosted_trees](boosted_trees): A Gradient Boosted Trees model to
classify higgs boson process from HIGGS Data Set.
* [wide_deep](wide_deep): A model that combines a wide model and deep
network to classify census income data.
@@ -0,0 +1,112 @@
# Classifying Higgs boson processes in the HIGGS Data Set
## Overview
The [HIGGS Data Set](https://archive.ics.uci.edu/ml/datasets/HIGGS) contains 11 million samples with 28 features, and poses a classification problem: distinguishing between a signal process which produces Higgs bosons and a background process which does not.
We use Gradient Boosted Trees algorithm to distinguish the two classes.
---
The code sample uses the high level `tf.estimator.Estimator` and `tf.data.Dataset`. These APIs are great for fast iteration and quickly adapting models to your own datasets without major code overhauls. It allows you to move from single-worker training to distributed training, and makes it easy to export model binaries for prediction. Here, for further simplicity and faster execution, we use a utility function `tf.contrib.estimator.boosted_trees_classifier_train_in_memory`. This utility function is especially effective when the input is provided as in-memory data sets like numpy arrays.
An input function for the `Estimator` typically uses `tf.data.Dataset` API, which can handle various data control like streaming, batching, transform and shuffling. However `boosted_trees_classifier_train_in_memory()` utility function requires that the entire data is provided as a single batch (i.e. without using `batch()` API). Thus in this practice, simply `Dataset.from_tensors()` is used to convert numpy arrays into structured tensors, and `Dataset.zip()` is used to put features and label together.
For further references of `Dataset`, [Read more here](https://www.tensorflow.org/guide/datasets).
## Running the code
First make sure you've [added the models folder to your Python path](/official/#running-the-models); otherwise you may encounter an error like `ImportError: No module named official.boosted_trees`.
### Setup
The [HIGGS Data Set](https://archive.ics.uci.edu/ml/datasets/HIGGS) that this sample uses for training is hosted by the [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/). We have provided a script that downloads and cleans the necessary files.
```
python data_download.py
```
This will download a file and store the processed file under the directory designated by `--data_dir` (defaults to `/tmp/higgs_data/`). To change the target directory, set the `--data_dir` flag. The directory can be on any network storage that TensorFlow supports (like Google Cloud Storage, `gs://<bucket>/<path>/`).
The file downloaded to the local temporary folder is about 2.8 GB, and the processed file is about 0.8 GB, so there should be enough storage to handle them.
### Training
This example uses about 3 GB of RAM during training.
You can run the code locally as follows:
```
python train_higgs.py
```
The model is by default saved to `/tmp/higgs_model`, which can be changed using the `--model_dir` flag.
Note that the `model_dir` is cleaned up every time before training starts.
Model parameters can be adjusted by flags, like `--n_trees`, `--max_depth`, `--learning_rate` and so on. Check out the code for details.
The final accuracy will be around 74% and loss will be around 0.516 over the eval set, when trained with the default parameters.
By default, the first 1 million examples out of 11 million are used for training, and the last 1 million examples are used for evaluation.
The training/evaluation data can be selected as index ranges by flags `--train_start`, `--train_count`, `--eval_start`, `--eval_count`, etc.
### TensorBoard
Run TensorBoard to inspect the details about the graph and training progression.
```
tensorboard --logdir=/tmp/higgs_model # set logdir as --model_dir set during training.
```
## Inference with SavedModel
You can export the model into Tensorflow [SavedModel](https://www.tensorflow.org/guide/saved_model) format by using the argument `--export_dir`:
```
python train_higgs.py --export_dir /tmp/higgs_boosted_trees_saved_model
```
After the model finishes training, use [`saved_model_cli`](https://www.tensorflow.org/guide/saved_model#cli_to_inspect_and_execute_savedmodel) to inspect and execute the SavedModel.
Try the following commands to inspect the SavedModel:
**Replace `${TIMESTAMP}` with the folder produced (e.g. 1524249124)**
```
# List possible tag_sets. Only one metagraph is saved, so there will be one option.
saved_model_cli show --dir /tmp/higgs_boosted_trees_saved_model/${TIMESTAMP}/
# Show SignatureDefs for tag_set=serve. SignatureDefs define the outputs to show.
saved_model_cli show --dir /tmp/higgs_boosted_trees_saved_model/${TIMESTAMP}/ \
--tag_set serve --all
```
### Inference
Let's use the model to predict the class (signal or background) of two examples.
Note that this model exports SavedModel with the custom parsing module that accepts csv lines as features. (Each line is an example with 28 columns; be careful to not add a label column, unlike in the training data.)
```
saved_model_cli run --dir /tmp/higgs_boosted_trees_saved_model/${TIMESTAMP}/ \
--tag_set serve --signature_def="predict" \
--input_exprs='inputs=["0.869293,-0.635082,0.225690,0.327470,-0.689993,0.754202,-0.248573,-1.092064,0.0,1.374992,-0.653674,0.930349,1.107436,1.138904,-1.578198,-1.046985,0.0,0.657930,-0.010455,-0.045767,3.101961,1.353760,0.979563,0.978076,0.920005,0.721657,0.988751,0.876678", "1.595839,-0.607811,0.007075,1.818450,-0.111906,0.847550,-0.566437,1.581239,2.173076,0.755421,0.643110,1.426367,0.0,0.921661,-1.190432,-1.615589,0.0,0.651114,-0.654227,-1.274345,3.101961,0.823761,0.938191,0.971758,0.789176,0.430553,0.961357,0.957818"]'
```
This will print out the predicted classes and class probabilities. Something like:
```
Result for output key class_ids:
[[1]
[0]]
Result for output key classes:
[['1']
['0']]
Result for output key logistic:
[[0.6440273 ]
[0.10902369]]
Result for output key logits:
[[ 0.59288704]
[-2.1007526 ]]
Result for output key probabilities:
[[0.3559727 0.6440273]
[0.8909763 0.1090237]]
```
Please note that "predict" signature_def gives out different (more detailed) results than "classification" or "serving_default".
## Additional Links
If you are interested in distributed training, take a look at [Distributed TensorFlow](https://www.tensorflow.org/deploy/distributed).
You can also [train models on Cloud ML Engine](https://cloud.google.com/ml-engine/docs/getting-started-training-prediction), which provides [hyperparameter tuning](https://cloud.google.com/ml-engine/docs/getting-started-training-prediction#hyperparameter_tuning) to maximize your model's results and enables [deploying your model for prediction](https://cloud.google.com/ml-engine/docs/getting-started-training-prediction#deploy_a_model_to_support_prediction).
@@ -0,0 +1,97 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Downloads the UCI HIGGS Dataset and prepares train data.
The details on the dataset are in https://archive.ics.uci.edu/ml/datasets/HIGGS
It takes a while as it needs to download 2.8 GB over the network, process, then
store it into the specified location as a compressed numpy file.
Usage:
$ python data_download.py --data_dir=/tmp/higgs_data
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gzip
import os
import tempfile
# pylint: disable=g-bad-import-order
import numpy as np
import pandas as pd
from six.moves import urllib
from absl import app as absl_app
from absl import flags
import tensorflow as tf
from official.utils.flags import core as flags_core
# Location and name of the raw, gzip-compressed csv on the UCI archive.
URL_ROOT = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280"
INPUT_FILE = "HIGGS.csv.gz"
# numpy compressed file to contain "data" array.
NPZ_FILE = INPUT_FILE + ".npz"
def _download_higgs_data_and_save_npz(data_dir):
  """Download higgs data and store as a numpy compressed file.

  Args:
    data_dir: directory that receives NPZ_FILE; created when missing.

  Raises:
    ValueError: if the processed file already exists in `data_dir`.
  """
  input_url = URL_ROOT + "/" + INPUT_FILE
  np_filename = os.path.join(data_dir, NPZ_FILE)
  if tf.gfile.Exists(np_filename):
    raise ValueError("data_dir already has the processed data file: {}".format(
        np_filename))
  if not tf.gfile.Exists(data_dir):
    tf.gfile.MkDir(data_dir)
  # 2.8 GB to download.
  tf.logging.info("Data downloading...")
  # The download happens before the try block: if it fails there is no temp
  # file to remove, and the cleanup must not mask the real error with a
  # NameError on an unbound `temp_filename`.
  temp_filename, _ = urllib.request.urlretrieve(input_url)
  try:
    # Reading and parsing 11 million csv lines takes 2~3 minutes.
    tf.logging.info("Data processing... taking multiple minutes...")
    with gzip.open(temp_filename, "rb") as csv_file:
      # `.values` replaces `DataFrame.as_matrix()`, which newer pandas
      # releases no longer provide.
      data = pd.read_csv(
          csv_file,
          dtype=np.float32,
          names=["c%02d" % i for i in range(29)]  # label + 28 features.
      ).values
  finally:
    tf.gfile.Remove(temp_filename)

  # Writing to temporary location then copy to the data_dir (0.8 GB).
  with tempfile.NamedTemporaryFile() as f:
    np.savez_compressed(f, data=data)
    # Push buffered bytes to disk before the file is read back by name;
    # otherwise the copy may be truncated.
    f.flush()
    tf.gfile.Copy(f.name, np_filename)
  tf.logging.info("Data saved to: {}".format(np_filename))
def main(unused_argv):
  """Ensures the target directory exists, then downloads and converts data."""
  target_dir = FLAGS.data_dir
  target_missing = not tf.gfile.Exists(target_dir)
  if target_missing:
    tf.gfile.MkDir(target_dir)
  _download_higgs_data_and_save_npz(target_dir)
def define_data_download_flags():
  """Registers the `data_dir` string flag used by this script."""
  data_dir_help = flags_core.help_wrap(
      "Directory to download higgs dataset and store training/eval data.")
  flags.DEFINE_string(
      name="data_dir", default="/tmp/higgs_data", help=data_dir_help)
if __name__ == "__main__":
  # `main` reads the module-level FLAGS, so bind it before handing control
  # to absl.
  FLAGS = flags.FLAGS
  tf.logging.set_verbosity(tf.logging.INFO)
  define_data_download_flags()
  absl_app.run(main)
@@ -0,0 +1,297 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""A script that builds boosted trees over higgs data.
If you haven't, please run data_download.py beforehand to prepare the data.
For some more details on this example, please refer to README.md as well.
Note that the model_dir is cleaned up before starting the training.
Usage:
$ python train_higgs.py --n_trees=100 --max_depth=6 --learning_rate=0.1 \
--model_dir=/tmp/higgs_model
Note that BoostedTreesClassifier is available since Tensorflow 1.8.0.
So you need to install recent enough version of Tensorflow to use this example.
The training data is by default the first million examples out of 11M examples,
and eval data is by default the last million examples.
They are controlled by --train_start, --train_count, --eval_start, --eval_count.
e.g. to train over the first 10 million examples instead of 1 million:
$ python train_higgs.py --n_trees=100 --max_depth=6 --learning_rate=0.1 \
--model_dir=/tmp/higgs_model --train_count=10000000
Training history and metrics can be inspected using tensorboard.
Set --logdir as the --model_dir set by flag when training
(or the default /tmp/higgs_model).
$ tensorboard --logdir=/tmp/higgs_model
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
# pylint: disable=g-bad-import-order
import numpy as np
from absl import app as absl_app
from absl import flags
import tensorflow as tf
# pylint: enable=g-bad-import-order
from official.utils.flags import core as flags_core
from official.utils.flags._conventions import help_wrap
from official.utils.logs import logger
# File name, joined with the data directory by read_higgs_data(), of the
# numpy archive whose "data" entry holds the whole dataset array.
NPZ_FILE = "HIGGS.csv.gz.npz"  # numpy compressed file containing "data" array
def read_higgs_data(data_dir, train_start, train_count, eval_start, eval_count):
  """Loads the prepared higgs npz file and slices out train and eval rows.

  Args:
    data_dir: A string, the directory of higgs dataset.
    train_start: An integer, the start index of train examples within the data.
    train_count: An integer, the number of train examples within the data.
    eval_start: An integer, the start index of eval examples within the data.
    eval_count: An integer, the number of eval examples within the data.

  Returns:
    Numpy array of train data and eval data.

  Raises:
    RuntimeError: if the npz file cannot be found.
  """
  npz_path = os.path.join(data_dir, NPZ_FILE)
  try:
    # gfile allows numpy to read data from network data sources as well.
    with tf.gfile.Open(npz_path, "rb") as npz_handle:
      with np.load(npz_handle) as archive:
        data = archive["data"]
  except tf.errors.NotFoundError as e:
    message = (
        "Error loading data; use data_download.py to prepare the data.\n{}: {}")
    raise RuntimeError(message.format(type(e).__name__, e))

  train_stop = train_start + train_count
  eval_stop = eval_start + eval_count
  return data[train_start:train_stop], data[eval_start:eval_stop]
# Example of building an input_fn when the whole dataset is already held in
# numpy arrays.
def make_inputs_from_np_arrays(features_np, label_np):
  """Builds the training input_fn and bucketized feature columns.

  The generated input_fn will return tf.data.Dataset of feature dictionary and a
  label, and feature_columns will consist of the list of
  tf.feature_column.BucketizedColumn.

  Note, for in-memory training, tf.data.Dataset should contain the whole data
  as a single tensor. Don't use batch.

  Args:
    features_np: A numpy ndarray (shape=[batch_size, num_features]) for
      float32 features.
    label_np: A numpy ndarray (shape=[batch_size, 1]) for labels.

  Returns:
    input_fn: A function returning a Dataset of feature dict and label.
    feature_names: A list of feature names.
    feature_column: A list of tf.feature_column.BucketizedColumn.
  """
  num_features = features_np.shape[1]
  # One [batch_size, 1] array per feature column.
  per_feature_arrays = np.split(features_np, num_features, axis=1)
  # 1-based feature names.
  feature_names = [
      "feature_%02d" % position for position in range(1, num_features + 1)]

  def get_bucket_boundaries(feature):
    """Returns bucket boundaries for feature by percentiles."""
    percentiles = np.percentile(feature, range(0, 100))
    return np.unique(percentiles).tolist()

  def bucketize(name, values):
    """Wraps a numeric source column for `name` in a bucketized column."""
    source_column = tf.feature_column.numeric_column(
        name, dtype=tf.float32,
        # Although higgs data have no missing values, in general, default
        # could be set as 0 or some reasonable value for missing values.
        default_value=0.0)
    return tf.feature_column.bucketized_column(
        source_column, boundaries=get_bucket_boundaries(values))

  bucketized_columns = [
      bucketize(name, values)
      for name, values in zip(feature_names, per_feature_arrays)
  ]

  def input_fn():
    """Returns features as a dictionary of numpy arrays, and a label."""
    features = {}
    for name, values in zip(feature_names, per_feature_arrays):
      features[name] = tf.constant(values)
    feature_dataset = tf.data.Dataset.from_tensors(features)
    label_dataset = tf.data.Dataset.from_tensors(label_np)
    return tf.data.Dataset.zip((feature_dataset, label_dataset,))

  return input_fn, feature_names, bucketized_columns
def make_eval_inputs_from_np_arrays(features_np, label_np):
  """Builds an eval input_fn that streams the data in batches of 1000.

  Args:
    features_np: A numpy ndarray whose second dimension indexes features.
    label_np: A numpy ndarray of labels, sliced in step with the features.

  Returns:
    A function returning a batched Dataset of (feature dict, label).
  """
  feature_count = features_np.shape[1]
  per_feature_arrays = np.split(features_np, feature_count, axis=1)
  # 1-based feature names.
  feature_names = [
      "feature_%02d" % position for position in range(1, feature_count + 1)]

  def input_fn():
    features = {}
    for name, values in zip(feature_names, per_feature_arrays):
      features[name] = tf.constant(values)
    feature_dataset = tf.data.Dataset.from_tensor_slices(features)
    label_dataset = tf.data.Dataset.from_tensor_slices(label_np)
    zipped = tf.data.Dataset.zip((feature_dataset, label_dataset,))
    return zipped.batch(1000)

  return input_fn
def _make_csv_serving_input_receiver_fn(column_names, column_defaults):
  """Builds a serving_input_receiver_fn that parses csv lines.

  The input arguments are relevant to `tf.decode_csv()`.

  Args:
    column_names: a list of column names in the order within input csv.
    column_defaults: a list of default values with the same size of
      column_names. Each entity must be either a list of one scalar, or an
      empty list to denote the corresponding column is required.
      e.g. [[""], [2.5], []] indicates the third column is required while
      the first column must be string and the second must be float/double.

  Returns:
    a serving_input_receiver_fn that handles csv for serving.
  """

  def serving_input_receiver_fn():
    # A batch of raw csv lines, exposed to clients under the "inputs" key.
    csv = tf.placeholder(dtype=tf.string, shape=[None], name="csv")
    parsed_columns = tf.decode_csv(csv, column_defaults)
    features = {
        name: column for name, column in zip(column_names, parsed_columns)}
    return tf.estimator.export.ServingInputReceiver(features, {"inputs": csv})

  return serving_input_receiver_fn
def train_boosted_trees(flags_obj):
  """Trains, evaluates and optionally exports boosted trees on HIGGS data.

  The model directory is deleted first if it exists. After in-memory training
  the classifier is evaluated, the results are logged through the benchmark
  logger, and a SavedModel is exported when `export_dir` is set.

  Args:
    flags_obj: An object containing parsed flag values.
  """
  # Always start from an empty model directory.
  model_dir = flags_obj.model_dir
  if tf.gfile.Exists(model_dir):
    tf.gfile.DeleteRecursively(model_dir)

  tf.logging.info("## Data loading...")
  train_data, eval_data = read_higgs_data(
      flags_obj.data_dir,
      flags_obj.train_start,
      flags_obj.train_count,
      flags_obj.eval_start,
      flags_obj.eval_count)
  tf.logging.info("## Data loaded; train: {}{}, eval: {}{}".format(
      train_data.dtype, train_data.shape, eval_data.dtype, eval_data.shape))

  # Column 0 is the label; the remaining 28 columns are the features.
  train_labels = train_data[:, 0:1]
  train_features = train_data[:, 1:]
  train_input_fn, feature_names, feature_columns = make_inputs_from_np_arrays(
      features_np=train_features, label_np=train_labels)
  eval_labels = eval_data[:, 0:1]
  eval_features = eval_data[:, 1:]
  eval_input_fn = make_eval_inputs_from_np_arrays(
      features_np=eval_features, label_np=eval_labels)
  tf.logging.info("## Features prepared. Training starts...")

  # Record the run configuration with the benchmark logger.
  logged_flag_names = (
      "train_start",
      "train_count",
      "eval_start",
      "eval_count",
      "n_trees",
      "max_depth",
  )
  run_params = {name: getattr(flags_obj, name) for name in logged_flag_names}
  benchmark_logger = logger.config_benchmark_logger(flags_obj)
  benchmark_logger.log_run_info(
      model_name="boosted_trees",
      dataset_name="higgs",
      run_params=run_params,
      test_id=flags_obj.benchmark_test_id)

  # Though BoostedTreesClassifier is under tf.estimator, faster in-memory
  # training is yet provided as a contrib library.
  from tensorflow.contrib import estimator as contrib_estimator  # pylint: disable=g-import-not-at-top
  classifier = contrib_estimator.boosted_trees_classifier_train_in_memory(
      train_input_fn,
      feature_columns,
      model_dir=flags_obj.model_dir or None,
      n_trees=flags_obj.n_trees,
      max_depth=flags_obj.max_depth,
      learning_rate=flags_obj.learning_rate)

  # Evaluate and hand the metrics to the benchmark logger.
  eval_results = classifier.evaluate(eval_input_fn)
  benchmark_logger.log_evaluation_result(eval_results)

  # Export a SavedModel that parses csv lines, if requested.
  export_dir = flags_obj.export_dir
  if export_dir is not None:
    # columns are all floats.
    float_defaults = [[0.0]] * len(feature_names)
    serving_fn = _make_csv_serving_input_receiver_fn(
        column_names=feature_names, column_defaults=float_defaults)
    classifier.export_savedmodel(
        export_dir, serving_fn, strip_default_attrs=True)
def main(_):
  """absl entry point: trains with the globally registered flag values."""
  parsed_flags = flags.FLAGS
  train_boosted_trees(parsed_flags)
def define_train_higgs_flags():
  """Registers base, benchmark, data-range and tree flags for this script."""
  flags_core.define_base(clean=False, stop_threshold=False, batch_size=False,
                         num_gpu=False, export_dir=True)
  flags_core.define_benchmark()
  flags.adopt_module_key_flags(flags_core)

  # (name, default, help text) for every integer flag, in definition order.
  integer_flags = (
      ("train_start", 0, "Start index of train examples within the data."),
      ("train_count", 1000000, "Number of train examples within the data."),
      ("eval_start", 10000000,
       "Start index of eval examples within the data."),
      ("eval_count", 1000000, "Number of eval examples within the data."),
      ("n_trees", 100, "Number of trees to build."),
      ("max_depth", 6, "Maximum depths of each tree."),
  )
  for flag_name, flag_default, flag_help in integer_flags:
    flags.DEFINE_integer(
        flag_name, default=flag_default, help=help_wrap(flag_help))

  flags.DEFINE_float(
      "learning_rate", default=0.1, help=help_wrap("The learning rate."))

  flags_core.set_defaults(data_dir="/tmp/higgs_data",
                          model_dir="/tmp/higgs_model")
if __name__ == "__main__":
  define_train_higgs_flags()
  # Training progress and eval results are emitted at INFO level, so make
  # that level visible before running.
  tf.logging.set_verbosity(tf.logging.INFO)
  absl_app.run(main)
@@ -0,0 +1,152 @@
# ResNet in TensorFlow
Deep residual networks, or ResNets for short, provided the breakthrough idea of
identity mappings in order to enable training of very deep convolutional neural
networks. This folder contains an implementation of ResNet for the ImageNet
dataset written in TensorFlow.
See the following papers for more background:
[1] [Deep Residual Learning for Image Recognition](https://arxiv.org/pdf/1512.03385.pdf) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Dec 2015.
[2] [Identity Mappings in Deep Residual Networks](https://arxiv.org/pdf/1603.05027.pdf) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Jul 2016.
In code, v1 refers to the ResNet defined in [1] but where a stride 2 is used on
the 3x3 conv rather than the first 1x1 in the bottleneck. This change results
in higher and more stable accuracy with fewer epochs than the original v1 and has
been shown to scale to higher batch sizes with minimal degradation in accuracy.
There is no originating paper. The first mention we are aware of was in the
torch version of [ResNetv1](https://github.com/facebook/fb.resnet.torch). Most
popular v1 implementations are this implementation which we call ResNetv1.5.
In testing we found v1.5 requires ~12% more compute to train and has 6% reduced
throughput for inference compared to ResNetv1. CIFAR-10 ResNet does not use the
bottleneck and is thus the same for v1 as v1.5.
v2 refers to [2]. The principal difference between the two versions is that v1
applies batch normalization and activation after convolution, while v2 applies
batch normalization, then activation, and finally convolution. A schematic
comparison is presented in Figure 1 (left) of [2].
Please proceed according to which dataset you would like to train/evaluate on:
## CIFAR-10
### Setup
You need to have the latest version of TensorFlow installed.
First, make sure [the models folder is in your Python path](/official/#running-the-models); otherwise you may encounter `ImportError: No module named official.resnet`.
Then, download and extract the CIFAR-10 data from Alex Krizhevsky's website, specifying the location with the `--data_dir` flag. Run the following:
```bash
python cifar10_download_and_extract.py --data_dir <DATA_DIR>
```
Then, to train the model:
```bash
python cifar10_main.py --data_dir <DATA_DIR>/cifar-10-batches-bin --model_dir <MODEL_DIR>
```
Use `--data_dir` to specify the location of the CIFAR-10 data used in the previous step. There are more flag options as described in `cifar10_main.py`.
To export a `SavedModel` from the trained checkpoint:
```bash
python cifar10_main.py --data_dir <DATA_DIR>/cifar-10-batches-bin --model_dir <MODEL_DIR> --eval_only --export_dir <EXPORT_DIR>
```
Note: The `<EXPORT_DIR>` must be present. You might want to run `mkdir <EXPORT_DIR>` beforehand.
The `SavedModel` can then be [loaded](https://www.tensorflow.org/guide/saved_model#loading_a_savedmodel_in_python) in order to use the ResNet for prediction.
## ImageNet
### Setup
To begin, you will need to download the ImageNet dataset and convert it to
TFRecord format. The following [script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py)
and [README](https://github.com/tensorflow/tpu/tree/master/tools/datasets#imagenet_to_gcspy)
provide a few options.
Once your dataset is ready, you can begin training the model as follows:
```bash
python imagenet_main.py --data_dir=/path/to/imagenet
```
The model will begin training and will automatically evaluate itself on the
validation data roughly once per epoch.
Note that there are a number of other options you can specify, including
`--model_dir` to choose where to store the model and `--resnet_size` to choose
the model size (options include ResNet-18 through ResNet-200). See
[`resnet_run_loop.py`](resnet_run_loop.py) for the full list of options.
## Compute Devices
Training is accomplished using the [DistributionStrategies API](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/distribute/README.md).
The appropriate distribution strategy is chosen based on the `--num_gpus` flag.
By default this flag is one if TensorFlow is compiled with CUDA, and zero
otherwise.
num_gpus:
+ 0: Use OneDeviceStrategy and train on CPU.
+ 1: Use OneDeviceStrategy and train on GPU.
+ 2+: Use MirroredStrategy (data parallelism) to distribute a batch between devices.
### Pre-trained model
You can download pre-trained versions of ResNet-50. Reported accuracies are top-1 single-crop accuracy for the ImageNet validation set.
Models are reported as both checkpoints produced by Estimator during training, and as SavedModels which are more portable. Checkpoints are fragile,
and these are not guaranteed to work with future versions of the code. Both ResNet v1
and ResNet v2 have been trained in both fp16 and fp32 precision. (Here v1 refers to "v1.5". See the note above.) Furthermore, SavedModels
are generated to accept either tensor or JPG inputs, and with channels_first (NCHW) and channels_last (NHWC) convolutions. NCHW is generally
better for GPUs, while NHWC is generally better for CPUs. See the TensorFlow [performance guide](https://www.tensorflow.org/performance/performance_guide#data_formats)
for more details.
ResNet-50 v2 (fp32, Accuracy 76.47%):
* [Checkpoint](http://download.tensorflow.org/models/official/20181001_resnet/checkpoints/resnet_imagenet_v2_fp32_20181001.tar.gz)
* SavedModel [(NCHW)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v2_fp32_savedmodel_NCHW.tar.gz),
[(NCHW, JPG)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v2_fp32_savedmodel_NCHW_jpg.tar.gz),
[(NHWC)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v2_fp32_savedmodel_NHWC.tar.gz),
[(NHWC, JPG)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v2_fp32_savedmodel_NHWC_jpg.tar.gz)
ResNet-50 v2 (fp16, Accuracy 76.56%):
* [Checkpoint](http://download.tensorflow.org/models/official/20181001_resnet/checkpoints/resnet_imagenet_v2_fp16_20180928.tar.gz)
* SavedModel [(NCHW)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v2_fp16_savedmodel_NCHW.tar.gz),
[(NCHW, JPG)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v2_fp16_savedmodel_NCHW_jpg.tar.gz),
[(NHWC)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v2_fp16_savedmodel_NHWC.tar.gz),
[(NHWC, JPG)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v2_fp16_savedmodel_NHWC_jpg.tar.gz)
ResNet-50 v1 (fp32, Accuracy 76.53%):
* [Checkpoint](http://download.tensorflow.org/models/official/20181001_resnet/checkpoints/resnet_imagenet_v1_fp32_20181001.tar.gz)
* SavedModel [(NCHW)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v1_fp32_savedmodel_NCHW.tar.gz),
[(NCHW, JPG)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v1_fp32_savedmodel_NCHW_jpg.tar.gz),
[(NHWC)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v1_fp32_savedmodel_NHWC.tar.gz),
[(NHWC, JPG)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v1_fp32_savedmodel_NHWC_jpg.tar.gz)
ResNet-50 v1 (fp16, Accuracy 76.18%):
* [Checkpoint](http://download.tensorflow.org/models/official/20181001_resnet/checkpoints/resnet_imagenet_v1_fp16_20181001.tar.gz)
* SavedModel [(NCHW)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v1_fp16_savedmodel_NCHW.tar.gz),
[(NCHW, JPG)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v1_fp16_savedmodel_NCHW_jpg.tar.gz),
[(NHWC)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v1_fp16_savedmodel_NHWC.tar.gz),
[(NHWC, JPG)](http://download.tensorflow.org/models/official/20181001_resnet/savedmodels/resnet_v1_fp16_savedmodel_NHWC_jpg.tar.gz)
### Transfer Learning
You can use a pretrained model to initialize a training process. In addition you are able to freeze all but the final fully connected layers to fine tune your model. Transfer Learning is useful when training on your own small datasets. For a brief look at transfer learning in the context of convolutional neural networks, we recommend reading these [short notes](http://cs231n.github.io/transfer-learning/).
To fine tune a pretrained resnet you must make three changes to your training procedure:
1) Build the exact same model as previously except we change the number of labels in the final classification layer.
2) Restore all weights from the pre-trained resnet except for the final classification layer; this will get randomly initialized instead.
3) Freeze earlier layers of the network
We can perform these three operations by specifying two flags: ```--pretrained_model_checkpoint_path``` and ```--fine_tune```. The first flag is a string that points to the path of a pre-trained resnet model. If this flag is specified, all weights except those of the final classification layer are loaded from that checkpoint. A key thing to note: if both ```--pretrained_model_checkpoint_path``` and a non-empty ```model_dir``` directory are passed, the TensorFlow Estimator will load only from the ```model_dir```. For more on this please see [WarmStartSettings](https://www.tensorflow.org/versions/master/api_docs/python/tf/estimator/WarmStartSettings) and [Estimators](https://www.tensorflow.org/guide/estimators).
The second flag ```--fine_tune``` is a boolean that indicates whether earlier layers of the network should be frozen. You may set this flag to false if you wish to continue training a pre-trained model from a checkpoint. If you set this flag to true, you can train a new classification layer from scratch.
@@ -0,0 +1,499 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes Estimator benchmarks and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
from absl import flags
from absl.testing import flagsaver
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.r1.resnet import cifar10_main as cifar_main
from official.r1.resnet import imagenet_main
from official.utils.flags import core as flags_core
from official.utils.logs import hooks
IMAGENET_DATA_DIR_NAME = 'imagenet'
CIFAR_DATA_DIR_NAME = 'cifar-10-batches-bin'
FLAGS = flags.FLAGS
class EstimatorBenchmark(tf.test.Benchmark):
  """Base class to hold methods common to test classes in the module.

  Code under test for Estimator models (ResNet50 and 56) report mostly the
  same data and require the same FLAG setup.
  """

  # Snapshot of flag values captured the first time `_setup` runs and restored
  # on every later call. `_setup` always reads and writes this attribute on
  # `EstimatorBenchmark` itself, so the snapshot is shared by all subclasses.
  local_flags = None

  def __init__(self, output_dir=None, default_flags=None, flag_methods=None):
    """Stores the output location and the flag configuration for the tests.

    Args:
      output_dir: directory under which model directories are created; falls
        back to '/tmp' when empty or None.
      default_flags: mapping of flag name to value applied once, right after
        the flags are first defined.
      flag_methods: iterable of zero-argument callables that define the flags
        needed by the benchmark.
    """
    if not output_dir:
      output_dir = '/tmp'
    self.output_dir = output_dir
    self.default_flags = default_flags or {}
    self.flag_methods = flag_methods or {}

  def _get_model_dir(self, folder_name):
    """Returns directory to store info, e.g. saved model and event log."""
    return os.path.join(self.output_dir, folder_name)

  def _setup(self):
    """Sets up and resets flags before each test."""
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    if EstimatorBenchmark.local_flags is None:
      # First call in this process: define the flags, apply the class-level
      # defaults and remember the resulting values.
      for flag_method in self.flag_methods:
        flag_method()
      # Loads flags to get defaults to then override. List cannot be empty.
      flags.FLAGS(['foo'])
      # Overrides flag values with defaults for the class of tests.
      for k, v in self.default_flags.items():
        setattr(FLAGS, k, v)
      saved_flag_values = flagsaver.save_flag_values()
      EstimatorBenchmark.local_flags = saved_flag_values
    else:
      # Later calls: roll every flag back to the saved snapshot.
      flagsaver.restore_flag_values(EstimatorBenchmark.local_flags)

  def _report_benchmark(self,
                        stats,
                        wall_time_sec,
                        top_1_max=None,
                        top_1_min=None):
    """Report benchmark results by writing to local protobuf file.

    Args:
      stats: dict returned from estimator models with known entries. The
        'train_hooks' and 'eval_results' entries are read here.
      wall_time_sec: the duration of the benchmark execution in seconds.
      top_1_max: highest passing level for top_1 accuracy.
      top_1_min: lowest passing level for top_1 accuracy.
    """
    # Use the first ExamplesPerSecondHook among the training hooks, if any.
    examples_per_sec_hook = None
    for hook in stats['train_hooks']:
      if isinstance(hook, hooks.ExamplesPerSecondHook):
        examples_per_sec_hook = hook
        break

    eval_results = stats['eval_results']
    metrics = []
    if 'accuracy' in eval_results:
      metrics.append({'name': 'accuracy_top_1',
                      'value': float(eval_results['accuracy']),
                      'min_value': top_1_min,
                      'max_value': top_1_max})
    if 'accuracy_top_5' in eval_results:
      metrics.append({'name': 'accuracy_top_5',
                      'value': float(eval_results['accuracy_top_5'])})

    if examples_per_sec_hook:
      exp_per_second_list = examples_per_sec_hook.current_examples_per_sec_list
      # The hook may have recorded no samples at all (the original comment
      # notes that it skips warm-up steps). Omit the metric in that case
      # instead of dividing by zero.
      if exp_per_second_list:
        exp_per_sec = sum(exp_per_second_list) / len(exp_per_second_list)
        metrics.append({'name': 'exp_per_second',
                        'value': exp_per_sec})

    flags_str = flags_core.get_nondefault_flags_as_str()
    self.report_benchmark(
        iters=eval_results.get('global_step', None),
        wall_time=wall_time_sec,
        metrics=metrics,
        extras={'flags': flags_str})
class Resnet50EstimatorAccuracy(EstimatorBenchmark):
  """Benchmark accuracy tests for ResNet50 w/ Estimator.

  Every benchmark resets the flags through `_setup`, overrides the ones it
  needs, runs `imagenet_main.run_imagenet` and reports the result with top-1
  accuracy bounds of 0.762 (min) and 0.766 (max).
  """

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    """Benchmark accuracy tests for ResNet50 w/ Estimator.

    Args:
      output_dir: directory where to output e.g. log files
      root_data_dir: directory under which to look for dataset
      **kwargs: arbitrary named arguments. This is needed to make the
        constructor forward compatible in case PerfZero provides more
        named arguments before updating the constructor.
    """
    flag_methods = [imagenet_main.define_imagenet_flags]

    # NOTE(review): root_data_dir defaults to None, which os.path.join does
    # not accept; callers are presumably expected to always pass it.
    self.data_dir = os.path.join(root_data_dir, IMAGENET_DATA_DIR_NAME)
    super(Resnet50EstimatorAccuracy, self).__init__(
        output_dir=output_dir, flag_methods=flag_methods)

  def benchmark_graph_8_gpu(self):
    """Test 8 GPUs graph mode (fp32, batch_size 128 * 8, 90 epochs)."""
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.data_dir = self.data_dir
    FLAGS.batch_size = 128 * 8
    FLAGS.train_epochs = 90
    FLAGS.epochs_between_evals = 10
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
    FLAGS.dtype = 'fp32'
    FLAGS.hooks = ['ExamplesPerSecondHook']
    self._run_and_report_benchmark()

  def benchmark_graph_fp16_8_gpu(self):
    """Test FP16 8 GPUs graph mode (batch_size 256 * 8, 90 epochs)."""
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.data_dir = self.data_dir
    FLAGS.batch_size = 256 * 8
    FLAGS.train_epochs = 90
    FLAGS.epochs_between_evals = 10
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_8_gpu')
    FLAGS.dtype = 'fp16'
    FLAGS.hooks = ['ExamplesPerSecondHook']
    self._run_and_report_benchmark()

  def benchmark_graph_fp16_graph_rewrite_8_gpu(self):
    """Test FP16 graph rewrite 8 GPUs graph mode."""
    self._setup()
    FLAGS.num_gpus = 8
    FLAGS.data_dir = self.data_dir
    FLAGS.batch_size = 256 * 8
    FLAGS.train_epochs = 90
    FLAGS.epochs_between_evals = 10
    FLAGS.model_dir = self._get_model_dir(
        'benchmark_graph_fp16_graph_rewrite_8_gpu')
    # NOTE(review): dtype is assigned before fp16_implementation; presumably a
    # flag validator couples the two -- confirm before reordering.
    FLAGS.dtype = 'fp16'
    FLAGS.fp16_implementation = 'graph_rewrite'
    FLAGS.hooks = ['ExamplesPerSecondHook']
    self._run_and_report_benchmark()

  def _run_and_report_benchmark(self):
    """Runs ImageNet training, times it and reports with accuracy bounds."""
    start_time_sec = time.time()
    stats = imagenet_main.run_imagenet(flags.FLAGS)
    wall_time_sec = time.time() - start_time_sec

    self._report_benchmark(stats,
                           wall_time_sec,
                           top_1_min=0.762,
                           top_1_max=0.766)
class Resnet50EstimatorBenchmarkBase(EstimatorBenchmark):
  """Shared plumbing for the ResNet50 Estimator throughput benchmarks."""

  local_flags = None

  def __init__(self, output_dir=None, default_flags=None):
    """Registers the ImageNet flag definitions with the base benchmark.

    Args:
      output_dir: directory where to output e.g. log files.
      default_flags: flag overrides forwarded unchanged to the base class.
    """
    super(Resnet50EstimatorBenchmarkBase, self).__init__(
        output_dir=output_dir,
        default_flags=default_flags,
        flag_methods=[imagenet_main.define_imagenet_flags])

  def _run_and_report_benchmark(self):
    """Times one ImageNet run and reports it without accuracy metrics."""
    began_at = time.time()
    stats = imagenet_main.run_imagenet(FLAGS)
    elapsed_sec = time.time() - began_at
    print(stats)

    # Strip the accuracy entries so the report does not trigger an accuracy
    # check for these throughput-only runs.
    eval_results = stats['eval_results']
    for accuracy_key in ('accuracy', 'accuracy_top_5'):
      eval_results.pop(accuracy_key, None)

    self._report_benchmark(stats, elapsed_sec)
class Resnet50EstimatorBenchmark(Resnet50EstimatorBenchmarkBase):
  """Benchmarks for ResNet50 using Estimator with 1 worker.

  Each benchmark resets the flags through `_setup`, overrides the ones that
  define its configuration and then calls `_run_and_report_benchmark`. The
  "tweaked" variants additionally set `tf_gpu_thread_mode` to 'gpu_private'
  and `intra_op_parallelism_threads` to 1.
  """

  def __init__(self, output_dir=None, default_flags=None):
    """Forwards both arguments unchanged to the base class constructor."""
    super(Resnet50EstimatorBenchmark, self).__init__(
        output_dir=output_dir,
        default_flags=default_flags)

  def benchmark_graph_fp16_1_gpu(self):
    """Benchmarks graph fp16 1 gpu (batch_size 128)."""
    self._setup()

    FLAGS.num_gpus = 1
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_1_gpu')
    FLAGS.batch_size = 128
    FLAGS.dtype = 'fp16'
    FLAGS.hooks = ['ExamplesPerSecondHook']
    self._run_and_report_benchmark()

  def benchmark_graph_fp16_1_gpu_tweaked(self):
    """Benchmarks graph fp16 1 gpu tweaked (batch_size 256)."""
    self._setup()

    FLAGS.num_gpus = 1
    FLAGS.tf_gpu_thread_mode = 'gpu_private'
    FLAGS.intra_op_parallelism_threads = 1
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_1_gpu_tweaked')
    FLAGS.batch_size = 256
    FLAGS.dtype = 'fp16'
    FLAGS.hooks = ['ExamplesPerSecondHook']
    self._run_and_report_benchmark()

  def benchmark_graph_fp16_graph_rewrite_1_gpu_tweaked(self):
    """Benchmarks graph fp16 graph rewrite 1 gpu tweaked."""
    self._setup()

    FLAGS.num_gpus = 1
    FLAGS.tf_gpu_thread_mode = 'gpu_private'
    FLAGS.intra_op_parallelism_threads = 1
    FLAGS.model_dir = self._get_model_dir(
        'benchmark_graph_fp16_graph_rewrite_1_gpu_tweaked')
    FLAGS.batch_size = 256
    # NOTE(review): dtype is assigned before fp16_implementation; presumably a
    # flag validator couples the two -- confirm before reordering.
    FLAGS.dtype = 'fp16'
    FLAGS.fp16_implementation = 'graph_rewrite'
    FLAGS.hooks = ['ExamplesPerSecondHook']
    self._run_and_report_benchmark()

  def benchmark_graph_1_gpu(self):
    """Benchmarks graph 1 gpu (fp32, batch_size 128)."""
    self._setup()

    FLAGS.num_gpus = 1
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
    FLAGS.batch_size = 128
    FLAGS.dtype = 'fp32'
    FLAGS.hooks = ['ExamplesPerSecondHook']
    self._run_and_report_benchmark()

  def benchmark_graph_8_gpu(self):
    """Benchmarks graph 8 gpus (fp32, batch_size 128*8)."""
    self._setup()

    FLAGS.num_gpus = 8
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_8_gpu')
    FLAGS.batch_size = 128*8
    FLAGS.dtype = 'fp32'
    FLAGS.hooks = ['ExamplesPerSecondHook']
    self._run_and_report_benchmark()

  def benchmark_graph_fp16_8_gpu(self):
    """Benchmarks graph fp16 8 gpus (batch_size 256*8)."""
    self._setup()

    FLAGS.num_gpus = 8
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_8_gpu')
    FLAGS.batch_size = 256*8
    FLAGS.dtype = 'fp16'
    FLAGS.hooks = ['ExamplesPerSecondHook']
    self._run_and_report_benchmark()

  def benchmark_graph_fp16_8_gpu_tweaked(self):
    """Benchmarks graph fp16 8 gpus tweaked (batch_size 256*8)."""
    self._setup()

    FLAGS.num_gpus = 8
    FLAGS.tf_gpu_thread_mode = 'gpu_private'
    FLAGS.intra_op_parallelism_threads = 1
    FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_8_gpu_tweaked')
    FLAGS.batch_size = 256*8
    FLAGS.dtype = 'fp16'
    FLAGS.hooks = ['ExamplesPerSecondHook']
    self._run_and_report_benchmark()

  def benchmark_graph_fp16_graph_rewrite_8_gpu_tweaked(self):
    """Benchmarks graph fp16 graph rewrite 8 gpus tweaked."""
    self._setup()

    FLAGS.num_gpus = 8
    FLAGS.tf_gpu_thread_mode = 'gpu_private'
    FLAGS.intra_op_parallelism_threads = 1
    FLAGS.model_dir = self._get_model_dir(
        'benchmark_graph_fp16_graph_rewrite_8_gpu_tweaked')
    FLAGS.batch_size = 256*8
    # NOTE(review): dtype is assigned before fp16_implementation; presumably a
    # flag validator couples the two -- confirm before reordering.
    FLAGS.dtype = 'fp16'
    FLAGS.fp16_implementation = 'graph_rewrite'
    FLAGS.hooks = ['ExamplesPerSecondHook']
    self._run_and_report_benchmark()
class Resnet50EstimatorBenchmarkSynth(Resnet50EstimatorBenchmark):
  """ResNet50 Estimator benchmarks that run on synthetic input data."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    """Configures a short synthetic-data run.

    Args:
      output_dir: directory where to output e.g. log files.
      root_data_dir: accepted for interface compatibility; not used here.
      **kwargs: extra named arguments, accepted and ignored.
    """
    synthetic_defaults = {
        'use_synthetic_data': True,
        'max_train_steps': 110,
        'train_epochs': 1,
    }
    super(Resnet50EstimatorBenchmarkSynth, self).__init__(
        output_dir=output_dir, default_flags=synthetic_defaults)
class Resnet50EstimatorBenchmarkReal(Resnet50EstimatorBenchmark):
  """ResNet50 Estimator benchmarks that read the real ImageNet data."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    """Configures a short run against the ImageNet directory.

    Args:
      output_dir: directory where to output e.g. log files.
      root_data_dir: directory under which the ImageNet data directory lives.
      **kwargs: extra named arguments, accepted and ignored.
    """
    real_data_defaults = {
        'data_dir': os.path.join(root_data_dir, IMAGENET_DATA_DIR_NAME),
        'max_train_steps': 110,
        'train_epochs': 1,
    }
    super(Resnet50EstimatorBenchmarkReal, self).__init__(
        output_dir=output_dir, default_flags=real_data_defaults)
class Resnet50MultiWorkerEstimatorBenchmark(Resnet50EstimatorBenchmarkBase):
  """Benchmarks for ResNet50 using Estimator with multiple workers.

  Both benchmarks select the 'multi_worker_mirrored' distribution strategy on
  8 GPUs with fp16 and differ only in the `all_reduce_alg` flag ('ring' or
  'nccl') and the model directory name.
  """

  def __init__(self, output_dir=None, default_flags=None):
    """Forwards both arguments unchanged to the base class constructor."""
    super(Resnet50MultiWorkerEstimatorBenchmark, self).__init__(
        output_dir=output_dir,
        default_flags=default_flags)

  def benchmark_graph_fp16_8_gpu_ring_tweaked(self):
    """Benchmarks graph fp16 8 gpus with ring collective tweaked."""
    self._setup()

    FLAGS.num_gpus = 8
    FLAGS.distribution_strategy = 'multi_worker_mirrored'
    FLAGS.all_reduce_alg = 'ring'
    FLAGS.tf_gpu_thread_mode = 'gpu_private'
    FLAGS.intra_op_parallelism_threads = 1
    FLAGS.datasets_num_private_threads = 32
    FLAGS.model_dir = self._get_model_dir(
        folder_name='benchmark_graph_fp16_8_gpu_ring_tweaked')
    FLAGS.batch_size = 256*8
    FLAGS.dtype = 'fp16'
    FLAGS.hooks = ['ExamplesPerSecondHook']
    self._run_and_report_benchmark()

  def benchmark_graph_fp16_8_gpu_nccl_tweaked(self):
    """Benchmarks graph fp16 8 gpus with nccl collective tweaked."""
    self._setup()

    FLAGS.num_gpus = 8
    FLAGS.distribution_strategy = 'multi_worker_mirrored'
    FLAGS.all_reduce_alg = 'nccl'
    FLAGS.tf_gpu_thread_mode = 'gpu_private'
    FLAGS.intra_op_parallelism_threads = 1
    FLAGS.datasets_num_private_threads = 32
    FLAGS.model_dir = self._get_model_dir(
        folder_name='benchmark_graph_fp16_8_gpu_nccl_tweaked')
    FLAGS.batch_size = 256*8
    FLAGS.dtype = 'fp16'
    FLAGS.hooks = ['ExamplesPerSecondHook']
    self._run_and_report_benchmark()
class Resnet50MultiWorkerEstimatorBenchmarkSynth(
    Resnet50MultiWorkerEstimatorBenchmark):
  """Multi-worker ResNet50 Estimator benchmarks on synthetic input data."""

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    """Configures a short synthetic-data run.

    Args:
      output_dir: directory where to output e.g. log files.
      root_data_dir: accepted for interface compatibility; not used here.
      **kwargs: extra named arguments, accepted and ignored.
    """
    synthetic_defaults = {
        'use_synthetic_data': True,
        'max_train_steps': 110,
        'train_epochs': 1,
    }
    super(Resnet50MultiWorkerEstimatorBenchmarkSynth, self).__init__(
        output_dir=output_dir, default_flags=synthetic_defaults)
class Resnet56EstimatorAccuracy(EstimatorBenchmark):
  """Accuracy tests for Estimator ResNet56.

  Every benchmark resets the flags through `_setup`, overrides the ones it
  needs, runs `cifar_main.run_cifar` and reports the result with top-1
  accuracy bounds of 0.926 (min) and 0.938 (max).
  """

  # NOTE(review): `_setup` in the base class reads and writes
  # `EstimatorBenchmark.local_flags`, so this class-level attribute does not
  # appear to be consulted there -- confirm whether it is still needed.
  local_flags = None

  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
    """A benchmark class.

    Args:
      output_dir: directory where to output e.g. log files
      root_data_dir: directory under which to look for dataset
      **kwargs: arbitrary named arguments. This is needed to make the
        constructor forward compatible in case PerfZero provides more
        named arguments before updating the constructor.
    """
    flag_methods = [cifar_main.define_cifar_flags]

    # NOTE(review): root_data_dir defaults to None, which os.path.join does
    # not accept; callers are presumably expected to always pass it.
    self.data_dir = os.path.join(root_data_dir, CIFAR_DATA_DIR_NAME)
    super(Resnet56EstimatorAccuracy, self).__init__(
        output_dir=output_dir, flag_methods=flag_methods)

  def benchmark_graph_1_gpu(self):
    """Test layers model with Estimator and distribution strategies."""
    self._setup()
    flags.FLAGS.num_gpus = 1
    flags.FLAGS.data_dir = self.data_dir
    flags.FLAGS.batch_size = 128
    flags.FLAGS.train_epochs = 182
    flags.FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
    flags.FLAGS.resnet_size = 56
    flags.FLAGS.dtype = 'fp32'
    flags.FLAGS.hooks = ['ExamplesPerSecondHook']
    self._run_and_report_benchmark()

  def benchmark_graph_fp16_1_gpu(self):
    """Test layers FP16 model with Estimator and distribution strategies."""
    self._setup()
    flags.FLAGS.num_gpus = 1
    flags.FLAGS.data_dir = self.data_dir
    flags.FLAGS.batch_size = 128
    flags.FLAGS.train_epochs = 182
    flags.FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_1_gpu')
    flags.FLAGS.resnet_size = 56
    flags.FLAGS.dtype = 'fp16'
    flags.FLAGS.hooks = ['ExamplesPerSecondHook']
    self._run_and_report_benchmark()

  def benchmark_graph_2_gpu(self):
    """Test layers model with Estimator and dist_strat. 2 GPUs."""
    self._setup()
    flags.FLAGS.num_gpus = 2
    flags.FLAGS.data_dir = self.data_dir
    flags.FLAGS.batch_size = 128
    flags.FLAGS.train_epochs = 182
    flags.FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
    flags.FLAGS.resnet_size = 56
    flags.FLAGS.dtype = 'fp32'
    flags.FLAGS.hooks = ['ExamplesPerSecondHook']
    self._run_and_report_benchmark()

  def benchmark_graph_fp16_2_gpu(self):
    """Test layers FP16 model with Estimator and dist_strat. 2 GPUs."""
    self._setup()
    flags.FLAGS.num_gpus = 2
    flags.FLAGS.data_dir = self.data_dir
    flags.FLAGS.batch_size = 128
    flags.FLAGS.train_epochs = 182
    flags.FLAGS.model_dir = self._get_model_dir('benchmark_graph_fp16_2_gpu')
    flags.FLAGS.resnet_size = 56
    flags.FLAGS.dtype = 'fp16'
    flags.FLAGS.hooks = ['ExamplesPerSecondHook']
    self._run_and_report_benchmark()

  def unit_test(self):
    """A lightweight test that can finish quickly.

    Uses a single training epoch and resnet_size 8 instead of the 182 epochs
    and resnet_size 56 used by the benchmarks in this class.
    """
    self._setup()
    flags.FLAGS.num_gpus = 1
    flags.FLAGS.data_dir = self.data_dir
    flags.FLAGS.batch_size = 128
    flags.FLAGS.train_epochs = 1
    flags.FLAGS.model_dir = self._get_model_dir('unit_test')
    flags.FLAGS.resnet_size = 8
    flags.FLAGS.dtype = 'fp32'
    flags.FLAGS.hooks = ['ExamplesPerSecondHook']
    self._run_and_report_benchmark()

  def _run_and_report_benchmark(self):
    """Executes benchmark and reports result."""
    start_time_sec = time.time()
    stats = cifar_main.run_cifar(flags.FLAGS)
    wall_time_sec = time.time() - start_time_sec

    self._report_benchmark(stats,
                           wall_time_sec,
                           top_1_min=0.926,
                           top_1_max=0.938)
@@ -0,0 +1,433 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs a ResNet model on the ImageNet dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import datetime
import time
sys.path.append(os.path.dirname(os.path.realpath(__file__)) + '/../../../')
# import pydevd_pycharm
# pydevd_pycharm.settrace('10.174.181.209', port=8008, stdoutToServer=True, stderrToServer=True)
from absl import app as absl_app
from absl import flags
from six.moves import range
import tensorflow as tf
from official.r1.resnet import imagenet_preprocessing
from official.r1.resnet import resnet_model
from official.r1.resnet import resnet_run_loop
from official.utils.flags import core as flags_core
from official.utils.logs import logger
import logging
############## npu modify begin #############
from npu_bridge.estimator import npu_ops
from hccl.manage.api import get_local_rank_id
from hccl.manage.api import get_rank_size
from hccl.manage.api import get_rank_id
from tensorflow.core.protobuf import rewriter_config_pb2
tf.compat.v1.logging.set_verbosity(tf.logging.INFO)
############## npu modify end ###############
DEFAULT_IMAGE_SIZE = 224
NUM_CHANNELS = 3
NUM_CLASSES = 1001
NUM_IMAGES = {
'train': 1281167,
'validation': 50000,
}
_NUM_TRAIN_FILES = 1024
_SHUFFLE_BUFFER = 10000
DATASET_NAME = 'ImageNet'
#log_file1 = 'result/logger_resnet50.log'
#log_file1 = os.path.join(os.path.abspath(os.path.dirname(__file__)),'../../../../result/logger_resnet50.log')
###############################################################################
# Data processing
###############################################################################
def get_filenames(is_training, data_dir):
  """Builds the list of shard file paths for one dataset split.

  Args:
    is_training: truthy selects the training shards, falsy the validation
      shards.
    data_dir: directory that is joined in front of every shard file name.

  Returns:
    A list of paths: `_NUM_TRAIN_FILES` training shards, or 128 validation
    shards.
  """
  if is_training:
    pattern, num_shards = 'train-%05d-of-01024', _NUM_TRAIN_FILES
  else:
    pattern, num_shards = 'validation-%05d-of-00128', 128
  return [os.path.join(data_dir, pattern % shard_index)
          for shard_index in range(num_shards)]
def _parse_example_proto(example_serialized):
  """Extracts the encoded image, label and bounding boxes from an Example.

  The output of the build_image_data.py image preprocessing script is a dataset
  containing serialized Example protocol buffers. Each Example proto contains
  the following fields (values are included as examples):

    image/height: 462
    image/width: 581
    image/colorspace: 'RGB'
    image/channels: 3
    image/class/label: 615
    image/class/synset: 'n03623198'
    image/class/text: 'knee pad'
    image/object/bbox/xmin: 0.1
    image/object/bbox/xmax: 0.9
    image/object/bbox/ymin: 0.2
    image/object/bbox/ymax: 0.6
    image/object/bbox/label: 615
    image/format: 'JPEG'
    image/filename: 'ILSVRC2012_val_00041207.JPEG'
    image/encoded: <JPEG encoded string>

  Args:
    example_serialized: scalar Tensor tf.string containing a serialized
      Example protocol buffer.

  Returns:
    image_buffer: Tensor tf.string containing the contents of a JPEG file.
    label: Tensor tf.int32 containing the label.
    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
      where each coordinate is [0, 1) and the coordinates are arranged as
      [ymin, xmin, ymax, xmax].
  """
  # Fixed-length (dense) fields first, then the variable-length bounding-box
  # coordinates, which all share one sparse float32 spec.
  feature_spec = {
      'image/encoded': tf.io.FixedLenFeature([], dtype=tf.string,
                                             default_value=''),
      'image/class/label': tf.io.FixedLenFeature([], dtype=tf.int64,
                                                 default_value=-1),
      'image/class/text': tf.io.FixedLenFeature([], dtype=tf.string,
                                                default_value=''),
  }
  bbox_coord_spec = tf.io.VarLenFeature(dtype=tf.float32)
  feature_spec['image/object/bbox/xmin'] = bbox_coord_spec
  feature_spec['image/object/bbox/ymin'] = bbox_coord_spec
  feature_spec['image/object/bbox/xmax'] = bbox_coord_spec
  feature_spec['image/object/bbox/ymax'] = bbox_coord_spec

  parsed = tf.io.parse_single_example(serialized=example_serialized,
                                      features=feature_spec)
  label = tf.cast(parsed['image/class/label'], dtype=tf.int32)

  def _coordinate_row(feature_key):
    """Returns the values of one sparse coordinate with a leading axis."""
    return tf.expand_dims(parsed[feature_key].values, 0)

  xmin = _coordinate_row('image/object/bbox/xmin')
  ymin = _coordinate_row('image/object/bbox/ymin')
  xmax = _coordinate_row('image/object/bbox/xmax')
  ymax = _coordinate_row('image/object/bbox/ymax')

  # Rows are stacked y-before-x, i.e. [ymin, xmin, ymax, xmax].
  stacked_coords = tf.concat([ymin, xmin, ymax, xmax], 0)

  # Add a leading axis and swap the last two so the result is laid out as
  # [1, num_boxes, coords].
  bbox = tf.transpose(a=tf.expand_dims(stacked_coords, 0), perm=[0, 2, 1])

  return parsed['image/encoded'], label, bbox
def parse_record(raw_record, is_training, dtype):
  """Turns one serialized Example into a preprocessed image and its label.

  The record is split into encoded image, label and bounding boxes; the image
  is then run through `imagenet_preprocessing.preprocess_image` and cast to
  the requested dtype.

  Args:
    raw_record: scalar Tensor tf.string containing a serialized
      Example protocol buffer.
    is_training: A boolean denoting whether the input is for training.
    dtype: data type to use for images/features.

  Returns:
    Tuple of the processed image tensor (cast to `dtype`) and the label tensor
    as produced by `_parse_example_proto`.
  """
  encoded_image, label, bbox = _parse_example_proto(raw_record)

  # Preprocessing step.
  preprocessed = imagenet_preprocessing.preprocess_image(
      image_buffer=encoded_image,
      bbox=bbox,
      output_height=DEFAULT_IMAGE_SIZE,
      output_width=DEFAULT_IMAGE_SIZE,
      num_channels=NUM_CHANNELS,
      is_training=is_training)

  return tf.cast(preprocessed, dtype), label
def input_fn(is_training,
             data_dir,
             batch_size,
             num_epochs=1,
             dtype=tf.float32,
             datasets_num_private_threads=None,
             parse_record_fn=parse_record,
             input_context=None,
             drop_remainder=False,
             tf_data_experimental_slack=False):
  """Builds the batched train/eval dataset from the TFRecord files.

  Args:
    is_training: A boolean denoting whether the input is for training.
    data_dir: The directory containing the input data.
    batch_size: The number of samples per batch.
    num_epochs: The number of epochs to repeat the dataset.
    dtype: Data type to use for images/features
    datasets_num_private_threads: Number of private threads for tf.data.
    parse_record_fn: Function to use for parsing the records.
    input_context: A `tf.distribute.InputContext` object passed in by
      `tf.distribute.Strategy`. Only its truthiness is used here: when set,
      the file list is sharded by `get_rank_size()` / `get_rank_id()`.
    drop_remainder: A boolean indicates whether to drop the remainder of the
      batches. If True, the batch dimension will be static.
    tf_data_experimental_slack: Whether to enable tf.data's
      `experimental_slack` option.

  Returns:
    A dataset that can be used for iteration.
  """
  file_dataset = tf.data.Dataset.from_tensor_slices(
      get_filenames(is_training, data_dir))

  if input_context:
    ############## npu modify begin #############
    # Each rank reads its own slice of the file list.
    file_dataset = file_dataset.shard(get_rank_size(), get_rank_id())
    ############## npu modify end ###############

  if is_training:
    # Randomise the order in which the input files are visited.
    file_dataset = file_dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)

  # Expand file names into individual records. With cycle_length=10, up to 10
  # files are read and deserialized in parallel; a larger value may help on
  # machines with many CPU cores.
  record_dataset = file_dataset.interleave(
      tf.data.TFRecordDataset,
      cycle_length=10,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

  pipeline_args = dict(
      dataset=record_dataset,
      is_training=is_training,
      batch_size=batch_size,
      shuffle_buffer=_SHUFFLE_BUFFER,
      parse_record_fn=parse_record_fn,
      num_epochs=num_epochs,
      dtype=dtype,
      datasets_num_private_threads=datasets_num_private_threads,
      drop_remainder=drop_remainder,
      tf_data_experimental_slack=tf_data_experimental_slack,
  )
  return resnet_run_loop.process_record_dataset(**pipeline_args)
def get_synth_input_fn(dtype):
  """Returns resnet_run_loop's synthetic input function sized for ImageNet.

  Args:
    dtype: Data type forwarded to `resnet_run_loop.get_synth_input_fn`.

  Returns:
    Whatever `resnet_run_loop.get_synth_input_fn` returns for a
    DEFAULT_IMAGE_SIZE x DEFAULT_IMAGE_SIZE x NUM_CHANNELS input and
    NUM_CLASSES classes.
  """
  image_side = DEFAULT_IMAGE_SIZE
  return resnet_run_loop.get_synth_input_fn(
      image_side, image_side, NUM_CHANNELS, NUM_CLASSES, dtype=dtype)
###############################################################################
# Running the model
###############################################################################
class ImagenetModel(resnet_model.Model):
  """ResNet model pre-configured with the hyperparameters used for ImageNet."""

  def __init__(self, resnet_size, data_format=None, num_classes=NUM_CLASSES,
               resnet_version=resnet_model.DEFAULT_VERSION,
               dtype=resnet_model.DEFAULT_DTYPE):
    """Configures the base model with ImageNet-specific settings.

    Args:
      resnet_size: The number of convolutional layers needed in the model.
      data_format: Either 'channels_first' or 'channels_last', specifying which
        data format to use when setting up the model.
      num_classes: The number of output classes needed from the model. This
        enables users to extend the same model to their own datasets.
      resnet_version: Integer representing which version of the ResNet network
        to use. See README for details. Valid values: [1, 2]
      dtype: The TensorFlow dtype to use for calculations.

    Raises:
      ValueError: propagated from `_get_block_sizes` when `resnet_size` is not
        a supported size.
    """
    # Bottleneck blocks are used from ResNet-50 upwards; smaller nets do not
    # use them.
    use_bottleneck = resnet_size >= 50

    super(ImagenetModel, self).__init__(
        resnet_size=resnet_size,
        bottleneck=use_bottleneck,
        num_classes=num_classes,
        num_filters=64,
        kernel_size=7,
        conv_stride=2,
        first_pool_size=3,
        first_pool_stride=2,
        block_sizes=_get_block_sizes(resnet_size),
        block_strides=[1, 2, 2, 2],
        resnet_version=resnet_version,
        data_format=data_format,
        dtype=dtype,
    )
def _get_block_sizes(resnet_size):
"""Retrieve the size of each block_layer in the ResNet model.
The number of block layers used for the Resnet model varies according
to the size of the model. This helper grabs the layer set we want, throwing
an error if a non-standard size has been selected.
Args:
resnet_size: The number of convolutional layers needed in the model.
Returns:
A list of block sizes to use in building the model.
Raises:
KeyError: if invalid resnet_size is received.
"""
choices = {
18: [2, 2, 2, 2],
34: [3, 4, 6, 3],
50: [3, 4, 6, 3],
101: [3, 4, 23, 3],
152: [3, 8, 36, 3],
200: [3, 24, 36, 3]
}
try:
return choices[resnet_size]
except KeyError:
err = ('Could not find layers for selected Resnet size.\n'
'Size received: {}; sizes allowed: {}.'.format(
resnet_size, list(choices.keys())))
raise ValueError(err)
def imagenet_model_fn(features, labels, mode, params):
  """Our model_fn for ResNet to be used with our Estimator.

  Args:
    features: Input features, forwarded to `resnet_run_loop.resnet_model_fn`.
    labels: Input labels, forwarded to `resnet_run_loop.resnet_model_fn`.
    mode: Estimator mode key, forwarded unchanged.
    params: Dict of hyperparameters. Keys read here: 'fine_tune',
      'batch_size', 'resnet_size', 'data_format', 'resnet_version',
      'loss_scale', 'dtype' and, optionally, 'num_gpus' (treated as 1 when
      absent).

  Returns:
    Whatever `resnet_run_loop.resnet_model_fn` returns for these arguments.
  """
  # Warmup and higher lr may not be valid for fine tuning with small batches
  # and smaller numbers of training images.
  if params['fine_tune']:
    warmup = False
    base_lr = .1
  else:
    warmup = True
    base_lr = .128

  # The learning-rate schedule is scaled by num_gpus * batch_size. 'num_gpus'
  # falls back to 1 so that callers supplying only 'batch_size' do not fail
  # with a KeyError.
  global_batch_size = params.get('num_gpus', 1) * params['batch_size']

  learning_rate_fn = resnet_run_loop.learning_rate_with_decay(
      batch_size=global_batch_size,
      batch_denom=256, num_images=NUM_IMAGES['train'],
      boundary_epochs=[30, 60, 80, 90], decay_rates=[1, 0.1, 0.01, 0.001, 1e-4],
      warmup=warmup, base_lr=base_lr)

  # weight_decay and label_smoothing are read from the global absl flags, not
  # from `params`.
  return resnet_run_loop.resnet_model_fn(
      features=features,
      labels=labels,
      mode=mode,
      model_class=ImagenetModel,
      resnet_size=params['resnet_size'],
      weight_decay=flags.FLAGS.weight_decay,
      learning_rate_fn=learning_rate_fn,
      momentum=0.9,
      data_format=params['data_format'],
      resnet_version=params['resnet_version'],
      loss_scale=params['loss_scale'],
      loss_filter_fn=None,
      dtype=params['dtype'],
      fine_tune=params['fine_tune'],
      label_smoothing=flags.FLAGS.label_smoothing
  )
def define_imagenet_flags():
  """Registers the ResNet command-line flags and sets the ImageNet defaults."""
  supported_sizes = ['18', '34', '50', '101', '152', '200']
  resnet_run_loop.define_resnet_flags(
      resnet_size_choices=supported_sizes,
      dynamic_loss_scale=True,
      fp16_implementation=True)
  flags.adopt_module_key_flags(resnet_run_loop)

  flags_core.set_defaults(train_epochs=90)
  # Loss scaling is enabled by default because the Davinci core supports mixed
  # precision natively.
  flags_core.set_defaults(loss_scale='512')
def run_imagenet(flags_obj):
  """Run ResNet ImageNet training and eval loop.

  Args:
    flags_obj: An object containing parsed flag values.

  Returns:
    The value returned by `resnet_run_loop.resnet_main`. Per the original
    documentation this is a dict of results containing the keys `eval_results`
    and `train_hooks`: `eval_results` holds accuracy (top_1) and
    accuracy_top_5, and `train_hooks` is a list of the hook instances used
    during training.
  """
  # Explicit branch instead of the `cond and a or b` idiom, which silently
  # picks the wrong operand whenever the middle value is falsy.
  if flags_obj.use_synthetic_data:
    input_function = get_synth_input_fn(flags_core.get_tf_dtype(flags_obj))
  else:
    input_function = input_fn

  return resnet_run_loop.resnet_main(
      flags_obj, imagenet_model_fn, input_function, DATASET_NAME, NUM_IMAGES,
      shape=[DEFAULT_IMAGE_SIZE, DEFAULT_IMAGE_SIZE, NUM_CHANNELS])
def main(flags_obj):
  """Initialises the NPU, then runs the ImageNet training/eval loop.

  Args:
    flags_obj: Argument supplied by the caller (`absl_app.run` in this file).
      It is not read here; the global `flags.FLAGS` object is used instead.
  """
  ############## npu modify begin #############
  # The NPU has to be initialised before any HCCL interface can be called.
  # NOTE(review): the session returned by init_npu() is never closed in this
  # function -- confirm whether shutdown is handled elsewhere.
  npu_session, npu_init_op = resnet_run_loop.init_npu()
  npu_session.run(npu_init_op)
  ############## npu modify end ###############

  with logger.benchmark_context(flags.FLAGS):
    run_imagenet(flags.FLAGS)
def benchmark_main():
  """Hands `main` to `absl_app.run`."""
  absl_app.run(main)
def benchmark_pre():
  """Sets TensorFlow logging to INFO and registers the ImageNet flags."""
  tf_logging = tf.compat.v1.logging
  tf_logging.set_verbosity(tf_logging.INFO)
  define_imagenet_flags()
if __name__ == '__main__':
  # Script entry point: configure logging and flags, then start the absl app.
  benchmark_pre()
  benchmark_main()
@@ -0,0 +1,262 @@
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Provides utilities to preprocess images.
Training images are sampled using the provided bounding boxes, and subsequently
cropped to the sampled bounding box. Images are additionally flipped randomly,
then resized to the target output size (without aspect-ratio preservation).
Images used during evaluation are resized (with aspect-ratio preservation) and
centrally cropped.
All images undergo mean color subtraction.
Note that these steps are colloquially referred to as "ResNet preprocessing,"
and they differ from "VGG preprocessing," which does not use bounding boxes
and instead does an aspect-preserving resize followed by random crop during
training. (These both differ from "Inception preprocessing," which introduces
color distortion steps.)
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
# Per-channel mean values, listed in R, G, B order. preprocess_image subtracts
# them from every image through _mean_image_subtraction.
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94
_CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN]
# The lower bound for the smallest side of the image for aspect-preserving
# resizing. For example, if an image is 500 x 1000, it will be resized to
# _RESIZE_MIN x (_RESIZE_MIN * 2).
# Only the evaluation branch of preprocess_image uses this value.
_RESIZE_MIN = 256
def _decode_crop_and_flip(image_buffer, bbox, num_channels):
  """Decodes a random crop of a JPEG and randomly mirrors it left/right.

  The fused decode-and-crop op is used, so the image must be passed in as an
  un-decoded string Tensor.

  Args:
    image_buffer: scalar string Tensor representing the raw JPEG image buffer.
    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
      where each coordinate is [0, 1) and the coordinates are arranged as
      [ymin, xmin, ymax, xmax].
    num_channels: Integer depth of the image buffer for decoding.

  Returns:
    3-D tensor with cropped image.
  """
  # Sample a crop window that is a randomly distorted version of the annotated
  # bounding box, within the aspect-ratio, area and coverage limits below.
  # With use_image_if_no_bounding_boxes=True the whole image is used as the
  # box when none is supplied.
  jpeg_shape = tf.image.extract_jpeg_shape(image_buffer)
  window_begin, window_size, _ = tf.image.sample_distorted_bounding_box(
      jpeg_shape,
      bounding_boxes=bbox,
      min_object_covered=0.1,
      aspect_ratio_range=[0.75, 1.33],
      area_range=[0.05, 1.0],
      max_attempts=100,
      use_image_if_no_bounding_boxes=True)

  # The crop op wants [offset_y, offset_x, height, width]; the channel entries
  # of the sampled begin/size vectors are dropped.
  top, left, _ = tf.unstack(window_begin)
  crop_h, crop_w, _ = tf.unstack(window_size)
  window = tf.stack([top, left, crop_h, crop_w])

  # Fused decode + crop, then a random horizontal flip for extra distortion.
  decoded = tf.image.decode_and_crop_jpeg(
      image_buffer, window, channels=num_channels)
  return tf.image.random_flip_left_right(decoded)
def _central_crop(image, crop_height, crop_width):
  """Cuts a `crop_height` x `crop_width` window out of the image centre.

  Args:
    image: a 3-D image tensor
    crop_height: the height of the image following the crop.
    crop_width: the width of the image following the crop.

  Returns:
    3-D tensor with cropped image.
  """
  dims = tf.shape(input=image)
  image_height, image_width = dims[0], dims[1]

  # Half of the surplus (floor division) is removed from the top and left.
  top = (image_height - crop_height) // 2
  left = (image_width - crop_width) // 2

  # -1 keeps every channel.
  return tf.slice(image, [top, left, 0], [crop_height, crop_width, -1])
def _mean_image_subtraction(image, means, num_channels):
  """Subtracts the given means from each image channel.

  For example:
    means = [123.68, 116.779, 103.939]
    image = _mean_image_subtraction(image, means, 3)

  Note that the rank of `image` must be known.

  Args:
    image: a tensor of size [height, width, C].
    means: a C-vector of values to subtract from each channel.
    num_channels: number of color channels in the image that will be distorted.

  Returns:
    the centered image.

  Raises:
    ValueError: If the static rank of `image` is not three (this includes an
      unknown rank), or if `len(means)` differs from `num_channels`.
  """
  rank = image.get_shape().ndims
  if rank != 3:
    raise ValueError('Input must be of size [height, width, C>0]')

  if num_channels != len(means):
    raise ValueError('len(means) must match the number of channels')

  # Note(b/130245863): the 1-D means are expanded with an explicit
  # `broadcast_to` rather than by adding dimensions, for better performance.
  broadcast_means = tf.broadcast_to(means, tf.shape(image))
  return image - broadcast_means
def _smallest_size_at_least(height, width, resize_min):
  """Computes a new shape whose smallest side is scaled to `resize_min`.

  Both sides are multiplied by `resize_min / min(height, width)`, which
  preserves the original aspect ratio, and are then cast to int32.

  Args:
    height: an int32 scalar tensor indicating the current height.
    width: an int32 scalar tensor indicating the current width.
    resize_min: A python integer or scalar `Tensor` indicating the size of
      the smallest side after resize.

  Returns:
    new_height: an int32 scalar tensor indicating the new height.
    new_width: an int32 scalar tensor indicating the new width.
  """
  target = tf.cast(resize_min, tf.float32)

  # Do the arithmetic in float32.
  height_f = tf.cast(height, tf.float32)
  width_f = tf.cast(width, tf.float32)
  ratio = target / tf.minimum(height_f, width_f)

  # Back to int32 so the sizes are accepted by the TF image ops.
  return (tf.cast(height_f * ratio, tf.int32),
          tf.cast(width_f * ratio, tf.int32))
def _aspect_preserving_resize(image, resize_min):
  """Resizes an image so its smallest side becomes `resize_min`.

  Args:
    image: A 3-D image `Tensor`.
    resize_min: A python integer or scalar `Tensor` indicating the size of
      the smallest side after resize.

  Returns:
    resized_image: A 3-D tensor containing the resized image.
  """
  dims = tf.shape(input=image)
  target_height, target_width = _smallest_size_at_least(
      dims[0], dims[1], resize_min)
  return _resize_image(image, target_height, target_width)
def _resize_image(image, height, width):
  """Simple wrapper around `tf.compat.v1.image.resize`.

  Centralising the call makes sure the same `ResizeMethod` (bilinear) and
  `align_corners=False` are used every time.

  Args:
    image: A 3-D image `Tensor`.
    height: The target height for the resized image.
    width: The target width for the resized image.

  Returns:
    resized_image: A 3-D tensor containing the resized image. The first two
      dimensions have the shape [height, width].
  """
  target_size = [height, width]
  return tf.compat.v1.image.resize(
      image,
      target_size,
      method=tf.image.ResizeMethod.BILINEAR,
      align_corners=False)
def preprocess_image(image_buffer, bbox, output_height, output_width,
                     num_channels, is_training=False):
  """Decodes, crops, resizes and mean-centres one JPEG image.

  Training images get a random bounding-box crop and a random flip before
  being resized to the output size. Evaluation images are decoded, resized
  with the aspect ratio preserved and then cropped around the centre.

  Args:
    image_buffer: scalar string Tensor representing the raw JPEG image buffer.
    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
      where each coordinate is [0, 1) and the coordinates are arranged as
      [ymin, xmin, ymax, xmax]. Only read when `is_training` is true.
    output_height: The height of the image after preprocessing.
    output_width: The width of the image after preprocessing.
    num_channels: Integer depth of the image buffer for decoding.
    is_training: `True` if we're preprocessing the image for training and
      `False` otherwise.

  Returns:
    A preprocessed image.
  """
  if is_training:
    # Random crop + flip, then a plain resize to the target size.
    distorted = _decode_crop_and_flip(image_buffer, bbox, num_channels)
    image = _resize_image(distorted, output_height, output_width)
  else:
    # Deterministic path: decode, aspect-preserving resize, central crop.
    decoded = tf.image.decode_jpeg(image_buffer, channels=num_channels)
    resized = _aspect_preserving_resize(decoded, _RESIZE_MIN)
    image = _central_crop(resized, output_height, output_width)

  # Record the static shape on the tensor.
  image.set_shape([output_height, output_width, num_channels])

  return _mean_image_subtraction(image, _CHANNEL_MEANS, num_channels)
@@ -0,0 +1,326 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.r1.resnet import imagenet_main
from official.utils.misc import keras_utils
from official.utils.testing import integration
# Limit TensorFlow's logging verbosity to ERROR while the tests run.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# Batch size requested from the synthetic input function in the model_fn tests.
_BATCH_SIZE = 32
# Size of the model's output dimension asserted by the shape tests.
_LABEL_CLASSES = 1001
class BaseTest(tf.test.TestCase):
  """Shape, model_fn and synthetic end-to-end tests for the ImageNet ResNet."""

  # Original imagenet_main.NUM_IMAGES['validation'], saved by setUp and put
  # back by tearDown.
  _num_validation_images = None

  @classmethod
  def setUpClass(cls):  # pylint: disable=invalid-name
    super(BaseTest, cls).setUpClass()
    imagenet_main.define_imagenet_flags()

  def setUp(self):
    super(BaseTest, self).setUp()
    # NOTE(review): `is_v2_0` is referenced without being called. If it is a
    # function in keras_utils this condition is always true -- confirm.
    if keras_utils.is_v2_0:
      tf.compat.v1.disable_eager_execution()
    # Shrink the validation set so the end-to-end tests stay fast.
    self._num_validation_images = imagenet_main.NUM_IMAGES['validation']
    imagenet_main.NUM_IMAGES['validation'] = 4

  def tearDown(self):
    super(BaseTest, self).tearDown()
    tf.io.gfile.rmtree(self.get_temp_dir())
    imagenet_main.NUM_IMAGES['validation'] = self._num_validation_images

  def _tensor_shapes_helper(self, resnet_size, resnet_version, dtype, with_gpu):
    """Checks the tensor shapes after each phase of the ResNet model."""
    def reshape(shape):
      """Returns the expected dimensions depending on if a GPU is being used."""
      # If a GPU is used for the test, the shape is returned (already in NCHW
      # form). When GPU is not used, the shape is converted to NHWC.
      if with_gpu:
        return shape
      return shape[0], shape[2], shape[3], shape[1]

    graph = tf.Graph()

    with graph.as_default(), self.test_session(
        graph=graph, use_gpu=with_gpu, force_gpu=with_gpu):
      model = imagenet_main.ImagenetModel(
          resnet_size=resnet_size,
          data_format='channels_first' if with_gpu else 'channels_last',
          resnet_version=resnet_version,
          dtype=dtype
      )
      inputs = tf.random.uniform([1, 224, 224, 3])
      output = model(inputs, training=True)

      initial_conv = graph.get_tensor_by_name('resnet_model/initial_conv:0')
      max_pool = graph.get_tensor_by_name('resnet_model/initial_max_pool:0')
      block_layer1 = graph.get_tensor_by_name('resnet_model/block_layer1:0')
      block_layer2 = graph.get_tensor_by_name('resnet_model/block_layer2:0')
      block_layer3 = graph.get_tensor_by_name('resnet_model/block_layer3:0')
      block_layer4 = graph.get_tensor_by_name('resnet_model/block_layer4:0')
      reduce_mean = graph.get_tensor_by_name('resnet_model/final_reduce_mean:0')
      dense = graph.get_tensor_by_name('resnet_model/final_dense:0')

      self.assertAllEqual(initial_conv.shape, reshape((1, 64, 112, 112)))
      self.assertAllEqual(max_pool.shape, reshape((1, 64, 56, 56)))

      # The number of channels after each block depends on whether we're
      # using the building_block or the bottleneck_block.
      if resnet_size < 50:
        self.assertAllEqual(block_layer1.shape, reshape((1, 64, 56, 56)))
        self.assertAllEqual(block_layer2.shape, reshape((1, 128, 28, 28)))
        self.assertAllEqual(block_layer3.shape, reshape((1, 256, 14, 14)))
        self.assertAllEqual(block_layer4.shape, reshape((1, 512, 7, 7)))
        self.assertAllEqual(reduce_mean.shape, reshape((1, 512, 1, 1)))
      else:
        self.assertAllEqual(block_layer1.shape, reshape((1, 256, 56, 56)))
        self.assertAllEqual(block_layer2.shape, reshape((1, 512, 28, 28)))
        self.assertAllEqual(block_layer3.shape, reshape((1, 1024, 14, 14)))
        self.assertAllEqual(block_layer4.shape, reshape((1, 2048, 7, 7)))
        self.assertAllEqual(reduce_mean.shape, reshape((1, 2048, 1, 1)))

      self.assertAllEqual(dense.shape, (1, _LABEL_CLASSES))
      self.assertAllEqual(output.shape, (1, _LABEL_CLASSES))

  def tensor_shapes_helper(self, resnet_size, resnet_version, with_gpu=False):
    """Runs the shape checks for both float32 and float16."""
    self._tensor_shapes_helper(resnet_size=resnet_size,
                               resnet_version=resnet_version,
                               dtype=tf.float32, with_gpu=with_gpu)
    self._tensor_shapes_helper(resnet_size=resnet_size,
                               resnet_version=resnet_version,
                               dtype=tf.float16, with_gpu=with_gpu)

  def test_tensor_shapes_resnet_18_v1(self):
    self.tensor_shapes_helper(18, resnet_version=1)

  def test_tensor_shapes_resnet_18_v2(self):
    self.tensor_shapes_helper(18, resnet_version=2)

  def test_tensor_shapes_resnet_34_v1(self):
    self.tensor_shapes_helper(34, resnet_version=1)

  def test_tensor_shapes_resnet_34_v2(self):
    self.tensor_shapes_helper(34, resnet_version=2)

  def test_tensor_shapes_resnet_50_v1(self):
    self.tensor_shapes_helper(50, resnet_version=1)

  def test_tensor_shapes_resnet_50_v2(self):
    self.tensor_shapes_helper(50, resnet_version=2)

  def test_tensor_shapes_resnet_101_v1(self):
    self.tensor_shapes_helper(101, resnet_version=1)

  def test_tensor_shapes_resnet_101_v2(self):
    self.tensor_shapes_helper(101, resnet_version=2)

  def test_tensor_shapes_resnet_152_v1(self):
    self.tensor_shapes_helper(152, resnet_version=1)

  def test_tensor_shapes_resnet_152_v2(self):
    self.tensor_shapes_helper(152, resnet_version=2)

  def test_tensor_shapes_resnet_200_v1(self):
    self.tensor_shapes_helper(200, resnet_version=1)

  def test_tensor_shapes_resnet_200_v2(self):
    self.tensor_shapes_helper(200, resnet_version=2)

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_tensor_shapes_resnet_18_with_gpu_v1(self):
    self.tensor_shapes_helper(18, resnet_version=1, with_gpu=True)

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_tensor_shapes_resnet_18_with_gpu_v2(self):
    self.tensor_shapes_helper(18, resnet_version=2, with_gpu=True)

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_tensor_shapes_resnet_34_with_gpu_v1(self):
    self.tensor_shapes_helper(34, resnet_version=1, with_gpu=True)

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_tensor_shapes_resnet_34_with_gpu_v2(self):
    self.tensor_shapes_helper(34, resnet_version=2, with_gpu=True)

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_tensor_shapes_resnet_50_with_gpu_v1(self):
    self.tensor_shapes_helper(50, resnet_version=1, with_gpu=True)

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_tensor_shapes_resnet_50_with_gpu_v2(self):
    self.tensor_shapes_helper(50, resnet_version=2, with_gpu=True)

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_tensor_shapes_resnet_101_with_gpu_v1(self):
    self.tensor_shapes_helper(101, resnet_version=1, with_gpu=True)

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_tensor_shapes_resnet_101_with_gpu_v2(self):
    self.tensor_shapes_helper(101, resnet_version=2, with_gpu=True)

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_tensor_shapes_resnet_152_with_gpu_v1(self):
    self.tensor_shapes_helper(152, resnet_version=1, with_gpu=True)

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_tensor_shapes_resnet_152_with_gpu_v2(self):
    self.tensor_shapes_helper(152, resnet_version=2, with_gpu=True)

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_tensor_shapes_resnet_200_with_gpu_v1(self):
    self.tensor_shapes_helper(200, resnet_version=1, with_gpu=True)

  @unittest.skipUnless(tf.test.is_built_with_cuda(), 'requires GPU')
  def test_tensor_shapes_resnet_200_with_gpu_v2(self):
    self.tensor_shapes_helper(200, resnet_version=2, with_gpu=True)

  def resnet_model_fn_helper(self, mode, resnet_version, dtype):
    """Tests that the EstimatorSpec is given the appropriate arguments."""
    tf.compat.v1.train.create_global_step()

    input_fn = imagenet_main.get_synth_input_fn(dtype)
    dataset = input_fn(True, '', _BATCH_SIZE)
    iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
    features, labels = iterator.get_next()
    spec = imagenet_main.imagenet_model_fn(
        features, labels, mode, {
            'dtype': dtype,
            'resnet_size': 50,
            'data_format': 'channels_last',
            'batch_size': _BATCH_SIZE,
            # imagenet_model_fn multiplies batch_size by 'num_gpus'; 1 keeps
            # the effective batch size equal to _BATCH_SIZE.
            'num_gpus': 1,
            'resnet_version': resnet_version,
            'loss_scale': 128 if dtype == tf.float16 else 1,
            'fine_tune': False,
        })

    predictions = spec.predictions
    self.assertAllEqual(predictions['probabilities'].shape,
                        (_BATCH_SIZE, _LABEL_CLASSES))
    self.assertEqual(predictions['probabilities'].dtype, tf.float32)
    self.assertAllEqual(predictions['classes'].shape, (_BATCH_SIZE,))
    self.assertEqual(predictions['classes'].dtype, tf.int64)

    if mode != tf.estimator.ModeKeys.PREDICT:
      loss = spec.loss
      self.assertAllEqual(loss.shape, ())
      self.assertEqual(loss.dtype, tf.float32)

    if mode == tf.estimator.ModeKeys.EVAL:
      eval_metric_ops = spec.eval_metric_ops
      self.assertAllEqual(eval_metric_ops['accuracy'][0].shape, ())
      self.assertAllEqual(eval_metric_ops['accuracy'][1].shape, ())
      self.assertEqual(eval_metric_ops['accuracy'][0].dtype, tf.float32)
      self.assertEqual(eval_metric_ops['accuracy'][1].dtype, tf.float32)

  def test_resnet_model_fn_train_mode_v1(self):
    self.resnet_model_fn_helper(tf.estimator.ModeKeys.TRAIN, resnet_version=1,
                                dtype=tf.float32)

  def test_resnet_model_fn_train_mode_v2(self):
    self.resnet_model_fn_helper(tf.estimator.ModeKeys.TRAIN, resnet_version=2,
                                dtype=tf.float32)

  def test_resnet_model_fn_eval_mode_v1(self):
    self.resnet_model_fn_helper(tf.estimator.ModeKeys.EVAL, resnet_version=1,
                                dtype=tf.float32)

  def test_resnet_model_fn_eval_mode_v2(self):
    self.resnet_model_fn_helper(tf.estimator.ModeKeys.EVAL, resnet_version=2,
                                dtype=tf.float32)

  def test_resnet_model_fn_predict_mode_v1(self):
    self.resnet_model_fn_helper(tf.estimator.ModeKeys.PREDICT, resnet_version=1,
                                dtype=tf.float32)

  def test_resnet_model_fn_predict_mode_v2(self):
    self.resnet_model_fn_helper(tf.estimator.ModeKeys.PREDICT, resnet_version=2,
                                dtype=tf.float32)

  def _test_imagenetmodel_shape(self, resnet_version):
    """Checks the output shape for a custom batch size and class count."""
    batch_size = 135
    num_classes = 246

    model = imagenet_main.ImagenetModel(
        50, data_format='channels_last', num_classes=num_classes,
        resnet_version=resnet_version)

    fake_input = tf.random.uniform([batch_size, 224, 224, 3])
    output = model(fake_input, training=True)

    self.assertAllEqual(output.shape, (batch_size, num_classes))

  def test_imagenetmodel_shape_v1(self):
    self._test_imagenetmodel_shape(resnet_version=1)

  def test_imagenetmodel_shape_v2(self):
    self._test_imagenetmodel_shape(resnet_version=2)

  def test_imagenet_end_to_end_synthetic_v1(self):
    integration.run_synthetic(
        main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
        extra_flags=['-resnet_version', '1', '-batch_size', '4',
                     '--max_train_steps', '1']
    )

  def test_imagenet_end_to_end_synthetic_v2(self):
    integration.run_synthetic(
        main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
        extra_flags=['-resnet_version', '2', '-batch_size', '4',
                     '--max_train_steps', '1']
    )

  def test_imagenet_end_to_end_synthetic_v1_tiny(self):
    integration.run_synthetic(
        main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
        extra_flags=['-resnet_version', '1', '-batch_size', '4',
                     '-resnet_size', '18', '--max_train_steps', '1']
    )

  def test_imagenet_end_to_end_synthetic_v2_tiny(self):
    integration.run_synthetic(
        main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
        extra_flags=['-resnet_version', '2', '-batch_size', '4',
                     '-resnet_size', '18', '--max_train_steps', '1']
    )

  def test_imagenet_end_to_end_synthetic_v1_huge(self):
    integration.run_synthetic(
        main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
        extra_flags=['-resnet_version', '1', '-batch_size', '4',
                     '-resnet_size', '200', '--max_train_steps', '1']
    )

  def test_imagenet_end_to_end_synthetic_v2_huge(self):
    integration.run_synthetic(
        main=imagenet_main.run_imagenet, tmp_root=self.get_temp_dir(),
        extra_flags=['-resnet_version', '2', '-batch_size', '4',
                     '-resnet_size', '200', '--max_train_steps', '1']
    )
# Run the test cases through TensorFlow's test runner when executed directly.
if __name__ == '__main__':
tf.test.main()
@@ -0,0 +1,552 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains definitions for Residual Networks.
Residual networks ('v1' ResNets) were originally proposed in:
[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
Deep Residual Learning for Image Recognition. arXiv:1512.03385
The full preactivation 'v2' ResNet variant was introduced by:
[2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
Identity Mappings in Deep Residual Networks. arXiv: 1603.05027
The key difference of the full preactivation 'v2' variant compared to the
'v1' variant in [1] is the use of batch normalization before every weight layer
rather than after.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
# Batch-normalization hyperparameters: batch_norm() passes these as `momentum`
# and `epsilon` respectively.
_BATCH_NORM_DECAY = 0.997
_BATCH_NORM_EPSILON = 1e-5
# Default ResNet version and compute dtype.
# NOTE(review): presumably used as defaults by model constructors -- confirm
# against the callers.
DEFAULT_VERSION = 2
DEFAULT_DTYPE = tf.float32
# ALLOWED_TYPES is the default dtype plus every dtype in CASTABLE_TYPES.
CASTABLE_TYPES = (tf.float16,)
ALLOWED_TYPES = (DEFAULT_DTYPE,) + CASTABLE_TYPES
################################################################################
# Convenience functions for building the ResNet model.
################################################################################
def batch_norm(inputs, training, data_format):
  """Applies batch normalization with this module's standard parameters.

  Args:
    inputs: The tensor to normalize.
    training: Forwarded as the layer's `training` argument.
    data_format: 'channels_first' selects axis 1 as the channel axis; any other
      value selects axis 3.

  Returns:
    The result of `tf.compat.v1.layers.batch_normalization`.
  """
  channel_axis = 1 if data_format == 'channels_first' else 3

  # fused=True is set for a significant performance boost. See
  # https://www.tensorflow.org/performance/performance_guide#common_fused_ops
  return tf.compat.v1.layers.batch_normalization(
      inputs=inputs,
      axis=channel_axis,
      momentum=_BATCH_NORM_DECAY,
      epsilon=_BATCH_NORM_EPSILON,
      center=True,
      scale=True,
      training=training,
      fused=True)
def fixed_padding(inputs, kernel_size, data_format):
  """Pads the two spatial dimensions by an amount that depends only on the kernel.

  Args:
    inputs: A tensor of size [batch, channels, height_in, width_in] or
      [batch, height_in, width_in, channels] depending on data_format.
    kernel_size: The kernel to be used in the conv2d or max_pool2d operation.
                 Should be a positive integer.
    data_format: The input format ('channels_last' or 'channels_first').

  Returns:
    A tensor with the same format as the input with the data either intact
    (if kernel_size == 1) or padded (if kernel_size > 1).
  """
  total = kernel_size - 1
  before = total // 2
  after = total - before

  untouched = [0, 0]
  spatial = [before, after]

  # Batch and channel axes are never padded; only height and width are.
  if data_format == 'channels_first':
    paddings = [untouched, untouched, spatial, spatial]
  else:
    paddings = [untouched, spatial, spatial, untouched]

  return tf.pad(tensor=inputs, paddings=paddings)
def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
  """Strided 2-D convolution with explicit padding.

  For strides greater than 1 the input is padded through fixed_padding() and
  the convolution itself uses 'VALID' padding, so the padding is based only on
  `kernel_size` and not on the dimensions of `inputs`. A stride of exactly 1
  uses the convolution's own 'SAME' padding.

  Args:
    inputs: The input tensor.
    filters: The number of output filters of the convolution.
    kernel_size: The size of the convolution kernel.
    strides: The stride of the convolution.
    data_format: The input format ('channels_last' or 'channels_first').

  Returns:
    The output tensor of the bias-free convolution.
  """
  if strides > 1:
    inputs = fixed_padding(inputs, kernel_size, data_format)

  if strides == 1:
    conv_padding = 'SAME'
  else:
    conv_padding = 'VALID'

  return tf.compat.v1.layers.conv2d(
      inputs=inputs,
      filters=filters,
      kernel_size=kernel_size,
      strides=strides,
      padding=conv_padding,
      use_bias=False,
      kernel_initializer=tf.compat.v1.variance_scaling_initializer(),
      data_format=data_format)
################################################################################
# ResNet block definitions.
################################################################################
def _building_block_v1(inputs, filters, training, projection_shortcut, strides,
                       data_format):
  """Builds one non-bottleneck ResNet v1 residual block.

  Each convolution is followed by batch normalization and then ReLU, as in:
    Deep Residual Learning for Image Recognition
    https://arxiv.org/pdf/1512.03385.pdf
    by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Dec 2015.

  Args:
    inputs: A tensor of size [batch, channels, height_in, width_in] or
      [batch, height_in, width_in, channels] depending on data_format.
    filters: The number of filters for the convolutions.
    training: A Boolean for whether the model is in training or inference
      mode. Needed for batch normalization.
    projection_shortcut: The function to use for projection shortcuts
      (typically a 1x1 convolution when downsampling the input), or None to
      use the identity shortcut.
    strides: The block's stride. If greater than 1, this block will ultimately
      downsample the input.
    data_format: The input format ('channels_last' or 'channels_first').

  Returns:
    The output tensor of the block; shape should match inputs.
  """
  # The shortcut layers are created first so layer creation order is kept.
  if projection_shortcut is None:
    residual = inputs
  else:
    residual = batch_norm(inputs=projection_shortcut(inputs),
                          training=training, data_format=data_format)

  net = conv2d_fixed_padding(
      inputs=inputs, filters=filters, kernel_size=3, strides=strides,
      data_format=data_format)
  net = tf.nn.relu(batch_norm(net, training, data_format))

  net = conv2d_fixed_padding(
      inputs=net, filters=filters, kernel_size=3, strides=1,
      data_format=data_format)
  net = batch_norm(net, training, data_format)

  # In v1 the final ReLU is applied after the residual addition.
  return tf.nn.relu(net + residual)
def _building_block_v2(inputs, filters, training, projection_shortcut, strides,
                       data_format):
  """Builds one non-bottleneck ResNet v2 (pre-activation) residual block.

  Batch normalization and ReLU come before each convolution, as in:
    Identity Mappings in Deep Residual Networks
    https://arxiv.org/pdf/1603.05027.pdf
    by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Jul 2016.

  Args:
    inputs: A tensor of size [batch, channels, height_in, width_in] or
      [batch, height_in, width_in, channels] depending on data_format.
    filters: The number of filters for the convolutions.
    training: A Boolean for whether the model is in training or inference
      mode. Needed for batch normalization.
    projection_shortcut: The function to use for projection shortcuts
      (typically a 1x1 convolution when downsampling the input), or None to
      use the identity shortcut.
    strides: The block's stride. If greater than 1, this block will ultimately
      downsample the input.
    data_format: The input format ('channels_last' or 'channels_first').

  Returns:
    The output tensor of the block; shape should match inputs.
  """
  residual = inputs
  preact = tf.nn.relu(batch_norm(inputs, training, data_format))

  # The projection shortcut is fed the pre-activated tensor (after the first
  # batch norm and ReLU) since it performs a 1x1 convolution.
  if projection_shortcut is not None:
    residual = projection_shortcut(preact)

  net = conv2d_fixed_padding(
      inputs=preact, filters=filters, kernel_size=3, strides=strides,
      data_format=data_format)
  net = tf.nn.relu(batch_norm(net, training, data_format))

  net = conv2d_fixed_padding(
      inputs=net, filters=filters, kernel_size=3, strides=1,
      data_format=data_format)

  return net + residual
def _bottleneck_block_v1(inputs, filters, training, projection_shortcut,
                         strides, data_format):
  """Builds one bottleneck ResNet v1 residual block.

  Same ordering as _building_block_v1() (convolution, then batch
  normalization, then ReLU) but with the 1x1 -> 3x3 -> 1x1 "bottleneck"
  layout described in:
    Deep Residual Learning for Image Recognition
    https://arxiv.org/pdf/1512.03385.pdf
    by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Dec 2015.

  Args:
    inputs: A tensor of size [batch, channels, height_in, width_in] or
      [batch, height_in, width_in, channels] depending on data_format.
    filters: The number of filters for the convolutions; the last 1x1
      convolution uses 4 * filters.
    training: A Boolean for whether the model is in training or inference
      mode. Needed for batch normalization.
    projection_shortcut: The function to use for projection shortcuts
      (typically a 1x1 convolution when downsampling the input), or None to
      use the identity shortcut.
    strides: The block's stride. If greater than 1, this block will ultimately
      downsample the input.
    data_format: The input format ('channels_last' or 'channels_first').

  Returns:
    The output tensor of the block; shape should match inputs.
  """
  # The shortcut layers are created first so layer creation order is kept.
  if projection_shortcut is None:
    residual = inputs
  else:
    residual = batch_norm(inputs=projection_shortcut(inputs),
                          training=training, data_format=data_format)

  # 1x1 convolution.
  net = conv2d_fixed_padding(
      inputs=inputs, filters=filters, kernel_size=1, strides=1,
      data_format=data_format)
  net = tf.nn.relu(batch_norm(net, training, data_format))

  # 3x3 convolution; this is the one that carries the block's stride.
  net = conv2d_fixed_padding(
      inputs=net, filters=filters, kernel_size=3, strides=strides,
      data_format=data_format)
  net = tf.nn.relu(batch_norm(net, training, data_format))

  # 1x1 convolution with four times as many filters.
  net = conv2d_fixed_padding(
      inputs=net, filters=4 * filters, kernel_size=1, strides=1,
      data_format=data_format)
  net = batch_norm(net, training, data_format)

  # In v1 the final ReLU is applied after the residual addition.
  return tf.nn.relu(net + residual)
def _bottleneck_block_v2(inputs, filters, training, projection_shortcut,
                         strides, data_format):
  """Builds one bottleneck ResNet v2 (pre-activation) residual block.

  Uses the 1x1 -> 3x3 -> 1x1 "bottleneck" layout described in:
    Deep Residual Learning for Image Recognition
    https://arxiv.org/pdf/1512.03385.pdf
    by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Dec 2015.

  with the batch normalization, then ReLU, then convolution ordering of:
    Identity Mappings in Deep Residual Networks
    https://arxiv.org/pdf/1603.05027.pdf
    by Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, Jul 2016.

  Args:
    inputs: A tensor of size [batch, channels, height_in, width_in] or
      [batch, height_in, width_in, channels] depending on data_format.
    filters: The number of filters for the convolutions; the last 1x1
      convolution uses 4 * filters.
    training: A Boolean for whether the model is in training or inference
      mode. Needed for batch normalization.
    projection_shortcut: The function to use for projection shortcuts
      (typically a 1x1 convolution when downsampling the input), or None to
      use the identity shortcut.
    strides: The block's stride. If greater than 1, this block will ultimately
      downsample the input.
    data_format: The input format ('channels_last' or 'channels_first').

  Returns:
    The output tensor of the block; shape should match inputs.
  """
  residual = inputs
  preact = tf.nn.relu(batch_norm(inputs, training, data_format))

  # The projection shortcut is fed the pre-activated tensor (after the first
  # batch norm and ReLU) since it performs a 1x1 convolution.
  if projection_shortcut is not None:
    residual = projection_shortcut(preact)

  # 1x1 convolution.
  net = conv2d_fixed_padding(
      inputs=preact, filters=filters, kernel_size=1, strides=1,
      data_format=data_format)
  net = tf.nn.relu(batch_norm(net, training, data_format))

  # 3x3 convolution; this is the one that carries the block's stride.
  net = conv2d_fixed_padding(
      inputs=net, filters=filters, kernel_size=3, strides=strides,
      data_format=data_format)
  net = tf.nn.relu(batch_norm(net, training, data_format))

  # 1x1 convolution with four times as many filters.
  net = conv2d_fixed_padding(
      inputs=net, filters=4 * filters, kernel_size=1, strides=1,
      data_format=data_format)

  return net + residual
def block_layer(inputs, filters, bottleneck, block_fn, blocks, strides,
                training, name, data_format):
  """Stacks `blocks` residual blocks into one layer of the ResNet model.

  Args:
    inputs: A tensor of size [batch, channels, height_in, width_in] or
      [batch, height_in, width_in, channels] depending on data_format.
    filters: The number of filters for the first convolution of the layer.
    bottleneck: Is the block created a bottleneck block.
    block_fn: The block to use within the model, either `building_block` or
      `bottleneck_block`.
    blocks: The number of blocks contained in the layer.
    strides: The stride to use for the first convolution of the layer. If
      greater than 1, this layer will ultimately downsample the input.
    training: Either True or False, whether we are currently training the
      model. Needed for batch norm.
    name: A string name for the tensor output of the block layer.
    data_format: The input format ('channels_last' or 'channels_first').

  Returns:
    The output tensor of the block layer.
  """
  # Bottleneck blocks end with 4x the number of filters they start with, so
  # the shortcut projection has to produce that many channels.
  shortcut_filters = filters
  if bottleneck:
    shortcut_filters = filters * 4

  def _project(tensor):
    """1x1 strided convolution that reshapes the shortcut branch."""
    return conv2d_fixed_padding(
        inputs=tensor, filters=shortcut_filters, kernel_size=1,
        strides=strides, data_format=data_format)

  # Only the first block of the layer projects the shortcut and strides.
  net = block_fn(inputs, filters, training, _project, strides, data_format)

  # The remaining blocks use the identity shortcut and a stride of 1.
  for _ in range(blocks - 1):
    net = block_fn(net, filters, training, None, 1, data_format)

  return tf.identity(net, name)
class Model(object):
  """Base class for building the Resnet Model."""

  def __init__(self, resnet_size, bottleneck, num_classes, num_filters,
               kernel_size,
               conv_stride, first_pool_size, first_pool_stride,
               block_sizes, block_strides,
               resnet_version=DEFAULT_VERSION, data_format=None,
               dtype=DEFAULT_DTYPE):
    """Creates a model for classifying an image.

    Args:
      resnet_size: A single integer for the size of the ResNet model.
      bottleneck: Use regular blocks or bottleneck blocks.
      num_classes: The number of classes used as labels.
      num_filters: The number of filters to use for the first block layer
        of the model. This number is then doubled for each subsequent block
        layer.
      kernel_size: The kernel size to use for convolution.
      conv_stride: stride size for the initial convolutional layer
      first_pool_size: Pool size to be used for the first pooling layer.
        If none, the first pooling layer is skipped.
      first_pool_stride: stride size for the first pooling layer. Not used
        if first_pool_size is None.
      block_sizes: A list containing n values, where n is the number of sets of
        block layers desired. Each value should be the number of blocks in the
        i-th set.
      block_strides: List of integers representing the desired stride size for
        each of the sets of block layers. Should be same length as block_sizes.
      resnet_version: Integer representing which version of the ResNet network
        to use. See README for details. Valid values: [1, 2]
      data_format: Input format ('channels_last', 'channels_first', or None).
        If set to None, 'channels_first' is used when TensorFlow is built with
        CUDA and 'channels_last' otherwise.
      dtype: The TensorFlow dtype to use for calculations. If not specified
        tf.float32 is used.

    Raises:
      ValueError: if invalid version or dtype is selected.
    """
    self.resnet_size = resnet_size

    if not data_format:
      data_format = (
          'channels_first' if tf.test.is_built_with_cuda() else 'channels_last')

    self.resnet_version = resnet_version
    if resnet_version not in (1, 2):
      raise ValueError(
          'Resnet version should be 1 or 2. See README for citations.')

    self.bottleneck = bottleneck
    if bottleneck:
      if resnet_version == 1:
        self.block_fn = _bottleneck_block_v1
      else:
        self.block_fn = _bottleneck_block_v2
    else:
      if resnet_version == 1:
        self.block_fn = _building_block_v1
      else:
        self.block_fn = _building_block_v2

    if dtype not in ALLOWED_TYPES:
      raise ValueError('dtype must be one of: {}'.format(ALLOWED_TYPES))

    self.data_format = data_format
    self.num_classes = num_classes
    self.num_filters = num_filters
    self.kernel_size = kernel_size
    self.conv_stride = conv_stride
    self.first_pool_size = first_pool_size
    self.first_pool_stride = first_pool_stride
    self.block_sizes = block_sizes
    self.block_strides = block_strides
    self.dtype = dtype
    # Only ResNet v2 applies batch norm and ReLU before each convolution.
    self.pre_activation = resnet_version == 2

  def _custom_dtype_getter(self, getter, name, shape=None, dtype=DEFAULT_DTYPE,
                           *args, **kwargs):
    """Creates variables in fp32, then casts to fp16 if necessary.

    This function is a custom getter. A custom getter is a function with the
    same signature as tf.get_variable, except it has an additional getter
    parameter. Custom getters can be passed as the `custom_getter` parameter of
    tf.variable_scope. Then, tf.get_variable will call the custom getter,
    instead of directly getting a variable itself. This can be used to change
    the types of variables that are retrieved with tf.get_variable.
    The `getter` parameter is the underlying variable getter, that would have
    been called if no custom getter was used. Custom getters typically get a
    variable with `getter`, then modify it in some way.

    This custom getter will create an fp32 variable. If a low precision
    (e.g. float16) variable was requested it will then cast the variable to the
    requested dtype. The reason we do not directly create variables in low
    precision dtypes is that applying small gradients to such variables may
    cause the variable not to change.

    Args:
      getter: The underlying variable getter, that has the same signature as
        tf.get_variable and returns a variable.
      name: The name of the variable to get.
      shape: The shape of the variable to get.
      dtype: The dtype of the variable to get. Note that if this is a low
        precision dtype, the variable will be created as a tf.float32 variable,
        then cast to the appropriate dtype
      *args: Additional arguments to pass unmodified to getter.
      **kwargs: Additional keyword arguments to pass unmodified to getter.

    Returns:
      A variable which is cast to fp16 if necessary.
    """
    if dtype in CASTABLE_TYPES:
      var = getter(name, shape, tf.float32, *args, **kwargs)
      return tf.cast(var, dtype=dtype, name=name + '_cast')
    else:
      return getter(name, shape, dtype, *args, **kwargs)

  def _model_variable_scope(self):
    """Returns a variable scope that the model should be created under.

    If self.dtype is a castable type, model variable will be created in fp32
    then cast to self.dtype before being used.

    Returns:
      A variable scope for the model.
    """
    return tf.compat.v1.variable_scope('resnet_model',
                                       custom_getter=self._custom_dtype_getter)

  def __call__(self, inputs, training):
    """Add operations to classify a batch of input images.

    Args:
      inputs: A Tensor representing a batch of input images.
      training: A boolean. Set to True to add operations required only when
        training the classifier.

    Returns:
      A logits Tensor with shape [<batch_size>, self.num_classes].
    """
    with self._model_variable_scope():
      if self.data_format == 'channels_first':
        # Convert the inputs from channels_last (NHWC) to channels_first (NCHW).
        # This provides a large performance boost on GPU. See
        # https://www.tensorflow.org/performance/performance_guide#data_formats
        inputs = tf.transpose(a=inputs, perm=[0, 3, 1, 2])

      inputs = conv2d_fixed_padding(
          inputs=inputs, filters=self.num_filters, kernel_size=self.kernel_size,
          strides=self.conv_stride, data_format=self.data_format)
      inputs = tf.identity(inputs, 'initial_conv')

      # We do not include batch normalization or activation functions in V2
      # for the initial conv1 because the first ResNet unit will perform these
      # for both the shortcut and non-shortcut paths as part of the first
      # block's projection. Cf. Appendix of [2].
      if self.resnet_version == 1:
        inputs = batch_norm(inputs, training, self.data_format)
        inputs = tf.nn.relu(inputs)

      if self.first_pool_size:
        if self.data_format == 'channels_first':
          # max_pool_with_argmax only accepts NHWC input (and the ksize and
          # strides tuples below are laid out for NHWC), so channels_first
          # inputs go through the regular pooling layer instead.
          inputs = tf.compat.v1.layers.max_pooling2d(
              inputs=inputs, pool_size=self.first_pool_size,
              strides=self.first_pool_stride, padding='SAME',
              data_format=self.data_format)
        else:
          ############## npu modify begin #############
          # max_pooling2d is replaced by max_pool_with_argmax for better
          # performance; the argmax output is not needed.
          inputs, _ = tf.compat.v1.nn.max_pool_with_argmax(
              input=inputs,
              ksize=(1, self.first_pool_size, self.first_pool_size, 1),
              strides=(1, self.first_pool_stride, self.first_pool_stride, 1),
              padding='SAME', data_format='NHWC')
          ############## npu modify end ###############
        inputs = tf.identity(inputs, 'initial_max_pool')

      for i, num_blocks in enumerate(self.block_sizes):
        # The filter count doubles with every block layer.
        num_filters = self.num_filters * (2**i)
        inputs = block_layer(
            inputs=inputs, filters=num_filters, bottleneck=self.bottleneck,
            block_fn=self.block_fn, blocks=num_blocks,
            strides=self.block_strides[i], training=training,
            name='block_layer{}'.format(i + 1), data_format=self.data_format)

      # Only apply the BN and ReLU for model that does pre_activation in each
      # building/bottleneck block, eg resnet V2.
      if self.pre_activation:
        inputs = batch_norm(inputs, training, self.data_format)
        inputs = tf.nn.relu(inputs)

      # The current top layer has shape
      # `batch_size x pool_size x pool_size x final_size`.
      # ResNet does an Average Pooling layer over pool_size,
      # but that is the same as doing a reduce_mean. We do a reduce_mean
      # here because it performs better than AveragePooling2D.
      axes = [2, 3] if self.data_format == 'channels_first' else [1, 2]
      inputs = tf.reduce_mean(input_tensor=inputs, axis=axes, keepdims=True)
      inputs = tf.identity(inputs, 'final_reduce_mean')

      # Drop the two spatial axes that reduce_mean kept with size 1.
      inputs = tf.squeeze(inputs, axes)
      inputs = tf.compat.v1.layers.dense(inputs=inputs, units=self.num_classes)
      inputs = tf.identity(inputs, 'final_dense')
      return inputs
@@ -0,0 +1,979 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains utility and supporting functions for ResNet.
This module contains ResNet code which does not directly build layers. This
includes dataset management, hyperparameter and optimizer code, and argument
parsing. Code for defining the ResNet layers can be found in resnet_model.py.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import math
import multiprocessing
import os
import datetime
import time
from absl import flags
import tensorflow as tf
import logging
import sys
############## npu modify begin #############
from npu_bridge.estimator.npu.npu_config import NPURunConfig
from npu_bridge.estimator.npu.npu_estimator import NPUEstimator
from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
from npu_bridge.estimator import npu_ops
from npu_bridge.hccl import hccl_ops
from hccl.manage.api import get_local_rank_id
from hccl.manage.api import get_rank_size
from hccl.manage.api import get_rank_id
from tensorflow.core.protobuf import rewriter_config_pb2
############## npu modify end ###############
from official.r1.resnet import imagenet_preprocessing
from official.r1.resnet import resnet_model
from official.r1.utils import export
from official.utils.flags import core as flags_core
from official.utils.logs import hooks_helper
from official.utils.logs import logger
from official.utils.misc import distribution_utils
from official.utils.misc import model_helpers
from benchmark_log import hwlog
################################################################################
# Functions for input processing.
################################################################################
def process_record_dataset(dataset,
                           is_training,
                           batch_size,
                           shuffle_buffer,
                           parse_record_fn,
                           num_epochs=1,
                           dtype=tf.float32,
                           datasets_num_private_threads=None,
                           drop_remainder=False,
                           tf_data_experimental_slack=False):
  """Turns a Dataset of raw records into batched, prefetched examples.

  Args:
    dataset: A Dataset representing raw records
    is_training: A boolean denoting whether the input is for training.
    batch_size: The number of samples per batch.
    shuffle_buffer: The buffer size to use when shuffling records. A larger
      value results in better randomness, but smaller values reduce startup
      time and use less memory.
    parse_record_fn: A function that takes a raw record and returns the
      corresponding (image, label) pair.
    num_epochs: Accepted for interface compatibility but currently ignored:
      the dataset is repeated indefinitely.
    dtype: Data type to use for images/features.
    datasets_num_private_threads: Number of threads for a private
      threadpool created for all datasets computation.
    drop_remainder: A boolean indicates whether to drop the remainder of the
      batches. If True, the batch dimension will be static.
    tf_data_experimental_slack: Whether to enable tf.data's
      `experimental_slack` option.

  Returns:
    Dataset of (image, label) pairs ready for iteration.
  """
  autotune = tf.data.experimental.AUTOTUNE

  # Optionally give tf.data its own thread pool of a specific size.
  if datasets_num_private_threads:
    threadpool_options = tf.data.Options()
    threadpool_options.experimental_threading.private_threadpool_size = (
        datasets_num_private_threads)
    dataset = dataset.with_options(threadpool_options)
    tf.compat.v1.logging.info('datasets_num_private_threads: %s',
                              datasets_num_private_threads)

  # Disable intra-op parallelism to optimize for throughput instead of latency.
  intra_op_options = tf.data.Options()
  intra_op_options.experimental_threading.max_intra_op_parallelism = 1
  dataset = dataset.with_options(intra_op_options)

  # Prefetch a batch worth of records to smooth out the time taken to load
  # input files for shuffling and processing.
  dataset = dataset.prefetch(buffer_size=batch_size)

  if is_training:
    # Shuffle before repeating so that epoch boundaries are respected.
    dataset = dataset.shuffle(buffer_size=shuffle_buffer)

  # Repeat without an epoch count, i.e. indefinitely; `num_epochs` is unused.
  dataset = dataset.repeat()

  # Parse the raw records into images and labels, then batch them.
  dataset = dataset.map(
      lambda raw_record: parse_record_fn(raw_record, is_training, dtype),
      num_parallel_calls=autotune)
  dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)

  # Operations between the final prefetch and the get_next call to the iterator
  # will happen synchronously during run time. We prefetch here again to
  # background all of the above processing work and keep it out of the
  # critical training path. Setting buffer_size to AUTOTUNE allows
  # DistributionStrategies to adjust how many batches to fetch based on how
  # many devices are present.
  dataset = dataset.prefetch(buffer_size=autotune)

  if tf_data_experimental_slack:
    slack_options = tf.data.Options()
    slack_options.experimental_slack = True
    dataset = dataset.with_options(slack_options)

  return dataset
def get_synth_input_fn(height, width, num_channels, num_classes,
                       dtype=tf.float32):
  """Returns an input function that returns a dataset with random data.

  This input_fn returns a data set that iterates over a set of random data and
  bypasses all preprocessing, e.g. jpeg decode and copy. The host to device
  copy is still included. This used to find the upper throughput bound when
  tuning the full input pipeline.

  Args:
    height: Integer height that will be used to create a fake image tensor.
    width: Integer width that will be used to create a fake image tensor.
    num_channels: Integer depth that will be used to create a fake image tensor.
    num_classes: Number of classes that should be represented in the fake labels
      tensor
    dtype: Data type for features/images.

  Returns:
    An input_fn that can be used in place of a real one to return a dataset
    that can be used for iteration.
  """
  # pylint: disable=unused-argument
  def input_fn(is_training, data_dir, batch_size, *args, **kwargs):
    """Returns dataset filled with random data."""
    # Synthetic input should be within [0, 255].
    inputs = tf.random.truncated_normal(
        [batch_size, height, width, num_channels],
        dtype=dtype,
        mean=127,
        stddev=60,
        name='synthetic_inputs')

    # For integer dtypes `maxval` is exclusive, so `num_classes` (not
    # `num_classes - 1`) is needed for labels to cover [0, num_classes - 1].
    labels = tf.random.uniform(
        [batch_size],
        minval=0,
        maxval=num_classes,
        dtype=tf.int32,
        name='synthetic_labels')

    # One random batch is generated and then repeated forever.
    data = tf.data.Dataset.from_tensors((inputs, labels)).repeat()
    data = data.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return data

  return input_fn
def image_bytes_serving_input_fn(image_shape, dtype=tf.float32):
  """Builds the serving input receiver for raw jpeg images.

  Args:
    image_shape: A (height, width, num_channels) triple handed to the
      preprocessing function.
    dtype: Dtype of the bounding box constant and of the mapped images.

  Returns:
    A TensorServingInputReceiver whose features are the preprocessed images
    and whose receiver tensor is the 'image_bytes' string placeholder.
  """

  def _to_model_input(encoded_image):
    """Preprocess a single raw image in inference mode."""
    # Bounding box around the whole image.
    whole_image_box = tf.constant(
        [0.0, 0.0, 1.0, 1.0], dtype=dtype, shape=[1, 1, 4])
    height, width, num_channels = image_shape
    return imagenet_preprocessing.preprocess_image(
        encoded_image, whole_image_box, height, width, num_channels,
        is_training=False)

  # A batch of encoded images arrives as a 1-D string tensor.
  encoded_batch = tf.compat.v1.placeholder(
      shape=[None], dtype=tf.string, name='input_tensor')
  preprocessed = tf.map_fn(
      _to_model_input, encoded_batch, back_prop=False, dtype=dtype)
  return tf.estimator.export.TensorServingInputReceiver(
      preprocessed, {'image_bytes': encoded_batch})
def override_flags_and_set_envars_for_gpu_thread_pool(flags_obj):
  """Override flags and set env_vars for performance.

  These settings exist to test the difference between using stock settings
  and manual tuning. It also shows some of the ENV_VARS that can be tweaked to
  squeeze a few extra examples per second. These settings are defaulted to the
  current platform of interest, which changes over time.

  On systems with small numbers of cpu cores, e.g. under 8 logical cores,
  setting up a gpu thread pool with `tf_gpu_thread_mode=gpu_private` may perform
  poorly.

  Args:
    flags_obj: Current flags, which will be adjusted possibly overriding
      what has been set by the user on the command-line. Reads `num_gpus` and
      `tf_gpu_thread_mode`; writes `inter_op_parallelism_threads` and
      `datasets_num_private_threads`.
  """
  logical_cores = multiprocessing.cpu_count()
  tf.compat.v1.logging.info('Logical CPU cores: %s', logical_cores)

  # One op-scheduling thread per GPU.
  threads_per_gpu = 1
  gpu_pool_threads = threads_per_gpu * flags_obj.num_gpus

  os.environ['TF_GPU_THREAD_MODE'] = flags_obj.tf_gpu_thread_mode
  os.environ['TF_GPU_THREAD_COUNT'] = str(threads_per_gpu)
  tf.compat.v1.logging.info('TF_GPU_THREAD_COUNT: %s',
                            os.environ['TF_GPU_THREAD_COUNT'])
  tf.compat.v1.logging.info('TF_GPU_THREAD_MODE: %s',
                            os.environ['TF_GPU_THREAD_MODE'])

  # The general thread pool gives up the threads reserved for the GPU pool.
  flags_obj.inter_op_parallelism_threads = logical_cores - gpu_pool_threads

  # tf.data gets the logical cores minus the GPU pool threads and minus
  # 2 threads per GPU for event monitoring and sending / receiving tensors.
  monitoring_threads = 2 * flags_obj.num_gpus
  flags_obj.datasets_num_private_threads = (
      logical_cores - gpu_pool_threads - monitoring_threads)
################################################################################
# Functions for running training/eval/validation loops for the model.
################################################################################
def learning_rate_with_decay(
    batch_size, batch_denom, num_images, boundary_epochs, decay_rates,
    base_lr=0.1, warmup=False):
  """Get a learning rate that decays step-wise as training progresses.

  Args:
    batch_size: the number of examples processed in each training batch.
    batch_denom: this value will be used to scale the base learning rate.
      `0.1 * batch size` is divided by this number, such that when
      batch_denom == batch_size, the initial learning rate will be 0.1.
    num_images: total number of images that will be used for training.
    boundary_epochs: list of ints representing the epochs at which we
      decay the learning rate.
    decay_rates: list of floats representing the decay rates to be used
      for scaling the learning rate. It should have one more element
      than `boundary_epochs`, and all elements should have the same type.
    base_lr: Initial learning rate scaled based on batch_denom.
    warmup: Run a 5 epoch warmup to the initial lr.

  Returns:
    Returns a function that takes a single argument - the number of batches
    trained so far (global_step)- and returns the learning rate to be used
    for training the next batch. When `flags.FLAGS.enable_lars` is set the
    polynomial (LARS) schedule is returned instead of the step-wise one.
  """
  initial_learning_rate = base_lr * batch_size / batch_denom
  batches_per_epoch = num_images / batch_size

  # Reduce the learning rate at certain epochs.
  # CIFAR-10: divide by 10 at epoch 100, 150, and 200
  # ImageNet: divide by 10 at epoch 30, 60, 80, and 90
  boundaries = [int(batches_per_epoch * epoch) for epoch in boundary_epochs]
  vals = [initial_learning_rate * decay for decay in decay_rates]

  def learning_rate_fn(global_step):
    """Builds scaled learning rate function with 5 epoch warm up."""
    ############## npu modify begin #############
    # Using int32 for better computing performance
    global_step = tf.cast(global_step, tf.int32)
    ############## npu modify end ###############
    lr = tf.compat.v1.train.piecewise_constant(global_step, boundaries, vals)
    if warmup:
      warmup_steps = int(batches_per_epoch * 5)
      # Linear ramp from 0 up to the initial learning rate.
      warmup_lr = (
          initial_learning_rate * tf.cast(global_step, tf.float32) / tf.cast(
              warmup_steps, tf.float32))
      return tf.cond(pred=global_step < warmup_steps,
                     true_fn=lambda: warmup_lr,
                     false_fn=lambda: lr)
    return lr

  def poly_rate_fn(global_step):
    """Handles linear scaling rule, gradual warmup, and LR decay.

    The learning rate starts at 0, then it increases linearly per step. After
    the warmup epochs (5 or 14, chosen below from flags.FLAGS.batch_size), we
    reach the peak learning rate (also chosen from the batch size). The
    learning rate is then decayed using a polynomial rate decay schedule with
    power 2.0.

    Args:
      global_step: the current global_step

    Returns:
      returns the current learning rate
    """
    # Learning rate schedule for LARS polynomial schedule
    if flags.FLAGS.batch_size < 8192:
      plr = 5.0
      w_epochs = 5
    elif flags.FLAGS.batch_size < 16384:
      plr = 10.0
      w_epochs = 5
    elif flags.FLAGS.batch_size < 32768:
      plr = 25.0
      w_epochs = 5
    else:
      plr = 32.0
      w_epochs = 14

    w_steps = int(w_epochs * batches_per_epoch)
    wrate = (plr * tf.cast(global_step, tf.float32) / tf.cast(
        w_steps, tf.float32))

    # TODO(pkanwar): use a flag to help calc num_epochs.
    num_epochs = 90
    train_steps = batches_per_epoch * num_epochs

    # Steps elapsed since the end of warmup, never less than 1.
    min_step = tf.constant(1, dtype=tf.int64)
    decay_steps = tf.maximum(min_step, tf.subtract(global_step, w_steps))
    # Use the compat.v1 alias like the rest of this module; the bare
    # `tf.train.polynomial_decay` name is not available under the TF2 API.
    poly_rate = tf.compat.v1.train.polynomial_decay(
        plr,
        decay_steps,
        train_steps - w_steps + 1,
        power=2.0)
    return tf.where(global_step <= w_steps, wrate, poly_rate)

  # For LARS we have a new learning rate schedule
  if flags.FLAGS.enable_lars:
    return poly_rate_fn

  return learning_rate_fn
def resnet_model_fn(features, labels, mode, model_class,
                    resnet_size, weight_decay, learning_rate_fn, momentum,
                    data_format, resnet_version, loss_scale,
                    loss_filter_fn=None, dtype=resnet_model.DEFAULT_DTYPE,
                    fine_tune=False, label_smoothing=0.0):
  """Shared functionality for different resnet model_fns.

  Initializes the ResnetModel representing the model layers
  and uses that model to build the necessary EstimatorSpecs for
  the `mode` in question. For training, this means building losses,
  the optimizer, and the train op that get passed into the EstimatorSpec.
  For evaluation and prediction, the EstimatorSpec is returned without
  a train op, but with the necessary parameters for the given mode.

  Args:
    features: tensor representing input images
    labels: tensor representing class labels for all input images
    mode: current estimator mode; should be one of
      `tf.estimator.ModeKeys.TRAIN`, `EVALUATE`, `PREDICT`
    model_class: a class representing a TensorFlow model that has a __call__
      function. We assume here that this is a subclass of ResnetModel.
    resnet_size: A single integer for the size of the ResNet model.
    weight_decay: weight decay loss rate used to regularize learned variables.
    learning_rate_fn: function that returns the current learning rate given
      the current global_step
    momentum: momentum term used for optimization
    data_format: Input format ('channels_last', 'channels_first', or None).
      If set to None, the format is dependent on whether a GPU is available.
    resnet_version: Integer representing which version of the ResNet network to
      use. See README for details. Valid values: [1, 2]
    loss_scale: The factor to scale the loss for numerical stability. A detailed
      summary is present in the arg parser help text.
    loss_filter_fn: function that takes a string variable name and returns
      True if the var should be included in loss calculation, and False
      otherwise. If None, batch_normalization variables will be excluded
      from the loss.
    dtype: the TensorFlow dtype to use for calculations.
    fine_tune: If True only train the dense layers(final layers).
    label_smoothing: If greater than 0 then smooth the labels.

  Returns:
    EstimatorSpec parameterized according to the input params and the
    current mode.

  Raises:
    TypeError: in TRAIN/EVAL mode when the `RANK_SIZE` environment variable
      is not set (`int(None)`).
  """
  # Generate a summary node for the images
  tf.compat.v1.summary.image('images', features, max_outputs=6)

  ############## npu modify begin #############
  # Make sure features/images use the data type used for calculations,
  # casting them when they arrive in a different dtype.
  if features.dtype != dtype:
    features = tf.cast(features, dtype)
  ############## npu modify end ###############

  model = model_class(resnet_size, data_format, resnet_version=resnet_version,
                      dtype=dtype)

  logits = model(features, mode == tf.estimator.ModeKeys.TRAIN)

  # This acts as a no-op if the logits are already in fp32 (provided logits are
  # not a SparseTensor). If dtype is low precision, logits must be cast to
  # fp32 for numerical stability.
  logits = tf.cast(logits, tf.float32)

  predictions = {
      'classes': tf.argmax(input=logits, axis=1),
      'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
  }

  if mode == tf.estimator.ModeKeys.PREDICT:
    # Return the predictions and the specification for serving a SavedModel
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        export_outputs={
            'predict': tf.estimator.export.PredictOutput(predictions)
        })

  # Calculate loss, which includes softmax cross entropy and L2 regularization.
  if label_smoothing != 0.0:
    # NOTE: the one-hot depth is hard-coded to 1001 classes.
    one_hot_labels = tf.one_hot(labels, 1001)
    # Use the compat.v1 endpoint like the non-smoothed branch below.
    cross_entropy = tf.compat.v1.losses.softmax_cross_entropy(
        logits=logits, onehot_labels=one_hot_labels,
        label_smoothing=label_smoothing)
  else:
    cross_entropy = tf.compat.v1.losses.sparse_softmax_cross_entropy(
        logits=logits, labels=labels)

  # Create a tensor named cross_entropy for logging purposes.
  tf.identity(cross_entropy, name='cross_entropy')
  tf.compat.v1.summary.scalar('cross_entropy', cross_entropy)

  # If no loss_filter_fn is passed, assume we want the default behavior,
  # which is that batch_normalization variables are excluded from loss.
  def exclude_batch_norm(name):
    return 'batch_normalization' not in name
  loss_filter_fn = loss_filter_fn or exclude_batch_norm

  # Add weight decay to the loss.
  l2_loss = weight_decay * tf.add_n(
      # loss is computed using fp32 for numerical stability.
      [
          tf.nn.l2_loss(tf.cast(v, tf.float32))
          for v in tf.compat.v1.trainable_variables()
          if loss_filter_fn(v.name)
      ])
  tf.compat.v1.summary.scalar('l2_loss', l2_loss)
  loss = cross_entropy + l2_loss

  if mode == tf.estimator.ModeKeys.TRAIN:
    global_step = tf.compat.v1.train.get_or_create_global_step()

    learning_rate = learning_rate_fn(global_step)

    # Create a tensor named learning_rate for logging purposes
    tf.identity(learning_rate, name='learning_rate')
    tf.compat.v1.summary.scalar('learning_rate', learning_rate)

    if flags.FLAGS.enable_lars:
      from tensorflow.contrib import opt as contrib_opt  # pylint: disable=g-import-not-at-top
      optimizer = contrib_opt.LARSOptimizer(
          learning_rate,
          momentum=momentum,
          weight_decay=weight_decay,
          skip_list=['batch_normalization', 'bias'])
    else:
      optimizer = tf.compat.v1.train.MomentumOptimizer(
          learning_rate=learning_rate,
          momentum=momentum
      )

    ############## npu modify begin #############
    # Wrap whichever optimizer was selected in the NPU distributed optimizer.
    optimizer = NPUDistributedOptimizer(optimizer)
    ############## npu modify end ###############

    fp16_implementation = getattr(flags.FLAGS, 'fp16_implementation', None)
    if fp16_implementation == 'graph_rewrite':
      optimizer = (
          tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
              optimizer, loss_scale=loss_scale))

    def _dense_grad_filter(gvs):
      """Only apply gradient updates to the final layer.

      This function is used for fine tuning.

      Args:
        gvs: list of tuples with gradients and variable info

      Returns:
        filtered gradients so that only the dense layer remains
      """
      return [(g, v) for g, v in gvs if 'dense' in v.name]

    if loss_scale != 1 and fp16_implementation != 'graph_rewrite':
      # When computing fp16 gradients, often intermediate tensor values are
      # so small, they underflow to 0. To avoid this, we multiply the loss by
      # loss_scale to make these tensor values loss_scale times bigger.
      scaled_grad_vars = optimizer.compute_gradients(loss * loss_scale)
      tf.compat.v1.logging.info('Using manual loss scale: %s', loss_scale)

      if fine_tune:
        scaled_grad_vars = _dense_grad_filter(scaled_grad_vars)

      # Once the gradient computation is complete we can scale the gradients
      # back to the correct scale before passing them to the optimizer.
      unscaled_grad_vars = [(grad / loss_scale, var)
                            for grad, var in scaled_grad_vars]
      minimize_op = optimizer.apply_gradients(unscaled_grad_vars, global_step)
    else:
      grad_vars = optimizer.compute_gradients(loss)
      if fine_tune:
        grad_vars = _dense_grad_filter(grad_vars)
      minimize_op = optimizer.apply_gradients(grad_vars, global_step)

    # Run the UPDATE_OPS collection together with the optimizer step.
    update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
    train_op = tf.group(minimize_op, update_ops)
  else:
    train_op = None

  ############## npu modify begin #############
  # Using float32 for better performance
  accuracy = tf.compat.v1.metrics.accuracy(
      tf.cast(labels, tf.float32), predictions['classes'])
  ############## npu modify end ###############
  accuracy_top_5 = tf.compat.v1.metrics.mean(
      tf.nn.in_top_k(predictions=logits, targets=labels, k=5, name='top_5_op'))

  ############## npu modify begin #############
  # Used for 8P runs: sum each metric value with an hccl allreduce and divide
  # by RANK_SIZE; the update ops (index 1) are passed through unchanged.
  # int(None) raises TypeError when RANK_SIZE is not exported.
  rank_size = int(os.getenv('RANK_SIZE'))
  newaccuracy = (hccl_ops.allreduce(accuracy[0], "sum") / rank_size,
                 accuracy[1])
  newaccuracy_top_5 = (
      hccl_ops.allreduce(accuracy_top_5[0], "sum") / rank_size,
      accuracy_top_5[1])
  ############## npu modify end ###############

  metrics = {'accuracy': newaccuracy,
             'accuracy_top_5': newaccuracy_top_5}

  # Create a tensor named train_accuracy for logging purposes
  tf.identity(accuracy[1], name='train_accuracy')
  tf.identity(accuracy_top_5[1], name='train_accuracy_top_5')
  tf.compat.v1.summary.scalar('train_accuracy', accuracy[1])
  tf.compat.v1.summary.scalar('train_accuracy_top_5', accuracy_top_5[1])

  return tf.estimator.EstimatorSpec(
      mode=mode,
      predictions=predictions,
      loss=loss,
      train_op=train_op,
      eval_metric_ops=metrics)
############## npu modify begin #############
def init_npu():
  """Initialize npu manually.

  Builds the NPU initialization op and a session whose config registers the
  "NpuOptimizer" custom graph optimizer. Nothing is run here; the caller is
  expected to run `npu_init` (and any shutdown op) in `init_sess`.

  Returns:
    A tuple `(init_sess, npu_init)`:
      `init_sess` session created with the npu init session config.
      `npu_init` npu init op returned by `npu_ops.initialize_system()`.
  """
  npu_init = npu_ops.initialize_system()

  # Use the compat.v1 endpoints, consistent with the rest of this module.
  config = tf.compat.v1.ConfigProto()
  # Turn off the remapping rewriter for this session.
  config.graph_options.rewrite_options.remapping = (
      rewriter_config_pb2.RewriterConfig.OFF)

  custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
  custom_op.name = "NpuOptimizer"
  # npu mix precision attribute, set when using mix precision
  custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes(
      "allow_mix_precision")
  custom_op.parameter_map["use_off_line"].b = True

  init_sess = tf.compat.v1.Session(config=config)
  tf.compat.v1.logging.info('NPU init session config: %s', config)
  tf.compat.v1.logging.info('NPU init op: %s', npu_init)
  return init_sess, npu_init
############## npu modify end ###############
def resnet_main(
    flags_obj, model_function, input_function, dataset_name, num_images,
    shape=None):
  """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    num_images: mapping indexed with the keys 'train' and 'validation'; the
      values are divided by the batch size to derive train/eval step counts.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.

  Returns:
    Dict of results of the run.  Contains the keys `eval_results` and
    `train_hooks`. `eval_results` contains accuracy (top_1) and accuracy_top_5
    (it is an empty dict when `tf.estimator.train_and_evaluate` is used).
    `train_hooks` is a list the instances of hooks used during training.
  """
  model_helpers.apply_clean(flags.FLAGS)

  # Ensures flag override logic is only executed if explicitly triggered.
  if flags_obj.tf_gpu_thread_mode:
    override_flags_and_set_envars_for_gpu_thread_pool(flags_obj)

  # Configures cluster spec for distribution strategy.
  num_workers = distribution_utils.configure_cluster(flags_obj.worker_hosts,
                                                     flags_obj.task_index)

  # Creates session config. allow_soft_placement = True, is required for
  # multi-GPU and is not harmful for other modes.
  session_config = tf.compat.v1.ConfigProto(
      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
      allow_soft_placement=True)

  # NOTE(review): the returned strategy is not referenced again in this
  # function; the call itself is kept unchanged.
  distribution_strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=flags_obj.distribution_strategy,
      num_gpus=flags_core.get_num_gpus(flags_obj),
      all_reduce_alg=flags_obj.all_reduce_alg,
      num_packs=flags_obj.num_packs)

  ############## npu modify begin #############
  # Creates a `NPURunConfig`; the checkpoint interval and the iterations per
  # loop are taken from the flags.
  run_config = NPURunConfig(
      model_dir=flags_obj.model_dir,
      session_config=session_config,
      keep_checkpoint_max=5,
      save_summary_steps=0,
      save_checkpoints_steps=flags_obj.save_checkpoints_steps,
      enable_data_pre_proc=True,
      iterations_per_loop=flags_obj.iterations_per_loop,
      precision_mode='allow_mix_precision',
      hcom_parallel=True
  )
  ############## npu modify end ###############

  # Initializes model with all but the dense layer from pretrained ResNet.
  if flags_obj.pretrained_model_checkpoint_path is not None:
    warm_start_settings = tf.estimator.WarmStartSettings(
        flags_obj.pretrained_model_checkpoint_path,
        vars_to_warm_start='^(?!.*dense)')
  else:
    warm_start_settings = None

  ############## npu modify begin #############
  # Creates a `NPUEstimator` instead of using tf.estimator.Estimator
  classifier = NPUEstimator(
      model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config,
      params={
          'resnet_size': int(flags_obj.resnet_size),
          'data_format': flags_obj.data_format,
          'batch_size': flags_obj.batch_size,
          'resnet_version': int(flags_obj.resnet_version),
          'loss_scale': flags_core.get_loss_scale(flags_obj,
                                                  default_for_fp16=128),
          'dtype': flags_core.get_tf_dtype(flags_obj),
          'fine_tune': flags_obj.fine_tune,
          'num_workers': num_workers,
          'num_gpus': flags_core.get_num_gpus(flags_obj),
      })
  ############## npu modify end ###############

  run_params = {
      'batch_size': flags_obj.batch_size,
      'dtype': flags_core.get_tf_dtype(flags_obj),
      'resnet_size': flags_obj.resnet_size,
      'resnet_version': flags_obj.resnet_version,
      'synthetic_data': flags_obj.use_synthetic_data,
      'train_epochs': flags_obj.train_epochs,
      'num_workers': num_workers,
  }
  if flags_obj.use_synthetic_data:
    dataset_name = dataset_name + '-synthetic'

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info('resnet', dataset_name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size)

  ############## npu modify begin #############
  def input_fn_train(num_epochs, input_context=None):
    # Using dtype=tf.float16 for higher data transmission performance
    # drop_remainder currently only support true
    # batch_size means single card batch instead of global batch size
    return input_function(
        is_training=True,
        data_dir=flags_obj.data_dir,
        batch_size=flags_obj.batch_size,
        num_epochs=num_epochs,
        dtype=tf.float16,
        input_context=input_context,
        drop_remainder=True)

  def input_fn_eval():
    # batch_size means single card batch instead of global batch size
    # Using dtype=tf.float16 for higher data transmission performance
    # drop_remainder currently only support true
    return input_function(
        is_training=False,
        data_dir=flags_obj.data_dir,
        batch_size=flags_obj.batch_size,
        num_epochs=1,
        dtype=tf.float16,
        input_context=True,
        drop_remainder=True)
  ############## npu modify end ###############

  train_epochs = (0 if flags_obj.eval_only or not flags_obj.train_epochs else
                  flags_obj.train_epochs)

  use_train_and_evaluate = flags_obj.use_train_and_evaluate or num_workers > 1

  # Session returned by init_npu(). It is only created inside the manual
  # train/eval loop, so it stays None on the train_and_evaluate path; the
  # final shutdown below checks for that instead of hitting a NameError.
  init_sess = None

  if use_train_and_evaluate:
    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda input_context=None: input_fn_train(
            train_epochs, input_context=input_context),
        hooks=train_hooks,
        max_steps=flags_obj.max_train_steps)
    eval_spec = tf.estimator.EvalSpec(input_fn=input_fn_eval)
    tf.compat.v1.logging.info('Starting to train and evaluate.')
    tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
    # tf.estimator.train_and_evalute doesn't return anything in multi-worker
    # case.
    eval_results = {}
  else:
    if train_epochs == 0:
      # If --eval_only is set, perform a single loop with zero train epochs.
      schedule, n_loops = [0], 1
    else:
      # Compute the number of times to loop while training. All but the last
      # pass will train for `epochs_between_evals` epochs, while the last will
      # train for the number needed to reach `training_epochs`. For instance if
      #   train_epochs = 25 and epochs_between_evals = 10
      # schedule will be set to [10, 10, 5]. That is to say, the loop will:
      #   Train for 10 epochs and then evaluate.
      #   Train for another 10 epochs and then evaluate.
      #   Train for a final 5 epochs (to reach 25 epochs) and then evaluate.
      n_loops = math.ceil(train_epochs / flags_obj.epochs_between_evals)
      schedule = [flags_obj.epochs_between_evals for _ in range(int(n_loops))]
      schedule[-1] = train_epochs - sum(schedule[:-1])  # over counting.

    current_max_steps = 0
    for cycle_index, num_train_epochs in enumerate(schedule):
      tf.compat.v1.logging.info('Starting cycle: %d/%d', cycle_index,
                                int(n_loops))

      ############## npu modify begin #############
      # Accumulate the step budget for this cycle: either the per-device
      # steps for `num_train_epochs` epochs, or the explicit max_train_steps.
      if flags_obj.max_train_steps is None:
        current_max_steps += (
            num_images['train'] / flags_obj.batch_size) * num_train_epochs / flags_core.get_num_gpus(
                flags_obj)
      else:
        current_max_steps += flags_obj.max_train_steps
      ############## npu modify end #############

      # Benchmark log marker emitted at the start of each training cycle.
      hwlog.remark_print(key=hwlog.CURRENT_EPOCH, value=num_train_epochs)

      if num_train_epochs:
        # Since we are calling classifier.train immediately in each loop, the
        # value of num_train_epochs in the lambda function will not be changed
        # before it is used. So it is safe to ignore the pylint error here
        # pylint: disable=cell-var-from-loop
        from hccl.split.api import set_split_strategy_by_idx
        set_split_strategy_by_idx([86, 160])
        classifier.train(
            input_fn=lambda input_context=True: input_fn_train(
                num_train_epochs, input_context=input_context),
            hooks=train_hooks,
            max_steps=current_max_steps)

      ############## npu modify begin #############
      # npu resource will be destroyed when the training is over
      # Reinitialize is needed if using hccl interface before next process
      init_sess, npu_init = init_npu()
      npu_shutdown = npu_ops.shutdown_system()
      init_sess.run(npu_shutdown)
      init_sess.run(npu_init)
      ############## npu modify end ###############

      # flags_obj.max_train_steps is generally associated with testing and
      # profiling. As a result it is frequently called with synthetic data,
      # which will iterate forever. Here the eval is bounded by the number of
      # validation batches so that it always terminates.
      tf.compat.v1.logging.info('Starting to evaluate.')
      eval_results = classifier.evaluate(
          input_fn=input_fn_eval,
          steps=num_images['validation']/flags_obj.batch_size)

      benchmark_logger.log_evaluation_result(eval_results)
      hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP1,
                         value=float(eval_results.get("accuracy")))
      hwlog.remark_print(key=hwlog.EVAL_ACCURACY_TOP5,
                         value=float(eval_results.get("accuracy_top_5")))

      if model_helpers.past_stop_threshold(
          flags_obj.stop_threshold, eval_results['accuracy']):
        break

      ############## npu modify begin #############
      # npu resource will be destroyed when evaluate finish
      # Reinitialize is needed before using hccl interface
      if cycle_index < n_loops-1:
        init_sess, npu_init = init_npu()
        npu_shutdown = npu_ops.shutdown_system()
        init_sess.run(npu_shutdown)
        init_sess.run(npu_init)
      ############## npu modify end ###############

  if flags_obj.export_dir is not None:
    # Exports a saved model for the given classifier.
    export_dtype = flags_core.get_tf_dtype(flags_obj)
    if flags_obj.image_bytes_as_serving_input:
      input_receiver_fn = functools.partial(
          image_bytes_serving_input_fn, shape, dtype=export_dtype)
    else:
      input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
          shape, batch_size=flags_obj.batch_size, dtype=export_dtype)
    classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn,
                                 strip_default_attrs=True)

  ############## npu modify begin #############
  # Final NPU shutdown; only possible when the loop above created a session.
  if init_sess is not None:
    npu_shutdown = npu_ops.shutdown_system()
    init_sess.run(npu_shutdown)
  ############## npu modify end ###############

  stats = {}
  stats['eval_results'] = eval_results
  stats['train_hooks'] = train_hooks

  return stats
def define_resnet_flags(resnet_size_choices=None, dynamic_loss_scale=False,
                        fp16_implementation=False):
  """Register the command-line flags (and validators) used by ResNet.

  Args:
    resnet_size_choices: optional list of allowed values for `--resnet_size`.
      When None the flag is a free-form string, otherwise it is an enum
      restricted to these values.
    dynamic_loss_scale: forwarded to `flags_core.define_performance`.
    fp16_implementation: forwarded to `flags_core.define_performance`.
  """
  wrap = flags_core.help_wrap

  # Flag groups shared with the other official models.
  flags_core.define_base(
      clean=True, train_epochs=True, epochs_between_evals=True,
      stop_threshold=True, num_gpu=True, hooks=True, export_dir=True,
      distribution_strategy=True)
  flags_core.define_performance(
      num_parallel_calls=False,
      inter_op=True,
      intra_op=True,
      synthetic_data=True,
      dtype=True,
      all_reduce_alg=True,
      num_packs=True,
      tf_gpu_thread_mode=True,
      datasets_num_private_threads=True,
      dynamic_loss_scale=dynamic_loss_scale,
      fp16_implementation=fp16_implementation,
      loss_scale=True,
      tf_data_experimental_slack=True,
      max_train_steps=True)
  flags_core.define_image()
  flags_core.define_benchmark()
  flags_core.define_distribution()
  flags.adopt_module_key_flags(flags_core)

  # ResNet-specific flags.
  flags.DEFINE_enum(
      name='resnet_version',
      short_name='rv',
      default='1',
      enum_values=['1', '2'],
      help=wrap('Version of ResNet. (1 or 2) See README.md for details.'))

  flags.DEFINE_bool(
      name='fine_tune',
      short_name='ft',
      default=False,
      help=wrap(
          'If True do not train any parameters except for the final layer.'))

  flags.DEFINE_string(
      name='pretrained_model_checkpoint_path',
      short_name='pmcp',
      default=None,
      help=wrap(
          'If not None initialize all the network except the final layer with '
          'these values'))

  flags.DEFINE_boolean(
      name='eval_only',
      default=False,
      help=wrap('Skip training and only perform evaluation on '
                'the latest checkpoint.'))

  flags.DEFINE_boolean(
      name='image_bytes_as_serving_input',
      default=False,
      help=wrap(
          'If True exports savedmodel with serving signature that accepts '
          'JPEG image bytes instead of a fixed size [HxWxC] tensor that '
          'represents the image. The former is easier to use for serving at '
          'the expense of image resize/cropping being done as part of model '
          'inference. Note, this flag only applies to ImageNet and cannot '
          'be used for CIFAR.'))

  flags.DEFINE_boolean(
      name='use_train_and_evaluate',
      default=False,
      help=wrap(
          'If True, uses `tf.estimator.train_and_evaluate` for the training '
          'and evaluation loop, instead of separate calls to `classifier.train '
          'and `classifier.evaluate`, which is the default behavior.'))

  flags.DEFINE_bool(
      name='enable_lars',
      default=False,
      help=wrap('Enable LARS optimizer for large batch training.'))

  flags.DEFINE_float(
      name='label_smoothing',
      default=0.0,
      help=wrap('Label smoothing parameter used in the softmax_cross_entropy'))

  flags.DEFINE_float(
      name='weight_decay',
      default=1e-4,
      help=wrap('Weight decay coefficiant for l2 regularization.'))

  # --resnet_size is an enum only when the caller restricts the choices.
  resnet_size_kwargs = {
      'name': 'resnet_size',
      'short_name': 'rs',
      'default': '50',
      'help': wrap('The size of the ResNet model to use.'),
  }
  if resnet_size_choices is not None:
    flags.DEFINE_enum(enum_values=resnet_size_choices, **resnet_size_kwargs)
  else:
    flags.DEFINE_string(**resnet_size_kwargs)
@@ -0,0 +1,901 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Contains utility and supporting functions for ResNet.
This module contains ResNet code which does not directly build layers. This
includes dataset management, hyperparameter and optimizer code, and argument
parsing. Code for defining the ResNet layers can be found in resnet_model.py.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import math
import multiprocessing
import os
from absl import flags
import tensorflow as tf
############## npu modify begin #############
from npu_bridge.estimator.npu.npu_config import NPURunConfig
from npu_bridge.estimator.npu.npu_estimator import NPUEstimator
from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
from npu_bridge.estimator import npu_ops
from hccl.manage.api import get_local_rank_id
from hccl.manage.api import get_rank_size
from hccl.manage.api import get_rank_id
from tensorflow.core.protobuf import rewriter_config_pb2
############## npu modify end ###############
from official.r1.resnet import imagenet_preprocessing
from official.r1.resnet import resnet_model
from official.r1.utils import export
from official.utils.flags import core as flags_core
from official.utils.logs import hooks_helper
from official.utils.logs import logger
from official.utils.misc import distribution_utils
from official.utils.misc import model_helpers
################################################################################
# Functions for input processing.
################################################################################
def process_record_dataset(dataset,
                           is_training,
                           batch_size,
                           shuffle_buffer,
                           parse_record_fn,
                           num_epochs=1,
                           dtype=tf.float32,
                           datasets_num_private_threads=None,
                           drop_remainder=False,
                           tf_data_experimental_slack=False):
  """Given a Dataset with raw records, return an iterator over the records.

  The pipeline built here is, in order: optional private threadpool option,
  intra-op parallelism limited to 1, prefetch, shuffle (training only),
  repeat, parse via `parse_record_fn`, batch, prefetch, optional slack option.

  Args:
    dataset: A Dataset representing raw records
    is_training: A boolean denoting whether the input is for training.
    batch_size: The number of samples per batch.
    shuffle_buffer: The buffer size to use when shuffling records. A larger
      value results in better randomness, but smaller values reduce startup
      time and use less memory.
    parse_record_fn: A function that takes a raw record and returns the
      corresponding (image, label) pair.
    num_epochs: Currently unused. The dataset is repeated with `repeat()`
      (no count) instead of `repeat(num_epochs)`.
    dtype: Data type to use for images/features.
    datasets_num_private_threads: Number of threads for a private
      threadpool created for all datasets computation.
    drop_remainder: A boolean indicates whether to drop the remainder of the
      batches. If True, the batch dimension will be static.
    tf_data_experimental_slack: Whether to enable tf.data's
      `experimental_slack` option.

  Returns:
    Dataset of (image, label) pairs ready for iteration.
  """
  # Defines a specific size thread pool for tf.data operations.
  if datasets_num_private_threads:
    options = tf.data.Options()
    options.experimental_threading.private_threadpool_size = (
        datasets_num_private_threads)
    dataset = dataset.with_options(options)
    tf.compat.v1.logging.info('datasets_num_private_threads: %s',
                              datasets_num_private_threads)

  # Disable intra-op parallelism to optimize for throughput instead of latency.
  options = tf.data.Options()
  options.experimental_threading.max_intra_op_parallelism = 1
  dataset = dataset.with_options(options)

  # Prefetches a batch at a time to smooth out the time taken to load input
  # files for shuffling and processing.
  dataset = dataset.prefetch(buffer_size=batch_size)
  if is_training:
    # Shuffles records before repeating to respect epoch boundaries.
    dataset = dataset.shuffle(buffer_size=shuffle_buffer)

  # Repeats the dataset without a count; `num_epochs` is no longer passed to
  # `repeat`, so the number of records consumed is bounded by the caller.
  # NOTE(review): placement outside the `is_training` branch is reconstructed
  # from a whitespace-stripped source - confirm against the original file.
  dataset = dataset.repeat()

  # Parses the raw records into images and labels.
  dataset = dataset.map(
      lambda value: parse_record_fn(value, is_training, dtype),
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
  dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)

  # Operations between the final prefetch and the get_next call to the iterator
  # will happen synchronously during run time. We prefetch here again to
  # background all of the above processing work and keep it out of the
  # critical training path. Setting buffer_size to tf.data.experimental.AUTOTUNE
  # allows DistributionStrategies to adjust how many batches to fetch based
  # on how many devices are present.
  dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

  if tf_data_experimental_slack:
    options = tf.data.Options()
    options.experimental_slack = True
    dataset = dataset.with_options(options)

  return dataset
def get_synth_input_fn(height, width, num_channels, num_classes,
                       dtype=tf.float32):
  """Build an input function that serves a dataset of random data.

  The returned input_fn yields one randomly generated (images, labels) batch
  repeated forever and bypasses all preprocessing, e.g. jpeg decode and copy.
  The host to device copy is still included. This is used to find the upper
  throughput bound when tuning the full input pipeline.

  Args:
    height: Integer height that will be used to create a fake image tensor.
    width: Integer width that will be used to create a fake image tensor.
    num_channels: Integer depth that will be used to create a fake image tensor.
    num_classes: Number of classes that should be represented in the fake labels
      tensor
    dtype: Data type for features/images.

  Returns:
    An input_fn that can be used in place of a real one to return a dataset
    that can be used for iteration.
  """
  image_dims = [height, width, num_channels]

  # pylint: disable=unused-argument
  def input_fn(is_training, data_dir, batch_size, *args, **kwargs):
    """Returns dataset filled with random data."""
    # Synthetic input should be within [0, 255].
    fake_images = tf.random.truncated_normal(
        [batch_size] + image_dims,
        mean=127,
        stddev=60,
        dtype=dtype,
        name='synthetic_inputs')
    fake_labels = tf.random.uniform(
        [batch_size],
        minval=0,
        maxval=num_classes - 1,
        dtype=tf.int32,
        name='synthetic_labels')

    # A single element repeated endlessly, prefetched in the background.
    single_batch = tf.data.Dataset.from_tensors((fake_images, fake_labels))
    return single_batch.repeat().prefetch(
        buffer_size=tf.data.experimental.AUTOTUNE)

  return input_fn
def image_bytes_serving_input_fn(image_shape, dtype=tf.float32):
  """Serving input fn for raw jpeg images.

  Args:
    image_shape: sequence unpacked as (height, width, num_channels) for the
      preprocessed images.
    dtype: dtype of the bounding-box constant and of the mapped image tensor.

  Returns:
    A `TensorServingInputReceiver` whose features are the preprocessed images
    and whose receiver tensor is a string placeholder keyed 'image_bytes'.
  """

  def _to_model_input(raw_jpeg):
    """Preprocess a single raw image."""
    # Bounding box around the whole image.
    full_frame_box = tf.constant(
        [0.0, 0.0, 1.0, 1.0], dtype=dtype, shape=[1, 1, 4])
    img_height, img_width, img_channels = image_shape
    return imagenet_preprocessing.preprocess_image(
        raw_jpeg, full_frame_box, img_height, img_width, img_channels,
        is_training=False)

  # Batch of encoded images supplied by the serving request.
  encoded_images = tf.compat.v1.placeholder(
      dtype=tf.string, shape=[None], name='input_tensor')
  model_inputs = tf.map_fn(
      _to_model_input, encoded_images, dtype=dtype, back_prop=False)
  receiver_tensors = {'image_bytes': encoded_images}
  return tf.estimator.export.TensorServingInputReceiver(
      model_inputs, receiver_tensors)
def override_flags_and_set_envars_for_gpu_thread_pool(flags_obj):
  """Override flags and set env_vars for performance.

  Exports `TF_GPU_THREAD_MODE` / `TF_GPU_THREAD_COUNT` and overwrites
  `inter_op_parallelism_threads` and `datasets_num_private_threads` on
  `flags_obj` with values derived from the logical CPU count and
  `flags_obj.num_gpus`.

  These settings exist to test the difference between using stock settings
  and manual tuning. On systems with small numbers of cpu cores, e.g. under 8
  logical cores, setting up a gpu thread pool with
  `tf_gpu_thread_mode=gpu_private` may perform poorly.

  Args:
    flags_obj: Current flags, which will be adjusted possibly overriding
      what has been set by the user on the command-line.
  """
  log_info = tf.compat.v1.logging.info

  n_logical_cores = multiprocessing.cpu_count()
  log_info('Logical CPU cores: %s', n_logical_cores)

  # One op-scheduling thread per GPU.
  threads_per_gpu = 1
  gpu_pool_threads = threads_per_gpu * flags_obj.num_gpus

  os.environ['TF_GPU_THREAD_MODE'] = flags_obj.tf_gpu_thread_mode
  os.environ['TF_GPU_THREAD_COUNT'] = str(threads_per_gpu)
  log_info('TF_GPU_THREAD_COUNT: %s', os.environ['TF_GPU_THREAD_COUNT'])
  log_info('TF_GPU_THREAD_MODE: %s', os.environ['TF_GPU_THREAD_MODE'])

  # The general thread pool gets whatever the GPU pool does not use.
  cores_left = n_logical_cores - gpu_pool_threads
  flags_obj.inter_op_parallelism_threads = cores_left

  # tf.data additionally gives up 2 threads per GPU, reserved for event
  # monitoring and sending / receiving tensors.
  monitoring_threads = 2 * flags_obj.num_gpus
  flags_obj.datasets_num_private_threads = (
      n_logical_cores - gpu_pool_threads - monitoring_threads)
################################################################################
# Functions for running training/eval/validation loops for the model.
################################################################################
def learning_rate_with_decay(
    batch_size, batch_denom, num_images, boundary_epochs, decay_rates,
    base_lr=0.1, warmup=False):
  """Get a learning rate that decays step-wise as training progresses.

  Args:
    batch_size: the number of examples processed in each training batch.
    batch_denom: this value will be used to scale the base learning rate.
      `base_lr * batch size` is divided by this number, such that when
      batch_denom == batch_size, the initial learning rate will be `base_lr`.
    num_images: total number of images that will be used for training.
    boundary_epochs: list of ints representing the epochs at which we
      decay the learning rate.
    decay_rates: list of floats representing the decay rates to be used
      for scaling the learning rate. It should have one more element
      than `boundary_epochs`, and all elements should have the same type.
    base_lr: Initial learning rate scaled based on batch_denom.
    warmup: Run a 5 epoch warmup to the initial lr.

  Returns:
    Returns a function that takes a single argument - the number of batches
    trained so far (global_step)- and returns the learning rate to be used
    for training the next batch. When `FLAGS.enable_lars` is set the returned
    function implements the LARS polynomial schedule instead of the
    piecewise-constant one.
  """
  initial_learning_rate = base_lr * batch_size / batch_denom
  batches_per_epoch = num_images / batch_size

  # Reduce the learning rate at certain epochs.
  # CIFAR-10: divide by 10 at epoch 100, 150, and 200
  # ImageNet: divide by 10 at epoch 30, 60, 80, and 90
  boundaries = [int(batches_per_epoch * epoch) for epoch in boundary_epochs]
  vals = [initial_learning_rate * decay for decay in decay_rates]

  def learning_rate_fn(global_step):
    """Builds scaled learning rate function with 5 epoch warm up."""
    ############## npu modify begin #############
    # Using int32 for better computing performance
    global_step = tf.cast(global_step, tf.int32)
    ############## npu modify end ###############
    lr = tf.compat.v1.train.piecewise_constant(global_step, boundaries, vals)
    if warmup:
      warmup_steps = int(batches_per_epoch * 5)
      # Linear ramp from 0 to the initial learning rate over the warmup steps.
      warmup_lr = (
          initial_learning_rate * tf.cast(global_step, tf.float32) / tf.cast(
              warmup_steps, tf.float32))
      return tf.cond(pred=global_step < warmup_steps,
                     true_fn=lambda: warmup_lr,
                     false_fn=lambda: lr)
    return lr

  def poly_rate_fn(global_step):
    """Handles linear scaling rule, gradual warmup, and LR decay.

    The learning rate starts at 0, then it increases linearly per step. After
    the warmup epochs (chosen below from `FLAGS.batch_size`) it reaches the
    peak learning rate, which is also chosen from `FLAGS.batch_size`. The
    learning rate is then decayed using a polynomial rate decay schedule with
    power 2.0.

    Args:
      global_step: the current global_step

    Returns:
      returns the current learning rate
    """
    # Learning rate schedule for LARS polynomial schedule
    if flags.FLAGS.batch_size < 8192:
      plr = 5.0
      w_epochs = 5
    elif flags.FLAGS.batch_size < 16384:
      plr = 10.0
      w_epochs = 5
    elif flags.FLAGS.batch_size < 32768:
      plr = 25.0
      w_epochs = 5
    else:
      plr = 32.0
      w_epochs = 14

    w_steps = int(w_epochs * batches_per_epoch)
    wrate = (plr * tf.cast(global_step, tf.float32) / tf.cast(
        w_steps, tf.float32))

    # TODO(pkanwar): use a flag to help calc num_epochs.
    num_epochs = 90
    train_steps = batches_per_epoch * num_epochs

    # Clamp to at least 1 so the decay schedule never sees a non-positive
    # step while still in the warmup phase.
    min_step = tf.constant(1, dtype=tf.int64)
    decay_steps = tf.maximum(min_step, tf.subtract(global_step, w_steps))
    # tf.compat.v1 namespace, consistent with the other schedule calls above.
    poly_rate = tf.compat.v1.train.polynomial_decay(
        plr,
        decay_steps,
        train_steps - w_steps + 1,
        power=2.0)
    return tf.where(global_step <= w_steps, wrate, poly_rate)

  # For LARS we have a new learning rate schedule
  if flags.FLAGS.enable_lars:
    return poly_rate_fn

  return learning_rate_fn
def resnet_model_fn(features, labels, mode, model_class,
                    resnet_size, weight_decay, learning_rate_fn, momentum,
                    data_format, resnet_version, loss_scale,
                    loss_filter_fn=None, dtype=resnet_model.DEFAULT_DTYPE,
                    fine_tune=False, label_smoothing=0.0):
  """Shared functionality for different resnet model_fns.

  Initializes the ResnetModel representing the model layers
  and uses that model to build the necessary EstimatorSpecs for
  the `mode` in question. For training, this means building losses,
  the optimizer, and the train op that get passed into the EstimatorSpec.
  For evaluation and prediction, the EstimatorSpec is returned without
  a train op, but with the necessary parameters for the given mode.

  Args:
    features: tensor representing input images
    labels: tensor representing class labels for all input images
    mode: current estimator mode; should be one of
      `tf.estimator.ModeKeys.TRAIN`, `EVALUATE`, `PREDICT`
    model_class: a class representing a TensorFlow model that has a __call__
      function. We assume here that this is a subclass of ResnetModel.
    resnet_size: A single integer for the size of the ResNet model.
    weight_decay: weight decay loss rate used to regularize learned variables.
    learning_rate_fn: function that returns the current learning rate given
      the current global_step
    momentum: momentum term used for optimization
    data_format: Input format ('channels_last', 'channels_first', or None).
      If set to None, the format is dependent on whether a GPU is available.
    resnet_version: Integer representing which version of the ResNet network to
      use. See README for details. Valid values: [1, 2]
    loss_scale: The factor to scale the loss for numerical stability. A detailed
      summary is present in the arg parser help text. It is only forwarded to
      the mixed-precision graph rewrite; the manual gradient scaling below
      always uses a fixed factor of 512.
    loss_filter_fn: function that takes a string variable name and returns
      True if the var should be included in loss calculation, and False
      otherwise. If None, batch_normalization variables will be excluded
      from the loss.
    dtype: the TensorFlow dtype to use for calculations.
    fine_tune: If True only train the dense layers(final layers).
    label_smoothing: If greater than 0 then smooth the labels.

  Returns:
    EstimatorSpec parameterized according to the input params and the
    current mode.
  """

  # Generate a summary node for the images
  tf.compat.v1.summary.image('images', features, max_outputs=6)

  ############## npu modify begin #############
  # Cast features/images to the data type being used for calculations.
  if features.dtype != dtype:
    features = tf.cast(features, dtype)
  ############## npu modify end ###############

  model = model_class(resnet_size, data_format, resnet_version=resnet_version,
                      dtype=dtype)

  logits = model(features, mode == tf.estimator.ModeKeys.TRAIN)

  # This acts as a no-op if the logits are already in fp32 (provided logits are
  # not a SparseTensor). If dtype is low precision, logits must be cast to
  # fp32 for numerical stability.
  logits = tf.cast(logits, tf.float32)

  predictions = {
      'classes': tf.argmax(input=logits, axis=1),
      'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
  }

  if mode == tf.estimator.ModeKeys.PREDICT:
    # Return the predictions and the specification for serving a SavedModel
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        export_outputs={
            'predict': tf.estimator.export.PredictOutput(predictions)
        })

  # Calculate loss, which includes softmax cross entropy and L2 regularization.
  if label_smoothing != 0.0:
    # The one-hot depth must equal the width of the logits. Take it from the
    # static shape so models that are not 1001 classes wide work too, and fall
    # back to the previous hard-coded 1001 when the width is not known.
    logits_dims = (logits.shape.as_list()
                   if logits.shape.ndims is not None else [None])
    num_classes = logits_dims[-1] or 1001
    one_hot_labels = tf.one_hot(labels, num_classes)
    cross_entropy = tf.compat.v1.losses.softmax_cross_entropy(
        logits=logits, onehot_labels=one_hot_labels,
        label_smoothing=label_smoothing)
  else:
    cross_entropy = tf.compat.v1.losses.sparse_softmax_cross_entropy(
        logits=logits, labels=labels)

  # Create a tensor named cross_entropy for logging purposes.
  tf.identity(cross_entropy, name='cross_entropy')
  tf.compat.v1.summary.scalar('cross_entropy', cross_entropy)

  # If no loss_filter_fn is passed, assume we want the default behavior,
  # which is that batch_normalization variables are excluded from loss.
  def exclude_batch_norm(name):
    return 'batch_normalization' not in name
  loss_filter_fn = loss_filter_fn or exclude_batch_norm

  # Add weight decay to the loss.
  l2_loss = weight_decay * tf.add_n(
      # loss is computed using fp32 for numerical stability.
      [
          tf.nn.l2_loss(tf.cast(v, tf.float32))
          for v in tf.compat.v1.trainable_variables()
          if loss_filter_fn(v.name)
      ])
  tf.compat.v1.summary.scalar('l2_loss', l2_loss)
  loss = cross_entropy + l2_loss

  if mode == tf.estimator.ModeKeys.TRAIN:
    global_step = tf.compat.v1.train.get_or_create_global_step()

    learning_rate = learning_rate_fn(global_step)

    # Create a tensor named learning_rate for logging purposes
    tf.identity(learning_rate, name='learning_rate')
    tf.compat.v1.summary.scalar('learning_rate', learning_rate)

    if flags.FLAGS.enable_lars:
      from tensorflow.contrib import opt as contrib_opt  # pylint: disable=g-import-not-at-top
      optimizer = contrib_opt.LARSOptimizer(
          learning_rate,
          momentum=momentum,
          weight_decay=weight_decay,
          skip_list=['batch_normalization', 'bias'])
    else:
      optimizer = tf.compat.v1.train.MomentumOptimizer(
          learning_rate=learning_rate,
          momentum=momentum
      )

    ############## npu modify begin #############
    optimizer = NPUDistributedOptimizer(optimizer)
    ############## npu modify end ###############

    fp16_implementation = getattr(flags.FLAGS, 'fp16_implementation', None)
    if fp16_implementation == 'graph_rewrite':
      optimizer = (
          tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
              optimizer, loss_scale=loss_scale))

    def _dense_grad_filter(gvs):
      """Only apply gradient updates to the final layer.

      This function is used for fine tuning.

      Args:
        gvs: list of tuples with gradients and variable info

      Returns:
        filtered gradients so that only the dense layer remains
      """
      return [(g, v) for g, v in gvs if 'dense' in v.name]

    # When computing fp16 gradients, often intermediate tensor values are
    # so small, they underflow to 0. To avoid this, we multiply the loss by
    # a scale factor to make these tensor values that many times bigger.
    # NOTE(review): the scale is fixed at 512 and applied unconditionally;
    # the `loss_scale` argument is deliberately not used here, and this
    # manual scaling also runs when the graph rewrite above is enabled.
    manual_loss_scale = 512
    scaled_grad_vars = optimizer.compute_gradients(loss * manual_loss_scale)

    if fine_tune:
      scaled_grad_vars = _dense_grad_filter(scaled_grad_vars)

    # Once the gradient computation is complete we can scale the gradients
    # back to the correct scale before passing them to the optimizer.
    unscaled_grad_vars = [(grad / manual_loss_scale, var)
                          for grad, var in scaled_grad_vars]
    minimize_op = optimizer.apply_gradients(unscaled_grad_vars, global_step)

    # Group the optimizer step with the ops registered in UPDATE_OPS so both
    # run on every training step.
    update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
    train_op = tf.group(minimize_op, update_ops)
  else:
    train_op = None

  ############## npu modify begin #############
  # Using float32 for better performance
  accuracy = tf.compat.v1.metrics.accuracy(tf.cast(labels, tf.float32),
                                           predictions['classes'])
  ############## npu modify end ###############
  accuracy_top_5 = tf.compat.v1.metrics.mean(
      tf.nn.in_top_k(predictions=logits, targets=labels, k=5, name='top_5_op'))
  metrics = {'accuracy': accuracy,
             'accuracy_top_5': accuracy_top_5}

  # Create a tensor named train_accuracy for logging purposes
  tf.identity(accuracy[1], name='train_accuracy')
  tf.identity(accuracy_top_5[1], name='train_accuracy_top_5')
  tf.compat.v1.summary.scalar('train_accuracy', accuracy[1])
  tf.compat.v1.summary.scalar('train_accuracy_top_5', accuracy_top_5[1])

  return tf.estimator.EstimatorSpec(
      mode=mode,
      predictions=predictions,
      loss=loss,
      train_op=train_op,
      eval_metric_ops=metrics)
############## npu modify begin #############
def init_npu():
  """Build the session and init op used to (re)initialize the NPU manually.

  The session is only created here; nothing is run. The caller decides when
  to run the returned init op (and any shutdown op) in the returned session.

  Returns:
    A tuple `(init_sess, npu_init)`:
      `init_sess` session configured with the NpuOptimizer custom optimizer.
      `npu_init` npu init op.
  """
  npu_init = npu_ops.initialize_system()

  # tf.compat.v1 namespace, consistent with the session config built in
  # resnet_main.
  config = tf.compat.v1.ConfigProto()
  # Turn the remapping rewrite pass off for this session.
  config.graph_options.rewrite_options.remapping = (
      rewriter_config_pb2.RewriterConfig.OFF)

  custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
  custom_op.name = "NpuOptimizer"
  # Same precision mode string as the NPURunConfig used for training.
  custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes(
      "allow_mix_precision")
  custom_op.parameter_map["use_off_line"].b = True

  init_sess = tf.compat.v1.Session(config=config)
  return init_sess, npu_init
############## npu modify end ###############
def resnet_main(
    flags_obj, model_function, input_function, dataset_name, num_images, shape=None):
  """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details. `max_train_steps` is filled in on this object when it is
      None and the manual train/eval loop is used.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purpose.
    num_images: dict with the keys 'train' and 'validation' giving the number
      of images in each split. Used to derive the default number of train and
      eval steps when `max_train_steps` is not set.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.

  Returns:
     Dict of results of the run.  Contains the keys `eval_results` and
    `train_hooks`. `eval_results` contains accuracy (top_1) and accuracy_top_5.
    `train_hooks` is a list the instances of hooks used during training.
  """

  model_helpers.apply_clean(flags.FLAGS)

  # Ensures flag override logic is only executed if explicitly triggered.
  if flags_obj.tf_gpu_thread_mode:
    override_flags_and_set_envars_for_gpu_thread_pool(flags_obj)

  # Configures cluster spec for distribution strategy.
  num_workers = distribution_utils.configure_cluster(flags_obj.worker_hosts,
                                                     flags_obj.task_index)

  # Creates session config. allow_soft_placement = True, is required for
  # multi-GPU and is not harmful for other modes.
  session_config = tf.compat.v1.ConfigProto(
      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
      allow_soft_placement=True)

  # NOTE(review): the strategy is built but not handed to NPURunConfig below;
  # the call is kept in case it validates flags - confirm before removing.
  distribution_strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=flags_obj.distribution_strategy,
      num_gpus=flags_core.get_num_gpus(flags_obj),
      all_reduce_alg=flags_obj.all_reduce_alg,
      num_packs=flags_obj.num_packs)

  ############## npu modify begin #############
  # Creates a `NPURunConfig` that checkpoints every 115200 steps
  run_config = NPURunConfig(
      model_dir=flags_obj.model_dir,
      session_config=session_config,
      keep_checkpoint_max=5,
      save_checkpoints_steps=115200,
      enable_data_pre_proc=True,
      iterations_per_loop=100,
      precision_mode='allow_mix_precision',
      hcom_parallel=True
  )
  ############## npu modify end ###############

  # Initializes model with all but the dense layer from pretrained ResNet.
  # NOTE(review): warm_start_settings is not passed to NPUEstimator below, so
  # --pretrained_model_checkpoint_path currently has no effect - confirm
  # whether NPUEstimator accepts `warm_start_from` before wiring it in.
  if flags_obj.pretrained_model_checkpoint_path is not None:
    warm_start_settings = tf.estimator.WarmStartSettings(
        flags_obj.pretrained_model_checkpoint_path,
        vars_to_warm_start='^(?!.*dense)')
  else:
    warm_start_settings = None

  ############## npu modify begin #############
  # Creates a `NPUEstimator` instead of using tf.estimator.Estimator
  classifier = NPUEstimator(
      model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config,
      params={
          'resnet_size': int(flags_obj.resnet_size),
          'data_format': flags_obj.data_format,
          'batch_size': flags_obj.batch_size,
          'resnet_version': int(flags_obj.resnet_version),
          'loss_scale': flags_core.get_loss_scale(flags_obj,
                                                  default_for_fp16=128),
          'dtype': flags_core.get_tf_dtype(flags_obj),
          'fine_tune': flags_obj.fine_tune,
          'num_workers': num_workers,
          'num_gpus': flags_core.get_num_gpus(flags_obj),
      })
  ############## npu modify end ###############

  run_params = {
      'batch_size': flags_obj.batch_size,
      'dtype': flags_core.get_tf_dtype(flags_obj),
      'resnet_size': flags_obj.resnet_size,
      'resnet_version': flags_obj.resnet_version,
      'synthetic_data': flags_obj.use_synthetic_data,
      'train_epochs': flags_obj.train_epochs,
      'num_workers': num_workers,
  }
  if flags_obj.use_synthetic_data:
    dataset_name = dataset_name + '-synthetic'

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info('resnet', dataset_name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size)

  def input_fn_train(num_epochs, input_context=None):
    ############## npu modify begin #############
    # Using dtype=tf.float16 for higher data transmission performance
    # drop_remainder currently only support true
    # batch_size means single card batch instead of global batch size
    return input_function(
        is_training=True,
        data_dir=flags_obj.data_dir,
        batch_size=flags_obj.batch_size,
        num_epochs=num_epochs,
        dtype=tf.float16,
        input_context=input_context,
        drop_remainder=True)

  def input_fn_eval():
    # batch_size means single card batch instead of global batch size
    # Using dtype=tf.float16 for higher data transmission performance
    # drop_remainder currently only support true
    return input_function(
        is_training=False,
        data_dir=flags_obj.data_dir,
        batch_size=flags_obj.batch_size,
        num_epochs=1,
        dtype=tf.float16,
        drop_remainder=True)
    ############## npu modify end ###############

  train_epochs = (0 if flags_obj.eval_only or not flags_obj.train_epochs else
                  flags_obj.train_epochs)

  # Session used for manual NPU init/shutdown. It stays None until the manual
  # loop creates one, so the final shutdown below can tell whether it has to
  # create its own (train_and_evaluate path, or an eval-only run that stops
  # on the threshold in its first cycle).
  init_sess = None

  use_train_and_evaluate = flags_obj.use_train_and_evaluate or num_workers > 1
  if use_train_and_evaluate:
    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda input_context=None: input_fn_train(
            train_epochs, input_context=input_context),
        hooks=train_hooks,
        max_steps=flags_obj.max_train_steps)
    eval_spec = tf.estimator.EvalSpec(input_fn=input_fn_eval)
    tf.compat.v1.logging.info('Starting to train and evaluate.')
    tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
    # tf.estimator.train_and_evalute doesn't return anything in multi-worker
    # case.
    eval_results = {}
  else:
    if train_epochs == 0:
      # If --eval_only is set, perform a single loop with zero train epochs.
      schedule, n_loops = [0], 1
    else:
      # Compute the number of times to loop while training. All but the last
      # pass will train for `epochs_between_evals` epochs, while the last will
      # train for the number needed to reach `training_epochs`. For instance if
      #   train_epochs = 25 and epochs_between_evals = 10
      # schedule will be set to [10, 10, 5]. That is to say, the loop will:
      #   Train for 10 epochs and then evaluate.
      #   Train for another 10 epochs and then evaluate.
      #   Train for a final 5 epochs (to reach 25 epochs) and then evaluate.
      n_loops = math.ceil(train_epochs / flags_obj.epochs_between_evals)
      schedule = [flags_obj.epochs_between_evals for _ in range(int(n_loops))]
      schedule[-1] = train_epochs - sum(schedule[:-1])  # over counting.

    ############## npu modify begin #############
    # Without an explicit step limit, derive the train steps from the train
    # split, the batch size and the device count, and evaluate over the
    # validation split. Kept at this level so max_eval_steps is also defined
    # for eval-only runs.
    if flags_obj.max_train_steps is None:
      flags_obj.max_train_steps = (num_images['train']/flags_obj.batch_size)/flags_core.get_num_gpus(flags_obj)
      max_eval_steps = num_images['validation']/flags_obj.batch_size
    else:
      max_eval_steps = flags_obj.max_train_steps
    ############## npu modify end #############

    for cycle_index, num_train_epochs in enumerate(schedule):
      tf.compat.v1.logging.info('Starting cycle: %d/%d', cycle_index,
                                int(n_loops))

      if num_train_epochs:
        # Since we are calling classifier.train immediately in each loop, the
        # value of num_train_epochs in the lambda function will not be changed
        # before it is used. So it is safe to ignore the pylint error here
        # pylint: disable=cell-var-from-loop
        # max_steps is a cumulative global-step target, hence the scaling by
        # the cycle number.
        classifier.train(
            input_fn=lambda input_context=True: input_fn_train(
                num_train_epochs, input_context=input_context),
            hooks=train_hooks,
            max_steps=flags_obj.max_train_steps*(cycle_index+1))

        ############## npu modify begin #############
        # npu resource will be destroyed when the training is over
        # Reinitialize is needed if using hccl interface before next process
        init_sess, npu_init = init_npu()
        npu_shutdown = npu_ops.shutdown_system()
        init_sess.run(npu_shutdown)
        init_sess.run(npu_init)
        ############## npu modify end ###############

      # flags_obj.max_train_steps is generally associated with testing and
      # profiling. As a result it is frequently called with synthetic data,
      # which will iterate forever. Passing steps=flags_obj.max_train_steps
      # allows the eval (which is generally unimportant in those circumstances)
      # to terminate.  Note that eval will run for max_train_steps each loop,
      # regardless of the global_step count.
      tf.compat.v1.logging.info('Starting to evaluate.')
      eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                         steps=max_eval_steps)

      benchmark_logger.log_evaluation_result(eval_results)

      if model_helpers.past_stop_threshold(
          flags_obj.stop_threshold, eval_results['accuracy']):
        break

      ############## npu modify begin #############
      # npu resource will be destroyed when evaluate finish
      # Reinitialize is needed before using hccl interface
      init_sess, npu_init = init_npu()
      npu_shutdown = npu_ops.shutdown_system()
      init_sess.run(npu_shutdown)
      init_sess.run(npu_init)
      ############## npu modify end ###############

  if flags_obj.export_dir is not None:
    # Exports a saved model for the given classifier.
    export_dtype = flags_core.get_tf_dtype(flags_obj)
    if flags_obj.image_bytes_as_serving_input:
      input_receiver_fn = functools.partial(
          image_bytes_serving_input_fn, shape, dtype=export_dtype)
    else:
      input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
          shape, batch_size=flags_obj.batch_size, dtype=export_dtype)
    classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn,
                                 strip_default_attrs=True)

  ############## npu modify begin #############
  # Final shutdown. Create a session on demand when no manual init session
  # exists yet; previously this raised NameError on those paths.
  if init_sess is None:
    init_sess, _ = init_npu()
  npu_shutdown = npu_ops.shutdown_system()
  init_sess.run(npu_shutdown)
  ############## npu modify end ###############

  stats = {}
  stats['eval_results'] = eval_results
  stats['train_hooks'] = train_hooks

  return stats
def define_resnet_flags(resnet_size_choices=None, dynamic_loss_scale=False,
                        fp16_implementation=False):
  """Register the command-line flags used by the ResNet run loop.

  Args:
    resnet_size_choices: optional list of allowed `resnet_size` values. When
      given, `resnet_size` is defined as an enum flag restricted to these
      values; otherwise it is a free-form string flag.
    dynamic_loss_scale: forwarded to `flags_core.define_performance`.
    fp16_implementation: forwarded to `flags_core.define_performance`.
  """
  wrap = flags_core.help_wrap

  # Shared flag groups from the official flags library.
  flags_core.define_base(clean=True, train_epochs=True,
                         epochs_between_evals=True, stop_threshold=True,
                         num_gpu=True, hooks=True, export_dir=True,
                         distribution_strategy=True)
  flags_core.define_performance(num_parallel_calls=False,
                                inter_op=True,
                                intra_op=True,
                                synthetic_data=True,
                                dtype=True,
                                all_reduce_alg=True,
                                num_packs=True,
                                tf_gpu_thread_mode=True,
                                datasets_num_private_threads=True,
                                dynamic_loss_scale=dynamic_loss_scale,
                                fp16_implementation=fp16_implementation,
                                loss_scale=True,
                                tf_data_experimental_slack=True,
                                max_train_steps=True)
  flags_core.define_image()
  flags_core.define_benchmark()
  flags_core.define_distribution()
  flags.adopt_module_key_flags(flags_core)

  # ResNet-specific flags.
  flags.DEFINE_enum(
      name='resnet_version', short_name='rv', default='1',
      enum_values=['1', '2'],
      help=wrap('Version of ResNet. (1 or 2) See README.md for details.'))
  flags.DEFINE_bool(
      name='fine_tune', short_name='ft', default=False,
      help=wrap(
          'If True do not train any parameters except for the final layer.'))
  flags.DEFINE_string(
      name='pretrained_model_checkpoint_path', short_name='pmcp', default=None,
      help=wrap(
          'If not None initialize all the network except the final layer with '
          'these values'))
  flags.DEFINE_boolean(
      name='eval_only', default=False,
      help=wrap('Skip training and only perform evaluation on '
                'the latest checkpoint.'))
  flags.DEFINE_boolean(
      name='image_bytes_as_serving_input', default=False,
      help=wrap(
          'If True exports savedmodel with serving signature that accepts '
          'JPEG image bytes instead of a fixed size [HxWxC] tensor that '
          'represents the image. The former is easier to use for serving at '
          'the expense of image resize/cropping being done as part of model '
          'inference. Note, this flag only applies to ImageNet and cannot '
          'be used for CIFAR.'))
  flags.DEFINE_boolean(
      name='use_train_and_evaluate', default=False,
      help=wrap(
          'If True, uses `tf.estimator.train_and_evaluate` for the training '
          'and evaluation loop, instead of separate calls to `classifier.train '
          'and `classifier.evaluate`, which is the default behavior.'))
  flags.DEFINE_bool(
      name='enable_lars', default=False,
      help=wrap('Enable LARS optimizer for large batch training.'))
  flags.DEFINE_float(
      name='label_smoothing', default=0.0,
      help=wrap('Label smoothing parameter used in the softmax_cross_entropy'))
  flags.DEFINE_float(
      name='weight_decay', default=1e-4,
      help=wrap('Weight decay coefficiant for l2 regularization.'))

  # `resnet_size` is an enum only when the caller restricts the choices.
  size_flag_kwargs = {
      'name': 'resnet_size',
      'short_name': 'rs',
      'default': '50',
      'help': wrap('The size of the ResNet model to use.'),
  }
  if resnet_size_choices is not None:
    flags.DEFINE_enum(enum_values=resnet_size_choices, **size_flag_kwargs)
  else:
    flags.DEFINE_string(**size_flag_kwargs)
@@ -0,0 +1,581 @@
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:20.962.831 [tdt/device/../common/src/log.cpp:158][TSDaemon] begin to send heartbeat to appmon,[tdt/device/src/tsd/tsdaemon.cpp:1580:SendHeartBeatToAppMon]8462 Msg: running ok
[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:20.963.000 [tdt/device/../common/src/log.cpp:149][TsdEVENT] send heartbeat to appmon success,[tdt/device/src/tsd/tsdaemon.cpp:1587:SendHeartBeatToAppMon]8462
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.243.682 [tdt/device/../common/src/log.cpp:158][HdcSever] drv accept an session,[tdt/device/../common/src/hdc_server.cpp:330:AcceptHdcSession]8454 Msg: running ok
[INFO] HDC(8380,tsdaemon):2020-05-12-11:05:22.243.730 [hardware/dev_plat/../dev_plat/devhdc/hdc_core.c:1609][drvHdcSetSessionReference:1609] >>> session 55, pid 8380
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.243.752 [tdt/device/../common/src/log.cpp:158][HdcSever] drvHdcSetSessionReference success,[tdt/device/../common/src/hdc_server.cpp:342:AcceptHdcSession]8454 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.243.772 [tdt/device/../common/src/log.cpp:158][HdcSever] drv accept an session and drvHdcSetSessionReference success, sessionId=1,[tdt/device/../common/src/hdc_server.cpp:351:AcceptHdcSession]8454 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.243.788 [tdt/device/../common/src/log.cpp:158][HdcSever] accept an session sessionId=1, open recv thread,[tdt/device/../common/src/hdc_server.cpp:279:Accept]8454 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.243.823 [tdt/device/../common/src/log.cpp:158]HdcServer::AcceptConnection Start,[tdt/device/../common/src/hdc_server.cpp:310:AcceptHdcSession]8454 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.243.854 [tdt/device/../common/src/log.cpp:158]HdcServer::RecvData thread = 281470605762992,[tdt/device/../common/src/hdc_server.cpp:154:RecvData]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.243.945 [tdt/device/../common/src/log.cpp:158]tsdaemon get process sign successfully, procpid:40927 signSize:48,[tdt/device/src/tsd/tsdaemon.cpp:901:FmkToTsdMsg]30221 Msg: running ok
[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:22.243.963 [tdt/device/../common/src/log.cpp:149][TsdEVENT]FmkToTsdMsg dev[0] msg[6] sessionId[1] realDev[0] fmkSignPid[40927] profilingMode[0] rankSize[1],[tdt/device/src/tsd/tsdaemon.cpp:905:FmkToTsdMsg]30221
[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:22.243.982 [tdt/device/../common/src/log.cpp:149][TsdEVENT]From FMK Start >>>>>>>>>> TSD dev[0] sessionId[1] realDev[0] fmkPid[40927] rankSize[1],[tdt/device/src/tsd/tsdaemon.cpp:853:FmkToTsdMsgProc]30221
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.000 [tdt/device/../common/src/log.cpp:158][TSDaemon] isAllLastRcvThreadClean_ value:0,[tdt/device/src/tsd/tsdaemon.cpp:819:CleanAllLastRcvThreads]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.013 [tdt/device/../common/src/log.cpp:158][TSDPPCSER] JoinAllPPCRcvThreads() enter [threadSize=1]!,[tdt/device/src/tsd/ppc_server.cpp:96:JoinAllPPCRcvThreads]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.039 [tdt/device/../common/src/log.cpp:158][TSDPPCSER] JoinAllPPCRcvThreads() [ppc tid=281470588977584] [threadSize=1] [freeThreadSize=1].,[tdt/device/src/tsd/ppc_server.cpp:105:JoinAllPPCRcvThreads]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.056 [tdt/device/../common/src/log.cpp:158][TSDPPCSER] JoinAllPPCRcvThreads() [free tid=281470588977584].,[tdt/device/src/tsd/ppc_server.cpp:111:JoinAllPPCRcvThreads]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.071 [tdt/device/../common/src/log.cpp:158][TSDPPCSER] JoinAllPPCRcvThreads() Find free tid and joinable.,[tdt/device/src/tsd/ppc_server.cpp:114:JoinAllPPCRcvThreads]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.086 [tdt/device/../common/src/log.cpp:158][TSDPPCSER] JoinAllPPCRcvThreads() exit [threadSize=0] [freeThreadSize=0].,[tdt/device/src/tsd/ppc_server.cpp:129:JoinAllPPCRcvThreads]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.101 [tdt/device/../common/src/log.cpp:158][TSDaemon] CleanTsdRcvHdcThreads() enter [threadSize=2]!,[tdt/device/src/tsd/tsdaemon.cpp:333:CleanTsdRcvHdcThreads]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.120 [tdt/device/../common/src/log.cpp:158][TSDaemon] CleanTsdRcvHdcThreads() [tid=281470597370288] [threadSize=2]!,[tdt/device/src/tsd/tsdaemon.cpp:341:CleanTsdRcvHdcThreads]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.139 [tdt/device/../common/src/log.cpp:158][TSDaemon] CleanTsdRcvHdcThreads() [tid=281470572192176] [threadSize=2]!,[tdt/device/src/tsd/tsdaemon.cpp:341:CleanTsdRcvHdcThreads]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.152 [tdt/device/../common/src/log.cpp:158][TSDaemon] CleanTsdRcvHdcThreads() exit [threadSize=0]!,[tdt/device/src/tsd/tsdaemon.cpp:346:CleanTsdRcvHdcThreads]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.167 [tdt/device/../common/src/log.cpp:158][TSDaemon] CleanTsdRcvPPCThreads() enter [threadSize=2]!,[tdt/device/src/tsd/tsdaemon.cpp:356:CleanTsdRcvPPCThreads]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.181 [tdt/device/../common/src/log.cpp:158][TSDaemon] CleanTsdRcvPPCThreads() [tid=281470580584880] [threadSize=2]!,[tdt/device/src/tsd/tsdaemon.cpp:364:CleanTsdRcvPPCThreads]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.197 [tdt/device/../common/src/log.cpp:158][TSDaemon] CleanTsdRcvPPCThreads() [tid=281470563799472] [threadSize=2]!,[tdt/device/src/tsd/tsdaemon.cpp:364:CleanTsdRcvPPCThreads]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.241 [tdt/device/../common/src/log.cpp:158][TSDaemon] CleanTsdRcvPPCThreads() exit [threadSize=0]!,[tdt/device/src/tsd/tsdaemon.cpp:369:CleanTsdRcvPPCThreads]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.259 [tdt/device/../common/src/log.cpp:158][TSDaemon] StartSubProcess deviceId: 0, fmkPid: 40927, sessionId: 1, state: 0,[tdt/device/src/tsd/tsdaemon.cpp:630:StartSubProcess]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.273 [tdt/device/../common/src/log.cpp:158][TSDaemon] StartSubProcess rankSize: 1,,[tdt/device/src/tsd/tsdaemon.cpp:635:StartSubProcess]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.285 [tdt/device/../common/src/log.cpp:158][TSDaemon] Process HCCP is abandoned to start, the rank size is 1,[tdt/device/src/tsd/tsdaemon.cpp:651:StartSubProcess]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.337 [tdt/device/../common/src/log.cpp:158][TSDaemon] start delete file, direct is /home/HwHiAiUser/hdcd/device0/,[tdt/device/src/tsd/tsdaemon.cpp:1878:DeleteFileByPath]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.367 [tdt/device/../common/src/log.cpp:158][TSDaemon] start scan file, ent name is .,[tdt/device/src/tsd/tsdaemon.cpp:1887:DeleteFileByPath]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.382 [tdt/device/../common/src/log.cpp:158][TSDaemon] start scan file, ent name is ..,[tdt/device/src/tsd/tsdaemon.cpp:1887:DeleteFileByPath]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.398 [tdt/device/../common/src/log.cpp:158][TSDaemon] start scan file, ent name is etc,[tdt/device/src/tsd/tsdaemon.cpp:1887:DeleteFileByPath]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.412 [tdt/device/../common/src/log.cpp:158][TSDaemon] start scan file, ent name is upgrade,[tdt/device/src/tsd/tsdaemon.cpp:1887:DeleteFileByPath]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.457 [tdt/device/../common/src/log.cpp:158][TSDaemon] ExecuteStart() [tid=281470563799472]!,[tdt/device/src/tsd/tsdaemon.cpp:477:ExecuteStart]30222 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.244.477 [tdt/device/../common/src/log.cpp:158][TSDaemon] check pathName[/var/aicpu_scheduler], pathLen[20] procName[aicpu_scheduler], len[15], MAX_LEN[256] ,[tdt/device/src/tsd/tsdaemon.cpp:1514:CheckProcessInputParam]30222 Msg: running ok
[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:22.245.278 [tdt/device/../common/src/log.cpp:149][TsdEVENT]#### Start TSD->SubProcess[PROC] Start Msg Device[0] Proc[aicpu_scheduler] fmkPid[40927] fatherPid[8380] subPid[30223] #### profilingMode is[0],[tdt/device/src/tsd/tsdaemon.cpp:498:ExecuteStart]30222
[OPLOG] TDT(8380,tsdaemon):2020-05-12-11:05:22.245.328 [tdt/device/../common/src/log.cpp:151][tdt/device/src/tsd/tsdaemon.cpp:499:ExecuteStart]30222 alloc resource {devOS:[30223]} for {dev:0}
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.245.390 [tdt/device/../common/src/log.cpp:158][TSDaemon] SetTsdToFmkMsg:deviceId[0], sessionId[1], subProcPid[30223],[tdt/device/src/tsd/tsdaemon.cpp:774:SetTsdToFmkMsg]30222 Msg: running ok
[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:22.279.681 [hardware/dev_plat/../dev_plat/devhdc/hdc_cfg_parse.c:190][CfgFileOpen:190] >>> /etc/hdcBasic.cfg not exist
[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:22.279.712 [hardware/dev_plat/../dev_plat/devhdc/hdc_core.c:554][hdcInit:554] >>> HDC pcie init,use default segment(524288)
[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:22.279.765 [hardware/dev_plat/../dev_plat/devhdc/hdc_core.c:539][hdcPcieInit:539] >>> after init hdc segment 524224, socket segment 0
[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:22.279.780 [hardware/dev_plat/../dev_plat/devhdc/hdc_core.c:574][hdcInit:574] >>> HDC init success.
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.281.768 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.646658] [hdcdrv] [hdcdrv_config 2288] <aicpu_scheduler:30223> pid 30223 use segment 524224.
[WARNING] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.284.329 [tdt/device/../common/src/log.cpp:143]Register data type failed: hiaiSerializeFunc is existed,[tdt/common/common_inc/data_type_reg.h:192:Register]30223 Msg: func has already existed
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.284.723 [aicpu/aicpu_device/aicpu_schedule/compute_process/main.cc:185][AICPUFW] [main 185] Compute process(cloud) compile time is 02:13:48 Apr 26 2020
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.284.751 [aicpu/aicpu_device/aicpu_schedule/compute_process/main.cc:168][AICPUFW] [ParseArgs 168] Parse args success. deviceId=0, pid=40927, pidSign=e9b203cc443d80564c8c88a0d111cb95145fae36b00d1ec1, profilingMode=0.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.284.960 [hardware/dev_plat/../dev_core/devdrv/devdrv_container.c:193][devdrv] [devdrv_do_container 193] para.num(4).
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.284.983 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:83][AICPUFW] [Start 83] Aicpu_scheduler will start, hostpid=40927, deviceId=0, hostDeviceId = 0.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.284.999 [hardware/dev_plat/aicpufw/aicpufw_api.c:125][AICPUFW] [drvDevBindPid 125] drvDevBindPid enter: chip_id = 0, hostpid=40927, mode=0.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.285.015 [hardware/dev_plat/aicpufw/aicpufw_dev.c:348][AICPUFW] [aicpufw_dev_bind_pid 348] chip0 aicpu bind pid (40927).
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.285.805 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.651711] [devdrv] [devdrv_manager_container_init_devlist_ns 1139] <aicpu_scheduler:30223> num(0), dev(0)
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.285.825 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.651716] [devdrv] [devdrv_manager_container_init_devlist_ns 1139] <aicpu_scheduler:30223> num(1), dev(1)
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.285.836 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.651719] [devdrv] [devdrv_manager_container_init_devlist_ns 1139] <aicpu_scheduler:30223> num(2), dev(2)
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.285.846 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.651722] [devdrv] [devdrv_manager_container_init_devlist_ns 1139] <aicpu_scheduler:30223> num(3), dev(3)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.370.966 [hardware/dev_plat/aicpufw/aicpufw_dev.c:76][AICPUFW] [aicpufw_dev_open 76] chip_id:0 is opened success, fd=18.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.371.155 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:283][AICPUFW] [InitFd 283] InitFd begin, deviceId=0.
[INFO] DP(30223,aicpu_scheduler):2020-05-12-11:05:22.371.186 [datapreprocess/src/task_queue.cc:58][DP_PREPROCESS] [I] [datapreprocess/src/task_queue.cc:58] Begin create task queue eventfd.
[INFO] DP(30223,aicpu_scheduler):2020-05-12-11:05:22.371.209 [datapreprocess/src/task_queue.cc:70][DP_PREPROCESS] [I] [datapreprocess/src/task_queue.cc:70] End create task queue eventfd 19.
[INFO] DP(30223,aicpu_scheduler):2020-05-12-11:05:22.371.224 [datapreprocess/src/task_queue.cc:58][DP_PREPROCESS] [I] [datapreprocess/src/task_queue.cc:58] Begin create task queue eventfd.
[INFO] DP(30223,aicpu_scheduler):2020-05-12-11:05:22.371.241 [datapreprocess/src/task_queue.cc:70][DP_PREPROCESS] [I] [datapreprocess/src/task_queue.cc:70] End create task queue eventfd 20.
[TRACE] DP(30223,aicpu_scheduler):2020-05-12-11:05:22.371.256 [status:START] [datapreprocess/src/task_queue.cc:262]DP_PREPROCESS module has been initialized
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.371.275 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:315][AICPUFW] [InitFd 315] InitFd end, deviceId=0.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.371.288 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:100][AICPUFW] [Start 100] Begin start aicpu work task, deviceId=0, hostpid=40927.
[INFO] DEVMM(30223,aicpu_scheduler):2020-05-12-11:05:22.371.300 [hardware/dev_plat/../dev_plat/devmm/agentmm/agentmm_svm.c:137][drvMemInitSvmDevice 137] <curpid:30223,0xfe8dc010> init svm start pid=40927.
[INFO] DEVMM(30223,aicpu_scheduler):2020-05-12-11:05:22.372.570 [hardware/dev_plat/../dev_plat/devmm/agentmm/agentmm_svm.c:120][devmm_init_svm_device 120] <curpid:30223,0xfe8dc010> init svm (hpid:40927) succ.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.593 [hardware/dev_plat/aicpufw/aicpufw_api.c:89][AICPUFW] [drvCreateAicpuWorkTasks 89] drvCreateAicpuWorkTasks enter: chip_id = 0, pid=40927, mode=0.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.610 [hardware/dev_plat/aicpufw/aicpufw_api.c:103][AICPUFW] [drvCreateAicpuWorkTasks 103] chip[0] start load kernel serve, pid=40927.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.739 [hardware/dev_plat/aicpufw/aicpufw_dev.c:226][AICPUFW] [aicpufw_dev_register_pid 226] chip0 register pid (40927).
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.839 [hardware/dev_plat/../dev_core/devdrv/devdrv_manager.c:838][devdrv] [drvGetCpuInfo 838] Dev[1] cpu info:1 14 1 4 0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.875 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65531)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.891 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65515)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.902 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65530)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.914 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65514)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.923 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65529)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.935 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65513)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.943 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65528)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.955 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65512)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.964 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65527)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.976 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65511)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.985 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65526)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.372.998 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65510)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.007 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65525)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.019 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65509)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.029 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65524)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.041 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65508)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.050 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65523)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.061 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65507)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.070 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65522)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.081 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65506)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.090 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65521)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.101 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65505)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.110 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65520)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.122 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65504)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.130 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65519)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.142 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65503)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.150 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(65518)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.162 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(65502)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.170 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(0)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.182 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(0)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.191 [hardware/dev_plat/aicpufw/aicpufw_dev.c:209][AICPUFW] [aicpufw_dev_get_info 209] _ts_irq(0)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.202 [hardware/dev_plat/aicpufw/aicpufw_dev.c:210][AICPUFW] [aicpufw_dev_get_info 210] _cpu_irq(0)
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.215 [hardware/dev_plat/aicpufw/aicpufw_dev.c:142][AICPUFW] [aicpufw_dev_mmap 142] mmap opened fd=18.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.227 [hardware/dev_plat/aicpufw/aicpufw_dev.c:152][AICPUFW] [aicpufw_dev_mmap 152] start mmap, fd=18, offset: 4096, size: 258048.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.255 [hardware/dev_plat/aicpufw/aicpufw_dev.c:155][AICPUFW] [aicpufw_dev_mmap 155] finish mmap, fd=18, offset: 4096, size: 258048, addr: 0x0xfffefe877000.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.270 [hardware/dev_plat/aicpufw/aicpufw_thread.c:121][AICPUFW] [aicpufw_thread_data_init 121] chip[0] ts[0] finish mmap sram_offset[4096] sram_size[258048], ret[0]
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.287 [hardware/dev_plat/aicpufw/aicpufw_dev.c:142][AICPUFW] [aicpufw_dev_mmap 142] mmap opened fd=18.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.297 [hardware/dev_plat/aicpufw/aicpufw_dev.c:152][AICPUFW] [aicpufw_dev_mmap 152] start mmap, fd=18, offset: 262144, size: 1048576.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.323 [hardware/dev_plat/aicpufw/aicpufw_dev.c:155][AICPUFW] [aicpufw_dev_mmap 155] finish mmap, fd=18, offset: 262144, size: 1048576, addr: 0x0xfffefe777000.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.338 [hardware/dev_plat/aicpufw/aicpufw_thread.c:142][AICPUFW] [aicpufw_thread_data_init 142] chip[0] chip_info.chip_id is 0x6528.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.354 [hardware/dev_plat/aicpufw/aicpufw_dev.c:142][AICPUFW] [aicpufw_dev_mmap 142] mmap opened fd=18.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.368 [hardware/dev_plat/aicpufw/aicpufw_dev.c:152][AICPUFW] [aicpufw_dev_mmap 152] start mmap, fd=18, offset: 1310720, size: 4096.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.385 [hardware/dev_plat/aicpufw/aicpufw_dev.c:155][AICPUFW] [aicpufw_dev_mmap 155] finish mmap, fd=18, offset: 1310720, size: 4096, addr: 0x0xfffeffffa000.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.400 [hardware/dev_plat/aicpufw/aicpufw_thread.c:165][AICPUFW] [aicpufw_thread_data_init 165] finish mmap chip[0] ts[0] sram[0x0xfffefe877000]
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.420 [hardware/dev_plat/aicpufw/aicpufw_thread.c:710][AICPUFW] [aicpufw_thread_create 710] chip0 aicpu num: 14, first_aicpu: 2.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.441 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=0 begin
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.454 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:0, ts_ind:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.852 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:0, ret:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.872 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[0] id 281470656012688
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.874 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.737826] [aicpufw-drv] [aicpufw_init_dfx 467] <aicpu_scheduler:30223>there are 2 processes open,current tgid: 30223
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.881 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=0 end ret=0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.892 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=1 begin
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.896 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.739661] [aicpufw-drv] [aicpufw_drv_add_match_info_check 1240] <aicpu_scheduler:30223>register pid(40927) ts_index(0) monitor_is_running(1).
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.900 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:1, ts_ind:0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.910 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.739665] [aicpufw-drv] [aicpufw_drv_add_match_info 1282] <aicpu_scheduler:30223>register pid(40927) ts_index(0) monitor_is_running(1).
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.921 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.739680] [aicpufw-drv] [aicpufw_drv_get_moniter_info 1583] <aicpu_m_ioctl:8314>aicpufw event happened. dev_id:0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.930 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.739733] [devdrv] [devdrv_manager_get_cpu_info 1927] <aicpu_scheduler:30223> aicpu_num=14, ccpu_num=1
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.941 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.740140] [aicpufw-drv] [aicpufw_drv_mmap 1123] <aicpu_scheduler:30223>mmap_sram,ts_index=0, offset=4096, ts_size=4096.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.940 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:1, ret:0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.950 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.740143] [aicpufw-drv] [aicpufw_drv_mmap_sram 974] <aicpu_scheduler:30223>sram status mem: virt_addr = 0xfffefe877000, tgid = 30223, size = 258048,offset = 4096, numa node = 0, ts = 0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.952 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[1] id 281470647619984
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.962 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=1 end ret=0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.964 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.740150] [aicpufw-drv] [aicpufw_drv_mmap_sram 995] <aicpu_scheduler:30223>finish sram status mem: virt_addr = 0xfffefe877000, tgid = 30223, size = 258048,offset = 4096, numa node = 0, ts = 0, ret = 0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.973 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=2 begin
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.976 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.740210] [aicpufw-drv] [aicpufw_drv_mmap 1127] <aicpu_scheduler:30223>mmap_gicd,ts_index=0, offset=262144, ts_size=4096, sram_size=258048.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.373.983 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:2, ts_ind:0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.988 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.740212] [aicpufw-drv] [aicpufw_drv_mmap_gicd 926] <aicpu_scheduler:30223>gicd status mem: virt_addr = 0xfffefe777000, tgid = 30223, size = 1048576,offset = 262144, numa node = 0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.373.999 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.740278] [aicpufw-drv] [aicpufw_drv_mmap 1132] <aicpu_scheduler:30223>mmap_gicr,ts_index=0, offset=1310720, ts_size=4096, sram_size=258048, gicd_size=1048576.
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.374.008 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.740280] [aicpufw-drv] [aicpufw_drv_mmap_gicr 892] <aicpu_scheduler:30223>ts gicr mem: virt_addr = 0xfffeffffa000, tgid = 30223, size = 4096,offset = 1310720, numa node = 0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.011 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:2, ret:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.023 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[2] id 281470639227280
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.033 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=2 end ret=0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.042 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=3 begin
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.051 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:3, ts_ind:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.079 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:3, ret:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.090 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[3] id 281470630834576
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.099 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=3 end ret=0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.109 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=4 begin
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.118 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:4, ts_ind:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.154 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:4, ret:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.168 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[4] id 281470622441872
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.178 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=4 end ret=0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.190 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=5 begin
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.199 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:5, ts_ind:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.230 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:5, ret:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.243 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[5] id 281470614049168
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.253 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=5 end ret=0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.265 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=6 begin
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.275 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:6, ts_ind:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.307 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:6, ret:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.321 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[6] id 281470605656464
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.330 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=6 end ret=0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.342 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=7 begin
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.351 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:7, ts_ind:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.381 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:7, ret:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.395 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[7] id 281470597263760
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.404 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=7 end ret=0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.415 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=8 begin
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.425 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:8, ts_ind:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.455 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:8, ret:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.469 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[8] id 281470588871056
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.478 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=8 end ret=0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.490 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=9 begin
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.499 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:9, ts_ind:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.528 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:9, ret:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.541 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[9] id 281470580478352
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.550 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=9 end ret=0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.561 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=10 begin
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.571 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:10, ts_ind:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.599 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:10, ret:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.612 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[10] id 281470572085648
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.621 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=10 end ret=0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.633 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=11 begin
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.642 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:11, ts_ind:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.676 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:11, ret:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.690 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[11] id 281470563692944
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.699 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=11 end ret=0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.711 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=12 begin
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.720 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:12, ts_ind:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.748 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:12, ret:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.762 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[12] id 281470555300240
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.771 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=12 end ret=0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.783 [hardware/dev_plat/aicpufw/aicpufw_thread.c:720][AICPUFW] [aicpufw_thread_create 720] pthread_create for aicpu_index=13 begin
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.792 [hardware/dev_plat/aicpufw/aicpufw_thread.c:656][AICPUFW] [aicpufw_thread_create_one 656] thread_create_one chip_id:0, aicpu_index:13, ts_ind:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.820 [hardware/dev_plat/aicpufw/aicpufw_thread.c:682][AICPUFW] [aicpufw_thread_create_one 682] thread_create_one aicpu_index:13, ret:0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.833 [hardware/dev_plat/aicpufw/aicpufw_thread.c:689][AICPUFW] [aicpufw_thread_create_one 689] thread[13] id 281470546907536
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.842 [hardware/dev_plat/aicpufw/aicpufw_thread.c:722][AICPUFW] [aicpufw_thread_create 722] pthread_create for aicpu_index=13 end ret=0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.854 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.864 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.876 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index1
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.885 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index1
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.897 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index2
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.907 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index2
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.918 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index3
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.928 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index3
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.940 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index4
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.949 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index4
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.961 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index5
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.970 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index5
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.982 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index6
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.374.992 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index6
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.003 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index7
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.012 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index7
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.024 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index8
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.033 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index8
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.045 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index9
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.054 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index9
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.065 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index10
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.075 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index10
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.086 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index11
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.095 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index11
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.107 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index12
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.117 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index12
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.129 [hardware/dev_plat/aicpufw/aicpufw_thread.c:731][AICPUFW] [aicpufw_thread_create 731] sem_post start chip_id=0, ts_ind=0, aicpu_index13
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.138 [hardware/dev_plat/aicpufw/aicpufw_thread.c:733][AICPUFW] [aicpufw_thread_create 733] sem_post end chip_id=0, ts_ind=0, aicpu_index13
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.150 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.217 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[ 5] begin, tid: 30230.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.329 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[ 6] begin, tid: 30231.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.366 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 7, chip 0, ts 0, aicpu_index 5, thread id 30230
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.411 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 5
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.443 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[ 3] begin, tid: 30228.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.503 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[ 8] begin, tid: 30233.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.550 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[ 9] begin, tid: 30234.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.602 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[ 4] begin, tid: 30229.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.634 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 8, chip 0, ts 0, aicpu_index 6, thread id 30231
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.673 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 6
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.690 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 6
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.706 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 6
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.721 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 6
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.741 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 5
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.762 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 5
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.776 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 5
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.799 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 11, chip 0, ts 0, aicpu_index 9, thread id 30234
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.824 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 9
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.841 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 9
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.855 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 9
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.870 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 9
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.900 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[ 0] begin, tid: 30225.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.375.963 [hardware/dev_plat/../dev_core/devdrv/devdrv_aicpu.c:449][devdrv] [devdrv_load_kernel_serve_thread 449] thread for load kernel start.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.027 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[11] begin, tid: 30236.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.081 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[12] begin, tid: 30237.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.127 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[ 7] begin, tid: 30232.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.179 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[13] begin, tid: 30238.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.204 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 5, chip 0, ts 0, aicpu_index 3, thread id 30228
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.231 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 3
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.247 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 3
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.259 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 3
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.273 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 3
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.291 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 6, chip 0, ts 0, aicpu_index 4, thread id 30229
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.316 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 4
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.332 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 4
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.346 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 4
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.362 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 4
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.391 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 15, chip 0, ts 0, aicpu_index 13, thread id 30238
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.431 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 13
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.447 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 13
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.462 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 13
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.477 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 13
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.500 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 2, chip 0, ts 0, aicpu_index 0, thread id 30225
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.528 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.545 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.559 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.578 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.604 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[10] begin, tid: 30235.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.657 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[ 2] begin, tid: 30227.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.706 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:181][AICPUFW] [EventThreadTask 181] Aicpu device[0]:thread[ 1] begin, tid: 30226.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.731 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 13, chip 0, ts 0, aicpu_index 11, thread id 30236
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.760 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 3, chip 0, ts 0, aicpu_index 1, thread id 30226
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.785 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 1
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.801 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 1
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.815 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 1
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.830 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 1
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.851 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 4, chip 0, ts 0, aicpu_index 2, thread id 30227
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.878 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 2
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.894 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 2
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.908 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 2
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.925 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 2
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.945 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index0
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.963 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index1
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.978 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index1
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.376.993 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index2
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.008 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index2
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.023 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index3
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.039 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index3
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.054 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index4
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.070 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index4
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.085 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index5
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.099 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index5
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.114 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index6
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.130 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index6
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.145 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index7
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.168 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 9, chip 0, ts 0, aicpu_index 7, thread id 30232
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.195 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 7
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.211 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 7
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.225 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 7
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.242 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 7
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.261 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 10, chip 0, ts 0, aicpu_index 8, thread id 30233
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.284 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 8
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.300 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 8
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.315 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 8
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.330 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 8
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.351 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 12, chip 0, ts 0, aicpu_index 10, thread id 30235
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.377 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 10
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.393 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 10
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.407 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 10
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.422 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 10
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.441 [hardware/dev_plat/aicpufw/aicpufw_thread.c:475][AICPUFW] [aicpufw_thread_set_affinity 475] thread_set_affinity, bind_cpu_index 14, chip 0, ts 0, aicpu_index 12, thread id 30237
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.465 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 12
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.481 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 12
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.496 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 12
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.510 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 12
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.527 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index7
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.543 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index8
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.558 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index8
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.572 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index9
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.587 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index9
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.602 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index10
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.618 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index10
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.633 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index11
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.652 [hardware/dev_plat/aicpufw/aicpufw_thread.c:536][AICPUFW] [aicpufw_thread_callback 536] sem_wait start, chip_id: 0, ts_ind: 0, aicpu index: 11
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.668 [hardware/dev_plat/aicpufw/aicpufw_thread.c:538][AICPUFW] [aicpufw_thread_callback 538] sem_wait end, chip_id: 0, ts_ind: 0, aicpu index: 11
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.692 [hardware/dev_plat/aicpufw/aicpufw_thread.c:546][AICPUFW] [aicpufw_thread_callback 546] sem_post start, chip_id: 0, ts_ind: 0, aicpu index: 11
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.703 [hardware/dev_plat/aicpufw/aicpufw_thread.c:548][AICPUFW] [aicpufw_thread_callback 548] sem_post end, chip_id: 0, ts_ind: 0, aicpu index: 11
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.713 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index11
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.725 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index12
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.734 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index12
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.742 [hardware/dev_plat/aicpufw/aicpufw_thread.c:737][AICPUFW] [aicpufw_thread_create 737] sem_wait start chip_id=0, ts_ind=0, aicpu_index13
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.749 [hardware/dev_plat/aicpufw/aicpufw_thread.c:739][AICPUFW] [aicpufw_thread_create 739] sem_wait end chip_id=0, ts_ind=0, aicpu_index13
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.761 [hardware/dev_plat/aicpufw/aicpufw_timer.c:133][AICPUFW] [aicpufw_timer_init 133] aicpufw_timer_init end, AICPU_TASK_TIMEOUT=30.
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.377.775 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.743120] [devdrv] [devdrv_manager_get_kernel_lib_process 198] <devdrv_load_ser:30223> begin to get kernel lib, device pid: 30223.
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.377.790 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.743123] [devdrv] [devdrv_manager_get_kernel_lib_process 211] <devdrv_load_ser:30223> host_pid: 40927.
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.377.799 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.743131] [devdrv] [devdrv_manager_get_kernel_lib_process 247] <devdrv_load_ser:30223> begin to wait, host pid: 40927, device pid: 30223.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.833 [hardware/dev_plat/aicpufw/aicpufw_supervisor_thread.c:173][AICPUFW] [aicpufw_sup_thread_proc 173] supervisor thread begin, current tid:30239 thread
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.854 [hardware/dev_plat/aicpufw/aicpufw_thread.c:797][AICPUFW] [aicpufw_thread_init 797] chip_id 0 thread init end.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.866 [hardware/dev_plat/aicpufw/aicpufw_api.c:77][AICPUFW] [aicpufw_create_work_tasks 77] drvCreateAicpuWorkTasks exit: chip_id = 0, pid=40927, mode=0.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.377.892 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:233][AICPUFW] [StartTdtServer 233] Start tdt server, deviceId=0.
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.377.932 [tdt/device/../common/src/log.cpp:158]BindCpu init and bindCoreList size is 1,[tdt/device/src/hdc/bind_cpu.cpp:38:Init]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.034 [tdt/device/../common/src/log.cpp:158]BindCpu cpu core num = 64,[tdt/device/src/hdc/bind_cpu.cpp:40:Init]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.051 [tdt/device/../common/src/log.cpp:158]BindCoreList member is 1,[tdt/device/src/hdc/bind_cpu.cpp:42:Init]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.075 [tdt/device/../common/src/log.cpp:158]Begin to Init tdtserver [devicID_=0] and [newInputDeviceId=0],[tdt/device/src/hdc/tdt_server_impl.cpp:664:Init]30223 Msg: running ok
[WARNING] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.110 [tdt/device/../common/src/log.cpp:143]{"Start":"TDT_RECV"},[tdt/device/../common/src/statistic.cpp:113:PeriodStatisticManager]30223 Msg: warnging
[WARNING] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.128 [tdt/device/../common/src/log.cpp:143]{"Start":"DP_ENQUEUE"},[tdt/device/../common/src/statistic.cpp:114:PeriodStatisticManager]30223 Msg: warnging
[WARNING] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.147 [tdt/device/../common/src/log.cpp:143]{"Start":"RECV_ENLARGE"},[tdt/device/../common/src/statistic.cpp:115:PeriodStatisticManager]30223 Msg: warnging
[WARNING] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.167 [tdt/device/../common/src/log.cpp:143]{"Start":"RECV_REDUCE"},[tdt/device/../common/src/statistic.cpp:116:PeriodStatisticManager]30223 Msg: warnging
[WARNING] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.185 [tdt/device/../common/src/log.cpp:143]{"Start":"RELEASE_DATA"},[tdt/device/../common/src/statistic.cpp:117:PeriodStatisticManager]30223 Msg: warnging
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.285 [tdt/device/../common/src/log.cpp:158]HdcServer::Init Start,[tdt/device/../common/src/hdc_server.cpp:104:Init]30223 Msg: running ok
[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:22.378.335 [hardware/dev_plat/../dev_plat/devhdc/hdc_server.c:287][drvHdcServerCreate:287] >>> create server (listen device: 0) success
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.356 [tdt/device/../common/src/log.cpp:158]HdcCommon::InitMsgSize Start,[tdt/device/../common/src/hdc_common.cpp:28:InitMsgSize]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.374 [tdt/device/../common/src/log.cpp:158]msgMaxSize_ = 524224, msgShortHeadDataMaxSize_ = 524212 msgLongHeadDataMaxSize_ = 524200,[tdt/device/../common/src/hdc_common.cpp:42:InitMsgSize]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.410 [tdt/device/../common/src/log.cpp:158]hdcserver in tdtserver is already initialed,[tdt/device/src/hdc/tdt_server_impl.cpp:652:InitDirectly]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.428 [tdt/device/../common/src/log.cpp:158]Begin to init device's hdc channel of tdt.,[tdt/device/src/hdc/tdt_server.cpp:32:TDTServerInit]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.448 [tdt/device/../common/src/log.cpp:158]begin tdt device init, [deviceId=0],[tdt/device/src/hdc/tdt_device_impl.cpp:65:Init]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.466 [tdt/device/../common/src/log.cpp:158]TuningDataTransfer is initialized with deviceID:0,[tdt/device/src/hdc/tuning_data_transfer.cpp:71:Init]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.640 [tdt/device/../common/src/log.cpp:158]HdcCommon::InitMsgSize Start,[tdt/device/../common/src/hdc_common.cpp:28:InitMsgSize]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.657 [tdt/device/../common/src/log.cpp:158]msgMaxSize_ = 524224, msgShortHeadDataMaxSize_ = 524212 msgLongHeadDataMaxSize_ = 524200,[tdt/device/../common/src/hdc_common.cpp:42:InitMsgSize]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.673 [tdt/device/../common/src/log.cpp:158]capacity.maxSegment: 524224,[tdt/device/../common/src/hdc_client.cpp:152:Init]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.690 [tdt/device/../common/src/log.cpp:158]HdcClient::CreateHdcSession Start,[tdt/device/../common/src/hdc_client.cpp:284:CreateHdcSession]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.809 [tdt/device/../common/src/log.cpp:158]HdcServer::Accept thread = 281470521594256,[tdt/device/../common/src/hdc_server.cpp:259:Accept]30242 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.878 [tdt/device/../common/src/log.cpp:158][HdcClient] deviceId: 0 connect session,[tdt/device/../common/src/hdc_client.cpp:306:CreateHdcSession]30223 Msg: running ok
[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:22.378.908 [hardware/dev_plat/../dev_plat/devhdc/hdc_core.c:1609][drvHdcSetSessionReference:1609] >>> session 56, pid 30223
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.926 [tdt/device/../common/src/log.cpp:158][HdcClient] drvHdcSetSessionReference success,[tdt/device/../common/src/hdc_client.cpp:317:CreateHdcSession]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.378.945 [tdt/device/../common/src/log.cpp:158][HdcClient] deviceId: 0 connect session and drvHdcSetSessionReference success, sessionId=1,[tdt/device/../common/src/hdc_client.cpp:327:CreateHdcSession]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.006 [tdt/device/../common/src/log.cpp:158][HdcClient] SendPidMsg sessionId=1, tdt_main_pid=30223,[tdt/device/../common/src/hdc_client.cpp:346:sendPidMsg]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.073 [tdt/device/../common/src/log.cpp:158]TuningDataTransfer tdt client wait for send data begin,[tdt/device/src/hdc/tuning_data_transfer.cpp:173:CreateSingleSession]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.101 [tdt/device/../common/src/log.cpp:158]BindCPUCore Start,[tdt/device/src/hdc/bind_cpu.cpp:58:BindCPUCore]30242 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.213 [tdt/device/../common/src/log.cpp:158]BindCpu thread=281470521594256 CPU_ISSET successfully on processor[1],[tdt/device/src/hdc/bind_cpu.cpp:103:BindCPUCore]30242 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.239 [tdt/device/../common/src/log.cpp:158]Thread[281470521594256] bindCpu setaffinity successfully,[tdt/device/src/hdc/bind_cpu.cpp:109:BindCPUCore]30242 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.282 [tdt/device/../common/src/log.cpp:158]Free Tdt thread ID = 281470529986960,[tdt/device/src/hdc/tdt_server_impl.cpp:555:FreeTdtMemoryThread]30241 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.304 [tdt/device/../common/src/log.cpp:158]BindCPUCore Start,[tdt/device/src/hdc/bind_cpu.cpp:58:BindCPUCore]30241 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.359 [tdt/device/../common/src/log.cpp:158]TimerFunc thread = 281470538379664,[tdt/device/../common/src/statistic.cpp:140:TimerFunc]30240 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.377 [tdt/device/../common/src/log.cpp:158]HdcServer::AcceptConnection Start,[tdt/device/../common/src/hdc_server.cpp:310:AcceptHdcSession]30242 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.403 [tdt/device/../common/src/log.cpp:158]BindCpu thread=281470529986960 CPU_ISSET successfully on processor[1],[tdt/device/src/hdc/bind_cpu.cpp:103:BindCPUCore]30241 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.421 [tdt/device/../common/src/log.cpp:158]BindCPUCore Start,[tdt/device/src/hdc/bind_cpu.cpp:58:BindCPUCore]30240 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.476 [tdt/device/../common/src/log.cpp:158]BindCPUCore Start,[tdt/device/src/hdc/bind_cpu.cpp:58:BindCPUCore]30243 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.496 [tdt/device/../common/src/log.cpp:158]Thread[281470529986960] bindCpu setaffinity successfully,[tdt/device/src/hdc/bind_cpu.cpp:109:BindCPUCore]30241 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.516 [tdt/device/../common/src/log.cpp:158][deviceId=0] Free Tdt thread is success to bind cpu core,[tdt/device/src/hdc/tdt_server_impl.cpp:559:FreeTdtMemoryThread]30241 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.534 [tdt/device/../common/src/log.cpp:158]BindCpu thread=281470538379664 CPU_ISSET successfully on processor[1],[tdt/device/src/hdc/bind_cpu.cpp:103:BindCPUCore]30240 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.550 [tdt/device/../common/src/log.cpp:158]Thread[281470538379664] bindCpu setaffinity successfully,[tdt/device/src/hdc/bind_cpu.cpp:109:BindCPUCore]30240 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.576 [tdt/device/../common/src/log.cpp:158]BindCpu thread=281470513033616 CPU_ISSET successfully on processor[1],[tdt/device/src/hdc/bind_cpu.cpp:103:BindCPUCore]30243 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.591 [tdt/device/../common/src/log.cpp:158]Thread[281470513033616] bindCpu setaffinity successfully,[tdt/device/src/hdc/bind_cpu.cpp:109:BindCPUCore]30243 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.637 [tdt/device/../common/src/log.cpp:158]TuningDataTransfer find channel OK, sessionID: 1,[tdt/device/src/hdc/tuning_data_transfer.cpp:126:SetSendFlagBySessionID]30243 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.664 [tdt/device/../common/src/log.cpp:158]TuningDataTransfer tdt client wait for send data end,[tdt/device/src/hdc/tuning_data_transfer.cpp:178:CreateSingleSession]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.688 [tdt/device/../common/src/log.cpp:158]Begin to start send thread.,[tdt/device/src/hdc/tdt_device_impl.cpp:179:StartSendThread]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.732 [tdt/device/../common/src/log.cpp:158]Success start send thread.,[tdt/device/src/hdc/tdt_device_impl.cpp:181:StartSendThread]30223 Msg: running ok
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.379.746 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:265][AICPUFW] [StartTdtServer 265] TDT server init success.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.379.758 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:135][AICPUFW] [Start 135] Aicpu_scheduler has started, deviceId=0, hostpid=40927.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:22.379.771 [aicpu/aicpu_device/aicpu_schedule/compute_process/main.cc:206][AICPUFW] [main 206] Compute process start success.
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.793 [tdt/device/../common/src/log.cpp:158][TSDPPCCLI] TsdWaitForShutdownProc() deviceId: 0 waitType:1(0:HCCP,1:COMPUTE) is running,[tdt/device/src/tsd/ppc_client.cpp:220:TsdWaitForShutdownProc]30223 Msg: running ok
[EVENT] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.810 [tdt/device/../common/src/log.cpp:149][PPCCLIEVENT] TsdWaitForShutdown!,[tdt/device/src/tsd/ppc_client.cpp:225:TsdWaitForShutdownProc]30223
[EVENT] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.826 [tdt/device/../common/src/log.cpp:149][PPCCLIEVENT] TsdWaitForShutdown Start Rsp device[0] procType:1(0:HCCP,1:COMPUTE), subProcPid[30223] ,[tdt/device/src/tsd/ppc_client.cpp:244:TsdWaitForShutdownProc]30223
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.846 [tdt/device/../common/src/log.cpp:158][TSDPPCCLI] GetPpcSession() will Create session use serverId: 19280,[tdt/device/src/tsd/ppc_client.cpp:110:GetPpcSession]30223 Msg: running ok
[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:22.379.899 [hardware/dev_plat/../dev_plat/devhdc/hdc_ppc.c:129][drvPpcSessionConnect:129] >>> Ppc connect session 26, pid 30223
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.917 [tdt/device/../common/src/log.cpp:158][TSDPPCCLI] [serverId=19280] GetPpcSession() receive thread has been started,[tdt/device/src/tsd/ppc_client.cpp:118:GetPpcSession]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.379.935 [tdt/device/../common/src/log.cpp:158]PpcInterface::SendMsg,size=10, subpid=30223,[tdt/device/src/tsd/ppc_interface.cpp:39:SendMsg]30223 Msg: running ok
[INFO] HDC(8380,tsdaemon):2020-05-12-11:05:22.379.941 [hardware/dev_plat/../dev_plat/devhdc/hdc_ppc.c:268][drvPpcSessionAccept:268] >>> Ppc Accept Session 16, Server fd 12, pid 8380
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.380.006 [tdt/device/../common/src/log.cpp:158]tdt device begin to send to host.,[tdt/device/src/hdc/tdt_device_impl.cpp:275:TdtDeviceSendImpl]30244 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:22.380.025 [tdt/device/../common/src/log.cpp:158]begin to check queueDataSize_.,[tdt/device/src/hdc/tdt_device_impl.cpp:286:TdtDeviceSendImpl]30244 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.091 [tdt/device/../common/src/log.cpp:158]PpcInterface::RecvMsg, size=10, subpid=30223,[tdt/device/src/tsd/ppc_interface.cpp:122:RecvMsg]30245 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.114 [tdt/device/../common/src/log.cpp:158][PpcServer] SetPpcSession, deviceId:0, subProcPid:30223,[tdt/device/src/tsd/ppc_server.cpp:338:SetPpcSession]30245 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.139 [tdt/device/../common/src/log.cpp:158][TSDaemon]PPCSerToTsdMsg deviceId[0], subProcPid[30223]msgType[0](0:START RSP,2:SHUTDOWN,1:SHUTDOWN RSP,3:SOCKET CLOSE), procType[1](0:HCCP,1:COMPUTE) state[6],[tdt/device/src/tsd/tsdaemon.cpp:1441:PPCSerToTsdMsg]30245 Msg: running ok
[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.160 [tdt/device/../common/src/log.cpp:149][TsdEVENT] #### PPCSer->TSD RspMsg device[0] Start Rsp procType[1] ####,[tdt/device/src/tsd/tsdaemon.cpp:1406:PPCSerToTsdProc]30245
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.205 [tdt/device/../common/src/log.cpp:158][TSDPPCSER] ##### [threadName:ppc_srv_recv_0] RecvData [ret=16846848] Save [notifyDeviceId:0] [notifyProcType:1]####,[tdt/device/src/tsd/ppc_server.cpp:242:RecvData]30245 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.256 [tdt/device/../common/src/log.cpp:158][TSDaemon] StartRspProc() [dev=0][subProcPid=30223][procType=1(0:HCCP,1:COMPUTE)][tid=281470572192176]!,[tdt/device/src/tsd/tsdaemon.cpp:1190:StartRspProc]30246 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.276 [tdt/device/../common/src/log.cpp:158][TSDaemon] StartRspProc curState is: 6,[tdt/device/src/tsd/tsdaemon.cpp:1193:StartRspProc]30246 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.295 [tdt/device/../common/src/log.cpp:158][TSDaemon] GetTsdToFmkMsg deviceId[0] subProcPid[30223],[tdt/device/src/tsd/tsdaemon.cpp:959:GetTsdToFmkMsg]30246 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.314 [tdt/device/../common/src/log.cpp:158][TSDaemon] tsdToFmkSessionIdMap size = 1,[tdt/device/src/tsd/tsdaemon.cpp:964:GetTsdToFmkMsg]30246 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.331 [tdt/device/../common/src/log.cpp:158][TSDaemon] tsdToFmkSessionIdMap [deviceId] size = 1,[tdt/device/src/tsd/tsdaemon.cpp:967:GetTsdToFmkMsg]30246 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.349 [tdt/device/../common/src/log.cpp:158][TSDaemon] SendRspMsgToFmk deviceId: 0, subProcPid: 30223, sessionID: 1,[tdt/device/src/tsd/tsdaemon.cpp:1016:SendRspMsgToFmk]30246 Msg: running ok
[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.364 [tdt/device/../common/src/log.cpp:149][TsdEVENT] #### Start Rsp TSD->FMK device[0] sessionID[1] realDeviceId[0]####,[tdt/device/src/tsd/tsdaemon.cpp:1018:SendRspMsgToFmk]30246
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:22.380.402 [tdt/device/../common/src/log.cpp:158][TSDaemon] StartRspProc subRunState is: 3,[tdt/device/src/tsd/tsdaemon.cpp:1173:TsdWaitRspProcForStar]30246 Msg: running ok
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:22.381.752 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48430.745228] [hdcdrv] [hdcdrv_server_create 2330] <aicpu_scheduler:30223> dev_id 0 service_type service_TDT server create
[EVENT] TDT(30223,aicpu_scheduler):2020-05-12-11:05:23.379.610 [tdt/device/../common/src/log.cpp:149],[tdt/common/common_inc/queue_manager.h:683:ShowSizeForEveryChannel]30240
[EVENT] TDT(30223,aicpu_scheduler):2020-05-12-11:05:23.379.656 [tdt/device/../common/src/log.cpp:149]"DeviceSendPool: " "DeviceRecvPool: " "HostRecvPool: " "DeviceCtrlPool: {SendPool: 0, FreePool: 0}, {RecvPool: 0, FreePool: 0}",[tdt/device/../common/src/memory_pool.cpp:707:GetDevicePoolStatus]30240
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:23.379.673 [tdt/device/../common/src/log.cpp:158]DeviceRecNormalData: Device receive normal message number:0,[tdt/device/../common/src/memory_pool.cpp:709:GetDevicePoolStatus]30240 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.322 [tdt/device/../common/src/log.cpp:158]tsdaemon get process sign successfully, procpid:0 signSize:0,[tdt/device/src/tsd/tsdaemon.cpp:901:FmkToTsdMsg]30221 Msg: running ok
[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.360 [tdt/device/../common/src/log.cpp:149][TsdEVENT]FmkToTsdMsg dev[0] msg[7] sessionId[1] realDev[0] fmkSignPid[0] profilingMode[0] rankSize[1],[tdt/device/src/tsd/tsdaemon.cpp:905:FmkToTsdMsg]30221
[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.375 [tdt/device/../common/src/log.cpp:149][TsdEVENT]From FMK Close <<<<<<<<<<< TSDdev[0] sessionId[1] realDev[0] fmkPid[0],[tdt/device/src/tsd/tsdaemon.cpp:861:FmkToTsdMsgProc]30221
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.397 [tdt/device/../common/src/log.cpp:158][TSDaemon] Begin closeSubProcess deviceId:0, sessionId:1, cpProcPid:30223, hccpProcPid:0, subProcState:3,[tdt/device/src/tsd/tsdaemon.cpp:729:CloseSubProcess]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.414 [tdt/device/../common/src/log.cpp:158][TSDaemon] SetTsdToFmkMsg:deviceId[0], sessionId[1], subProcPid[30223],[tdt/device/src/tsd/tsdaemon.cpp:774:SetTsdToFmkMsg]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.429 [tdt/device/../common/src/log.cpp:158][TSDaemon] Process HCCP is abandoned to close, the rank size is 1,[tdt/device/src/tsd/tsdaemon.cpp:747:CloseSubProcess]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.473 [tdt/device/../common/src/log.cpp:158][TSDaemon] start delete file, direct is /home/HwHiAiUser/hdcd/device0/,[tdt/device/src/tsd/tsdaemon.cpp:1878:DeleteFileByPath]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.509 [tdt/device/../common/src/log.cpp:158][TSDaemon] start scan file, ent name is .,[tdt/device/src/tsd/tsdaemon.cpp:1887:DeleteFileByPath]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.524 [tdt/device/../common/src/log.cpp:158][TSDaemon] start scan file, ent name is ..,[tdt/device/src/tsd/tsdaemon.cpp:1887:DeleteFileByPath]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.541 [tdt/device/../common/src/log.cpp:158][TSDaemon] start scan file, ent name is etc,[tdt/device/src/tsd/tsdaemon.cpp:1887:DeleteFileByPath]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.560 [tdt/device/../common/src/log.cpp:158][TSDaemon] start scan file, ent name is upgrade,[tdt/device/src/tsd/tsdaemon.cpp:1887:DeleteFileByPath]30221 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.624 [tdt/device/../common/src/log.cpp:158][PpcServer] GetPpcSession, deviceId:0, subProcPid:30223,[tdt/device/src/tsd/ppc_server.cpp:324:GetPpcSession]30255 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.648 [tdt/device/../common/src/log.cpp:158]PpcInterface::SendMsg,size=12, subpid=30223,[tdt/device/src/tsd/ppc_interface.cpp:39:SendMsg]30255 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.686 [tdt/device/../common/src/log.cpp:158][TSDaemon] Begin ExecuteClose deviceId:0, subProcPid:30223,[tdt/device/src/tsd/tsdaemon.cpp:703:ExecuteClose]30255 Msg: running ok
[OPLOG] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.704 [tdt/device/../common/src/log.cpp:151][tdt/device/src/tsd/tsdaemon.cpp:705:ExecuteClose]30255 free resource {devOS:[30223]} for {dev:0}
[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.731 [tdt/device/../common/src/log.cpp:149][TsdEVENT] #### Send TSD->SubProcess[PPCSer] Close Msg Device[0] proType[1] [tid=281470597370288]####,[tdt/device/src/tsd/tsdaemon.cpp:709:ExecuteClose]30255
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.063.728 [tdt/device/../common/src/log.cpp:158]PpcInterface::RecvMsg, size=12, subpid=30223,[tdt/device/src/tsd/ppc_interface.cpp:122:RecvMsg]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.063.755 [tdt/device/../common/src/log.cpp:158]PpcClient::RecvMsgProc, size=12, subpid=30223,[tdt/device/src/tsd/ppc_client.cpp:150:RecvMsgProc]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.063.773 [tdt/device/../common/src/log.cpp:158]PpcClient::RecvMsgProc, subpid1=30223, subpid2=30223,[tdt/device/src/tsd/ppc_client.cpp:160:RecvMsgProc]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.063.789 [tdt/device/../common/src/log.cpp:158]PpcInterface::SendMsg,size=12, subpid=30223,[tdt/device/src/tsd/ppc_interface.cpp:39:SendMsg]30223 Msg: running ok
[EVENT] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.063.816 [tdt/device/../common/src/log.cpp:149][PPCCLIEVENT] #### PPCClient->TSD Close Rsp send OK device[0] procType:1 ####,[tdt/device/src/tsd/ppc_client.cpp:166:RecvMsgProc]30223
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.063.832 [tdt/device/../common/src/log.cpp:158][PPCClient]Process Exit DevId:0 procType:1(0:HCCP,1:COMPUTE),[tdt/device/src/tsd/ppc_client.cpp:200:RecvData]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.063.848 [tdt/device/../common/src/log.cpp:158]TsdWaitForShutdown exit,[tdt/device/src/tsd/ppc_client.cpp:274:TsdWaitForShutdown]30223 Msg: running ok
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:24.063.861 [aicpu/aicpu_device/aicpu_schedule/compute_process/main.cc:214][AICPUFW] [main 214] Tsd wait for shut down success.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:24.063.872 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:272][AICPUFW] [StopTdtServer 272] Stop tdt server, deviceId=0.
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.063.987 [tdt/device/../common/src/log.cpp:158]PpcInterface::RecvMsg, size=12, subpid=30223,[tdt/device/src/tsd/ppc_interface.cpp:122:RecvMsg]30245 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.025 [tdt/device/../common/src/log.cpp:158][PpcServer] SetPpcSession, deviceId:0, subProcPid:30223,[tdt/device/src/tsd/ppc_server.cpp:338:SetPpcSession]30245 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.057 [tdt/device/../common/src/log.cpp:158][TSDaemon]PPCSerToTsdMsg deviceId[0], subProcPid[30223]msgType[1](0:START RSP,2:SHUTDOWN,1:SHUTDOWN RSP,3:SOCKET CLOSE), procType[1](0:HCCP,1:COMPUTE) state[6],[tdt/device/src/tsd/tsdaemon.cpp:1441:PPCSerToTsdMsg]30245 Msg: running ok
[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.078 [tdt/device/../common/src/log.cpp:149][TsdEVENT] #### PPCSer->TSD RspMsg device[0] Close Rsp procType[1] ####,[tdt/device/src/tsd/tsdaemon.cpp:1415:PPCSerToTsdProc]30245
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.195 [tdt/device/../common/src/log.cpp:158][TSDPPCSER] ##### [threadName:ppc_srv_recv_0] RecvData [ret=16846848] Save [notifyDeviceId:0] [notifyProcType:1]####,[tdt/device/src/tsd/ppc_server.cpp:242:RecvData]30245 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.064.227 [tdt/device/../common/src/log.cpp:158]tdtserver is destroying, [devicID_=0],[tdt/device/src/hdc/tdt_server_impl.cpp:709:Destroy]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.064.248 [tdt/device/../common/src/log.cpp:158]Stop QueueManager success,[tdt/device/src/hdc/tdt_server_impl.cpp:716:Destroy]30223 Msg: running ok
[INFO] DP(30223,aicpu_scheduler):2020-05-12-11:05:24.064.263 [datapreprocess/src/dp_interface.cc:288][DP_PREPROCESS] [I] [datapreprocess/src/dp_interface.cc:288] Release blocked TDT threads.
[INFO] DP(30223,aicpu_scheduler):2020-05-12-11:05:24.064.280 [datapreprocess/src/dp_interface.cc:406][DP_PREPROCESS] [I] [datapreprocess/src/dp_interface.cc:406] Begin write queue blocking eventfd of source().
[INFO] DP(30223,aicpu_scheduler):2020-05-12-11:05:24.064.294 [datapreprocess/src/dp_interface.cc:409][DP_PREPROCESS] [I] [datapreprocess/src/dp_interface.cc:409] Got empty source name, start write all blocking eventfd.
[INFO] DP(30223,aicpu_scheduler):2020-05-12-11:05:24.064.305 [datapreprocess/src/dp_interface.cc:291][DP_PREPROCESS] [I] [datapreprocess/src/dp_interface.cc:291] All TDT threads mark info memory have released.
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.064.320 [tdt/device/../common/src/log.cpp:158]DataPreprocess exited,[tdt/device/src/hdc/tdt_server_impl.cpp:720:Destroy]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.064.338 [tdt/device/../common/src/log.cpp:158]Enqueue Thread exited,[tdt/device/src/hdc/tdt_server_impl.cpp:722:Destroy]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.064.368 [tdt/device/../common/src/log.cpp:158][deviceId=0] Free Tdt thread is exist,total free number = 0,enqueue number is = 0,[tdt/device/src/hdc/tdt_server_impl.cpp:570:FreeTdtMemoryThread]30241 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.064.412 [tdt/device/../common/src/log.cpp:158]Free Thread exited,[tdt/device/src/hdc/tdt_server_impl.cpp:725:Destroy]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.064.428 [tdt/device/../common/src/log.cpp:158]enter HdcServer::Destroy() function,[tdt/device/../common/src/hdc_server.cpp:582:Destroy]30223 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.462 [tdt/device/../common/src/log.cpp:158][TSDaemon] CloseRspProc() [dev=0][subPid=30223][procType=1(0:HCCP,1:COMPUTE)][tid=281470588977584]!,[tdt/device/src/tsd/tsdaemon.cpp:1267:CloseRspProc]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.485 [tdt/device/../common/src/log.cpp:158][TSDaemon] CloseRspProc curState is: 6,[tdt/device/src/tsd/tsdaemon.cpp:1271:CloseRspProc]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.508 [tdt/device/../common/src/log.cpp:158][TSDaemon] GetTsdToFmkMsg deviceId[0] subProcPid[30223],[tdt/device/src/tsd/tsdaemon.cpp:959:GetTsdToFmkMsg]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.527 [tdt/device/../common/src/log.cpp:158][TSDaemon] tsdToFmkSessionIdMap size = 1,[tdt/device/src/tsd/tsdaemon.cpp:964:GetTsdToFmkMsg]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.544 [tdt/device/../common/src/log.cpp:158][TSDaemon] tsdToFmkSessionIdMap [deviceId] size = 1,[tdt/device/src/tsd/tsdaemon.cpp:967:GetTsdToFmkMsg]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.568 [tdt/device/../common/src/log.cpp:158][TSDaemon] subProcessPid is 30223 on device[0], procType[1](0:HCCP PROC,1:COMPUTE PROC),[tdt/device/src/tsd/tsdaemon.cpp:1085:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.064.588 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:24.074.535 [hardware/dev_plat/../dev_plat/devhdc/hdc_server.c:367][drvHdcServerDestroy:367] >>> destroy server success, deviceId 0, serviceType 10
[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:24.074.556 [hardware/dev_plat/../dev_plat/devhdc/hdc_server.c:236][drvHdcPcieSessionAccept:236] >>> device:0 server 10 is destroyed, ret:18
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.575 [tdt/device/../common/src/log.cpp:158]drv accept exception, because acceptSwitch has been set false, ret=18,[tdt/device/../common/src/hdc_server.cpp:324:AcceptHdcSession]30242 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.592 [tdt/device/../common/src/log.cpp:158]acceptSwitch has been set false,[tdt/device/../common/src/hdc_server.cpp:269:Accept]30242 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.634 [tdt/device/../common/src/log.cpp:158][HdcServer] SessionIdPidMsg enter into ClearSessionIdPid,[tdt/device/../common/src/hdc_server.cpp:495:ClearSessionIdPid]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.658 [tdt/device/../common/src/log.cpp:158]Begin StopMemoryPool,[tdt/device/../common/src/memory_pool.cpp:3389:StopMemoryPool]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.685 [tdt/device/../common/src/log.cpp:158]DestroyMemoryPool, memoryEnd = 2,[tdt/device/../common/src/memory_pool.cpp:1323:DestroyMemoryPool]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.703 [tdt/device/../common/src/log.cpp:158]FreeMemoryByMap, memoryEnd = 2, memoryType = 1, devId_ = 0,[tdt/device/../common/src/memory_pool.cpp:1131:FreeMemoryByMap]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.721 [tdt/device/../common/src/log.cpp:158]FreeMemoryByMap, memoryEnd = 2, memoryType = 1, devId_ = 0,[tdt/device/../common/src/memory_pool.cpp:1131:FreeMemoryByMap]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.743 [tdt/device/../common/src/log.cpp:158]hdcServer_ destroyed,[tdt/device/src/hdc/tdt_server_impl.cpp:735:Destroy]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.759 [tdt/device/../common/src/log.cpp:158]Begin to destroy device's hdc channel of tdt.,[tdt/device/src/hdc/tdt_server.cpp:48:TDTServerStop]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.774 [tdt/device/../common/src/log.cpp:158]begin to destroy.,[tdt/device/src/hdc/tdt_device_impl.cpp:367:Destroy]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.793 [tdt/device/../common/src/log.cpp:158]Stop QueueManager success,[tdt/device/src/hdc/tdt_device_impl.cpp:381:Destroy]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.808 [tdt/device/../common/src/log.cpp:158]Begin to stop send thread.,[tdt/device/src/hdc/tdt_device_impl.cpp:192:StopSendThread]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.866 [tdt/device/../common/src/log.cpp:158]Success stop send thread.,[tdt/device/src/hdc/tdt_device_impl.cpp:196:StopSendThread]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.885 [tdt/device/../common/src/log.cpp:158]TuningDataTransfer destory hdc client,[tdt/device/src/hdc/tuning_data_transfer.cpp:456:Destroy]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.900 [tdt/device/../common/src/log.cpp:158]enter HdcClient::Destroy() function,[tdt/device/../common/src/hdc_client.cpp:468:Destroy]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.917 [tdt/device/../common/src/log.cpp:158]begin drvHdcSessionClose,[tdt/device/../common/src/hdc_client.cpp:415:ClearAllSession]30223 Msg: running ok
[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:24.074.930 [hardware/dev_plat/../dev_plat/devhdc/hdc_client.c:413][drvHdcClientSessionClose:413] >>> destroy client session(sock: 56)
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.960 [tdt/device/../common/src/log.cpp:158]end drvHdcSessionClose,[tdt/device/../common/src/hdc_client.cpp:420:ClearAllSession]30223 Msg: running ok
[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:24.074.976 [hardware/dev_plat/../dev_plat/devhdc/hdc_core.c:1300][drvHdcRecvMsgLen:1300] >>> the session 56 local or remote was closed
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.074.996 [tdt/device/../common/src/log.cpp:158]begin HdcClient::JoinAllRecvThread,[tdt/device/../common/src/hdc_client.cpp:383:JoinAllRecvThread]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.016 [tdt/device/../common/src/log.cpp:158]drvHdcRecv() return 25,[tdt/device/../common/src/hdc_common.cpp:494:RecvHdcDefaultMsg]30243 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.075.018 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.038 [tdt/device/../common/src/log.cpp:158]Receive() return 17903651, which means : hdc service or client socket closed,[tdt/device/../common/src/hdc_common.cpp:438:RecvMsg]30243 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.057 [tdt/device/../common/src/log.cpp:158][deviceId=0][sessionId=1] recv runswitch has been set false,[tdt/device/../common/src/hdc_client.cpp:175:RecvData]30243 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.072 [tdt/device/../common/src/log.cpp:158][deviceId=0] the recv data pthread exit,[tdt/device/../common/src/hdc_client.cpp:190:RecvData]30243 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.138 [tdt/device/../common/src/log.cpp:158]end HdcClient::JoinAllRecvThread,[tdt/device/../common/src/hdc_client.cpp:392:JoinAllRecvThread]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.156 [tdt/device/../common/src/log.cpp:158]begin drvHdcClientDestroy,[tdt/device/../common/src/hdc_client.cpp:450:DestroyClient]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.236 [tdt/device/../common/src/log.cpp:158]end drvHdcClientDestroy,[tdt/device/../common/src/hdc_client.cpp:455:DestroyClient]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.252 [tdt/device/../common/src/log.cpp:158]begin HdcClient::ClearClientPtr,[tdt/device/../common/src/hdc_client.cpp:432:ClearClientPtr]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.269 [tdt/device/../common/src/log.cpp:158]end HdcClient::ClearClientPtr,[tdt/device/../common/src/hdc_client.cpp:440:ClearClientPtr]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.284 [tdt/device/../common/src/log.cpp:158]begin HdcClient::ClearAll,[tdt/device/../common/src/hdc_client.cpp:401:ClearAll]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.299 [tdt/device/../common/src/log.cpp:158]end HdcClient::ClearAll,[tdt/device/../common/src/hdc_client.cpp:404:ClearAll]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.314 [tdt/device/../common/src/log.cpp:158]end HdcClient::Destroy() function,[tdt/device/../common/src/hdc_client.cpp:476:Destroy]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.330 [tdt/device/../common/src/log.cpp:158]success destroy.,[tdt/device/src/hdc/tdt_device_impl.cpp:388:Destroy]30223 Msg: running ok
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:24.075.343 [aicpu/aicpu_device/aicpu_schedule/compute_process/compute_process.cc:277][AICPUFW] [StopTdtServer 277] TDT server stop success.
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:24.075.355 [hardware/dev_plat/aicpufw/aicpufw_dev.c:94][AICPUFW] [aicpufw_dev_close 94] chip_id:0 will be closed, fd=18.
[TRACE] DP(30223,aicpu_scheduler):2020-05-12-11:05:24.075.373 [status:STOP] [datapreprocess/src/task_queue.cc:292]DP_PREPROCESS module has been closed
[INFO] DRV(30223,aicpu_scheduler):2020-05-12-11:05:24.075.389 [aicpu/aicpu_device/aicpu_schedule/compute_process/main.cc:219][AICPUFW] [main 219] Compute process stopped.
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.408 [tdt/device/../common/src/log.cpp:158]PpcClient::~PpcClient() destructor function called,[tdt/device/src/tsd/ppc_client.cpp:23:~PpcClient]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.422 [tdt/device/../common/src/log.cpp:158][TSDPPCCLI] Destroy() enter,[tdt/device/src/tsd/ppc_client.cpp:44:Destroy]30223 Msg: running ok
[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:24.075.437 [hardware/dev_plat/../dev_plat/devhdc/hdc_ppc.c:151][drvPpcSessionDestroy:151] >>> Ppc Destroy session 26, pid 30223
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.473 [tdt/device/../common/src/log.cpp:158][TSDPPCCLI] Destroy() call drvPpcSessionDestroy func return [drvRet:0],[tdt/device/src/tsd/ppc_client.cpp:54:Destroy]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.490 [tdt/device/../common/src/log.cpp:158][TSDPPCCLI] Destroy() exit,[tdt/device/src/tsd/ppc_client.cpp:73:Destroy]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.508 [tdt/device/../common/src/log.cpp:158]TdtDeviceImpl::~TdtDeviceImpl() destructor function called.,[tdt/device/src/hdc/tdt_device_impl.cpp:54:~TdtDeviceImpl]30223 Msg: running ok
[INFO] TDT(30223,aicpu_scheduler):2020-05-12-11:05:24.075.532 [tdt/device/../common/src/log.cpp:158]TdtServerImpl::~TdtServerImpl() destructor function called,[tdt/device/src/hdc/tdt_server_impl.cpp:71:~TdtServerImpl]30223 Msg: running ok
[INFO] HDC(30223,aicpu_scheduler):2020-05-12-11:05:24.076.267 [hardware/dev_plat/../dev_plat/devhdc/hdc_core.c:2002][drv_hdc_exit:2002] >>> HDC uninit success
[INFO] HDC(8380,tsdaemon):2020-05-12-11:05:24.077.353 [hardware/dev_plat/../dev_plat/devhdc/hdc_core.c:626][hdcSocketRecvPeek:626] >>> client connection closed: Success(errno: 0)(sock: 16)
[WARNING] TDT(8380,tsdaemon):2020-05-12-11:05:24.077.393 [tdt/device/../common/src/log.cpp:143][TSDPPCIF] drvPpcRecv fail 25,[tdt/device/src/tsd/ppc_interface.cpp:92:RecvMsg]30245 Msg: warnging
[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:24.077.412 [tdt/device/../common/src/log.cpp:149][TSDPPCSER] 0 SOCKET_CLOSED notify dev[0] procType[1] ret=[17379389]?[17379389]TSD to clean,[tdt/device/src/tsd/ppc_server.cpp:199:RecvData]30245
[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:24.077.429 [tdt/device/../common/src/log.cpp:149][TSDPPCSER] 1 SOCKET_CLOSED notify dev[0] procType[1] ret[17379389] TSD to clean,[tdt/device/src/tsd/ppc_server.cpp:204:RecvData]30245
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.077.804 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.441417] [hdcdrv] [hdcdrv_server_destroy 2415] <aicpu_scheduler:30223> dev_id 0 service_type service_TDT server destroy
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.077.829 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.441427] [hdcdrv] [hdcdrv_accept 2470] <Accept:30223> dev_id 0 service_type service_TDT accept wait dev 0 quit, dev status 1, listen status 0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.077.841 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.441858] [hdcdrv] [hdcdrv_recv_peek 2899] <RecvData:30223> dev 0 session 56 local or remote close, local_close_state closed_by_user, remote_close_state closed_by_user,local_session_fd 56, remote_session_fd 227.
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.077.850 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.444353] [devdrv] [devdrv_manager_get_kernel_lib_process 251] <devdrv_load_ser:30223> wait_event_interruptible return: -512.
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.077.860 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.444495] [devmm] [devmm_notifier_release_private 1225] <aicpufw_sup:30223,30239> device wait ts exit hostpid(40927) exit(0).
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.085.102 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.095.180 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.105.257 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.115.334 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.125.408 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.133.771 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.496591] [devmm] [devmm_notifier_release_private 1231] <aicpufw_sup:30223,30239> ts exited,device hostpid(40927) begin recover resource.
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.137.750 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.147.847 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.157.927 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.168.004 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.178.083 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.188.161 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.198.238 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.208.316 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.218.391 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.228.469 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.238.544 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.248.619 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.258.697 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.268.776 [tdt/device/../common/src/log.cpp:158]The current kill result is [0],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] DRV(8314,aicpufw_monitor):2020-05-12-11:05:24.275.875 [hardware/dev_plat/aicpufw/aicpufw_thread.c:931][AICPUFW] [aicpufw_monitor_recycle_so 931] recycle so pid_dir=/home/HwHiAiUser/tmp/30223/.
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.856 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642610] [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>0 :rev_int_cnt 0 rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.878 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642614] [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>1 :rev_int_cnt 0 rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.889 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642616] [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>2 :rev_int_cnt 0 rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.899 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642618] [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>3 :rev_int_cnt 0 rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.915 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642620] [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>4 :rev_int_cnt 0 rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.924 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642622] [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>5 :rev_int_cnt 0 rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.933 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642624] [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>6 :rev_int_cnt 0 rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.942 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642627] [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>7 :rev_int_cnt 0 rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.950 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642628] [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>8 :rev_int_cnt 0 rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.959 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642630] [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>9 :rev_int_cnt 0 rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.967 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642632] [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>10 :rev_int_cnt 0 rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.975 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642634] [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>11 :rev_int_cnt 0 rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.984 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642636] [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>12 :rev_int_cnt 0 rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.277.994 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642638] [aicpufw-drv] [aicpufw_drv_release_debug 650] <aicpufw_sup:30223>13 :rev_int_cnt 0 rev_int_ok 0, rev_int_invalid 0,send_to_int_cnt 0 wait_satisfied 0 wait_event_time 0 rev_int_pending 0. dev_id:0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.278.003 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642641] [aicpufw-drv] [aicpufw_drv_delete_context 568] <aicpufw_sup:30223>delete match-pid(40927). dev_id:0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.278.011 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642642] [aicpufw-drv] [aicpufw_drv_release 690] <aicpufw_sup:30223>processes(1),process pid(30223) released.current tgid: 30223 numa node:0. dev_id:0
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.278.019 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.642657] [aicpufw-drv] [aicpufw_drv_get_moniter_info 1583] <aicpu_m_ioctl:8314>aicpufw event happened. dev_id:0
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.278.860 [tdt/device/../common/src/log.cpp:158]The current kill result is [-1],[tdt/device/src/tsd/tsdaemon.cpp:1090:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.278.880 [tdt/device/../common/src/log.cpp:158][TSDaemon] Computer Process stop success.,[tdt/device/src/tsd/tsdaemon.cpp:1096:CheckSubProcessExitByType]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.278.892 [tdt/device/../common/src/log.cpp:158][TSDaemon] SubProcesses exit success on device[0], tryTimes is 21,[tdt/device/src/tsd/tsdaemon.cpp:1098:CheckSubProcessExitByType]30256 Msg: running ok
[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:24.278.908 [tdt/device/../common/src/log.cpp:149][TsdEVENT] #### Close Rsp TSD->FMK device[0] sessionID[1] realDeviceId[0]####,[tdt/device/src/tsd/tsdaemon.cpp:1023:SendRspMsgToFmk]30256
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.278.950 [tdt/device/../common/src/log.cpp:158][TSDaemon] CloseRspProc subRunState is: 0,[tdt/device/src/tsd/tsdaemon.cpp:1248:TsdWaitRspProcForClose]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.278.966 [tdt/device/../common/src/log.cpp:158][TSDaemon] EraseTsdToFmkMsg:deviceId[0], subProcPid[30223],[tdt/device/src/tsd/tsdaemon.cpp:789:EraseTsdToFmkMsg]30256 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.279.027 [tdt/device/../common/src/log.cpp:158][TSDaemon]###### PPCSerToTsdAbnormalMsg deviceId[0] msgType[3](0:START RSP,1:SHUTDOWN,2:SHUTDOWN RSP,3:SOCKET CLOSE), procType[1](0:HCCP,1:COMPUTE), state[0],[tdt/device/src/tsd/tsdaemon.cpp:1471:PPCSerToTsdAbnormalMsg]30245 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.279.057 [tdt/device/../common/src/log.cpp:158][TSDaemon] Begin to ClearPPCThreadCleanFlag,[tdt/device/src/tsd/tsdaemon.cpp:1473:PPCSerToTsdAbnormalMsg]30245 Msg: running ok
[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:24.279.081 [tdt/device/../common/src/log.cpp:149][TSDPPCSER] 2 SOCKET_CLOSED notify dev[0] procType[1] ret[17379389] TSD to clean,[tdt/device/src/tsd/ppc_server.cpp:211:RecvData]30245
[INFO] HDC(8380,tsdaemon):2020-05-12-11:05:24.279.117 [hardware/dev_plat/../dev_plat/devhdc/hdc_ppc.c:304][drvPpcSessionClose:304] >>> Ppc Close session fd 16, pid 8380 session 0xfffee0000d30
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.279.139 [tdt/device/../common/src/log.cpp:158][TSDPPCSER] CloseSomeSession enter [sessionSize:1],[tdt/device/src/tsd/ppc_server.cpp:80:RemoveFromPpcSessionList]30245 Msg: running ok
[INFO] TDT(8380,tsdaemon):2020-05-12-11:05:24.279.157 [tdt/device/../common/src/log.cpp:158][TSDPPCSER] CloseSomeSession exit [sessionSize:0],[tdt/device/src/tsd/ppc_server.cpp:85:RemoveFromPpcSessionList]30245 Msg: running ok
[EVENT] TDT(8380,tsdaemon):2020-05-12-11:05:24.279.190 [tdt/device/../common/src/log.cpp:149][TSDPPCSER] [threadName:ppc_srv_recv_0] RecvData [ret=17379389] [tid=281470580584880]thread exit,[tdt/device/src/tsd/ppc_server.cpp:218:RecvData]30245
[INFO] KERNEL(8189,sklogd):2020-05-12-11:05:24.609.779 [toolchain/log/slog/sklog/device/../src/klogd.c:249][48432.973818] [devmm] [devmm_chan_close_device_h2d 1124] <kworker/0:0:55678,55678> device process exited, hostpid=40927, devpid=30223, devid=0.
@@ -0,0 +1,207 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Convenience functions for managing dataset file buffers."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import atexit
import multiprocessing
import multiprocessing.dummy
import os
import tempfile
import uuid
import numpy as np
import six
import tensorflow as tf
class _GarbageCollector(object):
  """Deletes temporary buffer files at exit.

  Certain tasks (such as NCF Recommendation) require writing buffers to
  temporary files. (Which may be local or distributed.) It is not generally safe
  to delete these files during operation, but they should be cleaned up. This
  class keeps track of temporary files created, and deletes them at exit.
  """

  def __init__(self):
    # Paths of every buffer file that purge() should try to delete.
    self.temp_buffers = []

  def register(self, filepath):
    """Records `filepath` so that a later `purge` call tries to delete it."""
    self.temp_buffers.append(filepath)

  def purge(self):
    """Removes every registered buffer file that still exists.

    Cleanup is best effort: a failure on one file is logged and the remaining
    files are still processed, so a single bad path cannot leave all the other
    buffers behind. Errors raised by the file operations are not propagated.
    """
    for path in self.temp_buffers:
      # The try lives inside the loop on purpose: one failing removal must not
      # abort the cleanup of the paths that follow it.
      try:
        if tf.io.gfile.exists(path):
          tf.io.gfile.remove(path)
          tf.compat.v1.logging.info("Buffer file {} removed".format(path))
      except Exception as e:  # pylint: disable=broad-except
        tf.compat.v1.logging.error(
            "Failed to cleanup buffer files: {}".format(e))
# Module-level collector that write_to_temp_buffer registers buffer paths with.
# Its purge method is hooked into atexit so it runs when the interpreter exits.
_GARBAGE_COLLECTOR = _GarbageCollector()
atexit.register(_GARBAGE_COLLECTOR.purge)
# rows_per_core value that write_to_buffer passes to iter_shard_dataframe.
_ROWS_PER_CORE = 50000
def write_to_temp_buffer(dataframe, buffer_folder, columns):
  """Serializes a dataframe into a newly named temporary buffer file.

  The chosen path is registered with the module-level garbage collector before
  the buffer is written.

  Args:
    dataframe: The pandas dataframe to be serialized.
    buffer_folder: Directory in which the buffer is created (it is created if
      needed). If None, a file made by tempfile.mkstemp is used instead.
    columns: The dataframe columns to be serialized.

  Returns:
    The path of the buffer, as returned by write_to_buffer.
  """
  if buffer_folder is None:
    # mkstemp hands back an already-open OS-level file descriptor along with
    # the path. Only the path is needed, so close the descriptor immediately
    # instead of leaking one per call.
    fd, buffer_path = tempfile.mkstemp()
    os.close(fd)
  else:
    tf.io.gfile.makedirs(buffer_folder)
    buffer_path = os.path.join(buffer_folder, str(uuid.uuid4()))
  _GARBAGE_COLLECTOR.register(buffer_path)

  return write_to_buffer(dataframe, buffer_path, columns)
def iter_shard_dataframe(df, rows_per_core=1000):
  """Two way shard of a dataframe.

  This function evenly shards a dataframe so that it can be mapped efficiently.
  It yields a list of dataframes with length equal to the number of CPU cores
  (capped at the number of rows), with each dataframe having rows_per_core
  rows. (Except for the last batch which may have fewer rows in the
  dataframes.) Passing vectorized inputs to a pool is more efficient than
  iterating through a dataframe in serial and passing a list of inputs to the
  pool.

  Args:
    df: Pandas dataframe to be sharded.
    rows_per_core: Number of rows in each shard.

  Yields:
    A list of dataframe shards, one list per batch. Nothing is yielded when
    `df` is empty.
  """
  n = len(df)
  if n == 0:
    # With no rows the core count below would be zero and the block count
    # computation would divide by zero; there is simply nothing to shard.
    return

  num_cores = min(multiprocessing.cpu_count(), n)
  num_blocks = int(np.ceil(n / num_cores / rows_per_core))
  max_batch_size = num_cores * rows_per_core

  for i in range(num_blocks):
    min_index = i * max_batch_size
    max_index = min((i + 1) * max_batch_size, n)
    df_shard = df[min_index:max_index]
    n_shard = len(df_shard)
    # num_cores + 1 evenly spaced cut points split this batch into num_cores
    # contiguous pieces (some may be empty in the final, shorter batch).
    boundaries = np.linspace(0, n_shard, num_cores + 1, dtype=np.int64)
    yield [df_shard[boundaries[j]:boundaries[j + 1]] for j in range(num_cores)]
def _shard_dict_to_examples(shard_dict):
  """Serializes each row of a dict of arrays as a tf.train.Example.

  Args:
    shard_dict: Dict mapping a column name to an array. The row count is taken
      from the first array's leading dimension. One-dimensional arrays are
      treated as a single value per row.

  Returns:
    A list with one serialized Example per row.

  Raises:
    ValueError: If an array's dtype kind is neither "i" nor "f".
  """
  num_rows = list(shard_dict.values())[0].shape[0]
  per_row_features = [{} for _ in range(num_rows)]

  for name, column in shard_dict.items():
    if len(column.shape) == 1:
      # Give every row a length-one value list so both layouts index the same.
      column = np.reshape(column, column.shape + (1,))

    kind = column.dtype.kind
    if kind == "i":
      def to_feature(row):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=row))
    elif kind == "f":
      def to_feature(row):
        return tf.train.Feature(float_list=tf.train.FloatList(value=row))
    else:
      raise ValueError("Invalid dtype")

    for row_index in range(num_rows):
      per_row_features[row_index][name] = to_feature(column[row_index])

  protos = []
  for row_features in per_row_features:
    protos.append(
        tf.train.Example(features=tf.train.Features(feature=row_features)))

  return [proto.SerializeToString() for proto in protos]
def _serialize_shards(df_shards, columns, pool, writer):
  """Converts dataframe shards to serialized examples and writes them out.

  Args:
    df_shards: A list of pandas dataframes. (Should be of similar size)
    columns: The dataframe columns to be serialized.
    pool: A pool used to serialize the shards in parallel.
    writer: A TFRecordWriter that receives the serialized examples.
  """
  # Pandas does not store columns of arrays as nd arrays; stacking each column
  # produces a proper ndarray per column.
  map_inputs = []
  for shard in df_shards:
    map_inputs.append(
        {name: np.stack(shard[name].values, axis=0) for name in columns})

  # Errors raised inside a pool are hard to diagnose, so every input is
  # validated here in the main process before any work is dispatched.
  for arrays_by_column in map_inputs:
    # All columns of one shard must agree on the number of rows.
    row_counts = {arr.shape[0] for arr in arrays_by_column.values()}
    assert len(row_counts) == 1
    for arr in arrays_by_column.values():
      assert hasattr(arr, "dtype")
      assert hasattr(arr.dtype, "kind")
      assert arr.dtype.kind in ("i", "f")
      assert len(arr.shape) in (1, 2)

  serialized_shards = pool.map(_shard_dict_to_examples, map_inputs)
  for serialized_shard in serialized_shards:
    for record in serialized_shard:
      writer.write(record)
def write_to_buffer(dataframe, buffer_path, columns, expected_size=None):
  """Serializes a dataframe to a TFRecord file, reusing a matching buffer.

  If a non-empty file already exists at `buffer_path` and its size equals
  `expected_size`, nothing is written. Otherwise the existing file is removed
  and the buffer is rebuilt from `dataframe`.

  Args:
    dataframe: The pandas dataframe to be serialized.
    buffer_path: The path where the serialized results will be written.
    columns: The dataframe columns to be serialized.
    expected_size: The size in bytes of the serialized results. This is used to
      lazily construct the buffer.

  Returns:
    The path of the buffer.

  Raises:
    ValueError: If `dataframe` is None and no reusable buffer was found.
  """
  buffer_is_populated = (tf.io.gfile.exists(buffer_path) and
                         tf.io.gfile.stat(buffer_path).length > 0)
  if buffer_is_populated:
    actual_size = tf.io.gfile.stat(buffer_path).length
    if expected_size == actual_size:
      return buffer_path
    # Size mismatch: discard the stale file and fall through to a rebuild.
    tf.compat.v1.logging.warning(
        "Existing buffer {} has size {}. Expected size {}. Deleting and "
        "rebuilding buffer.".format(buffer_path, actual_size, expected_size))
    tf.io.gfile.remove(buffer_path)

  if dataframe is None:
    raise ValueError(
        "dataframe was None but a valid existing buffer was not found.")

  parent_dir = os.path.split(buffer_path)[0]
  tf.io.gfile.makedirs(parent_dir)

  tf.compat.v1.logging.info(
      "Constructing TFRecordDataset buffer: {}".format(buffer_path))

  rows_written = 0
  pool = multiprocessing.dummy.Pool(multiprocessing.cpu_count())
  try:
    with tf.io.TFRecordWriter(buffer_path) as writer:
      shard_batches = iter_shard_dataframe(df=dataframe,
                                           rows_per_core=_ROWS_PER_CORE)
      for df_shards in shard_batches:
        _serialize_shards(df_shards, columns, pool, writer)
        rows_written += sum([len(shard) for shard in df_shards])
        tf.compat.v1.logging.info(
            "{}/{} examples written.".format(
                str(rows_written).ljust(8), len(dataframe)))
  finally:
    # Always release the worker pool, including on failure.
    pool.terminate()

  tf.compat.v1.logging.info("Buffer write complete.")
  return buffer_path
@@ -0,0 +1,199 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for binary data file utilities."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import contextlib
import multiprocessing
# pylint: disable=wrong-import-order
import numpy as np
import pandas as pd
import tensorflow as tf
# pylint: enable=wrong-import-order
from official.r1.utils.data import file_io
from official.utils.misc import keras_utils
# Column holding each row's original index. It is used after deserialization
# to match a record back to its source row.
_RAW_ROW = "raw_row"
# Name of the scalar integer test column.
_DUMMY_COL = "column_0"
# Name and per-row length of the float vector test column.
_DUMMY_VEC_COL = "column_1"
_DUMMY_VEC_LEN = 4
# rows_per_core value passed to file_io.iter_shard_dataframe in the sharding
# tests.
_ROWS_PER_CORE = 4
# Keyword arguments for BaseTest._test_sharding. `expected` is a list of
# batches; each batch is a list of shards, and each shard lists the
# _DUMMY_COL values (row indices) it should contain.
_TEST_CASES = [
    # One batch of one
    dict(row_count=1, cpu_count=1, expected=[
        [[0]]
    ]),
    dict(row_count=10, cpu_count=1, expected=[
        [[0, 1, 2, 3]], [[4, 5, 6, 7]], [[8, 9]]
    ]),
    dict(row_count=21, cpu_count=1, expected=[
        [[0, 1, 2, 3]], [[4, 5, 6, 7]], [[8, 9, 10, 11]],
        [[12, 13, 14, 15]], [[16, 17, 18, 19]], [[20]]
    ]),
    dict(row_count=1, cpu_count=4, expected=[
        [[0]]
    ]),
    dict(row_count=10, cpu_count=4, expected=[
        [[0, 1], [2, 3, 4], [5, 6], [7, 8, 9]]
    ]),
    dict(row_count=21, cpu_count=4, expected=[
        [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]],
        [[16], [17], [18], [19, 20]]
    ]),
    dict(row_count=10, cpu_count=8, expected=[
        [[0], [1], [2], [3, 4], [5], [6], [7], [8, 9]]
    ]),
    dict(row_count=40, cpu_count=8, expected=[
        [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15],
         [16, 17, 18, 19], [20, 21, 22, 23], [24, 25, 26, 27],
         [28, 29, 30, 31]],
        [[32], [33], [34], [35], [36], [37], [38], [39]]
    ]),
]
# Parsing spec handed to tf.io.parse_example when reading the buffer back.
_FEATURE_MAP = {
    _RAW_ROW: tf.io.FixedLenFeature([1], dtype=tf.int64),
    _DUMMY_COL: tf.io.FixedLenFeature([1], dtype=tf.int64),
    _DUMMY_VEC_COL: tf.io.FixedLenFeature([_DUMMY_VEC_LEN], dtype=tf.float32)
}
@contextlib.contextmanager
def fixed_core_count(cpu_count):
  """Override CPU count.

  file_io.py uses the cpu_count function to scale to the size of the instance.
  However, this is not desirable for testing because it can make the test flaky.
  Instead, this context manager fixes the count for more robust testing.

  The original `multiprocessing.cpu_count` is restored when the block exits,
  including when the block raises.

  Args:
    cpu_count: How many cores multiprocessing claims to have.

  Yields:
    Nothing. (for context manager only)
  """
  old_count_fn = multiprocessing.cpu_count
  multiprocessing.cpu_count = lambda: cpu_count
  try:
    yield
  finally:
    # Without this, a failing assertion inside the `with` body would leave the
    # fake count patched in for every later test in the process.
    multiprocessing.cpu_count = old_count_fn
class BaseTest(tf.test.TestCase):
  """Tests dataframe sharding and TFRecord buffer round trips in file_io."""

  def setUp(self):
    super(BaseTest, self).setUp()
    # NOTE(review): `is_v2_0` is referenced without being called. If it is a
    # function in keras_utils this condition is always truthy; confirm whether
    # `keras_utils.is_v2_0()` was intended.
    if keras_utils.is_v2_0:
      tf.compat.v1.disable_eager_execution()

  def _test_sharding(self, row_count, cpu_count, expected):
    """Checks how iter_shard_dataframe splits `row_count` rows.

    Args:
      row_count: Number of rows in the generated dataframe.
      cpu_count: Core count reported by multiprocessing during sharding.
      expected: Nested list (batch -> shard -> row values) that the sharding
        must produce.
    """
    df = pd.DataFrame({_DUMMY_COL: list(range(row_count))})
    with fixed_core_count(cpu_count):
      shards = list(file_io.iter_shard_dataframe(df, _ROWS_PER_CORE))
    result = [[j[_DUMMY_COL].tolist() for j in i] for i in shards]
    self.assertAllEqual(expected, result)

  def test_tiny_rows_low_core(self):
    self._test_sharding(**_TEST_CASES[0])

  def test_small_rows_low_core(self):
    self._test_sharding(**_TEST_CASES[1])

  def test_large_rows_low_core(self):
    self._test_sharding(**_TEST_CASES[2])

  def test_tiny_rows_medium_core(self):
    self._test_sharding(**_TEST_CASES[3])

  def test_small_rows_medium_core(self):
    self._test_sharding(**_TEST_CASES[4])

  def test_large_rows_medium_core(self):
    self._test_sharding(**_TEST_CASES[5])

  def test_small_rows_large_core(self):
    self._test_sharding(**_TEST_CASES[6])

  def test_large_rows_large_core(self):
    self._test_sharding(**_TEST_CASES[7])

  def _serialize_deserialize(self, num_cores=1, num_rows=20):
    """Writes a dataframe to a temp buffer and verifies it reads back intact.

    Args:
      num_cores: Core count reported by multiprocessing while writing.
      num_rows: Number of rows in the generated dataframe.
    """
    np.random.seed(1)
    df = pd.DataFrame({
        # Serialization order is only deterministic for num_cores=1. raw_row is
        # used in validation after the deserialization.
        _RAW_ROW: np.array(range(num_rows), dtype=np.int64),
        _DUMMY_COL: np.random.randint(0, 35, size=(num_rows,)),
        _DUMMY_VEC_COL: [
            np.array([np.random.random() for _ in range(_DUMMY_VEC_LEN)])
            for i in range(num_rows)  # pylint: disable=unused-variable
        ]
    })

    with fixed_core_count(num_cores):
      buffer_path = file_io.write_to_temp_buffer(
          df, self.get_temp_dir(), [_RAW_ROW, _DUMMY_COL, _DUMMY_VEC_COL])

    with self.session(graph=tf.Graph()) as sess:
      dataset = tf.data.TFRecordDataset(buffer_path)
      dataset = dataset.batch(1).map(
          lambda x: tf.io.parse_example(serialized=x, features=_FEATURE_MAP))

      data_iter = tf.compat.v1.data.make_one_shot_iterator(dataset)
      # Build the get_next op once. Calling get_next() inside the loop would
      # add a new op to the graph on every iteration; running the same op
      # repeatedly advances the iterator just the same.
      row = data_iter.get_next()
      fetches = [row[_RAW_ROW], row[_DUMMY_COL], row[_DUMMY_VEC_COL]]

      seen_rows = set()
      # Read a few steps past the end to confirm no extra rows were written.
      for i in range(num_rows + 5):
        try:
          row_id, val_0, val_1 = sess.run(fetches)
          row_id, val_0, val_1 = row_id[0][0], val_0[0][0], val_1[0]
          self.assertNotIn(row_id, seen_rows)
          seen_rows.add(row_id)

          self.assertEqual(val_0, df[_DUMMY_COL][row_id])
          self.assertAllClose(val_1, df[_DUMMY_VEC_COL][row_id])

          self.assertLess(i, num_rows, msg="Too many rows.")
        except tf.errors.OutOfRangeError:
          self.assertGreaterEqual(i, num_rows, msg="Too few rows.")

    # Purging the collector must delete the temporary buffer.
    file_io._GARBAGE_COLLECTOR.purge()
    self.assertFalse(tf.io.gfile.exists(buffer_path))

  def test_serialize_deserialize_0(self):
    self._serialize_deserialize(num_cores=1)

  def test_serialize_deserialize_1(self):
    self._serialize_deserialize(num_cores=2)

  def test_serialize_deserialize_2(self):
    self._serialize_deserialize(num_cores=8)
if __name__ == "__main__":
tf.test.main()
@@ -0,0 +1,49 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Convenience functions for exporting models as SavedModels or other types."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
def build_tensor_serving_input_receiver_fn(shape, dtype=tf.float32,
                                           batch_size=1):
  """Returns an input_receiver_fn that can be used during serving.

  This expects examples to come through as float tensors, and simply
  wraps them as TensorServingInputReceivers.

  Arguably, this should live in tf.estimator.export. Testing here first.

  Args:
    shape: list (or other sequence, such as a tuple) representing target size
      of a single example.
    dtype: the expected datatype for the input example
    batch_size: number of input tensors that will be passed for prediction

  Returns:
    A function that itself returns a TensorServingInputReceiver.
  """
  def serving_input_receiver_fn():
    # Prep a placeholder where the input example will be fed in. `shape` is
    # converted with list() so a tuple does not break the concatenation.
    features = tf.compat.v1.placeholder(
        dtype=dtype, shape=[batch_size] + list(shape), name='input_tensor')

    return tf.estimator.export.TensorServingInputReceiver(
        features=features, receiver_tensors=features)

  return serving_input_receiver_fn
@@ -0,0 +1,63 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for exporting utils."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.r1.utils import export
class ExportUtilsTest(tf.test.TestCase):
  """Tests for the ExportUtils."""

  def _check_receiver(self, receiver, expected_shape, expected_dtype):
    """Asserts the receiver wraps one tensor of the given shape and dtype."""
    wanted_shape = tf.TensorShape(expected_shape)
    self.assertIsInstance(
        receiver, tf.estimator.export.TensorServingInputReceiver)

    self.assertIsInstance(receiver.features, tf.Tensor)
    self.assertEqual(receiver.features.shape, wanted_shape)
    self.assertEqual(receiver.features.dtype, expected_dtype)
    self.assertIsInstance(receiver.receiver_tensors, dict)
    # Note that Python 3 can no longer index .values() directly; cast to list.
    first_tensor = list(receiver.receiver_tensors.values())[0]
    self.assertEqual(first_tensor.shape, wanted_shape)

  def test_build_tensor_serving_input_receiver_fn(self):
    receiver_fn = export.build_tensor_serving_input_receiver_fn(shape=[4, 5])
    with tf.Graph().as_default():
      receiver = receiver_fn()
      self._check_receiver(receiver, [1, 4, 5], tf.float32)

  def test_build_tensor_serving_input_receiver_fn_batch_dtype(self):
    receiver_fn = export.build_tensor_serving_input_receiver_fn(
        shape=[4, 5], dtype=tf.int8, batch_size=10)
    with tf.Graph().as_default():
      receiver = receiver_fn()
      self._check_receiver(receiver, [10, 4, 5], tf.int8)
if __name__ == "__main__":
tf.test.main()
@@ -0,0 +1,116 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions specific to running TensorFlow on TPUs."""
import tensorflow as tf
# "local" is a magic word in the TPU cluster resolver; it informs the resolver
# to use the local CPU as the compute device. This is useful for testing and
# debugging; the code flow is ostensibly identical, but without the need to
# actually have a TPU on the other end.
LOCAL = "local"
def construct_scalar_host_call(metric_dict, model_dir, prefix=""):
  """Builds a host call that writes scalar summaries while training on TPU.

  Args:
    metric_dict: A dict mapping metric name to the tensor to be logged. Each
      tensor is reshaped to `[1]`.
    model_dir: The location to write the summary.
    prefix: The prefix (if any) to prepend to the metric names.

  Returns:
    A tuple of (function, args_to_be_passed_to_said_function). The args are
    the global step tensor followed by the metric tensors in key order.
  """
  # Fix the key order once so names line up with the positional tensors.
  names = list(metric_dict.keys())

  def host_call_fn(global_step, *args):
    """Training host call. Creates scalar summaries for training metrics.

    This function is executed on the CPU and should not directly reference
    any Tensors in the rest of the `model_fn`. To pass Tensors from the
    model to the `metric_fn`, provide as part of the `host_call`. See
    https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
    for more information.

    Arguments should match the list of `Tensor` objects passed as the second
    element in the tuple passed to `host_call`.

    Args:
      global_step: `Tensor` with shape `[batch]` for the global_step
      *args: Remaining tensors to log.

    Returns:
      List of summary ops to run on the CPU host.
    """
    step = global_step[0]
    summary_writer = tf.compat.v1.summary.create_file_writer(
        logdir=model_dir, filename_suffix=".host_call")
    with summary_writer.as_default():
      with tf.compat.v1.summary.always_record_summaries():
        for position, metric_name in enumerate(names):
          tf.compat.v1.summary.scalar(
              prefix + metric_name, args[position][0], step=step)

        return tf.compat.v1.summary.all_summary_ops()

  # To log the current learning rate, and gradient norm for Tensorboard, the
  # summary op needs to be run on the host CPU via host_call. host_call
  # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
  # dimension. These Tensors are implicitly concatenated to
  # [params['batch_size']].
  host_call_args = [
      tf.reshape(tf.compat.v1.train.get_or_create_global_step(), [1])]
  for metric_name in names:
    host_call_args.append(tf.reshape(metric_dict[metric_name], [1]))

  return host_call_fn, host_call_args
def embedding_matmul(embedding_table, values, mask, name="embedding_matmul"):
  """Looks up embeddings by multiplying a weighted one-hot tensor by the table.

  The one-hot tensor is built without a scatter: the indices and a
  range(num_embedding_table_rows) are both tiled to a common shape and
  compared for equality. Matching positions take the mask value and all
  other positions are zero, so masked positions produce an embedding vector
  of zeros.

  Args:
    embedding_table: Tensor of embedding table.
      Rank 2 (table_size x embedding dim)
    values: Tensor of embedding indices. Rank 2 (batch x n_indices)
    mask: Tensor of mask / weights. Rank 2 (batch x n_indices)
    name: Optional name scope for created ops

  Returns:
    Rank 3 tensor of embedding vectors.
  """
  with tf.name_scope(name):
    table_rows = embedding_table.get_shape().as_list()[0]
    batch_size, padded_size = values.shape.as_list()

    per_index_shape = (batch_size, padded_size, 1)
    tile_over_rows = (1, 1, table_rows)
    tile_over_batch = (batch_size, padded_size, 1)

    # Repeat each index and each weight once per table row.
    tiled_indices = tf.tile(tf.reshape(values, per_index_shape),
                            tile_over_rows)
    tiled_weights = tf.tile(tf.reshape(mask, per_index_shape),
                            tile_over_rows)
    # Row ids 0..table_rows-1 repeated for every (batch, position).
    row_ids = tf.reshape(tf.range(table_rows), (1, 1, table_rows))
    tiled_row_ids = tf.tile(row_ids, tile_over_batch)

    is_selected = tf.equal(tiled_indices, tiled_row_ids)
    zeros = tf.zeros((batch_size, padded_size, table_rows))
    weighted_one_hot = tf.where(is_selected, tiled_weights, zeros)

    return tf.tensordot(weighted_one_hot, embedding_table, 1)
@@ -0,0 +1,108 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Test TPU optimized matmul embedding."""
import numpy as np
import tensorflow as tf
from official.r1.utils import tpu as tpu_utils
# Keyword arguments for TPUBaseTester._test_embedding and _test_masking. Each
# dict is unpacked by one test_embedding_N / test_masking_N method.
TEST_CASES = [
    dict(embedding_dim=256, vocab_size=1000, sequence_length=64,
         batch_size=32, seed=54131),
    dict(embedding_dim=8, vocab_size=15, sequence_length=12,
         batch_size=256, seed=536413),
    dict(embedding_dim=2048, vocab_size=512, sequence_length=50,
         batch_size=8, seed=35124)
]
class TPUBaseTester(tf.test.TestCase):
  """Compares tpu_utils.embedding_matmul against tf.nn.embedding_lookup."""

  def construct_embedding_and_values(self, embedding_dim, vocab_size,
                                     sequence_length, batch_size, seed):
    """Builds a random embedding table plus zero-padded token ids and mask.

    Args:
      embedding_dim: Width of each embedding vector.
      vocab_size: Number of rows in the embedding table.
      sequence_length: Number of token positions per example.
      batch_size: Number of examples.
      seed: Seed for NumPy's global random generator.

    Returns:
      A tuple (embedding_table, values, mask) of tensors. `mask` is 1.0 where
      the token id is non-zero and 0.0 elsewhere.
    """
    np.random.seed(seed)

    embeddings = np.random.random(size=(vocab_size, embedding_dim))
    embedding_table = tf.convert_to_tensor(value=embeddings, dtype=tf.float32)

    tokens = np.random.randint(low=1, high=vocab_size-1,
                               size=(batch_size, sequence_length))
    # Zero out a random-length tail of every row; id 0 acts as padding.
    for i in range(batch_size):
      tokens[i, np.random.randint(low=0, high=sequence_length-1):] = 0
    values = tf.convert_to_tensor(value=tokens, dtype=tf.int32)
    mask = tf.cast(tf.not_equal(values, 0), dtype=tf.float32)
    return embedding_table, values, mask

  def _test_embedding(self, embedding_dim, vocab_size,
                      sequence_length, batch_size, seed):
    """Test that matmul embedding matches embedding lookup (gather)."""

    # cached_session() replaces the deprecated test_session().
    with self.cached_session():
      embedding_table, values, mask = self.construct_embedding_and_values(
          embedding_dim=embedding_dim,
          vocab_size=vocab_size,
          sequence_length=sequence_length,
          batch_size=batch_size,
          seed=seed
      )

      embedding = (tf.nn.embedding_lookup(params=embedding_table, ids=values) *
                   tf.expand_dims(mask, -1))

      matmul_embedding = tpu_utils.embedding_matmul(
          embedding_table=embedding_table, values=values, mask=mask)

      self.assertAllClose(embedding, matmul_embedding)

  def _test_masking(self, embedding_dim, vocab_size,
                    sequence_length, batch_size, seed):
    """Test that matmul embedding properly zeros masked positions."""
    # cached_session() replaces the deprecated test_session().
    with self.cached_session():
      embedding_table, values, mask = self.construct_embedding_and_values(
          embedding_dim=embedding_dim,
          vocab_size=vocab_size,
          sequence_length=sequence_length,
          batch_size=batch_size,
          seed=seed
      )

      matmul_embedding = tpu_utils.embedding_matmul(
          embedding_table=embedding_table, values=values, mask=mask)

      # Multiplying by the mask again must be a no-op if masked positions are
      # already zero.
      self.assertAllClose(matmul_embedding,
                          matmul_embedding * tf.expand_dims(mask, -1))

  def test_embedding_0(self):
    self._test_embedding(**TEST_CASES[0])

  def test_embedding_1(self):
    self._test_embedding(**TEST_CASES[1])

  def test_embedding_2(self):
    self._test_embedding(**TEST_CASES[2])

  def test_masking_0(self):
    self._test_masking(**TEST_CASES[0])

  def test_masking_1(self):
    self._test_masking(**TEST_CASES[1])

  def test_masking_2(self):
    self._test_masking(**TEST_CASES[2])
if __name__ == "__main__":
tf.test.main()
@@ -0,0 +1,24 @@
six
google-api-python-client>=1.6.7
google-cloud-bigquery>=0.31.0
kaggle>=1.3.9
mlperf_compliance==0.0.10
numpy>=1.15.4
oauth2client>=4.1.2
pandas>=0.22.0
psutil>=5.4.3
py-cpuinfo>=3.3.0
scipy>=0.19.1
tensorflow-hub>=0.6.0
tensorflow-model-optimization>=0.2.1
tensorflow_datasets
dataclasses
gin-config
typing
sentencepiece
Cython
matplotlib
opencv-python-headless
pyyaml
Pillow
-e git+https://github.com/cocodataset/cocoapi#egg=pycocotools&subdirectory=PythonAPI
@@ -0,0 +1,97 @@
# Adding Abseil (absl) flags quickstart
## Defining a flag
absl flag definitions are similar to argparse, although they are defined on a global namespace.
For instance defining a string flag looks like:
```$xslt
from absl import flags
flags.DEFINE_string(
name="my_flag",
default="a_sensible_default",
help="Here is what this flag does."
)
```
All three arguments are required, but `default` may be `None`. A common optional argument is
`short_name`, which defines an abbreviation for the flag. Certain `DEFINE_*` methods have other
required arguments. For instance, `DEFINE_enum` requires the `enum_values` argument to be specified.
## Key Flags
absl has the concept of a key flag. Any flag defined in `__main__` is considered a key flag by
default. Key flags are displayed in `--help`, others only appear in `--helpfull`. In order to
handle key flags that are defined outside the module in question, absl provides the
`flags.adopt_module_key_flags()` method. This adds the key flags of a different module to one's own
key flags. For example:
```$xslt
File: flag_source.py
---------------------------------------
from absl import flags
flags.DEFINE_string(name="my_flag", default="abc", help="a flag.")
```
```$xslt
File: my_module.py
---------------------------------------
from absl import app as absl_app
from absl import flags
import flag_source
flags.adopt_module_key_flags(flag_source)
def main(_):
pass
absl_app.run(main, [__file__, "-h"])
```
When `my_module.py` is run it will show the help text for `my_flag`. Because not all flags defined
in a file are equally important, `official/utils/flags/core.py` (generally imported as flags_core)
provides an abstraction for handling key flag declaration in an easy way through the
`register_key_flags_in_core()` function, which allows a module to make a single
`flags.adopt_module_key_flags(flags_core)` call when using the util flag declaration functions.
## Validators
Often the constraints on a flag are complicated. absl provides the validator decorator to allow
one to mark a function as a flag validation function. Suppose we want users to provide a flag
which is a palindrome.
```$xslt
from absl import flags
flags.DEFINE_string(name="pal_flag", short_name="pf", default="", help="Give me a palindrome")
@flags.validator("pal_flag")
def _check_pal(provided_pal_flag):
return provided_pal_flag == provided_pal_flag[::-1]
```
Validators take the form that returning True (truthy) passes, and all others
(False, None, exception) fail.
## Testing
To test using absl, simply declare flags in the `setUpClass` method of TensorFlow's TestCase.
```$xslt
from absl import flags
import tensorflow as tf
from official.utils.flags import core as flags_core
def define_flags():
flags.DEFINE_string(name="test_flag", default="abc", help="an example flag")
class BaseTester(tf.test.TestCase):
@classmethod
def setUpClass(cls):
super(BaseTester, cls).setUpClass()
define_flags()
def test_trivial(self):
flags_core.parse_flags([__file__, "--test_flag", "def"])
self.assertEqual(flags.FLAGS.test_flag, "def")
```
@@ -0,0 +1,163 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Flags which will be nearly universal across models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import flags
import tensorflow as tf
from official.utils.flags._conventions import help_wrap
from official.utils.logs import hooks_helper
############## npu modify begin #############
from hccl.manage.api import get_rank_size
from hccl.manage.api import get_rank_id
############## npu modify end ###############
def define_base(data_dir=True, model_dir=True, clean=False, train_epochs=False,
                epochs_between_evals=False, stop_threshold=False,
                batch_size=True, num_gpu=False, hooks=False, export_dir=False,
                distribution_strategy=False, run_eagerly=False):
  """Register base flags.

  Each boolean argument switches the definition of one absl flag on or off.
  Most, but not all, of the defined flags are also reported as key flags; see
  the inline notes for the ones that are not.

  Args:
    data_dir: Create a flag for specifying the input data directory.
    model_dir: Create a flag for specifying the model file directory.
    clean: Create a flag for removing the model_dir.
    train_epochs: Create a flag to specify the number of training epochs.
    epochs_between_evals: Create a flag to specify the frequency of testing.
    stop_threshold: Create a flag to specify a threshold accuracy or other
      eval metric which should trigger the end of training.
    batch_size: Create a flag to specify the batch size.
    num_gpu: Create a flag to specify the number of GPUs used.
    hooks: Create a flag to specify hooks for logging.
    export_dir: Create a flag to specify where a SavedModel should be exported.
    distribution_strategy: Create a flag to specify which Distribution Strategy
      to use.
    run_eagerly: Create a flag to specify to run eagerly op by op.

  Returns:
    A list of flags for core.py to marks as key flags.
  """
  key_flags = []

  if data_dir:
    flags.DEFINE_string(
        name="data_dir", short_name="dd", default="/tmp",
        help=help_wrap("The location of the input data."))
    key_flags.append("data_dir")

  if model_dir:
    flags.DEFINE_string(
        name="model_dir", short_name="md", default="/tmp",
        help=help_wrap("The location of the model checkpoint files."))
    key_flags.append("model_dir")

  if clean:
    flags.DEFINE_boolean(
        name="clean", default=False,
        help=help_wrap("If set, model_dir will be removed if it exists."))
    key_flags.append("clean")

  if train_epochs:
    flags.DEFINE_integer(
        name="train_epochs", short_name="te", default=1,
        help=help_wrap("The number of epochs used to train."))
    key_flags.append("train_epochs")

  if epochs_between_evals:
    flags.DEFINE_integer(
        name="epochs_between_evals", short_name="ebe", default=1,
        help=help_wrap("The number of training epochs to run between "
                       "evaluations."))
    key_flags.append("epochs_between_evals")

  # NOTE: stop_threshold is defined but not added to key_flags.
  if stop_threshold:
    flags.DEFINE_float(
        name="stop_threshold", short_name="st",
        default=None,
        help=help_wrap("If passed, training will stop at the earlier of "
                       "train_epochs and when the evaluation metric is "
                       "greater than or equal to stop_threshold."))

  if batch_size:
    flags.DEFINE_integer(
        name="batch_size", short_name="bs", default=32,
        help=help_wrap("Batch size for training and evaluation. When using "
                       "multiple gpus, this is the global batch size for "
                       "all devices. For example, if the batch size is 32 "
                       "and there are 4 GPUs, each GPU will get 8 examples on "
                       "each step."))
    key_flags.append("batch_size")

  # NOTE: the argument is `num_gpu` but the flag is named `num_gpus`; it is
  # not added to key_flags.
  if num_gpu:
    flags.DEFINE_integer(
        name="num_gpus", short_name="ng",
        default=1,
        help=help_wrap(
            "How many GPUs to use at each worker with the "
            "DistributionStrategies API. The default is 1."))

  # NOTE: run_eagerly is not added to key_flags, and its help text is the
  # only one here that is not passed through help_wrap.
  if run_eagerly:
    flags.DEFINE_boolean(
        name="run_eagerly", default=False,
        help="Run the model op by op without building a model function.")

  if hooks:
    # Construct a pretty summary of hooks.
    hook_list_str = (
        u"\ufeff Hook:\n" + u"\n".join([u"\ufeff {}".format(key) for key
                                        in hooks_helper.HOOKS]))
    flags.DEFINE_list(
        name="hooks", short_name="hk", default="LoggingTensorHook",
        help=help_wrap(
            u"A list of (case insensitive) strings to specify the names of "
            u"training hooks.\n{}\n\ufeff Example: `--hooks ProfilerHook,"
            u"ExamplesPerSecondHook`\n See official.utils.logs.hooks_helper "
            u"for details.".format(hook_list_str))
    )
    key_flags.append("hooks")

  if export_dir:
    flags.DEFINE_string(
        name="export_dir", short_name="ed", default=None,
        help=help_wrap("If set, a SavedModel serialization of the model will "
                       "be exported to this directory at the end of training. "
                       "See the README for more details and relevant links.")
    )
    key_flags.append("export_dir")

  # NOTE: distribution_strategy is defined but not added to key_flags.
  if distribution_strategy:
    flags.DEFINE_string(
        name="distribution_strategy", short_name="ds", default="mirrored",
        help=help_wrap("The Distribution Strategy to use for training. "
                       "Accepted values are 'off', 'one_device', "
                       "'mirrored', 'parameter_server', 'collective', "
                       "case insensitive. 'off' means not to use "
                       "Distribution Strategy; 'default' means to choose "
                       "from `MirroredStrategy` or `OneDeviceStrategy` "
                       "according to the number of GPUs.")
    )

  return key_flags
def get_num_gpus(flags_obj):
  """Returns the device count reported by the hccl API.

  Args:
    flags_obj: Parsed flags object. It is accepted for interface
      compatibility and is not read.

  Returns:
    The value returned by hccl's `get_rank_size()`.
  """
  ############## npu modify begin #############
  rank_size = get_rank_size()
  ############## npu modify end ###############
  return rank_size
@@ -0,0 +1,109 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Flags for benchmarking models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import flags
from official.utils.flags._conventions import help_wrap
def define_log_steps():
  """Registers the `log_steps` flag.

  Returns:
    An empty list: the flag is defined but not reported as a key flag.
  """
  no_key_flags = []
  flags.DEFINE_integer(
      name="log_steps",
      default=100,
      help="Frequency with which to log timing information with TimeHistory.")
  return no_key_flags
def define_benchmark(benchmark_log_dir=True, bigquery_uploader=True):
  """Register benchmarking flags.

  Always defines `benchmark_logger_type`, `benchmark_test_id` and `log_steps`.
  The remaining flags are defined only when the matching argument is true.

  Args:
    benchmark_log_dir: Create a flag to specify location for benchmark logging.
      When true, a validator is also registered that requires
      --benchmark_log_dir whenever the logger type is BenchmarkFileLogger.
    bigquery_uploader: Create flags for uploading results to BigQuery.

  Returns:
    A list of flags for core.py to marks as key flags. (Currently always
    empty.)
  """

  key_flags = []

  flags.DEFINE_enum(
      name="benchmark_logger_type", default="BaseBenchmarkLogger",
      enum_values=["BaseBenchmarkLogger", "BenchmarkFileLogger",
                   "BenchmarkBigQueryLogger"],
      help=help_wrap("The type of benchmark logger to use. Defaults to using "
                     "BaseBenchmarkLogger which logs to STDOUT. Different "
                     "loggers will require other flags to be able to work."))
  flags.DEFINE_string(
      name="benchmark_test_id", short_name="bti", default=None,
      help=help_wrap("The unique test ID of the benchmark run. It could be the "
                     "combination of key parameters. It is hardware "
                     "independent and could be used compare the performance "
                     "between different test runs. This flag is designed for "
                     "human consumption, and does not have any impact within "
                     "the system."))

  define_log_steps()

  if benchmark_log_dir:
    flags.DEFINE_string(
        name="benchmark_log_dir", short_name="bld", default=None,
        help=help_wrap("The location of the benchmark logging.")
    )

  if bigquery_uploader:
    flags.DEFINE_string(
        name="gcp_project", short_name="gp", default=None,
        help=help_wrap(
            "The GCP project name where the benchmark will be uploaded."))

    flags.DEFINE_string(
        name="bigquery_data_set", short_name="bds", default="test_benchmark",
        help=help_wrap(
            "The Bigquery dataset name where the benchmark will be uploaded."))

    flags.DEFINE_string(
        name="bigquery_run_table", short_name="brt", default="benchmark_run",
        help=help_wrap("The Bigquery table name where the benchmark run "
                       "information will be uploaded."))

    flags.DEFINE_string(
        name="bigquery_run_status_table", short_name="brst",
        default="benchmark_run_status",
        help=help_wrap("The Bigquery table name where the benchmark run "
                       "status information will be uploaded."))

    flags.DEFINE_string(
        name="bigquery_metric_table", short_name="bmt",
        default="benchmark_metric",
        help=help_wrap("The Bigquery table name where the benchmark metric "
                       "information will be uploaded."))

  # The validator names the benchmark_log_dir flag, so it can only be
  # registered when that flag was defined above. Registering it
  # unconditionally would refer to an undefined flag when
  # benchmark_log_dir=False.
  if benchmark_log_dir:
    @flags.multi_flags_validator(
        ["benchmark_logger_type", "benchmark_log_dir"],
        message="--benchmark_logger_type=BenchmarkFileLogger will require "
                "--benchmark_log_dir being set")
    def _check_benchmark_log_dir(flags_dict):
      """Passes unless BenchmarkFileLogger is chosen without a log dir."""
      benchmark_logger_type = flags_dict["benchmark_logger_type"]
      if benchmark_logger_type == "BenchmarkFileLogger":
        return flags_dict["benchmark_log_dir"]
      return True

  return key_flags
@@ -0,0 +1,54 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Central location for shared argparse convention definitions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import codecs
import functools
from absl import app as absl_app
from absl import flags
# This codifies help string conventions and makes it easy to update them if
# necessary. Currently the only major effect is that help bodies start on the
# line after flags are listed. All flag definitions should wrap the text bodies
# with help wrap when calling DEFINE_*.
# `_help_wrap` is `flags.text_wrap` pre-bound to a length of 80, an empty
# indent, and a newline as the first-line indent.
_help_wrap = functools.partial(flags.text_wrap, length=80, indent="",
                               firstline_indent="\n")
# Pretty formatting causes issues when utf-8 is not installed on a system.
def _stdout_utf8():
  """Return True when the utf-8 codec exists and stdout is encoded with it.

  The encoding name reported by `sys.stdout` is normalized through
  `codecs.lookup` before it is compared, so spellings such as "UTF-8",
  "utf-8" and "utf8" are all recognized.

  Returns:
    True if stdout reports a UTF-8 encoding. False if the utf-8 codec is
    unavailable, stdout is missing, its encoding is unset, or the encoding
    name is unknown.
  """
  try:
    utf8_name = codecs.lookup("utf-8").name
  except LookupError:
    return False

  # sys.stdout may be None or may lack a usable `encoding` attribute.
  encoding = getattr(sys.stdout, "encoding", None)
  if not encoding:
    return False

  try:
    return codecs.lookup(encoding).name == utf8_name
  except (LookupError, TypeError):
    return False
# Select the public `help_wrap`. When stdout is not UTF-8, the wrapped text
# additionally has every U+FEFF character removed; otherwise `_help_wrap` is
# used unchanged.
if not _stdout_utf8():
  def help_wrap(text, *args, **kwargs):
    wrapped = _help_wrap(text, *args, **kwargs)
    return wrapped.replace(u"\ufeff", u"")
else:
  help_wrap = _help_wrap
# Replace None with h to also allow -h
# (done by overwriting the SHORT_NAME class attribute when this module is
# imported).
absl_app.HelpshortFlag.SHORT_NAME = "h"
@@ -0,0 +1,85 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Flags for managing compute devices. Currently only contains TPU flags."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import flags
import tensorflow as tf
from official.utils.flags._conventions import help_wrap
def require_cloud_storage(flag_names):
  """Register a validator to check directory flags.

  The validator only applies when the `tpu` flag is set. In that case every
  flag named in `flag_names` must hold a string starting with "gs://"; each
  offending flag is logged as an error and validation fails.

  Args:
    flag_names: An iterable of strings containing the names of flags to be
      checked.
  """
  # Materialize once: the argument is documented as any iterable, but it is
  # concatenated to a list below and iterated again on every validation.
  flag_names = list(flag_names)
  msg = "TPU requires GCS path for {}".format(", ".join(flag_names))

  @flags.multi_flags_validator(["tpu"] + flag_names, message=msg)
  def _path_check(flag_values):  # pylint: disable=missing-docstring
    if flag_values["tpu"] is None:
      return True

    valid_flags = True
    for key in flag_names:
      value = flag_values[key]
      # An unset (None) flag is not a GCS path; report it rather than crash
      # on `None.startswith`.
      if value is None or not value.startswith("gs://"):
        tf.compat.v1.logging.error("{} must be a GCS path.".format(key))
        valid_flags = False

    return valid_flags
def define_device(tpu=True):
  """Register device specific flags.

  Args:
    tpu: Create flags to specify TPU operation (`tpu`, `tpu_zone`,
      `tpu_gcp_project` and `num_tpu_shards`).

  Returns:
    A list of flags for core.py to mark as key flags.
  """

  key_flags = []

  if tpu:
    flags.DEFINE_string(
        name="tpu", default=None,
        help=help_wrap(
            "The Cloud TPU to use for training. This should be either the name "
            "used when creating the Cloud TPU, or a "
            "grpc://ip.address.of.tpu:8470 url. Passing `local` will use the "
            "CPU of the local instance instead. (Good for debugging.)"))
    key_flags.append("tpu")

    flags.DEFINE_string(
        name="tpu_zone", default=None,
        help=help_wrap(
            "[Optional] GCE zone where the Cloud TPU is located in. If not "
            "specified, we will attempt to automatically detect the GCE "
            "zone from metadata."))

    flags.DEFINE_string(
        name="tpu_gcp_project", default=None,
        help=help_wrap(
            "[Optional] Project name for the Cloud TPU-enabled project. If not "
            "specified, we will attempt to automatically detect the GCE "
            "project from metadata."))

    flags.DEFINE_integer(name="num_tpu_shards", default=8,
                         help=help_wrap("Number of shards (TPU chips)."))

  return key_flags
@@ -0,0 +1,54 @@
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Flags related to distributed execution."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import flags
import tensorflow as tf
from official.utils.flags._conventions import help_wrap
def define_distribution(worker_hosts=True, task_index=True):
  """Register the flags used for distributed (multi-worker) execution.

  Args:
    worker_hosts: Whether to create the `worker_hosts` flag, a comma-separated
      list of workers.
    task_index: Whether to create the `task_index` flag, the index of the
      task.

  Returns:
    A list of flags for core.py to mark as key flags. This function never
    adds any, so the list is empty.
  """
  if worker_hosts:
    worker_hosts_help = (
        "Comma-separated list of worker ip:port pairs for running "
        "multi-worker models with DistributionStrategy. The user would "
        "start the program on each host with identical value for this "
        "flag.")
    flags.DEFINE_string(
        name="worker_hosts", default=None, help=help_wrap(worker_hosts_help))

  if task_index:
    task_index_help = (
        "If multi-worker training, the task_index of this "
        "worker.")
    flags.DEFINE_integer(
        name="task_index", default=-1, help=help_wrap(task_index_help))

  return []
@@ -0,0 +1,50 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Misc flags."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import flags
from official.utils.flags._conventions import help_wrap
def define_image(data_format=True):
  """Register the image related flags.

  Args:
    data_format: Whether to create the `data_format` flag, which selects the
      image axis convention.

  Returns:
    A list of flags for core.py to mark as key flags: `["data_format"]` when
    the flag is created, otherwise an empty list.
  """
  if not data_format:
    return []

  data_format_help = (
      "A flag to override the data format used in the model. "
      "channels_first provides a performance boost on GPU but is not "
      "always compatible with CPU. If left unspecified, the data format "
      "will be chosen automatically based on whether TensorFlow was "
      "built for CPU or GPU.")
  flags.DEFINE_enum(
      name="data_format", short_name="df", default=None,
      enum_values=["channels_first", "channels_last"],
      help=help_wrap(data_format_help))

  return ["data_format"]
@@ -0,0 +1,289 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Register flags for optimizing performance."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing
from absl import flags # pylint: disable=g-bad-import-order
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.utils.flags._conventions import help_wrap
# Map string to TensorFlow dtype
# (the keys are also used as the accepted values of the `dtype` flag).
DTYPE_MAP = {
    "fp16": tf.float16,
    "bf16": tf.bfloat16,
    "fp32": tf.float32,
}
def get_tf_dtype(flags_obj):
  """Return the TensorFlow dtype selected by the flags.

  Args:
    flags_obj: Object exposing a `dtype` attribute and, optionally, an
      `fp16_implementation` attribute.

  Returns:
    `tf.float32` when `fp16_implementation` is "graph_rewrite"; otherwise the
    `DTYPE_MAP` entry for `flags_obj.dtype`.
  """
  implementation = getattr(flags_obj, "fp16_implementation", None)
  uses_graph_rewrite = implementation == "graph_rewrite"
  # If the graph_rewrite is used, we build the graph with fp32, and let the
  # graph rewrite change ops to fp16.
  return tf.float32 if uses_graph_rewrite else DTYPE_MAP[flags_obj.dtype]
def get_loss_scale(flags_obj, default_for_fp16):
  """Return the loss scale to use for the given flags.

  Args:
    flags_obj: Object exposing `loss_scale` plus whatever `get_tf_dtype`
      reads.
    default_for_fp16: Value returned for fp16 when `loss_scale` is unset.

  Returns:
    The string "dynamic" if that was requested, the requested loss scale as a
    float if one was given, 1 for fp32/bf16, and `default_for_fp16` otherwise.
  """
  dtype = get_tf_dtype(flags_obj)
  requested = flags_obj.loss_scale

  if requested == "dynamic":
    return requested
  if requested is not None:
    return float(requested)
  if dtype in (tf.float32, tf.bfloat16):
    return 1  # No loss scaling is needed for fp32

  assert dtype == tf.float16
  return default_for_fp16
def define_performance(num_parallel_calls=False, inter_op=False, intra_op=False,
                       synthetic_data=False, max_train_steps=False, dtype=False,
                       all_reduce_alg=False, num_packs=False,
                       tf_gpu_thread_mode=False,
                       datasets_num_private_threads=False,
                       datasets_num_parallel_batches=False,
                       dynamic_loss_scale=False, fp16_implementation=False,
                       loss_scale=False,
                       tf_data_experimental_slack=False, enable_xla=False,
                       training_dataset_cache=False):
  """Register flags for specifying performance tuning arguments.

  Args:
    num_parallel_calls: Create a flag to specify parallelism of data loading.
    inter_op: Create a flag to allow specification of inter op threads.
    intra_op: Create a flag to allow specification of intra op threads.
    synthetic_data: Create a flag to allow the use of synthetic data.
    max_train_steps: Create a flag to allow specification of maximum number
      of training steps.
    dtype: Create flags for specifying dtype.
    all_reduce_alg: If set forces a specific algorithm for multi-gpu.
    num_packs: If set provides number of packs for MirroredStrategy's cross
      device ops.
    tf_gpu_thread_mode: gpu_private triggers use of private thread pool.
    datasets_num_private_threads: Number of private threads for datasets.
    datasets_num_parallel_batches: Determines how many batches to process in
      parallel when using map and batch from tf.data.
    dynamic_loss_scale: Allow the "loss_scale" flag to take on the value
      "dynamic". Only valid if `dtype` is True.
    fp16_implementation: Create fp16_implementation flag. Only takes effect
      if `dtype` is True.
    loss_scale: Controls the loss scaling, normally for mixed-precision
      training. Can only be turned on if dtype is also True.
    tf_data_experimental_slack: Determines whether to enable tf.data's
      `experimental_slack` option.
    enable_xla: Determines if XLA (auto clustering) is turned on.
    training_dataset_cache: Whether to cache the training dataset on workers.
      Typically used to improve training performance when training data is in
      remote storage and can fit into worker memory.

  Returns:
    A list of flags for core.py to mark as key flags.
  """

  key_flags = []
  if num_parallel_calls:
    flags.DEFINE_integer(
        name="num_parallel_calls", short_name="npc",
        default=multiprocessing.cpu_count(),
        help=help_wrap("The number of records that are processed in parallel "
                       "during input processing. This can be optimized per "
                       "data set but for generally homogeneous data sets, "
                       "should be approximately the number of available CPU "
                       "cores. (default behavior)"))

  if inter_op:
    flags.DEFINE_integer(
        name="inter_op_parallelism_threads", short_name="inter", default=0,
        help=help_wrap("Number of inter_op_parallelism_threads to use for CPU. "
                       "See TensorFlow config.proto for details.")
    )

  if intra_op:
    flags.DEFINE_integer(
        name="intra_op_parallelism_threads", short_name="intra", default=0,
        help=help_wrap("Number of intra_op_parallelism_threads to use for CPU. "
                       "See TensorFlow config.proto for details."))

  if synthetic_data:
    flags.DEFINE_bool(
        name="use_synthetic_data", short_name="synth", default=False,
        help=help_wrap(
            "If set, use fake data (zeroes) instead of a real dataset. "
            "This mode is useful for performance debugging, as it removes "
            "input processing steps, but will not learn anything."))

  if max_train_steps:
    flags.DEFINE_integer(
        name="max_train_steps", short_name="mts", default=None, help=help_wrap(
            "The model will stop training if the global_step reaches this "
            "value. If not set, training will run until the specified number "
            "of epochs have run as usual. It is generally recommended to set "
            "--train_epochs=1 when using this flag."
        ))

  if dtype:
    flags.DEFINE_enum(
        name="dtype", short_name="dt", default="fp32",
        enum_values=DTYPE_MAP.keys(),
        help=help_wrap("The TensorFlow datatype used for calculations. "
                       "Variables may be cast to a higher precision on a "
                       "case-by-case basis for numerical stability."))

    # The loss_scale and fp16_implementation flags are only meaningful
    # together with the dtype flag, so they are defined inside this branch.
    loss_scale_help_text = (
        "The amount to scale the loss by when the model is run. {}. Before "
        "gradients are computed, the loss is multiplied by the loss scale, "
        "making all gradients loss_scale times larger. To adjust for this, "
        "gradients are divided by the loss scale before being applied to "
        "variables. This is mathematically equivalent to training without "
        "a loss scale, but the loss scale helps avoid some intermediate "
        "gradients from underflowing to zero. If not provided the default "
        "for fp16 is 128 and 1 for all other dtypes.{}"
    )
    if dynamic_loss_scale:
      loss_scale_help_text = loss_scale_help_text.format(
          "This can be an int/float or the string 'dynamic'",
          " The string 'dynamic' can be used to dynamically determine the "
          "optimal loss scale during training, but currently this "
          "significantly slows down performance")
      loss_scale_validation_msg = ("loss_scale should be a positive int/float "
                                   "or the string 'dynamic'.")
    else:
      loss_scale_help_text = loss_scale_help_text.format(
          "This must be an int/float", "")
      loss_scale_validation_msg = "loss_scale should be a positive int/float."

    if loss_scale:
      flags.DEFINE_string(
          name="loss_scale", short_name="ls", default=None,
          help=help_wrap(loss_scale_help_text))

      @flags.validator(flag_name="loss_scale",
                       message=loss_scale_validation_msg)
      def _check_loss_scale(loss_scale):  # pylint: disable=unused-variable
        """Validator to check the loss scale flag is valid."""
        if loss_scale is None:
          return True  # null case is handled in get_loss_scale()

        if loss_scale == "dynamic" and dynamic_loss_scale:
          return True

        try:
          loss_scale = float(loss_scale)
        except ValueError:
          return False

        return loss_scale > 0

    if fp16_implementation:
      flags.DEFINE_enum(
          name="fp16_implementation", default="keras",
          # Two separate choices; a single string here would not be a
          # sequence of allowed values.
          enum_values=("keras", "graph_rewrite"),
          help=help_wrap(
              "When --dtype=fp16, how fp16 should be implemented. This has no "
              "impact on correctness. 'keras' uses the "
              "tf.keras.mixed_precision API. 'graph_rewrite' uses the "
              "tf.train.experimental.enable_mixed_precision_graph_rewrite "
              "API."))

      # NOTE(review): this validator is registered against "loss_scale" too,
      # so it presumably requires the loss_scale flag to be defined - confirm.
      @flags.multi_flags_validator(["fp16_implementation", "dtype",
                                    "loss_scale"])
      def _check_fp16_implementation(flags_dict):
        """Validator to check fp16_implementation flag is valid."""
        if (flags_dict["fp16_implementation"] == "graph_rewrite" and
            flags_dict["dtype"] != "fp16"):
          raise flags.ValidationError("--fp16_implementation should not be "
                                      "specified unless --dtype=fp16")
        return True

  if all_reduce_alg:
    flags.DEFINE_string(
        name="all_reduce_alg", short_name="ara", default=None,
        help=help_wrap("Defines the algorithm to use for performing all-reduce. "
                       "When specified with MirroredStrategy for single "
                       "worker, this controls "
                       "tf.contrib.distribute.AllReduceCrossTowerOps. When "
                       "specified with MultiWorkerMirroredStrategy, this "
                       "controls "
                       "tf.distribute.experimental.CollectiveCommunication; "
                       "valid options are `ring` and `nccl`."))

  if num_packs:
    flags.DEFINE_integer(
        name="num_packs", default=1,
        help=help_wrap("Sets `num_packs` in the cross device ops used in "
                       "MirroredStrategy. For details, see "
                       "tf.distribute.NcclAllReduce."))

  if tf_gpu_thread_mode:
    flags.DEFINE_string(
        name="tf_gpu_thread_mode", short_name="gt_mode", default=None,
        help=help_wrap(
            "Whether and how the GPU device uses its own threadpool.")
    )

    flags.DEFINE_integer(
        name="per_gpu_thread_count", short_name="pgtc", default=0,
        help=help_wrap(
            "The number of threads to use for GPU. Only valid when "
            "tf_gpu_thread_mode is not global.")
    )

  if datasets_num_private_threads:
    flags.DEFINE_integer(
        name="datasets_num_private_threads",
        default=None,
        help=help_wrap(
            "Number of threads for a private threadpool created for all "
            "datasets computation.")
    )

  if datasets_num_parallel_batches:
    flags.DEFINE_integer(
        name="datasets_num_parallel_batches",
        default=None,
        help=help_wrap(
            "Determines how many batches to process in parallel when using "
            "map and batch from tf.data.")
    )

  if training_dataset_cache:
    flags.DEFINE_boolean(
        name="training_dataset_cache",
        default=False,
        help=help_wrap(
            "Determines whether to cache the training dataset on workers. "
            "Typically used to improve training performance when training "
            "data is in remote storage and can fit into worker memory.")
    )

  if tf_data_experimental_slack:
    flags.DEFINE_boolean(
        name="tf_data_experimental_slack",
        default=False,
        help=help_wrap(
            "Whether to enable tf.data's `experimental_slack` option.")
    )

  if enable_xla:
    flags.DEFINE_boolean(
        name="enable_xla", default=False,
        help=help_wrap("Whether to enable XLA auto jit compilation"))

  return key_flags
@@ -0,0 +1,133 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Public interface for flag definition.
See _example.py for detailed instructions on defining flags.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
from six.moves import shlex_quote
from absl import app as absl_app
from absl import flags
from official.utils.flags import _base
from official.utils.flags import _benchmark
from official.utils.flags import _conventions
from official.utils.flags import _device
from official.utils.flags import _distribution
from official.utils.flags import _misc
from official.utils.flags import _performance
def set_defaults(**kwargs):
  """Install new default values on already-defined flags.

  Args:
    **kwargs: Flag names mapped to the default value each should receive.
  """
  for flag_name in kwargs:
    flags.FLAGS.set_default(name=flag_name, value=kwargs[flag_name])
def parse_flags(argv=None):
  """Reset flags and reparse. Currently only used in testing.

  Args:
    argv: Argument list to parse. `sys.argv` is used when this is None or
      empty.
  """
  flags.FLAGS.unparse_flags()
  arguments = argv if argv else sys.argv
  absl_app.parse_flags_with_usage(arguments)
def register_key_flags_in_core(f):
  """Defines a function in core.py, and registers its key flags.

  absl uses the location of a flags.declare_key_flag() to determine the context
  in which a flag is key. By making all declares in core, this allows model
  main functions to call flags.adopt_module_key_flags() on core and correctly
  chain key flags.

  Args:
    f: The function to be wrapped. It must return an iterable of flag names.

  Returns:
    The "core-defined" version of the input function. It forwards all
    arguments to `f`, declares every returned flag as key, and returns None.
  """

  def core_fn(*args, **kwargs):
    for flag_name in f(*args, **kwargs):
      flags.declare_key_flag(flag_name)

  return core_fn
# Wrap each flag-definition function so that the key flags it returns are
# declared from within this module (see register_key_flags_in_core).
define_base = register_key_flags_in_core(_base.define_base)
# We have define_base_eager for compatibility, since it used to be a separate
# function from define_base.
define_base_eager = define_base
define_log_steps = register_key_flags_in_core(_benchmark.define_log_steps)
define_benchmark = register_key_flags_in_core(_benchmark.define_benchmark)
define_device = register_key_flags_in_core(_device.define_device)
define_image = register_key_flags_in_core(_misc.define_image)
define_performance = register_key_flags_in_core(_performance.define_performance)
define_distribution = register_key_flags_in_core(
    _distribution.define_distribution)

# Re-export helpers from the private flag modules under this module's
# namespace.
help_wrap = _conventions.help_wrap

get_num_gpus = _base.get_num_gpus
get_tf_dtype = _performance.get_tf_dtype
get_loss_scale = _performance.get_loss_scale
DTYPE_MAP = _performance.DTYPE_MAP
require_cloud_storage = _device.require_cloud_storage
def _get_nondefault_flags_as_dict():
  """Returns the nondefault flags as a dict from flag name to value."""
  all_flags = flags.FLAGS
  result = {}
  for name in all_flags:
    value = getattr(all_flags, name)
    flag = all_flags[name]
    # Entries whose key equals the flag's short name are skipped, as are
    # flags whose current value equals their default.
    is_short_name = name == flag.short_name
    if not is_short_name and value != flag.default:
      result[name] = value
  return result
def get_nondefault_flags_as_str():
  """Returns flags as a string that can be passed as command line arguments.

  E.g., returns: "--batch_size=256 --use_synthetic_data" for the following code
  block:

  ```
  flags.FLAGS.batch_size = 256
  flags.FLAGS.use_synthetic_data = True
  print(get_nondefault_flags_as_str())
  ```

  Only flags with nondefault values are returned, as passing default flags as
  command line arguments has no effect.

  Returns:
    A string with the flags, that can be passed as command line arguments to a
    program to use the flags. Flags are ordered by name and each one is shell
    quoted.
  """

  def _render(name, value):
    # Booleans become --name / --noname; lists are joined with commas.
    if isinstance(value, bool):
      return ('--{}' if value else '--no{}').format(name)
    if isinstance(value, list):
      value = ','.join(value)
    return '--{}={}'.format(name, value)

  rendered = [
      _render(name, value)
      for name, value in sorted(_get_nondefault_flags_as_dict().items())
  ]
  return ' '.join(map(shlex_quote, rendered))
@@ -0,0 +1,162 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import unittest
from absl import flags
import tensorflow as tf
from official.utils.flags import core as flags_core # pylint: disable=g-bad-import-order
def define_flags():
  """Define the base, performance, image and benchmark flags for the tests."""
  base_options = dict(
      clean=True,
      num_gpu=False,
      stop_threshold=True,
      hooks=True,
      train_epochs=True,
      epochs_between_evals=True,
  )
  performance_options = dict(
      num_parallel_calls=True,
      inter_op=True,
      intra_op=True,
      dynamic_loss_scale=True,
      loss_scale=True,
      synthetic_data=True,
      dtype=True,
  )
  flags_core.define_base(**base_options)
  flags_core.define_performance(**performance_options)
  flags_core.define_image()
  flags_core.define_benchmark()
class BaseTester(unittest.TestCase):
  """Tests for the flag definitions exposed through `flags_core`."""

  @classmethod
  def setUpClass(cls):
    super(BaseTester, cls).setUpClass()
    define_flags()

  def _assert_flag_values(self, expected):
    """Assert that every flag in `expected` currently holds the given value."""
    for key, value in expected.items():
      self.assertEqual(
          flags.FLAGS.get_flag_value(name=key, default=None), value)

  def test_default_setting(self):
    """Test to ensure fields exist and defaults can be set.
    """

    defaults = dict(
        data_dir="dfgasf",
        model_dir="dfsdkjgbs",
        train_epochs=534,
        epochs_between_evals=15,
        batch_size=256,
        hooks=["LoggingTensorHook"],
        num_parallel_calls=18,
        inter_op_parallelism_threads=5,
        intra_op_parallelism_threads=10,
        data_format="channels_first"
    )

    flags_core.set_defaults(**defaults)
    flags_core.parse_flags()

    self._assert_flag_values(defaults)

  def test_benchmark_setting(self):
    """Test that benchmark related defaults can be set."""
    defaults = dict(
        hooks=["LoggingMetricHook"],
        benchmark_log_dir="/tmp/12345",
        gcp_project="project_abc",
    )

    flags_core.set_defaults(**defaults)
    flags_core.parse_flags()

    self._assert_flag_values(defaults)

  def test_booleans(self):
    """Test to ensure boolean flags trigger as expected.
    """

    flags_core.parse_flags([__file__, "--use_synthetic_data"])

    self.assertTrue(flags.FLAGS.use_synthetic_data)

  def test_parse_dtype_info(self):
    """Test dtype and loss scale resolution for valid and invalid inputs."""
    flags_core.parse_flags([__file__, "--dtype", "fp16"])
    self.assertEqual(flags_core.get_tf_dtype(flags.FLAGS), tf.float16)
    self.assertEqual(flags_core.get_loss_scale(flags.FLAGS,
                                               default_for_fp16=2), 2)

    flags_core.parse_flags(
        [__file__, "--dtype", "fp16", "--loss_scale", "5"])
    self.assertEqual(flags_core.get_loss_scale(flags.FLAGS,
                                               default_for_fp16=2), 5)

    flags_core.parse_flags(
        [__file__, "--dtype", "fp16", "--loss_scale", "dynamic"])
    self.assertEqual(flags_core.get_loss_scale(flags.FLAGS,
                                               default_for_fp16=2), "dynamic")

    flags_core.parse_flags([__file__, "--dtype", "fp32"])
    self.assertEqual(flags_core.get_tf_dtype(flags.FLAGS), tf.float32)
    self.assertEqual(flags_core.get_loss_scale(flags.FLAGS,
                                               default_for_fp16=2), 1)

    flags_core.parse_flags([__file__, "--dtype", "fp32", "--loss_scale", "5"])
    self.assertEqual(flags_core.get_loss_scale(flags.FLAGS,
                                               default_for_fp16=2), 5)

    # Invalid values are expected to abort flag parsing.
    with self.assertRaises(SystemExit):
      flags_core.parse_flags([__file__, "--dtype", "int8"])

    with self.assertRaises(SystemExit):
      flags_core.parse_flags([__file__, "--dtype", "fp16",
                              "--loss_scale", "abc"])

  def test_get_nondefault_flags_as_str(self):
    """Test that only flags differing from their defaults are rendered."""
    defaults = dict(
        clean=True,
        data_dir="abc",
        hooks=["LoggingTensorHook"],
        stop_threshold=1.5,
        use_synthetic_data=False
    )
    flags_core.set_defaults(**defaults)
    flags_core.parse_flags()

    expected_flags = ""
    self.assertEqual(flags_core.get_nondefault_flags_as_str(), expected_flags)

    flags.FLAGS.clean = False
    expected_flags += "--noclean"
    self.assertEqual(flags_core.get_nondefault_flags_as_str(), expected_flags)

    flags.FLAGS.data_dir = "xyz"
    expected_flags += " --data_dir=xyz"
    self.assertEqual(flags_core.get_nondefault_flags_as_str(), expected_flags)

    flags.FLAGS.hooks = ["aaa", "bbb", "ccc"]
    expected_flags += " --hooks=aaa,bbb,ccc"
    self.assertEqual(flags_core.get_nondefault_flags_as_str(), expected_flags)

    flags.FLAGS.stop_threshold = 3.
    expected_flags += " --stop_threshold=3.0"
    self.assertEqual(flags_core.get_nondefault_flags_as_str(), expected_flags)

    flags.FLAGS.use_synthetic_data = True
    expected_flags += " --use_synthetic_data"
    self.assertEqual(flags_core.get_nondefault_flags_as_str(), expected_flags)

    # Assert that explicit setting a flag to its default value does not cause it
    # to appear in the string
    flags.FLAGS.use_synthetic_data = False
    expected_flags = expected_flags[:-len(" --use_synthetic_data")]
    self.assertEqual(flags_core.get_nondefault_flags_as_str(), expected_flags)
# Run the unittest runner when this file is executed as a script.
if __name__ == "__main__":
  unittest.main()
@@ -0,0 +1,65 @@
# Using flags in official models
1. **All common flags must be incorporated in the models.**
Common flags (i.e. batch_size, model_dir, etc.) are provided by various flag definition functions,
and channeled through `official.utils.flags.core`. For instance to define common supervised
learning parameters one could use the following code:
```python
from absl import app as absl_app
from absl import flags

from official.utils.flags import core as flags_core

def define_flags():
  flags_core.define_base()
  flags.adopt_module_key_flags(flags_core)

def main(_):
  flags_obj = flags.FLAGS
  print(flags_obj)

if __name__ == "__main__":
  define_flags()
  absl_app.run(main)
```
2. **Validate flag values.**
See the [Validators](#validators) section for implementation details.
Validators in the official model repo should not access the file system, such as verifying
that files exist, due to the strict ordering requirements.
3. **Flag values should not be mutated.**
Instead of mutating flag values, use getter functions to return the desired values. An example
getter function is the `get_tf_dtype` function below:
```python
# Map string to TensorFlow dtype
DTYPE_MAP = {
    "fp16": tf.float16,
    "fp32": tf.float32,
}

def get_tf_dtype(flags_obj):
  if getattr(flags_obj, "fp16_implementation", None) == "graph_rewrite":
    # If the graph_rewrite is used, we build the graph with fp32, and let the
    # graph rewrite change ops to fp16.
    return tf.float32
  return DTYPE_MAP[flags_obj.dtype]

def main(_):
  flags_obj = flags.FLAGS

  # Do not mutate flags_obj
  # if flags_obj.fp16_implementation == "graph_rewrite":
  #   flags_obj.dtype = "fp32"  # Don't do this

  print(get_tf_dtype(flags_obj))
  ...
```
@@ -0,0 +1,119 @@
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Common flags for importing hyperparameters."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
from absl import flags
from official.utils.flags import core as flags_core
# Module-level shorthand for absl's `flags.FLAGS` object.
FLAGS = flags.FLAGS
def define_common_hparams_flags():
  """Define the common flags across models.

  Registers `model_dir`, `train_batch_size`, `eval_batch_size`, `precision`,
  `config_file`, `params_override` and `save_checkpoint_freq`, all defaulting
  to None.
  """

  flags.DEFINE_string(
      'model_dir',
      default=None,
      help=('The directory where the model and training/evaluation summaries '
            'are stored.'))

  flags.DEFINE_integer(
      'train_batch_size', default=None, help='Batch size for training.')

  flags.DEFINE_integer(
      'eval_batch_size', default=None, help='Batch size for evaluation.')

  flags.DEFINE_string(
      'precision',
      default=None,
      help=('Precision to use; one of: {bfloat16, float32}'))

  flags.DEFINE_string(
      'config_file',
      default=None,
      help=('A YAML file which specifies overrides. Note that this file can be '
            'used as an override template to override the default parameters '
            'specified in Python. If the same parameter is specified in both '
            '`--config_file` and `--params_override`, the one in '
            '`--params_override` will be used finally.'))

  flags.DEFINE_string(
      'params_override',
      default=None,
      help=('a YAML/JSON string or a YAML file which specifies additional '
            'overrides over the default parameters and those specified in '
            '`--config_file`. Note that this is supposed to be used only to '
            'override the model parameters, but not the parameters like TPU '
            'specific flags. One canonical use case of `--config_file` and '
            '`--params_override` is users first define a template config file '
            'using `--config_file`, then use `--params_override` to adjust the '
            'minimal set of tuning parameters, for example setting up different'
            ' `train_batch_size`. '
            'The final override order of parameters: default_model_params --> '
            'params from config_file --> params in params_override. '
            'See also the help message of `--config_file`.'))

  flags.DEFINE_integer('save_checkpoint_freq', None,
                       'Number of steps to save checkpoint.')
def initialize_common_flags():
  """Registers the hyperparameter, device, distribution and strategy flags.

  Calls `define_common_hparams_flags()` and the `flags_core` definers for
  device, base, distribution and performance flags, assigns 0 to `num_gpus`,
  and finally defines the `strategy_type` string flag (default 'mirrored').
  """
  define_common_hparams_flags()

  flags_core.define_device(tpu=True)
  # model_dir and the batch sizes are defined by define_common_hparams_flags()
  # above, so they are switched off here.
  # NOTE(review): data_dir is switched off as well but is not defined in the
  # visible part of this module - confirm where it gets registered.
  flags_core.define_base(
      num_gpu=True, model_dir=False, data_dir=False, batch_size=False)
  flags_core.define_distribution(worker_hosts=True, task_index=True)
  flags_core.define_performance(all_reduce_alg=True, num_packs=True)

  # Reset the default value of num_gpus to zero.
  FLAGS.num_gpus = 0

  # The first help fragment ends with a space: adjacent literals are
  # concatenated verbatim.
  flags.DEFINE_string(
      'strategy_type', 'mirrored', 'Type of distribute strategy. '
      'One of mirrored, tpu and multiworker.')
def strategy_flags_dict():
  """Collects the distribution-strategy related flags into a dictionary.

  Returns:
    A dict keyed by flag name holding the current values of `tpu`,
    `all_reduce_alg`, `worker_hosts`, `task_index`, `num_gpus` and
    `num_packs`, in that order.
  """
  flag_names = (
      # TPUStrategy related flags.
      'tpu',
      # MultiWorkerMirroredStrategy related flags.
      'all_reduce_alg',
      'worker_hosts',
      'task_index',
      # MirroredStrategy and OneDeviceStrategy
      'num_gpus',
      'num_packs',
  )
  return {flag_name: getattr(FLAGS, flag_name) for flag_name in flag_names}
def hparam_flags_dict():
  """Collects the model-parameter related flags into a dictionary.

  NOTE(review): `data_dir` is read here but is not defined by the flag
  definitions visible in this module - presumably it is registered
  elsewhere; confirm.

  Returns:
    A dict keyed by flag name holding the current values of `data_dir`,
    `model_dir`, `train_batch_size`, `eval_batch_size`, `precision`,
    `config_file` and `params_override`.
  """
  flag_names = (
      'data_dir',
      'model_dir',
      'train_batch_size',
      'eval_batch_size',
      'precision',
      'config_file',
      'params_override',
  )
  return {flag_name: getattr(FLAGS, flag_name) for flag_name in flag_names}
@@ -0,0 +1,34 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities that interact with cloud service.
"""
import requests
# URL requested by on_gcp() to probe for the metadata endpoint.
GCP_METADATA_URL = "http://metadata/computeMetadata/v1/instance/hostname"
# Request header sent along with the probe.
GCP_METADATA_HEADER = {"Metadata-Flavor": "Google"}
def on_gcp():
  """Reports whether the current environment appears to be GCP.

  Sends a GET request to `GCP_METADATA_URL` with `GCP_METADATA_HEADER`.

  Returns:
    True when the request completes with HTTP status 200. False when the
    status is anything else or when a `requests.exceptions.RequestException`
    is raised.
  """
  try:
    # Without an explicit timeout the request could block forever; give up
    # after 5 seconds in case the environment has connectivity issues.
    status = requests.get(
        GCP_METADATA_URL, headers=GCP_METADATA_HEADER, timeout=5).status_code
  except requests.exceptions.RequestException:
    return False
  return status == 200
@@ -0,0 +1,48 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for cloud_lib."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import mock
import requests
from official.utils.logs import cloud_lib
class CloudLibTest(unittest.TestCase):
  """Checks cloud_lib.on_gcp() with `requests.get` patched out."""

  @mock.patch("requests.get")
  def test_on_gcp(self, mock_requests_get):
    """A response with status 200 is reported as running on GCP."""
    mock_requests_get.return_value = mock.MagicMock(status_code=200)

    self.assertEqual(cloud_lib.on_gcp(), True)

  @mock.patch("requests.get")
  def test_not_on_gcp(self, mock_requests_get):
    """A connection error is reported as not running on GCP."""
    mock_requests_get.side_effect = requests.exceptions.ConnectionError()

    self.assertEqual(cloud_lib.on_gcp(), False)
# Run the test cases when this file is executed as a script.
if __name__ == "__main__":
  unittest.main()
@@ -0,0 +1,58 @@
# Logging in official models
This library adds logging functions that print or save tensor values. Official models should define all common hooks
(using hooks helper) and a benchmark logger.
1. **Training Hooks**
Hooks are a TensorFlow concept: each hook defines specific actions to run at certain points of the execution. We use them to obtain and log
tensor values during training.
hooks_helper.py provides an easy way to create common hooks. The following hooks are currently defined:
* LoggingTensorHook: Logs tensor values
* ProfilerHook: Writes a timeline json that can be loaded into chrome://tracing.
* ExamplesPerSecondHook: Logs the number of examples processed per second.
* LoggingMetricHook: Similar to LoggingTensorHook, except that the tensors are logged in a format defined by our data
analysis pipeline.
2. **Benchmarks**
The benchmark logger provides useful functions for logging environment information, and evaluation results.
The module also contains a context which is used to update the status of the run.
Example usage:
```
from absl import app as absl_app
from official.utils.logs import hooks_helper
from official.utils.logs import logger
def model_main(flags_obj):
estimator = ...
benchmark_logger = logger.get_benchmark_logger()
benchmark_logger.log_run_info(...)
train_hooks = hooks_helper.get_train_hooks(...)
for epoch in range(10):
estimator.train(..., hooks=train_hooks)
eval_results = estimator.evaluate(...)
# Log a dictionary of metrics
benchmark_logger.log_evaluation_result(eval_results)
# Log an individual metric
benchmark_logger.log_metric(...)
def main(_):
with logger.benchmark_context(flags.FLAGS):
model_main(flags.FLAGS)
if __name__ == "__main__":
# define flags
absl_app.run(main)
```
@@ -0,0 +1,146 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Hook that counts examples per second every N steps or seconds."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from hccl.manage.api import get_rank_size
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.utils.logs import logger
from benchmark_log import hwlog
import time
import sys
class ExamplesPerSecondHook(tf.estimator.SessionRunHook):
  """Hook to print out examples per second.

  Total time is tracked and then divided by the total number of steps
  to get the average step time and then batch_size is used to determine
  the running average of examples per second. The examples per second for the
  most recent interval is also logged.

  NOTE: the `should_trigger_for_step()` / warm-up check is disabled in
  `after_run`, so metrics are emitted after every `run()` call for which the
  timer reports an elapsed interval. `every_n_steps`, `every_n_secs` and
  `warm_steps` are validated and stored but do not throttle the output.
  """

  def __init__(self,
               batch_size,
               every_n_steps=None,
               every_n_secs=None,
               warm_steps=0,
               metric_logger=None):
    """Initializer for ExamplesPerSecondHook.

    Args:
      batch_size: Batch size used to turn step rates into examples/second.
        NOTE(review): the current rate is additionally multiplied by
        `get_rank_size()` while the running average is not - confirm whether
        this is meant to be the per-device or the global batch size.
      every_n_steps: Passed to the timer as `every_steps`.
      every_n_secs: Passed to the timer as `every_secs`. Exactly one of
        `every_n_steps` or `every_n_secs` should be set.
      warm_steps: Stored on the hook; not consulted while the warm-up check
        in `after_run` is disabled.
      metric_logger: instance of `BenchmarkLogger`, the benchmark logger that
        hook should use to write the log. If None, BaseBenchmarkLogger will
        be used.

    Raises:
      ValueError: if neither `every_n_steps` or `every_n_secs` is set, or
      both are set.
    """

    if (every_n_steps is None) == (every_n_secs is None):
      raise ValueError("exactly one of every_n_steps"
                       " and every_n_secs should be provided.")

    self._logger = metric_logger or logger.BaseBenchmarkLogger()

    self._timer = tf.estimator.SecondOrStepTimer(
        every_steps=every_n_steps, every_secs=every_n_secs)

    self._step_train_time = 0
    self._total_steps = 0
    self._batch_size = batch_size
    self._warm_steps = warm_steps
    # Wall-clock start of the current run() call; assigned in before_run().
    self.t0 = None
    # Examples-per-second value of every interval reported so far.
    self.current_examples_per_sec_list = []

  def begin(self):
    """Called once before using the session to check global step.

    Raises:
      RuntimeError: if the graph has no global step tensor.
    """
    tf.compat.v1.logging.warning("##########ExamplesPerSecondHook begin")
    self._global_step_tensor = tf.compat.v1.train.get_global_step()
    if self._global_step_tensor is None:
      raise RuntimeError(
          "Global step should be created to use ExamplesPerSecondHook.")

  def before_run(self, run_context):  # pylint: disable=unused-argument
    """Called before each call to run().

    Records the wall-clock start time of the call.

    Args:
      run_context: A SessionRunContext object.

    Returns:
      A SessionRunArgs object requesting the global step tensor.
    """
    self.t0 = time.time()
    tf.compat.v1.logging.warning("##########ExamplesPerSecondHook before")
    return tf.estimator.SessionRunArgs(self._global_step_tensor)

  def after_run(self, run_context, run_values):  # pylint: disable=unused-argument
    """Called after each call to run().

    Updates the running totals and reports the average and current
    examples-per-second through the metric logger, TensorFlow logging and
    `hwlog`. Nothing is reported while the timer has no previous trigger, or
    when a measured interval is not positive.

    Args:
      run_context: A SessionRunContext object.
      run_values: A SessionRunValues object.
    """
    tf.compat.v1.logging.warning("##########ExamplesPerSecondHook after_run")
    global_step = run_values.results

    # The should_trigger_for_step(global_step) / warm_steps gate is disabled
    # here: the timer is updated on every call.
    elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(
        global_step)
    batch_time = time.time() - self.t0

    if elapsed_time is None:
      # No previous trigger to measure against yet.
      return

    self._step_train_time += elapsed_time
    self._total_steps += elapsed_steps

    if min(elapsed_time, batch_time, self._step_train_time) <= 0:
      # The clock did not advance between measurements; every rate below
      # would divide by zero. The totals above stay updated so the next
      # report remains consistent.
      return

    ips = self._batch_size / batch_time
    # average examples per second is based on the total (accumulative)
    # training steps and training time so far
    average_examples_per_sec = self._batch_size * (
        self._total_steps / self._step_train_time)
    # current examples per second is based on the elapsed training steps
    # and training time per batch
    # NOTE(review): only this rate is scaled by get_rank_size(); the average
    # above is not - confirm which one is intended.
    current_examples_per_sec = self._batch_size * get_rank_size() * (
        elapsed_steps / elapsed_time)

    # Logs entries to be read from hook during or after run.
    self.current_examples_per_sec_list.append(current_examples_per_sec)
    self._logger.log_metric(
        "average_examples_per_sec", average_examples_per_sec,
        global_step=global_step)
    self._logger.log_metric(
        "current_examples_per_sec", current_examples_per_sec,
        global_step=global_step)
    tf.compat.v1.logging.warning(
        "steps: %s,elapsed_steps:%d,batch:%d,FPS:%f,ips:%f,batch_time:%f",
        int(self._total_steps), int(elapsed_steps), int(self._batch_size),
        float(current_examples_per_sec), float(ips), float(batch_time))

    # Report the FPS value through the benchmark log helper.
    hwlog.remark_print(key=hwlog.FPS, value=float(current_examples_per_sec))
@@ -0,0 +1,172 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Hooks helper to return a list of TensorFlow hooks for training by name.
More hooks can be added to this set. To add a new hook, 1) add the new hook to
the registry in HOOKS, 2) add a corresponding function that parses out necessary
parameters.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.utils.logs import hooks
from official.utils.logs import logger
from official.utils.logs import metric_hook
# Default tensors for the logging hooks: every label maps to the tensor of
# the same name.
_TENSORS_TO_LOG = {
    tensor_name: tensor_name
    for tensor_name in ('learning_rate', 'cross_entropy', 'train_accuracy')
}
def get_train_hooks(name_list, use_tpu=False, **kwargs):
  """Builds the training hooks requested by name.

  Args:
    name_list: a list of hook names. Each name is stripped and lower-cased
      before being looked up in HOOKS.
    use_tpu: Boolean of whether computation occurs on a TPU. When true a
      warning is logged and no hooks are created.
    **kwargs: keyword arguments forwarded unchanged to every hook factory.

  Returns:
    list of instantiated hooks, ready to be used in a classifier.train call.
    The list is empty when `name_list` is empty or `use_tpu` is set.

  Raises:
    ValueError: if an unrecognized name is passed.
  """
  if not name_list:
    return []

  if use_tpu:
    tf.compat.v1.logging.warning('hooks_helper received name_list `{}`, but a '
                                 'TPU is specified. No hooks will be used.'
                                 .format(name_list))
    return []

  def _instantiate(requested_name):
    """Looks up the factory registered for one name and calls it."""
    factory = HOOKS.get(requested_name.strip().lower())
    if factory is None:
      raise ValueError(
          'Unrecognized training hook requested: {}'.format(requested_name))
    return factory(**kwargs)

  return [_instantiate(requested_name) for requested_name in name_list]
def get_logging_tensor_hook(every_n_iter=100, tensors_to_log=None, **kwargs):  # pylint: disable=unused-argument
  """Creates a LoggingTensorHook.

  Args:
    every_n_iter: `int`, passed to the hook as its `every_n_iter` argument.
    tensors_to_log: List of tensor names or dictionary mapping labels to tensor
      names. If None, `_TENSORS_TO_LOG` is used.
    **kwargs: accepted and ignored, so that all hook factories can be called
      with the same keyword arguments.

  Returns:
    A `tf.estimator.LoggingTensorHook` for the selected tensors.
  """
  selected = _TENSORS_TO_LOG if tensors_to_log is None else tensors_to_log
  return tf.estimator.LoggingTensorHook(
      tensors=selected, every_n_iter=every_n_iter)
def get_profiler_hook(model_dir, save_steps=1000, **kwargs):  # pylint: disable=unused-argument
  """Creates a ProfilerHook.

  Args:
    model_dir: passed to the hook as `output_dir`.
    save_steps: `int`, passed to the hook as `save_steps`.
    **kwargs: accepted and ignored, so that all hook factories can be called
      with the same keyword arguments.

  Returns:
    A `tf.estimator.ProfilerHook`.
  """
  hook_args = dict(save_steps=save_steps, output_dir=model_dir)
  return tf.estimator.ProfilerHook(**hook_args)
def get_examples_per_second_hook(every_n_steps=100,
                                 batch_size=128,
                                 warm_steps=5,
                                 **kwargs):  # pylint: disable=unused-argument
  """Creates an ExamplesPerSecondHook bound to the benchmark logger.

  Args:
    every_n_steps: `int`, forwarded to the hook as `every_n_steps`.
    batch_size: `int`, forwarded to the hook as `batch_size`.
    warm_steps: forwarded to the hook as `warm_steps`.
    **kwargs: accepted and ignored, so that all hook factories can be called
      with the same keyword arguments.

  Returns:
    A `hooks.ExamplesPerSecondHook` that reports through the logger returned
    by `logger.get_benchmark_logger()`.
  """
  metric_logger = logger.get_benchmark_logger()
  return hooks.ExamplesPerSecondHook(
      batch_size=batch_size,
      every_n_steps=every_n_steps,
      warm_steps=warm_steps,
      metric_logger=metric_logger)
def get_logging_metric_hook(tensors_to_log=None,
                            every_n_secs=600,
                            **kwargs):  # pylint: disable=unused-argument
  """Creates a LoggingMetricHook bound to the benchmark logger.

  Args:
    tensors_to_log: List of tensor names or dictionary mapping labels to tensor
      names. If None, `_TENSORS_TO_LOG` is used.
    every_n_secs: `int`, forwarded to the hook as `every_n_secs`. The default
      of 600 corresponds to 10 minutes.
    **kwargs: accepted and ignored, so that all hook factories can be called
      with the same keyword arguments.

  Returns:
    A `metric_hook.LoggingMetricHook` that reports through the logger returned
    by `logger.get_benchmark_logger()`.
  """
  selected = _TENSORS_TO_LOG if tensors_to_log is None else tensors_to_log
  metric_logger = logger.get_benchmark_logger()
  return metric_hook.LoggingMetricHook(
      tensors=selected,
      metric_logger=metric_logger,
      every_n_secs=every_n_secs)
def get_step_counter_hook(**kwargs):  # pylint: disable=unused-argument
  """Creates a StepCounterHook with its default settings.

  Args:
    **kwargs: accepted and ignored, so that all hook factories can be called
      with the same keyword arguments.

  Returns:
    A `tf.estimator.StepCounterHook`.
  """
  return tf.estimator.StepCounterHook()
# Registry mapping a lower-cased hook name to the factory function that builds
# it. get_train_hooks() strips and lower-cases requested names before looking
# them up here.
HOOKS = {
    'loggingtensorhook': get_logging_tensor_hook,
    'profilerhook': get_profiler_hook,
    'examplespersecondhook': get_examples_per_second_hook,
    'loggingmetrichook': get_logging_metric_hook,
    'stepcounterhook': get_step_counter_hook
}
@@ -0,0 +1,73 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for hooks_helper."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.utils.logs import hooks_helper
from official.utils.misc import keras_utils
class BaseTest(unittest.TestCase):
  """Checks that hooks_helper.get_train_hooks() builds hooks from names."""

  def setUp(self):
    super(BaseTest, self).setUp()
    # NOTE(review): `is_v2_0` is tested without being called; if it is a
    # function this condition is always true - confirm against keras_utils.
    if keras_utils.is_v2_0:
      tf.compat.v1.disable_eager_execution()

  def test_raise_in_non_list_names(self):
    """A comma-separated string instead of a list of names is rejected."""
    self.assertRaises(
        ValueError,
        hooks_helper.get_train_hooks,
        'LoggingTensorHook, ProfilerHook',
        model_dir="",
        batch_size=256)

  def test_raise_in_invalid_names(self):
    """A list containing an unregistered hook name is rejected."""
    self.assertRaises(
        ValueError,
        hooks_helper.get_train_hooks,
        ['StepCounterHook', 'StopAtStepHook'],
        model_dir="",
        batch_size=256)

  def validate_train_hook_name(self,
                               test_hook_name,
                               expected_hook_name,
                               **kwargs):
    """Builds one hook by name and checks its type and class name."""
    built_hooks = hooks_helper.get_train_hooks(
        [test_hook_name], model_dir="", **kwargs)
    self.assertEqual(len(built_hooks), 1)

    hook = built_hooks[0]
    self.assertIsInstance(hook, tf.estimator.SessionRunHook)
    self.assertEqual(hook.__class__.__name__.lower(), expected_hook_name)

  def test_get_train_hooks_logging_tensor_hook(self):
    self.validate_train_hook_name('LoggingTensorHook', 'loggingtensorhook')

  def test_get_train_hooks_profiler_hook(self):
    self.validate_train_hook_name('ProfilerHook', 'profilerhook')

  def test_get_train_hooks_examples_per_second_hook(self):
    self.validate_train_hook_name(
        'ExamplesPerSecondHook', 'examplespersecondhook')

  def test_get_logging_metric_hook(self):
    self.validate_train_hook_name('LoggingMetricHook', 'loggingmetrichook')
# Run the test cases through TensorFlow's test runner when executed as a
# script.
if __name__ == '__main__':
  tf.test.main()
@@ -0,0 +1,158 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for hooks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.utils.logs import hooks
from official.utils.testing import mock_lib
# Raise TensorFlow logging to DEBUG for every test in this module.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.DEBUG)
class ExamplesPerSecondHookTest(tf.test.TestCase):
  """Tests for the ExamplesPerSecondHook.

  In the test, we explicitly run global_step tensor after train_op in order to
  keep the global_step value and the train_op (which increases the global_step
  by 1) consistent. This is to correct the discrepancies in reported global_step
  value when running on GPUs.

  NOTE(review): the step-based tests expect nothing to be logged before
  `every_n_steps` runs have completed and before `warm_steps` is exceeded. If
  `hooks.ExamplesPerSecondHook` is the variant whose
  `should_trigger_for_step` / warm-up check is commented out, these
  expectations may no longer hold - confirm.
  """

  def setUp(self):
    """Mock out logging calls to verify if correct info is being monitored."""
    self._logger = mock_lib.MockBenchmarkLogger()

    # Each test gets its own graph holding a global step and an op that
    # increments it by one.
    self.graph = tf.Graph()
    with self.graph.as_default():
      tf.compat.v1.train.create_global_step()
      self.train_op = tf.compat.v1.assign_add(
          tf.compat.v1.train.get_global_step(), 1)
      self.global_step = tf.compat.v1.train.get_global_step()

  def test_raise_in_both_secs_and_steps(self):
    """Setting both every_n_steps and every_n_secs raises ValueError."""
    with self.assertRaises(ValueError):
      hooks.ExamplesPerSecondHook(
          batch_size=256,
          every_n_steps=10,
          every_n_secs=20,
          metric_logger=self._logger)

  def test_raise_in_none_secs_and_steps(self):
    """Setting neither every_n_steps nor every_n_secs raises ValueError."""
    with self.assertRaises(ValueError):
      hooks.ExamplesPerSecondHook(
          batch_size=256,
          every_n_steps=None,
          every_n_secs=None,
          metric_logger=self._logger)

  def _validate_log_every_n_steps(self, every_n_steps, warm_steps):
    """Runs the hook in a session and checks when metrics get logged.

    Args:
      every_n_steps: value passed to the hook as `every_n_steps`; also the
        number of initial runs during which no metric is expected.
      warm_steps: value passed to the hook as `warm_steps`.
    """
    hook = hooks.ExamplesPerSecondHook(
        batch_size=256,
        every_n_steps=every_n_steps,
        warm_steps=warm_steps,
        metric_logger=self._logger)

    with tf.compat.v1.train.MonitoredSession(
        tf.compat.v1.train.ChiefSessionCreator(), [hook]) as mon_sess:
      for _ in range(every_n_steps):
        # Explicitly run global_step after train_op to get the accurate
        # global_step value
        mon_sess.run(self.train_op)
        mon_sess.run(self.global_step)
        # Nothing should be in the list yet
        self.assertFalse(self._logger.logged_metric)

      mon_sess.run(self.train_op)
      global_step_val = mon_sess.run(self.global_step)

      if global_step_val > warm_steps:
        self._assert_metrics()
      else:
        # Nothing should be in the list yet
        self.assertFalse(self._logger.logged_metric)

      # Add additional run to verify proper reset when called multiple times.
      prev_log_len = len(self._logger.logged_metric)
      mon_sess.run(self.train_op)
      global_step_val = mon_sess.run(self.global_step)

      if every_n_steps == 1 and global_step_val > warm_steps:
        # Each time, we log two additional metrics. Did exactly 2 get added?
        self.assertEqual(len(self._logger.logged_metric), prev_log_len + 2)
      else:
        # No change in the size of the metric list.
        self.assertEqual(len(self._logger.logged_metric), prev_log_len)

  def test_examples_per_sec_every_1_steps(self):
    with self.graph.as_default():
      self._validate_log_every_n_steps(1, 0)

  def test_examples_per_sec_every_5_steps(self):
    with self.graph.as_default():
      self._validate_log_every_n_steps(5, 0)

  def test_examples_per_sec_every_1_steps_with_warm_steps(self):
    with self.graph.as_default():
      self._validate_log_every_n_steps(1, 10)

  def test_examples_per_sec_every_5_steps_with_warm_steps(self):
    with self.graph.as_default():
      self._validate_log_every_n_steps(5, 10)

  def _validate_log_every_n_secs(self, every_n_secs):
    """Runs the hook in time-based mode and checks when metrics get logged.

    Args:
      every_n_secs: value passed to the hook as `every_n_secs`; the test
        sleeps for this many seconds between the two training runs.
    """
    hook = hooks.ExamplesPerSecondHook(
        batch_size=256,
        every_n_steps=None,
        every_n_secs=every_n_secs,
        metric_logger=self._logger)

    with tf.compat.v1.train.MonitoredSession(
        tf.compat.v1.train.ChiefSessionCreator(), [hook]) as mon_sess:
      # Explicitly run global_step after train_op to get the accurate
      # global_step value
      mon_sess.run(self.train_op)
      mon_sess.run(self.global_step)
      # Nothing should be in the list yet
      self.assertFalse(self._logger.logged_metric)
      time.sleep(every_n_secs)

      mon_sess.run(self.train_op)
      mon_sess.run(self.global_step)
      self._assert_metrics()

  def test_examples_per_sec_every_1_secs(self):
    with self.graph.as_default():
      self._validate_log_every_n_secs(1)

  def test_examples_per_sec_every_5_secs(self):
    with self.graph.as_default():
      self._validate_log_every_n_secs(5)

  def _assert_metrics(self):
    """Checks that the two most recent metrics are average, then current."""
    metrics = self._logger.logged_metric
    self.assertEqual(metrics[-2]["name"], "average_examples_per_sec")
    self.assertEqual(metrics[-1]["name"], "current_examples_per_sec")
# Run the test cases through TensorFlow's test runner when executed as a
# script.
if __name__ == "__main__":
  tf.test.main()
@@ -0,0 +1,423 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Logging utilities for benchmark.
For collecting local environment metrics like CPU and memory, certain python
packages need be installed. See README for details.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import contextlib
import datetime
import json
import multiprocessing
import numbers
import os
import threading
import uuid
from six.moves import _thread as thread
from absl import flags
import tensorflow as tf
from tensorflow.python.client import device_lib
from official.utils.logs import cloud_lib
# File names that BenchmarkFileLogger creates inside its logging directory.
METRIC_LOG_FILE_NAME = "metric.log"
BENCHMARK_RUN_LOG_FILE_NAME = "benchmark_run.log"
# strftime pattern for timestamps.
_DATE_TIME_FORMAT_PATTERN = "%Y-%m-%dT%H:%M:%S.%fZ"
GCP_TEST_ENV = "GCP"
# Run status values. benchmark_context() passes SUCCESS or FAILURE to the
# logger's on_finish().
RUN_STATUS_SUCCESS = "success"
RUN_STATUS_FAILURE = "failure"
RUN_STATUS_RUNNING = "running"
# Module-level alias for absl's flag registry; used as the default flag object
# by config_benchmark_logger().
FLAGS = flags.FLAGS
# Don't use it directly. Use get_benchmark_logger to access a logger.
_benchmark_logger = None
# Held by config_benchmark_logger() while it replaces _benchmark_logger.
_logger_lock = threading.Lock()
def config_benchmark_logger(flag_obj=None):
  """Creates the global benchmark logger selected by the flags.

  The logger is built and stored while holding `_logger_lock`.

  Args:
    flag_obj: object carrying the logger flags. When falsy, the global absl
      `FLAGS` is used. If it has no `benchmark_logger_type` attribute, a
      `BaseBenchmarkLogger` is created.

  Returns:
    The global benchmark logger.

  Raises:
    ValueError: if `benchmark_logger_type` is not one of
      "BaseBenchmarkLogger", "BenchmarkFileLogger" or
      "BenchmarkBigQueryLogger".
  """
  global _benchmark_logger
  with _logger_lock:
    flag_obj = flag_obj or FLAGS

    if hasattr(flag_obj, "benchmark_logger_type"):
      logger_type = flag_obj.benchmark_logger_type
    else:
      logger_type = "BaseBenchmarkLogger"

    if logger_type == "BaseBenchmarkLogger":
      _benchmark_logger = BaseBenchmarkLogger()
    elif logger_type == "BenchmarkFileLogger":
      _benchmark_logger = BenchmarkFileLogger(flag_obj.benchmark_log_dir)
    elif logger_type == "BenchmarkBigQueryLogger":
      # Imported lazily: only needed when the BigQuery logger is selected.
      from official.benchmark import benchmark_uploader as bu  # pylint: disable=g-import-not-at-top
      uploader = bu.BigQueryUploader(gcp_project=flag_obj.gcp_project)
      _benchmark_logger = BenchmarkBigQueryLogger(
          bigquery_uploader=uploader,
          bigquery_data_set=flag_obj.bigquery_data_set,
          bigquery_run_table=flag_obj.bigquery_run_table,
          bigquery_run_status_table=flag_obj.bigquery_run_status_table,
          bigquery_metric_table=flag_obj.bigquery_metric_table,
          run_id=str(uuid.uuid4()))
    else:
      raise ValueError("Unrecognized benchmark_logger_type: %s"
                       % logger_type)
  return _benchmark_logger
def get_benchmark_logger():
  """Returns the global benchmark logger, configuring it on first use.

  Returns:
    The module-level logger. If none is set yet, `config_benchmark_logger()`
    is called with its defaults first.
  """
  if _benchmark_logger:
    return _benchmark_logger
  config_benchmark_logger()
  return _benchmark_logger
@contextlib.contextmanager
def benchmark_context(flag_obj):
  """Context of benchmark, which will update status of the run accordingly.

  Configures the global benchmark logger from `flag_obj` on entry. When the
  managed block finishes normally, the logger's `on_finish` is called with
  RUN_STATUS_SUCCESS. When it raises an `Exception`, `on_finish` is called
  with RUN_STATUS_FAILURE and the exception is re-raised. Exceptions that do
  not derive from `Exception` propagate without any `on_finish` call.

  Args:
    flag_obj: flags object forwarded to `config_benchmark_logger`.

  Yields:
    None.
  """
  benchmark_logger = config_benchmark_logger(flag_obj)
  try:
    yield
    # NOTE(review): this call sits inside the try, so an Exception raised by
    # the success notification itself also triggers the failure notification
    # below.
    benchmark_logger.on_finish(RUN_STATUS_SUCCESS)
  except Exception:  # pylint: disable=broad-except
    # Catch all the exception, update the run status to be failure, and re-raise
    benchmark_logger.on_finish(RUN_STATUS_FAILURE)
    raise
class BaseBenchmarkLogger(object):
  """Benchmark logger that reports through `tf.compat.v1.logging`."""

  def log_evaluation_result(self, eval_results):
    """Logs every entry of an evaluation result as a metric.

    The evaluate result is a dictionary that contains metrics defined in
    model_fn. It also contains an entry for global_step which contains the
    value of the global step when evaluation was performed.

    Args:
      eval_results: dict, the result of evaluate. Anything that is not a dict
        is reported with a warning and otherwise ignored.
    """
    if not isinstance(eval_results, dict):
      tf.compat.v1.logging.warning(
          "eval_results should be dictionary for logging. Got %s",
          type(eval_results))
      return

    step_key = tf.compat.v1.GraphKeys.GLOBAL_STEP
    global_step = eval_results[step_key]
    for key in sorted(eval_results):
      if key == step_key:
        # The global step is attached to each metric, not logged by itself.
        continue
      self.log_metric(key, eval_results[key], global_step=global_step)

  def log_metric(self, name, value, unit=None, global_step=None, extras=None):
    """Logs one benchmark metric at INFO level.

    Args:
      name: string, the name of the metric to log.
      value: number, the value of the metric.
      unit: string, the unit of the metric, E.g "image per second".
      global_step: int, the global_step when the metric is logged.
      extras: map of string:string, the extra information about the metric.
    """
    metric = _process_metric_to_json(name, value, unit, global_step, extras)
    if not metric:
      # The helper produced nothing to log for this input.
      return
    tf.compat.v1.logging.info("Benchmark metric: %s", metric)

  def log_run_info(self, model_name, dataset_name, run_params, test_id=None):
    """Logs the run information gathered for this benchmark at INFO level."""
    run_info = _gather_run_info(model_name, dataset_name, run_params, test_id)
    tf.compat.v1.logging.info("Benchmark run: %s", run_info)

  def on_finish(self, status):
    """Does nothing; subclasses may override to react to the run status."""
    pass
class BenchmarkFileLogger(BaseBenchmarkLogger):
  """Benchmark logger that writes JSON records into a logging directory."""

  def __init__(self, logging_dir):
    """Creates the logging directory if needed and opens the metric log.

    Args:
      logging_dir: directory that receives the metric and run log files.
    """
    super(BenchmarkFileLogger, self).__init__()
    self._logging_dir = logging_dir
    if not tf.io.gfile.isdir(self._logging_dir):
      tf.io.gfile.makedirs(self._logging_dir)
    metric_log_path = os.path.join(self._logging_dir, METRIC_LOG_FILE_NAME)
    # Opened in append mode and kept open until on_finish().
    self._metric_file_handler = tf.io.gfile.GFile(metric_log_path, "a")

  def log_metric(self, name, value, unit=None, global_step=None, extras=None):
    """Appends one benchmark metric to the metric log file.

    The record is written as a single JSON line and flushed immediately.
    Serialization errors are reported with a warning instead of being raised.

    Args:
      name: string, the name of the metric to log.
      value: number, the value of the metric.
      unit: string, the unit of the metric, E.g "image per second".
      global_step: int, the global_step when the metric is logged.
      extras: map of string:string, the extra information about the metric.
    """
    metric = _process_metric_to_json(name, value, unit, global_step, extras)
    if not metric:
      # The helper produced nothing to log for this input.
      return
    try:
      json.dump(metric, self._metric_file_handler)
      self._metric_file_handler.write("\n")
      self._metric_file_handler.flush()
    except (TypeError, ValueError) as e:
      tf.compat.v1.logging.warning(
          "Failed to dump metric to log file: name %s, value %s, error %s",
          name, value, e)

  def log_run_info(self, model_name, dataset_name, run_params, test_id=None):
    """Writes the gathered run information to the benchmark run log file.

    The file is opened in write mode, so earlier content is replaced.
    Serialization errors are reported with a warning instead of being raised.

    Args:
      model_name: string, the name of the model.
      dataset_name: string, the name of dataset for training and evaluation.
      run_params: dict, the dictionary of parameters for the run, it could
        include hyperparameters or other params that are important for the run.
      test_id: string, the unique name of the test run by the combination of key
        parameters, eg batch size, num of GPU. It is hardware independent.
    """
    run_info = _gather_run_info(model_name, dataset_name, run_params, test_id)
    run_log_path = os.path.join(
        self._logging_dir, BENCHMARK_RUN_LOG_FILE_NAME)
    with tf.io.gfile.GFile(run_log_path, "w") as run_log:
      try:
        json.dump(run_info, run_log)
        run_log.write("\n")
      except (TypeError, ValueError) as e:
        tf.compat.v1.logging.warning(
            "Failed to dump benchmark run info to log file: %s", e)

  def on_finish(self, status):
    """Flushes and closes the metric log file; `status` is not used."""
    metric_log = self._metric_file_handler
    metric_log.flush()
    metric_log.close()
class BenchmarkBigQueryLogger(BaseBenchmarkLogger):
  """Class to log the benchmark information to BigQuery data store."""

  def __init__(self,
               bigquery_uploader,
               bigquery_data_set,
               bigquery_run_table,
               bigquery_run_status_table,
               bigquery_metric_table,
               run_id):
    """Remember the uploader and the BigQuery destinations for this run.

    Args:
      bigquery_uploader: object exposing `upload_benchmark_metric_json`,
        `upload_benchmark_run_json`, `insert_run_status` and
        `update_run_status`.
      bigquery_data_set: name of the data set passed to every uploader call.
      bigquery_run_table: table that receives the run information.
      bigquery_run_status_table: table that receives the run status.
      bigquery_metric_table: table that receives the metrics.
      run_id: identifier of the benchmark run, passed to every uploader call.
    """
    super(BenchmarkBigQueryLogger, self).__init__()
    self._bigquery_uploader = bigquery_uploader
    self._bigquery_data_set = bigquery_data_set
    self._bigquery_run_table = bigquery_run_table
    self._bigquery_run_status_table = bigquery_run_status_table
    self._bigquery_metric_table = bigquery_metric_table
    self._run_id = run_id

  def _upload_in_background(self, upload_fn, table, payload):
    """Invoke `upload_fn(data_set, table, run_id, payload)` on a new thread."""
    # Starting new thread for bigquery upload in case it might take long time
    # and impact the benchmark and performance measurement. Starting a new
    # thread might have potential performance impact for model that run on
    # CPU.
    thread.start_new_thread(
        upload_fn,
        (self._bigquery_data_set, table, self._run_id, payload))

  def log_metric(self, name, value, unit=None, global_step=None, extras=None):
    """Log the benchmark metric information to bigquery.

    Args:
      name: string, the name of the metric to log.
      value: number, the value of the metric. The value will not be logged if it
        is not a number type.
      unit: string, the unit of the metric, E.g "image per second".
      global_step: int, the global_step when the metric is logged.
      extras: map of string:string, the extra information about the metric.
    """
    metric = _process_metric_to_json(name, value, unit, global_step, extras)
    if not metric:
      return
    self._upload_in_background(
        self._bigquery_uploader.upload_benchmark_metric_json,
        self._bigquery_metric_table,
        [metric])

  def log_run_info(self, model_name, dataset_name, run_params, test_id=None):
    """Collect most of the TF runtime information for the local env.

    The schema of the run info follows official/benchmark/datastore/schema.
    Both the run information and the initial "running" status are uploaded on
    background threads.

    Args:
      model_name: string, the name of the model.
      dataset_name: string, the name of dataset for training and evaluation.
      run_params: dict, the dictionary of parameters for the run, it could
        include hyperparameters or other params that are important for the run.
      test_id: string, the unique name of the test run by the combination of key
        parameters, eg batch size, num of GPU. It is hardware independent.
    """
    run_info = _gather_run_info(model_name, dataset_name, run_params, test_id)
    self._upload_in_background(
        self._bigquery_uploader.upload_benchmark_run_json,
        self._bigquery_run_table,
        run_info)
    self._upload_in_background(
        self._bigquery_uploader.insert_run_status,
        self._bigquery_run_status_table,
        RUN_STATUS_RUNNING)

  def on_finish(self, status):
    """Record the final run status; this call is made on the calling thread."""
    self._bigquery_uploader.update_run_status(
        self._bigquery_data_set,
        self._bigquery_run_status_table,
        self._run_id,
        status)
def _gather_run_info(model_name, dataset_name, run_params, test_id):
  """Collect the benchmark run information for the local environment.

  Args:
    model_name: string, the name of the model.
    dataset_name: string, the name of the dataset.
    run_params: dict of run parameters, forwarded to `_collect_run_params`.
    test_id: string identifying the test run.

  Returns:
    A dict seeded with the arguments and the current UTC time, then filled in
    by each `_collect_*` helper in turn.
  """
  run_info = {}
  run_info["model_name"] = model_name
  run_info["dataset"] = {"name": dataset_name}
  run_info["machine_config"] = {}
  run_info["test_id"] = test_id
  run_info["run_date"] = datetime.datetime.utcnow().strftime(
      _DATE_TIME_FORMAT_PATTERN)

  # The collectors mutate run_info in place; their order fixes the key order
  # of the resulting dict.
  _collect_tensorflow_info(run_info)
  _collect_tensorflow_environment_variables(run_info)
  _collect_run_params(run_info, run_params)
  _collect_cpu_info(run_info)
  _collect_memory_info(run_info)
  _collect_test_environment(run_info)
  return run_info
def _process_metric_to_json(
    name, value, unit=None, global_step=None, extras=None):
  """Validate the metric data and generate JSON for insert.

  Args:
    name: the name of the metric.
    value: the metric value; must be a `numbers.Number`.
    unit: optional unit of the metric.
    global_step: optional global step at which the metric was taken.
    extras: optional dict of extra information about the metric.

  Returns:
    A dict describing the metric (value coerced to float, timestamp in UTC),
    or None, after logging a warning, when `value` is not a number.
  """
  if isinstance(value, numbers.Number):
    extras_json = _convert_to_json_dict(extras)
    metric = {
        "name": name,
        "value": float(value),
        "unit": unit,
        "global_step": global_step,
        "timestamp": datetime.datetime.utcnow().strftime(
            _DATE_TIME_FORMAT_PATTERN),
        "extras": extras_json,
    }
    return metric

  tf.compat.v1.logging.warning(
      "Metric value to log should be a number. Got %s", type(value))
  return None
def _collect_tensorflow_info(run_info):
  """Store the TensorFlow version and git hash under "tensorflow_version"."""
  version_info = {}
  version_info["version"] = tf.version.VERSION
  version_info["git_hash"] = tf.version.GIT_VERSION
  run_info["tensorflow_version"] = version_info
def _collect_run_params(run_info, run_params):
"""Log the parameter information for the benchmark run."""
def process_param(name, value):
type_check = {
str: {"name": name, "string_value": value},
int: {"name": name, "long_value": value},
bool: {"name": name, "bool_value": str(value)},
float: {"name": name, "float_value": value},
}
return type_check.get(type(value),
{"name": name, "string_value": str(value)})
if run_params:
run_info["run_parameters"] = [
process_param(k, v) for k, v in sorted(run_params.items())]
def _collect_tensorflow_environment_variables(run_info):
run_info["tensorflow_environment_variables"] = [
{"name": k, "value": v}
for k, v in sorted(os.environ.items()) if k.startswith("TF_")]
# The following code is mirrored from tensorflow/tools/test/system_info_lib
# which is not exposed for import.
def _collect_cpu_info(run_info):
"""Collect the CPU information for the local environment."""
cpu_info = {}
cpu_info["num_cores"] = multiprocessing.cpu_count()
try:
# Note: cpuinfo is not installed in the TensorFlow OSS tree.
# It is installable via pip.
import cpuinfo # pylint: disable=g-import-not-at-top
info = cpuinfo.get_cpu_info()
cpu_info["cpu_info"] = info["brand"]
cpu_info["mhz_per_cpu"] = info["hz_advertised_raw"][0] / 1.0e6
run_info["machine_config"]["cpu_info"] = cpu_info
except ImportError:
tf.compat.v1.logging.warn(
"'cpuinfo' not imported. CPU info will not be logged.")
def _collect_memory_info(run_info):
try:
# Note: psutil is not installed in the TensorFlow OSS tree.
# It is installable via pip.
import psutil # pylint: disable=g-import-not-at-top
vmem = psutil.virtual_memory()
run_info["machine_config"]["memory_total"] = vmem.total
run_info["machine_config"]["memory_available"] = vmem.available
except ImportError:
tf.compat.v1.logging.warn(
"'psutil' not imported. Memory info will not be logged.")
def _collect_test_environment(run_info):
  """Detect the local environment, eg GCE, AWS or DGX, etc.

  Only GCP is detected at present; on any other platform `run_info` is left
  without a "test_environment" entry.
  """
  running_on_gcp = cloud_lib.on_gcp()
  if running_on_gcp:
    run_info["test_environment"] = GCP_TEST_ENV
  # TODO(scottzhu): Add more testing env detection for other platform
def _parse_gpu_model(physical_device_desc):
# Assume all the GPU connected are same model
for kv in physical_device_desc.split(","):
k, _, v = kv.partition(":")
if k.strip() == "name":
return v.strip()
return None
def _convert_to_json_dict(input_dict):
if input_dict:
return [{"name": k, "value": v} for k, v in sorted(input_dict.items())]
else:
return []
@@ -0,0 +1,365 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for benchmark logger."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import os
import tempfile
import time
import unittest
import mock
from absl.testing import flagsaver
import tensorflow as tf # pylint: disable=g-bad-import-order
try:
from google.cloud import bigquery
except ImportError:
bigquery = None
from official.utils.misc import keras_utils
from official.utils.flags import core as flags_core
from official.utils.logs import logger
class BenchmarkLoggerTest(tf.test.TestCase):
  """Tests for logger selection via flags and for `benchmark_context`."""

  @classmethod
  def setUpClass(cls):  # pylint: disable=invalid-name
    super(BenchmarkLoggerTest, cls).setUpClass()
    flags_core.define_benchmark()

  def test_get_default_benchmark_logger(self):
    """An unrecognised logger type still yields a BaseBenchmarkLogger."""
    with flagsaver.flagsaver(benchmark_logger_type="foo"):
      active_logger = logger.get_benchmark_logger()
      self.assertIsInstance(active_logger, logger.BaseBenchmarkLogger)

  def test_config_base_benchmark_logger(self):
    with flagsaver.flagsaver(benchmark_logger_type="BaseBenchmarkLogger"):
      logger.config_benchmark_logger()
      active_logger = logger.get_benchmark_logger()
      self.assertIsInstance(active_logger, logger.BaseBenchmarkLogger)

  def test_config_benchmark_file_logger(self):
    # Set the benchmark_log_dir first since the benchmark_logger_type will need
    # the value to be set when it does the validation.
    with flagsaver.flagsaver(benchmark_log_dir="/tmp"):
      with flagsaver.flagsaver(benchmark_logger_type="BenchmarkFileLogger"):
        logger.config_benchmark_logger()
        active_logger = logger.get_benchmark_logger()
        self.assertIsInstance(active_logger, logger.BenchmarkFileLogger)

  @unittest.skipIf(bigquery is None, "Bigquery dependency is not installed.")
  @mock.patch.object(bigquery, "Client")
  def test_config_benchmark_bigquery_logger(self, mock_bigquery_client):
    with flagsaver.flagsaver(benchmark_logger_type="BenchmarkBigQueryLogger"):
      logger.config_benchmark_logger()
      active_logger = logger.get_benchmark_logger()
      self.assertIsInstance(active_logger, logger.BenchmarkBigQueryLogger)

  @mock.patch("official.utils.logs.logger.config_benchmark_logger")
  def test_benchmark_context(self, mock_config_benchmark_logger):
    """A clean exit from the context reports RUN_STATUS_SUCCESS."""
    fake_logger = mock.MagicMock()
    mock_config_benchmark_logger.return_value = fake_logger

    with logger.benchmark_context(None):
      tf.compat.v1.logging.info("start benchmarking")

    fake_logger.on_finish.assert_called_once_with(logger.RUN_STATUS_SUCCESS)

  @mock.patch("official.utils.logs.logger.config_benchmark_logger")
  def test_benchmark_context_failure(self, mock_config_benchmark_logger):
    """An exception inside the context reports RUN_STATUS_FAILURE."""
    fake_logger = mock.MagicMock()
    mock_config_benchmark_logger.return_value = fake_logger

    with self.assertRaises(RuntimeError):
      with logger.benchmark_context(None):
        raise RuntimeError("training error")

    fake_logger.on_finish.assert_called_once_with(logger.RUN_STATUS_FAILURE)
class BaseBenchmarkLoggerTest(tf.test.TestCase):
  """Tests for BaseBenchmarkLogger, capturing what it sends to logging.info."""

  def setUp(self):
    super(BaseBenchmarkLoggerTest, self).setUp()
    self._actual_log = tf.compat.v1.logging.info
    self.logged_message = None

    def mock_log(*args, **kwargs):
      # Remember the positional arguments, then forward to the real logger.
      self.logged_message = args
      self._actual_log(*args, **kwargs)

    # Swap in the recording wrapper; tearDown puts the original back.
    tf.compat.v1.logging.info = mock_log

  def tearDown(self):
    super(BaseBenchmarkLoggerTest, self).tearDown()
    tf.compat.v1.logging.info = self._actual_log

  def test_log_metric(self):
    base_logger = logger.BaseBenchmarkLogger()
    base_logger.log_metric(
        "accuracy", 0.999, global_step=1e4, extras={"name": "value"})

    expected_log_prefix = "Benchmark metric:"
    self.assertRegexpMatches(str(self.logged_message), expected_log_prefix)
class BenchmarkFileLoggerTest(tf.test.TestCase):
  """Tests for BenchmarkFileLogger and the run-info `_collect_*` helpers."""

  def setUp(self):
    super(BenchmarkFileLoggerTest, self).setUp()
    # Avoid pulling extra env vars from test environment which affects the test
    # result, eg. Kokoro test has a TF_PKG env which affect the test case
    # test_collect_tensorflow_environment_variables()
    self.original_environ = dict(os.environ)
    os.environ.clear()

  def tearDown(self):
    super(BenchmarkFileLoggerTest, self).tearDown()
    tf.io.gfile.rmtree(self.get_temp_dir())
    os.environ.clear()
    os.environ.update(self.original_environ)

  def _make_file_logger(self):
    """Return (log_dir, logger) for a fresh directory under the temp dir."""
    log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
    return log_dir, logger.BenchmarkFileLogger(log_dir)

  def _assert_metric(self, record, name, value, global_step):
    """Check the common fields of one decoded metric record."""
    self.assertEqual(record["name"], name)
    self.assertEqual(record["value"], value)
    self.assertEqual(record["unit"], None)
    self.assertEqual(record["global_step"], global_step)

  def test_create_logging_dir(self):
    missing_dir = os.path.join(self.get_temp_dir(), "unknown_dir")
    self.assertFalse(tf.io.gfile.isdir(missing_dir))

    logger.BenchmarkFileLogger(missing_dir)
    self.assertTrue(tf.io.gfile.isdir(missing_dir))

  def test_log_metric(self):
    log_dir, file_logger = self._make_file_logger()
    file_logger.log_metric(
        "accuracy", 0.999, global_step=1e4, extras={"name": "value"})

    metric_log = os.path.join(log_dir, "metric.log")
    self.assertTrue(tf.io.gfile.exists(metric_log))
    with tf.io.gfile.GFile(metric_log) as metric_file:
      metric = json.loads(metric_file.readline())
      self._assert_metric(metric, "accuracy", 0.999, 1e4)
      self.assertEqual(metric["extras"], [{"name": "name", "value": "value"}])

  def test_log_multiple_metrics(self):
    log_dir, file_logger = self._make_file_logger()
    file_logger.log_metric(
        "accuracy", 0.999, global_step=1e4, extras={"name": "value"})
    file_logger.log_metric("loss", 0.02, global_step=1e4)

    metric_log = os.path.join(log_dir, "metric.log")
    self.assertTrue(tf.io.gfile.exists(metric_log))
    with tf.io.gfile.GFile(metric_log) as metric_file:
      accuracy = json.loads(metric_file.readline())
      self._assert_metric(accuracy, "accuracy", 0.999, 1e4)
      self.assertEqual(
          accuracy["extras"], [{"name": "name", "value": "value"}])

      loss = json.loads(metric_file.readline())
      self._assert_metric(loss, "loss", 0.02, 1e4)
      self.assertEqual(loss["extras"], [])

  def test_log_non_number_value(self):
    """A tensor value is rejected and no metric file is created."""
    log_dir, file_logger = self._make_file_logger()
    tensor_value = tf.constant(1)
    file_logger.log_metric("accuracy", tensor_value)

    metric_log = os.path.join(log_dir, "metric.log")
    self.assertFalse(tf.io.gfile.exists(metric_log))

  def test_log_evaluation_result(self):
    eval_result = {"loss": 0.46237424,
                   "global_step": 207082,
                   "accuracy": 0.9285}
    log_dir, file_logger = self._make_file_logger()
    file_logger.log_evaluation_result(eval_result)

    metric_log = os.path.join(log_dir, "metric.log")
    self.assertTrue(tf.io.gfile.exists(metric_log))
    with tf.io.gfile.GFile(metric_log) as metric_file:
      accuracy = json.loads(metric_file.readline())
      self._assert_metric(accuracy, "accuracy", 0.9285, 207082)

      loss = json.loads(metric_file.readline())
      self._assert_metric(loss, "loss", 0.46237424, 207082)

  def test_log_evaluation_result_with_invalid_type(self):
    """A string in place of a dict is ignored and nothing is written."""
    eval_result = "{'loss': 0.46237424, 'global_step': 207082}"
    log_dir, file_logger = self._make_file_logger()
    file_logger.log_evaluation_result(eval_result)

    metric_log = os.path.join(log_dir, "metric.log")
    self.assertFalse(tf.io.gfile.exists(metric_log))

  @mock.patch("official.utils.logs.logger._gather_run_info")
  def test_log_run_info(self, mock_gather_run_info):
    log_dir, file_logger = self._make_file_logger()
    gathered = {"model_name": "model_name",
                "dataset": "dataset_name",
                "run_info": "run_value"}
    mock_gather_run_info.return_value = gathered
    file_logger.log_run_info("model_name", "dataset_name", {})

    run_log = os.path.join(log_dir, "benchmark_run.log")
    self.assertTrue(tf.io.gfile.exists(run_log))
    with tf.io.gfile.GFile(run_log) as run_file:
      written = json.loads(run_file.readline())
      self.assertEqual(written["model_name"], "model_name")
      self.assertEqual(written["dataset"], "dataset_name")
      self.assertEqual(written["run_info"], "run_value")

  def test_collect_tensorflow_info(self):
    run_info = {}
    logger._collect_tensorflow_info(run_info)

    version_info = run_info["tensorflow_version"]
    self.assertNotEqual(version_info, {})
    self.assertEqual(version_info["version"], tf.version.VERSION)
    self.assertEqual(version_info["git_hash"], tf.version.GIT_VERSION)

  def test_collect_run_params(self):
    run_info = {}
    run_parameters = {
        "batch_size": 32,
        "synthetic_data": True,
        "train_epochs": 100.00,
        "dtype": "fp16",
        "resnet_size": 50,
        "random_tensor": tf.constant(2.0)
    }
    logger._collect_run_params(run_info, run_parameters)

    collected = run_info["run_parameters"]
    self.assertEqual(len(collected), 6)
    self.assertEqual(collected[0],
                     {"name": "batch_size", "long_value": 32})
    self.assertEqual(collected[1],
                     {"name": "dtype", "string_value": "fp16"})

    # The string form of a tensor differs between TF1 and TF2; accept either.
    v1_tensor = {"name": "random_tensor", "string_value":
                     "Tensor(\"Const:0\", shape=(), dtype=float32)"}
    v2_tensor = {"name": "random_tensor", "string_value":
                     "tf.Tensor(2.0, shape=(), dtype=float32)"}
    self.assertIn(collected[2], [v1_tensor, v2_tensor])

    self.assertEqual(collected[3],
                     {"name": "resnet_size", "long_value": 50})
    self.assertEqual(collected[4],
                     {"name": "synthetic_data", "bool_value": "True"})
    self.assertEqual(collected[5],
                     {"name": "train_epochs", "float_value": 100.00})

  def test_collect_tensorflow_environment_variables(self):
    os.environ["TF_ENABLE_WINOGRAD_NONFUSED"] = "1"
    os.environ["TF_OTHER"] = "2"
    os.environ["OTHER"] = "3"

    run_info = {}
    logger._collect_tensorflow_environment_variables(run_info)
    collected = run_info["tensorflow_environment_variables"]
    self.assertIsNotNone(collected)

    expected_tf_envs = [
        {"name": "TF_ENABLE_WINOGRAD_NONFUSED", "value": "1"},
        {"name": "TF_OTHER", "value": "2"},
    ]
    self.assertEqual(collected, expected_tf_envs)

  def test_collect_memory_info(self):
    run_info = {"machine_config": {}}
    logger._collect_memory_info(run_info)

    machine_config = run_info["machine_config"]
    self.assertIsNotNone(machine_config["memory_total"])
    self.assertIsNotNone(machine_config["memory_available"])
@unittest.skipIf(bigquery is None, "Bigquery dependency is not installed.")
class BenchmarkBigQueryLoggerTest(tf.test.TestCase):
  """Tests for BenchmarkBigQueryLogger against a mocked uploader."""

  def setUp(self):
    super(BenchmarkBigQueryLoggerTest, self).setUp()
    # Avoid pulling extra env vars from test environment which affects the test
    # result, eg. Kokoro test has a TF_PKG env which would otherwise leak into
    # the collected run info.
    self.original_environ = dict(os.environ)
    os.environ.clear()

    self.mock_bq_uploader = mock.MagicMock()
    self.logger = logger.BenchmarkBigQueryLogger(
        self.mock_bq_uploader, "dataset", "run_table", "run_status_table",
        "metric_table", "run_id")

  def tearDown(self):
    super(BenchmarkBigQueryLoggerTest, self).tearDown()
    tf.io.gfile.rmtree(self.get_temp_dir())
    os.environ.clear()
    os.environ.update(self.original_environ)

  def test_log_metric(self):
    self.logger.log_metric(
        "accuracy", 0.999, global_step=1e4, extras={"name": "value"})
    expected_metric_json = [{
        "name": "accuracy",
        "value": 0.999,
        "unit": None,
        "global_step": 1e4,
        "timestamp": mock.ANY,
        "extras": [{"name": "name", "value": "value"}]
    }]
    # log_metric will call upload_benchmark_metric_json in a separate thread.
    # Give it some grace period for the new thread before assert.
    time.sleep(1)
    self.mock_bq_uploader.upload_benchmark_metric_json.assert_called_once_with(
        "dataset", "metric_table", "run_id", expected_metric_json)

  @mock.patch("official.utils.logs.logger._gather_run_info")
  def test_log_run_info(self, mock_gather_run_info):
    run_info = {"model_name": "model_name",
                "dataset": "dataset_name",
                "run_info": "run_value"}
    mock_gather_run_info.return_value = run_info
    self.logger.log_run_info("model_name", "dataset_name", {})
    # log_run_info calls upload_benchmark_run_json and insert_run_status in
    # separate threads. Give them some grace period before asserting.
    time.sleep(1)
    self.mock_bq_uploader.upload_benchmark_run_json.assert_called_once_with(
        "dataset", "run_table", "run_id", run_info)
    self.mock_bq_uploader.insert_run_status.assert_called_once_with(
        "dataset", "run_status_table", "run_id", "running")

  def test_on_finish(self):
    self.logger.on_finish(logger.RUN_STATUS_SUCCESS)
    # on_finish calls update_run_status directly on the calling thread, so the
    # call has already happened by the time on_finish returns; no grace period
    # is needed before asserting.
    self.mock_bq_uploader.update_run_status.assert_called_once_with(
        "dataset", "run_status_table", "run_id", logger.RUN_STATUS_SUCCESS)
# Run every test case in this module when the file is executed as a script.
if __name__ == "__main__":
  tf.test.main()
@@ -0,0 +1,97 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Session hook for logging benchmark metric."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf # pylint: disable=g-bad-import-order
class LoggingMetricHook(tf.estimator.LoggingTensorHook):
  """Hook to log benchmark metric information.

  This hook is very similar as tf.train.LoggingTensorHook, which logs given
  tensors every N local steps, every N seconds, or at the end. Instead of
  printing, each tensor value is handed to the supplied `metric_logger`
  through its `log_metric` method, together with the current global step.

  Note that if `at_end` is True, `tensors` should not include any tensor
  whose evaluation produces a side effect such as consuming additional inputs.
  """

  def __init__(self, tensors, metric_logger=None,
               every_n_iter=None, every_n_secs=None, at_end=False):
    """Initializer for LoggingMetricHook.

    Args:
      tensors: `dict` that maps string-valued tags to tensors/tensor names,
          or `iterable` of tensors/tensor names.
      metric_logger: instance of `BenchmarkLogger`, the benchmark logger that
          hook should use to write the log. Required despite the None default.
      every_n_iter: `int`, print the values of `tensors` once every N local
          steps taken on the current worker.
      every_n_secs: `int` or `float`, print the values of `tensors` once every N
          seconds. Exactly one of `every_n_iter` and `every_n_secs` should be
          provided.
      at_end: `bool` specifying whether to print the values of `tensors` at the
          end of the run.

    Raises:
      ValueError:
        1. `every_n_iter` is non-positive, or
        2. Exactly one of every_n_iter and every_n_secs should be provided.
        3. `metric_logger` is None.
    """
    # The parent initializer runs first, so its argument checks take
    # precedence over the metric_logger check below.
    super(LoggingMetricHook, self).__init__(
        tensors=tensors,
        every_n_iter=every_n_iter,
        every_n_secs=every_n_secs,
        at_end=at_end)

    if metric_logger is None:
      raise ValueError("metric_logger should be provided.")
    self._logger = metric_logger

  def begin(self):
    """Look up the global step and add it to the tensors being fetched.

    Raises:
      RuntimeError: if no global step tensor exists in the graph.
    """
    super(LoggingMetricHook, self).begin()
    self._global_step_tensor = tf.compat.v1.train.get_global_step()
    step_tensor = self._global_step_tensor
    if step_tensor is None:
      raise RuntimeError(
          "Global step should be created to use LoggingMetricHook.")
    # Only add the global step when the caller did not already request it.
    self._current_tensors.setdefault(step_tensor.name, step_tensor)

  def after_run(self, unused_run_context, run_values):
    """Log the fetched values when this step was scheduled to trigger."""
    # should_trigger is a internal state that populated at before_run, and it is
    # using self_timer to determine whether it should trigger.
    if self._should_trigger:
      self._log_metric(run_values.results)

    self._iter_count += 1

  def end(self, session):
    """Evaluate and log the tensors one last time when `at_end` was set."""
    if not self._log_at_end:
      return
    final_values = session.run(self._current_tensors)
    self._log_metric(final_values)

  def _log_metric(self, tensor_values):
    """Send every tagged value to the metric logger with the global step."""
    self._timer.update_last_triggered_step(self._iter_count)
    step_value = tensor_values[self._global_step_tensor.name]
    # self._tag_order is populated during the init of LoggingTensorHook
    for tag in self._tag_order:
      self._logger.log_metric(
          tag, tensor_values[tag], global_step=step_value)
@@ -0,0 +1,217 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for metric_hook."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tempfile
import time
import tensorflow as tf # pylint: disable=g-bad-import-order
from tensorflow.python.training import monitored_session # pylint: disable=g-bad-import-order
from official.utils.logs import metric_hook
from official.utils.testing import mock_lib
class LoggingMetricHookTest(tf.test.TestCase):
  """Tests for LoggingMetricHook."""

  def setUp(self):
    super(LoggingMetricHookTest, self).setUp()

    self._log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
    self._logger = mock_lib.MockBenchmarkLogger()

  def tearDown(self):
    super(LoggingMetricHookTest, self).tearDown()
    tf.io.gfile.rmtree(self.get_temp_dir())

  def _assert_logged(self, tensor_name):
    """Assert that `tensor_name` appears in the recorded metrics."""
    self.assertRegexpMatches(str(self._logger.logged_metric), tensor_name)

  def _assert_not_logged(self, tensor_name):
    """Assert that `tensor_name` is absent from the recorded metrics."""
    # assertNotRegexpMatches is not supported by python 3.1 and later
    self.assertEqual(str(self._logger.logged_metric).find(tensor_name), -1)

  def _assert_metric(self, metric, name_pattern, value):
    """Check one recorded metric taken at global step 0."""
    self.assertRegexpMatches(str(metric["name"]), name_pattern)
    self.assertEqual(metric["value"], value)
    self.assertEqual(metric["unit"], None)
    self.assertEqual(metric["global_step"], 0)

  def test_illegal_args(self):
    with self.assertRaisesRegexp(ValueError, "nvalid every_n_iter"):
      metric_hook.LoggingMetricHook(tensors=["t"], every_n_iter=0)
    with self.assertRaisesRegexp(ValueError, "nvalid every_n_iter"):
      metric_hook.LoggingMetricHook(tensors=["t"], every_n_iter=-10)
    with self.assertRaisesRegexp(ValueError, "xactly one of"):
      metric_hook.LoggingMetricHook(
          tensors=["t"], every_n_iter=5, every_n_secs=5)
    with self.assertRaisesRegexp(ValueError, "xactly one of"):
      metric_hook.LoggingMetricHook(tensors=["t"])
    with self.assertRaisesRegexp(ValueError, "metric_logger"):
      metric_hook.LoggingMetricHook(tensors=["t"], every_n_iter=5)

  def test_print_at_end_only(self):
    with tf.Graph().as_default(), tf.compat.v1.Session() as sess:
      tf.compat.v1.train.get_or_create_global_step()
      foo = tf.constant(42.0, name="foo")
      train_op = tf.constant(3)
      hook = metric_hook.LoggingMetricHook(
          tensors=[foo.name], at_end=True, metric_logger=self._logger)
      hook.begin()
      mon_sess = monitored_session._HookedSession(sess, [hook])  # pylint: disable=protected-access
      sess.run(tf.compat.v1.global_variables_initializer())

      # Nothing may be logged while the session is still running.
      for _ in range(3):
        mon_sess.run(train_op)
        self.assertEqual(self._logger.logged_metric, [])

      hook.end(sess)
      self.assertEqual(len(self._logger.logged_metric), 1)
      metric = self._logger.logged_metric[0]
      self.assertRegexpMatches(metric["name"], "foo")
      self.assertEqual(metric["value"], 42.0)
      self.assertEqual(metric["unit"], None)
      self.assertEqual(metric["global_step"], 0)

  def test_global_step_not_found(self):
    with tf.Graph().as_default():
      foo = tf.constant(42.0, name="foo")
      hook = metric_hook.LoggingMetricHook(
          tensors=[foo.name], at_end=True, metric_logger=self._logger)

      with self.assertRaisesRegexp(
          RuntimeError, "should be created to use LoggingMetricHook."):
        hook.begin()

  def test_log_tensors(self):
    with tf.Graph().as_default(), tf.compat.v1.Session() as sess:
      tf.compat.v1.train.get_or_create_global_step()
      foo = tf.constant(42.0, name="foo")
      bar = tf.constant(43.0, name="bar")
      train_op = tf.constant(3)
      hook = metric_hook.LoggingMetricHook(
          tensors=[foo, bar], at_end=True, metric_logger=self._logger)
      hook.begin()
      mon_sess = monitored_session._HookedSession(sess, [hook])  # pylint: disable=protected-access
      sess.run(tf.compat.v1.global_variables_initializer())

      for _ in range(3):
        mon_sess.run(train_op)
        self.assertEqual(self._logger.logged_metric, [])

      hook.end(sess)
      self.assertEqual(len(self._logger.logged_metric), 2)
      self._assert_metric(self._logger.logged_metric[0], "foo", 42.0)
      self._assert_metric(self._logger.logged_metric[1], "bar", 43.0)

  def _validate_print_every_n_steps(self, sess, at_end):
    """Drive a hook with every_n_iter=10 and check when it logs."""
    foo = tf.constant(42.0, name="foo")

    train_op = tf.constant(3)

    hook = metric_hook.LoggingMetricHook(
        tensors=[foo.name], every_n_iter=10, at_end=at_end,
        metric_logger=self._logger)
    hook.begin()
    mon_sess = monitored_session._HookedSession(sess, [hook])  # pylint: disable=protected-access
    sess.run(tf.compat.v1.global_variables_initializer())

    # The first run logs.
    mon_sess.run(train_op)
    self._assert_logged(foo.name)

    # Then nine silent runs followed by one logging run, three times over.
    for _ in range(3):
      self._logger.logged_metric = []
      for _ in range(9):
        mon_sess.run(train_op)
        self._assert_not_logged(foo.name)
      mon_sess.run(train_op)
      self._assert_logged(foo.name)

    # Add additional run to verify proper reset when called multiple times.
    self._logger.logged_metric = []
    mon_sess.run(train_op)
    self._assert_not_logged(foo.name)

    self._logger.logged_metric = []
    hook.end(sess)
    if at_end:
      self._assert_logged(foo.name)
    else:
      self._assert_not_logged(foo.name)

  def test_print_every_n_steps(self):
    with tf.Graph().as_default(), tf.compat.v1.Session() as sess:
      tf.compat.v1.train.get_or_create_global_step()
      self._validate_print_every_n_steps(sess, at_end=False)
      # Verify proper reset.
      self._validate_print_every_n_steps(sess, at_end=False)

  def test_print_every_n_steps_and_end(self):
    with tf.Graph().as_default(), tf.compat.v1.Session() as sess:
      tf.compat.v1.train.get_or_create_global_step()
      self._validate_print_every_n_steps(sess, at_end=True)
      # Verify proper reset.
      self._validate_print_every_n_steps(sess, at_end=True)

  def _validate_print_every_n_secs(self, sess, at_end):
    """Drive a hook with every_n_secs=1.0 and check when it logs."""
    foo = tf.constant(42.0, name="foo")
    train_op = tf.constant(3)

    hook = metric_hook.LoggingMetricHook(
        tensors=[foo.name], every_n_secs=1.0, at_end=at_end,
        metric_logger=self._logger)
    hook.begin()
    mon_sess = monitored_session._HookedSession(sess, [hook])  # pylint: disable=protected-access
    sess.run(tf.compat.v1.global_variables_initializer())

    # The first run logs.
    mon_sess.run(train_op)
    self._assert_logged(foo.name)

    # An immediate second run falls inside the one-second window.
    self._logger.logged_metric = []
    mon_sess.run(train_op)
    self._assert_not_logged(foo.name)

    # After waiting out the window, the next run logs again.
    time.sleep(1.0)
    self._logger.logged_metric = []
    mon_sess.run(train_op)
    self._assert_logged(foo.name)

    self._logger.logged_metric = []
    hook.end(sess)
    if at_end:
      self._assert_logged(foo.name)
    else:
      self._assert_not_logged(foo.name)

  def test_print_every_n_secs(self):
    with tf.Graph().as_default(), tf.compat.v1.Session() as sess:
      tf.compat.v1.train.get_or_create_global_step()
      self._validate_print_every_n_secs(sess, at_end=False)
      # Verify proper reset.
      self._validate_print_every_n_secs(sess, at_end=False)

  def test_print_every_n_secs_and_end(self):
    with tf.Graph().as_default(), tf.compat.v1.Session() as sess:
      tf.compat.v1.train.get_or_create_global_step()
      self._validate_print_every_n_secs(sess, at_end=True)
      # Verify proper reset.
      self._validate_print_every_n_secs(sess, at_end=True)
# Run every test case in this module when the file is executed as a script.
if __name__ == "__main__":
  tf.test.main()
@@ -0,0 +1,192 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Wrapper for the mlperf logging utils.
MLPerf compliance logging is only desired under a limited set of circumstances.
This module is intended to keep users from needing to consider logging (or
install the module) unless they are performing mlperf runs.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import json
import os
import re
import subprocess
import sys
import typing
import tensorflow as tf
# Versions of mlperf_compliance below this are rejected by get_mlperf_log().
_MIN_VERSION = (0, 0, 10)
# Default `stack_offset` forwarded to mlperf_log.ncf_print by Logger.ncf_print.
_STACK_OFFSET = 2

# Prefix for the cache-dropping command: empty when the effective uid is 0.
SUDO = "sudo" if os.geteuid() else ""

# This indirection is used in docker.
DROP_CACHE_LOC = os.getenv("DROP_CACHE_LOC", "/proc/sys/vm/drop_caches")

_NCF_PREFIX = "NCF_RAW_"

# TODO(robieta): move line parsing to mlperf util
# The dots between the version components are escaped so that only a literal
# "." is accepted (an unescaped "." would match any character).
_PREFIX = r"(?:{})?:::MLPv([0-9]+)\.([0-9]+)\.([0-9]+)".format(_NCF_PREFIX)
_BENCHMARK = r"([a-zA-Z0-9_]+)"
_TIMESTAMP = r"([0-9]+\.[0-9]+)"
_CALLSITE = r"\((.+):([0-9]+)\)"
_TAG = r"([a-zA-Z0-9_]+)"
_VALUE = r"(.*)"

# Structured form of one log line; `version` and `callsite` are tuples.
ParsedLine = namedtuple("ParsedLine", ["version", "benchmark", "timestamp",
                                       "callsite", "tag", "value"])

# Full-line matcher. Capture groups, in order: major, minor, micro, benchmark,
# timestamp, callsite file, callsite line, tag, the ": " separator (or empty at
# end of line), and the value.
LINE_PATTERN = re.compile(
    "^{prefix} {benchmark} {timestamp} {callsite} {tag}(: |$){value}?$".format(
        prefix=_PREFIX, benchmark=_BENCHMARK, timestamp=_TIMESTAMP,
        callsite=_CALLSITE, tag=_TAG, value=_VALUE))
def parse_line(line):  # type: (str) -> typing.Optional[ParsedLine]
  """Parses one log line into a ParsedLine.

  Args:
    line: The raw log line; surrounding whitespace is stripped before matching.

  Returns:
    A ParsedLine, or None when the line does not match LINE_PATTERN.
  """
  match = LINE_PATTERN.match(line.strip())
  if match is None:
    return None

  # The ninth group is the ": " separator and carries no information.
  (major, minor, micro, benchmark, timestamp,
   call_file, call_line, tag, _, value) = match.groups()
  version = (int(major), int(minor), int(micro))
  return ParsedLine(version=version, benchmark=benchmark, timestamp=timestamp,
                    callsite=(call_file, call_line), tag=tag, value=value)
def unparse_line(parsed_line):  # type: (ParsedLine) -> str
  """Renders a ParsedLine back into a log line.

  The tag and the optional ": value" suffix are joined without a separating
  space, so the result has the "tag: value" shape that LINE_PATTERN expects
  and carries no trailing space when the value is empty.

  Args:
    parsed_line: Object with `version` (3 items), `benchmark`, `timestamp`,
      `callsite` (2 items), `tag` and `value` attributes.

  Returns:
    The formatted line, always starting with ":::MLPv".
  """
  version_str = "{}.{}.{}".format(*parsed_line.version)
  callsite_str = "({}:{})".format(*parsed_line.callsite)
  # A falsy value (None or "") produces a line that ends right after the tag.
  value_str = ": {}".format(parsed_line.value) if parsed_line.value else ""
  return ":::MLPv{} {} {} {} {}{}".format(
      version_str, parsed_line.benchmark, parsed_line.timestamp, callsite_str,
      parsed_line.tag, value_str)
def get_mlperf_log():
  """Shielded import of mlperf_log module.

  Returns:
    `mlperf_compliance.mlperf_log`, or None when mlperf_compliance (or
    pkg_resources) cannot be imported or the installed mlperf_compliance is
    older than _MIN_VERSION.
  """
  try:
    import mlperf_compliance
    import pkg_resources

    # Check that mlperf_compliance is up to date.
    distribution = pkg_resources.get_distribution("mlperf_compliance")
    installed = tuple(int(i) for i in distribution.version.split("."))
    if installed < _MIN_VERSION:
      tf.compat.v1.logging.warning(
          "mlperf_compliance is version {}, must be >= {}".format(
              ".".join([str(i) for i in installed]),
              ".".join([str(i) for i in _MIN_VERSION])))
      # Treat an outdated package exactly like a missing one.
      raise ImportError
    return mlperf_compliance.mlperf_log
  except ImportError:
    return None
class Logger(object):
  """MLPerf logger indirection class.

  This logger only logs for MLPerf runs, and prevents various errors associated
  with not having the mlperf_compliance package installed.

  Calling the instance (e.g. `logger(True)`) switches logging on or off and
  returns the instance, which can then be used as a context manager; logging is
  switched off again when the `with` block exits.
  """

  class Tags(object):
    """Attribute proxy for the wrapped mlperf_log module.

    Lookups return None while logging is disabled or the module is unavailable.
    """

    def __init__(self, mlperf_log):
      self._enabled = False
      self._mlperf_log = mlperf_log

    def __getattr__(self, item):
      # __getattr__ only runs for names that normal lookup did not find, so the
      # two instance attributes set in __init__ never reach this method.
      if self._mlperf_log is None or not self._enabled:
        return None
      return getattr(self._mlperf_log, item)

  def __init__(self):
    self._enabled = False
    self._mlperf_log = get_mlperf_log()
    self.tags = self.Tags(self._mlperf_log)

  def __call__(self, enable=False):
    """Enables or disables logging and returns self.

    Raises:
      ImportError: if logging is requested but mlperf_log could not be loaded.
    """
    if enable and self._mlperf_log is None:
      raise ImportError("MLPerf logging was requested, but mlperf_compliance "
                        "module could not be loaded.")

    self._enabled = enable
    self.tags._enabled = enable
    return self

  def __enter__(self):
    # Return the logger so that `with LOGGER(True) as logger:` binds a usable
    # object instead of None.
    return self

  def __exit__(self, exc_type, exc_val, exc_tb):
    # Always switch logging off; returning None lets exceptions propagate.
    self._enabled = False
    self.tags._enabled = False

  @property
  def log_file(self):
    """The wrapped module's LOG_FILE, or None when the module is unavailable."""
    if self._mlperf_log is None:
      return None
    return self._mlperf_log.LOG_FILE

  @property
  def enabled(self):
    """Whether logging is currently switched on."""
    return self._enabled

  def ncf_print(self, key, value=None, stack_offset=_STACK_OFFSET,
                deferred=False, extra_print=False, prefix=_NCF_PREFIX):
    """Forwards to mlperf_log.ncf_print when logging is enabled; else no-op."""
    if self._mlperf_log is None or not self.enabled:
      return
    self._mlperf_log.ncf_print(key=key, value=value, stack_offset=stack_offset,
                               deferred=deferred, extra_print=extra_print,
                               prefix=prefix)

  def set_ncf_root(self, path):
    """Sets ROOT_DIR_NCF on the wrapped module, if it is available."""
    if self._mlperf_log is None:
      return
    self._mlperf_log.ROOT_DIR_NCF = path
# Module-level logger instance. Its bound methods and its tag proxy are also
# exposed as module-level names so callers do not have to go through LOGGER.
LOGGER = Logger()
ncf_print, set_ncf_root = LOGGER.ncf_print, LOGGER.set_ncf_root
TAGS = LOGGER.tags
def clear_system_caches():
  """Syncs and writes 3 to DROP_CACHE_LOC; a no-op unless LOGGER is enabled.

  Raises:
    ValueError: if the shell command exits with a non-zero status.
  """
  if not LOGGER.enabled:
    return

  drop_command = "sync && echo 3 | {} tee {}".format(SUDO, DROP_CACHE_LOC)
  if subprocess.call([drop_command], shell=True):
    raise ValueError("Failed to clear caches")
# When run as a script: enable INFO logging, switch MLPerf logging on (this
# raises ImportError if mlperf_log could not be loaded) and emit RUN_START.
if __name__ == "__main__":
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
with LOGGER(True):
ncf_print(key=TAGS.RUN_START)
@@ -0,0 +1,62 @@
"""A simple Python callstack sampler."""
import contextlib
import datetime
import signal
import traceback
class CallstackSampler(object):
  """A simple signal-based Python callstack sampler.

  Samples are collected in `self.stacks`; each sample is a list whose first
  item is a UTC timestamp followed by one formatted string per stack frame.
  """

  def __init__(self, interval=None):
    self.stacks = []
    if interval is None:
      interval = 0.001
    self.interval = interval

  def _sample(self, signum, frame):
    """Samples the current stack."""
    del signum
    frames = traceback.extract_stack(frame)
    sample = [datetime.datetime.utcnow()]
    sample.extend(
        '{}:{}({})({})'.format(filename, lineno, function_name, text)
        for filename, lineno, function_name, text in frames)
    self.stacks.append(sample)
    # The timer is one-shot (repeat interval 0), so re-arm it after each sample.
    signal.setitimer(signal.ITIMER_VIRTUAL, self.interval, 0)

  @contextlib.contextmanager
  def profile(self):
    """Context manager that samples the stack while its body runs."""
    signal.signal(signal.SIGVTALRM, self._sample)
    signal.setitimer(signal.ITIMER_VIRTUAL, self.interval, 0)
    try:
      yield
    finally:
      # A zero delay disarms the timer.
      signal.setitimer(signal.ITIMER_VIRTUAL, 0)

  def save(self, fname):
    """Writes every sample to `fname`, one entry per line, blank-line separated."""
    with open(fname, 'w') as out:
      for sample in self.stacks:
        out.writelines('%s\n' % entry for entry in sample)
        out.write('\n')
@contextlib.contextmanager
def callstack_sampling(filename, interval=None):
  """Periodically samples the Python callstack.

  The collected samples are written to `filename` once the managed block has
  finished without raising.

  Args:
    filename: the filename
    interval: the sampling interval, in seconds. Defaults to 0.001.

  Yields:
    nothing
  """
  sampler = CallstackSampler(interval=interval)
  profiling = sampler.profile()
  with profiling:
    yield
  sampler.save(filename)
@@ -0,0 +1,338 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Helper functions for running models in a distributed setting."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import os
import random
import string
import tensorflow.compat.v2 as tf
from official.utils.misc import tpu_lib
def _collective_communication(all_reduce_alg):
  """Return a CollectiveCommunication based on all_reduce_alg.

  Args:
    all_reduce_alg: a string specifying which collective communication to pick,
      or None.

  Returns:
    tf.distribute.experimental.CollectiveCommunication object

  Raises:
    ValueError: if `all_reduce_alg` not in [None, 'ring', 'nccl']
  """
  communication = tf.distribute.experimental.CollectiveCommunication
  options_by_alg = {
      None: communication.AUTO,
      "ring": communication.RING,
      "nccl": communication.NCCL
  }
  if all_reduce_alg in options_by_alg:
    return options_by_alg[all_reduce_alg]

  raise ValueError(
      "When used with `multi_worker_mirrored`, valid values for "
      "all_reduce_alg are ['ring', 'nccl'].  Supplied value: {}".format(
          all_reduce_alg))
def _mirrored_cross_device_ops(all_reduce_alg, num_packs):
  """Return a CrossDeviceOps based on all_reduce_alg and num_packs.

  Args:
    all_reduce_alg: a string specifying which cross device op to pick, or None.
    num_packs: an integer specifying number of packs for the cross device op.

  Returns:
    tf.distribute.CrossDeviceOps object or None.

  Raises:
    ValueError: if `all_reduce_alg` not in [None, 'nccl', 'hierarchical_copy'].
  """
  if all_reduce_alg is None:
    return None

  ops_class_by_alg = {
      "nccl": tf.distribute.NcclAllReduce,
      "hierarchical_copy": tf.distribute.HierarchicalCopyAllReduce
  }
  if all_reduce_alg in ops_class_by_alg:
    return ops_class_by_alg[all_reduce_alg](num_packs=num_packs)

  raise ValueError(
      "When used with `mirrored`, valid values for all_reduce_alg are "
      "['nccl', 'hierarchical_copy'].  Supplied value: {}".format(
          all_reduce_alg))
def get_distribution_strategy(distribution_strategy="mirrored",
                              num_gpus=0,
                              all_reduce_alg=None,
                              num_packs=1,
                              tpu_address=None):
  """Return a DistributionStrategy for running the model.

  Args:
    distribution_strategy: a string specifying which distribution strategy to
      use. Accepted values are 'off', 'one_device', 'mirrored',
      'parameter_server', 'multi_worker_mirrored', and 'tpu' -- case
      insensitive. 'off' means not to use Distribution Strategy; 'tpu' means to
      use TPUStrategy using `tpu_address`.
    num_gpus: Number of GPUs to run this model.
    all_reduce_alg: Optional. Specifies which algorithm to use when performing
      all-reduce. For `MirroredStrategy`, valid values are "nccl" and
      "hierarchical_copy". For `MultiWorkerMirroredStrategy`, valid values are
      "ring" and "nccl".  If None, DistributionStrategy will choose based on
      device topology.
    num_packs: Optional.  Sets the `num_packs` in `tf.distribute.NcclAllReduce`
      or `tf.distribute.HierarchicalCopyAllReduce` for `MirroredStrategy`.
    tpu_address: Optional. String that represents TPU to connect to. It is
      passed unchanged to `tpu_lib.tpu_initialize` when `distribution_strategy`
      is `tpu`.

  Returns:
    tf.distribute.DistributionStrategy object, or None for 'off'.

  Raises:
    ValueError: if `num_gpus` is negative; if `distribution_strategy` is 'off'
      or 'one_device' and `num_gpus` is larger than 1; if `all_reduce_alg` is
      not valid for the chosen strategy; or if `distribution_strategy` is not
      one of the accepted values.
  """
  if num_gpus < 0:
    raise ValueError("`num_gpus` can not be negative.")

  strategy_name = distribution_strategy.lower()

  if strategy_name == "off":
    if num_gpus > 1:
      raise ValueError(
          "When {} GPUs are specified, distribution_strategy "
          "flag cannot be set to 'off'.".format(num_gpus))
    return None

  if strategy_name == "tpu":
    # When tpu_address is an empty string, we communicate with local TPUs.
    cluster_resolver = tpu_lib.tpu_initialize(tpu_address)
    return tf.distribute.experimental.TPUStrategy(cluster_resolver)

  if strategy_name == "multi_worker_mirrored":
    communication = _collective_communication(all_reduce_alg)
    return tf.distribute.experimental.MultiWorkerMirroredStrategy(
        communication=communication)

  if strategy_name == "one_device":
    if num_gpus > 1:
      raise ValueError("`OneDeviceStrategy` can not be used for more than "
                       "one device.")
    device = "device:CPU:0" if num_gpus == 0 else "device:GPU:0"
    return tf.distribute.OneDeviceStrategy(device)

  if strategy_name == "mirrored":
    # Without GPUs the strategy mirrors onto the single CPU device.
    if num_gpus == 0:
      devices = ["device:CPU:0"]
    else:
      devices = ["device:GPU:%d" % gpu_index for gpu_index in range(num_gpus)]
    cross_device_ops = _mirrored_cross_device_ops(all_reduce_alg, num_packs)
    return tf.distribute.MirroredStrategy(
        devices=devices, cross_device_ops=cross_device_ops)

  if strategy_name == "parameter_server":
    return tf.distribute.experimental.ParameterServerStrategy()

  raise ValueError(
      "Unrecognized Distribution Strategy: %r" % strategy_name)
def per_replica_batch_size(batch_size, num_gpus):
  """For multi-gpu, batch-size must be a multiple of the number of GPUs.

  Note that distribution strategy handles this automatically when used with
  Keras. For using with Estimator, we need to get per GPU batch.

  Args:
    batch_size: Global batch size to be divided among devices. This should be
      equal to num_gpus times the single-GPU batch_size for multi-gpu training.
    num_gpus: How many GPUs are used with DistributionStrategies.

  Returns:
    Batch size per device; `batch_size` unchanged when `num_gpus` <= 1.

  Raises:
    ValueError: if batch_size is not divisible by number of devices
  """
  if num_gpus <= 1:
    return batch_size

  leftover = batch_size % num_gpus
  if not leftover:
    return int(batch_size / num_gpus)

  # Suggest the largest batch size below the request that divides evenly.
  raise ValueError(
      ('When running with multiple GPUs, batch size '
       'must be a multiple of the number of available GPUs. Found {} '
       'GPUs with a batch size of {}; try --batch_size={} instead.'
      ).format(num_gpus, batch_size, batch_size - leftover))
# The `SyntheticDataset` is a temporary solution for generating synthetic data
# directly on devices. It is only useful for Keras with Distribution
# Strategies. We will have better support in `tf.data` or Distribution Strategy
# later.
class SyntheticDataset(object):
  """A dataset that generates synthetic data on each device."""

  def __init__(self, dataset, split_by=1):
    """Builds local variables holding the first element of `dataset`.

    Args:
      dataset: dataset whose first element is used as the synthetic data.
      split_by: number of equal pieces each tensor is split into along axis 0;
        only the first piece is kept.
    """
    # dataset.take(1) doesn't have GPU kernel.
    with tf.device('device:CPU:0'):
      tensor = tf.data.experimental.get_single_element(dataset.take(1))

    variable_data = []
    initializers = []
    for component in tf.nest.flatten(tensor):
      rebatched = tf.split(component, num_or_size_splits=split_by, axis=0)[0]
      assert rebatched.shape.is_fully_defined(), rebatched.shape
      variable = tf.compat.v1.get_local_variable(self._random_name(),
                                                 initializer=rebatched)
      variable_data.append(variable)
      initializers.append(variable.initializer)

    # Restore the original nesting, with variables in place of the tensors.
    input_data = tf.nest.pack_sequence_as(tensor, variable_data)
    self._iterator = SyntheticIterator(input_data, initializers)

  def _random_name(self, size=10, chars=string.ascii_uppercase + string.digits):
    """Returns a random string of `size` characters drawn from `chars`."""
    picked = [random.choice(chars) for _ in range(size)]
    return ''.join(picked)

  def __iter__(self):
    return self._iterator

  def make_one_shot_iterator(self):
    """Returns the shared iterator created in the constructor."""
    return self._iterator

  def make_initializable_iterator(self):
    """Returns the shared iterator created in the constructor."""
    return self._iterator
class SyntheticIterator(object):
  """An iterator that returns the same synthetic data on every step."""

  def __init__(self, input_data, initializers):
    self._input_data, self._initializers = input_data, initializers

  def get_next(self):
    """Returns the stored input data."""
    return self._input_data

  def next(self):
    """Python 2 iterator protocol; delegates to `__next__`."""
    return self.__next__()

  def __next__(self):
    try:
      element = self.get_next()
    except tf.errors.OutOfRangeError:
      raise StopIteration
    return element

  def initialize(self):
    """Returns a no-op when executing eagerly, else the stored initializers."""
    return tf.no_op() if tf.executing_eagerly() else self._initializers
def _monkey_patch_dataset_method(strategy):
"""Monkey-patch `strategy`'s `make_dataset_iterator` method."""
def make_dataset(self, dataset):
tf.compat.v1.logging.info('Using pure synthetic data.')
with self.scope():
if self.extended._global_batch_size: # pylint: disable=protected-access
return SyntheticDataset(dataset, self.num_replicas_in_sync)
else:
return SyntheticDataset(dataset)
def make_iterator(self, dataset):
dist_dataset = make_dataset(self, dataset)
return iter(dist_dataset)
strategy.orig_make_dataset_iterator = strategy.make_dataset_iterator
strategy.make_dataset_iterator = make_iterator
strategy.orig_distribute_dataset = strategy.experimental_distribute_dataset
strategy.experimental_distribute_dataset = make_dataset
def _undo_monkey_patch_dataset_method(strategy):
if hasattr(strategy, 'orig_make_dataset_iterator'):
strategy.make_dataset_iterator = strategy.orig_make_dataset_iterator
if hasattr(strategy, 'orig_distribute_dataset'):
strategy.make_dataset_iterator = strategy.orig_distribute_dataset
def set_up_synthetic_data():
  """Patches three tf.distribute strategy classes to serve synthetic data."""
  distribute = tf.distribute
  _monkey_patch_dataset_method(distribute.OneDeviceStrategy)
  _monkey_patch_dataset_method(distribute.MirroredStrategy)
  _monkey_patch_dataset_method(
      distribute.experimental.MultiWorkerMirroredStrategy)
def undo_set_up_synthetic_data():
  """Reverts the synthetic-data patch on the three strategy classes."""
  distribute = tf.distribute
  _undo_monkey_patch_dataset_method(distribute.OneDeviceStrategy)
  _undo_monkey_patch_dataset_method(distribute.MirroredStrategy)
  _undo_monkey_patch_dataset_method(
      distribute.experimental.MultiWorkerMirroredStrategy)
def configure_cluster(worker_hosts=None, task_index=-1):
  """Set multi-worker cluster spec in TF_CONFIG environment variable.

  An existing, non-empty TF_CONFIG takes precedence: it is left untouched and
  only used to count the workers. Otherwise TF_CONFIG is written from
  `worker_hosts`, if given.

  Args:
    worker_hosts: comma-separated list of worker ip:port pairs.
    task_index: index of this worker in `worker_hosts`; forced to 0 when there
      is a single worker.

  Returns:
    Number of workers in the cluster.

  Raises:
    ValueError: if there is more than one worker and `task_index` is negative.
  """
  tf_config = json.loads(os.environ.get('TF_CONFIG', '{}'))
  if tf_config:
    cluster = tf_config['cluster']
    return len(cluster.get('chief', [])) + len(cluster.get('worker', []))

  if not worker_hosts:
    return 1

  workers = worker_hosts.split(',')
  num_workers = len(workers)
  if num_workers > 1 and task_index < 0:
    raise ValueError('Must specify task_index when number of workers > 1')
  if num_workers == 1:
    task_index = 0
  os.environ['TF_CONFIG'] = json.dumps({
      'cluster': {
          'worker': workers
      },
      'task': {'type': 'worker', 'index': task_index}
  })
  return num_workers
def get_strategy_scope(strategy):
  """Returns `strategy.scope()`, or a do-nothing context for a falsy strategy."""
  if not strategy:
    return DummyContextManager()
  return strategy.scope()
class DummyContextManager(object):
  """Context manager that does nothing on entry or exit."""

  def __enter__(self):
    return None

  def __exit__(self, *args):
    # Returning None lets any exception from the body propagate.
    return None
@@ -0,0 +1,65 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
""" Tests for distribution util functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow.compat.v2 as tf
from official.utils.misc import distribution_utils
class GetDistributionStrategyTest(tf.test.TestCase):
  """Tests for get_distribution_strategy."""

  def test_one_device_strategy_cpu(self):
    """With num_gpus=0 the default strategy has one replica on a CPU device."""
    ds = distribution_utils.get_distribution_strategy(num_gpus=0)
    # assertEqual replaces the deprecated assertEquals alias, which newer
    # Python versions no longer provide.
    self.assertEqual(ds.num_replicas_in_sync, 1)
    self.assertEqual(len(ds.extended.worker_devices), 1)
    self.assertIn('CPU', ds.extended.worker_devices[0])

  def test_one_device_strategy_gpu(self):
    """With num_gpus=1 the default strategy has one replica on a GPU device."""
    ds = distribution_utils.get_distribution_strategy(num_gpus=1)
    self.assertEqual(ds.num_replicas_in_sync, 1)
    self.assertEqual(len(ds.extended.worker_devices), 1)
    self.assertIn('GPU', ds.extended.worker_devices[0])

  def test_mirrored_strategy(self):
    """With num_gpus=5 the default strategy has five GPU replicas."""
    ds = distribution_utils.get_distribution_strategy(num_gpus=5)
    self.assertEqual(ds.num_replicas_in_sync, 5)
    self.assertEqual(len(ds.extended.worker_devices), 5)
    for device in ds.extended.worker_devices:
      self.assertIn('GPU', device)
class PerReplicaBatchSizeTest(tf.test.TestCase):
  """Tests for per_replica_batch_size."""

  def test_batch_size(self):
    """Batch size is unchanged for 0 or 1 GPUs and divided evenly otherwise."""
    # assertEqual replaces the deprecated assertEquals alias, which newer
    # Python versions no longer provide.
    self.assertEqual(
        distribution_utils.per_replica_batch_size(147, num_gpus=0), 147)
    self.assertEqual(
        distribution_utils.per_replica_batch_size(147, num_gpus=1), 147)
    self.assertEqual(
        distribution_utils.per_replica_batch_size(147, num_gpus=7), 21)

  def test_batch_size_with_remainder(self):
    """A batch size that is not a multiple of num_gpus raises ValueError."""
    with self.assertRaises(ValueError):
      distribution_utils.per_replica_batch_size(147, num_gpus=5)
# Run this module's test cases when it is executed as a script.
if __name__ == "__main__":
tf.test.main()
@@ -0,0 +1,262 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Helper functions for the Keras implementations of models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import multiprocessing
import os
import time
from absl import logging
import tensorflow.compat.v2 as tf
from tensorflow.python import tf2
from tensorflow.python.profiler import profiler as profiler
class BatchTimestamp(object):
  """A structure to store batch time stamp."""

  def __init__(self, batch_index, timestamp):
    self.batch_index, self.timestamp = batch_index, timestamp

  def __repr__(self):
    template = "'BatchTimestamp<batch_index: {}, timestamp: {}>'"
    return template.format(self.batch_index, self.timestamp)
class TimeHistory(tf.keras.callbacks.Callback):
  """Callback for Keras models that records step timing and throughput."""

  def __init__(self, batch_size, log_steps, logdir=None):
    """Callback for logging performance.

    Args:
      batch_size: Total batch size.
      log_steps: Interval of steps between logging of batch level stats.
      logdir: Optional directory to write TensorBoard summaries.
    """
    # TODO(wcromar): remove this parameter and rely on `logs` parameter of
    # on_train_batch_end()
    self.batch_size = batch_size
    super(TimeHistory, self).__init__()
    self.log_steps = log_steps
    self.last_log_step = 0
    self.steps_before_epoch = 0
    self.steps_in_epoch = 0
    self.start_time = None

    self.summary_writer = (
        tf.summary.create_file_writer(logdir) if logdir else None)

    # Logs start of step 1 then end of each step based on log_steps interval.
    self.timestamp_log = []

    # Records the time each epoch takes to run from start to finish of epoch.
    self.epoch_runtime_log = []

  @property
  def global_steps(self):
    """The current 1-indexed global step."""
    return self.steps_before_epoch + self.steps_in_epoch

  @property
  def average_steps_per_second(self):
    """The average training steps per second across all epochs."""
    total_epoch_seconds = sum(self.epoch_runtime_log)
    return self.global_steps / total_epoch_seconds

  @property
  def average_examples_per_second(self):
    """The average number of training examples per second across all epochs."""
    return self.average_steps_per_second * self.batch_size

  def on_train_end(self, logs=None):
    """Records the finish time and flushes pending summaries, if any."""
    self.train_finish_time = time.time()

    if self.summary_writer:
      self.summary_writer.flush()

  def on_epoch_begin(self, epoch, logs=None):
    """Records the wall-clock start of the epoch."""
    self.epoch_start = time.time()

  def on_batch_begin(self, batch, logs=None):
    """Starts the timer of a logging interval if it is not already running."""
    if not self.start_time:
      self.start_time = time.time()

    # Record the timestamp of the first global step
    if not self.timestamp_log:
      first_stamp = BatchTimestamp(self.global_steps, self.start_time)
      self.timestamp_log.append(first_stamp)

  def on_batch_end(self, batch, logs=None):
    """Records elapse time of the batch and calculates examples per second."""
    self.steps_in_epoch = batch + 1
    current_step = self.global_steps
    steps_since_last_log = current_step - self.last_log_step
    if steps_since_last_log < self.log_steps:
      return

    now = time.time()
    elapsed_time = now - self.start_time
    steps_per_second = steps_since_last_log / elapsed_time
    examples_per_second = steps_per_second * self.batch_size

    self.timestamp_log.append(BatchTimestamp(current_step, now))
    logging.info(
        'TimeHistory: %.2f seconds, %.2f examples/second between steps %d '
        'and %d', elapsed_time, examples_per_second, self.last_log_step,
        current_step)

    if self.summary_writer:
      with self.summary_writer.as_default():
        tf.summary.scalar('global_step/sec', steps_per_second, current_step)
        tf.summary.scalar('examples/sec', examples_per_second, current_step)

    # Clearing start_time makes the next on_batch_begin restart the timer.
    self.last_log_step = current_step
    self.start_time = None

  def on_epoch_end(self, epoch, logs=None):
    """Logs the epoch runtime and folds its steps into the running total."""
    self.epoch_runtime_log.append(time.time() - self.epoch_start)

    self.steps_before_epoch += self.steps_in_epoch
    self.steps_in_epoch = 0
def get_profiler_callback(model_dir, profile_steps, enable_tensorboard,
                          steps_per_epoch):
  """Validate profile_steps flag value and return profiler callback.

  Args:
    model_dir: directory handed to the ProfilerCallback as its log directory.
    profile_steps: string of the form "<start>,<stop>".
    enable_tensorboard: if truthy, a warning about overlapping profiling with
      the TensorBoard callback is logged.
    steps_per_epoch: number of steps per epoch, passed to the ProfilerCallback.

  Returns:
    A ProfilerCallback for the requested step range.

  Raises:
    ValueError: if `profile_steps` is not exactly two comma separated integers
      with 0 <= start <= stop.
  """
  profile_steps_error_message = (
      'profile_steps must be a comma separated pair of positive integers, '
      'specifying the first and last steps to be profiled.'
  )
  try:
    step_bounds = list(map(int, profile_steps.split(',')))
  except ValueError:
    raise ValueError(profile_steps_error_message)
  if len(step_bounds) != 2:
    raise ValueError(profile_steps_error_message)

  start_step, stop_step = step_bounds
  if not 0 <= start_step <= stop_step:
    raise ValueError(profile_steps_error_message)

  if enable_tensorboard:
    logging.warning(
        'Both TensorBoard and profiler callbacks are used. Note that the '
        'TensorBoard callback profiles the 2nd step (unless otherwise '
        'specified). Please make sure the steps profiled by the two callbacks '
        'do not overlap.')
  return ProfilerCallback(model_dir, start_step, stop_step, steps_per_epoch)
class ProfilerCallback(tf.keras.callbacks.Callback):
  """Save profiles in specified step range to log directory."""

  def __init__(self, log_dir, start_step, stop_step, steps_per_epoch):
    """Stores the step range and splits it into epoch and in-epoch step.

    Args:
      log_dir: directory passed to `profiler.start`.
      start_step: global step at which profiling starts.
      stop_step: global step at which profiling stops.
      steps_per_epoch: number of steps per epoch, used for the split.
    """
    super(ProfilerCallback, self).__init__()
    self.log_dir = log_dir
    self.start_step = start_step
    self.stop_step = stop_step
    self.start_epoch, self.start_step_in_epoch = divmod(
        start_step, steps_per_epoch)
    self.stop_epoch, self.stop_step_in_epoch = divmod(
        stop_step, steps_per_epoch)
    self.should_start = False
    self.should_stop = False

  def on_epoch_begin(self, epoch, logs=None):
    """Arms the start/stop triggers when their epoch begins."""
    if epoch == self.start_epoch:
      self.should_start = True
    if epoch == self.stop_epoch:
      self.should_stop = True

  def on_batch_begin(self, batch, logs=None):
    """Starts the profiler at the configured batch of the start epoch."""
    if not (batch == self.start_step_in_epoch and self.should_start):
      return
    self.should_start = False
    profiler.start(self.log_dir)
    logging.info('Profiler started at Step %s', self.start_step)

  def on_batch_end(self, batch, logs=None):
    """Stops the profiler at the configured batch of the stop epoch."""
    if not (batch == self.stop_step_in_epoch and self.should_stop):
      return
    self.should_stop = False
    profiler.stop()
    logging.info('Profiler saved profiles for steps between %s and %s to %s',
                 self.start_step, self.stop_step, self.log_dir)
def set_session_config(enable_eager=False,
                       enable_xla=False):
  """Sets the session config.

  When `is_v2_0()` is true only the XLA setting is applied and `enable_eager`
  is ignored. Otherwise a v1 config is built and used either to enable eager
  execution or to install a new session as the Keras backend session.

  Args:
    enable_eager: whether to enable eager execution (v1 path only).
    enable_xla: whether to turn on XLA JIT compilation.
  """
  if is_v2_0():
    set_config_v2(enable_xla=enable_xla)
    return

  config = get_config_proto_v1(enable_xla=enable_xla)
  if enable_eager:
    tf.compat.v1.enable_eager_execution(config=config)
    return

  sess = tf.compat.v1.Session(config=config)
  tf.compat.v1.keras.backend.set_session(sess)
def get_config_proto_v1(enable_xla=False):
  """Return config proto according to flag settings, or None to use default."""
  if not enable_xla:
    return None

  config = tf.compat.v1.ConfigProto()
  optimizer_options = config.graph_options.optimizer_options
  optimizer_options.global_jit_level = tf.OptimizerOptions.ON_2
  return config
def set_config_v2(enable_xla=False):
  """Config eager context according to flag values using TF 2.0 API."""
  if not enable_xla:
    return
  tf.config.optimizer.set_jit(True)
def is_v2_0():
  """Returns true if using tf 2.0."""
  v2_enabled = tf2.enabled()
  return v2_enabled
def set_gpu_thread_mode_and_count(gpu_thread_mode,
                                  datasets_num_private_threads,
                                  num_gpus, per_gpu_thread_count):
  """Set GPU thread mode and count, and adjust dataset threads count.

  Exports TF_GPU_THREAD_MODE and TF_GPU_THREAD_COUNT to the environment.

  Args:
    gpu_thread_mode: value written to TF_GPU_THREAD_MODE.
    datasets_num_private_threads: requested dataset thread count; when falsy a
      value is derived from the CPU count and the GPU thread budget.
    num_gpus: number of GPUs in use.
    per_gpu_thread_count: threads per GPU; defaults to 2 when falsy.

  Returns:
    The dataset private thread count: the requested value when it is truthy,
    otherwise the derived one. Previously the derived value was only logged
    and then lost, so callers had no way to use it.
  """
  cpu_count = multiprocessing.cpu_count()
  logging.info('Logical CPU cores: %s', cpu_count)

  # Allocate private thread pool for each GPU to schedule and launch kernels
  per_gpu_thread_count = per_gpu_thread_count or 2
  os.environ['TF_GPU_THREAD_MODE'] = gpu_thread_mode
  os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count)
  logging.info('TF_GPU_THREAD_COUNT: %s',
               os.environ['TF_GPU_THREAD_COUNT'])
  logging.info('TF_GPU_THREAD_MODE: %s',
               os.environ['TF_GPU_THREAD_MODE'])

  # Limit data preprocessing threadpool to CPU cores minus number of total GPU
  # private threads and memory copy threads.
  total_gpu_thread_count = per_gpu_thread_count * num_gpus
  num_runtime_threads = num_gpus
  if not datasets_num_private_threads:
    datasets_num_private_threads = min(
        cpu_count - total_gpu_thread_count - num_runtime_threads,
        num_gpus * 8)
    logging.info('Set datasets_num_private_threads to %s',
                 datasets_num_private_threads)
  return datasets_num_private_threads
@@ -0,0 +1,93 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Miscellaneous functions that can be called by models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numbers
import tensorflow as tf
from tensorflow.python.util import nest
def past_stop_threshold(stop_threshold, eval_metric):
  """Return a boolean representing whether a model should be stopped.

  Args:
    stop_threshold: float, the threshold above which a model should stop
      training. None disables the check.
    eval_metric: float, the current value of the relevant metric to check.

  Returns:
    True if training should stop, False otherwise.

  Raises:
    ValueError: if either stop_threshold or eval_metric is not a number
  """
  if stop_threshold is None:
    return False

  checks = (
      (stop_threshold,
       "Threshold for checking stop conditions must be a number."),
      (eval_metric,
       "Eval metric being checked against stop conditions "
       "must be a number."),
  )
  for candidate, message in checks:
    if not isinstance(candidate, numbers.Number):
      raise ValueError(message)

  if not eval_metric >= stop_threshold:
    return False

  tf.compat.v1.logging.info(
      "Stop threshold of {} was passed with metric value {}.".format(
          stop_threshold, eval_metric))
  return True
def generate_synthetic_data(
    input_shape, input_value=0, input_dtype=None, label_shape=None,
    label_value=0, label_dtype=None):
  """Create a repeating dataset with constant values.

  Args:
    input_shape: a tf.TensorShape object or nested tf.TensorShapes. The shape of
      the input data.
    input_value: Value of each input element.
    input_dtype: Input dtype. If None, will be inferred by the input value.
    label_shape: a tf.TensorShape object or nested tf.TensorShapes. The shape of
      the label data.
    label_value: Value of each label element.
    label_dtype: Label dtype. If None, will be inferred by the label value.

  Returns:
    Dataset of tensors or tuples of tensors (if label_shape is set).
  """
  # TODO(kathywu): Replace with SyntheticDataset once it is in contrib.
  def _constant_structure(value, dtype, shapes):
    """Builds a constant tensor for every shape in the `shapes` structure."""
    return nest.map_structure(lambda s: tf.constant(value, dtype, s), shapes)

  element = _constant_structure(input_value, input_dtype, input_shape)
  if label_shape:
    labels = _constant_structure(label_value, label_dtype, label_shape)
    element = (element, labels)

  return tf.data.Dataset.from_tensors(element).repeat()
def apply_clean(flags_obj):
  """Deletes `flags_obj.model_dir` when `flags_obj.clean` is set and it exists."""
  if not flags_obj.clean:
    return
  if not tf.io.gfile.exists(flags_obj.model_dir):
    return
  tf.compat.v1.logging.info("--clean flag set. Removing existing model dir:"
                            " {}".format(flags_obj.model_dir))
  tf.io.gfile.rmtree(flags_obj.model_dir)
@@ -0,0 +1,127 @@
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Model Helper functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.utils.misc import keras_utils
from official.utils.misc import model_helpers
class PastStopThresholdTest(tf.test.TestCase):
  """Tests for past_stop_threshold."""

  def setUp(self):
    super(PastStopThresholdTest, self).setUp()
    # is_v2_0 must be called: the bare function object is always truthy, which
    # made this condition unconditionally true.
    if keras_utils.is_v2_0():
      tf.compat.v1.disable_eager_execution()

  def test_past_stop_threshold(self):
    """Tests for normal operating conditions."""
    self.assertTrue(model_helpers.past_stop_threshold(0.54, 1))
    self.assertTrue(model_helpers.past_stop_threshold(54, 100))
    self.assertFalse(model_helpers.past_stop_threshold(0.54, 0.1))
    self.assertFalse(model_helpers.past_stop_threshold(-0.54, -1.5))
    self.assertTrue(model_helpers.past_stop_threshold(-0.54, 0))
    self.assertTrue(model_helpers.past_stop_threshold(0, 0))
    self.assertTrue(model_helpers.past_stop_threshold(0.54, 0.54))

  def test_past_stop_threshold_none_false(self):
    """Tests that check None returns false."""
    self.assertFalse(model_helpers.past_stop_threshold(None, -1.5))
    self.assertFalse(model_helpers.past_stop_threshold(None, None))
    self.assertFalse(model_helpers.past_stop_threshold(None, 1.5))
    # Zero should be okay, though.
    self.assertTrue(model_helpers.past_stop_threshold(0, 1.5))

  def test_past_stop_threshold_not_number(self):
    """Tests for error conditions."""
    with self.assertRaises(ValueError):
      model_helpers.past_stop_threshold("str", 1)

    with self.assertRaises(ValueError):
      model_helpers.past_stop_threshold("str", tf.constant(5))

    with self.assertRaises(ValueError):
      model_helpers.past_stop_threshold("str", "another")

    with self.assertRaises(ValueError):
      model_helpers.past_stop_threshold(0, None)

    with self.assertRaises(ValueError):
      model_helpers.past_stop_threshold(0.7, "str")

    with self.assertRaises(ValueError):
      model_helpers.past_stop_threshold(tf.constant(4), None)
class SyntheticDataTest(tf.test.TestCase):
  """Tests for generate_synthetic_data."""

  def test_generate_synethetic_data(self):
    """Checks repeated (input, label) elements carry the constant values."""
    input_element, label_element = tf.compat.v1.data.make_one_shot_iterator(
        model_helpers.generate_synthetic_data(input_shape=tf.TensorShape([5]),
                                              input_value=123,
                                              input_dtype=tf.float32,
                                              label_shape=tf.TensorShape([]),
                                              label_value=456,
                                              label_dtype=tf.int32)).get_next()

    with self.session() as sess:
      # Pull several elements to confirm every one holds the same values.
      for _ in range(5):
        inp, lab = sess.run((input_element, label_element))
        self.assertAllClose(inp, [123., 123., 123., 123., 123.])
        # assertEqual, not the deprecated assertEquals alias (removed in
        # Python 3.12).
        self.assertEqual(lab, 456)

  def test_generate_only_input_data(self):
    """Checks that omitting the label arguments yields a bare element."""
    d = model_helpers.generate_synthetic_data(
        input_shape=tf.TensorShape([4]),
        input_value=43.5,
        input_dtype=tf.float32)

    element = tf.compat.v1.data.make_one_shot_iterator(d).get_next()
    self.assertFalse(isinstance(element, tuple))

    with self.session() as sess:
      inp = sess.run(element)
      self.assertAllClose(inp, [43.5, 43.5, 43.5, 43.5])

  def test_generate_nested_data(self):
    """Checks that a nested dict of shapes yields the same nested structure."""
    d = model_helpers.generate_synthetic_data(
        input_shape={'a': tf.TensorShape([2]),
                     'b': {'c': tf.TensorShape([3]), 'd': tf.TensorShape([])}},
        input_value=1.1)

    element = tf.compat.v1.data.make_one_shot_iterator(d).get_next()
    self.assertIn('a', element)
    self.assertIn('b', element)
    self.assertEqual(len(element['b']), 2)
    self.assertIn('c', element['b'])
    self.assertIn('d', element['b'])
    # Nested keys must not leak into the top level.
    self.assertNotIn('c', element)

    with self.session() as sess:
      inp = sess.run(element)
      self.assertAllClose(inp['a'], [1.1, 1.1])
      self.assertAllClose(inp['b']['c'], [1.1, 1.1, 1.1])
      self.assertAllClose(inp['b']['d'], 1.1)
# Run this module's test cases via the TensorFlow test runner when the file is
# executed directly as a script.
if __name__ == "__main__":
  tf.test.main()
@@ -0,0 +1,34 @@
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Initializes TPU system for TF 2.0."""
import tensorflow as tf
def tpu_initialize(tpu_address):
  """Initializes the TPU system for TF 2.0 training.

  Args:
    tpu_address: string, bns address of master TPU worker.

  Returns:
    A TPUClusterResolver.
  """
  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=tpu_address)

  # The '' and 'local' addresses skip the explicit cluster connection; every
  # other address is connected before the TPU system is initialized.
  needs_cluster_connection = tpu_address not in ('', 'local')
  if needs_cluster_connection:
    tf.config.experimental_connect_to_cluster(resolver)

  tf.tpu.experimental.initialize_tpu_system(resolver)
  return resolver
@@ -0,0 +1,83 @@
# Lint as: python3
"""Utils to annotate and trace benchmarks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import flags
from absl import logging
from absl.testing import flagsaver
# Module-level handle to the absl flag registry.
FLAGS = flags.FLAGS

# Repeatable string flag (default None) holding `key=value` overrides. Each
# entry is applied to FLAGS by `enable_runtime_flags` for the duration of the
# decorated call.
flags.DEFINE_multi_string(
'benchmark_method_flags', None,
'Optional list of runtime flags of the form key=value. Specify '
'multiple times to specify different flags. These will override the FLAGS '
'object directly after hardcoded settings in individual benchmark methods '
'before they call _run_and_report benchmark. Example if we set '
'--benchmark_method_flags=train_steps=10 and a benchmark method hardcodes '
'FLAGS.train_steps=10000 and later calls _run_and_report_benchmark, '
'it\'ll only run for 10 steps. This is useful for '
'debugging/profiling workflows.')
def _parse_benchmark_flag_value(value):
  """Converts a raw override string to an int or float, else returns it as is.

  Integral values (including float spellings such as '5.0' or '1e3') become
  ints, other finite numbers become floats, and everything else -- including
  'nan' and 'inf' -- is returned unchanged as a string.

  Args:
    value: The text to the right of '=' in a --benchmark_method_flags entry.

  Returns:
    An int, a float, or the original string.
  """
  try:
    # Parse integers directly so values beyond 2**53 are not rounded by a
    # detour through float.
    return int(value)
  except ValueError:
    pass
  try:
    numeric_float = float(value)
    numeric_int = int(numeric_float)
  except (ValueError, OverflowError):
    # Not a number, or non-finite: int(nan) raises ValueError and int(inf)
    # raises OverflowError. Keep the raw string in both cases.
    return value
  if numeric_int == numeric_float:
    return numeric_int
  return numeric_float


def enable_runtime_flags(decorated_func):
  """Sets attributes from --benchmark_method_flags for method execution.

  @enable_runtime_flags decorator temporarily adds flags passed in via
  --benchmark_method_flags and runs the decorated function in that context.

  A user can set --benchmark_method_flags=train_steps=5 to run the benchmark
  method in the snippet below with FLAGS.train_steps=5 for debugging (without
  modifying the benchmark code).

  class ModelBenchmark():

    @benchmark_wrappers.enable_runtime_flags
    def _run_and_report_benchmark(self):
      # run benchmark ...
      # report benchmark results ...

    def benchmark_method(self):
      FLAGS.train_steps = 1000
      ...
      self._run_and_report_benchmark()

  Args:
    decorated_func: The method that runs the benchmark after previous setup
      execution that set some flags.

  Returns:
    new_func: The same method which executes in a temporary context where flag
      overrides from --benchmark_method_flags are active.

  Raises:
    ValueError: When the returned function is called and an entry of
      --benchmark_method_flags is not of the form key=value.
  """
  # Imported locally so this block does not depend on a file-level import.
  import functools  # pylint: disable=g-import-not-at-top

  @functools.wraps(decorated_func)
  def runner(*args, **kwargs):
    """Creates a temporary context to activate --benchmark_method_flags."""
    overrides = FLAGS.benchmark_method_flags
    if not overrides:
      return decorated_func(*args, **kwargs)

    saved_flag_values = flagsaver.save_flag_values()
    try:
      # Overrides are applied inside the try block so that a failure part-way
      # through (malformed entry, rejected value) still restores the flags
      # that were already changed.
      for key_value in overrides:
        key, separator, value = key_value.partition('=')
        if not separator:
          raise ValueError(
              'Invalid --benchmark_method_flags entry %r: expected key=value.'
              % (key_value,))
        flag_value = _parse_benchmark_flag_value(value)
        logging.info('Setting --%s=%s', key, flag_value)
        setattr(FLAGS, key, flag_value)
      return decorated_func(*args, **kwargs)
    finally:
      flagsaver.restore_flag_values(saved_flag_values)

  return runner

Some files were not shown because too many files have changed in this diff Show More