[add]上传训练benchmark by z00560161

2020-10-19 20:22:23 +08:00
parent 22b83024f5
commit 82522e2f61
1225 changed files with 345421 additions and 0 deletions
@@ -0,0 +1,89 @@
+#  SSD-Resnet34 TensorFlow训练说明
+
+### 1. 运行环境
+Python版本: 3.7.5
+主要python三方库:
+- tensorflow >= 1.15.0 (satisfied with NPU)
+
+
+### 2. 参数配置
+在train/yaml/SSD-Resnet34.yaml中修改相应配置， 配置项含义:
+
+```
+tensorflow_config: tensorflow框架下ssd-resnet34的配置项
+
+train_batch_size: 训练时设置的batch size大小
+training_file_pattern: 数据集中训练数据集文件标签类型， 数据集中有该类型的文件夹
+resnet_checkpoint: ckpt路径
+validation_file_pattern: 数据集中验证数据文件标签类型， 数据集中有该类型的文件夹
+val_json_file: 数据集中验证数据json文件
+eval_batch_size: 评测时设置的batch size大小
+num_epochs: epochs数量
+model_dir: 存放模型graph等数据的路径
+max_steps: 最大步数
+runmode: 运行模式 边训练边评测、只训练、只评测
+device_group_1p: 跑1p时的device_id
+device_group_2p: 跑2p时的device_id
+device_group_4p: 跑4p时的device_id
+mpirun_ip: 仅集群场景时需要配置, 格式ip1:卡数量1,ip2:卡数量2
+docker_image: docker镜像名称:版本号
+```
+
+
+SSD-Resnet34.yaml中配置项示例：
+```
+tensorflow_config:
+
+    train_batch_size: 32
+    training_file_pattern: /home/data/raw_data/tfrecord/train2017*
+    resnet_checkpoint: /home/data/raw_data/resnet34_pretrain_model/model.ckpt-28152
+    validation_file_pattern: /home/data/raw_data/tfrecord/val2017*
+    val_json_file: /home/data/raw_data/annotations/instances_val2017.json
+    eval_batch_size: 32
+    num_epochs: 1
+    model_dir: result_npu
+    max_steps: 432000
+    runmode: train_and_eval
+    device_group_1p: 0
+    device_group_2p: 0 1
+    device_group_4p: 0 1 2 3
+    mpirun_ip: 90.90.176.152:8,90.90.176.154:8
+    docker_image: mpirun3:latest
+
+```
+SSD-Resnet34.yaml中配置注意事项：
+    当ssd-resnet34在docker侧进行训练时，resnet_checkpoint、validation_file_pattern和val_json_file的路径都必须规划在training_file_pattern字段路径中的raw_data下，因配置路径较多，脚本中统一只对training_file_pattern字段路径中的raw_data下文件做映射
+
+### 3. 启动训练脚本
+
+#### 3.1 训练脚本启动
+当前路径为benchmark包的train文件夹下
+```
+bash benchmark.sh -e SSD-Resnet34 -hw 1p              # host侧1p
+bash benchmark.sh -e SSD-Resnet34 -hw 8p              # host侧8p
+bash benchmark.sh -e SSD-Resnet34 -hw 1p -docker      # docker侧1p
+bash benchmark.sh -e SSD-Resnet34 -hw 8p -docker      # docker侧8p
+bash benchmark.sh -e SSD-Resnet34 -ct                 # host侧集群
+bash benchmark.sh -e SSD-Resnet34 -ct -docker         # docker侧集群
+```
+
+#### 3.2 训练日志
+日志位于benchmark包train路径下result目录中的ssd-resnet34文件夹里。
+```
+./result/tf_ssd-resnet34/TrainingJob-2020xxxxxxxxxx/train_${device_id}.log
+./result/tf_ssd-resnet34/TrainingJob-2020xxxxxxxxxx/device_id/hw_ssd-resnet34.log
+```
+
+### 4. 模型评测
+将train/yaml/SSD-Resnet34.yaml中resnet_checkpoint的值改为训练产生的日志的路径， runmode的值改为evaluate，如2中示例；
+然后运行与训练时相同的脚本，结果参见train.log。
+
+
+### 5. 训练结果参考
+
+1p: 600
+4p: 2000
+8p: 4000
+
+
+
@@ -0,0 +1,281 @@
+# Copyright 2018 Google. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""COCO-style evaluation metrics.
+
+Implements the interface of COCO API and metric_fn in tf.TPUEstimator.
+
+COCO API: github.com/cocodataset/cocoapi/
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import atexit
+import tempfile
+import time
+
+from absl import flags
+
+import numpy as np
+from pycocotools.coco import COCO
+from pycocotools.cocoeval import COCOeval
+import six
+
+#COCO = coco.COCO
+#COCOeval = coco.COCOeval
+
+import tensorflow as tf
+
+import ssd_constants
+
+# Shared absl flag registry; `compute_map` reads `FLAGS.val_json_file` from it.
+FLAGS = flags.FLAGS
+
+
+# https://github.com/cocodataset/cocoapi/issues/49
+# Under Python 3, define `unicode` on the pycocotools.coco module as an alias
+# of `str` (presumably because that module still refers to the Python 2
+# builtin -- see the issue linked above).
+if six.PY3:
+  import pycocotools.coco
+  pycocotools.coco.unicode = str
+
+
+def create_coco(val_json_file, use_cpp_extension=True):
+  """Creates Microsoft COCO helper class object and return it."""
+  if val_json_file.startswith('gs://'):
+    _, local_val_json = tempfile.mkstemp(suffix='.json')
+    tf.gfile.Remove(local_val_json)
+
+    tf.gfile.Copy(val_json_file, local_val_json)
+    atexit.register(tf.gfile.Remove, local_val_json)
+  else:
+    local_val_json = val_json_file
+
+  if use_cpp_extension:
+    coco_gt = coco.COCO(local_val_json, False)
+  else:
+    coco_gt = COCO(local_val_json)
+  return coco_gt
+
+
+def compute_map(labels_and_predictions,
+                coco_gt,
+                use_cpp_extension=True,
+                nms_on_tpu=True):
+  """Use model predictions to compute mAP.
+
+  The evaluation code is largely copied from the MLPerf reference
+  implementation. While it is possible to write the evaluation as a tensor
+  metric and use Estimator.evaluate(), this approach was selected for simplicity
+  and ease of duck testing.
+
+  Args:
+    labels_and_predictions: iterable of model outputs. When `nms_on_tpu` is
+      True it is iterated two levels deep and flattened into rows of 7 values;
+      otherwise each element is a dict holding the raw shape, source id,
+      'pred_box', 'pred_scores' and 'indices' of one example.
+    coco_gt: ground truth COCO object. If None, it is created from
+      FLAGS.val_json_file.
+    use_cpp_extension: use cocoeval C++ library.
+    nms_on_tpu: if True the predictions are taken as already post-processed;
+      if False, NMS is run here through `decode_single`.
+  Returns:
+    A dict mapping 'COCO/<metric name>' to the corresponding COCO statistic.
+  """
+
+  predictions = []
+  tic = time.time()
+
+  if nms_on_tpu:
+    # Flatten the nested predictions and view them as rows of 7 values.
+    p = []
+    for i in labels_and_predictions:
+      for j in i:
+        p.append(np.array(j, dtype=np.float32))
+    predictions = np.concatenate(list(p)).reshape((-1, 7))
+  else:
+    # k counts the non-padded examples processed so far; it is only printed.
+    k = 0
+    for example in labels_and_predictions:
+      # Skip examples explicitly marked as padding.
+      if ssd_constants.IS_PADDED in example and example[
+          ssd_constants.IS_PADDED]:
+        continue
+      print(k)
+      k += 1
+      htot, wtot, _ = example[ssd_constants.RAW_SHAPE]
+      pred_box = example['pred_box']
+      pred_scores = example['pred_scores']
+      indices = example['indices']
+      loc, label, prob = decode_single(
+          pred_box, pred_scores, indices, ssd_constants.OVERLAP_CRITERIA,
+          ssd_constants.MAX_NUM_EVAL_BOXES, ssd_constants.MAX_NUM_EVAL_BOXES)
+
+      # Each row is [source id, x, y, width, height, score, category id], with
+      # the box scaled by the raw image width/height.
+      for loc_, label_, prob_ in zip(loc, label, prob):
+        # Ordering convention differs, hence [1], [0] rather than [0], [1]
+        predictions.append([
+            int(example[ssd_constants.SOURCE_ID]),
+            loc_[1] * wtot, loc_[0] * htot, (loc_[3] - loc_[1]) * wtot,
+            (loc_[2] - loc_[0]) * htot, prob_,
+            ssd_constants.CLASS_INV_MAP[label_]
+        ])
+
+  toc = time.time()
+  tf.logging.info('Prepare predictions DONE (t={:0.2f}s).'.format(toc - tic))
+
+  if coco_gt is None:
+    coco_gt = create_coco(
+        FLAGS.val_json_file, use_cpp_extension=use_cpp_extension)
+
+  if use_cpp_extension:
+    # NOTE(review): this branch uses a different API surface (LoadRes,
+    # iou_type=, Evaluate/Accumulate/Summarize/GetStats) than the pycocotools
+    # branch below, yet `COCOeval` in this file is imported from pycocotools.
+    # Confirm which evaluator class is intended for the C++ extension path.
+    coco_dt = coco_gt.LoadRes(np.array(predictions, dtype=np.float32))
+    coco_eval = COCOeval(coco_gt, coco_dt, iou_type='bbox')
+    coco_eval.Evaluate()
+    coco_eval.Accumulate()
+    coco_eval.Summarize()
+    stats = coco_eval.GetStats()
+
+  else:
+    coco_dt = coco_gt.loadRes(np.array(predictions))
+
+    coco_eval = COCOeval(coco_gt, coco_dt, iouType='bbox')
+    coco_eval.evaluate()
+    coco_eval.accumulate()
+    coco_eval.summarize()
+    stats = coco_eval.stats
+
+  print('Current AP: {:.5f}'.format(stats[0]))
+  # Names are paired positionally with `stats` in the returned dict.
+  metric_names = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1',
+                  'ARmax10', 'ARmax100', 'ARs', 'ARm', 'ARl']
+  coco_time = time.time()
+  tf.logging.info('COCO eval DONE (t={:0.2f}s).'.format(coco_time - toc))
+
+  # Prefix with "COCO" to group in TensorBoard.
+  return {'COCO/' + key: value for key, value in zip(metric_names, stats)}
+
+
+def calc_iou(target, candidates):
+  target_tiled = np.tile(target[np.newaxis, :], (candidates.shape[0], 1))
+  # Left Top & Right Bottom
+  lt = np.maximum(target_tiled[:,:2], candidates[:,:2])
+
+  rb = np.minimum(target_tiled[:,2:], candidates[:,2:])
+
+  delta = np.maximum(rb - lt, 0)
+
+  intersect = delta[:,0] * delta[:,1]
+
+  delta1 = target_tiled[:, 2:] - target_tiled[:, :2]
+  area1 = delta1[:,0] * delta1[:,1]
+  delta2 = candidates[:, 2:] - candidates[:, :2]
+  area2 = delta2[:,0] * delta2[:,1]
+
+  iou = intersect/(area1 + area2 - intersect)
+  return iou
+
+
+def decode_single(bboxes_in,
+                  scores_in,
+                  indices,
+                  criteria,
+                  max_output,
+                  max_num=200):
+  """Implement Non-maximum suppression.
+
+    Reference to https://github.com/amdegroot/ssd.pytorch
+
+  Args:
+    bboxes_in: a Tensor with shape [N, 4], which stacks box regression outputs
+      on all feature levels. The N is the number of total anchors on all levels.
+    scores_in: a Tensor with shape [ssd_constants.MAX_NUM_EVAL_BOXES,
+      num_classes]. The top ssd_constants.MAX_NUM_EVAL_BOXES box scores for each
+      class.
+    indices: a Tensor with shape [ssd_constants.MAX_NUM_EVAL_BOXES,
+      num_classes]. The indices for these top boxes for each class.
+    criteria: a float number to specify the threshold of NMS.
+    max_output: maximum output length.
+    max_num: maximum number of boxes before NMS.
+
+  Returns:
+    boxes, labels and scores after NMS.
+  """
+
+  bboxes_out = []
+  scores_out = []
+  labels_out = []
+
+  for i, score in enumerate(np.split(scores_in, scores_in.shape[1], 1)):
+    class_indices = indices[:, i]
+    bboxes = bboxes_in[class_indices, :]
+    score = np.squeeze(score, 1)
+
+    # skip background
+    if i == 0:
+      continue
+
+    mask = score > ssd_constants.MIN_SCORE
+    if not np.any(mask):
+      continue
+
+    bboxes, score = bboxes[mask, :], score[mask]
+
+    # remain_list = []
+    # for r in range(bboxes.shape[0]):
+    #   if bboxes[r, 0] < 0 or bboxes[r, 1] < 0 or bboxes[r, 2] < 0 or bboxes[r, 3] < 0 or bboxes[r, 0] >= bboxes[r, 2] or \
+    #           bboxes[r, 1] >= bboxes[r, 3]:
+    #     continue
+    #   remain_list.append(r)
+    # bboxes = bboxes[remain_list, :]
+    # score = score[remain_list]
+
+    remain_list = []
+    for r in range(bboxes.shape[0]):
+      for j in range(4):
+        if bboxes[r, j] < 0:
+          bboxes[r, j] = 0.00001
+      if bboxes[r, 0] >= bboxes[r, 2]:
+        bboxes[r, 2] = bboxes[r, 0] + 0.00001
+      if bboxes[r, 1] >= bboxes[r, 3]:
+        bboxes[r, 3] = bboxes[r, 1] + 0.00001
+      remain_list.append(r)
+    bboxes = bboxes[remain_list, :]
+    score = score[remain_list]
+
+
+    score_idx_sorted = np.argsort(score)
+    score_sorted = score[score_idx_sorted]
+
+    score_idx_sorted = score_idx_sorted[-max_num:]
+    candidates = []
+
+    # perform non-maximum suppression
+    while len(score_idx_sorted):
+      idx = score_idx_sorted[-1]
+      bboxes_sorted = bboxes[score_idx_sorted, :]
+      bboxes_idx = bboxes[idx, :]
+      iou = calc_iou(bboxes_idx, bboxes_sorted)
+
+      score_idx_sorted = score_idx_sorted[iou < criteria]
+      candidates.append(idx)
+
+    bboxes_out.append(bboxes[candidates, :])
+    scores_out.append(score[candidates])
+    labels_out.extend([i]*len(candidates))
+
+  if len(scores_out) == 0:
+    tf.logging.info("No objects detected. Returning dummy values.")
+    return (
+        np.zeros(shape=(1, 4), dtype=np.float32),
+        np.zeros(shape=(1,), dtype=np.int32),
+        np.ones(shape=(1,), dtype=np.float32) * ssd_constants.DUMMY_SCORE,
+    )
+
+  bboxes_out = np.concatenate(bboxes_out, axis=0)
+  scores_out = np.concatenate(scores_out, axis=0)
+  labels_out = np.array(labels_out)
+
+  max_ids = np.argsort(scores_out)[-max_output:]
+
+  return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids]
@@ -0,0 +1,369 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""Convert raw COCO dataset to TFRecord for object_detection.
+
+Example usage:
+    python create_coco_tf_record.py --logtostderr \
+      --image_dir="${TRAIN_IMAGE_DIR}" \
+      --object_annotations_file="${TRAIN_ANNOTATIONS_FILE}" \
+      --caption_annotations_file="${CAPTION_ANNOTATIONS_FILE}" \
+      --output_file_prefix="${OUTPUT_DIR/FILE_PREFIX}" \
+      --num_shards=32
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import hashlib
+import io
+import json
+import logging
+import multiprocessing
+import os
+from absl import app
+from absl import flags
+import numpy as np
+import PIL.Image
+
+from pycocotools import mask
+from research.object_detection.utils import dataset_util
+from research.object_detection.utils import label_map_util
+
+import tensorflow.compat.v1 as tf
+# Command-line flags consumed by `main` below.
+flags.DEFINE_boolean(
+    'include_masks', False, 'Whether to include instance segmentations masks '
+    '(PNG encoded) in the result. default: False.')
+flags.DEFINE_string('image_dir', '', 'Directory containing images.')
+flags.DEFINE_string(
+    'image_info_file', '', 'File containing image information. '
+    'Tf Examples in the output files correspond to the image '
+    'info entries in this file. If this file is not provided '
+    'object_annotations_file is used if present. Otherwise, '
+    'caption_annotations_file is used to get image info.')
+flags.DEFINE_string(
+    'object_annotations_file', '', 'File containing object '
+    'annotations - boxes and instance masks.')
+flags.DEFINE_string('caption_annotations_file', '', 'File containing image '
+                    'captions.')
+flags.DEFINE_string('output_file_prefix', '/tmp/train', 'Path to output file')
+flags.DEFINE_integer('num_shards', 32, 'Number of shards for output file.')
+
+FLAGS = flags.FLAGS
+
+# Raise the TensorFlow logger to INFO level at import time.
+logger = tf.get_logger()
+logger.setLevel(logging.INFO)
+
+
+def create_tf_example(image,
+                      image_dir,
+                      bbox_annotations=None,
+                      category_index=None,
+                      caption_annotations=None,
+                      include_masks=False):
+  """Converts image and annotations to a tf.Example proto.
+
+  Args:
+    image: dict with keys: [u'license', u'file_name', u'coco_url', u'height',
+      u'width', u'date_captured', u'flickr_url', u'id']
+    image_dir: directory containing the image files.
+    bbox_annotations:
+      list of dicts with keys: [u'segmentation', u'area', u'iscrowd',
+        u'image_id', u'bbox', u'category_id', u'id'] Notice that bounding box
+        coordinates in the official COCO dataset are given as [x, y, width,
+        height] tuples using absolute coordinates where x, y represent the
+        top-left (0-indexed) corner.  This function converts to the format
+        expected by the Tensorflow Object Detection API (which is
+        [ymin, xmin, ymax, xmax] with coordinates normalized relative to image
+        size).
+    category_index: a dict containing COCO category information keyed by the
+      'id' field of each category.  See the label_map_util.create_category_index
+      function.
+    caption_annotations:
+      list of dict with keys: [u'id', u'image_id', u'caption'].
+    include_masks: Whether to include instance segmentations masks
+      (PNG encoded) in the result. default: False.
+
+  Returns:
+    key: hex SHA-256 digest of the encoded image bytes.
+    example: The converted tf.Example
+    num_annotations_skipped: Number of (invalid) annotations that were ignored.
+
+  Raises:
+    Nothing is raised explicitly here; errors from reading the file or from
+    PIL.Image.open (e.g. for data that is not a valid image) propagate to the
+    caller.
+  """
+  image_height = image['height']
+  image_width = image['width']
+  filename = image['file_name']
+  image_id = image['id']
+
+  full_path = os.path.join(image_dir, filename)
+  with tf.gfile.GFile(full_path, 'rb') as fid:
+    encoded_jpg = fid.read()
+  encoded_jpg_io = io.BytesIO(encoded_jpg)
+  # Rebinds `image` (the info dict is no longer needed past this point). The
+  # opened image is not used afterwards in this function.
+  image = PIL.Image.open(encoded_jpg_io)
+  key = hashlib.sha256(encoded_jpg).hexdigest()
+  feature_dict = {
+      'image/height':
+          dataset_util.int64_feature(image_height),
+      'image/width':
+          dataset_util.int64_feature(image_width),
+      'image/filename':
+          dataset_util.bytes_feature(filename.encode('utf8')),
+      'image/source_id':
+          dataset_util.bytes_feature(str(image_id).encode('utf8')),
+      'image/key/sha256':
+          dataset_util.bytes_feature(key.encode('utf8')),
+      'image/encoded':
+          dataset_util.bytes_feature(encoded_jpg),
+      'image/format':
+          dataset_util.bytes_feature('jpeg'.encode('utf8')),
+  }
+
+  num_annotations_skipped = 0
+  if bbox_annotations:
+    xmin = []
+    xmax = []
+    ymin = []
+    ymax = []
+    is_crowd = []
+    category_names = []
+    category_ids = []
+    area = []
+    encoded_mask_png = []
+    for object_annotations in bbox_annotations:
+      (x, y, width, height) = tuple(object_annotations['bbox'])
+      # Skip boxes with a non-positive size.
+      if width <= 0 or height <= 0:
+        num_annotations_skipped += 1
+        continue
+      # Skip boxes extending past the right or bottom image border.
+      if x + width > image_width or y + height > image_height:
+        num_annotations_skipped += 1
+        continue
+      # Store corner coordinates normalized by the image size.
+      xmin.append(float(x) / image_width)
+      xmax.append(float(x + width) / image_width)
+      ymin.append(float(y) / image_height)
+      ymax.append(float(y + height) / image_height)
+      is_crowd.append(object_annotations['iscrowd'])
+      category_id = int(object_annotations['category_id'])
+      category_ids.append(category_id)
+      category_names.append(category_index[category_id]['name'].encode('utf8'))
+      area.append(object_annotations['area'])
+
+      if include_masks:
+        run_len_encoding = mask.frPyObjects(object_annotations['segmentation'],
+                                            image_height, image_width)
+        binary_mask = mask.decode(run_len_encoding)
+        # For non-crowd annotations the decoded mask is reduced over axis 2
+        # (presumably one channel per polygon -- TODO confirm).
+        if not object_annotations['iscrowd']:
+          binary_mask = np.amax(binary_mask, axis=2)
+        pil_image = PIL.Image.fromarray(binary_mask)
+        output_io = io.BytesIO()
+        pil_image.save(output_io, format='PNG')
+        encoded_mask_png.append(output_io.getvalue())
+    feature_dict.update({
+        'image/object/bbox/xmin':
+            dataset_util.float_list_feature(xmin),
+        'image/object/bbox/xmax':
+            dataset_util.float_list_feature(xmax),
+        'image/object/bbox/ymin':
+            dataset_util.float_list_feature(ymin),
+        'image/object/bbox/ymax':
+            dataset_util.float_list_feature(ymax),
+        'image/object/class/text':
+            dataset_util.bytes_list_feature(category_names),
+        'image/object/class/label':
+            dataset_util.int64_list_feature(category_ids),
+        'image/object/is_crowd':
+            dataset_util.int64_list_feature(is_crowd),
+        'image/object/area':
+            dataset_util.float_list_feature(area),
+    })
+    if include_masks:
+      feature_dict['image/object/mask'] = (
+          dataset_util.bytes_list_feature(encoded_mask_png))
+  if caption_annotations:
+    captions = []
+    for caption_annotation in caption_annotations:
+      captions.append(caption_annotation['caption'].encode('utf8'))
+    feature_dict.update(
+        {'image/caption': dataset_util.bytes_list_feature(captions)})
+
+  example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
+  return key, example, num_annotations_skipped
+
+
+def _pool_create_tf_example(args):
+  """Calls `create_tf_example` with the positional arguments packed in `args`.
+
+  Used as the single-argument callable handed to `pool.imap`.
+  """
+  return create_tf_example(*args)
+
+
+def _load_object_annotations(object_annotations_file):
+  """Loads object annotation JSON file."""
+  with tf.gfile.GFile(object_annotations_file, 'r') as fid:
+    obj_annotations = json.load(fid)
+
+  images = obj_annotations['images']
+  category_index = label_map_util.create_category_index(
+      obj_annotations['categories'])
+
+  img_to_obj_annotation = collections.defaultdict(list)
+  logging.info('Building bounding box index.')
+  for annotation in obj_annotations['annotations']:
+    image_id = annotation['image_id']
+    img_to_obj_annotation[image_id].append(annotation)
+
+  missing_annotation_count = 0
+  for image in images:
+    image_id = image['id']
+    if image_id not in img_to_obj_annotation:
+      missing_annotation_count += 1
+
+  logging.info('%d images are missing bboxes.', missing_annotation_count)
+
+  return img_to_obj_annotation, category_index
+
+
+def _load_caption_annotations(caption_annotations_file):
+  """Loads caption annotation JSON file."""
+  with tf.gfile.GFile(caption_annotations_file, 'r') as fid:
+    caption_annotations = json.load(fid)
+
+  img_to_caption_annotation = collections.defaultdict(list)
+  logging.info('Building caption index.')
+  for annotation in caption_annotations['annotations']:
+    image_id = annotation['image_id']
+    img_to_caption_annotation[image_id].append(annotation)
+
+  missing_annotation_count = 0
+  images = caption_annotations['images']
+  for image in images:
+    image_id = image['id']
+    if image_id not in img_to_caption_annotation:
+      missing_annotation_count += 1
+
+  logging.info('%d images are missing captions.', missing_annotation_count)
+
+  return img_to_caption_annotation
+
+
+def _load_images_info(images_info_file):
+  with tf.gfile.GFile(images_info_file, 'r') as fid:
+    info_dict = json.load(fid)
+  return info_dict['images']
+
+
+def _create_tf_record_from_coco_annotations(images_info_file,
+                                            image_dir,
+                                            output_path,
+                                            num_shards,
+                                            object_annotations_file=None,
+                                            caption_annotations_file=None,
+                                            include_masks=False):
+  """Loads COCO annotation json files and converts to tf.Record format.
+
+  Args:
+    images_info_file: JSON file containing image info. The number of tf.Examples
+      in the output tf Record files is exactly equal to the number of image info
+      entries in this file. This can be any of train/val/test annotation json
+      files Eg. 'image_info_test-dev2017.json',
+      'instance_annotations_train2017.json',
+      'caption_annotations_train2017.json', etc.
+    image_dir: Directory containing the image files.
+    output_path: Path to output tf.Record file.
+    num_shards: Number of output files to create.
+    object_annotations_file: JSON file containing bounding box annotations.
+    caption_annotations_file: JSON file containing caption annotations.
+    include_masks: Whether to include instance segmentations masks
+      (PNG encoded) in the result. default: False.
+  """
+
+  logging.info('writing to output path: %s', output_path)
+  writers = [
+      tf.python_io.TFRecordWriter(
+          output_path + '-%05d-of-%05d.tfrecord' % (i, num_shards))
+      for i in range(num_shards)
+  ]
+  images = _load_images_info(images_info_file)
+
+  img_to_obj_annotation = None
+  img_to_caption_annotation = None
+  category_index = None
+  if object_annotations_file:
+    img_to_obj_annotation, category_index = (
+        _load_object_annotations(object_annotations_file))
+  if caption_annotations_file:
+    img_to_caption_annotation = (
+        _load_caption_annotations(caption_annotations_file))
+
+  def _get_object_annotation(image_id):
+    if img_to_obj_annotation:
+      return img_to_obj_annotation[image_id]
+    else:
+      return None
+
+  def _get_caption_annotation(image_id):
+    if img_to_caption_annotation:
+      return img_to_caption_annotation[image_id]
+    else:
+      return None
+
+  pool = multiprocessing.Pool()
+  total_num_annotations_skipped = 0
+  for idx, (_, tf_example, num_annotations_skipped) in enumerate(
+      pool.imap(_pool_create_tf_example,
+                [(image, image_dir, _get_object_annotation(image['id']),
+                  category_index, _get_caption_annotation(image['id']),
+                  include_masks) for image in images])):
+    if idx % 100 == 0:
+      logging.info('On image %d of %d', idx, len(images))
+
+    total_num_annotations_skipped += num_annotations_skipped
+    writers[idx % num_shards].write(tf_example.SerializeToString())
+
+  pool.close()
+  pool.join()
+
+  for writer in writers:
+    writer.close()
+
+  logging.info('Finished writing, skipped %d annotations.',
+               total_num_annotations_skipped)
+
+
+def main(_):
+  assert FLAGS.image_dir, '`image_dir` missing.'
+  assert (FLAGS.image_info_file or FLAGS.object_annotations_file or
+          FLAGS.caption_annotations_file), ('All annotation files are '
+                                            'missing.')
+  if FLAGS.image_info_file:
+    images_info_file = FLAGS.image_info_file
+  elif FLAGS.object_annotations_file:
+    images_info_file = FLAGS.object_annotations_file
+  else:
+    images_info_file = FLAGS.caption_annotations_file
+
+  directory = os.path.dirname(FLAGS.output_file_prefix)
+  if not tf.gfile.IsDirectory(directory):
+    tf.gfile.MakeDirs(directory)
+
+  _create_tf_record_from_coco_annotations(images_info_file, FLAGS.image_dir,
+                                          FLAGS.output_file_prefix,
+                                          FLAGS.num_shards,
+                                          FLAGS.object_annotations_file,
+                                          FLAGS.caption_annotations_file,
+                                          FLAGS.include_masks)
+
+
+if __name__ == '__main__':
+  # Repeats the module-level logger setup, then lets absl parse the flags and
+  # invoke `main`.
+  logger = tf.get_logger()
+  logger.setLevel(logging.INFO)
+  app.run(main)
@@ -0,0 +1,436 @@
+# Copyright 2018 Google. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Data loader and processing."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools as it
+import math
+import os
+
+import numpy as np
+import tensorflow as tf
+
+from object_detection import argmax_matcher
+from object_detection import box_list
+from object_detection import faster_rcnn_box_coder
+from object_detection import preprocessor
+from object_detection import region_similarity_calculator
+from object_detection import target_assigner
+from object_detection import tf_example_decoder
+import ssd_constants
+
+
+def get_rank_size():
+    return int(os.environ['RANK_SIZE'])
+
+def get_rank_id():
+    return int(os.environ['DEVICE_ID'])
+
+class DefaultBoxes(object):
+  """Default bounding boxes for 300x300 5 layer SSD.
+
+  Default bounding boxes generation follows the order of (W, H, anchor_sizes).
+  Therefore, the tensor converted from DefaultBoxes has a shape of
+  [anchor_sizes, H, W, 4]. The last dimension is the box coordinates; 'ltrb'
+  is [ymin, xmin, ymax, xmax] while 'xywh' is [cy, cx, h, w].
+  """
+
+  def __init__(self):
+    fk = ssd_constants.IMAGE_SIZE / np.array(ssd_constants.STEPS)
+
+    self.default_boxes = []
+    # size of feature and number of feature
+    for idx, feature_size in enumerate(ssd_constants.FEATURE_SIZES):
+      sk1 = ssd_constants.SCALES[idx] / ssd_constants.IMAGE_SIZE
+      sk2 = ssd_constants.SCALES[idx+1] / ssd_constants.IMAGE_SIZE
+      sk3 = math.sqrt(sk1*sk2)
+      all_sizes = [(sk1, sk1), (sk3, sk3)]
+
+      for alpha in ssd_constants.ASPECT_RATIOS[idx]:
+        w, h = sk1 * math.sqrt(alpha), sk1 / math.sqrt(alpha)
+        all_sizes.append((w, h))
+        all_sizes.append((h, w))
+
+      assert len(all_sizes) == ssd_constants.NUM_DEFAULTS[idx]
+
+      for i, j in it.product(range(feature_size), repeat=2):
+        for w, h in all_sizes:
+          cx, cy = (j + 0.5) / fk[idx], (i + 0.5) / fk[idx]
+          box = tuple(np.clip(k, 0, 1) for k in (cy, cx, h, w))
+          self.default_boxes.append(box)
+
+    assert len(self.default_boxes) == ssd_constants.NUM_SSD_BOXES
+
+    def to_ltrb(cy, cx, h, w):
+      return cy - h / 2, cx - w / 2, cy + h / 2, cx + w / 2
+
+    # For IoU calculation
+    self.default_boxes_ltrb = tuple(to_ltrb(*i) for i in self.default_boxes)
+
+  def __call__(self, order='ltrb'):
+    if order == 'ltrb': return self.default_boxes_ltrb
+    if order == 'xywh': return self.default_boxes
+
+
+def calc_iou_tensor(box1, box2):
+  """ Calculation of IoU based on two boxes tensor,
+      Reference to https://github.com/kuangliu/pytorch-ssd
+      input:
+          box1 (N, 4)
+          box2 (M, 4)
+      output:
+          IoU (N, M)
+  """
+  N = tf.shape(box1)[0]
+  M = tf.shape(box2)[0]
+
+  be1 = tf.tile(tf.expand_dims(box1, axis=1), (1, M, 1))
+  be2 = tf.tile(tf.expand_dims(box2, axis=0), (N, 1, 1))
+
+  # Left Top & Right Bottom
+  lt = tf.maximum(be1[:,:,:2], be2[:,:,:2])
+
+  rb = tf.minimum(be1[:,:,2:], be2[:,:,2:])
+
+  delta = tf.maximum(rb - lt, 0)
+
+  intersect = delta[:,:,0]*delta[:,:,1]
+
+  delta1 = be1[:,:,2:] - be1[:,:,:2]
+  area1 = delta1[:,:,0]*delta1[:,:,1]
+  delta2 = be2[:,:,2:] - be2[:,:,:2]
+  area2 = delta2[:,:,0]*delta2[:,:,1]
+
+  iou = intersect/(area1 + area2 - intersect)
+  return iou
+
+
+def ssd_crop(image, boxes, classes):
+  """IoU biased random crop.
+
+  Repeatedly proposes crops until one is accepted, then crops and resizes the
+  image to IMAGE_SIZE x IMAGE_SIZE, keeps only the boxes whose centre lies
+  inside the crop, and re-expresses those boxes relative to the crop.
+
+  Reference: https://github.com/chauhan-utk/ssd.DomainAdaptation
+
+  NOTE(review): the local names `left`/`top`/`xc`/`yc` simply follow column
+  order (column 0 is called "left"). The resulting `crop_bounds` is handed
+  unchanged to `tf.image.crop_and_resize`, so it must be in whatever layout
+  `boxes` uses -- presumably [ymin, xmin, ymax, xmax]; confirm against the
+  decoder before relying on the x/y naming.
+
+  Args:
+    image: a rank-3 image tensor (a batch dimension is added internally).
+    boxes: a float tensor of shape [num_boxes, 4] in normalized coordinates.
+    classes: a tensor whose first dimension is num_boxes.
+
+  Returns:
+    cropped_image: the cropped image resized to
+      [IMAGE_SIZE, IMAGE_SIZE, channels].
+    cropped_boxes: the retained boxes, clipped to and normalized by the crop.
+    cropped_classes: the rows of `classes` for the retained boxes.
+  """
+
+  num_boxes = tf.shape(boxes)[0]
+
+  def no_crop_check():
+    # True with probability P_NO_CROP_PER_PASS.
+    return (tf.random_uniform(shape=(), minval=0, maxval=1, dtype=tf.float32)
+            < ssd_constants.P_NO_CROP_PER_PASS)
+
+  def no_crop_proposal():
+    # Accepted "crop" covering the whole image and keeping every box.
+    return (
+        tf.ones((), tf.bool),
+        tf.convert_to_tensor([0, 0, 1, 1], dtype=tf.float32),
+        tf.ones((num_boxes,), tf.bool),
+    )
+
+  def crop_proposal():
+    # Draws NUM_CROP_PASSES candidate crops at once; each rand_vec call
+    # yields a [NUM_CROP_PASSES, 1] column of uniform samples.
+    rand_vec = lambda minval, maxval: tf.random_uniform(
+        shape=(ssd_constants.NUM_CROP_PASSES, 1), minval=minval, maxval=maxval,
+        dtype=tf.float32)
+
+    # Crop extents in [0.3, 1), positioned so the crop stays inside the image.
+    width, height = rand_vec(0.3, 1), rand_vec(0.3, 1)
+    left, top = rand_vec(0, 1-width), rand_vec(0, 1-height)
+
+    right = left + width
+    bottom = top + height
+
+    ltrb = tf.concat([left, top, right, bottom], axis=1)
+
+    # One randomly chosen IoU threshold shared by all candidates of this pass.
+    min_iou = tf.random_shuffle(ssd_constants.CROP_MIN_IOU_CHOICES)[0]
+    ious = calc_iou_tensor(ltrb, boxes)
+
+    # discard any bboxes whose center not in the cropped image
+    # xc / yc are the box centres along columns (0, 2) and (1, 3), tiled to
+    # [NUM_CROP_PASSES, num_boxes].
+    xc, yc = [tf.tile(0.5 * (boxes[:, i + 0] + boxes[:, i + 2])[tf.newaxis, :],
+                      (ssd_constants.NUM_CROP_PASSES, 1)) for i in range(2)]
+
+    # masks[p, b] is True when the centre of box b is strictly inside
+    # candidate crop p.
+    masks = tf.reduce_all(tf.stack([
+        tf.greater(xc, tf.tile(left, (1, num_boxes))),
+        tf.less(xc, tf.tile(right, (1, num_boxes))),
+        tf.greater(yc, tf.tile(top, (1, num_boxes))),
+        tf.less(yc, tf.tile(bottom, (1, num_boxes))),
+    ], axis=2), axis=2)
+
+    # Checks of whether a crop is valid.
+    # A candidate is valid when its aspect ratio is within (1/2, 2), every
+    # box has IoU above min_iou with it, and it contains at least one centre.
+    valid_aspect = tf.logical_and(tf.less(height/width, 2),
+                                  tf.less(width/height, 2))
+    valid_ious = tf.reduce_all(tf.greater(ious, min_iou), axis=1, keepdims=True)
+    valid_masks = tf.reduce_any(masks, axis=1, keepdims=True)
+
+    valid_all = tf.cast(tf.reduce_all(tf.concat(
+        [valid_aspect, valid_ious, valid_masks], axis=1), axis=1), tf.int32)
+
+    # One indexed, as zero is needed for the case of no matches.
+    index = tf.range(1, 1 + ssd_constants.NUM_CROP_PASSES, dtype=tf.int32)
+
+    # Either one-hot, or zeros if there is no valid crop.
+    # The max picks the valid candidate with the highest index.
+    selection = tf.equal(tf.reduce_max(index * valid_all), index)
+
+    use_crop = tf.reduce_any(selection)
+    # Mask-and-sum extracts the selected row (all zeros when none is valid).
+    output_ltrb = tf.reduce_sum(tf.multiply(ltrb, tf.tile(tf.cast(
+        selection, tf.float32)[:, tf.newaxis], (1, 4))), axis=0)
+    output_masks = tf.reduce_any(tf.logical_and(masks, tf.tile(
+        selection[:, tf.newaxis], (1, num_boxes))), axis=0)
+
+    return use_crop, output_ltrb, output_masks
+
+  def proposal(*args):
+    # Loop body: ignores the previous loop state and draws a fresh proposal.
+    return tf.cond(
+        pred=no_crop_check(),
+        true_fn=no_crop_proposal,
+        false_fn=crop_proposal,
+    )
+
+  # Keep proposing until the first loop variable (the "accepted" flag) is
+  # True; the initial values only provide dtypes and shapes.
+  _, crop_bounds, box_masks = tf.while_loop(
+      cond=lambda x, *_: tf.logical_not(x),
+      body=proposal,
+      loop_vars=[tf.zeros((), tf.bool), tf.zeros((4,), tf.float32), tf.zeros((num_boxes,), tf.bool)],
+  )
+
+  filtered_boxes = tf.boolean_mask(boxes, box_masks, axis=0)
+
+  # Clip boxes to the cropped region.
+  filtered_boxes = tf.stack([
+      tf.maximum(filtered_boxes[:, 0], crop_bounds[0]),
+      tf.maximum(filtered_boxes[:, 1], crop_bounds[1]),
+      tf.minimum(filtered_boxes[:, 2], crop_bounds[2]),
+      tf.minimum(filtered_boxes[:, 3], crop_bounds[3]),
+  ], axis=1)
+
+  left = crop_bounds[0]
+  top = crop_bounds[1]
+  width = crop_bounds[2] - left
+  height = crop_bounds[3] - top
+
+  # Re-express the clipped boxes in the crop's own normalized coordinates.
+  cropped_boxes = tf.stack([
+      (filtered_boxes[:, 0] - left) / width,
+      (filtered_boxes[:, 1] - top) / height,
+      (filtered_boxes[:, 2] - left) / width,
+      (filtered_boxes[:, 3] - top) / height,
+  ], axis=1)
+
+  # crop_and_resize works on batches, so wrap the single image and crop in a
+  # batch of one and unwrap the result.
+  cropped_image = tf.image.crop_and_resize(
+      image=image[tf.newaxis, :, :, :],
+      boxes=crop_bounds[tf.newaxis, :],
+      box_ind=tf.zeros((1,), tf.int32),
+      crop_size=(ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE),
+  )[0, :, :, :]
+
+  cropped_classes = tf.boolean_mask(classes, box_masks, axis=0)
+
+  return cropped_image, cropped_boxes, cropped_classes
+
+
+def color_jitter(image, brightness=0, contrast=0, saturation=0, hue=0):
+  """Distorts the color of the image.
+
+  Args:
+    image: The input image tensor.
+    brightness: A float, specifying the brightness for color jitter.
+    contrast: A float, specifying the contrast for color jitter.
+    saturation: A float, specifying the saturation for color jitter.
+    hue: A float, specifying the hue for color jitter.
+
+  Returns:
+    The distorted image tensor.
+  """
+  with tf.name_scope('distort_color'):
+    if brightness > 0:
+      image = tf.image.random_brightness(image, max_delta=brightness)
+    if contrast > 0:
+      image = tf.image.random_contrast(
+          image, lower=1-contrast, upper=1+contrast)
+    if saturation > 0:
+      image = tf.image.random_saturation(
+          image, lower=1-saturation, upper=1+saturation)
+    if hue > 0:
+      image = tf.image.random_hue(image, max_delta=hue)
+    return image
+
+
+def encode_labels(gt_boxes, gt_labels):
+  """Labels anchors with ground truth inputs.
+
+  Args:
+    gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
+      For each row, it stores [y0, x0, y1, x1] for four corners of a box.
+    gt_labels: A integer tensor with shape [N, 1] representing groundtruth
+      classes.
+  Returns:
+    encoded_classes: a tensor with shape [num_anchors, 1].
+    encoded_boxes: a tensor with shape [num_anchors, 4].
+    num_positives: scalar tensor storing number of positives in an image.
+  """
+  similarity_calc = region_similarity_calculator.IouSimilarity()
+  matcher = argmax_matcher.ArgMaxMatcher(
+      matched_threshold=ssd_constants.MATCH_THRESHOLD,
+      unmatched_threshold=ssd_constants.MATCH_THRESHOLD,
+      negatives_lower_than_unmatched=True,
+      force_match_for_each_row=True)
+
+  box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
+      scale_factors=ssd_constants.BOX_CODER_SCALES)
+
+  default_boxes = box_list.BoxList(tf.convert_to_tensor(DefaultBoxes()('ltrb')))
+  target_boxes = box_list.BoxList(gt_boxes)
+
+  assigner = target_assigner.TargetAssigner(
+      similarity_calc, matcher, box_coder)
+
+  encoded_classes, _, encoded_boxes, _, matches = assigner.assign(
+      default_boxes, target_boxes, gt_labels)
+  num_matched_boxes = tf.reduce_sum(
+      tf.cast(tf.not_equal(matches.match_results, -1), tf.float32))
+  return encoded_classes, encoded_boxes, num_matched_boxes
+
+class SSDInputReader(object):
+  """Input reader for dataset.
+
+  Instances are callables that take a `params` dict and return a batched
+  `tf.data.Dataset` built from the TFRecord files matching `file_pattern`.
+  """
+
+  def __init__(self,
+               file_pattern,
+               transpose_input=False,
+               is_training=False,
+               distributed_eval=False,
+               count=-1):
+    """Stores the reader configuration.
+
+    Args:
+      file_pattern: glob pattern of the TFRecord files to read.
+      transpose_input: stored on the instance; not read by `__call__`.
+      is_training: if True, build the augmented, shuffled, repeating training
+        pipeline; otherwise build the evaluation pipeline.
+      distributed_eval: if True, shard the input files across ranks even when
+        not training.
+      count: stored on the instance; not read by `__call__`.
+    """
+    self._file_pattern = file_pattern
+    self._transpose_input = transpose_input
+    self._is_training = is_training
+    self._distributed_eval = distributed_eval
+    self._count = count
+
+  def __call__(self, params):
+    """Builds the input dataset.
+
+    Args:
+      params: a dict; only `params['batch_size']` is read.
+
+    Returns:
+      A `tf.data.Dataset`. In training mode each element is a batched
+      `(image, labels)` pair; otherwise each element is a batched dict of
+      image, padded boxes/classes, source id and raw image shape.
+    """
+    example_decoder = tf_example_decoder.TfExampleDecoder()
+
+    def _parse_example(data):
+      """Turns one decoded example into model inputs."""
+      with tf.name_scope('augmentation'):
+        source_id = data['source_id']
+        image = data['image']  # dtype uint8
+        raw_shape = tf.shape(image)
+        boxes = data['groundtruth_boxes']
+        classes = tf.reshape(data['groundtruth_classes'], [-1, 1])
+
+        # Only 80 of the 90 COCO classes are used.
+        # CLASS_MAP is indexed by the raw class id to obtain the remapped id.
+        class_map = tf.convert_to_tensor(ssd_constants.CLASS_MAP)
+        classes = tf.gather(class_map, classes)
+        classes = tf.cast(classes, dtype=tf.float32)
+
+        if self._is_training:
+          image, boxes, classes = ssd_crop(image, boxes, classes)
+          # ssd_crop resizes and returns image of dtype float32 and does not
+          # change its range (i.e., value in between 0--255). Divide by 255.
+          # converts it to [0, 1] range. Not doing this before cropping to
+          # avoid dtype cast (which incurs additional memory copy).
+          image /= 255.0
+
+          # random_horizontal_flip() is hard coded to flip with 50% chance.
+          image, boxes = preprocessor.random_horizontal_flip(
+              image=image, boxes=boxes)
+
+          # TODO(shibow): Investigate the parameters for color jitter.
+          image = color_jitter(
+              image, brightness=0.125, contrast=0.5, saturation=0.5, hue=0.05)
+
+
+          # Match the augmented ground truth against the default boxes.
+          encoded_classes, encoded_boxes, num_matched_boxes = encode_labels(
+              boxes, classes)
+
+          # TODO(taylorrobie): Check that this cast is valid.
+          encoded_classes = tf.cast(encoded_classes, tf.int32)
+
+          labels = {
+              ssd_constants.NUM_MATCHED_BOXES: num_matched_boxes,
+              ssd_constants.BOXES: encoded_boxes,
+              ssd_constants.CLASSES: tf.squeeze(encoded_classes, axis=1),
+          }
+
+          return image, labels
+
+        else:
+          image = tf.image.resize_images(
+              image, size=(ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE))
+          # resize_image returns image of dtype float32 and does not change its
+          # range. Divide by 255 to convert image to [0, 1] range.
+          image /= 255.
+
+          def trim_and_pad(inp_tensor, dim_1):
+            """Limit the number of boxes, and pad if necessary."""
+            # Truncate to MAX_NUM_EVAL_BOXES rows, zero-pad up to that count,
+            # then reshape so the static shape is fully defined for batching.
+            inp_tensor = inp_tensor[:ssd_constants.MAX_NUM_EVAL_BOXES]
+            num_pad = ssd_constants.MAX_NUM_EVAL_BOXES - tf.shape(inp_tensor)[0]
+            inp_tensor = tf.pad(inp_tensor, [[0, num_pad], [0, 0]])
+            return tf.reshape(
+                inp_tensor, [ssd_constants.MAX_NUM_EVAL_BOXES, dim_1])
+
+          boxes, classes = trim_and_pad(boxes, 4), trim_and_pad(classes, 1)
+
+          sample = {
+              ssd_constants.IMAGE: image,
+              ssd_constants.BOXES: boxes,
+              ssd_constants.CLASSES: classes,
+              ssd_constants.SOURCE_ID: tf.string_to_number(source_id, tf.int32),
+              ssd_constants.RAW_SHAPE: raw_shape,
+          }
+
+          return sample
+
+    batch_size = params['batch_size']
+    # Deterministic file order so that sharding below is consistent across
+    # ranks.
+    dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)
+
+    if self._is_training or self._distributed_eval:
+      # Split the file list across ranks; with a single rank the shard keeps
+      # every file.
+      if get_rank_size() == 1:
+          dataset = dataset.shard(1, 0)
+      else:
+          dataset = dataset.shard(get_rank_size(), get_rank_id())
+      if self._is_training:
+        # Shuffle the file names (buffer of 256) before reading them.
+        dataset = dataset.shuffle( tf.to_int64(256))
+
+    # Prefetch data from files.
+    def _prefetch_dataset(filename):
+      dataset = tf.data.TFRecordDataset(filename).prefetch(1)
+      return dataset
+    # Read up to 32 files concurrently; record order is allowed to be
+    # non-deterministic only when training.
+    dataset = dataset.apply(
+        tf.data.experimental.parallel_interleave(
+            _prefetch_dataset, cycle_length=32, sloppy=self._is_training))
+
+    # Parse the fetched records to input tensors for model function.
+    dataset = dataset.map(example_decoder.decode, num_parallel_calls=64)
+
+    if self._is_training:
+      # Pair every example with a flag telling whether it has at least one
+      # ground-truth box, and drop the examples that have none.
+      dataset = dataset.map(
+          # pylint: disable=g-long-lambda
+          lambda data: (data,
+                        tf.greater(tf.shape(data['groundtruth_boxes'])[0], 0)),
+          num_parallel_calls=64)
+      dataset = dataset.filter(lambda data, pred: pred)
+
+      # Shuffle examples with a buffer of 64 and repeat indefinitely.
+      dataset = dataset.shuffle(64).repeat()
+
+      dataset = dataset.map(lambda data, pred: data) # use the first value
+      dataset = dataset.map(_parse_example, num_parallel_calls=64)
+      dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)
+    else:
+      dataset = dataset.prefetch(batch_size * 64)
+      dataset = dataset.map(_parse_example, num_parallel_calls=64)
+      # NOTE: drop_remainder=True discards a final partial batch.
+      dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)
+
+    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
+    # Threading options for the pipeline: a private pool of 48 threads, with
+    # intra-op parallelism limited to 1.
+    options = tf.data.Options()
+    options.experimental_threading.max_intra_op_parallelism = 1
+    options.experimental_threading.private_threadpool_size = 48
+    dataset = dataset.with_options(options)
+
+    return dataset
@@ -0,0 +1,24 @@
+#!/bin/bash
+export RANK_ID=$1
+export RANK_SIZE=$2
+export DEVICE_ID=$RANK_ID
+export DEVICE_INDEX=$RANK_ID
+export JOB_ID=990
+export FUSION_TENSOR_SIZE=1000000000
+
+python3 ${3}/ssd_main.py --mode=train_and_eval \
+                     --train_batch_size=32 \
+                     --training_file_pattern="train_tfrecord_path/train2017*" \
+                     --resnet_checkpoint=resnet34_path/model.ckpt-28152 \
+                     --validation_file_pattern="val_tfrecord_path/val2017*" \
+                     --val_json_file="annotations_patah/instances_val2017.json" \
+                     --eval_batch_size=32 \
+                     --model_dir=result_npu
+
+
+sleep 2
+echo "**************** train finished ***************"
+cp /var/log/npu/slog/host-0/* ./slog
+cp /var/log/npu/slog/device-$DEVICE_ID/* ./slog
+cp /var/log/npu/slog/device-os-$DEVICE_ID/* ./slog
+
@@ -0,0 +1,14 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
@@ -0,0 +1,199 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Argmax matcher implementation.
+
+This class takes a similarity matrix and matches columns to rows based on the
+maximum value per column. One can specify matched_threshold
+to prevent columns from matching to rows (generally resulting in a negative
+training example) and unmatched_threshold to ignore the match (generally
+resulting in neither a positive nor a negative training example).
+
+This matcher is used in Fast(er)-RCNN.
+
+Note: matchers are used in TargetAssigners. There is a create_target_assigner
+factory function for popular implementations.
+"""
+import tensorflow as tf
+
+from object_detection import matcher
+from object_detection import shape_utils
+
+
+class ArgMaxMatcher(matcher.Matcher):
+  """Matcher based on highest value.
+
+  This class computes matches from a similarity matrix. Each column is matched
+  to a single row.
+
+  To support object detection target assignment this class enables setting both
+  matched_threshold (upper threshold) and unmatched_threshold (lower thresholds)
+  defining three categories of similarity which define whether examples are
+  positive, negative, or ignored:
+  (1) similarity >= matched_threshold: Highest similarity. Matched/Positive!
+  (2) matched_threshold > similarity >= unmatched_threshold: Medium similarity.
+          Depending on negatives_lower_than_unmatched, this is either
+          Unmatched/Negative OR Ignore.
+  (3) unmatched_threshold > similarity: Lowest similarity. Depending on flag
+          negatives_lower_than_unmatched, either Unmatched/Negative OR Ignore.
+  For ignored matches this class sets the values in the Match object to -2.
+  """
+
+  def __init__(self,
+               matched_threshold,
+               unmatched_threshold=None,
+               negatives_lower_than_unmatched=True,
+               force_match_for_each_row=False):
+    """Construct ArgMaxMatcher.
+
+    Args:
+      matched_threshold: Threshold for positive matches. Positive if
+        sim >= matched_threshold, where sim is the maximum value of the
+        similarity matrix for a given column. Set to None for no threshold.
+      unmatched_threshold: Threshold for negative matches. Negative if
+        sim < unmatched_threshold. Defaults to matched_threshold
+        when set to None.
+      negatives_lower_than_unmatched: Boolean which defaults to True. If True
+        then negative matches are the ones below the unmatched_threshold,
+        whereas ignored matches are in between the matched and umatched
+        threshold. If False, then negative matches are in between the matched
+        and unmatched threshold, and everything lower than unmatched is ignored.
+      force_match_for_each_row: If True, ensures that each row is matched to
+        at least one column (which is not guaranteed otherwise if the
+        matched_threshold is high). Defaults to False. See
+        argmax_matcher_test.testMatcherForceMatch() for an example.
+
+    Raises:
+      ValueError: if unmatched_threshold is set but matched_threshold is not set
+        or if unmatched_threshold > matched_threshold.
+    """
+    if (matched_threshold is None) and (unmatched_threshold is not None):
+      raise ValueError('Need to also define matched_threshold when'
+                       'unmatched_threshold is defined')
+    self._matched_threshold = matched_threshold
+    if unmatched_threshold is None:
+      self._unmatched_threshold = matched_threshold
+    else:
+      if unmatched_threshold > matched_threshold:
+        raise ValueError('unmatched_threshold needs to be smaller or equal'
+                         'to matched_threshold')
+      self._unmatched_threshold = unmatched_threshold
+    if not negatives_lower_than_unmatched:
+      if self._unmatched_threshold == self._matched_threshold:
+        raise ValueError('When negatives are in between matched and '
+                         'unmatched thresholds, these cannot be of equal '
+                         'value. matched: %s, unmatched: %s',
+                         self._matched_threshold, self._unmatched_threshold)
+    self._force_match_for_each_row = force_match_for_each_row
+    self._negatives_lower_than_unmatched = negatives_lower_than_unmatched
+
+  def _match(self, similarity_matrix):
+    """Tries to match each column of the similarity matrix to a row.
+
+    Args:
+      similarity_matrix: tensor of shape [N, M] representing any similarity
+        metric.
+
+    Returns:
+      Match object with corresponding matches for each of M columns.
+    """
+
+    def _match_when_rows_are_empty():
+      """Performs matching when the rows of similarity matrix are empty.
+
+      When the rows are empty, all detections are false positives. So we return
+      a tensor of -1's to indicate that the columns do not match to any rows.
+
+      Returns:
+        matches:  int32 tensor indicating the row each column matches to.
+      """
+      similarity_matrix_shape = shape_utils.combined_static_and_dynamic_shape(
+          similarity_matrix)
+      return -1 * tf.ones([similarity_matrix_shape[1]], dtype=tf.int32)
+
+    def _match_when_rows_are_non_empty():
+      """Performs matching when the rows of similarity matrix are non empty.
+
+      Returns:
+        matches:  int32 tensor indicating the row each column matches to.
+      """
+      # Matches for each column
+      matches = tf.argmax(similarity_matrix, 0, output_type=tf.int32)
+
+      # Deal with matched and unmatched threshold
+      if self._matched_threshold is not None:
+        # Get logical indices of ignored and unmatched columns as tf.int64
+        matched_vals = tf.reduce_max(similarity_matrix, 0)
+        below_unmatched_threshold = tf.greater(self._unmatched_threshold,
+                                               matched_vals)
+        between_thresholds = tf.logical_and(
+            tf.greater_equal(matched_vals, self._unmatched_threshold),
+            tf.greater(self._matched_threshold, matched_vals))
+
+        if self._negatives_lower_than_unmatched:
+          matches = self._set_values_using_indicator(matches,
+                                                     below_unmatched_threshold,
+                                                     -1)
+          matches = self._set_values_using_indicator(matches,
+                                                     between_thresholds,
+                                                     -2)
+        else:
+          matches = self._set_values_using_indicator(matches,
+                                                     below_unmatched_threshold,
+                                                     -2)
+          matches = self._set_values_using_indicator(matches,
+                                                     between_thresholds,
+                                                     -1)
+
+      if self._force_match_for_each_row:
+        similarity_matrix_shape = shape_utils.combined_static_and_dynamic_shape(
+            similarity_matrix)
+        force_match_column_ids = tf.argmax(similarity_matrix, 1,
+                                           output_type=tf.int32)
+        force_match_column_indicators = tf.one_hot(
+            force_match_column_ids, depth=similarity_matrix_shape[1])
+        force_match_row_ids = tf.argmax(force_match_column_indicators, 0,
+                                        output_type=tf.int32)
+        force_match_column_mask = tf.cast(
+            tf.reduce_max(force_match_column_indicators, 0), tf.bool)
+        final_matches = tf.where(force_match_column_mask,
+                                 force_match_row_ids, matches)
+        return final_matches
+      else:
+        return matches
+
+    if similarity_matrix.shape.is_fully_defined():
+      if similarity_matrix.shape[0].value == 0:
+        return _match_when_rows_are_empty()
+      else:
+        return _match_when_rows_are_non_empty()
+    else:
+      return tf.cond(
+          tf.greater(tf.shape(similarity_matrix)[0], 0),
+          _match_when_rows_are_non_empty, _match_when_rows_are_empty)
+
+  def _set_values_using_indicator(self, x, indicator, val):
+    """Set the indicated fields of x to val.
+
+    Args:
+      x: tensor.
+      indicator: boolean with same shape as x.
+      val: scalar with value to set.
+
+    Returns:
+      modified tensor.
+    """
+    indicator = tf.cast(indicator, x.dtype)
+    return tf.add(tf.multiply(x, 1 - indicator), val * indicator)
@@ -0,0 +1,151 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Base box coder.
+
+Box coders convert between coordinate frames, namely image-centric
+(with (0,0) on the top left of image) and anchor-centric (with (0,0) being
+defined by a specific anchor).
+
+Users of a BoxCoder can call two methods:
+ encode: which encodes a box with respect to a given anchor
+  (or rather, a tensor of boxes wrt a corresponding tensor of anchors) and
+ decode: which inverts this encoding with a decode operation.
+In both cases, the arguments are assumed to be in 1-1 correspondence already;
+it is not the job of a BoxCoder to perform matching.
+"""
+from abc import ABCMeta
+from abc import abstractmethod
+from abc import abstractproperty
+
+import tensorflow as tf
+
+
+# Box coder types.
+FASTER_RCNN = 'faster_rcnn'
+KEYPOINT = 'keypoint'
+MEAN_STDDEV = 'mean_stddev'
+SQUARE = 'square'
+
+
+class BoxCoder(object):
+  """Abstract base class for box coder."""
+  # NOTE(review): the `__metaclass__` attribute is only honoured by Python 2.
+  # Under Python 3 it is an ordinary class attribute, so the abstract members
+  # below are not enforced when a subclass is instantiated.
+  __metaclass__ = ABCMeta
+
+  @abstractproperty
+  def code_size(self):
+    """Return the size of each code.
+
+    This number is a constant and should agree with the output of the `encode`
+    op (e.g. if rel_codes is the output of self.encode(...), then it should have
+    shape [N, code_size()]).  This abstractproperty should be overridden by
+    implementations.
+
+    Returns:
+      an integer constant
+    """
+    pass
+
+  def encode(self, boxes, anchors):
+    """Encode a box list relative to an anchor collection.
+
+    Delegates to `_encode` inside an 'Encode' name scope.
+
+    Args:
+      boxes: BoxList holding N boxes to be encoded
+      anchors: BoxList of N anchors
+
+    Returns:
+      a tensor representing N relative-encoded boxes
+    """
+    with tf.name_scope('Encode'):
+      return self._encode(boxes, anchors)
+
+  def decode(self, rel_codes, anchors):
+    """Decode boxes that are encoded relative to an anchor collection.
+
+    Delegates to `_decode` inside a 'Decode' name scope.
+
+    Args:
+      rel_codes: a tensor representing N relative-encoded boxes
+      anchors: BoxList of anchors
+
+    Returns:
+      boxlist: BoxList holding N boxes encoded in the ordinary way (i.e.,
+        with corners y_min, x_min, y_max, x_max)
+    """
+    with tf.name_scope('Decode'):
+      return self._decode(rel_codes, anchors)
+
+  @abstractmethod
+  def _encode(self, boxes, anchors):
+    """Method to be overridden by implementations.
+
+    Args:
+      boxes: BoxList holding N boxes to be encoded
+      anchors: BoxList of N anchors
+
+    Returns:
+      a tensor representing N relative-encoded boxes
+    """
+    pass
+
+  @abstractmethod
+  def _decode(self, rel_codes, anchors):
+    """Method to be overridden by implementations.
+
+    Args:
+      rel_codes: a tensor representing N relative-encoded boxes
+      anchors: BoxList of anchors
+
+    Returns:
+      boxlist: BoxList holding N boxes encoded in the ordinary way (i.e.,
+        with corners y_min, x_min, y_max, x_max)
+    """
+    pass
+
+
+def batch_decode(encoded_boxes, box_coder, anchors):
+  """Decode a batch of encoded boxes.
+
+  This op takes a batch of encoded bounding boxes and transforms
+  them to a batch of bounding boxes specified by their corners in
+  the order of [y_min, x_min, y_max, x_max].
+
+  Args:
+    encoded_boxes: a float32 tensor of shape [batch_size, num_anchors,
+      code_size] representing the location of the objects.
+    box_coder: a BoxCoder object.
+    anchors: a BoxList of anchors used to encode `encoded_boxes`.
+
+  Returns:
+    decoded_boxes: a float32 tensor of shape [batch_size, num_anchors,
+      coder_size] representing the corners of the objects in the order
+      of [y_min, x_min, y_max, x_max].
+
+  Raises:
+    ValueError: if batch sizes of the inputs are inconsistent, or if
+    the number of anchors inferred from encoded_boxes and anchors are
+    inconsistent.
+  """
+  encoded_boxes.get_shape().assert_has_rank(3)
+  if encoded_boxes.get_shape()[1].value != anchors.num_boxes_static():
+    raise ValueError('The number of anchors inferred from encoded_boxes'
+                     ' and anchors are inconsistent: shape[1] of encoded_boxes'
+                     ' %s should be equal to the number of anchors: %s.' %
+                     (encoded_boxes.get_shape()[1].value,
+                      anchors.num_boxes_static()))
+
+  decoded_boxes = tf.stack([
+      box_coder.decode(boxes, anchors).get()
+      for boxes in tf.unstack(encoded_boxes)
+  ])
+  return decoded_boxes
@@ -0,0 +1,207 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Bounding Box List definition.
+
+BoxList represents a list of bounding boxes as tensorflow
+tensors, where each bounding box is represented as a row of 4 numbers,
+[y_min, x_min, y_max, x_max].  It is assumed that all bounding boxes
+within a given list correspond to a single image.  See also
+box_list_ops.py for common box related operations (such as area, iou, etc).
+
+Optionally, users can add additional related fields (such as weights).
+We assume the following things to be true about fields:
+* they correspond to boxes in the box_list along the 0th dimension
+* they have inferrable rank at graph construction time
+* all dimensions except for possibly the 0th can be inferred
+  (i.e., not None) at graph construction time.
+
+Some other notes:
+  * Following tensorflow conventions, we use height, width ordering,
+  and correspondingly, y,x (or ymin, xmin, ymax, xmax) ordering
+  * Tensors are always provided as (flat) [N, 4] tensors.
+"""
+
+import tensorflow as tf
+
+
+class BoxList(object):
+  """Box collection."""
+
+  def __init__(self, boxes):
+    """Constructs box collection.
+
+    Args:
+      boxes: a tensor of shape [N, 4] representing box corners
+
+    Raises:
+      ValueError: if invalid dimensions for bbox data or if bbox data is not in
+          float32 format.
+    """
+    if len(boxes.get_shape()) != 2 or boxes.get_shape()[-1] != 4:
+      raise ValueError('Invalid dimensions for box data.')
+    if boxes.dtype != tf.float32:
+      raise ValueError('Invalid tensor type: should be tf.float32')
+    self.data = {'boxes': boxes}
+
+  def num_boxes(self):
+    """Returns number of boxes held in collection.
+
+    Returns:
+      a tensor representing the number of boxes held in the collection.
+    """
+    return tf.shape(self.data['boxes'])[0]
+
+  def num_boxes_static(self):
+    """Returns number of boxes held in collection.
+
+    This number is inferred at graph construction time rather than run-time.
+
+    Returns:
+      Number of boxes held in collection (integer) or None if this is not
+        inferrable at graph construction time.
+    """
+    return self.data['boxes'].get_shape()[0].value
+
+  def get_all_fields(self):
+    """Returns all fields."""
+    return self.data.keys()
+
+  def get_extra_fields(self):
+    """Returns all non-box fields (i.e., everything not named 'boxes')."""
+    return [k for k in self.data.keys() if k != 'boxes']
+
+  def add_field(self, field, field_data):
+    """Add field to box list.
+
+    This method can be used to add related box data such as
+    weights/labels, etc.
+
+    Args:
+      field: a string key to access the data via `get`
+      field_data: a tensor containing the data to store in the BoxList
+    """
+    self.data[field] = field_data
+
+  def has_field(self, field):
+    return field in self.data
+
+  def get(self):
+    """Convenience function for accessing box coordinates.
+
+    Returns:
+      a tensor with shape [N, 4] representing box coordinates.
+    """
+    return self.get_field('boxes')
+
+  def set(self, boxes):
+    """Convenience function for setting box coordinates.
+
+    Args:
+      boxes: a tensor of shape [N, 4] representing box corners
+
+    Raises:
+      ValueError: if invalid dimensions for bbox data
+    """
+    if len(boxes.get_shape()) != 2 or boxes.get_shape()[-1] != 4:
+      raise ValueError('Invalid dimensions for box data.')
+    self.data['boxes'] = boxes
+
+  def get_field(self, field):
+    """Accesses a box collection and associated fields.
+
+    This function returns specified field with object; if no field is specified,
+    it returns the box coordinates.
+
+    Args:
+      field: this optional string parameter can be used to specify
+        a related field to be accessed.
+
+    Returns:
+      a tensor representing the box collection or an associated field.
+
+    Raises:
+      ValueError: if invalid field
+    """
+    if not self.has_field(field):
+      raise ValueError('field ' + str(field) + ' does not exist')
+    return self.data[field]
+
+  def set_field(self, field, value):
+    """Sets the value of a field.
+
+    Updates the field of a box_list with a given value.
+
+    Args:
+      field: (string) name of the field to set value.
+      value: the value to assign to the field.
+
+    Raises:
+      ValueError: if the box_list does not have specified field.
+    """
+    if not self.has_field(field):
+      raise ValueError('field %s does not exist' % field)
+    self.data[field] = value
+
+  def get_center_coordinates_and_sizes(self, scope=None):
+    """Computes the center coordinates, height and width of the boxes.
+
+    Args:
+      scope: name scope of the function.
+
+    Returns:
+      a list of 4 1-D tensors [ycenter, xcenter, height, width].
+    """
+    with tf.name_scope(scope, 'get_center_coordinates_and_sizes'):
+      box_corners = self.get()
+      ymin, xmin, ymax, xmax = tf.unstack(tf.transpose(box_corners))
+      width = xmax - xmin
+      height = ymax - ymin
+      ycenter = ymin + height / 2.
+      xcenter = xmin + width / 2.
+      return [ycenter, xcenter, height, width]
+
+  def transpose_coordinates(self, scope=None):
+    """Transpose the coordinate representation in a boxlist.
+
+    Args:
+      scope: name scope of the function.
+    """
+    with tf.name_scope(scope, 'transpose_coordinates'):
+      y_min, x_min, y_max, x_max = tf.split(
+          value=self.get(), num_or_size_splits=4, axis=1)
+      self.set(tf.concat([x_min, y_min, x_max, y_max], 1))
+
+  def as_tensor_dict(self, fields=None):
+    """Retrieves specified fields as a dictionary of tensors.
+
+    Args:
+      fields: (optional) list of fields to return in the dictionary.
+        If None (default), all fields are returned.
+
+    Returns:
+      tensor_dict: A dictionary of tensors specified by fields.
+
+    Raises:
+      ValueError: if specified field is not contained in boxlist.
+    """
+    tensor_dict = {}
+    if fields is None:
+      fields = self.get_all_fields()
+    for field in fields:
+      if not self.has_field(field):
+        raise ValueError('boxlist must contain all specified fields')
+      tensor_dict[field] = self.get_field(field)
+    return tensor_dict
@@ -0,0 +1,118 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Faster RCNN box coder.
+
+Faster RCNN box coder follows the coding schema described below:
+  ty = (y - ya) / ha
+  tx = (x - xa) / wa
+  th = log(h / ha)
+  tw = log(w / wa)
+  where x, y, w, h denote the box's center coordinates, width and height
+  respectively. Similarly, xa, ya, wa, ha denote the anchor's center
+  coordinates, width and height. tx, ty, tw and th denote the anchor-encoded
+  center, width and height respectively.
+
+  See http://arxiv.org/abs/1506.01497 for details.
+"""
+
+import tensorflow as tf
+
+from object_detection import box_coder
+from object_detection import box_list
+
+EPSILON = 1e-8
+
+
+class FasterRcnnBoxCoder(box_coder.BoxCoder):
+  """Faster RCNN box coder."""
+
+  def __init__(self, scale_factors=None):
+    """Constructor for FasterRcnnBoxCoder.
+
+    Args:
+      scale_factors: List of 4 positive scalars to scale ty, tx, th and tw.
+        If set to None, does not perform scaling. For Faster RCNN,
+        the open-source implementation recommends using [10.0, 10.0, 5.0, 5.0].
+    """
+    if scale_factors:
+      assert len(scale_factors) == 4
+      for scalar in scale_factors:
+        assert scalar > 0
+    self._scale_factors = scale_factors
+
+  @property
+  def code_size(self):
+    return 4
+
+  def _encode(self, boxes, anchors):
+    """Encode a box collection with respect to anchor collection.
+
+    Args:
+      boxes: BoxList holding N boxes to be encoded.
+      anchors: BoxList of anchors.
+
+    Returns:
+      a tensor representing N anchor-encoded boxes of the format
+      [ty, tx, th, tw].
+    """
+    # Convert anchors to the center coordinate representation.
+    ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes()
+    ycenter, xcenter, h, w = boxes.get_center_coordinates_and_sizes()
+    # Avoid NaN in division and log below.
+    ha += EPSILON
+    wa += EPSILON
+    h += EPSILON
+    w += EPSILON
+
+    tx = (xcenter - xcenter_a) / wa
+    ty = (ycenter - ycenter_a) / ha
+    tw = tf.log(w / wa)
+    th = tf.log(h / ha)
+    # Scales location targets as used in paper for joint training.
+    if self._scale_factors:
+      ty *= self._scale_factors[0]
+      tx *= self._scale_factors[1]
+      th *= self._scale_factors[2]
+      tw *= self._scale_factors[3]
+    return tf.transpose(tf.stack([ty, tx, th, tw]))
+
+  def _decode(self, rel_codes, anchors):
+    """Decode relative codes to boxes.
+
+    Args:
+      rel_codes: a tensor representing N anchor-encoded boxes.
+      anchors: BoxList of anchors.
+
+    Returns:
+      boxes: BoxList holding N bounding boxes.
+    """
+    ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes()
+
+    ty, tx, th, tw = tf.unstack(tf.transpose(rel_codes))
+    if self._scale_factors:
+      ty /= self._scale_factors[0]
+      tx /= self._scale_factors[1]
+      th /= self._scale_factors[2]
+      tw /= self._scale_factors[3]
+    w = tf.exp(tw) * wa
+    h = tf.exp(th) * ha
+    ycenter = ty * ha + ycenter_a
+    xcenter = tx * wa + xcenter_a
+    ymin = ycenter - h / 2.
+    xmin = xcenter - w / 2.
+    ymax = ycenter + h / 2.
+    xmax = xcenter + w / 2.
+    return box_list.BoxList(tf.transpose(tf.stack([ymin, xmin, ymax, xmax])))
@@ -0,0 +1,241 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Matcher interface and Match class.
+
+This module defines the Matcher interface and the Match object. The job of the
+matcher is to match row and column indices based on the similarity matrix and
+other optional parameters. Each column is matched to at most one row. There
+are three possibilities for the matching:
+
+1) match: A column matches a row.
+2) no_match: A column does not match any row.
+3) ignore: A column that is neither 'match' nor no_match.
+
+The ignore case is regularly encountered in object detection: when an anchor has
+a relatively small overlap with a ground-truth box, one neither wants to
+consider this box a positive example (match) nor a negative example (no match).
+
+The Match class is used to store the match results and it provides simple apis
+to query the results.
+"""
+from abc import ABCMeta
+from abc import abstractmethod
+
+import tensorflow as tf
+
+
+class Match(object):
+  """Class to store results from the matcher.
+
+  This class is used to store the results from the matcher. It provides
+  convenient methods to query the matching results.
+  """
+
+  def __init__(self, match_results):
+    """Constructs a Match object.
+
+    Args:
+      match_results: Integer tensor of shape [N] with (1) match_results[i]>=0,
+        meaning that column i is matched with row match_results[i].
+        (2) match_results[i]=-1, meaning that column i is not matched.
+        (3) match_results[i]=-2, meaning that column i is ignored.
+
+    Raises:
+      ValueError: if match_results does not have rank 1 or is not an
+        integer int32 scalar tensor
+    """
+    if match_results.shape.ndims != 1:
+      raise ValueError('match_results should have rank 1')
+    if match_results.dtype != tf.int32:
+      raise ValueError('match_results should be an int32 or int64 scalar '
+                       'tensor')
+    self._match_results = match_results
+
+  @property
+  def match_results(self):
+    """The accessor for match results.
+
+    Returns:
+      the tensor which encodes the match results.
+    """
+    return self._match_results
+
+  def matched_column_indices(self):
+    """Returns column indices that match to some row.
+
+    The indices returned by this op are always sorted in increasing order.
+
+    Returns:
+      column_indices: int32 tensor of shape [K] with column indices.
+    """
+    return self._reshape_and_cast(tf.where(tf.greater(self._match_results, -1)))
+
+  def matched_column_indicator(self):
+    """Returns column indices that are matched.
+
+    Returns:
+      column_indices: int32 tensor of shape [K] with column indices.
+    """
+    return tf.greater_equal(self._match_results, 0)
+
+  def num_matched_columns(self):
+    """Returns number (int32 scalar tensor) of matched columns."""
+    return tf.size(self.matched_column_indices())
+
+  def unmatched_column_indices(self):
+    """Returns column indices that do not match any row.
+
+    The indices returned by this op are always sorted in increasing order.
+
+    Returns:
+      column_indices: int32 tensor of shape [K] with column indices.
+    """
+    return self._reshape_and_cast(tf.where(tf.equal(self._match_results, -1)))
+
+  def unmatched_column_indicator(self):
+    """Returns column indices that are unmatched.
+
+    Returns:
+      column_indices: int32 tensor of shape [K] with column indices.
+    """
+    return tf.equal(self._match_results, -1)
+
+  def num_unmatched_columns(self):
+    """Returns number (int32 scalar tensor) of unmatched columns."""
+    return tf.size(self.unmatched_column_indices())
+
+  def ignored_column_indices(self):
+    """Returns column indices that are ignored (neither Matched nor Unmatched).
+
+    The indices returned by this op are always sorted in increasing order.
+
+    Returns:
+      column_indices: int32 tensor of shape [K] with column indices.
+    """
+    return self._reshape_and_cast(tf.where(self.ignored_column_indicator()))
+
+  def ignored_column_indicator(self):
+    """Returns boolean column indicator where True means the colum is ignored.
+
+    Returns:
+      column_indicator: boolean vector which is True for all ignored column
+      indices.
+    """
+    return tf.equal(self._match_results, -2)
+
+  def num_ignored_columns(self):
+    """Returns number (int32 scalar tensor) of matched columns."""
+    return tf.size(self.ignored_column_indices())
+
+  def unmatched_or_ignored_column_indices(self):
+    """Returns column indices that are unmatched or ignored.
+
+    The indices returned by this op are always sorted in increasing order.
+
+    Returns:
+      column_indices: int32 tensor of shape [K] with column indices.
+    """
+    return self._reshape_and_cast(tf.where(tf.greater(0, self._match_results)))
+
+  def matched_row_indices(self):
+    """Returns row indices that match some column.
+
+    The indices returned by this op are ordered so as to be in correspondence
+    with the output of matched_column_indicator().  For example if
+    self.matched_column_indicator() is [0,2], and self.matched_row_indices() is
+    [7, 3], then we know that column 0 was matched to row 7 and column 2 was
+    matched to row 3.
+
+    Returns:
+      row_indices: int32 tensor of shape [K] with row indices.
+    """
+    return self._reshape_and_cast(
+        tf.gather(self._match_results, self.matched_column_indices()))
+
+  def _reshape_and_cast(self, t):
+    return tf.cast(tf.reshape(t, [-1]), tf.int32)
+
+  def gather_based_on_match(self, input_tensor, unmatched_value,
+                            ignored_value):
+    """Gathers elements from `input_tensor` based on match results.
+
+    For columns that are matched to a row, gathered_tensor[col] is set to
+    input_tensor[match_results[col]]. For columns that are unmatched,
+    gathered_tensor[col] is set to unmatched_value. Finally, for columns that
+    are ignored gathered_tensor[col] is set to ignored_value.
+
+    Note that the input_tensor.shape[1:] must match with unmatched_value.shape
+    and ignored_value.shape
+
+    Args:
+      input_tensor: Tensor to gather values from.
+      unmatched_value: Constant tensor value for unmatched columns.
+      ignored_value: Constant tensor value for ignored columns.
+
+    Returns:
+      gathered_tensor: A tensor containing values gathered from input_tensor.
+        The shape of the gathered tensor is [match_results.shape[0]] +
+        input_tensor.shape[1:].
+    """
+    input_tensor = tf.concat([tf.stack([ignored_value, unmatched_value]),
+                              input_tensor], axis=0)
+    gather_indices = tf.maximum(self.match_results + 2, 0)
+    gathered_tensor = tf.gather(input_tensor, gather_indices)
+    return gathered_tensor
+
+
+class Matcher(object):
+  """Abstract base class for matcher.
+  """
+  __metaclass__ = ABCMeta
+
+  def match(self, similarity_matrix, scope=None, **params):
+    """Computes matches among row and column indices and returns the result.
+
+    Computes matches among the row and column indices based on the similarity
+    matrix and optional arguments.
+
+    Args:
+      similarity_matrix: Float tensor of shape [N, M] with pairwise similarity
+        where higher value means more similar.
+      scope: Op scope name. Defaults to 'Match' if None.
+      **params: Additional keyword arguments for specific implementations of
+        the Matcher.
+
+    Returns:
+      A Match object with the results of matching.
+    """
+    with tf.name_scope(scope, 'Match', [similarity_matrix, params]) as scope:
+      return Match(self._match(similarity_matrix, **params))
+
+  @abstractmethod
+  def _match(self, similarity_matrix, **params):
+    """Method to be overridden by implementations.
+
+    Args:
+      similarity_matrix: Float tensor of shape [N, M] with pairwise similarity
+        where higher value means more similar.
+      **params: Additional keyword arguments for specific implementations of
+        the Matcher.
+
+    Returns:
+      match_results: Integer tensor of shape [M]: match_results[i]>=0 means
+        that column i is matched to row match_results[i], match_results[i]=-1
+        means that the column is not matched. match_results[i]=-2 means that
+        the column is ignored (usually this happens when there is a very weak
+        match which one neither wants as positive nor negative example).
+    """
+    pass
@@ -0,0 +1,442 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Preprocess images and bounding boxes for detection.
+
+We perform two sets of operations in preprocessing stage:
+(a) operations that are applied to both training and testing data,
+(b) operations that are applied only to training data for the purpose of
+    data augmentation.
+
+A preprocessing function receives a set of inputs,
+e.g. an image and bounding boxes,
+performs an operation on them, and returns them.
+Some examples are: randomly cropping the image, randomly mirroring the image,
+                   randomly changing the brightness, contrast, hue and
+                   randomly jittering the bounding boxes.
+
+The image is a rank 4 tensor: [1, height, width, channels] with
+dtype=tf.float32. The groundtruth_boxes is a rank 2 tensor: [N, 4] where
+in each row there is a box with [ymin xmin ymax xmax].
+Boxes are in normalized coordinates meaning
+their coordinate values range in [0, 1]
+
+Important Note: In tensor_dict, images is a rank 4 tensor, but preprocessing
+functions receive a rank 3 tensor for processing the image. Thus, inside the
+preprocess function we squeeze the image to become a rank 3 tensor and then
+we pass it to the functions. At the end of the preprocess we expand the image
+back to rank 4.
+"""
+
+import tensorflow as tf
+
+from object_detection import box_list
+
+
+def _flip_boxes_left_right(boxes):
+  """Left-right flip the boxes.
+
+  Args:
+    boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4].
+           Boxes are in normalized form meaning their coordinates vary
+           between [0, 1].
+           Each row is in the form of [ymin, xmin, ymax, xmax].
+
+  Returns:
+    Flipped boxes.
+  """
+  ymin, xmin, ymax, xmax = tf.split(value=boxes, num_or_size_splits=4, axis=1)
+  flipped_xmin = tf.subtract(1.0, xmax)
+  flipped_xmax = tf.subtract(1.0, xmin)
+  flipped_boxes = tf.concat([ymin, flipped_xmin, ymax, flipped_xmax], 1)
+  return flipped_boxes
+
+
+def _flip_masks_left_right(masks):
+  """Left-right flip masks.
+
+  Args:
+    masks: rank 3 float32 tensor with shape
+      [num_instances, height, width] representing instance masks.
+
+  Returns:
+    flipped masks: rank 3 float32 tensor with shape
+      [num_instances, height, width] representing instance masks.
+  """
+  return masks[:, :, ::-1]
+
+
+def keypoint_flip_horizontal(keypoints, flip_point, flip_permutation,
+                             scope=None):
+  """Flips the keypoints horizontally around the flip_point.
+
+  This operation flips the x coordinate for each keypoint around the flip_point
+  and also permutes the keypoints in a manner specified by flip_permutation.
+
+  Args:
+    keypoints: a tensor of shape [num_instances, num_keypoints, 2]
+    flip_point:  (float) scalar tensor representing the x coordinate to flip the
+      keypoints around.
+    flip_permutation: rank 1 int32 tensor containing the keypoint flip
+      permutation. This specifies the mapping from original keypoint indices
+      to the flipped keypoint indices. This is used primarily for keypoints
+      that are not reflection invariant. E.g. Suppose there are 3 keypoints
+      representing ['head', 'right_eye', 'left_eye'], then a logical choice for
+      flip_permutation might be [0, 2, 1] since we want to swap the 'left_eye'
+      and 'right_eye' after a horizontal flip.
+    scope: name scope.
+
+  Returns:
+    new_keypoints: a tensor of shape [num_instances, num_keypoints, 2]
+  """
+  with tf.name_scope(scope, 'FlipHorizontal'):
+    keypoints = tf.transpose(keypoints, [1, 0, 2])
+    keypoints = tf.gather(keypoints, flip_permutation)
+    v, u = tf.split(value=keypoints, num_or_size_splits=2, axis=2)
+    u = flip_point * 2.0 - u
+    new_keypoints = tf.concat([v, u], 2)
+    new_keypoints = tf.transpose(new_keypoints, [1, 0, 2])
+    return new_keypoints
+
+
+def random_horizontal_flip(image,
+                           boxes=None,
+                           masks=None,
+                           keypoints=None,
+                           keypoint_flip_permutation=None,
+                           seed=None):
+  """Randomly flips the image and detections horizontally.
+
+  The probability of flipping the image is 50%.
+
+  Args:
+    image: rank 3 float32 tensor with shape [height, width, channels].
+    boxes: (optional) rank 2 float32 tensor with shape [N, 4]
+           containing the bounding boxes.
+           Boxes are in normalized form meaning their coordinates vary
+           between [0, 1].
+           Each row is in the form of [ymin, xmin, ymax, xmax].
+    masks: (optional) rank 3 float32 tensor with shape
+           [num_instances, height, width] containing instance masks. The masks
+           are of the same height, width as the input `image`.
+    keypoints: (optional) rank 3 float32 tensor with shape
+               [num_instances, num_keypoints, 2]. The keypoints are in y-x
+               normalized coordinates.
+    keypoint_flip_permutation: rank 1 int32 tensor containing the keypoint flip
+                               permutation.
+    seed: random seed
+
+  Returns:
+    image: image which is the same shape as input image.
+
+    If boxes, masks, keypoints, and keypoint_flip_permutation are not None,
+    the function also returns the following tensors.
+
+    boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4].
+           Boxes are in normalized form meaning their coordinates vary
+           between [0, 1].
+    masks: rank 3 float32 tensor with shape [num_instances, height, width]
+           containing instance masks.
+    keypoints: rank 3 float32 tensor with shape
+               [num_instances, num_keypoints, 2]
+
+  Raises:
+    ValueError: if keypoints are provided but keypoint_flip_permutation is not.
+  """
+
+  def _flip_image(image):
+    # flip image
+    image_flipped = tf.image.flip_left_right(image)
+    return image_flipped
+
+  if keypoints is not None and keypoint_flip_permutation is None:
+    raise ValueError(
+        'keypoints are provided but keypoints_flip_permutation is not provided')
+
+  with tf.name_scope('RandomHorizontalFlip', values=[image, boxes]):
+    result = []
+    # random variable defining whether to do flip or not
+    do_a_flip_random = tf.greater(tf.random_uniform([], seed=seed), 0.5)
+
+    # flip image
+    image = tf.cond(do_a_flip_random, lambda: _flip_image(image), lambda: image)
+    result.append(image)
+
+    # flip boxes
+    if boxes is not None:
+      boxes = tf.cond(do_a_flip_random, lambda: _flip_boxes_left_right(boxes),
+                      lambda: boxes)
+      result.append(boxes)
+
+    # flip masks
+    if masks is not None:
+      masks = tf.cond(do_a_flip_random, lambda: _flip_masks_left_right(masks),
+                      lambda: masks)
+      result.append(masks)
+
+    # flip keypoints
+    if keypoints is not None and keypoint_flip_permutation is not None:
+      permutation = keypoint_flip_permutation
+      keypoints = tf.cond(
+          do_a_flip_random,
+          lambda: keypoint_flip_horizontal(keypoints, 0.5, permutation),
+          lambda: keypoints)
+      result.append(keypoints)
+
+    return tuple(result)
+
+
+def _compute_new_static_size(image, min_dimension, max_dimension):
+  """Compute new static shape for resize_to_range method."""
+  image_shape = image.get_shape().as_list()
+  orig_height = image_shape[0]
+  orig_width = image_shape[1]
+  num_channels = image_shape[2]
+  orig_min_dim = min(orig_height, orig_width)
+  # Calculates the larger of the possible sizes
+  large_scale_factor = min_dimension / float(orig_min_dim)
+  # Scaling orig_(height|width) by large_scale_factor will make the smaller
+  # dimension equal to min_dimension, save for floating point rounding errors.
+  # For reasonably-sized images, taking the nearest integer will reliably
+  # eliminate this error.
+  large_height = int(round(orig_height * large_scale_factor))
+  large_width = int(round(orig_width * large_scale_factor))
+  large_size = [large_height, large_width]
+  if max_dimension:
+    # Calculates the smaller of the possible sizes, use that if the larger
+    # is too big.
+    orig_max_dim = max(orig_height, orig_width)
+    small_scale_factor = max_dimension / float(orig_max_dim)
+    # Scaling orig_(height|width) by small_scale_factor will make the larger
+    # dimension equal to max_dimension, save for floating point rounding
+    # errors. For reasonably-sized images, taking the nearest integer will
+    # reliably eliminate this error.
+    small_height = int(round(orig_height * small_scale_factor))
+    small_width = int(round(orig_width * small_scale_factor))
+    small_size = [small_height, small_width]
+    new_size = large_size
+    if max(large_size) > max_dimension:
+      new_size = small_size
+  else:
+    new_size = large_size
+  return tf.constant(new_size + [num_channels])
+
+
+def _compute_new_dynamic_size(image, min_dimension, max_dimension):
+  """Compute new dynamic shape for resize_to_range method."""
+  image_shape = tf.shape(image)
+  orig_height = tf.to_float(image_shape[0])
+  orig_width = tf.to_float(image_shape[1])
+  num_channels = image_shape[2]
+  orig_min_dim = tf.minimum(orig_height, orig_width)
+  # Calculates the larger of the possible sizes
+  min_dimension = tf.constant(min_dimension, dtype=tf.float32)
+  large_scale_factor = min_dimension / orig_min_dim
+  # Scaling orig_(height|width) by large_scale_factor will make the smaller
+  # dimension equal to min_dimension, save for floating point rounding errors.
+  # For reasonably-sized images, taking the nearest integer will reliably
+  # eliminate this error.
+  large_height = tf.to_int32(tf.round(orig_height * large_scale_factor))
+  large_width = tf.to_int32(tf.round(orig_width * large_scale_factor))
+  large_size = tf.stack([large_height, large_width])
+  if max_dimension:
+    # Calculates the smaller of the possible sizes, use that if the larger
+    # is too big.
+    orig_max_dim = tf.maximum(orig_height, orig_width)
+    max_dimension = tf.constant(max_dimension, dtype=tf.float32)
+    small_scale_factor = max_dimension / orig_max_dim
+    # Scaling orig_(height|width) by small_scale_factor will make the larger
+    # dimension equal to max_dimension, save for floating point rounding
+    # errors. For reasonably-sized images, taking the nearest integer will
+    # reliably eliminate this error.
+    small_height = tf.to_int32(tf.round(orig_height * small_scale_factor))
+    small_width = tf.to_int32(tf.round(orig_width * small_scale_factor))
+    small_size = tf.stack([small_height, small_width])
+    new_size = tf.cond(
+        tf.to_float(tf.reduce_max(large_size)) > max_dimension,
+        lambda: small_size, lambda: large_size)
+  else:
+    new_size = large_size
+  return tf.stack(tf.unstack(new_size) + [num_channels])
+
+
+def resize_to_range(image,
+                    masks=None,
+                    min_dimension=None,
+                    max_dimension=None,
+                    method=tf.image.ResizeMethod.BILINEAR,
+                    align_corners=False,
+                    pad_to_max_dimension=False):
+  """Resizes an image so its dimensions are within the provided value.
+
+  The output size can be described by two cases:
+  1. If the image can be rescaled so its minimum dimension is equal to the
+     provided value without the other dimension exceeding max_dimension,
+     then do so.
+  2. Otherwise, resize so the largest dimension is equal to max_dimension.
+
+  Args:
+    image: A 3D tensor of shape [height, width, channels]
+    masks: (optional) rank 3 float32 tensor with shape
+           [num_instances, height, width] containing instance masks.
+    min_dimension: (optional) (scalar) desired size of the smaller image
+                   dimension.
+    max_dimension: (optional) (scalar) maximum allowed size
+                   of the larger image dimension.
+    method: (optional) interpolation method used in resizing. Defaults to
+            BILINEAR.
+    align_corners: bool. If true, exactly align all 4 corners of the input
+                   and output. Defaults to False.
+    pad_to_max_dimension: Whether to resize the image and pad it with zeros
+      so the resulting image is of the spatial size
+      [max_dimension, max_dimension]. If masks are included they are padded
+      similarly.
+
+  Returns:
+    Note that the position of the resized_image_shape changes based on whether
+    masks are present.
+    resized_image: A 3D tensor of shape [new_height, new_width, channels],
+      where the image has been resized (with bilinear interpolation) so that
+      min(new_height, new_width) == min_dimension or
+      max(new_height, new_width) == max_dimension.
+    resized_masks: If masks is not None, also outputs masks. A 3D tensor of
+      shape [num_instances, new_height, new_width].
+    resized_image_shape: A 1D tensor of shape [3] containing shape of the
+      resized image.
+
+  Raises:
+    ValueError: if the image is not a 3D tensor.
+  """
+  if len(image.get_shape()) != 3:
+    raise ValueError('Image should be 3D tensor')
+
+  with tf.name_scope('ResizeToRange', values=[image, min_dimension]):
+    if image.get_shape().is_fully_defined():
+      new_size = _compute_new_static_size(image, min_dimension, max_dimension)
+    else:
+      new_size = _compute_new_dynamic_size(image, min_dimension, max_dimension)
+    new_image = tf.image.resize_images(
+        image, new_size[:-1], method=method, align_corners=align_corners)
+
+    if pad_to_max_dimension:
+      new_image = tf.image.pad_to_bounding_box(
+          new_image, 0, 0, max_dimension, max_dimension)
+
+    result = [new_image]
+    if masks is not None:
+      new_masks = tf.expand_dims(masks, 3)
+      new_masks = tf.image.resize_images(
+          new_masks,
+          new_size[:-1],
+          method=tf.image.ResizeMethod.NEAREST_NEIGHBOR,
+          align_corners=align_corners)
+      new_masks = tf.squeeze(new_masks, 3)
+      if pad_to_max_dimension:
+        new_masks = tf.image.pad_to_bounding_box(
+            new_masks, 0, 0, max_dimension, max_dimension)
+      result.append(new_masks)
+
+    result.append(new_size)
+    return result
+
+
+def _copy_extra_fields(boxlist_to_copy_to, boxlist_to_copy_from):
+  """Copies the extra fields of boxlist_to_copy_from to boxlist_to_copy_to.
+
+  Args:
+    boxlist_to_copy_to: BoxList to which extra fields are copied.
+    boxlist_to_copy_from: BoxList from which fields are copied.
+
+  Returns:
+    boxlist_to_copy_to with extra fields.
+  """
+  for field in boxlist_to_copy_from.get_extra_fields():
+    boxlist_to_copy_to.add_field(field, boxlist_to_copy_from.get_field(field))
+  return boxlist_to_copy_to
+
+
+def box_list_scale(boxlist, y_scale, x_scale, scope=None):
+  """scale box coordinates in x and y dimensions.
+
+  Args:
+    boxlist: BoxList holding N boxes
+    y_scale: (float) scalar tensor
+    x_scale: (float) scalar tensor
+    scope: name scope.
+
+  Returns:
+    boxlist: BoxList holding N boxes
+  """
+  with tf.name_scope(scope, 'Scale'):
+    y_scale = tf.cast(y_scale, tf.float32)
+    x_scale = tf.cast(x_scale, tf.float32)
+    y_min, x_min, y_max, x_max = tf.split(
+        value=boxlist.get(), num_or_size_splits=4, axis=1)
+    y_min = y_scale * y_min
+    y_max = y_scale * y_max
+    x_min = x_scale * x_min
+    x_max = x_scale * x_max
+    scaled_boxlist = box_list.BoxList(
+        tf.concat([y_min, x_min, y_max, x_max], 1))
+    return _copy_extra_fields(scaled_boxlist, boxlist)
+
+
+def keypoint_scale(keypoints, y_scale, x_scale, scope=None):
+  """Scales keypoint coordinates in x and y dimensions.
+
+  Args:
+    keypoints: a tensor of shape [num_instances, num_keypoints, 2]
+    y_scale: (float) scalar tensor
+    x_scale: (float) scalar tensor
+    scope: name scope.
+
+  Returns:
+    new_keypoints: a tensor of shape [num_instances, num_keypoints, 2]
+  """
+  with tf.name_scope(scope, 'Scale'):
+    y_scale = tf.cast(y_scale, tf.float32)
+    x_scale = tf.cast(x_scale, tf.float32)
+    new_keypoints = keypoints * [[[y_scale, x_scale]]]
+    return new_keypoints
+
+
+def scale_boxes_to_pixel_coordinates(image, boxes, keypoints=None):
+  """Scales boxes from normalized to pixel coordinates.
+
+  Args:
+    image: A 3D float32 tensor of shape [height, width, channels].
+    boxes: A 2D float32 tensor of shape [num_boxes, 4] containing the bounding
+      boxes in normalized coordinates. Each row is of the form
+      [ymin, xmin, ymax, xmax].
+    keypoints: (optional) rank 3 float32 tensor with shape
+      [num_instances, num_keypoints, 2]. The keypoints are in y-x normalized
+      coordinates.
+
+  Returns:
+    image: unchanged input image.
+    scaled_boxes: a 2D float32 tensor of shape [num_boxes, 4] containing the
+      bounding boxes in pixel coordinates.
+    scaled_keypoints: a 3D float32 tensor with shape
+      [num_instances, num_keypoints, 2] containing the keypoints in pixel
+      coordinates.
+  """
+  boxlist = box_list.BoxList(boxes)
+  image_height = tf.shape(image)[0]
+  image_width = tf.shape(image)[1]
+  scaled_boxes = box_list_scale(boxlist, image_height, image_width).get()
+  result = [image, scaled_boxes]
+  if keypoints is not None:
+    scaled_keypoints = keypoint_scale(keypoints, image_height, image_width)
+    result.append(scaled_keypoints)
+  return tuple(result)
@@ -0,0 +1,135 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Region Similarity Calculators for BoxLists.
+
+Region Similarity Calculators compare a pairwise measure of similarity
+between the boxes in two BoxLists.
+"""
+from abc import ABCMeta
+from abc import abstractmethod
+
+import tensorflow as tf
+
+
+def area(boxlist, scope=None):
+  """Computes area of boxes.
+
+  Args:
+    boxlist: BoxList holding N boxes
+    scope: name scope.
+
+  Returns:
+    a tensor with shape [N] representing box areas.
+  """
+  with tf.name_scope(scope, 'Area'):
+    y_min, x_min, y_max, x_max = tf.split(
+        value=boxlist.get(), num_or_size_splits=4, axis=1)
+    return tf.squeeze((y_max - y_min) * (x_max - x_min), [1])
+
+
+def intersection(boxlist1, boxlist2, scope=None):
+  """Compute pairwise intersection areas between boxes.
+
+  Args:
+    boxlist1: BoxList holding N boxes
+    boxlist2: BoxList holding M boxes
+    scope: name scope.
+
+  Returns:
+    a tensor with shape [N, M] representing pairwise intersections
+  """
+  with tf.name_scope(scope, 'Intersection'):
+    y_min1, x_min1, y_max1, x_max1 = tf.split(
+        value=boxlist1.get(), num_or_size_splits=4, axis=1)
+    y_min2, x_min2, y_max2, x_max2 = tf.split(
+        value=boxlist2.get(), num_or_size_splits=4, axis=1)
+    all_pairs_min_ymax = tf.minimum(y_max1, tf.transpose(y_max2))
+    all_pairs_max_ymin = tf.maximum(y_min1, tf.transpose(y_min2))
+    intersect_heights = tf.maximum(0.0, all_pairs_min_ymax - all_pairs_max_ymin)
+    all_pairs_min_xmax = tf.minimum(x_max1, tf.transpose(x_max2))
+    all_pairs_max_xmin = tf.maximum(x_min1, tf.transpose(x_min2))
+    intersect_widths = tf.maximum(0.0, all_pairs_min_xmax - all_pairs_max_xmin)
+    return intersect_heights * intersect_widths
+
+
+def iou(boxlist1, boxlist2, scope=None):
+  """Computes pairwise intersection-over-union between box collections.
+
+  Args:
+    boxlist1: BoxList holding N boxes
+    boxlist2: BoxList holding M boxes
+    scope: name scope.
+
+  Returns:
+    a tensor with shape [N, M] representing pairwise iou scores.
+  """
+  with tf.name_scope(scope, 'IOU'):
+    intersections = intersection(boxlist1, boxlist2)
+    areas1 = area(boxlist1)
+    areas2 = area(boxlist2)
+    unions = (
+        tf.expand_dims(areas1, 1) + tf.expand_dims(areas2, 0) - intersections)
+    return tf.where(
+        tf.equal(intersections, 0.0),
+        tf.zeros_like(intersections), tf.truediv(intersections, unions))
+
+
+class RegionSimilarityCalculator(object):
+  """Abstract base class for region similarity calculator."""
+  __metaclass__ = ABCMeta
+
+  def compare(self, boxlist1, boxlist2, scope=None):
+    """Computes matrix of pairwise similarity between BoxLists.
+
+    This op (to be overriden) computes a measure of pairwise similarity between
+    the boxes in the given BoxLists. Higher values indicate more similarity.
+
+    Note that this method simply measures similarity and does not explicitly
+    perform a matching.
+
+    Args:
+      boxlist1: BoxList holding N boxes.
+      boxlist2: BoxList holding M boxes.
+      scope: Op scope name. Defaults to 'Compare' if None.
+
+    Returns:
+      a (float32) tensor of shape [N, M] with pairwise similarity score.
+    """
+    with tf.name_scope(scope, 'Compare', [boxlist1, boxlist2]) as scope:
+      return self._compare(boxlist1, boxlist2)
+
+  @abstractmethod
+  def _compare(self, boxlist1, boxlist2):
+    pass
+
+
+class IouSimilarity(RegionSimilarityCalculator):
+  """Class to compute similarity based on Intersection over Union (IOU) metric.
+
+  This class computes pairwise similarity between two BoxLists based on IOU.
+  """
+
+  def _compare(self, boxlist1, boxlist2):
+    """Compute pairwise IOU similarity between the two BoxLists.
+
+    Args:
+      boxlist1: BoxList holding N boxes.
+      boxlist2: BoxList holding M boxes.
+
+    Returns:
+      A tensor with shape [N, M] representing pairwise iou scores.
+    """
+    return iou(boxlist1, boxlist2)
@@ -0,0 +1,70 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Utils used to manipulate tensor shapes."""
+
+import tensorflow as tf
+
+
+def assert_shape_equal(shape_a, shape_b):
+  """Asserts that shape_a and shape_b are equal.
+
+  If the shapes are static, raises a ValueError when the shapes
+  mismatch.
+
+  If the shapes are dynamic, raises a tf InvalidArgumentError when the shapes
+  mismatch.
+
+  Args:
+    shape_a: a list containing shape of the first tensor.
+    shape_b: a list containing shape of the second tensor.
+
+  Returns:
+    Either a tf.no_op() when shapes are all static and a tf.assert_equal() op
+    when the shapes are dynamic.
+
+  Raises:
+    ValueError: When shapes are both static and unequal.
+  """
+  if (all(isinstance(dim, int) for dim in shape_a) and
+      all(isinstance(dim, int) for dim in shape_b)):
+    if shape_a != shape_b:
+      raise ValueError('Unequal shapes {}, {}'.format(shape_a, shape_b))
+    else: return tf.no_op()
+  else:
+    return tf.assert_equal(shape_a, shape_b)
+
+
+def combined_static_and_dynamic_shape(tensor):
+  """Returns a list containing static and dynamic values for the dimensions.
+
+  Returns a list of static and dynamic values for shape dimensions. This is
+  useful to preserve static shapes when available in reshape operation.
+
+  Args:
+    tensor: A tensor of any type.
+
+  Returns:
+    A list of size tensor.shape.ndims containing integers or a scalar tensor.
+  """
+  static_tensor_shape = tensor.shape.as_list()
+  dynamic_tensor_shape = tf.shape(tensor)
+  combined_shape = []
+  for index, dim in enumerate(static_tensor_shape):
+    if dim is not None:
+      combined_shape.append(dim)
+    else:
+      combined_shape.append(dynamic_tensor_shape[index])
+  return combined_shape
@@ -0,0 +1,310 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Base target assigner module.
+
+The job of a TargetAssigner is, for a given set of anchors (bounding boxes) and
+groundtruth detections (bounding boxes), to assign classification and regression
+targets to each anchor as well as weights to each anchor (specifying, e.g.,
+which anchors should not contribute to training loss).
+
+It assigns classification/regression targets by performing the following steps:
+1) Computing pairwise similarity between anchors and groundtruth boxes using a
+  provided RegionSimilarity Calculator
+2) Computing a matching based on the similarity matrix using a provided Matcher
+3) Assigning regression targets based on the matching and a provided BoxCoder
+4) Assigning classification targets based on the matching and groundtruth labels
+
+Note that TargetAssigners only operate on detections from a single
+image at a time, so any logic for applying a TargetAssigner to multiple
+images must be handled externally.
+"""
+import tensorflow as tf
+
+from object_detection import box_list
+from object_detection import shape_utils
+
+
+# Name of the extra BoxList field that TargetAssigner looks up on the
+# groundtruth boxes to find keypoints.
+KEYPOINTS_FIELD_NAME = 'keypoints'
+
+
+class TargetAssigner(object):
+  """Target assigner to compute classification and regression targets."""
+
+  def __init__(self, similarity_calc, matcher, box_coder,
+               negative_class_weight=1.0, unmatched_cls_target=None):
+    """Construct Object Detection Target Assigner.
+
+    Args:
+      similarity_calc: a RegionSimilarityCalculator
+      matcher: Matcher used to match groundtruth to anchors.
+      box_coder: BoxCoder used to encode matching groundtruth boxes with
+        respect to anchors.
+      negative_class_weight: classification weight to be associated to negative
+        anchors (default: 1.0). The weight must be in [0., 1.].
+      unmatched_cls_target: a float32 tensor with shape [d_1, d_2, ..., d_k]
+        which is consistent with the classification target for each
+        anchor (and can be empty for scalar targets).  This shape must thus be
+        compatible with the groundtruth labels that are passed to the "assign"
+        function (which have shape [num_gt_boxes, d_1, d_2, ..., d_k]).
+        If set to None, unmatched_cls_target is set to be [0] for each anchor.
+
+    Raises:
+      ValueError: if similarity_calc is not a RegionSimilarityCalculator or
+        if matcher is not a Matcher or if box_coder is not a BoxCoder
+    """
+    self._similarity_calc = similarity_calc
+    self._matcher = matcher
+    self._box_coder = box_coder
+    self._negative_class_weight = negative_class_weight
+    if unmatched_cls_target is None:
+      self._unmatched_cls_target = tf.constant([0], tf.float32)
+    else:
+      self._unmatched_cls_target = unmatched_cls_target
+
+  @property
+  def box_coder(self):
+    return self._box_coder
+
+  def assign(self, anchors, groundtruth_boxes, groundtruth_labels=None,
+             groundtruth_weights=None, **params):
+    """Assign classification and regression targets to each anchor.
+
+    For a given set of anchors and groundtruth detections, match anchors
+    to groundtruth_boxes and assign classification and regression targets to
+    each anchor as well as weights based on the resulting match (specifying,
+    e.g., which anchors should not contribute to training loss).
+
+    Anchors that are not matched to anything are given a classification target
+    of self._unmatched_cls_target which can be specified via the constructor.
+
+    Args:
+      anchors: a BoxList representing N anchors
+      groundtruth_boxes: a BoxList representing M groundtruth boxes
+      groundtruth_labels:  a tensor of shape [M, d_1, ... d_k]
+        with labels for each of the ground_truth boxes. The subshape
+        [d_1, ... d_k] can be empty (corresponding to scalar inputs).  When set
+        to None, groundtruth_labels assumes a binary problem where all
+        ground_truth boxes get a positive label (of 1).
+      groundtruth_weights: a float tensor of shape [M] indicating the weight to
+        assign to all anchors match to a particular groundtruth box. The weights
+        must be in [0., 1.]. If None, all weights are set to 1.
+      **params: Additional keyword arguments for specific implementations of
+              the Matcher.
+
+    Returns:
+      cls_targets: a float32 tensor with shape [num_anchors, d_1, d_2 ... d_k],
+        where the subshape [d_1, ..., d_k] is compatible with groundtruth_labels
+        which has shape [num_gt_boxes, d_1, d_2, ... d_k].
+      cls_weights: a float32 tensor with shape [num_anchors]
+      reg_targets: a float32 tensor with shape [num_anchors, box_code_dimension]
+      reg_weights: a float32 tensor with shape [num_anchors]
+      match: a matcher.Match object encoding the match between anchors and
+        groundtruth boxes, with rows corresponding to groundtruth boxes
+        and columns corresponding to anchors.
+
+    Raises:
+      ValueError: if anchors or groundtruth_boxes are not of type
+        box_list.BoxList
+    """
+    if not isinstance(anchors, box_list.BoxList):
+      raise ValueError('anchors must be an BoxList')
+    if not isinstance(groundtruth_boxes, box_list.BoxList):
+      raise ValueError('groundtruth_boxes must be an BoxList')
+
+    if groundtruth_labels is None:
+      groundtruth_labels = tf.ones(tf.expand_dims(groundtruth_boxes.num_boxes(),
+                                                  0))
+      groundtruth_labels = tf.expand_dims(groundtruth_labels, -1)
+    unmatched_shape_assert = shape_utils.assert_shape_equal(
+        shape_utils.combined_static_and_dynamic_shape(groundtruth_labels)[1:],
+        shape_utils.combined_static_and_dynamic_shape(
+            self._unmatched_cls_target))
+    labels_and_box_shapes_assert = shape_utils.assert_shape_equal(
+        shape_utils.combined_static_and_dynamic_shape(
+            groundtruth_labels)[:1],
+        shape_utils.combined_static_and_dynamic_shape(
+            groundtruth_boxes.get())[:1])
+
+    if groundtruth_weights is None:
+      num_gt_boxes = groundtruth_boxes.num_boxes_static()
+      if not num_gt_boxes:
+        num_gt_boxes = groundtruth_boxes.num_boxes()
+      groundtruth_weights = tf.ones([num_gt_boxes], dtype=tf.float32)
+    with tf.control_dependencies(
+        [unmatched_shape_assert, labels_and_box_shapes_assert]):
+      match_quality_matrix = self._similarity_calc.compare(groundtruth_boxes,
+                                                           anchors)
+      match = self._matcher.match(match_quality_matrix, **params)
+      reg_targets = self._create_regression_targets(anchors,
+                                                    groundtruth_boxes,
+                                                    match)
+      cls_targets = self._create_classification_targets(groundtruth_labels,
+                                                        match)
+      reg_weights = self._create_regression_weights(match, groundtruth_weights)
+      cls_weights = self._create_classification_weights(match,
+                                                        groundtruth_weights)
+
+    num_anchors = anchors.num_boxes_static()
+    if num_anchors is not None:
+      reg_targets = self._reset_target_shape(reg_targets, num_anchors)
+      cls_targets = self._reset_target_shape(cls_targets, num_anchors)
+      reg_weights = self._reset_target_shape(reg_weights, num_anchors)
+      cls_weights = self._reset_target_shape(cls_weights, num_anchors)
+
+    return cls_targets, cls_weights, reg_targets, reg_weights, match
+
+  def _reset_target_shape(self, target, num_anchors):
+    """Sets the static shape of the target.
+
+    Args:
+      target: the target tensor. Its first dimension will be overwritten.
+      num_anchors: the number of anchors, which is used to override the target's
+        first dimension.
+
+    Returns:
+      A tensor with the shape info filled in.
+    """
+    target_shape = target.get_shape().as_list()
+    target_shape[0] = num_anchors
+    target.set_shape(target_shape)
+    return target
+
+  def _create_regression_targets(self, anchors, groundtruth_boxes, match):
+    """Returns a regression target for each anchor.
+
+    Args:
+      anchors: a BoxList representing N anchors
+      groundtruth_boxes: a BoxList representing M groundtruth_boxes
+      match: a matcher.Match object
+
+    Returns:
+      reg_targets: a float32 tensor with shape [N, box_code_dimension]
+    """
+    matched_gt_boxes = match.gather_based_on_match(
+        groundtruth_boxes.get(),
+        unmatched_value=tf.zeros(4),
+        ignored_value=tf.zeros(4))
+    matched_gt_boxlist = box_list.BoxList(matched_gt_boxes)
+    if groundtruth_boxes.has_field(KEYPOINTS_FIELD_NAME):
+      groundtruth_keypoints = groundtruth_boxes.get_field(KEYPOINTS_FIELD_NAME)
+      matched_keypoints = match.gather_based_on_match(
+          groundtruth_keypoints,
+          unmatched_value=tf.zeros(groundtruth_keypoints.get_shape()[1:]),
+          ignored_value=tf.zeros(groundtruth_keypoints.get_shape()[1:]))
+      matched_gt_boxlist.add_field(KEYPOINTS_FIELD_NAME, matched_keypoints)
+    matched_reg_targets = self._box_coder.encode(matched_gt_boxlist, anchors)
+    match_results_shape = shape_utils.combined_static_and_dynamic_shape(
+        match.match_results)
+
+    # Zero out the unmatched and ignored regression targets.
+    unmatched_ignored_reg_targets = tf.tile(
+        self._default_regression_target(), [match_results_shape[0], 1])
+    matched_anchors_mask = match.matched_column_indicator()
+    reg_targets = tf.where(matched_anchors_mask,
+                           matched_reg_targets,
+                           unmatched_ignored_reg_targets)
+    return reg_targets
+
+  def _default_regression_target(self):
+    """Returns the default target for anchors to regress to.
+
+    Default regression targets are set to zero (though in
+    this implementation what these targets are set to should
+    not matter as the regression weight of any box set to
+    regress to the default target is zero).
+
+    Returns:
+      default_target: a float32 tensor with shape [1, box_code_dimension]
+    """
+    return tf.constant([self._box_coder.code_size*[0]], tf.float32)
+
+  def _create_classification_targets(self, groundtruth_labels, match):
+    """Create classification targets for each anchor.
+
+    Assign a classification target of for each anchor to the matching
+    groundtruth label that is provided by match.  Anchors that are not matched
+    to anything are given the target self._unmatched_cls_target
+
+    Args:
+      groundtruth_labels:  a tensor of shape [num_gt_boxes, d_1, ... d_k]
+        with labels for each of the ground_truth boxes. The subshape
+        [d_1, ... d_k] can be empty (corresponding to scalar labels).
+      match: a matcher.Match object that provides a matching between anchors
+        and groundtruth boxes.
+
+    Returns:
+      a float32 tensor with shape [num_anchors, d_1, d_2 ... d_k], where the
+      subshape [d_1, ..., d_k] is compatible with groundtruth_labels which has
+      shape [num_gt_boxes, d_1, d_2, ... d_k].
+    """
+    return match.gather_based_on_match(
+        groundtruth_labels,
+        unmatched_value=self._unmatched_cls_target,
+        ignored_value=self._unmatched_cls_target)
+
+  def _create_regression_weights(self, match, groundtruth_weights):
+    """Set regression weight for each anchor.
+
+    Only positive anchors are set to contribute to the regression loss, so this
+    method returns a weight of 1 for every positive anchor and 0 for every
+    negative anchor.
+
+    Args:
+      match: a matcher.Match object that provides a matching between anchors
+        and groundtruth boxes.
+      groundtruth_weights: a float tensor of shape [M] indicating the weight to
+        assign to all anchors match to a particular groundtruth box.
+
+    Returns:
+      a float32 tensor with shape [num_anchors] representing regression weights.
+    """
+    return match.gather_based_on_match(
+        groundtruth_weights, ignored_value=0., unmatched_value=0.)
+
+  def _create_classification_weights(self,
+                                     match,
+                                     groundtruth_weights):
+    """Create classification weights for each anchor.
+
+    Positive (matched) anchors are associated with a weight of
+    positive_class_weight and negative (unmatched) anchors are associated with
+    a weight of negative_class_weight. When anchors are ignored, weights are set
+    to zero. By default, both positive/negative weights are set to 1.0,
+    but they can be adjusted to handle class imbalance (which is almost always
+    the case in object detection).
+
+    Args:
+      match: a matcher.Match object that provides a matching between anchors
+        and groundtruth boxes.
+      groundtruth_weights: a float tensor of shape [M] indicating the weight to
+        assign to all anchors match to a particular groundtruth box.
+
+    Returns:
+      a float32 tensor with shape [num_anchors] representing classification
+      weights.
+    """
+    return match.gather_based_on_match(
+        groundtruth_weights,
+        ignored_value=0.,
+        unmatched_value=self._negative_class_weight)
+
+  def get_box_coder(self):
+    """Get BoxCoder of this TargetAssigner.
+
+    Returns:
+      BoxCoder object.
+    """
+    return self._box_coder
@@ -0,0 +1,210 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tensorflow Example proto decoder for object detection.
+
+A decoder to decode string tensors containing serialized tensorflow.Example
+protos for object detection.
+"""
+import tensorflow as tf
+
+
+# Short alias for the slim tf.Example decoder module, which provides the
+# handler classes and the TFExampleDecoder used in this file.
+slim_example_decoder = tf.contrib.slim.tfexample_decoder
+
+
+class TfExampleDecoder(object):
+  """Tensorflow Example proto decoder."""
+
+  def __init__(self):
+    """Constructor sets keys_to_features and items_to_handlers."""
+    self.keys_to_features = {
+        'image/encoded':
+            tf.FixedLenFeature((), tf.string, default_value=''),
+        'image/format':
+            tf.FixedLenFeature((), tf.string, default_value='jpeg'),
+        'image/filename':
+            tf.FixedLenFeature((), tf.string, default_value=''),
+        'image/key/sha256':
+            tf.FixedLenFeature((), tf.string, default_value=''),
+        'image/source_id':
+            tf.FixedLenFeature((), tf.string, default_value=''),
+        'image/height':
+            tf.FixedLenFeature((), tf.int64, 1),
+        'image/width':
+            tf.FixedLenFeature((), tf.int64, 1),
+        # Object boxes and classes.
+        'image/object/bbox/xmin':
+            tf.VarLenFeature(tf.float32),
+        'image/object/bbox/xmax':
+            tf.VarLenFeature(tf.float32),
+        'image/object/bbox/ymin':
+            tf.VarLenFeature(tf.float32),
+        'image/object/bbox/ymax':
+            tf.VarLenFeature(tf.float32),
+        'image/object/class/label':
+            tf.VarLenFeature(tf.int64),
+        'image/object/class/text':
+            tf.VarLenFeature(tf.string),
+        'image/object/area':
+            tf.VarLenFeature(tf.float32),
+        'image/object/is_crowd':
+            tf.VarLenFeature(tf.int64),
+        'image/object/difficult':
+            tf.VarLenFeature(tf.int64),
+        'image/object/group_of':
+            tf.VarLenFeature(tf.int64),
+        'image/object/weight':
+            tf.VarLenFeature(tf.float32),
+    }
+    self.items_to_handlers = {
+        'image': slim_example_decoder.Image(
+            image_key='image/encoded', format_key='image/format', channels=3),
+        'source_id': (
+            slim_example_decoder.Tensor('image/source_id')),
+        'key': (
+            slim_example_decoder.Tensor('image/key/sha256')),
+        'filename': (
+            slim_example_decoder.Tensor('image/filename')),
+        # Object boxes and classes.
+        'groundtruth_boxes': (
+            slim_example_decoder.BoundingBox(
+                ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/')),
+        'groundtruth_area': slim_example_decoder.Tensor(
+            'image/object/area'),
+        'groundtruth_is_crowd': (
+            slim_example_decoder.Tensor('image/object/is_crowd')),
+        'groundtruth_difficult': (
+            slim_example_decoder.Tensor('image/object/difficult')),
+        'groundtruth_group_of': (
+            slim_example_decoder.Tensor('image/object/group_of')),
+        'groundtruth_weights': (
+            slim_example_decoder.Tensor('image/object/weight')),
+    }
+    label_handler = slim_example_decoder.Tensor('image/object/class/label')
+    self.items_to_handlers['groundtruth_classes'] = label_handler
+
+  def decode(self, tf_example_string_tensor):
+    """Decodes serialized tensorflow example and returns a tensor dictionary.
+
+    Args:
+      tf_example_string_tensor: a string tensor holding a serialized tensorflow
+        example proto.
+
+    Returns:
+      A dictionary of the following tensors.
+      image - 3D uint8 tensor of shape [None, None, 3]
+        containing image.
+      source_id - string tensor containing original
+        image id.
+      key - string tensor with unique sha256 hash key.
+      filename - string tensor with original dataset
+        filename.
+      groundtruth_boxes - 2D float32 tensor of shape
+        [None, 4] containing box corners.
+      groundtruth_classes - 1D int64 tensor of shape
+      groundtruth_weights - 1D float32 tensor of
+        shape [None] indicating the weights of groundtruth boxes.
+        [None] containing classes for the boxes.
+      groundtruth_area - 1D float32 tensor of shape
+        [None] containing containing object mask area in pixel squared.
+      groundtruth_is_crowd - 1D bool tensor of shape
+        [None] indicating if the boxes enclose a crowd.
+
+    Optional:
+      groundtruth_difficult - 1D bool tensor of shape
+        [None] indicating if the boxes represent `difficult` instances.
+      groundtruth_group_of - 1D bool tensor of shape
+        [None] indicating if the boxes represent `group_of` instances.
+      groundtruth_instance_masks - 3D float32 tensor of
+        shape [None, None, None] containing instance masks.
+    """
+    serialized_example = tf.reshape(tf_example_string_tensor, shape=[])
+    decoder = slim_example_decoder.TFExampleDecoder(self.keys_to_features,
+                                                    self.items_to_handlers)
+    keys = sorted(decoder.list_items())
+
+    tensors = decoder.decode(serialized_example, items=keys)
+    tensor_dict = dict(zip(keys, tensors))
+    is_crowd = 'groundtruth_is_crowd'
+    tensor_dict[is_crowd] = tf.cast(tensor_dict[is_crowd], dtype=tf.bool)
+    tensor_dict['image'].set_shape([None, None, 3])
+
+    def default_groundtruth_weights():
+      return tf.ones(
+          tf.shape(tensor_dict['groundtruth_boxes'])[0],
+          dtype=tf.float32)
+
+    tensor_dict['groundtruth_weights'] = tf.cond(
+        tf.greater(
+            tf.shape(
+                tensor_dict['groundtruth_weights'])[0],
+            0), lambda: tensor_dict['groundtruth_weights'],
+        default_groundtruth_weights)
+    return tensor_dict
+
+
+class TfExampleSegmentationDecoder(object):
+  """Tensorflow Example proto decoder."""
+
+  def __init__(self):
+    """Constructor sets keys_to_features and items_to_handlers."""
+    self.keys_to_features = {
+        'image/encoded':
+            tf.FixedLenFeature((), tf.string, default_value=''),
+        'image/filename':
+            tf.FixedLenFeature((), tf.string, default_value=''),
+        'image/format':
+            tf.FixedLenFeature((), tf.string, default_value='jpeg'),
+        'image/height':
+            tf.FixedLenFeature((), tf.int64, default_value=0),
+        'image/width':
+            tf.FixedLenFeature((), tf.int64, default_value=0),
+        'image/segmentation/class/encoded':
+            tf.FixedLenFeature((), tf.string, default_value=''),
+        'image/segmentation/class/format':
+            tf.FixedLenFeature((), tf.string, default_value='png'),
+    }
+    self.items_to_handlers = {
+        'image': slim_example_decoder.Image(
+            image_key='image/encoded', format_key='image/format', channels=3),
+        'labels_class': slim_example_decoder.Image(
+            image_key='image/segmentation/class/encoded',
+            format_key='image/segmentation/class/format',
+            channels=1)
+    }
+
+  def decode(self, tf_example_string_tensor):
+    """Decodes serialized tensorflow example and returns a tensor dictionary.
+
+    Args:
+      tf_example_string_tensor: a string tensor holding a serialized tensorflow
+        example proto.
+
+    Returns:
+      A dictionary of the following tensors.
+      image - 3D uint8 tensor of shape [None, None, 3] containing image.
+      labels_class - 2D unit8 tensor of shape [None, None] containing
+        pixel-wise class labels.
+    """
+    serialized_example = tf.reshape(tf_example_string_tensor, shape=[])
+    decoder = slim_example_decoder.TFExampleDecoder(self.keys_to_features,
+                                                    self.items_to_handlers)
+    keys = sorted(decoder.list_items())
+    keys = ['image', 'labels_class']
+
+    tensors = decoder.decode(serialized_example, items=keys)
+    tensor_dict = dict(zip(keys, tensors))
+    tensor_dict['image'].set_shape([None, None, 3])
+    return tensor_dict
@@ -0,0 +1,44 @@
+# Launch script for a single-rank run: clears old slog files, exports the
+# Ascend environment, then starts exec_main.sh in the background from a
+# per-rank working directory under $SAVE_PATH.
+
+#clean slog
+rm -rf /var/log/npu/slog/host-0/*.log
+rm -rf /var/log/npu/slog/device-*/*.log
+
+# set env
+export PYTHONPATH=/usr/local/Ascend/ops/op_impl/built-in/ai_core/tbe
+export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/x86_64-linux-gnu
+PATH=$PATH:$HOME/bin
+# Append the compiler directory once; the previous form re-appended $PATH.
+export PATH=$PATH:/usr/local/Ascend/fwkacllib/ccec_compiler/bin
+export ASCEND_OPP_PATH=/usr/local/Ascend/opp
+export DDK_VERSION_FLAG=1.71.T5.0.B060
+export NEW_GE_FE_ID=1
+export GE_AICPU_FLAG=1
+export SOC_VERSION=Ascend910
+export DUMP_GE_GRAPH=1
+export DUMP_GRAPH_LEVEL=3
+export PRINT_MODEL=1
+export SLOG_PRINT_TO_STDOUT=1
+
+
+export RANK_SIZE=1
+# NOTE(review): the single rank starts at id 1, which is also passed to
+# `adc --device`; confirm this is the intended device.
+RANK_ID_START=1
+
+SAVE_PATH=training
+BASE_PATH=$(pwd)
+echo "$BASE_PATH"
+
+for ((RANK_ID = RANK_ID_START; RANK_ID < RANK_SIZE + RANK_ID_START; RANK_ID++))
+do
+echo
+su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device $RANK_ID"
+TMP_PATH=$SAVE_PATH/D$RANK_ID
+mkdir -p "$TMP_PATH"
+cp exec_main.sh "$TMP_PATH/"
+# Abort rather than launch training from the wrong directory.
+cd "$TMP_PATH" || exit 1
+bash exec_main.sh "$RANK_ID" "$RANK_SIZE" "$BASE_PATH" > "train_$RANK_ID.log" &
+cd - || exit 1
+done
+
+
+
+
+
+
@@ -0,0 +1,41 @@
+
+# Launch script for an 8-rank run: clears old slog files, exports the Ascend
+# environment, then starts one background exec_main.sh per rank, each from
+# its own working directory under $SAVE_PATH.
+
+#clean slog
+rm -rf /var/log/npu/slog/host-0/*.log
+rm -rf /var/log/npu/slog/device-*/*.log
+
+# set env
+export PYTHONPATH=/usr/local/Ascend/ops/op_impl/built-in/ai_core/tbe/
+export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/x86_64-linux-gnu
+PATH=$PATH:$HOME/bin
+# Append the compiler directory once; the previous form re-appended $PATH.
+export PATH=$PATH:/usr/local/Ascend/fwkacllib/ccec_compiler/bin
+export ASCEND_OPP_PATH=/usr/local/Ascend/opp
+export DDK_VERSION_FLAG=1.71.T5.0.B060
+export NEW_GE_FE_ID=1
+export GE_AICPU_FLAG=1
+export SOC_VERSION=Ascend910
+export DUMP_GE_GRAPH=1
+export DUMP_GRAPH_LEVEL=3
+export PRINT_MODEL=1
+export SLOG_PRINT_TO_STDOUT=1
+
+
+export RANK_SIZE=8
+# Rank table is looked up relative to the directory the script is run from.
+export RANK_TABLE_FILE=${PWD}/npu_config/${RANK_SIZE}p.json
+RANK_ID_START=0
+
+BASE_PATH=$(pwd)
+SAVE_PATH=training
+for ((RANK_ID = RANK_ID_START; RANK_ID < RANK_SIZE + RANK_ID_START; RANK_ID++))
+do
+echo
+su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device $RANK_ID"
+TMP_PATH=$SAVE_PATH/D$RANK_ID
+mkdir -p "$TMP_PATH"
+cp exec_main.sh "$TMP_PATH/"
+# Abort rather than launch training from the wrong directory.
+cd "$TMP_PATH" || exit 1
+nohup bash exec_main.sh "$RANK_ID" "$RANK_SIZE" "$BASE_PATH" > "train_$RANK_ID.log" &
+cd - || exit 1
+done
+
+
+
@@ -0,0 +1,484 @@
+# Copyright 2018 Google. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""SSD (via ResNet34) model definition.
+
+Defines the SSD model and loss functions from this paper:
+
+https://arxiv.org/pdf/1708.02002
+
+Uses the ResNet model as a basis.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+import ssd_constants
+
+def batch_norm_relu(inputs,
+                    is_training_bn,
+                    params,
+                    relu=True,
+                    init_zero=False,
+                    data_format='channels_last',
+                    name=None):
+  """Performs a batch normalization followed by a ReLU.
+
+  Args:
+    inputs: `Tensor` of shape `[batch, channels, ...]`.
+    is_training_bn: `bool` for whether the model is training.
+    params: params of the model, a dict including `distributed_group_size`
+        and `num_shards`.
+    relu: `bool` if False, omits the ReLU operation.
+    init_zero: `bool` if True, initializes scale parameter of batch
+        normalization with 0 instead of 1 (default).
+    data_format: `str` either "channels_first" for `[batch, channels, height,
+        width]` or "channels_last for `[batch, height, width, channels]`.
+    name: the name of the batch normalization layer
+
+  Returns:
+    A normalized `Tensor` with the same `data_format`.
+  """
+  if init_zero:
+    gamma_initializer = tf.zeros_initializer()
+  else:
+    gamma_initializer = tf.ones_initializer()
+
+  if data_format == 'channels_first':
+    axis = 1
+  else:
+    axis = 3
+
+
+  inputs = tf.layers.batch_normalization(
+        inputs=inputs,
+        axis=axis,
+        momentum=ssd_constants.BATCH_NORM_DECAY,
+        epsilon=ssd_constants.BATCH_NORM_EPSILON,
+        center=True,
+        scale=True,
+        training=is_training_bn,
+        fused=True,
+        gamma_initializer=gamma_initializer,
+        name=name)
+
+  if relu:
+    inputs = tf.nn.relu(inputs)
+  return inputs
+
+
+def fixed_padding(inputs, kernel_size, data_format='channels_last'):
+  """Pads the input along the spatial dimensions independently of input size.
+
+  Args:
+    inputs: `Tensor` of size `[batch, channels, height, width]` or
+        `[batch, height, width, channels]` depending on `data_format`.
+    kernel_size: `int` kernel size to be used for `conv2d` or max_pool2d`
+        operations. Should be a positive integer.
+    data_format: `str` either "channels_first" for `[batch, channels, height,
+        width]` or "channels_last for `[batch, height, width, channels]`.
+
+  Returns:
+    A padded `Tensor` of the same `data_format` with size either intact
+    (if `kernel_size == 1`) or padded (if `kernel_size > 1`).
+  """
+  pad_total = kernel_size - 1
+  pad_beg = pad_total // 2
+  pad_end = pad_total - pad_beg
+  if data_format == 'channels_first':
+    padded_inputs = tf.pad(
+        inputs, [[0, 0], [0, 0], [pad_beg, pad_end], [pad_beg, pad_end]])
+  else:
+    padded_inputs = tf.pad(
+        inputs, [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]])
+
+  return padded_inputs
+
+
+def conv2d_fixed_padding(inputs,
+                         filters,
+                         kernel_size,
+                         strides,
+                         data_format='channels_last'):
+  """Strided 2-D convolution with explicit padding.
+
+  The padding is consistent and is based only on `kernel_size`, not on the
+  dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
+
+  Args:
+    inputs: `Tensor` of size `[batch, channels, height_in, width_in]`.
+    filters: `int` number of filters in the convolution.
+    kernel_size: `int` size of the kernel to be used in the convolution.
+    strides: `int` strides of the convolution.
+    data_format: `str` either "channels_first" for `[batch, channels, height,
+        width]` or "channels_last for `[batch, height, width, channels]`.
+
+  Returns:
+    A `Tensor` of shape `[batch, filters, height_out, width_out]`.
+  """
+  if strides > 1:
+    inputs = fixed_padding(inputs, kernel_size, data_format=data_format)
+
+  return tf.layers.conv2d(
+      inputs=inputs,
+      filters=filters,
+      kernel_size=kernel_size,
+      strides=strides,
+      padding=('SAME' if strides == 1 else 'VALID'),
+      use_bias=False,
+      kernel_initializer=tf.variance_scaling_initializer(),
+      data_format=data_format)
+
+
+def residual_block(inputs,
+                   filters,
+                   is_training_bn,
+                   strides,
+                   params,
+                   use_projection=False,
+                   data_format='channels_last'):
+  """Standard building block for residual networks with BN after convolutions.
+
+  Args:
+    inputs: `Tensor` of size `[batch, channels, height, width]`.
+    filters: `int` number of filters for the first two convolutions. Note that
+        the third and final convolution will use 4 times as many filters.
+    is_training_bn: `bool` for whether the model is in training.
+    strides: `int` block stride. If greater than 1, this block will ultimately
+        downsample the input.
+    params: params of the model, a dict.
+    use_projection: `bool` for whether this block should use a projection
+        shortcut (versus the default identity shortcut). This is usually `True`
+        for the first block of a block group, which may change the number of
+        filters and the resolution.
+    data_format: `str` either "channels_first" for `[batch, channels, height,
+        width]` or "channels_last for `[batch, height, width, channels]`.
+
+  Returns:
+    The output `Tensor` of the block.
+  """
+  shortcut = inputs
+  if use_projection:
+    # Projection shortcut in first layer to match filters and strides
+    shortcut = conv2d_fixed_padding(
+        inputs=inputs,
+        filters=filters,
+        kernel_size=1,
+        strides=strides,
+        data_format=data_format)
+    shortcut = batch_norm_relu(
+        shortcut, is_training_bn, params, relu=False, data_format=data_format)
+
+  inputs = conv2d_fixed_padding(
+      inputs=inputs,
+      filters=filters,
+      kernel_size=3,
+      strides=strides,
+      data_format=data_format)
+  inputs = batch_norm_relu(
+      inputs, is_training_bn, params, data_format=data_format)
+
+  inputs = conv2d_fixed_padding(
+      inputs=inputs,
+      filters=filters,
+      kernel_size=3,
+      strides=1,
+      data_format=data_format)
+  inputs = batch_norm_relu(
+      inputs,
+      is_training_bn,
+      params,
+      relu=False,
+      init_zero=True,
+      data_format=data_format)
+
+  return tf.nn.relu(inputs + shortcut)
+
+
+def block_group(inputs,
+                filters,
+                block_fn,
+                blocks,
+                strides,
+                is_training_bn,
+                name,
+                params,
+                data_format='channels_last',
+                use_projection=True):
+  """Creates one group of blocks for the ResNet model.
+
+  Args:
+    inputs: `Tensor` of size `[batch, channels, height, width]`.
+    filters: `int` number of filters for the first convolution of the layer.
+    block_fn: `function` for the block to use within the model
+    blocks: `int` number of blocks contained in the layer.
+    strides: `int` stride to use for the first convolution of the layer. If
+        greater than 1, this layer will downsample the input.
+    is_training_bn: `bool` for whether the model is training.
+    name: `str`name for the Tensor output of the block layer.
+    params: params of the model, a dict.
+    data_format: `str` either "channels_first" for `[batch, channels, height,
+        width]` or "channels_last for `[batch, height, width, channels]`.
+    use_projection: `bool` for whether this block should use a projection
+        shortcut (versus the default identity shortcut). This is usually `True`
+        for the first block of a block group, which may change the number of
+        filters and the resolution.
+
+  Returns:
+    The output `Tensor` of the block layer.
+  """
+  # Only the first block per block_group uses projection shortcut and strides.
+  inputs = block_fn(
+      inputs,
+      filters,
+      is_training_bn,
+      strides,
+      params,
+      use_projection=use_projection,
+      data_format=data_format)
+
+  for _ in range(1, blocks):
+    inputs = block_fn(
+        inputs, filters, is_training_bn, 1, params, data_format=data_format)
+
+  return tf.identity(inputs, name)
+
+
+def resnet_v1_generator(block_fn, layers, params, data_format='channels_last'):
+  """Generator of ResNet v1 model with classification layers removed.
+
+    Our actual ResNet network.  We return the output of c2, c3,c4,c5
+    N.B. batch norm is always run with trained parameters, as we use very small
+    batches when training the object layers.
+
+  Args:
+    block_fn: `function` for the block to use within the model. Either
+        `residual_block` or `bottleneck_block`.
+    layers: list of 4 `int`s denoting the number of blocks to include in each
+      of the 4 block groups. Each group consists of blocks that take inputs of
+      the same resolution.
+    params: params of the model, a dict.
+    data_format: `str` either "channels_first" for `[batch, channels, height,
+        width]` or "channels_last for `[batch, height, width, channels]`.
+
+  Returns:
+    Model `function` that takes in `inputs` and `is_training` and returns the
+    output `Tensor` of the ResNet model.
+  """
+  def model(inputs, is_training_bn=False):
+    """Creation of the model graph."""
+    inputs = conv2d_fixed_padding(
+          inputs=inputs,
+          filters=64,
+          kernel_size=7,
+          strides=2,
+          data_format=data_format)
+    inputs = tf.identity(inputs, 'initial_conv')
+    inputs = batch_norm_relu(
+        inputs, is_training_bn, params, data_format=data_format)
+
+    inputs = tf.layers.max_pooling2d(
+        inputs=inputs,
+        pool_size=3,
+        strides=2,
+        padding='SAME',
+        data_format=data_format)
+    inputs = tf.identity(inputs, 'initial_max_pool')
+
+    c2 = block_group(
+        inputs=inputs,
+        filters=64,
+        blocks=layers[0],
+        strides=1,
+        block_fn=block_fn,
+        is_training_bn=is_training_bn,
+        params=params,
+        name='block_group1',
+        data_format=data_format,
+        use_projection=False)
+    c3 = block_group(
+        inputs=c2,
+        filters=128,
+        blocks=layers[1],
+        strides=2,
+        block_fn=block_fn,
+        is_training_bn=is_training_bn,
+        params=params,
+        name='block_group2',
+        data_format=data_format)
+    c4 = block_group(
+        inputs=c3,
+        filters=256,
+        blocks=layers[2],
+        strides=1,
+        block_fn=block_fn,
+        is_training_bn=is_training_bn,
+        params=params,
+        name='block_group3',
+        data_format=data_format)
+    return c2, c3, c4
+
+  return model
+
+
+def resnet_v1(resnet_depth, params, data_format='channels_last'):
+  """Returns the ResNet model for a given size and number of output classes."""
+  model_params = {
+      34: {'block': residual_block, 'layers': [3, 4, 6, 3]}
+  }
+
+  if resnet_depth not in model_params:
+    raise ValueError('Not a valid resnet_depth:', resnet_depth)
+
+  resnet_params = model_params[resnet_depth]
+  return resnet_v1_generator(resnet_params['block'], resnet_params['layers'],
+                             params, data_format)
+
+
+def class_net(images, level, num_classes):
+  """Class prediction network for SSD."""
+  return tf.layers.conv2d(
+      images,
+      num_classes * ssd_constants.NUM_DEFAULTS_BY_LEVEL[level],
+      kernel_size=(3, 3),
+      padding='same',
+      activation=None,
+      name='class-%d' % (level),
+  )
+
+
+def box_net(images, level):
+  """Box regression network for SSD."""
+  return tf.layers.conv2d(
+      images,
+      4 * ssd_constants.NUM_DEFAULTS_BY_LEVEL[level],
+      kernel_size=(3, 3),
+      padding='same',
+      activation=None,
+      name='box-%d' % (level),
+  )
+
+
+def ssd(features, params, is_training_bn=False):
+  """SSD classification and regression model."""
+  # upward layers
+  with tf.variable_scope(
+      'resnet%s' % ssd_constants.RESNET_DEPTH, reuse=tf.AUTO_REUSE):
+    resnet_fn = resnet_v1(ssd_constants.RESNET_DEPTH, params)
+    _, _, u4 = resnet_fn(features, is_training_bn)
+
+  with tf.variable_scope('ssd', reuse=tf.AUTO_REUSE):
+    feats = {}
+    # output channels for mlperf logging.
+    out_channels = [256]
+    feats[3] = u4
+    feats[4] = tf.layers.conv2d(
+        feats[3],
+        filters=256,
+        kernel_size=(1, 1),
+        padding='same',
+        activation=tf.nn.relu,
+        name='block7-conv1x1')
+    feats[4] = tf.layers.conv2d(
+        feats[4],
+        filters=512,
+        strides=(2, 2),
+        kernel_size=(3, 3),
+        padding='same',
+        activation=tf.nn.relu,
+        name='block7-conv3x3')
+    out_channels.append(512)
+    feats[5] = tf.layers.conv2d(
+        feats[4],
+        filters=256,
+        kernel_size=(1, 1),
+        padding='same',
+        activation=tf.nn.relu,
+        name='block8-conv1x1')
+    feats[5] = tf.layers.conv2d(
+        feats[5],
+        filters=512,
+        strides=(2, 2),
+        kernel_size=(3, 3),
+        padding='same',
+        activation=tf.nn.relu,
+        name='block8-conv3x3')
+    out_channels.append(512)
+    feats[6] = tf.layers.conv2d(
+        feats[5],
+        filters=128,
+        kernel_size=(1, 1),
+        padding='same',
+        activation=tf.nn.relu,
+        name='block9-conv1x1')
+    feats[6] = tf.layers.conv2d(
+        feats[6],
+        filters=256,
+        strides=(2, 2),
+        kernel_size=(3, 3),
+        padding='same',
+        activation=tf.nn.relu,
+        name='block9-conv3x3')
+    out_channels.append(256)
+    feats[7] = tf.layers.conv2d(
+        feats[6],
+        filters=128,
+        kernel_size=(1, 1),
+        padding='same',
+        activation=tf.nn.relu,
+        name='block10-conv1x1')
+    feats[7] = tf.layers.conv2d(
+        feats[7],
+        filters=256,
+        kernel_size=(3, 3),
+        padding='valid',
+        activation=tf.nn.relu,
+        name='block10-conv3x3')
+    out_channels.append(256)
+    feats[8] = tf.layers.conv2d(
+        feats[7],
+        filters=128,
+        kernel_size=(1, 1),
+        padding='same',
+        activation=tf.nn.relu,
+        name='block11-conv1x1')
+    feats[8] = tf.layers.conv2d(
+        feats[8],
+        filters=256,
+        kernel_size=(3, 3),
+        padding='valid',
+        activation=tf.nn.relu,
+        name='block11-conv3x3')
+    out_channels.append(256)
+
+    class_outputs = {}
+    box_outputs = {}
+    min_level = ssd_constants.MIN_LEVEL
+    max_level = ssd_constants.MAX_LEVEL
+    num_classes = ssd_constants.NUM_CLASSES
+
+    with tf.variable_scope('class_net', reuse=tf.AUTO_REUSE):
+      for level in range(min_level, max_level + 1):
+        class_outputs[level] = class_net(
+            feats[level], level, num_classes)
+
+    with tf.variable_scope('box_net', reuse=tf.AUTO_REUSE):
+      for level in range(min_level, max_level + 1):
+        box_outputs[level] = box_net(
+            feats[level], level)
+
+  return class_outputs, box_outputs
@@ -0,0 +1,122 @@
+# Copyright 2018 Google. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Central location for all constants related to MLPerf SSD."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# ==============================================================================
+# == Model =====================================================================
+# ==============================================================================
+IMAGE_SIZE = 300
+SPACE_TO_DEPTH_BLOCK_SIZE = 2
+
+# TODO(taylorrobie): MLPerf uses 80, but COCO documents 90. (RetinaNet uses 90)
+# Update(taylorrobie): Labels > 81 show up in the pipeline. This will need to
+#                      be resolved.
+NUM_CLASSES = 81  # Including "no class". Not all COCO classes are used.
+
+# Note: Zero is special. (Background class) CLASS_INV_MAP[0] must be zero.
+# CLASS_INV_MAP[i] is the original label id for contiguous class index i.
+CLASS_INV_MAP = (
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+    22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
+    44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+    64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87,
+    88, 89, 90)
+# Inverse lookup: CLASS_MAP[label_id] is the contiguous class index, or -1 for
+# label ids that do not appear in CLASS_INV_MAP.
+_MAP = {j: i for i, j in enumerate(CLASS_INV_MAP)}
+CLASS_MAP = tuple(_MAP.get(i, -1) for i in range(max(CLASS_INV_MAP) + 1))
+
+# Equals sum(size * size * defaults) over FEATURE_SIZES and NUM_DEFAULTS below.
+NUM_SSD_BOXES = 8732
+
+RESNET_DEPTH = 34
+
+"""SSD specific"""
+# Feature levels that receive a class head and a box head (inclusive range).
+MIN_LEVEL = 3
+MAX_LEVEL = 8
+
+FEATURE_SIZES = (38, 19, 10, 5, 3, 1)
+STEPS = (8, 16, 32, 64, 100, 300)
+
+# https://github.com/amdegroot/ssd.pytorch/blob/master/data/config.py
+SCALES = (21, 45, 99, 153, 207, 261, 315)
+ASPECT_RATIOS = ((2,), (2, 3), (2, 3), (2, 3), (2,), (2,))
+NUM_DEFAULTS = (4, 6, 6, 6, 4, 4)
+# Same counts as NUM_DEFAULTS, keyed by feature level MIN_LEVEL..MAX_LEVEL.
+NUM_DEFAULTS_BY_LEVEL = {3: 4, 4: 6, 5: 6, 6: 6, 7: 4, 8: 4}
+SCALE_XY = 0.1
+SCALE_HW = 0.2
+BOX_CODER_SCALES = (1 / SCALE_XY, 1 / SCALE_XY, 1 / SCALE_HW, 1 / SCALE_HW)
+MATCH_THRESHOLD = 0.5
+
+# https://discuss.pytorch.org/t/how-to-preprocess-input-for-pre-trained-networks/683
+NORMALIZATION_MEAN = (0.485, 0.456, 0.406)
+NORMALIZATION_STD = (0.229, 0.224, 0.225)
+
+# SSD Cropping
+NUM_CROP_PASSES = 50
+CROP_MIN_IOU_CHOICES = (0, 0.1, 0.3, 0.5, 0.7, 0.9)
+# One extra "no crop" outcome alongside the IoU choices, i.e. 1 / 7.
+P_NO_CROP_PER_PASS = 1 / (len(CROP_MIN_IOU_CHOICES) + 1)
+
+# Hard example mining
+NEGS_PER_POSITIVE = 3
+
+# Batch normalization
+BATCH_NORM_DECAY = 0.9
+BATCH_NORM_EPSILON = 1e-5
+
+
+# ==============================================================================
+# == Optimizer =================================================================
+# ==============================================================================
+BASE_LEARNING_RATE = 3.0e-3
+FIRST_LR_DROP_STEP = 160000  # This parameter has no effect.
+SECOND_LR_DROP_STEP = 200000  # This parameter has no effect.
+MOMENTUM = 0.9
+WEIGHT_DECAY = 5e-4
+DEFAULT_BATCH_SIZE = 32.0
+
+# ==============================================================================
+# == Keys ======================================================================
+# ==============================================================================
+BOXES = "boxes"
+CLASSES = "classes"
+NUM_MATCHED_BOXES = "num_matched_boxes"
+IMAGE = "image"
+SOURCE_ID = "source_id"
+RAW_SHAPE = "raw_shape"
+IS_PADDED = "is_padded"
+
+
+# ==============================================================================
+# == Evaluation ================================================================
+# ==============================================================================
+
+# Note: This is based on a batch size of 32
+# https://github.com/mlperf/reference/blob/master/single_stage_detector/ssd/train.py#L21-L37  # pylint: disable=line-too-long
+EVAL_SAMPLES = 5000
+CHECKPOINT_FREQUENCY = 5000
+MAX_NUM_EVAL_BOXES = 200
+OVERLAP_CRITERIA = 0.5  # Used for nonmax suppression
+MIN_SCORE = 0.05  # Minimum score to be considered during evaluation.
+DUMMY_SCORE = -1e5  # If no boxes are matched.
+# Eval step intervals starting from 0
+#EVAL_STEPS = (24000, 24000, 24000, 24000, 24000,24000, 24000, 24000, 24000, 24000)
+EVAL_STEPS = (432000,)
+# Target COCO/AP for mlperf.
+EVAL_TARGET = 0.24
+
+# For multiprocessing.
+QUEUE_SIZE = 24
+WORKER_COUNT = 10
@@ -0,0 +1,309 @@
+# Copyright 2018 Google. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Training script for SSD.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import multiprocessing
+import os
+
+import sys
+sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)),'../../../../utils/atlasboost'))
+
+import threading
+from absl import app
+import numpy as np
+import tensorflow as tf
+
+from npu_bridge.estimator import npu_ops
+from tensorflow.core.protobuf import rewriter_config_pb2
+from npu_bridge.estimator.npu.npu_config import NPURunConfig
+from npu_bridge.estimator.npu.npu_estimator  import NPUEstimator
+
+import coco_metric
+import dataloader
+import ssd_constants
+import ssd_model
+
+
+def get_rank_size():
+    return int(os.environ['RANK_SIZE'])
+from benchmark_log import hwlog
+from benchmark_log.basic_utils import get_environment_info
+from benchmark_log.basic_utils import get_model_parameter
+# Command-line flags for the SSD training/evaluation script.
+tf.flags.DEFINE_string('model_dir', None, 'Location of model_dir')
+tf.flags.DEFINE_string('resnet_checkpoint', '',
+                       'Location of the ResNet checkpoint to use for model '
+                       'initialization.')
+tf.flags.DEFINE_integer('train_batch_size', 64, 'training batch size')
+tf.flags.DEFINE_integer('eval_batch_size', 1, 'evaluation batch size')
+tf.flags.DEFINE_integer('eval_samples', 5000, 'The number of samples for '
+                                              'evaluation.')
+tf.flags.DEFINE_string(
+    'training_file_pattern', None,
+    'Glob for training data files (e.g., COCO train - minival set)')
+tf.flags.DEFINE_string(
+    'validation_file_pattern', None,
+    'Glob for evaluation tfrecords (e.g., COCO val2017 set)')
+tf.flags.DEFINE_string(
+    'val_json_file',
+    None,
+    'COCO validation JSON containing golden bounding boxes.')
+tf.flags.DEFINE_integer('num_examples_per_epoch', 120000,
+                        'Number of examples in one epoch')
+# In this script num_epochs is only read when --mode=train; train_and_eval
+# takes its step counts from ssd_constants.EVAL_STEPS instead.
+tf.flags.DEFINE_float('num_epochs', 58, 'Number of epochs for training')
+
+# Selects which branch of main() runs.
+tf.flags.DEFINE_string('mode', 'train_and_eval',
+                       'Mode to run: train_and_eval, train, eval')
+
+tf.flags.DEFINE_integer(
+    'keep_checkpoint_max', 32,
+    'Maximum number of checkpoints to keep.')
+
+
+FLAGS = tf.flags.FLAGS
+
+# Set to True by coco_eval() once the 'COCO/AP' metric reaches
+# ssd_constants.EVAL_TARGET; main() checks it to leave the train_and_eval loop.
+SUCCESS = False
+
+
+def construct_run_config():
+    """Construct the run config."""
+
+    # Parse hparams
+    hparams = ssd_model.default_hparams()
+
+    params = dict(
+        hparams.values(),
+        num_examples_per_epoch=FLAGS.num_examples_per_epoch,
+        resnet_checkpoint=FLAGS.resnet_checkpoint,
+        val_json_file=FLAGS.val_json_file,
+        mode=FLAGS.mode,
+        model_dir=FLAGS.model_dir,
+        eval_samples=FLAGS.eval_samples,
+    )
+
+    return NPURunConfig(
+        model_dir=FLAGS.model_dir,
+        session_config=tf.ConfigProto(),
+        keep_checkpoint_max=FLAGS.keep_checkpoint_max,
+        save_checkpoints_steps=ssd_constants.CHECKPOINT_FREQUENCY,
+        enable_data_pre_proc=True,
+        save_summary_steps=100,
+        iterations_per_loop=100,
+        precision_mode='allow_mix_precision'
+      ), params
+
+def coco_eval(predictions,
+              current_step,
+              summary_writer,
+              coco_gt,
+              use_cpp_extension=True,
+              nms_on_tpu=True):
+    """Call the coco library to get the eval metrics."""
+    global SUCCESS
+    eval_results = coco_metric.compute_map(
+        predictions,
+        coco_gt,
+        use_cpp_extension=use_cpp_extension,
+        nms_on_tpu=nms_on_tpu)
+    if eval_results['COCO/AP'] >= ssd_constants.EVAL_TARGET and not SUCCESS:
+        SUCCESS = True
+    tf.logging.info('Eval results: %s' % eval_results)
+    hwlog.remark_print(key=hwlog.EVAL_RESULTS, value=eval_results)
+    # Write out eval results for the checkpoint.
+    with tf.Graph().as_default():
+        summaries = []
+        for metric in eval_results:
+            summaries.append(
+                tf.Summary.Value(tag=metric, simple_value=eval_results[metric]))
+        tf_summary = tf.Summary(value=list(summaries))
+        summary_writer.add_summary(tf_summary, current_step)
+
+def init_npu():
+   """Initialize npu manually.
+   Returns:
+     `init_sess` npu  init session config.
+     `npu_init` npu  init ops.
+   """
+   npu_init = npu_ops.initialize_system()
+   config = tf.ConfigProto()
+
+   #npu mix precision attribute set to true when using mix precision
+   config.graph_options.rewrite_options.remapping = rewriter_config_pb2.RewriterConfig.OFF
+   custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
+   custom_op.name = "NpuOptimizer"
+   custom_op.parameter_map["use_off_line"].b = True
+
+   init_sess = tf.Session(config=config)
+   return init_sess,npu_init
+
+def main(argv):
+    init_sess, npu_init = init_npu()
+    init_sess.run(npu_init)
+
+    del argv  # Unused.
+    global SUCCESS
+
+    # Check data path
+    if FLAGS.mode in ('train',
+                      'train_and_eval') and FLAGS.training_file_pattern is None:
+        raise RuntimeError('You must specify --training_file_pattern for training.')
+    if FLAGS.mode in ('train_and_eval', 'eval'):
+        if FLAGS.validation_file_pattern is None:
+            raise RuntimeError('You must specify --validation_file_pattern '
+                               'for evaluation.')
+        if FLAGS.val_json_file is None:
+            raise RuntimeError('You must specify --val_json_file for evaluation.')
+
+    run_config, params = construct_run_config()
+
+    if FLAGS.mode == 'train':
+        train_params = dict(params)
+        hwlog.remark_print(key=hwlog.CURRENT_EPOCH, value=train_params['num_examples_per_epoch'])
+        train_params['batch_size'] = FLAGS.train_batch_size
+        train_estimator = NPUEstimator(
+            model_fn=ssd_model.ssd_model_fn,
+            model_dir=FLAGS.model_dir,
+            config=run_config,
+            params=train_params)
+
+        tf.logging.info(params)
+
+        train_estimator.train(
+            input_fn=dataloader.SSDInputReader(
+                FLAGS.training_file_pattern,
+                params['transpose_input'],
+                is_training=True),
+            steps=int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
+                      FLAGS.train_batch_size / get_rank_size()))
+
+    elif FLAGS.mode == 'train_and_eval':
+        output_dir = os.path.join(FLAGS.model_dir, 'eval')
+        tf.gfile.MakeDirs(output_dir)
+        # Summary writer writes out eval metrics.
+        summary_writer = tf.summary.FileWriter(output_dir)
+
+        current_step = 0
+
+        coco_gt = coco_metric.create_coco(
+            FLAGS.val_json_file, use_cpp_extension=params['use_cocoeval_cc'])
+        for eval_step in ssd_constants.EVAL_STEPS:
+            # Compute the actual eval steps based on the actural train_batch_size
+            steps = int(eval_step / get_rank_size() * ssd_constants.DEFAULT_BATCH_SIZE /
+                        FLAGS.train_batch_size)
+            print('###################################', steps)
+
+            tf.logging.info('Starting training cycle for %d steps.' % steps)
+            run_config, params = construct_run_config()
+
+            train_params = dict(params)
+            hwlog.remark_print(key=hwlog.CURRENT_EPOCH, value=train_params['num_examples_per_epoch'])
+            train_params['batch_size'] = FLAGS.train_batch_size
+            train_estimator = NPUEstimator(
+                model_fn=ssd_model.ssd_model_fn,
+                model_dir=FLAGS.model_dir,
+                config=run_config,
+                params=train_params)
+            tf.logging.info(params)
+            train_estimator.train(
+                input_fn=dataloader.SSDInputReader(
+                    FLAGS.training_file_pattern,
+                    params['transpose_input'],
+                    is_training=True),
+                steps=steps)
+
+            if SUCCESS:
+                break
+
+            current_step = current_step + steps
+
+            tf.logging.info('Starting evaluation cycle at step %d.' % current_step)
+            # Run evaluation at the given step.
+            eval_params = dict(params)
+            eval_params['batch_size'] = FLAGS.eval_batch_size
+            eval_estimator = NPUEstimator(
+                model_fn=ssd_model.ssd_model_fn,
+                model_dir=FLAGS.model_dir,
+                config=run_config,
+                params=eval_params)
+
+            predictions = list(
+                eval_estimator.predict(
+                    input_fn=dataloader.SSDInputReader(
+                        FLAGS.validation_file_pattern,
+                        is_training=False)))
+
+            coco_eval(predictions, current_step, summary_writer, coco_gt, params['use_cocoeval_cc'], False)
+        summary_writer.close()
+
+    elif FLAGS.mode == 'eval':
+        coco_gt = coco_metric.create_coco(
+            FLAGS.val_json_file, use_cpp_extension=params['use_cocoeval_cc'])
+        eval_params = dict(params)
+        eval_params['batch_size'] = FLAGS.eval_batch_size
+        eval_estimator = NPUEstimator(
+            model_fn=ssd_model.ssd_model_fn,
+            model_dir=FLAGS.model_dir,
+            config=run_config,
+            params=eval_params)
+
+        output_dir = os.path.join(FLAGS.model_dir, 'eval')
+        tf.gfile.MakeDirs(output_dir)
+        # Summary writer writes out eval metrics.
+        summary_writer = tf.summary.FileWriter(output_dir)
+        ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
+        tf.logging.info('Starting to evaluate on newest checkpoint.')
+        predictions = list(
+            eval_estimator.predict(
+                checkpoint_path=ckpt,
+                input_fn=dataloader.SSDInputReader(
+                    FLAGS.validation_file_pattern,
+                    is_training=False)))
+        tf.logging.info('Starting to cal coco ap.')
+        current_step = int(os.path.basename(ckpt).split('-')[1])
+
+        coco_eval(predictions, current_step, summary_writer, coco_gt,
+                  params['use_cocoeval_cc'], False)
+
+        tf.logging.info('end to evaluate.')
+
+        summary_writer.close()
+
+    npu_shutdown = npu_ops.shutdown_system()
+    init_sess.run(npu_shutdown)
+    init_sess.close()
+
+if __name__ == '__main__':
+    # Set hwlog.ROOT_DIR to the directory that contains this script.
+    hwlog.ROOT_DIR = os.path.split(os.path.abspath(__file__))[0]
+    cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("tensorflow")
+    config_info = get_model_parameter("tensorflow_config")
+    # NOTE(review): these values are hard-coded and are not derived from the
+    # flags or from ssd_constants (e.g. ssd_constants.BASE_LEARNING_RATE is
+    # 3.0e-3) -- confirm they describe this SSD run.
+    initinal_data = {"base_lr": 0.01, "dataset": "imagenet1024", "optimizer": "SGD", "loss_scale": 512}
+    # Report environment and run metadata through the benchmark logger before
+    # the flags are parsed by app.run().
+    hwlog.remark_print(key=hwlog.CPU_INFO, value=cpu_info)
+    hwlog.remark_print(key=hwlog.NPU_INFO, value=npu_info)
+    hwlog.remark_print(key=hwlog.OS_INFO, value=os_info)
+    hwlog.remark_print(key=hwlog.FRAMEWORK_INFO, value=framework_info)
+    hwlog.remark_print(key=hwlog.BENCHMARK_VERSION, value=benchmark_version)
+    hwlog.remark_print(key=hwlog.CONFIG_INFO, value=config_info)
+    hwlog.remark_print(key=hwlog.BASE_LR, value=initinal_data.get("base_lr"))
+    hwlog.remark_print(key=hwlog.DATASET, value=initinal_data.get("dataset"))
+    hwlog.remark_print(key=hwlog.OPT_NAME, value=initinal_data.get("optimizer"))
+    hwlog.remark_print(key=hwlog.LOSS_SCALE, value=initinal_data.get("loss_scale"))
+    # NOTE(review): `initinal_data` has no "batchsize" key, so .get() returns
+    # None and None is what gets reported as the input batch size.
+    hwlog.remark_print(key=hwlog.INPUT_BATCH_SIZE, value=initinal_data.get("batchsize"))
+    tf.logging.set_verbosity(tf.logging.INFO)
+    app.run(main)
@@ -0,0 +1,500 @@
+# Copyright 2018 Google. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model definition for the SSD Model.
+
+Defines model_fn of SSD for TF Estimator. The model_fn includes SSD
+model architecture, loss function, learning rate schedule, and evaluation
+procedure.
+
+T.-Y. Lin, P. Goyal, R. Girshick, K. He, and P. Dollar
+Focal Loss for Dense Object Detection. arXiv:1708.02002
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import tensorflow as tf
+
+from object_detection import box_coder
+from object_detection import box_list
+from object_detection import faster_rcnn_box_coder
+
+from tensorflow.python.estimator import model_fn as model_fn_lib
+
+import dataloader
+import ssd_architecture
+import ssd_constants
+from benchmark_log import hwlog
+from benchmark_log.basic_utils import get_environment_info
+from benchmark_log.basic_utils import get_model_parameter
+
+
+def get_rank_size():
+    return int(os.environ['RANK_SIZE'])
+
+def select_top_k_scores(scores_in, pre_nms_num_detections=5000):
+  """Select top_k scores and indices for each class.
+
+  Args:
+    scores_in: a Tensor with shape [batch_size, N, num_classes], which stacks
+      class logit outputs on all feature levels. The N is the number of total
+      anchors on all levels. The num_classes is the number of classes predicted
+      by the model.
+    pre_nms_num_detections: Number of candidates before NMS.
+
+  Returns:
+    scores and indices: Tensors with shape [batch_size, pre_nms_num_detections,
+      num_classes].
+  """
+  scores_trans = tf.transpose(scores_in, perm=[0, 2, 1])
+
+  top_k_scores, top_k_indices = tf.nn.top_k(
+      scores_trans, k=pre_nms_num_detections, sorted=True)
+
+  return tf.transpose(top_k_scores, [0, 2, 1]), tf.transpose(
+      top_k_indices, [0, 2, 1])
+
+
+def concat_outputs(cls_outputs, box_outputs):
+  """Concatenate predictions into a single tensor.
+
+  This function takes the dicts of class and box prediction tensors and
+  concatenates them into a single tensor for comparison with the ground truth
+  boxes and class labels.
+  Args:
+    cls_outputs: an OrderedDict with keys representing levels and values
+      representing logits in [batch_size, height, width,
+      num_anchors * num_classes].
+    box_outputs: an OrderedDict with keys representing levels and values
+      representing box regression targets in
+      [batch_size, height, width, num_anchors * 4].
+  Returns:
+    concatenated cls_outputs and box_outputs, each joined along axis 1.
+  """
+  assert set(cls_outputs.keys()) == set(box_outputs.keys())
+
+  # This sort matters. The labels assume a certain order based on
+  # ssd_constants.FEATURE_SIZES, and this sort matches that convention.
+  keys = sorted(cls_outputs.keys())
+  batch_size = int(cls_outputs[keys[0]].shape[0])
+
+  flat_cls = []
+  flat_box = []
+
+  for i, k in enumerate(keys):
+    # TODO(taylorrobie): confirm that this reshape, transpose,
+    # reshape is correct.
+    scale = ssd_constants.FEATURE_SIZES[i] # feature-map size of this level, e.g. 38, 19, 10, 5, 3, 1
+    split_shape = (ssd_constants.NUM_DEFAULTS[i], ssd_constants.NUM_CLASSES) # e.g. (4, 81), (6, 81), ...
+    assert cls_outputs[k].shape[3] == split_shape[0] * split_shape[1]
+    intermediate_shape = (batch_size, scale, scale) + split_shape # e.g. (32, 38, 38) + (4, 81) = (32, 38, 38, 4, 81)
+    final_shape = (batch_size, scale ** 2 * split_shape[0], split_shape[1]) # e.g. (32, 38^2 * 4, 81)
+    flat_cls.append(tf.reshape(tf.reshape(
+        cls_outputs[k], intermediate_shape), final_shape))
+
+    split_shape = (ssd_constants.NUM_DEFAULTS[i], 4) # e.g. (4, 4), (6, 4), ...
+    assert box_outputs[k].shape[3] == split_shape[0] * split_shape[1]
+    intermediate_shape = (batch_size, scale, scale) + split_shape # e.g. (32, 19, 19) + (6, 4); to avoid ambiguity this example uses the second default-box level
+    final_shape = (batch_size, scale ** 2 * split_shape[0], split_shape[1]) # e.g. (32, 19^2 * 6, 4)
+    flat_box.append(tf.reshape(tf.reshape(
+        box_outputs[k], intermediate_shape), final_shape))
+
+  return tf.concat(flat_cls, axis=1), tf.concat(flat_box, axis=1)
+
+
+def _localization_loss(pred_locs, gt_locs, gt_labels, num_matched_boxes):
+  """Computes the localization loss.
+
+  Computes the localization loss using smooth l1 loss.
+  Args:
+    pred_locs: a dict from index to tensor of predicted locations. The shape
+      of each tensor is [batch_size, num_anchors, 4].
+    gt_locs: a list of tensors representing box regression targets in
+      [batch_size, num_anchors, 4].
+    gt_labels: a list of tensors that represents the classification groundtruth
+      targets. The shape is [batch_size, num_anchors, 1].
+    num_matched_boxes: the number of anchors that are matched to a groundtruth
+      targets, used as the loss normalizater. The shape is [batch_size].
+  Returns:
+    box_loss: a float32 representing total box regression loss.
+  """
+  keys = sorted(pred_locs.keys())
+  box_loss = 0
+  for i, k in enumerate(keys):
+    gt_label = gt_labels[i]
+    gt_loc = gt_locs[i]
+    pred_loc = tf.reshape(pred_locs[k], gt_loc.shape)
+    mask = tf.greater(gt_label, 0)
+    float_mask = tf.cast(mask, tf.float32)
+
+    smooth_l1 = tf.reduce_sum(
+        tf.losses.huber_loss(
+            gt_loc, pred_loc, reduction=tf.losses.Reduction.NONE),
+        axis=-1)
+    smooth_l1 = tf.multiply(smooth_l1, float_mask)
+    box_loss = box_loss + tf.reduce_sum(
+        smooth_l1, axis=list(range(1, smooth_l1.shape.ndims)))
+
+  # TODO(taylorrobie): Confirm that normalizing by the number of boxes matches
+  # reference
+  return tf.reduce_mean(box_loss / num_matched_boxes)
+
+
+@tf.custom_gradient
+def _softmax_cross_entropy(logits, label):
+  """Helper function to compute softmax cross entropy loss.
+
+  Computes log-sum-exp of the logits minus the logit of the labelled class,
+  after subtracting the maximum over the last axis, and supplies a
+  hand-written gradient through `tf.custom_gradient`.
+
+  Args:
+    logits: tensor of class logits; the class axis is the last one.
+    label: class ids, one-hot encoded to ssd_constants.NUM_CLASSES classes
+      (assumes its shape equals logits.shape[:-1] -- TODO confirm).
+
+  Returns:
+    A `(loss, grad)` pair as expected by `tf.custom_gradient`; `loss` has the
+    last (class) axis reduced away.
+  """
+  # Subtract the max over the class axis before exponentiating.
+  shifted_logits = logits - tf.expand_dims(tf.reduce_max(logits, -1), -1)
+  exp_shifted_logits = tf.math.exp(shifted_logits)
+  sum_exp = tf.reduce_sum(exp_shifted_logits, -1)
+  log_sum_exp = tf.math.log(sum_exp)
+  one_hot_label = tf.one_hot(label, ssd_constants.NUM_CLASSES)
+  # From here on `shifted_logits` holds only the labelled class's logit.
+  shifted_logits = tf.reduce_sum(shifted_logits * one_hot_label, -1)
+  loss = log_sum_exp - shifted_logits
+
+  def grad(dy):
+    # Gradient w.r.t. logits is (softmax - one_hot) * dy; `dy` itself is
+    # returned unchanged as the entry for `label`.
+    return (exp_shifted_logits / tf.expand_dims(sum_exp, -1) -
+            one_hot_label) * tf.expand_dims(dy, -1), dy
+
+  return loss, grad
+
+
+def _classification_loss(pred_labels, gt_labels, num_matched_boxes):
+  """Computes the classification loss.
+
+  Computes the classification loss with hard negative mining.
+  Args:
+    pred_labels: a dict from index to tensor of predicted class. The shape
+      of the tensor is [batch_size, num_anchors, num_classes].
+    gt_labels: a list of tensor that represents the classification groundtruth
+      targets. The shape is [batch_size, num_anchors, 1].
+    num_matched_boxes: the number of anchors that are matched to a groundtruth
+      targets. This is used as the loss normalizer.
+  Returns:
+    class_loss: the mean of the per-example classification loss divided by
+      num_matched_boxes.
+  """
+  keys = sorted(pred_labels.keys())
+  batch_size = gt_labels[0].shape[0]
+  cross_entropy = []
+  for i, k in enumerate(keys):
+    gt_label = gt_labels[i]
+    pred_label = tf.reshape(
+        pred_labels[k],
+        gt_label.get_shape().as_list() + [ssd_constants.NUM_CLASSES])
+    cross_entropy.append(
+        tf.reshape(
+            _softmax_cross_entropy(pred_label, gt_label), [batch_size, -1]))
+
+
+  # Put the rest of the loss computation on one device to avoid excessive
+  # communication inside topk_mask with spatial partition
+  #with tf.device(tf.contrib.tpu.core(0)):
+  cross_entropy = tf.concat(cross_entropy, 1)
+  gt_label = tf.concat([tf.reshape(l, [batch_size, -1]) for l in gt_labels],
+                         1)
+  # Anchors with a label greater than 0 are the positives.
+  mask = tf.greater(gt_label, 0)
+  float_mask = tf.cast(mask, tf.float32)
+
+  # Hard example mining: zero the loss of positives so that only negatives
+  # compete below.
+  neg_masked_cross_entropy = cross_entropy * (1 - float_mask)
+
+
+  # The negative losses are collected in descending order by three top_k
+  # passes (4096 + 4096 + 540 = 8732 values). After each pass, entries that
+  # are not below that pass's smallest value are zeroed before the next pass.
+  # NOTE(review): presumably 8732 is the total anchor count and the chunking
+  # works around a top_k size limit on the target device -- confirm.
+  value1, _ = tf.math.top_k(neg_masked_cross_entropy, k=4096)
+  kth1 = tf.reduce_min(value1, 1, keepdims=True)
+  mask1 = tf.cast(tf.less(neg_masked_cross_entropy, kth1), tf.float32)
+
+  value2, _ = tf.math.top_k(tf.multiply(neg_masked_cross_entropy, mask1), k=4096)
+  kth2 = tf.reduce_min(value2, 1, keepdims=True)
+  mask2 = tf.cast(tf.less(neg_masked_cross_entropy, kth2), tf.float32)
+
+  value3, _ = tf.math.top_k(tf.multiply(neg_masked_cross_entropy, mask2), k=540)
+
+  value = tf.concat([value1, value2, value3], axis=1)
+
+  # Number of negatives per example: NEGS_PER_POSITIVE times the positives,
+  # capped at 8731, the last valid index into `value`.
+  num_neg_boxes = tf.minimum(
+          tf.to_int32(num_matched_boxes) * ssd_constants.NEGS_PER_POSITIVE, 8731)
+  # Loss value found at index num_neg_boxes of the collected values; every
+  # anchor whose negative loss is at least this large is selected.
+  large_neg_ce = tf.batch_gather(value, num_neg_boxes[:, tf.newaxis])
+  top_k_neg_mask = tf.cast(tf.greater_equal(neg_masked_cross_entropy, large_neg_ce), tf.float32)
+
+
+
+  # Sum the cross entropy over positives plus the selected negatives.
+  class_loss = tf.reduce_sum(
+        tf.multiply(cross_entropy, float_mask + top_k_neg_mask), axis=1)
+
+
+  # TODO(taylorrobie): Confirm that normalizing by the number of boxes matches
+  # reference
+  return tf.reduce_mean(class_loss / num_matched_boxes)
+
+
+def detection_loss(cls_outputs, box_outputs, labels):
+  """Computes total detection loss.
+
+  Computes total detection loss including box and class loss from all levels.
+  Args:
+    cls_outputs: an OrderDict with keys representing levels and values
+      representing logits in [batch_size, height, width, num_anchors].
+    box_outputs: an OrderDict with keys representing levels and values
+      representing box regression targets in
+      [batch_size, height, width, num_anchors * 4].
+    labels: the dictionary that returned from dataloader that includes
+      groundturth targets.
+  Returns:
+    total_loss: a float32 representing total loss reducing from class and box
+      losses from all levels.
+    cls_loss: a float32 representing total class loss.
+    box_loss: a float32 representing total box regression loss.
+  """
+  if isinstance(labels[ssd_constants.BOXES], dict):
+    gt_boxes = list(labels[ssd_constants.BOXES].values())
+    gt_classes = list(labels[ssd_constants.CLASSES].values())
+  else:
+    gt_boxes = [labels[ssd_constants.BOXES]]
+    gt_classes = [labels[ssd_constants.CLASSES]]
+    cls_outputs, box_outputs = concat_outputs(cls_outputs, box_outputs)
+    cls_outputs = {'flatten': cls_outputs}
+    box_outputs = {'flatten': box_outputs}
+
+  box_loss = _localization_loss(box_outputs, gt_boxes, gt_classes,
+                                labels[ssd_constants.NUM_MATCHED_BOXES])
+  class_loss = _classification_loss(cls_outputs, gt_classes,
+                                    labels[ssd_constants.NUM_MATCHED_BOXES])
+
+  return class_loss + box_loss, class_loss, box_loss
+
+
+def update_learning_rate_schedule_parameters(params):
+  """Updates params that are related to the learning rate schedule.
+
+  Args:
+    params: a parameter dictionary that includes learning_rate, lr_warmup_epoch,
+      first_lr_drop_epoch, and second_lr_drop_epoch.
+  """
+  batch_size = params['batch_size']
+  # Learning rate is proportional to the batch size
+  steps_per_epoch = params['num_examples_per_epoch'] / batch_size // get_rank_size()
+  params['lr_warmup_step'] = int(params['lr_warmup_epoch'] * steps_per_epoch)
+  params['cos_decay_step'] = int(
+      params['cos_decay_epoch'] * steps_per_epoch)
+
+
+def learning_rate_schedule(params, global_step):
+  """Handles learning rate scaling, linear warmup, and learning rate decay.
+
+  Args:
+    params: A dictionary that defines hyperparameters of model.
+    global_step: A tensor representing current global step.
+
+  Returns:
+    A tensor representing current learning rate.
+  """
+  base_learning_rate = params['base_learning_rate']
+  lr_warmup_step = params['lr_warmup_step']
+  cos_decay_step = params['cos_decay_step']
+  batch_size = params['batch_size']
+  scaling_factor = get_rank_size() * batch_size / ssd_constants.DEFAULT_BATCH_SIZE
+  adjusted_learning_rate = base_learning_rate * scaling_factor
+  learning_rate = (tf.cast(global_step, dtype=tf.float32) /
+                   lr_warmup_step) * adjusted_learning_rate
+
+  learning_rate = tf.where(global_step < lr_warmup_step, learning_rate,
+                           tf.train.cosine_decay(adjusted_learning_rate, global_step, cos_decay_step, alpha=0.01))
+
+  return learning_rate
+
+
+class ExamplesPerSecondHook(tf.train.SessionRunHook):
+  def __init__(
+      self,
+      batch_size,
+      lr=0,
+      loss=0,
+      every_n_steps=100,
+      every_n_secs=None,):
+
+
+    if (every_n_steps is None) == (every_n_secs is None):
+      raise ValueError('exactly one of every_n_steps'
+                       ' and every_n_secs should be provided.')
+
+    self._timer = tf.train.SecondOrStepTimer(
+        every_steps=every_n_steps, every_secs=every_n_secs)
+
+    self._step_train_time = 0
+    self._total_steps = 0
+    self._batch_size = batch_size
+    self._lr = lr
+    self._loss = loss
+
+  def begin(self):
+    self._global_step_tensor = tf.compat.v1.train.get_global_step()
+    if self._global_step_tensor is None:
+      raise RuntimeError(
+          'Global step should be created to use StepCounterHook.')
+
+  def before_run(self, run_context):  # pylint: disable=unused-argument
+    return tf.train.SessionRunArgs([self._global_step_tensor, self._lr, self._loss])
+
+  def after_run(self, run_context, run_values):
+    _ = run_context
+
+    global_step, lr, loss = run_values.results
+    if self._timer.should_trigger_for_step(global_step):
+
+      elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(
+          global_step)
+      if elapsed_time is not None:
+        steps_per_sec = elapsed_steps / elapsed_time
+        self._step_train_time += elapsed_time
+        self._total_steps += elapsed_steps
+
+        current_examples_per_sec = steps_per_sec * self._batch_size
+        tf.logging.info('%s: %g, %s: %s, %s: %s', 'FPS', current_examples_per_sec, 'learning rate', lr, 'loss', loss)
+        hwlog.remark_print(key=hwlog.FPS, value='%7.1f' % current_examples_per_sec)
+
+
+
+def _model_fn(features, labels, mode, params, model):
+  """Model defination for the SSD model based on ResNet-50.
+
+  Args:
+    features: the input image tensor with shape [batch_size, height, width, 3].
+      The height and width are fixed and equal.
+    labels: the input labels in a dictionary. The labels include class targets
+      and box targets which are dense label maps. The labels are generated from
+      get_input_fn function in data/dataloader.py
+    mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
+    params: the dictionary defines hyperparameters of model. The default
+      settings are in default_hparams function in this file.
+    model: the SSD model outputs class logits and box regression outputs.
+
+  Returns:
+    spec: the EstimatorSpec or TPUEstimatorSpec to run training, evaluation,
+      or prediction.
+  """
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    labels = features
+    features = labels.pop('image')
+
+  features -= tf.constant(
+        ssd_constants.NORMALIZATION_MEAN, shape=[1, 1, 3], dtype=features.dtype)
+
+  features /= tf.constant(
+        ssd_constants.NORMALIZATION_STD, shape=[1, 1, 3], dtype=features.dtype)
+
+  def _model_outputs():
+    return model(
+        features, params, is_training_bn=(mode == tf.estimator.ModeKeys.TRAIN))
+
+
+  cls_outputs, box_outputs = _model_outputs()
+
+  # First check if it is in PREDICT mode.
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    flattened_cls, flattened_box = concat_outputs(cls_outputs, box_outputs)
+    ssd_box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
+        scale_factors=ssd_constants.BOX_CODER_SCALES)
+
+    anchors = box_list.BoxList(
+        tf.convert_to_tensor(dataloader.DefaultBoxes()('ltrb')))
+
+    decoded_boxes = box_coder.batch_decode(
+        encoded_boxes=flattened_box, box_coder=ssd_box_coder, anchors=anchors)
+
+    pred_scores = tf.nn.softmax(flattened_cls, axis=2)
+
+    pred_scores, indices = select_top_k_scores(pred_scores,
+                                               ssd_constants.MAX_NUM_EVAL_BOXES)
+
+    predictions = dict(
+          labels,
+          indices=indices,
+          pred_scores=pred_scores,
+          pred_box=decoded_boxes,
+    )
+
+    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
+
+  # Load pretrained model from checkpoint.
+  if params['resnet_checkpoint'] and mode == tf.estimator.ModeKeys.TRAIN:
+
+    def scaffold_fn():
+      """Loads pretrained model through scaffold function."""
+      tf.train.init_from_checkpoint(params['resnet_checkpoint'], {
+          '/': 'resnet%s/' % ssd_constants.RESNET_DEPTH,
+      })
+      return tf.train.Scaffold()
+  else:
+    scaffold_fn = None
+
+  # Set up training loss and learning rate.
+  update_learning_rate_schedule_parameters(params)
+  global_step = tf.train.get_or_create_global_step()
+  learning_rate = learning_rate_schedule(params, global_step)
+  # cls_loss and box_loss are for logging. only total_loss is optimized.
+  total_loss, cls_loss, box_loss = detection_loss(
+      cls_outputs, box_outputs, labels)
+
+  total_loss += params['weight_decay'] * tf.add_n(
+      [tf.nn.l2_loss(v) for v in tf.trainable_variables()])
+
+  if mode == tf.estimator.ModeKeys.TRAIN:
+    total_loss_t = tf.reduce_mean(tf.reshape(total_loss, [1]))
+    cls_loss_t = tf.reduce_mean(tf.reshape(cls_loss, [1]))
+    box_loss_t = tf.reduce_mean(tf.reshape(box_loss, [1]))
+    learning_rate_t = tf.reduce_mean(tf.reshape(learning_rate, [1]))
+    tf.summary.scalar('total_loss', total_loss_t)
+    tf.summary.scalar('cls_loss_t', cls_loss_t)
+    tf.summary.scalar('box_loss_t', box_loss_t)
+    tf.summary.scalar('learning_rate_t', learning_rate_t)
+
+    optimizer = tf.train.MomentumOptimizer(
+        learning_rate, momentum=ssd_constants.MOMENTUM)
+    from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
+    optimizer = NPUDistributedOptimizer(optimizer)  # 使用NPU分布式计算，更新梯度
+
+    # Batch norm requires update_ops to be added as a train_op dependency.
+    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+
+    examples_sec_hook = ExamplesPerSecondHook(get_rank_size() * 32, learning_rate, total_loss)
+
+    train_op = tf.group(optimizer.minimize(total_loss, global_step),
+                        update_ops)
+    return model_fn_lib.EstimatorSpec(
+        mode=mode, loss=total_loss, train_op=train_op, scaffold=scaffold_fn(),
+        training_hooks=[examples_sec_hook])
+
+  if mode == tf.estimator.ModeKeys.EVAL:
+    raise NotImplementedError
+
+
+def ssd_model_fn(features, labels, mode, params):
+  """SSD model."""
+  return _model_fn(features, labels, mode, params, model=ssd_architecture.ssd)
+
+
+def default_hparams():
+  # TODO(taylorrobie): replace params useages with global constants.
+  return tf.contrib.training.HParams(
+
+      num_examples_per_epoch=120000,
+      lr_warmup_epoch=0.8,
+      cos_decay_epoch=106,
+      weight_decay=ssd_constants.WEIGHT_DECAY,
+      base_learning_rate=ssd_constants.BASE_LEARNING_RATE,
+      eval_every_checkpoint=False,
+      transpose_input=False,
+      use_cocoeval_cc=False
+  )
@@ -0,0 +1,197 @@
+# Alexnet for Tensorflow 
+
+This repository provides a script and recipe to train the AlexNet model.
+
+## Table Of Contents
+
+* [Model overview](#model-overview)
+  * [Model Architecture](#model-architecture)  
+  * [Default configuration](#default-configuration)
+* [Data augmentation](#data-augmentation)
+* [Setup](#setup)
+  * [Requirements](#requirements)
+* [Quick start guide](#quick-start-guide)
+* [Advanced](#advanced)
+  * [Command line arguments](#command-line-arguments)
+  * [Training process](#training-process)
+* [Performance](#performance)
+  * [Results](#results)
+    * [Training accuracy results](#training-accuracy-results)
+    * [Training performance results](#training-performance-results)
+
+
+    
+
+## Model overview
+
+AlexNet model from
+`Alex Krizhevsky. "One weird trick for parallelizing convolutional neural networks". <https://arxiv.org/abs/1404.5997>.`
+reference implementation:  <https://pytorch.org/docs/stable/_modules/torchvision/models/alexnet.html#alexnet>
+### Model architecture
+
+
+
+### Default configuration
+
+The following sections introduce the default configurations and hyperparameters for AlexNet model.
+
+#### Optimizer
+
+This model uses Momentum optimizer from Tensorflow with the following hyperparameters:
+
+- Momentum : 0.9
+- Learning rate (LR) : 0.06
+- LR schedule: cosine_annealing
+- Batch size : 128 
+- Weight decay :  0.0001. 
+- Label smoothing = 0.1
+- We train for:
+  - 150 epochs ->  60.1% top1 accuracy
+
+#### Data augmentation
+
+This model uses the following data augmentation:
+
+- For training:
+  - RandomResizeCrop, scale=(0.08, 1.0), ratio=(0.75, 1.333)
+  - RandomHorizontalFlip, prob=0.5
+  - Normalize, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)
+- For inference:
+  - Resize to (256, 256)
+  - CenterCrop to (224, 224)
+  - Normalize, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)
+
+## Setup
+The following section lists the requirements to start training the Alexnet model.
+### Requirements
+
+Tensorflow
+NPU environment
+
+## Quick Start Guide
+
+### 1. Clone the repository
+
+```shell
+git clone xxx
+cd  Model_zoo_Alexnet_HARD
+```
+
+### 2. Download and preprocess the dataset
+
+1. Download the ImageNet dataset
+2. Extract the training data
+3. The train and val images are under the train/ and val/ directories, respectively. All images within one folder have the same label.
+
+### 3. Train
+- train on single NPU
+    - **edit** *scripts/train_alexnet_1p.sh*( see example below)
+    - bash scripts/run_npu_1p.sh
+- train on 8 NPUs
+    - **edit** *scripts/train_alexnet_8p.sh*(see example below)
+    - bash scripts/run_npu_8p.sh 
+
+
+for example:
+- case for single NPU
+    - In scripts/train_alexnet_1p.sh, the python command should look as follows. For more details on the command line arguments, please refer to [Command line arguments](#command-line-arguments)
+```shell
+python3.7 ${EXEC_DIR}/train.py --rank_size=1 \
+	--iterations_per_loop=100 \
+	--batch_size=256 \
+	--data_dir=/path/to/dataset \
+	--mode=train \
+	--lr=0.015 \
+	--log_dir=./model_1p > ./train_${device_id}.log 2>&1 
+```
+run the program  
+```
+bash scripts/run_npu_1p.sh
+```
+- case for 8 NPUs
+    - In scripts/train_alexnet_8p.sh, the python command should look as follows.
+```shell 
+python3.7 ${EXEC_DIR}/train.py --rank_size=8 \
+	--iterations_per_loop=100 \
+	--batch_size=128 \
+	--data_dir=/path/to/dataset \
+	--mode=train \
+	--lr=0.06 \
+	--log_dir=./model_8p > ./train_${device_id}.log 2>&1 
+```
+run the program  
+```
+bash scripts/run_npu_8p.sh
+```
+
+### 4. Test
+- same procedure as training except 2 following modifications
+    - change `--mode=train` to `--mode=evaluate`
+    - add `--checkpoint_dir=/path/to/checkpoints`
+
+
+## Advanced
+### Command line arguments
+
+```
+  --data_dir                        train data dir
+  --num_classes                     num of classes in ImageNet（default:1000)
+  --image_size                      image size of the dataset
+  --batch_size                      mini-batch size (default: 128) per npu
+  --pretrained                      path of pretrained model
+  --lr                              initial learning rate
+  --max_epochs                      max epoch num to train the model
+  --warmup_epochs                   warmup epoch(when batchsize is large)
+  --weight_decay                    weight decay (default: 1e-4)
+  --momentum                        momentum(default: 0.9)
+  --label_smoothing                 use label smooth in CE, default 0.1
+  --save_summary_steps              logging interval(default:100)
+  --log_dir                         path to save checkpoint and log
+  --log_name                        name of log file
+  --save_checkpoints_steps          the interval to save checkpoint
+  --mode                            mode to run the program (train, evaluate)
+  --checkpoint_dir                  path to checkpoint for evaluation
+  --max_train_steps                 max number of training steps 
+  --synthetic                       whether to use synthetic data or not
+  --version                         weight initialization for model
+  --do_checkpoint                   whether to save checkpoint or not 
+  --rank_size                       local rank of distributed(default: 0)
+  --group_size                      world size of distributed(default: 1)
+  --max_train_steps                 number of training step , default : None, when set ,it will override the max_epoch
+```
+for a complete list of options, please refer to `train.py`
+### Training process
+
+All the results of the training will be stored in the directory `results`.
+Script will store:
+ - checkpoints.
+ - log.
+ 
+## Performance
+
+### Results
+
+Our results were obtained by running the applicable training script. To achieve the same results, follow the steps in the Quick Start Guide.
+
+#### Training accuracy results
+
+| **epochs** |   Top1/Top5   |
+| :--------: | :-----------: |
+|    150     | 60.12%/82.06% |
+
+#### Training performance results
+
+| **NPUs** | train performance |
+| :------: | :---------------: |
+|    8     |   30000+  img/s   |
+
+
+
+
+
+
+
+
+
+
+
@@ -0,0 +1,9 @@
+{
+    "server_count": "1",
+    "server_list": [{
+        "device": [{devices}],
+        "server_id": "127.0.0.1"
+    }],
+    "status": "completed",
+    "version": "1.0"
+}
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Exports the Ascend NPU toolchain and runtime environment variables used for training.
+
+# Remove everything under the host-0 slog directory before setting up the environment.
+rm -rf /var/log/npu/slog/host-0/*
+# main env
+# Use the nnae installation when its directory exists; otherwise fall back to the
+# ascend-toolkit paths. In both branches LD_LIBRARY_PATH is replaced outright, while
+# PYTHONPATH and PATH are appended to.
+if [ -d /usr/local/Ascend/nnae/latest ];then
+
+	export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/Ascend/driver/tools/hccn_tool/:/usr/local/mpirun4.0/lib
+	export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages
+	export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
+	export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp
+else
+	export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/mpirun4.0/lib
+	# NOTE(review): $projectDir is appended only in this branch and is not set in this
+	# script — confirm the caller defines it before sourcing.
+	export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest//fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$projectDir
+	export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
+	export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
+	
+fi
+
+# Job-level settings shared by every device.
+export DDK_VERSION_FLAG=1.60.T17.B830
+export HCCL_CONNECT_TIMEOUT=600
+export JOB_ID=9999001
+
+# Graph-engine, graph-dump and logging switches.
+export NEW_GE_FE_ID=1
+export GE_AICPU_FLAG=1
+export SOC_VERSION=Ascend910
+export DUMP_GE_GRAPH=1
+export DUMP_GRAPH_LEVEL=3
+export PRINT_MODEL=1
+export SLOG_PRINT_TO_STDOUT=1
+
+
+# Profiling is switched off here (PROFILING_MODE=false, AICPU_PROFILING_MODE=false).
+# FP_POINT/BP_POINT name graph nodes — presumably the forward/backward markers used
+# when profiling is enabled; TODO confirm.
+export PROFILING_MODE=false
+export PROFILING_OPTIONS=training_trace
+export FP_POINT=ssd/block7-conv1x1/Relu
+export BP_POINT=gradients/resnet34/Relu_grad/ReluGrad
+export AICPU_PROFILING_MODE=false
@@ -0,0 +1,70 @@
+#!/bin/bash
+# Launches SSD-Resnet34 training: one background train.sh per device locally, or a
+# single mpirun invocation of train.sh in cluster mode.
+
+# Positional arguments: $1 rank size, $2 yaml config path, $3 tools directory.
+rank_size=$1
+yamlPath=$2
+
+toolsPath=$3
+# When running inside docker (/.dockerenv exists), $4 carries the cluster flag and $5
+# the list of host IPs. Otherwise CLUSTER and MPIRUN_ALL_IP are taken from whatever the
+# environment already provides.
+if [ -f /.dockerenv ];then
+        CLUSTER=$4
+        MPIRUN_ALL_IP="$5"
+        export CLUSTER=${CLUSTER}
+fi
+
+# currentDir is the parent of the directory holding this script.
+currentDir=$(cd "$(dirname "$0")/.."; pwd)
+# model_name is the name of currentDir's parent; it is not referenced again in this script.
+model_name=$(cd $currentDir/..;basename `pwd`)
+
+# Load the configuration from the yaml file. The helper's output is eval'ed; the
+# variables used below (max_steps, device_group_*p, mpirun_ip) presumably come from it.
+eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config")
+
+#mkdir train job path
+currtime=`date +%Y%m%d%H%M%S`
+mkdir -p ${currentDir%train*}/train/result/tf_ssd_resnet34/training_job_${currtime}/
+train_job_dir=${currentDir%train*}/train/result/tf_ssd_resnet34/training_job_${currtime}/
+echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] ${train_job_dir} &"
+# Despite its name, jsonFilePath points at the Python constants module patched below.
+jsonFilePath=${currentDir}/code/ssd_constants.py
+
+echo "start to modify inner config file"
+echo "jsonfilepath is "${jsonFilePath}
+
+# Rewrite the EVAL_STEPS tuple in ssd_constants.py in place so it holds max_steps.
+sed -i "s/EVAL_STEPS = (.*,)$/EVAL_STEPS = (${max_steps},)/g" ${jsonFilePath}
+
+# Device list: if no devices are configured for this rank size (or rank_size >= 8),
+# pick devices 0..rank_size-1 in order.
+eval device_group=\$device_group_${rank_size}p
+if [ x"${device_group}" == x"" ] || [ ${rank_size} -ge 8 ];then
+    device_group="$(seq 0 "$(expr $rank_size - 1)")"
+fi
+
+# Get the first device id in device_group; the hw performance log is linked from the
+# directory named after it.
+# NOTE(review): only the first character is taken, so a first id >= 10 would be truncated.
+device_group_str=`echo ${device_group} | sed 's/ //g'`
+first_device_id=`echo ${device_group_str: 0:1}`
+
+if [ x"${CLUSTER}" == x"True" ];then
+    # ln hw log
+    ln -snf ${train_job_dir}/0/hw_SSD-Resnet34.log ${train_job_dir}
+    # Copy the yaml and the patched constants file to every other host in MPIRUN_ALL_IP.
+    this_ip=$(hostname -I |awk '{print $1}')
+    for ip in $MPIRUN_ALL_IP;do
+        if [ x"$ip" != x"$this_ip" ];then
+            scp $yamlPath root@$ip:$yamlPath
+            scp $jsonFilePath root@$ip:$jsonFilePath
+        fi
+    done
+    export PATH=$PATH:/usr/local/mpirun4.0/bin
+    # train.sh receives device id 0 and the cluster flag as its sixth argument.
+    mpirun -H ${mpirun_ip} \
+    --bind-to none -map-by slot\
+    --allow-run-as-root \
+    --mca btl_tcp_if_exclude lo,docker0,endvnic,virbr0,vethf40501b,docker_gwbridge,br-f42ac38052b4\
+    --prefix /usr/local/mpirun4.0/ \
+    ${currentDir}/scripts/train.sh 0 $rank_size $yamlPath $currtime ${toolsPath} ${CLUSTER}
+else
+    # ln hw log
+    ln -snf ${train_job_dir}/${first_device_id}/hw_SSD-Resnet34.log ${train_job_dir}
+    # Start one background train.sh per device, passing an incrementing rank id as the
+    # sixth argument.
+    rank_id=0
+    for device_id in $device_group;do
+      ${currentDir}/scripts/train.sh $device_id $rank_size $yamlPath $currtime ${toolsPath} $rank_id &
+      let rank_id++
+    done
+fi
+# Block until all background train.sh jobs started above have exited.
+wait
+
+#echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] all train exit " >> ${currentDir}/result/main.log
+
@@ -0,0 +1,104 @@
+#!/usr/bin/env bash
+
+device_id=$1
+rank_size=$2
+yamlPath=$3
+
+currentDir=$(cd "$(dirname "$0")/.."; pwd)
+#model_name="SSD-Resnet34"
+currtime=$4
+toolsPath=$5
+export YAML_PATH=$3
+
+mkdir -p ${currentDir%train*}/train/result/tf_ssd_resnet34/training_job_${currtime}/
+train_job_dir=${currentDir%train*}/train/result/tf_ssd_resnet34/training_job_${currtime}/
+
+
+# 从 yaml 获取配置
+
+eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config")
+
+source ${currentDir}/config/npu_set_env.sh
+
+
+# 声明变量
+export REMARK_LOG_FILE=hw_SSD-Resnet34.log  # 打点日志文件名称， 必须hw_后跟模型名称小写
+# 添加日志打点模块路径
+benchmark_log_path=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils
+export PYTHONPATH=$PYTHONPATH:${benchmark_log_path}
+
+# user env
+export DDK_VERSION_FLAG=1.60.T17.B830
+export HCCL_CONNECT_TIMEOUT=600
+export RANK_TABLE_FILE=${currentDir}/config/${rank_size}p.json
+export RANK_SIZE=${rank_size}
+export SLOG_PRINT_TO_STDOUT=0
+export DEVICE_ID=${device_id}
+export DEVICE_INDEX=$RANK_ID
+export JOB_ID=990
+export FUSION_TENSOR_SIZE=1000000000
+
+startTime=`date +%Y%m%d-%H:%M:%S`
+startTime_s=`date +%s`
+
+cd ${train_job_dir}
+curd_dir=${currentDir%atlas_benchmark-master*}/atlas_benchmark-master/utils/atlasboost
+export PYTHONPATH=$PYTHONPATH:${curd_dir}
+
+if [ x"$6" != x"True" ];then
+        rank_id=$6
+        export RANK_ID=$6
+else
+        device_id_mo=$(python3.7 -c "import src.tensorflow.mpi_ops as atlasboost;atlasboost.init(); \
+                device_id = atlasboost.local_rank();cluster_device_id = str(device_id); \
+                atlasboost.set_device_id(device_id);print(atlasboost.rank())")
+        device_id_mo=`echo $device_id_mo`
+        rank_id=${device_id_mo##* }
+        export RANK_ID=${rank_id}
+        device=${device_id_mo##*deviceid = }
+        device_id=${device%% phyid=*}
+        export DEVICE_ID=${device_id}
+        hccljson=${train_job_dir}/*.json
+        cp ${hccljson} ${currentDir}/config/${rank_size}p.json
+fi
+
+#mkdir exec path
+mkdir -p ${train_job_dir}/${device_id}
+cd ${train_job_dir}/${device_id}
+
+# 根据单卡/多卡区分调用参数
+if [ x"$6" == x"True" ];then
+    # 多卡多机
+    export CLUSTER=True
+fi
+python3.7 ${currentDir}/code/ssd_main.py \
+    --mode=${runmode} \
+    --train_batch_size=${train_batch_size} \
+    --training_file_pattern=${training_file_pattern} \
+    --resnet_checkpoint=${resnet_checkpoint} \
+    --validation_file_pattern=${validation_file_pattern} \
+    --val_json_file=${val_json_file} \
+    --eval_batch_size=${eval_batch_size} \
+    --num_epochs=${num_epochs} \
+    --model_dir=${model_dir} > ${train_job_dir}/train_${device_id}.log 2>&1
+
+if [ $? -eq 0 ] ;then
+    echo ":::ABK 1.0.0 SSD-Resnet34 train success"
+    echo ":::ABK 1.0.0 SSD-Resnet34 train success" >> ${train_job_dir}/train_${device_id}.log
+    echo ":::ABK 1.0.0 SSD-Resnet34 train success" >> ${train_job_dir}/${device_id}/hw_SSD-Resnet34.log
+else
+    echo ":::ABK 1.0.0 SSD-Resnet34 train failed"
+    echo ":::ABK 1.0.0 SSD-Resnet34 train failed" >> ${train_job_dir}/train_${device_id}.log
+    echo ":::ABK 1.0.0 SSD-Resnet34 train failed" >> ${train_job_dir}/${device_id}/hw_SSD-Resnet34.log
+fi
+
+endTime=`date +%Y%m%d-%H:%M:%S`
+endTime_s=`date +%s`
+sumTime=$[ $endTime_s - $startTime_s ]
+hour=$(( $sumTime/3600 ))
+min=$(( ($sumTime-${hour}*3600)/60 ))
+sec=$(( $sumTime-${hour}*3600-${min}*60 ))
+echo ${hour}:${min}:${sec}
+echo ":::ABK 1.0.0 SSD-Resnet34 train total time ${hour}:${min}:${sec}" >> ${train_job_dir}/${device_id}/hw_SSD-Resnet34.log
+
+
@@ -0,0 +1,141 @@
+#  YOLOv3_TensorFlow训练说明
+
+### 1. 介绍
+YOLOv3是基于第三方TensorFlow开源代码，使用darknet-53作为主干网络，同时支持单尺度与多尺度训练。包含训练集和验证集两部分，可选用包括COCO2014、COCO2017等， 本文档以COCO2014数据集为例，说明yolov3训练操作步骤。
+
+### 2. 运行环境
+Python版本: 3.7.5
+主要python三方库:
+- tensorflow >= 1.15.0 (satisfied with NPU)
+
+- opencv-python
+
+  1、直接pip  install opencv-python
+
+  2、如果直接使用pip install opencv-python无法正常安装三方库，则采用离线安装方法安装。
+
+      (1)'解压opencv包'
+        
+      (2)'进入解压后的opencv包  cd opencv'
+        
+      (3)'mkdir -p build'
+        
+      (4)'cd build'
+        
+      (5)'cmake -D BUILD_opencv_python3=yes -D BUILD_opencv_python2=no -D          PYTHON3_EXECUTABLE=/usr/local/python3.7.5/bin/python3.7m -D PYTHON3_INCLUDE_DIR=/usr/local/python3.7.5/include/python3.7m -D PYTHON3_LIBRARY=/usr/local/python3.7.5/lib/libpython3.7m.so -D PYTHON3_NUMPY_INCLUDE_DIRS=/usr/local/python3.7.5/lib/python3.7/site-packages/numpy/core/include -D PYTHON3_PACKAGES_PATH=/usr/local/python3.7.5/lib/python3.7/site-packages -D PYTHON_DEFAULT_EXECUTABLE=/usr/local/python3.7.5/bin/python3.7m ..'
+        
+      (6)'make -j4'
+      (7)'make install'
+
+   说明：cmake -D 后参数匹配当前环境
+
+- tqdm          安装方式：pip  install  tqdm
+
+- pycocotools     安装方式：pip  install pycocotools
+
+  说明： 评测的时候需要用到三方库pycocotools
+
+### 3. 数据集预处理
+#### 3.1 修改coco_dataset_path的值
+在yolov3/tensorflow/code下对coco_minival_anns.py和coco_trainval_anns.py中coco_dataset_path的值改为当前环境的数据集路径， 如/opt/dataset/coco2014。
+
+#### 3.2 运行脚本
+```
+python3.7 coco_minival_anns.py
+python3.7 coco_trainval_anns.py
+```
+生成训练和验证样本标注文件coco2014_trainval.txt和coco2014_minival.txt，请将这2个文件放置到yolov3/tensorflow/code/data下。
+生成的txt文件内容示例如下：
+```
+0 xxx/xxx/a.jpg 1920 1080 0 453 369 473 391 1 588 245 608 268
+1 xxx/xxx/b.jpg 1920 1080 1 466 403 485 422 2 793 300 809 320
+...
+```
+
+### 4. 准备预训练模型
+#### 4.1 下载预训练模型
+请从链接https://pjreddie.com/media/files/yolov3.weights下载darknet框架下的预训练模型。
+
+#### 4.2  模型转换
+使用train/atlas_benchmark-master/object_detection/yolov3/tensorflow/code下的convert_weight.py将预处理模型转换为TensorFlow框架的ckpt文件：
+在convert_weight.py中将weight_path修改为下载下的预训练模型文件的路径，save_path的值修改为命名的转换为TensorFlow框架的ckpt文件的路径； 如
+```
+weight_path = '../yolov3-tf2/data/darknet53.conv.74'
+save_path = './data/darknet_weights/darknet53.ckpt'
+```
+然后执行
+```
+python3.7 convert_weight.py
+
+```
+注意：save_path中ckpt文件的路径不是在train/atlas_benchmark-master/object_detection/yolov3/tensorflow/code/data/darknet_weights/下时， 请将其手动移至该路径；
+
+### 5. 模型训练
+#### 5.1 训练参数配置
+在train/yaml/YoLoV3.yaml中修改相应配置， 配置项含义:
+```
+mode: yolov3的单尺度或者多尺度模式，值为single或者 multi
+data_url:数据集路径
+runmode: 运行模式，是训练还是评测，值为train或者evaluate
+ckpt_path: 评测时要用到的ckpt文件的路径， 仅在evaluate时用到
+total_epoches: 跑多少个epoch，
+save_epoch: 多少epoch保存一次ckpt文件
+device_group_1p: 跑1p时的device_id
+device_group_2p: 跑2p时的device_id
+device_group_4p: 跑4p时的device_id
+mpirun_ip: 仅集群场景时需要配置, 格式ip1:卡数量1,ip2:卡数量2
+docker_image: docker镜像名称:版本号
+```
+YoLoV3.yaml中配置项示例：
+```
+mode: single
+data_url: /opt/npu/dataset
+runmode: train
+ckpt_path: /home/benchmark-master720/train/atlas_benchmark-master/object_detection/yolov3/tensorflow/result/TrainingJob-20200724115042
+total_epoches: 1
+save_epoch: 3
+device_group_1p: 0
+device_group_2p: 0 1
+device_group_4p: 0 1 2 3
+mpirun_ip: 90.90.176.152:8,90.90.176.154:8
+docker_image: mpirun3:latest
+```
+
+#### 5.2 训练脚本启动
+当前路径为benchmark包的train文件夹下
+```
+bash benchmark.sh -e YoLoV3 -hw 1p              # host侧1p
+bash benchmark.sh -e YoLoV3 -hw 8p              # host侧8p
+bash benchmark.sh -e YoLoV3 -hw 1p -docker      # docker侧1p
+bash benchmark.sh -e YoLoV3 -hw 8p -docker      # docker侧8p
+bash benchmark.sh -e YoLoV3 -ct                 # host侧集群
+bash benchmark.sh -e YoLoV3 -ct -docker         # docker侧集群
+```
+
+#### 5.3 训练日志
+日志位于benchmark包train路径下result目录中YoLoV3对应的文件夹里。
+```
+./result/tf_yolov3/TrainingJob-2020xxxxxxxxxx/train_${device_id}.log
+./result/TrainingJob-2020xxxxxxxxxx/train_${device_id}.log
+./result/tensorflow/yolov3t/TrainingJob-2020xxxxxxxxxx/device_id/hw_yolov3.log
+```
+
+### 6. 模型评测
+将train/yaml/YoLoV3.yaml中ckpt_path的值改为训练产生的日志的路径， runmode的值改为evaluate，如5.1中示例；
+然后运行与训练时相同的脚本，结果参见train.log。
+
+
+### 7. 训练结果参考
+
+| Model                 | Npu_nums | mAP      | FPS       |
+| :-------------------- | :------: | :------: | :------:  |
+| single_scale          | 8        |    30.0  | 740       |
+| multi_scale           | 8        |    31.0  | 340       |
+| single_scale          | 1        |    ----  | 96        |
+| multi_scale           | 1        |    ----  | 44        |
+
+
+
+-------
+
+
@@ -0,0 +1,13 @@
+
+# dirs
+.idea/
+__pycache__/
+tmp*/
+
+# files
+*.pyc
+*.log
+*.out
+
+data/darknet_weights/*.ckpt*
+
@@ -0,0 +1,140 @@
+#  YOLOv3_TensorFlow
+
+### 1. Introduction
+This is npu implementation of [YOLOv3](https://pjreddie.com/media/files/papers/YOLOv3.pdf) using TensorFlow modified from [YOLOv3_TensorFlow](https://github.com/wizyoung/YOLOv3_TensorFlow).   
+
+### 2. Requirements
+Python version: 3.7.5  
+Main Python Packages:
+- tensorflow >= 1.15.0 (satisfied with NPU)
+- opencv-python
+- tqdm
+
+### 3. Weights conversion
+The pretrained darknet53 weights file can be downloaded [here](https://pjreddie.com/media/files/darknet53.conv.74).        
+Place this weights file under directory `./data/darknet_weights/` and then run:
+```python
+python3 convert_weight.py
+```
+Then the converted TensorFlow checkpoint file will be saved to `./data/darknet_weights/` directory.  
+The converted weight file may already be included in this repo.
+
+### 4. Training
+#### 4.1 Data preparation 
+0. dataset
+To compare with the official implementation, for example, we use [get_coco_dataset.sh](https://github.com/pjreddie/darknet/blob/master/scripts/get_coco_dataset.sh) to prepare our dataset.
+
+1. annotation file
+- ATTENTION: you can use easy tricks to fit default setting
+    - ln -s ${real_dataset_path} /opt/npu/dataset/coco 
+Use the scripts below to generate the `coco2014_trainval.txt/coco2014_minival.txt` files under the `./data/` directory.
+```python
+python3 coco_trainval_anns.py
+python3 coco_minival_anns.py
+```   
+One line for one image, in the format like `image_index image_absolute_path img_width img_height box_1 box_2 ... box_n`.    
+Box_x format: 
+- `label_index x_min y_min x_max y_max`. (The origin of coordinates is at the left top corner, left top => (xmin, ymin), right bottom => (xmax, ymax).)       
+-  `image_index` is the line index which starts from zero. `label_index` is in range [0, class_num - 1].
+
+For example:
+```
+0 xxx/xxx/a.jpg 1920 1080 0 453 369 473 391 1 588 245 608 268
+1 xxx/xxx/b.jpg 1920 1080 1 466 403 485 422 2 793 300 809 320
+...
+```
+
+(2)  class_names file:
+Generate the `data.names` file under `./data/` directory. Each line represents a class name.     
+For example:     
+```
+bird
+person
+bike
+...
+```
+
+The COCO dataset class names file is placed at `./data/coco.names`.
+
+(3) prior anchor file:
+
+Using the kmeans algorithm to get the prior anchors:
+
+```
+python get_kmeans.py
+```
+
+Then you will get 9 anchors and the average IoU. Save the anchors to a txt file.
+
+The COCO dataset anchors offered by YOLO's author is placed at `./data/yolo_anchors.txt`, you can use that one too.
+
+The yolo anchors computed by the kmeans script is on the resized image scale.  The default resize method is the letterbox resize, i.e., keep the original aspect ratio in the resized image.
+
+#### 4.2 Training
+1. single scale
+Using `npu_train_*p_single.sh`. The hyper-parameters and the corresponding annotations can be found in `args_single.py`:
+
+```shell
+bash npu_train_1p_single.sh 
+or 
+bash npu_train_8p_single.sh
+```
+
+2. multi scale
+Using `npu_train_*p_multi.sh`. The hyper-parameters and the corresponding annotations can be found in `args_multi.py`:
+
+```shell
+bash npu_train_1p_multi.sh 
+or 
+bash npu_train_8p_multi.sh
+```
+
+Check the `args.py` for more details. You should set the parameters yourself in your own specific task.
+
+3. training details
+     1. nohup.out -- training task main_log
+     2. ./training/t1/D0/train_0.log -- training host log
+     3. training/t1/D0/training/train.log -- training perf log
+
+### 5. Evaluation
+
+Use `eval.sh` to evaluate the validation or test dataset:
+
+```shell
+bash eval.sh
+```
+
+Check the `eval.py` for more details. You could set the parameters yourself. 
+
+You will get the mAP metrics results using official cocoapi.
+Use `tail -f eval_*.out` to watch the evaluation results of the models.
+
+
+### 6. Training result
+
+| Model                 | Npu_nums | mAP      | FPS       |
+| :-------------------- | :------: | :------: | :------:  |
+| single_scale          | 8        |    30.0  | 740       |
+| multi_scale           | 8        |    31.0  | 340       |
+| single_scale          | 1        |    ----  | 96        |
+| multi_scale           | 1        |    ----  | 44        |
+
+
+
+
+-------
+
+### Credits:
+
+I referred to many fantastic repos during the implementation:
+
+[YunYang1994/tensorflow-yolov3](https://github.com/YunYang1994/tensorflow-yolov3)
+
+[qqwweee/keras-yolo3](https://github.com/qqwweee/keras-yolo3)
+
+[eriklindernoren/PyTorch-YOLOv3](https://github.com/eriklindernoren/PyTorch-YOLOv3)
+
+[pjreddie/darknet](https://github.com/pjreddie/darknet)
+
+[dmlc/gluon-cv](https://github.com/dmlc/gluon-cv/tree/master/scripts/detection/yolo)
+
@@ -0,0 +1,110 @@
+# coding: utf-8
+# This file contains the parameter used in train.py
+
+from __future__ import division, print_function
+
+from utils.misc_utils import parse_anchors, read_class_names
+import math
+import os
+
+
+# Output locations. These are relative paths, so they resolve against the current
+# working directory rather than against this file's directory.
+save_dir =          './training/'  # The directory of the weights to save.
+log_dir =           './training/logs/'  # The directory to store the tensorboard log files.
+progress_log_path = './training/train.log'  # The path to record the training progress.
+# save_dir = os.path.join(work_path, save_dir)
+# log_dir = os.path.join(work_path, log_dir)
+# progress_log_path = os.path.join(work_path, progress_log_path)
+
+# Create the output directories at import time if they do not exist yet.
+if not os.path.exists(save_dir):
+    os.makedirs(save_dir)
+if not os.path.exists(log_dir):
+    os.makedirs(log_dir)
+
+
+# Directory containing this file; the input paths below are resolved against it.
+work_path = os.path.realpath(__file__+"/..")
+### Some paths
+train_file =        os.path.realpath(os.path.join(work_path, './data/coco2014_trainval.txt'))  # The path of the training txt file.
+val_file =          os.path.realpath(os.path.join(work_path, './data/coco2014_minival.txt'))  # The path of the validation txt file.
+restore_path =      os.path.realpath(os.path.join(work_path, './data/darknet_weights/darknet53.ckpt'))  # The path of the weights to restore.
+anchor_path =       os.path.realpath(os.path.join(work_path, './data/yolo_anchors.txt'))  # The path of the anchor txt file.
+class_name_path =   os.path.realpath(os.path.join(work_path, './data/coco.names'))  # The path of the class names.
+
+### Distribution setting
+# Number of devices, read from the RANK_SIZE environment variable; importing this
+# module raises KeyError when RANK_SIZE is not set.
+num_gpus=int(os.environ['RANK_SIZE'])
+iterations_per_loop=10
+
+### Training related numbers
+
+batch_size = 16
+img_size = [608, 608]  # Images will be resized to `img_size` and fed to the network, size format: [width, height]
+letterbox_resize = True  # Whether to use the letterbox resize, i.e., keep the original aspect ratio in the resized image.
+total_epoches = 200
+train_evaluation_step = 1000  # Evaluate on the training batch after some steps.
+val_evaluation_epoch = 2  # Evaluate on the whole validation dataset after some epochs. Set to None to evaluate every epoch.
+save_epoch = 10  # Save the model after some epochs.
+batch_norm_decay = 0.99  # decay in bn ops
+weight_decay = 5e-4  # l2 weight decay
+global_step = 0  # used when resuming training
+
+### tf.data parameters
+num_threads = 8  # Number of threads for image processing used in tf.data pipeline.
+prefetech_buffer = batch_size * 4  # Prefetch buffer size used in tf.data pipeline.
+
+### Learning rate and optimizer
+optimizer_name = 'momentum'  # Chosen from [sgd, momentum, adam, rmsprop]
+save_optimizer = True  # Whether to save the optimizer parameters into the checkpoint file.
+learning_rate_base = 75e-4
+learning_rate_base_batch_size = 64
+# Scale the base learning rate linearly with the global batch size (batch_size * num_gpus).
+learning_rate_init = learning_rate_base * ((batch_size * num_gpus) / learning_rate_base_batch_size)
+lr_type = 'piecewise'  # Chosen from [fixed, exponential, cosine_decay, cosine_decay_restart, piecewise]
+lr_decay_epoch = 5  # Epochs after which learning rate decays. Int or float. Used when chosen `exponential` and `cosine_decay_restart` lr_type.
+lr_decay_factor = 0.96  # The learning rate decay factor. Used when chosen `exponential` lr_type.
+lr_lower_bound = 1e-6  # The minimum learning rate.
+# only used in piecewise lr type
+pw_boundaries = [80, 90]  # epoch based boundaries
+pw_values = [learning_rate_init, learning_rate_init*0.1, learning_rate_init*0.01]
+
+### Load and finetune
+# Choose the parts you want to restore the weights. List form.
+# restore_include: None, restore_exclude: None  => restore the whole model
+# restore_include: None, restore_exclude: scope  => restore the whole model except `scope`
+# restore_include: scope1, restore_exclude: scope2  => if scope1 contains scope2, restore scope1 and not restore scope2 (scope1 - scope2)
+# choice 1: only restore the darknet body
+# restore_include = ['yolov3/darknet53_body']
+restore_exclude = None
+# choice 2: restore all layers except the last 3 conv2d layers in 3 scale
+restore_include = None
+# restore_exclude = ['yolov3/yolov3_head/Conv_14', 'yolov3/yolov3_head/Conv_6', 'yolov3/yolov3_head/Conv_22']
+# restore_exclude = None
+# Choose the parts you want to finetune. List form.
+# Set to None to train the whole model.
+# update_part = ['yolov3/yolov3_head']
+update_part = None
+
+### other training strategies
+multi_scale_train = True  # Whether to apply multi-scale training strategy. Image size varies from [320, 320] to [640, 640] by default.
+use_label_smooth = False # Whether to use class label smoothing strategy.
+use_focal_loss = False  # Whether to apply focal loss on the conf loss.
+use_mix_up = False  # Whether to use mix up data augmentation strategy.
+use_warm_up = True  # whether to use warm up strategy to prevent from gradient exploding.
+warm_up_epoch = min(total_epoches*0.1, 3)  # Warm up training epochs. Set to a larger value if gradient explodes.
+
+### some constants in validation
+# nms
+nms_threshold = 0.5  # iou threshold in nms operation
+score_threshold = 0.001  # threshold of the probability of the classes in nms operation, i.e. score = pred_confs * pred_probs. set lower for higher recall.
+nms_topk = 100  # keep at most nms_topk outputs after nms
+# mAP eval
+eval_threshold = 0.5  # the iou threshold applied in mAP evaluation
+use_voc_07_metric = False  # whether to use voc 2007 evaluation metric, i.e. the 11-point metric
+
+### parse some params
+anchors = parse_anchors(anchor_path)
+classes = read_class_names(class_name_path)
+class_num = len(classes)
+train_img_cnt = len(open(train_file, 'r').readlines())
+val_img_cnt = len(open(val_file, 'r').readlines())
+train_batch_num = int(float(train_img_cnt) / batch_size / num_gpus)
+
+lr_decay_freq = int(train_batch_num * lr_decay_epoch)
+pw_boundaries = [float(i) * train_batch_num + global_step for i in pw_boundaries]
@@ -0,0 +1,105 @@
+# coding: utf-8
+# This file contains the parameter used in train.py
+
+from __future__ import division, print_function
+
+from utils.misc_utils import parse_anchors, read_class_names
+import math
+import os
+
+save_dir =          './training/'  # The directory of the weights to save.
+log_dir =           './training/logs/'  # The directory to store the tensorboard log files.
+progress_log_path = './training/train.log'  # The path to record the training progress.
+
+if not os.path.exists(save_dir):
+    os.makedirs(save_dir)
+if not os.path.exists(log_dir):
+    os.makedirs(log_dir)
+
+
+# Directory that contains this file; the data paths below are resolved against it.
+work_path = os.path.realpath(__file__+"/..")
+### Some paths
+train_file =        os.path.realpath(os.path.join(work_path, './data/coco2014_trainval.txt'))  # The path of the training txt file.
+val_file =          os.path.realpath(os.path.join(work_path, './data/coco2014_minival.txt'))  # The path of the validation txt file.
+restore_path =      os.path.realpath(os.path.join(work_path, './data/darknet_weights/darknet53.ckpt'))  # The path of the weights to restore.
+anchor_path =       os.path.realpath(os.path.join(work_path, './data/yolo_anchors.txt'))  # The path of the anchor txt file.
+class_name_path =   os.path.realpath(os.path.join(work_path, './data/coco.names'))  # The path of the class names.
+
+### Distribution setting
+# Read from the RANK_SIZE environment variable; a KeyError is raised if it is unset.
+num_gpus=int(os.environ['RANK_SIZE'])
+# NOTE(review): presumably the number of steps executed per device-side loop; confirm against train.py.
+iterations_per_loop=10
+
+### Training related numbers
+
+batch_size = 32
+img_size = [416, 416]  # Images will be resized to `img_size` and fed to the network, size format: [width, height]
+letterbox_resize = True  # Whether to use the letterbox resize, i.e., keep the original aspect ratio in the resized image.
+total_epoches = 200
+train_evaluation_step = 1000  # Evaluate on the training batch after some steps.
+val_evaluation_epoch = 2  # Evaluate on the whole validation dataset after some epochs. Set to None to evaluate every epoch.
+save_epoch = 10  # Save the model after some epochs.
+batch_norm_decay = 0.99  # decay in bn ops
+weight_decay = 5e-4  # l2 weight decay
+global_step = 0  # used when resuming training
+
+### tf.data parameters
+num_threads = 8  # Number of threads for image processing used in tf.data pipeline.
+prefetech_buffer = batch_size * 4   # Prefetch buffer size used in the tf.data pipeline.
+
+### Learning rate and optimizer
+optimizer_name = 'momentum'  # Chosen from [sgd, momentum, adam, rmsprop]
+save_optimizer = True  # Whether to save the optimizer parameters into the checkpoint file.
+learning_rate_base = 5e-3
+learning_rate_base_batch_size = 64
+# Linear scaling: the base rate is defined for a global batch of
+# `learning_rate_base_batch_size` and scaled by the actual global batch size.
+learning_rate_init = learning_rate_base * ((batch_size * num_gpus) / learning_rate_base_batch_size)
+lr_type = 'piecewise'  # Chosen from [fixed, exponential, cosine_decay, cosine_decay_restart, piecewise]
+lr_decay_epoch = 5  # Epochs after which learning rate decays. Int or float. Used when chosen `exponential` and `cosine_decay_restart` lr_type.
+lr_decay_factor = 0.96  # The learning rate decay factor. Used when chosen `exponential` lr_type.
+lr_lower_bound = 1e-6  # The minimum learning rate.
+# only used in piecewise lr type
+pw_boundaries = [80, 90]  # epoch based boundaries
+pw_values = [learning_rate_init, learning_rate_init*0.1, learning_rate_init*0.01]
+
+### Load and finetune
+# Choose the parts you want to restore the weights. List form.
+# restore_include: None, restore_exclude: None  => restore the whole model
+# restore_include: None, restore_exclude: scope  => restore the whole model except `scope`
+# restore_include: scope1, restore_exclude: scope2  => if scope1 contains scope2, restore scope1 and not restore scope2 (scope1 - scope2)
+# choice 1: only restore the darknet body
+# restore_include = ['yolov3/darknet53_body']
+restore_exclude = None
+# choice 2: restore all layers except the last 3 conv2d layers in 3 scale
+restore_include = None
+# restore_exclude = ['yolov3/yolov3_head/Conv_14', 'yolov3/yolov3_head/Conv_6', 'yolov3/yolov3_head/Conv_22']
+# Choose the parts you want to finetune. List form.
+# Set to None to train the whole model.
+# update_part = ['yolov3/yolov3_head']
+update_part = None
+
+### other training strategies
+multi_scale_train = False  # Whether to apply multi-scale training strategy. Image size varies from [320, 320] to [640, 640] by default.
+use_label_smooth = False # Whether to use class label smoothing strategy.
+use_focal_loss = False  # Whether to apply focal loss on the conf loss.
+use_mix_up = False  # Whether to use mix up data augmentation strategy.
+use_warm_up = True  # Whether to use the warm up strategy to keep gradients from exploding.
+warm_up_epoch = min(total_epoches*0.1, 3)  # Number of warm up epochs. Set to a larger value if gradient explodes.
+
+### some constants in validation
+# nms
+nms_threshold = 0.5  # iou threshold in nms operation
+score_threshold = 0.001  # threshold of the probability of the classes in nms operation, i.e. score = pred_confs * pred_probs. set lower for higher recall.
+nms_topk = 100  # keep at most nms_topk outputs after nms
+# mAP eval
+eval_threshold = 0.5  # the iou threshold applied in mAP evaluation
+use_voc_07_metric = False  # whether to use voc 2007 evaluation metric, i.e. the 11-point metric
+
+### parse some params
+anchors = parse_anchors(anchor_path)
+classes = read_class_names(class_name_path)
+class_num = len(classes)
+train_img_cnt = len(open(train_file, 'r').readlines())
+val_img_cnt = len(open(val_file, 'r').readlines())
+train_batch_num = int(float(train_img_cnt) / batch_size / num_gpus)
+
+lr_decay_freq = int(train_batch_num * lr_decay_epoch)
+pw_boundaries = [float(i) * train_batch_num + global_step for i in pw_boundaries]
@@ -0,0 +1,113 @@
+import json,cv2
+from collections import defaultdict
+
+ban_path = './data/5k.txt'
+with open(ban_path, 'r')as f:
+    ban_list = f.read().split('\n')[:-1]
+    ban_list = [i.split('/')[-1] for i in ban_list]
+
+name_box_id = defaultdict(list)
+id_name = dict()
+
+coco_dataset_path = '/opt/npu/dataset/coco/coco2014'
+
+f = open(
+    coco_dataset_path + "/annotations/instances_train2014.json",
+    encoding='utf-8')
+data = json.load(f)
+annotations = data['annotations']
+for ant in annotations:
+    id = ant['image_id']
+    name = coco_dataset_path + '/train2014/COCO_train2014_%012d.jpg' % id
+    cat = ant['category_id']
+
+    if cat >= 1 and cat <= 11:
+        cat = cat - 1
+    elif cat >= 13 and cat <= 25:
+        cat = cat - 2
+    elif cat >= 27 and cat <= 28:
+        cat = cat - 3
+    elif cat >= 31 and cat <= 44:
+        cat = cat - 5
+    elif cat >= 46 and cat <= 65:
+        cat = cat - 6
+    elif cat == 67:
+        cat = cat - 7
+    elif cat == 70:
+        cat = cat - 9
+    elif cat >= 72 and cat <= 82:
+        cat = cat - 10
+    elif cat >= 84 and cat <= 90:
+        cat = cat - 11
+
+    name_box_id[name].append([ant['bbox'], cat])
+
+
+
+
+f = open(
+    coco_dataset_path + "/annotations/instances_val2014.json",
+    encoding='utf-8')
+data = json.load(f)
+annotations = data['annotations']
+for ant in annotations:
+    id = ant['image_id']
+    name = coco_dataset_path + '/val2014/COCO_val2014_%012d.jpg' % id
+    cat = ant['category_id']
+
+    if cat >= 1 and cat <= 11:
+        cat = cat - 1
+    elif cat >= 13 and cat <= 25:
+        cat = cat - 2
+    elif cat >= 27 and cat <= 28:
+        cat = cat - 3
+    elif cat >= 31 and cat <= 44:
+        cat = cat - 5
+    elif cat >= 46 and cat <= 65:
+        cat = cat - 6
+    elif cat == 67:
+        cat = cat - 7
+    elif cat == 70:
+        cat = cat - 9
+    elif cat >= 72 and cat <= 82:
+        cat = cat - 10
+    elif cat >= 84 and cat <= 90:
+        cat = cat - 11
+
+    name_box_id[name].append([ant['bbox'], cat])
+    
+
+
+
+
+
+f = open('data/coco2014_minival.txt', 'w')
+ii = 0
+for idx, key in enumerate(name_box_id.keys()):
+    if key.split('/')[-1] not in ban_list:
+        continue
+
+    print('5k', key.split('/')[-1])
+
+    f.write('%d '%ii)
+    ii += 1
+    f.write(key)
+
+    img = cv2.imread(key)
+    h,w,c = img.shape
+
+    f.write(' %d %d'%(w,h))
+
+    box_infos = name_box_id[key]
+    for info in box_infos:
+        x_min = int(info[0][0])
+        y_min = int(info[0][1])
+        x_max = x_min + int(info[0][2])
+        y_max = y_min + int(info[0][3])
+
+        box_info = " %d %d %d %d %d" % (
+            int(info[1]), x_min, y_min, x_max, y_max
+        )
+        f.write(box_info)
+    f.write('\n')
+f.close()
@@ -0,0 +1,113 @@
+import json,cv2
+from collections import defaultdict
+
+ban_path = './data/5k.txt'
+with open(ban_path, 'r')as f:
+    ban_list = f.read().split('\n')[:-1]
+    ban_list = [i.split('/')[-1] for i in ban_list]
+
+name_box_id = defaultdict(list)
+id_name = dict()
+
+coco_dataset_path = '/opt/npu/dataset/coco/coco2014'
+
+f = open(
+    coco_dataset_path + "/annotations/instances_train2014.json",
+    encoding='utf-8')
+data = json.load(f)
+annotations = data['annotations']
+for ant in annotations:
+    id = ant['image_id']
+    name = coco_dataset_path + '/train2014/COCO_train2014_%012d.jpg' % id
+    cat = ant['category_id']
+
+    if cat >= 1 and cat <= 11:
+        cat = cat - 1
+    elif cat >= 13 and cat <= 25:
+        cat = cat - 2
+    elif cat >= 27 and cat <= 28:
+        cat = cat - 3
+    elif cat >= 31 and cat <= 44:
+        cat = cat - 5
+    elif cat >= 46 and cat <= 65:
+        cat = cat - 6
+    elif cat == 67:
+        cat = cat - 7
+    elif cat == 70:
+        cat = cat - 9
+    elif cat >= 72 and cat <= 82:
+        cat = cat - 10
+    elif cat >= 84 and cat <= 90:
+        cat = cat - 11
+
+    name_box_id[name].append([ant['bbox'], cat])
+
+
+
+
+f = open(
+    coco_dataset_path + "/annotations/instances_val2014.json",
+    encoding='utf-8')
+data = json.load(f)
+annotations = data['annotations']
+for ant in annotations:
+    id = ant['image_id']
+    name = coco_dataset_path + '/val2014/COCO_val2014_%012d.jpg' % id
+    cat = ant['category_id']
+
+    if cat >= 1 and cat <= 11:
+        cat = cat - 1
+    elif cat >= 13 and cat <= 25:
+        cat = cat - 2
+    elif cat >= 27 and cat <= 28:
+        cat = cat - 3
+    elif cat >= 31 and cat <= 44:
+        cat = cat - 5
+    elif cat >= 46 and cat <= 65:
+        cat = cat - 6
+    elif cat == 67:
+        cat = cat - 7
+    elif cat == 70:
+        cat = cat - 9
+    elif cat >= 72 and cat <= 82:
+        cat = cat - 10
+    elif cat >= 84 and cat <= 90:
+        cat = cat - 11
+
+    name_box_id[name].append([ant['bbox'], cat])
+    
+
+
+
+
+
+f = open('data/coco2014_trainval.txt', 'w')
+ii = 0
+for idx, key in enumerate(name_box_id.keys()):
+    if key.split('/')[-1] in ban_list:
+        continue
+
+    print('trainval', key.split('/')[-1])
+
+    f.write('%d '%ii)
+    ii += 1
+    f.write(key)
+
+    img = cv2.imread(key)
+    h,w,c = img.shape
+
+    f.write(' %d %d'%(w,h))
+
+    box_infos = name_box_id[key]
+    for info in box_infos:
+        x_min = int(info[0][0])
+        y_min = int(info[0][1])
+        x_max = x_min + int(info[0][2])
+        y_max = y_min + int(info[0][3])
+
+        box_info = " %d %d %d %d %d" % (
+            int(info[1]), x_min, y_min, x_max, y_max
+        )
+        f.write(box_info)
+    f.write('\n')
+f.close()
@@ -0,0 +1,38 @@
+# coding: utf-8
+# for more details about the yolo darknet weights file, refer to
+# https://itnext.io/implementing-yolo-v3-in-tensorflow-tf-slim-c3c55ff59dbe
+
+from __future__ import division, print_function
+
+import os
+import sys
+import tensorflow as tf
+import numpy as np
+
+from model import yolov3
+from utils.misc_utils import parse_anchors, load_weights
+
+num_class = 80
+img_size = 416
+weight_path = '../yolov3-tf2/data/darknet53.conv.74'
+save_path = './data/darknet_weights/darknet53.ckpt'
+anchors = parse_anchors('./data/yolo_anchors.txt')
+
+model = yolov3(80, anchors)
+with tf.Session() as sess:
+    inputs = tf.placeholder(tf.float32, [1, img_size, img_size, 3])
+
+    with tf.variable_scope('yolov3'):
+        feature_map = model.forward(inputs)
+
+    saver = tf.train.Saver(var_list=tf.global_variables(scope='yolov3'))
+
+    load_ops = load_weights(tf.global_variables(scope='yolov3'), weight_path)
+
+    sess.run(tf.global_variables_initializer())
+    sess.run(load_ops)
+    saver.save(sess, save_path=save_path)
+    print('TensorFlow model checkpoint has been saved to {}'.format(save_path))
+
+
+
@@ -0,0 +1,80 @@
+person
+bicycle
+car
+motorbike
+aeroplane
+bus
+train
+truck
+boat
+traffic light
+fire hydrant
+stop sign
+parking meter
+bench
+bird
+cat
+dog
+horse
+sheep
+cow
+elephant
+bear
+zebra
+giraffe
+backpack
+umbrella
+handbag
+tie
+suitcase
+frisbee
+skis
+snowboard
+sports ball
+kite
+baseball bat
+baseball glove
+skateboard
+surfboard
+tennis racket
+bottle
+wine glass
+cup
+fork
+knife
+spoon
+bowl
+banana
+apple
+sandwich
+orange
+broccoli
+carrot
+hot dog
+pizza
+donut
+cake
+chair
+sofa
+pottedplant
+bed
+diningtable
+toilet
+tvmonitor
+laptop
+mouse
+remote
+keyboard
+cell phone
+microwave
+oven
+toaster
+sink
+refrigerator
+book
+clock
+vase
+scissors
+teddy bear
+hair drier
+toothbrush
@@ -0,0 +1 @@
+10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90,  156,198,  373,326
@@ -0,0 +1,220 @@
+# coding: utf-8
+
+from __future__ import division, print_function
+
+import tensorflow as tf
+import numpy as np
+import argparse
+import cv2
+
+from utils.misc_utils import parse_anchors, read_class_names
+from utils.nms_utils import gpu_nms, cpu_nms
+from utils.plot_utils import get_color_table, plot_one_box
+from utils.data_aug import letterbox_resize
+
+from model import yolov3
+from tqdm import trange
+import json
+import os,time
+
+# npu modified
+from npu_bridge.estimator import npu_ops
+from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
+from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig
+from npu_bridge.estimator.npu import util
+
+'''
+coco weight from official checked 
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.309
+ Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.555
+ Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.311
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.136
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.337
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.460
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.273
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.430
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.465
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.270
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.511
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.629
+
+'''
+
+parser = argparse.ArgumentParser(description="YOLO-V3 test single image test procedure.")
+parser.add_argument("--annotation_txt", type=str, default='../code/data/coco2014_minival.txt',
+                    help="The path of the input image. Or annotation label txt.")
+parser.add_argument("--anchor_path", type=str, default="../code/data/yolo_anchors.txt",
+                    help="The path of the anchor txt file.")
+parser.add_argument("--new_size", nargs='*', type=int, default=[416, 416],
+                    help="Resize the input image with `new_size`, size format: [width, height]")
+parser.add_argument("--max_test", type=int, default=-1,
+                    help="max step for test")
+parser.add_argument("--score_thresh", type=float, default=1e-3,
+                    help="score_threshold for test")
+parser.add_argument("--nms_thresh", type=float, default=0.5,
+                    help="iou_threshold for test")
+parser.add_argument("--max_boxes", type=int, default=100,
+                    help="max_boxes for test")
+parser.add_argument("--letterbox_resize", type=lambda x: (str(x).lower() == 'true'), default=True,
+                    help="Whether to use the letterbox resize.")
+parser.add_argument("--class_name_path", type=str, default="../code/data/coco.names",
+                    help="The path of the class names.")
+parser.add_argument("--restore_path", type=str, default="../code/data/darknet_weights/yolo3.ckpt",
+                    # parser.add_argument("--restore_path", type=str, default="./training_s2/checkpoint_dir/model.ckpt-45800",
+                    help="The path of the weights to restore.")
+parser.add_argument("--save_img", type=bool, default=False,
+                    help="whether to save detected-result image")
+parser.add_argument("--save_json", type=bool, default=False,
+                    help="whether to save detected-result cocolike json")
+parser.add_argument("--save_json_path", type=str, default="../result/result.json",
+                    help="The path of the result.json.")
+args = parser.parse_args()
+
+# Derived settings are stored on `args` next to the parsed command-line options.
+args.anchors = parse_anchors(args.anchor_path)
+args.classes = read_class_names(args.class_name_path)
+args.num_class = len(args.classes)
+
+color_table = get_color_table(args.num_class)
+# Maps the sparse COCO category ids (1-90, with gaps) to contiguous ids 1-80.
+cat_id_to_real_id = \
+    {1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8, 9: 9, 10: 10, 11: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16,
+     18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 27: 25, 28: 26, 31: 27, 32: 28, 33: 29, 34: 30,
+     35: 31, 36: 32, 37: 33, 38: 34, 39: 35, 40: 36, 41: 37, 42: 38, 43: 39, 44: 40, 46: 41, 47: 42, 48: 43, 49: 44,
+     50: 45, 51: 46, 52: 47, 53: 48, 54: 49, 55: 50, 56: 51, 57: 52, 58: 53, 59: 54, 60: 55, 61: 56, 62: 57, 63: 58,
+     64: 59, 65: 60, 67: 61, 70: 62, 72: 63, 73: 64, 74: 65, 75: 66, 76: 67, 77: 68, 78: 69, 79: 70, 80: 71, 81: 72,
+     82: 73, 84: 74, 85: 75, 86: 76, 87: 77, 88: 78, 89: 79, 90: 80}
+# Inverse table: contiguous id (1-80) -> COCO category id.
+real_id_to_cat_id = {cat_id_to_real_id[i]: i for i in cat_id_to_real_id}
+
+
+def get_default_dict():
+    return {"image_id": -1, "category_id": -1, "bbox": [], "score": 0}
+
+
+eval_path = args.annotation_txt
+with open(eval_path, 'r')as f:
+    eval_file_list = f.read().split('\n')[:-1]
+    print(len(eval_file_list))
+eval_file_dict = {}
+for i in eval_file_list:
+    tmp_list = i.split(' ')
+    idx = int(tmp_list[0])
+    path = tmp_list[1]
+    w = float(tmp_list[2])
+    h = float(tmp_list[3])
+    bbox_len = len(tmp_list[4:]) // 5
+    bbox = []
+    for bbox_idx in range(bbox_len):
+        label, x1, y1, x2, y2 = tmp_list[4:][bbox_idx * 5:bbox_idx * 5 + 5]
+        bbox.append([label, x1, y1, x2, y2])
+    eval_file_dict[idx] = {
+        'path': path,
+        'w': w,
+        'h': h,
+        'bbox': bbox
+    }
+
+config = tf.ConfigProto()
+custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
+custom_op.name = "NpuOptimizer"
+custom_op.parameter_map["use_off_line"].b = True  # training on Ascend chips
+config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
+
+json_out = []
+with tf.Session(config=config) as sess:
+# with tf.Session() as sess:
+    input_data = tf.placeholder(tf.float32, [1, args.new_size[1], args.new_size[0], 3], name='input_data')
+    yolo_model = yolov3(args.num_class, args.anchors)
+    with tf.variable_scope('yolov3'):
+        pred_feature_maps = yolo_model.forward(input_data, False)
+    pred_boxes, pred_confs, pred_probs = yolo_model.predict(pred_feature_maps)
+
+    pred_scores = pred_confs * pred_probs
+
+    # boxes, scores, labels = gpu_nms(pred_boxes, pred_scores, args.num_class, max_boxes=100, score_thresh=args.score_thresh, nms_thresh=0.5)
+
+    saver = tf.train.Saver()
+    if args.restore_path.find('.ckpt') < 0 and args.restore_path.find('model-') < 0:
+        with open(os.path.join(args.restore_path, 'checkpoint'), 'r')as f:
+            tmp_checkpoint = f.readline()
+            tmp_checkpoint = tmp_checkpoint.replace('"', '').split(':')[1].strip()
+            args.restore_path = os.path.join(args.restore_path, tmp_checkpoint)
+            print('tmp_checkpoint: ', tmp_checkpoint)
+            # input()
+
+    saver.restore(sess, args.restore_path)
+
+    if args.max_test > 0:
+        test_len = min(args.max_test, len(eval_file_dict.keys()))
+    else:
+        test_len = len(eval_file_dict.keys())
+    for test_idx in trange(test_len):
+        img_path = eval_file_dict[test_idx]['path']
+        img_ori = cv2.imread(img_path)
+        if args.letterbox_resize:
+            img, resize_ratio, dw, dh = letterbox_resize(img_ori, args.new_size[0], args.new_size[1])
+        else:
+            height_ori, width_ori = img_ori.shape[:2]
+            img = cv2.resize(img_ori, tuple(args.new_size))
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        img = np.asarray(img, np.float32)
+        img = img[np.newaxis, :] / 255.
+
+        # boxes_, scores_, labels_ = sess.run([boxes, scores, labels], feed_dict={input_data: img})
+        # print('bbox: ',boxes_)
+        t = time.time()
+        boxes_, scores_ = sess.run([pred_boxes, pred_scores], feed_dict={input_data: img})
+        # print("FPS: ", 1/(time.time() - t))
+        boxes_, scores_, labels_ = cpu_nms(boxes_, scores_, args.num_class, args.max_boxes, args.score_thresh, args.nms_thresh)
+        # print('bbox: ', boxes_)
+
+        # try:
+        #     boxes_[:, [0, 2]] = (boxes_[:, [0, 2]] - dw) / resize_ratio
+        # except:
+        #     print("boxes_: ", boxes_)
+        #     continue
+
+        # print("boxes_: ", boxes_)
+        # rescale the coordinates to the original image
+        if args.letterbox_resize:
+            boxes_[:, [0, 2]] = (boxes_[:, [0, 2]] - dw) / resize_ratio
+            boxes_[:, [1, 3]] = (boxes_[:, [1, 3]] - dh) / resize_ratio
+        else:
+            boxes_[:, [0, 2]] *= (width_ori / float(args.new_size[0]))
+            boxes_[:, [1, 3]] *= (height_ori / float(args.new_size[1]))
+
+        if args.save_img:
+            # print("box coords:")
+            # print(boxes_)
+            # print('*' * 30)
+            # print("scores:")
+            # print(scores_)
+            # print('*' * 30)
+            # print("labels:")
+            # print(labels_)
+            for i in range(len(boxes_)):
+                x0, y0, x1, y1 = boxes_[i]
+                plot_one_box(img_ori, [x0, y0, x1, y1],
+                             label=args.classes[labels_[i]] + ', {:.2f}%'.format(scores_[i] * 100),
+                             color=color_table[labels_[i]])
+            cv2.imwrite('tmp/%d_detection_result.jpg' % test_idx, img_ori)
+            print('%d done' % test_idx)
+
+        if args.save_json:
+            for i in range(len(boxes_)):
+                x0, y0, x1, y1 = boxes_[i]
+                bw = x1 - x0
+                bh = y1 - y0
+                s = scores_[i]
+                c = labels_[i]
+                t_dict = get_default_dict()
+                t_dict['image_id'] = int(img_path.split('/')[-1].split('.')[0].split('_')[-1])
+                t_dict['category_id'] = real_id_to_cat_id[int(c) + 1]
+                t_dict['bbox'] = [int(i) for i in [x0, y0, bw, bh]]
+                t_dict['score'] = float(s)
+                json_out.append(t_dict)
+
+if args.save_json:
+    with open(args.save_json_path, 'w')as f:
+        json.dump(json_out, f)
+    print('output json saved to: ', args.save_json_path)
+    eval_coco = os.path.realpath(__file__ + "/../eval_coco.py")
+    os.system('python3.7 %s %s' % (eval_coco, args.save_json_path))
@@ -0,0 +1,61 @@
+
+#export CUDA_VISIBLE_DEVICES=''
+#export CUDA_VISIBLE_DEVICES=7
+
+
+
+# setting main path
+MAIN_PATH=$(dirname $(readlink -f $0))
+
+## set env
+#export PYTHONPATH=/usr/local/Ascend/ops/op_impl/built-in/ai_core/tbe/:$MAIN_PATH/../../../
+#export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/x86_64-linux-gnu
+#PATH=$PATH:$HOME/bin
+#export PATH=$PATH:/usr/local/Ascend/fwkacllib/ccec_compiler/bin:$PATH
+#export ASCEND_OPP_PATH=/usr/local/Ascend/opp
+
+# set env
+export ASCEND_HOME=/usr/local/Ascend
+export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
+export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/te:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/topi:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/hccl:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$currentDir
+export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
+export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
+
+export DDK_VERSION_FLAG=1.60.T49.0.B201
+export NEW_GE_FE_ID=1
+export GE_AICPU_FLAG=1
+export SOC_VERSION=Ascend910
+
+export JOB_ID=10087
+export FUSION_TENSOR_SIZE=1000000000
+#export SLOG_PRINT_TO_STDOUT=1
+#export DUMP_GE_GRAPH=2
+#export DUMP_GRAPH_LEVEL=3
+
+
+
+# Launch one background evaluation job per device id 0-7.
+for((RANK_ID=0;RANK_ID<8;RANK_ID++));
+do
+
+# Each job runs on its own device with RANK_SIZE=1, i.e. the jobs do not
+# form a distributed group.
+export RANK_ID=$RANK_ID
+export RANK_SIZE=1
+export DEVICE_ID=$RANK_ID
+export DEVICE_INDEX=$RANK_ID
+
+# NOTE(review): presumably sets the log level of this device through the adc tool -- confirm.
+su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[debug]\" --device "$RANK_ID
+
+# Checkpoint directory to evaluate for this device id.
+RESTORE_PATH=./training/t1/D$RANK_ID/training/
+
+# Detections are written to eval_res_D<id>.json and stdout goes to
+# eval_<id>.out; the job is sent to the background so all devices run at once.
+nohup python3.7 eval.py \
+--save_json True \
+--score_thresh 0.0001 \
+--nms_thresh 0.55 \
+--max_boxes 100 \
+--restore_path $RESTORE_PATH \
+--max_test 10000 \
+--save_json_path eval_res_D$RANK_ID.json > eval_$RANK_ID.out &
+
+
+done
+
+
@@ -0,0 +1,57 @@
+#-*- coding:utf-8 -*-
+# import matplotlib.pyplot as plt
+from pycocotools.coco import COCO 
+from pycocotools.cocoeval import COCOeval 
+import numpy as np 
+import pylab,json
+import sys
+# pylab.rcParams['figure.figsize'] = (10.0, 8.0)
+
+def get_img_id(file_name): 
+    ls = [] 
+    myset = [] 
+    annos = json.load(open(file_name, 'r')) 
+    for anno in annos: 
+      ls.append(anno['image_id']) 
+    myset = {}.fromkeys(ls).keys() 
+    return myset
+
+
+'''
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.317
+ Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.562
+ Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.321
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.162
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.343
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.448
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.278
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.438
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.464
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.275
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.497
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.625
+'''
+
+if __name__ == '__main__': 
+    annType = ['segm', 'bbox', 'keypoints']#set iouType to 'segm', 'bbox' or 'keypoints'
+    annType = annType[1] # specify type here
+    cocoGt_file = '/opt/npu/dataset/coco/coco2014/annotations/instances_val2014.json'
+    cocoGt = COCO(cocoGt_file)#取得标注集中coco json对象
+    # print(list(cocoGt.anns.items())[:10])
+    # print(cocoGt.anns[318219])
+    # input()
+    # cocoDt_file = 'result.json'
+    cocoDt_file = sys.argv[1]
+
+    imgIds = get_img_id(cocoDt_file) 
+    # print(len(imgIds))
+    cocoDt = cocoGt.loadRes(cocoDt_file)#取得结果集中image json对象
+    imgIds = sorted(imgIds)#按顺序排列coco标注集image_id
+    # print(imgIds)
+    # input()
+    # imgIds = imgIds[0:5000]#标注集中的image数据
+    cocoEval = COCOeval(cocoGt, cocoDt, annType) 
+    cocoEval.params.imgIds = imgIds#参数设置
+    cocoEval.evaluate()#评价
+    cocoEval.accumulate()#积累
+    cocoEval.summarize()#总结
@@ -0,0 +1,155 @@
+# coding: utf-8
+# This script is modified from https://github.com/lars76/kmeans-anchor-boxes
+
+from __future__ import division, print_function
+
+import numpy as np
+
+def iou(box, clusters):
+    """
+    Calculates the Intersection over Union (IoU) between a box and k clusters.
+    param:
+        box: tuple or array, shifted to the origin (i. e. width and height)
+        clusters: numpy array of shape (k, 2) where k is the number of clusters
+    return:
+        numpy array of shape (k, 0) where k is the number of clusters
+    """
+    x = np.minimum(clusters[:, 0], box[0])
+    y = np.minimum(clusters[:, 1], box[1])
+    if np.count_nonzero(x == 0) > 0 or np.count_nonzero(y == 0) > 0:
+        raise ValueError("Box has no area")
+
+    intersection = x * y
+    box_area = box[0] * box[1]
+    cluster_area = clusters[:, 0] * clusters[:, 1]
+
+    iou_ = np.true_divide(intersection, box_area + cluster_area - intersection + 1e-10)
+    # iou_ = intersection / (box_area + cluster_area - intersection + 1e-10)
+
+    return iou_
+
+
+def avg_iou(boxes, clusters):
+    """
+    Calculates the average Intersection over Union (IoU) between a numpy array of boxes and k clusters.
+    param:
+        boxes: numpy array of shape (r, 2), where r is the number of rows
+        clusters: numpy array of shape (k, 2) where k is the number of clusters
+    return:
+        average IoU as a single float
+    """
+    return np.mean([np.max(iou(boxes[i], clusters)) for i in range(boxes.shape[0])])
+
+
+def translate_boxes(boxes):
+    """
+    Translates all the boxes to the origin.
+    param:
+        boxes: numpy array of shape (r, 4)
+    return:
+    numpy array of shape (r, 2)
+    """
+    new_boxes = boxes.copy()
+    for row in range(new_boxes.shape[0]):
+        new_boxes[row][2] = np.abs(new_boxes[row][2] - new_boxes[row][0])
+        new_boxes[row][3] = np.abs(new_boxes[row][3] - new_boxes[row][1])
+    return np.delete(new_boxes, [0, 1], axis=1)
+
+
+def kmeans(boxes, k, dist=np.median):
+    """
+    Calculates k-means clustering with the Intersection over Union (IoU) metric.
+    param:
+        boxes: numpy array of shape (r, 2), where r is the number of rows
+        k: number of clusters
+        dist: distance function
+    return:
+        numpy array of shape (k, 2)
+    """
+    rows = boxes.shape[0]
+
+    distances = np.empty((rows, k))
+    last_clusters = np.zeros((rows,))
+
+    np.random.seed()
+
+    # the Forgy method will fail if the whole array contains the same rows
+    clusters = boxes[np.random.choice(rows, k, replace=False)]
+
+    while True:
+        for row in range(rows):
+            distances[row] = 1 - iou(boxes[row], clusters)
+
+        nearest_clusters = np.argmin(distances, axis=1)
+
+        if (last_clusters == nearest_clusters).all():
+            break
+
+        for cluster in range(k):
+            clusters[cluster] = dist(boxes[nearest_clusters == cluster], axis=0)
+
+        last_clusters = nearest_clusters
+
+    return clusters
+
+
+def parse_anno(annotation_path, target_size=None):
+    anno = open(annotation_path, 'r')
+    result = []
+    for line in anno:
+        s = line.strip().split(' ')
+        img_w = int(s[2])
+        img_h = int(s[3])
+        s = s[4:]
+        box_cnt = len(s) // 5
+        for i in range(box_cnt):
+            x_min, y_min, x_max, y_max = float(s[i*5+1]), float(s[i*5+2]), float(s[i*5+3]), float(s[i*5+4])
+            width = x_max - x_min
+            height = y_max - y_min
+            assert width > 0
+            assert height > 0
+            # use letterbox resize, i.e. keep the original aspect ratio
+            # get k-means anchors on the resized target image size
+            if target_size is not None:
+                resize_ratio = min(target_size[0] / img_w, target_size[1] / img_h)
+                width *= resize_ratio
+                height *= resize_ratio
+                result.append([width, height])
+            # get k-means anchors on the original image size
+            else:
+                result.append([width, height])
+    result = np.asarray(result)
+    return result
+
+
+def get_kmeans(anno, cluster_num=9):
+
+    anchors = kmeans(anno, cluster_num)
+    ave_iou = avg_iou(anno, anchors)
+
+    anchors = anchors.astype('int').tolist()
+
+    anchors = sorted(anchors, key=lambda x: x[0] * x[1])
+
+    return anchors, ave_iou
+
+
+if __name__ == '__main__':
+    # target resize format: [width, height]
+    # if target_resize is speficied, the anchors are on the resized image scale
+    # if target_resize is set to None, the anchors are on the original image scale
+    target_size = [416, 416]
+    annotation_path = "train.txt"
+    anno_result = parse_anno(annotation_path, target_size=target_size)
+    anchors, ave_iou = get_kmeans(anno_result, 9)
+
+    anchor_string = ''
+    for anchor in anchors:
+        anchor_string += '{},{}, '.format(anchor[0], anchor[1])
+    anchor_string = anchor_string[:-2]
+
+    print('anchors are:')
+    print(anchor_string)
+    print('the average iou is:')
+    print(ave_iou)
+
@@ -0,0 +1,32 @@
+{
+    "board_id": "0x002f",
+    "chip_info": "910",
+    "deploy_mode": "lab",
+    "group_count": "1",
+    "group_list": [
+        {
+            "device_num": "1",
+            "server_num": "1",
+            "group_name": "",
+            "instance_count": "1",
+            "instance_list": [
+                {
+                    "devices": [
+                        {
+                            "device_id": "0",
+                            "device_ip": "192.168.100.101"
+                        }
+                    ],
+                    "rank_id": "0",
+                    "server_id": "0.0.0.0"
+                }
+           ]
+        }
+    ],
+    "para_plane_nic_location": "device",
+    "para_plane_nic_name": [
+        "eth0"
+    ],
+    "para_plane_nic_num": "1",
+    "status": "completed"
+}
@@ -0,0 +1,43 @@
+{
+    "board_id": "0x002f",
+    "chip_info": "910",
+    "deploy_mode": "lab",
+    "group_count": "1",
+    "group_list": [
+        {
+            "device_num": "2",
+            "server_num": "1",
+            "group_name": "",
+            "instance_count": "2",
+            "instance_list": [
+                {
+                    "devices": [
+                        {
+                            "device_id": "0",
+                            "device_ip": "192.168.100.101"
+                        }
+                    ],
+                    "rank_id": "0",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "1",
+                            "device_ip": "192.168.101.101"
+                        }
+                    ],
+                    "rank_id": "1",
+                    "server_id": "0.0.0.0"
+                }
+            ]
+        }
+    ],
+    "para_plane_nic_location": "device",
+    "para_plane_nic_name": [
+        "eth0",
+        "eth1"
+    ],
+    "para_plane_nic_num": "2",
+    "status": "completed"
+}
@@ -0,0 +1,65 @@
+{
+    "board_id": "0x002f",
+    "chip_info": "910",
+    "deploy_mode": "lab",
+    "group_count": "1",
+    "group_list": [
+        {
+            "device_num": "4",
+            "server_num": "1",
+            "group_name": "",
+            "instance_count": "4",
+            "instance_list": [
+                {
+                    "devices": [
+                        {
+                            "device_id": "0",
+                            "device_ip": "192.168.100.101"
+                        }
+                    ],
+                    "rank_id": "0",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "1",
+                            "device_ip": "192.168.101.101"
+                        }
+                    ],
+                    "rank_id": "1",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "2",
+                            "device_ip": "192.168.102.101"
+                        }
+                    ],
+                    "rank_id": "2",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "3",
+                            "device_ip": "192.168.103.101"
+                        }
+                    ],
+                    "rank_id": "3",
+                    "server_id": "0.0.0.0"
+                }
+            ]
+        }
+    ],
+    "para_plane_nic_location": "device",
+    "para_plane_nic_name": [
+        "eth0",
+        "eth1",
+        "eth2",
+        "eth3"
+    ],
+    "para_plane_nic_num": "4",
+    "status": "completed"
+}
@@ -0,0 +1,109 @@
+{
+    "board_id": "0x002f",
+    "chip_info": "910",
+    "deploy_mode": "lab",
+    "group_count": "1",
+    "group_list": [
+        {
+            "device_num": "8",
+            "server_num": "1",
+            "group_name": "",
+            "instance_count": "8",
+            "instance_list": [
+                {
+                    "devices": [
+                        {
+                            "device_id": "0",
+                            "device_ip": "192.168.100.101"
+                        }
+                    ],
+                    "rank_id": "0",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "1",
+                            "device_ip": "192.168.101.101"
+                        }
+                    ],
+                    "rank_id": "1",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "2",
+                            "device_ip": "192.168.102.101"
+                        }
+                    ],
+                    "rank_id": "2",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "3",
+                            "device_ip": "192.168.103.101"
+                        }
+                    ],
+                    "rank_id": "3",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "4",
+                            "device_ip": "192.168.100.100"
+                        }
+                    ],
+                    "rank_id": "4",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "5",
+                            "device_ip": "192.168.101.100"
+                        }
+                    ],
+                    "rank_id": "5",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "6",
+                            "device_ip": "192.168.102.100"
+                        }
+                    ],
+                    "rank_id": "6",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "7",
+                            "device_ip": "192.168.103.100"
+                        }
+                    ],
+                    "rank_id": "7",
+                    "server_id": "0.0.0.0"
+                }
+            ]
+        }
+    ],
+    "para_plane_nic_location": "device",
+    "para_plane_nic_name": [
+        "eth0",
+        "eth1",
+        "eth2",
+        "eth3",
+        "eth4",
+        "eth5",
+        "eth6",
+        "eth7"
+    ],
+    "para_plane_nic_num": "8",
+    "status": "completed"
+}
@@ -0,0 +1,88 @@
+# coding: utf-8
+# This file contains the parameter used in train.py
+
+from __future__ import division, print_function
+
+from utils.misc_utils import parse_anchors, read_class_names
+import math
+
+### Some paths
+train_file = './data/my_data/train.txt'  # The path of the training txt file.
+val_file = './data/my_data/val.txt'  # The path of the validation txt file.
+restore_path = './data/darknet_weights/yolov3.ckpt'  # The path of the weights to restore.
+save_dir = './checkpoint/'  # The directory of the weights to save.
+log_dir = './data/logs/'  # The directory to store the tensorboard log files.
+progress_log_path = './data/progress.log'  # The path to record the training progress.
+anchor_path = './data/yolo_anchors.txt'  # The path of the anchor txt file.
+class_name_path = './data/voc.names'  # The path of the class names.
+
+### Training related numbers
+batch_size = 6
+img_size = [416, 416]  # Images will be resized to `img_size` and fed to the network, size format: [width, height]
+letterbox_resize = False  # Whether to use the letterbox resize, i.e., keep the original aspect ratio in the resized image.
+total_epoches = 100
+train_evaluation_step = 100  # Evaluate on the training batch after some steps.
+val_evaluation_epoch = 1  # Evaluate on the whole validation dataset after some epochs. Set to None to evaluate every epoch.
+save_epoch = 10  # Save the model after some epochs.
+batch_norm_decay = 0.99  # decay in bn ops
+weight_decay = 5e-4  # l2 weight decay
+global_step = 0  # used when resuming training; also added to the piecewise boundaries at the bottom of this file
+
+### tf.data parameters
+num_threads = 10  # Number of threads for image processing used in tf.data pipeline.
+prefetech_buffer = 5  # Prefetch buffer size used in tf.data pipeline.
+
+### Learning rate and optimizer
+optimizer_name = 'momentum'  # Chosen from [sgd, momentum, adam, rmsprop]
+save_optimizer = False  # Whether to save the optimizer parameters into the checkpoint file.
+learning_rate_init = 1e-4
+lr_type = 'piecewise'  # Chosen from [fixed, exponential, cosine_decay, cosine_decay_restart, piecewise]
+lr_decay_epoch = 5  # Epochs after which learning rate decays. Int or float. Used when chosen `exponential` and `cosine_decay_restart` lr_type.
+lr_decay_factor = 0.96  # The learning rate decay factor. Used when chosen `exponential` lr_type.
+lr_lower_bound = 1e-6  # The minimum learning rate.
+# piecewise params
+pw_boundaries = [25, 40]  # epoch based boundaries; converted to step counts at the bottom of this file
+# presumably one value per interval: before the first boundary, between the two, after the second
+# NOTE(review): the last value (1e-4) equals learning_rate_init, so the rate would rise again
+# after the second boundary -- confirm this is intended
+pw_values = [learning_rate_init, 3e-5, 1e-4]
+
+### Load and finetune
+# Choose the parts you want to restore the weights. List form.
+# restore_include: None, restore_exclude: None  => restore the whole model
+# restore_include: None, restore_exclude: scope  => restore the whole model except `scope`
+# restore_include: scope1, restore_exclude: scope2  => if scope1 contains scope2, restore scope1 and not restore scope2 (scope1 - scope2)
+# choice 1: only restore the darknet body
+# restore_include = ['yolov3/darknet53_body']
+# restore_exclude = None
+# choice 2: restore all layers except the last 3 conv2d layers in 3 scale
+restore_include = None
+restore_exclude = ['yolov3/yolov3_head/Conv_14', 'yolov3/yolov3_head/Conv_6', 'yolov3/yolov3_head/Conv_22']
+# Choose the parts you want to finetune. List form.
+# Set to None to train the whole model.
+update_part = None
+
+### other training strategies
+multi_scale_train = True  # Whether to apply multi-scale training strategy. Image size varies from [320, 320] to [640, 640] by default.
+use_label_smooth = True # Whether to use class label smoothing strategy.
+use_focal_loss = True  # Whether to apply focal loss on the conf loss.
+use_mix_up = True  # Whether to use mix up data augmentation strategy.
+use_warm_up = True  # Whether to use warm up strategy to prevent gradient exploding.
+warm_up_epoch = 3  # Warm up training epochs. Set to a larger value if gradient explodes.
+
+### some constants in validation
+# nms
+nms_threshold = 0.45  # iou threshold in nms operation
+score_threshold = 0.01 # threshold of the probability of the classes in nms operation, i.e. score = pred_confs * pred_probs. set lower for higher recall.
+nms_topk = 150  # keep at most nms_topk outputs after nms
+# mAP eval
+eval_threshold = 0.5  # the iou threshold applied in mAP evaluation
+use_voc_07_metric = False  # whether to use voc 2007 evaluation metric, i.e. the 11-point metric
+
+### parse some params
+anchors = parse_anchors(anchor_path)
+classes = read_class_names(class_name_path)
+class_num = len(classes)
+train_img_cnt = len(open(train_file, 'r').readlines())
+val_img_cnt = len(open(val_file, 'r').readlines())
+train_batch_num = int(math.ceil(float(train_img_cnt) / batch_size))
+
+lr_decay_freq = int(train_batch_num * lr_decay_epoch)
+pw_boundaries = [float(i) * train_batch_num + global_step for i in pw_boundaries]
@@ -0,0 +1,140 @@
+# coding: utf-8
+
+from __future__ import division, print_function
+
+import tensorflow as tf
+import numpy as np
+import argparse
+from tqdm import trange
+
+from utils.data_utils import get_batch_data
+from utils.misc_utils import parse_anchors, read_class_names, AverageMeter
+from utils.eval_utils import evaluate_on_cpu, evaluate_on_gpu, get_preds_gpu, voc_eval, parse_gt_rec
+from utils.nms_utils import gpu_nms
+
+from model import yolov3
+
+#################
+# ArgumentParser
+#################
+# Command-line options of the evaluation script; the parsed values are stored on `args`.
+parser = argparse.ArgumentParser(description="YOLO-V3 eval procedure.")
+# some paths
+parser.add_argument("--eval_file", type=str, default="./data/my_data/val.txt",
+                    help="The path of the validation or test txt file.")
+
+parser.add_argument("--restore_path", type=str, default="./data/checkpoint_whole_finetune_no_letterbox/best_model_Epoch_32_step_91046_mAP_0.8754_loss_2.2147_lr_3e-05",
+                    help="The path of the weights to restore.")
+
+parser.add_argument("--anchor_path", type=str, default="./data/yolo_anchors.txt",
+                    help="The path of the anchor txt file.")
+
+parser.add_argument("--class_name_path", type=str, default="./data/voc.names",
+                    help="The path of the class names.")
+
+# some numbers
+# nargs='*' collects all given integers into a list, e.g. `--img_size 416 416`
+parser.add_argument("--img_size", nargs='*', type=int, default=[416, 416],
+                    help="Resize the input image to `img_size`, size format: [width, height]")
+
+# boolean flag parsed from text: only the string "true" (any letter case) yields True,
+# every other value yields False
+parser.add_argument("--letterbox_resize", type=lambda x: (str(x).lower() == 'true'), default=False,
+                    help="Whether to use the letterbox resize.")
+
+parser.add_argument("--num_threads", type=int, default=10,
+                    help="Number of threads for image processing used in tf.data pipeline.")
+
+parser.add_argument("--prefetech_buffer", type=int, default=5,
+                    help="Prefetech_buffer used in tf.data pipeline.")
+
+parser.add_argument("--nms_threshold", type=float, default=0.45,
+                    help="IOU threshold in nms operation.")
+
+parser.add_argument("--score_threshold", type=float, default=0.01,
+                    help="Threshold of the probability of the classes in nms operation.")
+
+parser.add_argument("--nms_topk", type=int, default=150,
+                    help="Keep at most nms_topk outputs after nms.")
+
+# same text-to-bool parsing as --letterbox_resize
+parser.add_argument("--use_voc_07_metric", type=lambda x: (str(x).lower() == 'true'), default=False,
+                    help="Whether to use the voc 2007 mAP metrics.")
+
+# parsing happens at module level, i.e. as soon as this script is executed
+args = parser.parse_args()
+
+# args params
+args.anchors = parse_anchors(args.anchor_path)
+args.classes = read_class_names(args.class_name_path)
+args.class_num = len(args.classes)
+args.img_cnt = len(open(args.eval_file, 'r').readlines())
+
+# setting placeholders
+is_training = tf.placeholder(dtype=tf.bool, name="phase_train")
+handle_flag = tf.placeholder(tf.string, [], name='iterator_handle_flag')
+pred_boxes_flag = tf.placeholder(tf.float32, [1, None, None])
+pred_scores_flag = tf.placeholder(tf.float32, [1, None, None])
+gpu_nms_op = gpu_nms(pred_boxes_flag, pred_scores_flag, args.class_num, args.nms_topk, args.score_threshold, args.nms_threshold)
+
+##################
+# tf.data pipeline
+##################
+val_dataset = tf.data.TextLineDataset(args.eval_file)
+val_dataset = val_dataset.batch(1)
+val_dataset = val_dataset.map(
+    lambda x: tf.py_func(get_batch_data, [x, args.class_num, args.img_size, args.anchors, 'val', False, False, args.letterbox_resize], [tf.int64, tf.float32, tf.float32, tf.float32, tf.float32]),
+    num_parallel_calls=args.num_threads
+)
+val_dataset.prefetch(args.prefetech_buffer)
+iterator = val_dataset.make_one_shot_iterator()
+
+image_ids, image, y_true_13, y_true_26, y_true_52 = iterator.get_next()
+image_ids.set_shape([None])
+y_true = [y_true_13, y_true_26, y_true_52]
+image.set_shape([None, args.img_size[1], args.img_size[0], 3])
+for y in y_true:
+    y.set_shape([None, None, None, None, None])
+
+##################
+# Model definition
+##################
+# Build the network on the dataset tensors: raw feature maps, the loss against
+# y_true, and the decoded predictions.
+yolo_model = yolov3(args.class_num, args.anchors)
+with tf.variable_scope('yolov3'):
+    pred_feature_maps = yolo_model.forward(image, is_training=is_training)
+loss = yolo_model.compute_loss(pred_feature_maps, y_true)
+y_pred = yolo_model.predict(pred_feature_maps)
+
+saver_to_restore = tf.train.Saver()
+
+with tf.Session() as sess:
+    # variables are initialized first, then overwritten by the restored checkpoint
+    sess.run([tf.global_variables_initializer()])
+    saver_to_restore.restore(sess, args.restore_path)
+
+    print('\n----------- start to eval -----------\n')
+
+    val_loss_total, val_loss_xy, val_loss_wh, val_loss_conf, val_loss_class = \
+        AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter()
+    val_preds = []
+
+    # the dataset is batched with size 1, so one run consumes one line of the
+    # eval file; args.img_cnt runs exhaust the one-shot iterator exactly
+    for j in trange(args.img_cnt):
+        __image_ids, __y_pred, __loss = sess.run([image_ids, y_pred, loss], feed_dict={is_training: False})
+        pred_content = get_preds_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, __image_ids, __y_pred)
+
+        val_preds.extend(pred_content)
+        # __loss is indexed in the order: total, xy, wh, conf, class
+        val_loss_total.update(__loss[0])
+        val_loss_xy.update(__loss[1])
+        val_loss_wh.update(__loss[2])
+        val_loss_conf.update(__loss[3])
+        val_loss_class.update(__loss[4])
+
+    rec_total, prec_total, ap_total = AverageMeter(), AverageMeter(), AverageMeter()
+    gt_dict = parse_gt_rec(args.eval_file, args.img_size, args.letterbox_resize)
+    print('mAP eval:')
+    for ii in range(args.class_num):
+        # the IoU threshold is fixed at 0.5 here; there is no command-line option for it
+        npos, nd, rec, prec, ap = voc_eval(gt_dict, val_preds, ii, iou_thres=0.5, use_07_metric=args.use_voc_07_metric)
+        # presumably the second argument of update() is a weight/count, so recall is
+        # weighted by npos and precision by nd -- confirm in utils.misc_utils.AverageMeter
+        rec_total.update(rec, npos)
+        prec_total.update(prec, nd)
+        ap_total.update(ap, 1)
+        print('Class {}: Recall: {:.4f}, Precision: {:.4f}, AP: {:.4f}'.format(ii, rec, prec, ap))
+
+    # every class contributes to ap_total with weight 1
+    mAP = ap_total.average
+    print('final mAP: {:.4f}'.format(mAP))
+    print("recall: {:.3f}, precision: {:.3f}".format(rec_total.average, prec_total.average))
+    print("total_loss: {:.3f}, loss_xy: {:.3f}, loss_wh: {:.3f}, loss_conf: {:.3f}, loss_class: {:.3f}".format(
+        val_loss_total.average, val_loss_xy.average, val_loss_wh.average, val_loss_conf.average, val_loss_class.average
+    ))
@@ -0,0 +1,20 @@
+aeroplane
+bicycle
+bird
+boat
+bottle
+bus
+car
+cat
+chair
+cow
+diningtable
+dog
+horse
+motorbike
+person
+pottedplant
+sheep
+sofa
+train
+tvmonitor
@@ -0,0 +1,96 @@
+# coding: utf-8
+
+import xml.etree.ElementTree as ET
+import os
+
+names_dict = {}
+cnt = 0
+f = open('./voc_names.txt', 'r').readlines()
+for line in f:
+    line = line.strip()
+    names_dict[line] = cnt
+    cnt += 1
+
+voc_07 = '/data/VOCdevkit/VOC2007'
+voc_12 = '/data/VOCdevkit/VOC2012'
+
+anno_path = [os.path.join(voc_07, 'Annotations'), os.path.join(voc_12, 'Annotations')]
+img_path = [os.path.join(voc_07, 'JPEGImages'), os.path.join(voc_12, 'JPEGImages')]
+
+trainval_path = [os.path.join(voc_07, 'ImageSets/Main/trainval.txt'),
+                 os.path.join(voc_12, 'ImageSets/Main/trainval.txt')]
+test_path = [os.path.join(voc_07, 'ImageSets/Main/test.txt')]
+
+
+def parse_xml(path):
+    tree = ET.parse(path)
+    img_name = path.split('/')[-1][:-4]
+    
+    height = tree.findtext("./size/height")
+    width = tree.findtext("./size/width")
+
+    objects = [img_name, width, height]
+
+    for obj in tree.findall('object'):
+        difficult = obj.find('difficult').text
+        if difficult == '1':
+            continue
+        name = obj.find('name').text
+        bbox = obj.find('bndbox')
+        xmin = bbox.find('xmin').text
+        ymin = bbox.find('ymin').text
+        xmax = bbox.find('xmax').text
+        ymax = bbox.find('ymax').text
+
+        name = str(names_dict[name])
+        objects.extend([name, xmin, ymin, xmax, ymax])
+    if len(objects) > 1:
+        return objects
+    else:
+        return None
+
+test_cnt = 0
+def gen_test_txt(txt_path):
+    global test_cnt
+    f = open(txt_path, 'w')
+
+    for i, path in enumerate(test_path):
+        img_names = open(path, 'r').readlines()
+        for img_name in img_names:
+            img_name = img_name.strip()
+            xml_path = anno_path[i] + '/' + img_name + '.xml'
+            objects = parse_xml(xml_path)
+            if objects:
+                objects[0] = img_path[i] + '/' + img_name + '.jpg'
+                if os.path.exists(objects[0]):
+                    objects.insert(0, str(test_cnt))
+                    test_cnt += 1
+                    objects = ' '.join(objects) + '\n'
+                    f.write(objects)
+    f.close()
+
+
+train_cnt = 0
+def gen_train_txt(txt_path):
+    global train_cnt
+    f = open(txt_path, 'w')
+
+    for i, path in enumerate(trainval_path):
+        img_names = open(path, 'r').readlines()
+        for img_name in img_names:
+            img_name = img_name.strip()
+            xml_path = anno_path[i] + '/' + img_name + '.xml'
+            objects = parse_xml(xml_path)
+            if objects:
+                objects[0] = img_path[i] + '/' + img_name + '.jpg'
+                if os.path.exists(objects[0]):
+                    objects.insert(0, str(train_cnt))
+                    train_cnt += 1
+                    objects = ' '.join(objects) + '\n'
+                    f.write(objects)
+    f.close()
+
+
+gen_train_txt('train.txt')
+gen_test_txt('val.txt')
+
@@ -0,0 +1,32 @@
+# coding: utf-8
+
+# This script is used to remove the optimizer parameters in the saved checkpoint files.
+# These parameters are useless in the forward process. 
+# Removing them will shrink the checkpoint size a lot.
+
+import sys
+sys.path.append('..')
+
+import os
+import tensorflow as tf
+from model import yolov3
+
+# params
+# ckpt_path is empty here and must be set to the checkpoint to shrink before running
+ckpt_path = ''
+class_num = 20
+save_dir = 'shrinked_ckpt'
+if not os.path.exists(save_dir):
+    os.makedirs(save_dir)
+
+# Only the forward graph is built here (no loss, no optimizer); anchors are
+# passed as None because `forward` does not read them.
+image = tf.placeholder(tf.float32, [1, 416, 416, 3])
+yolo_model = yolov3(class_num, None)
+with tf.variable_scope('yolov3'):
+    pred_feature_maps = yolo_model.forward(image)
+
+# Both savers are created after the forward graph exists and without an explicit
+# variable list -- presumably they therefore cover only the model variables, which
+# is what drops the optimizer parameters from the re-saved checkpoint.
+saver_to_restore = tf.train.Saver()
+saver_to_save = tf.train.Saver()
+
+with tf.Session() as sess:
+    sess.run(tf.global_variables_initializer())
+    saver_to_restore.restore(sess, ckpt_path)
+    # written as <save_dir>/shrinked.*
+    saver_to_save.save(sess, save_dir + '/shrinked')
@@ -0,0 +1,457 @@
+# coding=utf-8
+# for better understanding about yolov3 architecture, refer to this website (in Chinese):
+# https://blog.csdn.net/leviopku/article/details/82660381
+
+from __future__ import division, print_function
+
+import tensorflow as tf
+
+slim = tf.contrib.slim
+
+from utils.layer_utils import conv2d, darknet53_body, yolo_block, upsample_layer
+
+
+class yolov3(object):
+
+    def __init__(self, class_num, anchors, use_label_smooth=False, use_focal_loss=False, batch_norm_decay=0.999,
+                 weight_decay=5e-4, use_static_shape=True,
+                 img_size=(416, 416), batch_size=None):
+
+        # self.anchors = [[10, 13], [16, 30], [33, 23],
+        #                 [30, 61], [62, 45], [59, 119],
+        #                 [116, 90], [156, 198], [373, 326]]
+        self.class_num = class_num
+        self.anchors = anchors
+        self.batch_norm_decay = batch_norm_decay
+        self.use_label_smooth = use_label_smooth
+        self.use_focal_loss = use_focal_loss
+        self.weight_decay = weight_decay
+        # inference speed optimization
+        # if `use_static_shape` is True, use tensor.get_shape(), otherwise use tf.shape(tensor)
+        # static_shape is slightly faster
+        self.use_static_shape = use_static_shape
+        self.batch_size = batch_size
+        # self.img_size = (416, 416)
+        self.img_size = img_size
+        self.featrue_map_shape_base = [32, 16, 8]
+        self.featrue_map_shape = [(self.img_size[0] // i, self.img_size[1] // i) for i in self.featrue_map_shape_base]
+
+    def forward(self, inputs, is_training=False, reuse=False):
+        '''
+        Build the darknet53 backbone and the three-branch detection head.
+        inputs: image batch handed to `darknet53_body`
+            (presumably NHWC, since feature maps are concatenated on axis 3 -- TODO confirm)
+        is_training: forwarded to the batch norm layers as `is_training`
+        reuse: forwarded as `reuse` to every slim conv2d / batch_norm layer
+        return: (feature_map_1, feature_map_2, feature_map_3), each the raw output of a
+            1x1 conv with 3 * (5 + class_num) channels, without normalizer or activation.
+            feature_map_1 is derived from route_3, feature_map_2 from route_2 and
+            feature_map_3 from route_1.
+        NOTE(review): the head convs get no explicit scope name, so their auto-generated
+            names presumably depend on creation order -- keep the layer order stable.
+        '''
+        # the input img_size, form: [height, width]
+        # it will be used later
+        # self.img_size = tf.shape(inputs)[1:3]
+        # self.featrue_map_shape = [(self.img_size[0]//i, self.img_size[1]//i) for i in self.featrue_map_shape_base]
+        # set batch norm params
+        batch_norm_params = {
+            'decay': self.batch_norm_decay,
+            'epsilon': 1e-05,
+            'scale': True,
+            'is_training': is_training,
+            'fused': None,  # Use fused batch norm if possible.
+        }
+
+        with slim.arg_scope([slim.conv2d, slim.batch_norm], reuse=reuse):
+            # defaults for every conv below: batch norm, no bias, leaky relu (alpha 0.1), l2 regularizer
+            with slim.arg_scope([slim.conv2d],
+                                normalizer_fn=slim.batch_norm,
+                                normalizer_params=batch_norm_params,
+                                biases_initializer=None,
+                                activation_fn=lambda x: tf.nn.leaky_relu(x, alpha=0.1),
+                                weights_regularizer=slim.l2_regularizer(self.weight_decay)):
+                with tf.variable_scope('darknet53_body'):
+                    route_1, route_2, route_3 = darknet53_body(inputs)
+
+                with tf.variable_scope('yolov3_head'):
+                    # first branch: built directly on route_3
+                    inter1, net = yolo_block(route_3, 512)
+                    # output conv overrides the defaults: plain linear layer with zero-initialized bias
+                    feature_map_1 = slim.conv2d(net, 3 * (5 + self.class_num), 1,
+                                                stride=1, normalizer_fn=None,
+                                                activation_fn=None, biases_initializer=tf.zeros_initializer())
+                    feature_map_1 = tf.identity(feature_map_1, name='feature_map_1')
+
+                    # second branch: upsample inter1 to route_2's shape and concatenate on the last axis
+                    inter1 = conv2d(inter1, 256, 1)
+                    inter1 = upsample_layer(inter1,
+                                            route_2.get_shape().as_list() if self.use_static_shape else tf.shape(
+                                                route_2))
+                    concat1 = tf.concat([inter1, route_2], axis=3)
+
+                    inter2, net = yolo_block(concat1, 256)
+                    feature_map_2 = slim.conv2d(net, 3 * (5 + self.class_num), 1,
+                                                stride=1, normalizer_fn=None,
+                                                activation_fn=None, biases_initializer=tf.zeros_initializer())
+                    feature_map_2 = tf.identity(feature_map_2, name='feature_map_2')
+
+                    # third branch: same pattern with route_1
+                    inter2 = conv2d(inter2, 128, 1)
+                    inter2 = upsample_layer(inter2,
+                                            route_1.get_shape().as_list() if self.use_static_shape else tf.shape(
+                                                route_1))
+                    concat2 = tf.concat([inter2, route_1], axis=3)
+
+                    _, feature_map_3 = yolo_block(concat2, 128)
+                    feature_map_3 = slim.conv2d(feature_map_3, 3 * (5 + self.class_num), 1,
+                                                stride=1, normalizer_fn=None,
+                                                activation_fn=None, biases_initializer=tf.zeros_initializer())
+                    feature_map_3 = tf.identity(feature_map_3, name='feature_map_3')
+
+            return feature_map_1, feature_map_2, feature_map_3
+
+    def reorg_layer(self, feature_map, anchors):
+        '''
+        Decode one raw feature map into boxes on the input image scale.
+        feature_map: a feature_map from [feature_map_1, feature_map_2, feature_map_3] returned
+            from `forward` function
+        anchors: shape: [3, 2]
+        return: (x_y_offset, boxes, conf_logits, prob_logits)
+            boxes hold (center_x, center_y, w, h); conf_logits and prob_logits are
+            returned as logits, no sigmoid is applied to them here
+        '''
+        # NOTE: size in [h, w] format! don't get messed up!
+        grid_size = feature_map.get_shape().as_list()[1:3] if self.use_static_shape else tf.shape(feature_map)[
+                                                                                         1:3]  # [13, 13]
+        # the downscale ratio in height and width
+        # ratio = tf.cast(self.img_size / grid_size, tf.float32)
+        ratio = tf.cast([self.img_size[0] / grid_size[0], self.img_size[1] / grid_size[1]], tf.float32)
+        # rescale the anchors to the feature_map
+        # NOTE: the anchor is in [w, h] format!
+        # hence anchor width is divided by ratio[1] and anchor height by ratio[0]
+        rescaled_anchors = [(anchor[0] / ratio[1], anchor[1] / ratio[0]) for anchor in anchors]
+
+        feature_map = tf.reshape(feature_map, [-1, grid_size[0], grid_size[1], 3, 5 + self.class_num])
+
+        # split the feature_map along the last dimension
+        # shape info: take 416x416 input image and the 13*13 feature_map for example:
+        # box_centers: [N, 13, 13, 3, 2] last_dimension: [center_x, center_y]
+        # box_sizes: [N, 13, 13, 3, 2] last_dimension: [width, height]
+        # conf_logits: [N, 13, 13, 3, 1]
+        # prob_logits: [N, 13, 13, 3, class_num]
+
+        # box_centers, box_sizes, conf_logits, prob_logits = tf.split(feature_map, [2, 2, 1, self.class_num], axis=-1)
+        box_centers = feature_map[..., :2]
+        box_sizes = feature_map[..., 2:4]
+        conf_logits = feature_map[..., 4:5]
+        prob_logits = feature_map[..., 5:]
+
+        # conf_logits = tf.expand_dims(conf_logits, -1)
+
+        # sigmoid keeps the predicted center offset inside (0, 1), i.e. within its grid cell
+        box_centers = tf.nn.sigmoid(box_centers)
+
+        # use some broadcast tricks to get the mesh coordinates
+        grid_x = tf.range(grid_size[1], dtype=tf.int32)
+        grid_y = tf.range(grid_size[0], dtype=tf.int32)
+        grid_x, grid_y = tf.meshgrid(grid_x, grid_y)
+        x_offset = tf.reshape(grid_x, (-1, 1))
+        y_offset = tf.reshape(grid_y, (-1, 1))
+        x_y_offset = tf.concat([x_offset, y_offset], axis=-1)
+        # shape: [13, 13, 1, 2]
+        x_y_offset = tf.cast(tf.reshape(x_y_offset, [grid_size[0], grid_size[1], 1, 2]), tf.float32)
+
+        # get the absolute box coordinates on the feature_map
+        box_centers = box_centers + x_y_offset
+        # rescale to the original image scale
+        # ratio is [h, w] while the boxes are (x, y), hence the reversal
+        box_centers = box_centers * ratio[::-1]
+
+        # avoid getting possible nan value with tf.clip_by_value
+        box_sizes = tf.exp(box_sizes) * rescaled_anchors
+        # box_sizes = tf.clip_by_value(tf.exp(box_sizes), 1e-9, 100) * rescaled_anchors
+        # rescale to the original image scale
+        box_sizes = box_sizes * ratio[::-1]
+
+        # shape: [N, 13, 13, 3, 4]
+        # last dimension: (center_x, center_y, w, h)
+        boxes = tf.concat([box_centers, box_sizes], axis=-1)
+
+        # shape:
+        # x_y_offset: [13, 13, 1, 2]
+        # boxes: [N, 13, 13, 3, 4], rescaled to the original image scale
+        # conf_logits: [N, 13, 13, 3, 1]
+        # prob_logits: [N, 13, 13, 3, class_num]
+        return x_y_offset, boxes, conf_logits, prob_logits
+
+    def predict(self, feature_maps):
+        '''
+        Receive the returned feature_maps from `forward` function,
+        then produce the output predictions at the test stage.
+        feature_maps: (feature_map_1, feature_map_2, feature_map_3)
+        return: (boxes, confs, probs) with the three feature maps concatenated on axis 1
+            boxes: last dimension is (x_min, y_min, x_max, y_max)
+            confs: sigmoid of the conf logits
+            probs: sigmoid of the class logits
+        '''
+        feature_map_1, feature_map_2, feature_map_3 = feature_maps
+
+        # anchors 6-8 go with feature_map_1, 3-5 with feature_map_2, 0-2 with feature_map_3
+        feature_map_anchors = [(feature_map_1, self.anchors[6:9]),
+                               (feature_map_2, self.anchors[3:6]),
+                               (feature_map_3, self.anchors[0:3])]
+        reorg_results = [self.reorg_layer(feature_map, anchors) for (feature_map, anchors) in feature_map_anchors]
+
+        def _reshape(result):
+            # flatten the grid and the 3 anchors per cell into one box axis;
+            # the grid size is read back from x_y_offset, whose first two dims are the grid
+            x_y_offset, boxes, conf_logits, prob_logits = result
+            grid_size = x_y_offset.get_shape().as_list()[:2] if self.use_static_shape else tf.shape(x_y_offset)[:2]
+            boxes = tf.reshape(boxes, [-1, grid_size[0] * grid_size[1] * 3, 4])
+            conf_logits = tf.reshape(conf_logits, [-1, grid_size[0] * grid_size[1] * 3, 1])
+            prob_logits = tf.reshape(prob_logits, [-1, grid_size[0] * grid_size[1] * 3, self.class_num])
+            # shape: (take 416*416 input image and feature_map_1 for example)
+            # boxes: [N, 13*13*3, 4]
+            # conf_logits: [N, 13*13*3, 1]
+            # prob_logits: [N, 13*13*3, class_num]
+            return boxes, conf_logits, prob_logits
+
+        boxes_list, confs_list, probs_list = [], [], []
+        for result in reorg_results:
+            boxes, conf_logits, prob_logits = _reshape(result)
+            # class scores use an independent sigmoid per class, not a softmax
+            confs = tf.sigmoid(conf_logits)
+            probs = tf.sigmoid(prob_logits)
+            boxes_list.append(boxes)
+            confs_list.append(confs)
+            probs_list.append(probs)
+
+        # collect results on three scales
+        # take 416*416 input image for example:
+        # shape: [N, (13*13+26*26+52*52)*3, 4]
+        boxes = tf.concat(boxes_list, axis=1)
+        # shape: [N, (13*13+26*26+52*52)*3, 1]
+        confs = tf.concat(confs_list, axis=1)
+        # shape: [N, (13*13+26*26+52*52)*3, class_num]
+        probs = tf.concat(probs_list, axis=1)
+
+        # center_x, center_y, width, height = tf.split(boxes, [1, 1, 1, 1], axis=-1)
+
+        # center_x = tf.expand_dims(boxes[..., 0], 2)
+        # center_y = tf.expand_dims(boxes[..., 1], 2)
+        # width = tf.expand_dims(boxes[..., 2],    2)
+        # height = tf.expand_dims(boxes[..., 3],   2)
+
+        # slices (not integer indices) keep the last axis, so the parts can be concatenated again
+        center_x = boxes[..., 0:1]
+        center_y = boxes[..., 1:2]
+        width = boxes[..., 2:3]
+        height = boxes[..., 3:]
+
+        # convert (center, size) to corner coordinates
+        x_min = center_x - width / 2
+        y_min = center_y - height / 2
+        x_max = center_x + width / 2
+        y_max = center_y + height / 2
+
+        boxes = tf.concat([x_min, y_min, x_max, y_max], axis=-1)
+
+        return boxes, confs, probs
+
+    def loss_layer(self, feature_map_i, y_true, anchors, feature_map_shape_i, gt_box_i):
+        '''
+        calc loss function from a certain scale
+        input:
+            feature_map_i: feature maps of a certain scale. shape: [N, 13, 13, 3*(5 + num_class)] etc.
+            y_true: y_ture from a certain scale. shape: [N, 13, 13, 3, 5 + num_class + 1] etc.
+            anchors: shape [9, 2]
+        '''
+
+        # size in [h, w] format! don't get messed up!
+        # grid_size = tf.shape(feature_map_i)[1:3]
+        grid_size = tf.shape(feature_map_i)[1:3]
+        # the downscale ratio in height and weight
+        ratio = tf.cast(self.img_size / grid_size, tf.float32)
+        # N: batch_size
+        N = tf.cast(tf.shape(feature_map_i)[0], tf.float32)
+
+        x_y_offset, pred_boxes, pred_conf_logits, pred_prob_logits = self.reorg_layer(feature_map_i, anchors)
+
+        ###########
+        # get mask
+        ###########
+
+        # shape: take 416x416 input image and 13*13 feature_map for example:
+        # [N, 13, 13, 3, 1]
+        object_mask = y_true[..., 4:5]
+
+        # the calculation of ignore mask if referred from
+        # https://github.com/pjreddie/darknet/blob/master/src/yolo_layer.c#L179
+        # ignore_mask = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
+        # def loop_cond(idx, ignore_mask):
+        #     return tf.less(idx, tf.cast(N, tf.int32))
+        # def loop_body(idx, ignore_mask=None):
+        #     # shape: [13, 13, 3, 4] & [13, 13, 3]  ==>  [V, 4]
+        #     # V: num of true gt box of each image in a batch
+        #     valid_true_boxes = tf.boolean_mask(y_true[idx, ..., 0:4], tf.cast(object_mask[idx, ..., 0], 'bool'))
+        #     # shape: [13, 13, 3, 4] & [V, 4] ==> [13, 13, 3, V]
+        #     iou = self.box_iou(pred_boxes[idx], valid_true_boxes)
+        #     # shape: [13, 13, 3]
+        #     best_iou = tf.reduce_max(iou, axis=-1)
+        #     # shape: [13, 13, 3]
+        #     ignore_mask_tmp = tf.cast(best_iou < 0.5, tf.float32)
+        #     # finally will be shape: [N, 13, 13, 3]
+        #     # ignore_mask = ignore_mask.write(idx, ignore_mask_tmp)
+        #     if ignore_mask is None:
+        #         ignore_mask = tf.expand_dims(ignore_mask_tmp, 0)
+        #     else:
+        #         ignore_mask = tf.concat([ignore_mask, tf.expand_dims(ignore_mask_tmp, 0)], 0)
+        #     print(idx, ignore_mask)
+        #     return idx + 1, ignore_mask
+        # ignore_mask = None
+        # _, ignore_mask = tf.while_loop(cond=loop_cond, body=loop_body, loop_vars=[0, ignore_mask])
+        # ignore_mask = ignore_mask.stack()
+
+        iou = self.box_iou(pred_boxes, gt_box_i)  # [N, 13, 13, 3, 16]
+        best_iou = tf.reduce_max(iou, axis=-1)  # [N, 13, 13, 3]
+        ignore_mask = tf.cast(best_iou < 0.5, tf.float32)  # [N, 13, 13, 3]
+        # shape: [N, 13, 13, 3, 1]
+        ignore_mask = tf.expand_dims(ignore_mask, -1)
+        ignore_mask = tf.stop_gradient(ignore_mask)
+
+        # shape: [N, 13, 13, 3, 2]
+        pred_box_xy = pred_boxes[..., 0:2]
+        pred_box_wh = pred_boxes[..., 2:4]
+
+        # get xy coordinates in one cell from the feature_map
+        # numerical range: 0 ~ 1
+        # shape: [N, 13, 13, 3, 2]
+        print(y_true[..., 0:2], ratio[::-1], x_y_offset)
+        true_xy = y_true[..., 0:2] / ratio[::-1] - x_y_offset
+        pred_xy = pred_box_xy / ratio[::-1] - x_y_offset
+
+        # get_tw_th
+        # numerical range: 0 ~ 1
+        # shape: [N, 13, 13, 3, 2]
+        true_tw_th = y_true[..., 2:4] / anchors
+        pred_tw_th = pred_box_wh / anchors
+        # for numerical stability
+        true_tw_th = tf.where(condition=tf.equal(true_tw_th, 0),
+                              x=tf.ones_like(true_tw_th), y=true_tw_th)
+        pred_tw_th = tf.where(condition=tf.equal(pred_tw_th, 0),
+                              x=tf.ones_like(pred_tw_th), y=pred_tw_th)
+        true_tw_th = tf.log(tf.clip_by_value(true_tw_th, 1e-9, 1e9))
+        pred_tw_th = tf.log(tf.clip_by_value(pred_tw_th, 1e-9, 1e9))
+
+        # box size punishment: 
+        # box with smaller area has bigger weight. This is taken from the yolo darknet C source code.
+        # shape: [N, 13, 13, 3, 1]
+        box_loss_scale = 2. - (y_true[..., 2:3] / tf.cast(self.img_size[1], tf.float32)) * (
+                y_true[..., 3:4] / tf.cast(self.img_size[0], tf.float32))
+
+        ############
+        # loss_part
+        ############
+        # mix_up weight
+        # mix_w = y_true[..., self.class_num+5]
+        # [N, 13, 13, 3, 1]
+        # mix_w = y_true[..., -1:]
+        mix_w = y_true[..., 85:]
+        # mix_w = tf.expand_dims(mix_w, -1)
+        # shape: [N, 13, 13, 3, 1]
+        xy_loss = tf.reduce_sum(tf.square(true_xy - pred_xy) * object_mask * box_loss_scale * mix_w) / N
+        wh_loss = tf.reduce_sum(tf.square(true_tw_th - pred_tw_th) * object_mask * box_loss_scale * mix_w) / N
+
+        # shape: [N, 13, 13, 3, 1]
+        conf_pos_mask = object_mask
+        conf_neg_mask = (1 - object_mask) * ignore_mask
+        conf_loss_pos = conf_pos_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=object_mask,
+                                                                                logits=pred_conf_logits)
+        conf_loss_neg = conf_neg_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=object_mask,
+                                                                                logits=pred_conf_logits)
+        # TODO: may need to balance the pos-neg by multiplying some weights
+        conf_loss = conf_loss_pos + conf_loss_neg
+        if self.use_focal_loss:
+            alpha = 1.0
+            gamma = 2.0
+            # TODO: alpha should be a mask array if needed
+            focal_mask = alpha * tf.pow(tf.abs(object_mask - tf.sigmoid(pred_conf_logits)), gamma)
+            conf_loss *= focal_mask
+        conf_loss = tf.reduce_sum(conf_loss * mix_w) / N
+
+        # shape: [N, 13, 13, 3, 1]
+        # whether to use label smooth
+        if self.use_label_smooth:
+            delta = 0.01
+            label_target = (1 - delta) * y_true[..., 5:(5 + self.class_num)] + delta * 1. / self.class_num
+        else:
+            label_target = y_true[..., 5:(5 + self.class_num)]
+        class_loss = object_mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=label_target,
+                                                                           logits=pred_prob_logits) * mix_w
+        class_loss = tf.reduce_sum(class_loss) / N
+
+        return xy_loss, wh_loss, conf_loss, class_loss
+
+    def box_iou(self, pred_boxes, valid_true_boxes):
+        '''
+        param:
+            pred_boxes: [13, 13, 3, 4], (center_x, center_y, w, h)
+            valid_true: [1, 16, 4]
+        '''
+        # valid_true_boxes = tf.expand_dims(valid_true_boxes, -2)
+
+        # [13, 13, 3, 2]
+        pred_box_xy = pred_boxes[..., 0:2]
+        pred_box_wh = pred_boxes[..., 2:4]
+
+        # shape: [13, 13, 3, 1, 2]
+        pred_box_xy = tf.expand_dims(pred_box_xy, -2)
+        pred_box_wh = tf.expand_dims(pred_box_wh, -2)
+
+        print('##################pred_box_wh', pred_box_wh)
+
+        # [V, 2]
+        # N,H,W,A,C = valid_true_boxes.shape
+        # valid_true_boxes = tf.gather(valid_true_boxes, tf.where(object_mask))
+        # print(valid_true_boxes, object_mask)
+        # print(valid_true_boxes)
+        # input()
+        # valid_true_boxes = tf.reshape(valid_true_boxes, (self.batch_size, 1, 1, 3, -1, 4))
+
+        # x = tf.reshape(valid_true_boxes[..., 0], (self.batch_size, 3, -1))
+        # y = tf.reshape(valid_true_boxes[..., 1], (self.batch_size, 3, -1))
+        # w = tf.reshape(valid_true_boxes[..., 2], (self.batch_size, 3, -1))
+        # h = tf.reshape(valid_true_boxes[..., 3], (self.batch_size, 3, -1))
+        # valid_true_boxes =  tf.stack([x,y,w,h], axis=-1)
+        valid_true_boxes = tf.expand_dims(valid_true_boxes, 1)  # [1, 1, 16, 4]
+        valid_true_boxes = tf.expand_dims(valid_true_boxes, 1)  # [1, 1, 1, 16, 4]
+
+        print('##################valid_true_boxes', valid_true_boxes)
+
+        # valid_true_boxes = tf.tile(valid_true_boxes, [1,H,W,1,1])
+        # print(valid_true_boxes)
+        # input()
+
+        true_box_xy = valid_true_boxes[..., :2]  # [1, 1, 1, 16, 2]
+        true_box_wh = valid_true_boxes[..., 2:]  # [1, 1, 1, 16, 2]
+
+        print('##################true_box_wh', true_box_wh)
+
+        # [13, 13, 3, 1, 2] & [1, 1, 1, 16, 2] ==> [13, 13, 3, 16, 2]
+        intersect_mins = tf.maximum(pred_box_xy - pred_box_wh / 2.,
+                                    true_box_xy - true_box_wh / 2.)
+        intersect_maxs = tf.minimum(pred_box_xy + pred_box_wh / 2.,
+                                    true_box_xy + true_box_wh / 2.)
+        intersect_wh = tf.maximum(intersect_maxs - intersect_mins, 0.)
+
+        print('##################intersect_mins', intersect_mins)
+        print('##################intersect_wh', intersect_wh)
+
+        # shape: [13, 13, 3, 16]
+        intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
+        # shape: [13, 13, 3, 1]
+        pred_box_area = pred_box_wh[..., 0] * pred_box_wh[..., 1]
+        # shape: [1, 1, 1, 16]
+        true_box_area = true_box_wh[..., 0] * true_box_wh[..., 1]
+        # shape: [1, V]
+        # true_box_area = tf.expand_dims(true_box_area, -2)
+        print('##################intersect_area', intersect_area)
+        print('##################pred_box_area', pred_box_area)
+        print('##################true_box_area', true_box_area)
+        # [13, 13, 3, 16]
+        iou = intersect_area / (pred_box_area + true_box_area - intersect_area + 1e-10)
+        print('##################iou', iou)
+        # iou = tf.clip_by_value(iou, 0, 1)
+
+        # print(pred_box_xy, pred_box_wh)
+        # print(intersect_area , pred_box_area , true_box_area , intersect_area)
+        # print(iou)
+        # input()
+
+        return iou
+
+    def compute_loss(self, y_pred, y_true, gt_box):
+        '''
+        param:
+            y_pred: returned feature_map list by `forward` function: [feature_map_1, feature_map_2, feature_map_3]
+            y_true: input y_true by the tf.data pipeline
+        '''
+        loss_xy, loss_wh, loss_conf, loss_class = 0., 0., 0., 0.
+        anchor_group = [self.anchors[6:9], self.anchors[3:6], self.anchors[0:3]]
+
+        # calc loss in 3 scales
+        for i in range(len(y_pred)):
+            print('##################level', i)
+
+            result = self.loss_layer(y_pred[i], y_true[i], anchor_group[i], self.featrue_map_shape[i], gt_box[i])
+            loss_xy += result[0]
+            loss_wh += result[1]
+            loss_conf += result[2]
+            loss_class += result[3]
+        total_loss = loss_xy + loss_wh + loss_conf + loss_class
+        return [total_loss, loss_xy, loss_wh, loss_conf, loss_class]
@@ -0,0 +1,58 @@
+#!/bin/bash
+scriptDir=$(cd "$(dirname "$0")"; pwd)
+currentDir=$(cd "$(dirname "$scriptDir")"; pwd)
+
+# set env
+source ${currentDir}/config/npu_set_env.sh
+
+# setting main path
+CODE_PATH=currentDir/code
+
+# set env
+export ASCEND_HOME=/usr/local/Ascend
+export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/
+export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/te:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/topi:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/hccl:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$currentDir
+export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin
+export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
+
+export DDK_VERSION_FLAG=1.60.T49.0.B201
+export NEW_GE_FE_ID=1
+export GE_AICPU_FLAG=1
+export SOC_VERSION=Ascend910
+#export DUMP_GE_GRAPH=2
+#export DUMP_GRAPH_LEVEL=3
+#export PRINT_MODEL=1
+export SLOG_PRINT_TO_STDOUT=0
+
+# dump op data
+#export DISABLE_REUSE_MEMORY=1
+#export DUMP_OP=1
+
+ulimit -c unlimited
+
+# local variable
+RANK_SIZE=$1
+RANK_TABLE_FILE=./hccl_config/${RANK_SIZE}p.json
+RANK_ID_START=0
+SAVE_PATH=training/t1
+
+# training stage
+MODE=$2
+
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+echo
+su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device "$RANK_ID
+TMP_PATH=$SAVE_PATH/D$RANK_ID
+mkdir -p $TMP_PATH
+cp run_yolov3.sh $TMP_PATH/
+cp $RANK_TABLE_FILE $TMP_PATH/rank_table.json
+cd $TMP_PATH
+nohup bash run_yolov3.sh $RANK_ID $RANK_SIZE $CODE_PATH $MODE > train_$RANK_ID.log &
+cd -
+
+done
+
+
+
+
@@ -0,0 +1 @@
+nohup bash npu_train.sh 1 multi &
@@ -0,0 +1 @@
+nohup bash npu_train.sh 1 single &
@@ -0,0 +1 @@
+nohup bash npu_train.sh 8 multi &
@@ -0,0 +1 @@
+nohup bash npu_train.sh 8 single &
@@ -0,0 +1,50 @@
+
+# Launch one background training process per device with graph dumping and
+# debug-level device logging enabled.
+# usage: bash <this script> <rank_size>
+#clean slog
+rm -rf /var/log/npu/slog/host-0/*.log
+rm -rf /var/log/npu/slog/device-*/*.log
+
+# setting main path
+# (absolute directory that contains this script)
+MAIN_PATH=$(dirname $(readlink -f $0))
+
+# set env
+export PYTHONPATH=/usr/local/Ascend/ops/op_impl/built-in/ai_core/tbe/:$MAIN_PATH/../../../
+export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/x86_64-linux-gnu
+PATH=$PATH:$HOME/bin
+# NOTE(review): $PATH is appended a second time at the end of this value
+export PATH=$PATH:/usr/local/Ascend/fwkacllib/ccec_compiler/bin:$PATH
+export ASCEND_OPP_PATH=/usr/local/Ascend/opp
+export DDK_VERSION_FLAG=1.60.T49.0.B201
+export NEW_GE_FE_ID=1
+export GE_AICPU_FLAG=1
+export SOC_VERSION=Ascend910
+export DUMP_GE_GRAPH=1
+export DUMP_GRAPH_LEVEL=1
+export PRINT_MODEL=1
+#export SLOG_PRINT_TO_STDOUT=1
+
+ulimit -c unlimited
+
+# local variable
+RANK_SIZE=$1
+RANK_TABLE_FILE=./configs/${RANK_SIZE}p.json
+# NOTE(review): rank ids start at 1 here, so the loop covers 1..RANK_SIZE
+# and never 0 - confirm this is intended
+RANK_ID_START=1
+SAVE_PATH=training/t1
+
+# one working directory ($SAVE_PATH/D<rank>) and one background process per rank
+for((RANK_ID=$RANK_ID_START;RANK_ID<$((RANK_SIZE+RANK_ID_START));RANK_ID++));
+do
+
+echo
+su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[debug]\" --device "$RANK_ID
+
+TMP_PATH=$SAVE_PATH/D$RANK_ID
+mkdir -p $TMP_PATH
+cp run_yolov3.sh $TMP_PATH/
+cp $RANK_TABLE_FILE $TMP_PATH/rank_table.json
+cd $TMP_PATH
+# only three arguments are passed (rank id, rank size, code path)
+nohup bash run_yolov3.sh $RANK_ID $RANK_SIZE $MAIN_PATH > train_$RANK_ID.log &
+cd -
+
+done
+
+
+
+
@@ -0,0 +1,29 @@
+#!/bin/bash
+rm -rf Onnxgraph
+rm -rf Partition
+rm -rf OptimizeSubGraph
+rm -rf Aicpu_Optimized
+rm *txt
+rm -rf result_$RANK_ID
+
+
+
+export RANK_ID=$1
+export RANK_SIZE=$2
+export DEVICE_ID=$RANK_ID
+export DEVICE_INDEX=$RANK_ID
+export RANK_TABLE_FILE=rank_table.json
+export JOB_ID=123678
+export FUSION_TENSOR_SIZE=1000000000
+
+KERNEL_NUM=20
+PID_START=$((KERNEL_NUM * RANK_ID))
+PID_END=$((PID_START + KERNEL_NUM - 1))
+
+#sleep 5
+taskset -c  $PID_START-$PID_END python3 $3/train.py \
+--mode $4
+
+mkdir graph
+mv *.txt graph
+mv *.pbtxt graph
@@ -0,0 +1,57 @@
+
+#export CUDA_VISIBLE_DEVICES=''
+#export CUDA_VISIBLE_DEVICES=7
+
+
+
+# setting main path
+MAIN_PATH=$(dirname $(readlink -f $0))
+
+# set env
+export PYTHONPATH=/usr/local/Ascend/ops/op_impl/built-in/ai_core/tbe/:$MAIN_PATH/../../../
+export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/x86_64-linux-gnu
+PATH=$PATH:$HOME/bin
+export PATH=$PATH:/usr/local/Ascend/fwkacllib/ccec_compiler/bin:$PATH
+export ASCEND_OPP_PATH=/usr/local/Ascend/opp
+export DDK_VERSION_FLAG=1.60.T49.0.B201
+export NEW_GE_FE_ID=1
+export GE_AICPU_FLAG=1
+export SOC_VERSION=Ascend910
+export RANK_ID=7
+export RANK_SIZE=1
+export DEVICE_ID=$RANK_ID
+export DEVICE_INDEX=$RANK_ID
+export JOB_ID=10087
+export FUSION_TENSOR_SIZE=1000000000
+#export SLOG_PRINT_TO_STDOUT=1
+#export DUMP_GE_GRAPH=2
+#export DUMP_GRAPH_LEVEL=3
+
+su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[debug]\" --device "$RANK_ID
+
+#RESTORE_PATH=/opt/npu/wujianping/epoch200/
+RESTORE_PATH=/opt/npu/w00558981/yolov3_ok_bak_zip/training/t1/D0/training/
+#RESTORE_PATH=/opt/npu/w00558981/training_done_yolov3/training/t1/D0/training/model-epoch_200_step_182000_loss_20.7852_lr_0
+
+while :
+do
+
+#python3.7 eval.py \
+#--save_img True \
+#--score_thresh 0.2 \
+#--restore_path $RESTORE_PATH \
+#--max_test 10 \
+
+
+python3.7 eval.py \
+--save_json True \
+--score_thresh 0.001 \
+--restore_path $RESTORE_PATH \
+--max_test 10000
+
+break
+sleep 1200
+
+done
+
+
@@ -0,0 +1,86 @@
+# coding: utf-8
+
+from __future__ import division, print_function
+
+import tensorflow as tf
+import numpy as np
+import argparse
+import cv2
+
+from utils.misc_utils import parse_anchors, read_class_names
+from utils.nms_utils import gpu_nms
+from utils.plot_utils import get_color_table, plot_one_box
+from utils.data_aug import letterbox_resize
+
+from model import yolov3
+
+parser = argparse.ArgumentParser(description="YOLO-V3 test single image test procedure.")
+parser.add_argument("input_image", type=str,
+                    help="The path of the input image.")
+parser.add_argument("--anchor_path", type=str, default="./data/yolo_anchors.txt",
+                    help="The path of the anchor txt file.")
+parser.add_argument("--new_size", nargs='*', type=int, default=[416, 416],
+                    help="Resize the input image with `new_size`, size format: [width, height]")
+parser.add_argument("--letterbox_resize", type=lambda x: (str(x).lower() == 'true'), default=True,
+                    help="Whether to use the letterbox resize.")
+parser.add_argument("--class_name_path", type=str, default="./data/coco.names",
+                    help="The path of the class names.")
+parser.add_argument("--restore_path", type=str, default="./data/darknet_weights/yolov3.ckpt",
+                    help="The path of the weights to restore.")
+args = parser.parse_args()
+
+args.anchors = parse_anchors(args.anchor_path)
+args.classes = read_class_names(args.class_name_path)
+args.num_class = len(args.classes)
+
+color_table = get_color_table(args.num_class)
+
+img_ori = cv2.imread(args.input_image)
+if args.letterbox_resize:
+    img, resize_ratio, dw, dh = letterbox_resize(img_ori, args.new_size[0], args.new_size[1])
+else:
+    height_ori, width_ori = img_ori.shape[:2]
+    img = cv2.resize(img_ori, tuple(args.new_size))
+img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+img = np.asarray(img, np.float32)
+img = img[np.newaxis, :] / 255.
+
+with tf.Session() as sess:
+    input_data = tf.placeholder(tf.float32, [1, args.new_size[1], args.new_size[0], 3], name='input_data')
+    yolo_model = yolov3(args.num_class, args.anchors)
+    with tf.variable_scope('yolov3'):
+        pred_feature_maps = yolo_model.forward(input_data, False)
+    pred_boxes, pred_confs, pred_probs = yolo_model.predict(pred_feature_maps)
+
+    pred_scores = pred_confs * pred_probs
+
+    boxes, scores, labels = gpu_nms(pred_boxes, pred_scores, args.num_class, max_boxes=200, score_thresh=0.3, nms_thresh=0.45)
+
+    saver = tf.train.Saver()
+    saver.restore(sess, args.restore_path)
+
+    boxes_, scores_, labels_ = sess.run([boxes, scores, labels], feed_dict={input_data: img})
+
+    # rescale the coordinates to the original image
+    if args.letterbox_resize:
+        boxes_[:, [0, 2]] = (boxes_[:, [0, 2]] - dw) / resize_ratio
+        boxes_[:, [1, 3]] = (boxes_[:, [1, 3]] - dh) / resize_ratio
+    else:
+        boxes_[:, [0, 2]] *= (width_ori/float(args.new_size[0]))
+        boxes_[:, [1, 3]] *= (height_ori/float(args.new_size[1]))
+
+    print("box coords:")
+    print(boxes_)
+    print('*' * 30)
+    print("scores:")
+    print(scores_)
+    print('*' * 30)
+    print("labels:")
+    print(labels_)
+
+    for i in range(len(boxes_)):
+        x0, y0, x1, y1 = boxes_[i]
+        plot_one_box(img_ori, [x0, y0, x1, y1], label=args.classes[labels_[i]] + ', {:.2f}%'.format(scores_[i] * 100), color=color_table[labels_[i]])
+    cv2.imshow('Detection result', img_ori)
+    cv2.imwrite('detection_result.jpg', img_ori)
+    cv2.waitKey(0)
@@ -0,0 +1,287 @@
+# coding: utf-8
+
+from __future__ import division, print_function
+
+import tensorflow as tf
+import numpy as np
+import logging
+from tqdm import trange
+import random
+import time
+import datetime
+from utils.data_utils import get_batch_data, color_jitter
+from utils.misc_utils import shuffle_and_overwrite, make_summary, config_learning_rate, config_optimizer, AverageMeter
+from utils.eval_utils import evaluate_on_cpu, evaluate_on_gpu, get_preds_gpu, voc_eval, parse_gt_rec
+from model import yolov3
+import time
+import os
+import sys
+# npu modified
+from npu_bridge.estimator import npu_ops
+from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
+from npu_bridge.estimator.npu.npu_loss_scale_optimizer import NPULossScaleOptimizer
+from npu_bridge.estimator.npu.npu_loss_scale_manager import FixedLossScaleManager
+from npu_bridge.estimator.npu.npu_loss_scale_manager import ExponentialUpdateLossScaleManager
+from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig
+from npu_bridge.estimator.npu import util
+
+sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)),'../../../../../'))
+sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)),'../../../../utils/atlasboost'))
+from benchmark_log import hwlog
+from benchmark_log.basic_utils import get_environment_info
+from benchmark_log.basic_utils import get_model_parameter
+import argparse
+
+hwlog.ROOT_DIR = os.path.split(os.path.abspath(__file__))[0]
+cpu_info, npu_info, framework_info, os_info, benchmark_version = get_environment_info("tensorflow")
+config_info = get_model_parameter("tensorflow_config")
+initinal_data={"base_lr": 0.128, "dataset": "coco1024", "optimizer": "Adam", "loss_scale": 512, "batchsize": 32}
+
+hwlog.remark_print(key=hwlog.CPU_INFO, value=cpu_info)
+hwlog.remark_print(key=hwlog.NPU_INFO, value=npu_info)
+hwlog.remark_print(key=hwlog.OS_INFO, value=os_info)
+hwlog.remark_print(key=hwlog.FRAMEWORK_INFO, value=framework_info)
+hwlog.remark_print(key=hwlog.BENCHMARK_VERSION, value=benchmark_version)
+hwlog.remark_print(key=hwlog.CONFIG_INFO, value=config_info)
+hwlog.remark_print(key=hwlog.BASE_LR, value=initinal_data.get("base_lr"))
+hwlog.remark_print(key=hwlog.DATASET, value=initinal_data.get("dataset"))
+hwlog.remark_print(key=hwlog.OPT_NAME, value=initinal_data.get("optimizer"))
+hwlog.remark_print(key=hwlog.LOSS_SCALE, value=initinal_data.get("loss_scale"))
+hwlog.remark_print(key=hwlog.INPUT_BATCH_SIZE, value=initinal_data.get("batchsize"))
+
+parser = argparse.ArgumentParser(description="YOLO-V3 training setting.")
+parser.add_argument("--mode", type=str, default='single',
+                    help="setting train mode of training.")
+parser.add_argument("--resume", type=bool, default=False,
+                    help="setting if train from resume.")
+args_input = parser.parse_args()
+
+if args_input.mode == 'single':
+    import args_single as args
+elif args_input.mode == 'multi':
+    import args_multi as args
+print('setting train mode %s.' %args_input.mode)
+
+# setting loggers
+logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s',
+                    datefmt='%a, %d %b %Y %H:%M:%S', filename=args.progress_log_path, filemode='w')
+
+
+##################
+# tf.data pipeline
+##################
+# Training input pipeline: read annotation lines, shuffle, repeat, batch, then
+# decode every batch with `get_batch_data` through tf.py_func.
+train_dataset = tf.data.TextLineDataset(args.train_file)
+print('##########################args_input.rank_id', os.environ['RANK_ID'])
+logging.info('shuffle seed_%s args.', os.environ['RANK_ID'])
+
+# the shuffle seed is the RANK_ID environment variable, so each rank uses its
+# own seed; the script raises KeyError here when RANK_ID is not set
+train_dataset = train_dataset.shuffle(args.train_img_cnt, seed=int(os.environ['RANK_ID']),
+                                      reshuffle_each_iteration=True)
+print('##########################args.train_img_cnt', args.train_img_cnt)
+
+train_dataset = train_dataset.repeat()
+# drop_remainder=True keeps the batch dimension fixed
+train_dataset = train_dataset.batch(args.batch_size, drop_remainder=True)  # npu modified
+# Tout lists seven float32 outputs; `valid_shape` unpacks them as the image,
+# three y_true tensors and three gt_box tensors
+train_dataset = train_dataset.map(
+    lambda x: tf.py_func(get_batch_data,
+                         inp=[x, args.class_num, args.img_size, args.anchors, 'train', args.multi_scale_train,
+                              args.use_mix_up, args.letterbox_resize],
+                         Tout=[tf.float32,
+                               tf.float32, tf.float32, tf.float32,
+                               tf.float32, tf.float32, tf.float32]),
+    num_parallel_calls=20
+)
+
+
+def valid_shape(*x):
+    image, y_true_13, y_true_26, y_true_52, gt_box_13, gt_box_26, gt_box_52 = x
+    y_true = [y_true_13, y_true_26, y_true_52]
+    gt_box = [gt_box_13, gt_box_26, gt_box_52]
+
+    # npu modified
+    if args_input.mode == 'single':
+        image.set_shape([args.batch_size, args.img_size[0], args.img_size[1], 3])
+        y_true[0].set_shape([args.batch_size, 13, 13, 3, 86])
+        y_true[1].set_shape([args.batch_size, 26, 26, 3, 86])
+        y_true[2].set_shape([args.batch_size, 52, 52, 3, 86])
+    elif args_input.mode == 'multi':
+        image.set_shape([args.batch_size, args.img_size[0], args.img_size[1], 3])
+        y_true[0].set_shape([args.batch_size, 19*1, 19*1, 3, 86])
+        y_true[1].set_shape([args.batch_size, 19*2, 19*2, 3, 86])
+        y_true[2].set_shape([args.batch_size, 19*4, 19*4, 3, 86])
+
+    gt_box[0].set_shape([args.batch_size, 1, 32, 4])
+    gt_box[1].set_shape([args.batch_size, 1, 64, 4])
+    gt_box[2].set_shape([args.batch_size, 1, 128, 4])
+
+    image = color_jitter(
+        image, brightness=0.125, contrast=0.5, saturation=0.5, hue=0.05)
+
+    return image, y_true_13, y_true_26, y_true_52, gt_box_13, gt_box_26, gt_box_52
+
+
+# fix the static shapes and apply color jitter, then prefetch batches
+train_dataset = train_dataset.map(valid_shape, num_parallel_calls=20)
+train_dataset = train_dataset.prefetch(args.prefetech_buffer)
+# iterator built from the dataset structure; `train_init_op` has to be run
+# before the first batch is fetched
+iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)
+train_init_op = iterator.make_initializer(train_dataset)
+# get an element from the chosen dataset iterator
+image, y_true_13, y_true_26, y_true_52, gt_box_13, gt_box_26, gt_box_52 = iterator.get_next()
+# grouped per scale, in the order `compute_loss` indexes them
+y_true = [y_true_13, y_true_26, y_true_52]
+gt_box = [gt_box_13, gt_box_26, gt_box_52]
+
+
+##################
+# Model definition
+##################
+# build the network under the 'yolov3' variable scope and its training loss
+yolo_model = yolov3(args.class_num, args.anchors, args.use_label_smooth, args.use_focal_loss, args.batch_norm_decay,
+                    args.weight_decay, use_static_shape=False,
+                    batch_size=args.batch_size, img_size=args.img_size)
+
+with tf.variable_scope('yolov3'):
+    pred_feature_maps = yolo_model.forward(image, is_training=True)
+# loss is [total_loss, loss_xy, loss_wh, loss_conf, loss_class]
+loss = yolo_model.compute_loss(pred_feature_maps, y_true, gt_box)
+l2_loss = tf.losses.get_regularization_loss()
+
+# setting restore parts and vars to update
+# (restore set: include/exclude patterns; update set: variables that get gradients)
+saver_to_restore = tf.train.Saver(
+    var_list=tf.contrib.framework.get_variables_to_restore(include=args.restore_include, exclude=args.restore_exclude))
+update_vars = tf.contrib.framework.get_variables_to_restore(include=args.update_part)
+
+# scalar summaries, collected later by tf.summary.merge_all()
+tf.summary.scalar('train_batch_statistics/total_loss', loss[0])
+tf.summary.scalar('train_batch_statistics/loss_xy', loss[1])
+tf.summary.scalar('train_batch_statistics/loss_wh', loss[2])
+tf.summary.scalar('train_batch_statistics/loss_conf', loss[3])
+tf.summary.scalar('train_batch_statistics/loss_class', loss[4])
+tf.summary.scalar('train_batch_statistics/loss_l2', l2_loss)
+tf.summary.scalar('train_batch_statistics/loss_ratio', l2_loss / loss[0])
+
+def learning_rate_fn(global_step):
+    """Builds scaled learning rate function with 0.08 epoch warm up."""
+    initial_learning_rate = args.learning_rate_init
+    batches_per_epoch = args.train_batch_num // args.iterations_per_loop * args.iterations_per_loop
+    total_steps = int(args.total_epoches * batches_per_epoch)
+    warmup_steps = int(batches_per_epoch * args.warm_up_epoch)
+    tf.compat.v1.logging.info('total_steps: %d', int(total_steps))
+    tf.compat.v1.logging.info('warmup_steps: %d', int(warmup_steps))
+    lr = tf.maximum(
+        tf.compat.v1.train.cosine_decay(
+            learning_rate=initial_learning_rate,
+            global_step=global_step - warmup_steps,
+            decay_steps=total_steps - warmup_steps,
+        ),
+        0,
+    )
+    warmup_lr = (
+            initial_learning_rate * tf.cast(global_step, tf.float32) / tf.cast(
+        warmup_steps, tf.float32))
+    return tf.cond(pred=global_step < warmup_steps,
+                   true_fn=lambda: warmup_lr,
+                   false_fn=lambda: lr)
+
+
+# learning rate schedule driven by the global step
+global_step = tf.train.get_or_create_global_step()
+learning_rate = learning_rate_fn(global_step)
+tf.summary.scalar('learning_rate', learning_rate)
+
+# Savers created here, before the optimizer is built, presumably leave the
+# optimizer's own variables out of the checkpoints (tf.train.Saver() collects
+# the variables that exist when it is constructed) - confirm.
+if not args.save_optimizer:
+    saver_to_save = tf.train.Saver()
+    saver_best = tf.train.Saver()
+
+# wrap the base optimizer for NPU distributed training and fixed loss scaling
+optimizer = config_optimizer(args.optimizer_name, learning_rate)
+optimizer = NPUDistributedOptimizer(optimizer)
+loss_scale_manager = FixedLossScaleManager(loss_scale=128)
+if args.num_gpus > 1:
+    optimizer = NPULossScaleOptimizer(optimizer, loss_scale_manager, is_distributed=True)
+else:
+    optimizer = NPULossScaleOptimizer(optimizer, loss_scale_manager, is_distributed=False)
+
+# set dependencies for BN ops
+update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
+with tf.control_dependencies(update_ops):
+    # apply gradient clip to avoid gradient exploding
+    gvs = optimizer.compute_gradients(loss[0] + l2_loss, var_list=update_vars)
+    # pairs without a gradient are passed through unchanged; the others are
+    # clipped to a norm of 100
+    clip_grad_var = [gv if gv[0] is None else [
+        tf.clip_by_norm(gv[0], 100.), gv[1]] for gv in gvs]
+    train_op = optimizer.apply_gradients(clip_grad_var, global_step=tf.train.get_global_step())
+
+# savers created after the optimizer exists, so its variables can be saved too
+if args.save_optimizer:
+    print(
+        'Saving optimizer parameters to checkpoint! Remember to restore the global_step in the fine-tuning afterwards.')
+    saver_to_save = tf.train.Saver()
+    saver_best = tf.train.Saver()
+
+# npu modified
+# session config that registers the NpuOptimizer graph rewrite
+config = tf.ConfigProto()
+custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
+custom_op.name = "NpuOptimizer"
+custom_op.parameter_map["use_off_line"].b = True  # training on Ascend chips
+custom_op.parameter_map["enable_data_pre_proc"].b = True
+# same value that is later passed to util.set_iteration_per_loop
+custom_op.parameter_map["iterations_per_loop"].i = args.iterations_per_loop
+# switch off the remapping rewrite pass
+config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
+
+# Training session: initialize, restore weights, then train for
+# args.total_epoches epochs with periodic checkpoints and a final save.
+with tf.Session(config=config) as sess:
+    # start of yolov3 fine-tuning training (darknet53.ckpt)
+    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
+
+    # resume from a checkpoint when requested
+    if args_input.resume:
+        # restore with a default Saver from the latest checkpoint in save_dir
+        saver_to_restore = tf.train.Saver()
+        saver_to_restore.restore(sess, tf.train.latest_checkpoint(args.save_dir))
+    else:
+        saver_to_restore.restore(sess, args.restore_path)
+
+    merged = tf.summary.merge_all()
+    writer = tf.summary.FileWriter(args.log_dir, sess.graph)
+
+    print('\n----------- start to train -----------\n')
+     
+    #hwlog.logger.info("time_ts:%s, hardware:%s current os:%s" %(date_time,'Ascend910','Ubuntu 18.04'))
+    #hwlog.logger.info("time_ts:%s, framework is tensorflow 1.15.0 " %(date_time))
+    #remark_logger.info("ABK time_ts: %s, yolov3 %s model train begain, total train_epoches:%d, file: %s, lineno: %s" %(date_time,args_input.mode,args.total_epoches,file_name,sys._getframe().f_lineno))
+    hwlog.remark_print(key=hwlog.TOTAL_TRAIN_EPOCH, value=f"{args.total_epoches}")
+    # NOTE(review): best_mAP is assigned but not used in the visible code
+    best_mAP = -np.Inf
+    # train_op is replaced by the op returned from util.set_iteration_per_loop
+    train_op = util.set_iteration_per_loop(sess, train_op, args.iterations_per_loop)
+    sess.run(train_init_op)
+    for epoch in range(args.total_epoches):
+        # running averages, reset at the start of every epoch
+        loss_total, loss_xy, loss_wh, loss_conf, loss_class = AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter(), AverageMeter()
+        # the loop count is train_batch_num divided by iterations_per_loop
+        for i in trange(args.train_batch_num // args.iterations_per_loop):
+            t = time.time()
+            _, summary, __y_true, __loss, __global_step, __lr = sess.run(
+                [train_op, merged, y_true, loss, global_step, learning_rate]
+            )
+            # images per second: iterations * devices * batch size over the elapsed time
+            fps = 1 / (time.time() - t) * args.iterations_per_loop * args.num_gpus * args.batch_size
+
+            writer.add_summary(summary, global_step=__global_step)
+
+            # len(__y_true[0]) is the first-dimension length of the first y_true array
+            loss_total.update(__loss[0], len(__y_true[0]))
+            loss_xy.update(__loss[1], len(__y_true[0]))
+            loss_wh.update(__loss[2], len(__y_true[0]))
+            loss_conf.update(__loss[3], len(__y_true[0]))
+            loss_class.update(__loss[4], len(__y_true[0]))
+
+            info = "Epoch: {}, global_step: {} fps: {:.2f} lr: {:.5f} | loss: total: {:.2f}, xy: {:.2f}, wh: {:.2f}, conf: {:.2f}, class: {:.2f} | ".format(
+                epoch, int(__global_step), fps, __lr, loss_total.average, loss_xy.average, loss_wh.average,
+                loss_conf.average,
+                loss_class.average)
+            print(info)
+            logging.info(info)
+            #remark_logger.info("ABK time_ts:%s, global_steps %d, learning rate %2f, file: %s, lineno: %s" %(date_time,int(__global_step),__lr,file_name,sys._getframe().f_lineno))
+            #remark_logger.info("ABK time_ts:%s, fps %2f, loss_total %2f, file: %s, lineno: %s" %(date_time,fps,loss_total.average,file_name,sys._getframe().f_lineno))
+
+            hwlog.remark_print(key=hwlog.FPS, value=f"{fps}")
+            hwlog.remark_print(key=hwlog.GLOBAL_STEP, value=f"{int(__global_step)}")
+
+        # NOTE: this is just demo. You can set the conditions when to save the weights.
+        # NOTE(review): `epoch > 0` means nothing is saved after the first
+        # epoch even when save_epoch == 1 - confirm this is intended
+        temp_epoch = epoch + 1
+        if temp_epoch % args.save_epoch == 0 and epoch > 0:
+            saver_to_save.save(sess, args.save_dir + 'model-epoch_{}_step_{}_loss_{:.4f}_lr_{:.5g}'.format( \
+                temp_epoch,
+                int(__global_step),
+                loss_total.average,
+                __lr))
+
+        # stop early once the learning rate has decayed to zero
+        if __lr <= 0:
+            break
+
+    # final checkpoint; __global_step, loss_total and __lr only exist when at
+    # least one training step has run
+    saver_to_save.save(sess, args.save_dir + 'model-final_step_{}_loss_{:.4f}_lr_{:.5g}'.format( \
+        int(__global_step),
+        loss_total.average,
+        __lr))
@@ -0,0 +1,109 @@
+{
+    "board_id": "0x002f",
+    "chip_info": "910",
+    "deploy_mode": "lab",
+    "group_count": "1",
+    "group_list": [
+        {
+            "device_num": "8",
+            "server_num": "1",
+            "group_name": "",
+            "instance_count": "8",
+            "instance_list": [
+                {
+                    "devices": [
+                        {
+                            "device_id": "0",
+                            "device_ip": "192.168.100.101"
+                        }
+                    ],
+                    "rank_id": "0",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "1",
+                            "device_ip": "192.168.101.101"
+                        }
+                    ],
+                    "rank_id": "1",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "2",
+                            "device_ip": "192.168.102.101"
+                        }
+                    ],
+                    "rank_id": "2",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "3",
+                            "device_ip": "192.168.103.101"
+                        }
+                    ],
+                    "rank_id": "3",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "4",
+                            "device_ip": "192.168.100.100"
+                        }
+                    ],
+                    "rank_id": "4",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "5",
+                            "device_ip": "192.168.101.100"
+                        }
+                    ],
+                    "rank_id": "5",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "6",
+                            "device_ip": "192.168.102.100"
+                        }
+                    ],
+                    "rank_id": "6",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "7",
+                            "device_ip": "192.168.103.100"
+                        }
+                    ],
+                    "rank_id": "7",
+                    "server_id": "0.0.0.0"
+                }
+            ]
+        }
+    ],
+    "para_plane_nic_location": "device",
+    "para_plane_nic_name": [
+        "eth0",
+        "eth1",
+        "eth2",
+        "eth3",
+        "eth4",
+        "eth5",
+        "eth6",
+        "eth7"
+    ],
+    "para_plane_nic_num": "8",
+    "status": "completed"
+}
@@ -0,0 +1,29 @@
+#!/bin/bash
+rm -rf Onnxgraph
+rm -rf Partition
+rm -rf OptimizeSubGraph
+rm -rf Aicpu_Optimized
+rm *txt
+rm -rf result_$RANK_ID
+
+
+
+export RANK_ID=$1
+export RANK_SIZE=$2
+export DEVICE_ID=$RANK_ID
+export DEVICE_INDEX=$RANK_ID
+export RANK_TABLE_FILE=rank_table.json
+export JOB_ID=123678
+export FUSION_TENSOR_SIZE=1000000000
+
+KERNEL_NUM=20
+PID_START=$((KERNEL_NUM * RANK_ID))
+PID_END=$((PID_START + KERNEL_NUM - 1))
+
+#sleep 5
+taskset -c  $PID_START-$PID_END python3 $3/train.py \
+--mode $4
+
+mkdir graph
+mv *.txt graph
+mv *.pbtxt graph
@@ -0,0 +1,109 @@
+{
+    "board_id": "0x002f",
+    "chip_info": "910",
+    "deploy_mode": "lab",
+    "group_count": "1",
+    "group_list": [
+        {
+            "device_num": "8",
+            "server_num": "1",
+            "group_name": "",
+            "instance_count": "8",
+            "instance_list": [
+                {
+                    "devices": [
+                        {
+                            "device_id": "0",
+                            "device_ip": "192.168.100.101"
+                        }
+                    ],
+                    "rank_id": "0",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "1",
+                            "device_ip": "192.168.101.101"
+                        }
+                    ],
+                    "rank_id": "1",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "2",
+                            "device_ip": "192.168.102.101"
+                        }
+                    ],
+                    "rank_id": "2",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "3",
+                            "device_ip": "192.168.103.101"
+                        }
+                    ],
+                    "rank_id": "3",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "4",
+                            "device_ip": "192.168.100.100"
+                        }
+                    ],
+                    "rank_id": "4",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "5",
+                            "device_ip": "192.168.101.100"
+                        }
+                    ],
+                    "rank_id": "5",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "6",
+                            "device_ip": "192.168.102.100"
+                        }
+                    ],
+                    "rank_id": "6",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "7",
+                            "device_ip": "192.168.103.100"
+                        }
+                    ],
+                    "rank_id": "7",
+                    "server_id": "0.0.0.0"
+                }
+            ]
+        }
+    ],
+    "para_plane_nic_location": "device",
+    "para_plane_nic_name": [
+        "eth0",
+        "eth1",
+        "eth2",
+        "eth3",
+        "eth4",
+        "eth5",
+        "eth6",
+        "eth7"
+    ],
+    "para_plane_nic_num": "8",
+    "status": "completed"
+}
@@ -0,0 +1,29 @@
+#!/bin/bash
+rm -rf Onnxgraph
+rm -rf Partition
+rm -rf OptimizeSubGraph
+rm -rf Aicpu_Optimized
+rm *txt
+rm -rf result_$RANK_ID
+
+
+
+export RANK_ID=$1
+export RANK_SIZE=$2
+export DEVICE_ID=$RANK_ID
+export DEVICE_INDEX=$RANK_ID
+export RANK_TABLE_FILE=rank_table.json
+export JOB_ID=123678
+export FUSION_TENSOR_SIZE=1000000000
+
+KERNEL_NUM=20
+PID_START=$((KERNEL_NUM * RANK_ID))
+PID_END=$((PID_START + KERNEL_NUM - 1))
+
+#sleep 5
+taskset -c  $PID_START-$PID_END python3 $3/train.py \
+--mode $4
+
+mkdir graph
+mv *.txt graph
+mv *.pbtxt graph
@@ -0,0 +1,109 @@
+{
+    "board_id": "0x002f",
+    "chip_info": "910",
+    "deploy_mode": "lab",
+    "group_count": "1",
+    "group_list": [
+        {
+            "device_num": "8",
+            "server_num": "1",
+            "group_name": "",
+            "instance_count": "8",
+            "instance_list": [
+                {
+                    "devices": [
+                        {
+                            "device_id": "0",
+                            "device_ip": "192.168.100.101"
+                        }
+                    ],
+                    "rank_id": "0",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "1",
+                            "device_ip": "192.168.101.101"
+                        }
+                    ],
+                    "rank_id": "1",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "2",
+                            "device_ip": "192.168.102.101"
+                        }
+                    ],
+                    "rank_id": "2",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "3",
+                            "device_ip": "192.168.103.101"
+                        }
+                    ],
+                    "rank_id": "3",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "4",
+                            "device_ip": "192.168.100.100"
+                        }
+                    ],
+                    "rank_id": "4",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "5",
+                            "device_ip": "192.168.101.100"
+                        }
+                    ],
+                    "rank_id": "5",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "6",
+                            "device_ip": "192.168.102.100"
+                        }
+                    ],
+                    "rank_id": "6",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "7",
+                            "device_ip": "192.168.103.100"
+                        }
+                    ],
+                    "rank_id": "7",
+                    "server_id": "0.0.0.0"
+                }
+            ]
+        }
+    ],
+    "para_plane_nic_location": "device",
+    "para_plane_nic_name": [
+        "eth0",
+        "eth1",
+        "eth2",
+        "eth3",
+        "eth4",
+        "eth5",
+        "eth6",
+        "eth7"
+    ],
+    "para_plane_nic_num": "8",
+    "status": "completed"
+}
@@ -0,0 +1,29 @@
+#!/bin/bash
+rm -rf Onnxgraph
+rm -rf Partition
+rm -rf OptimizeSubGraph
+rm -rf Aicpu_Optimized
+rm *txt
+rm -rf result_$RANK_ID
+
+
+
+export RANK_ID=$1
+export RANK_SIZE=$2
+export DEVICE_ID=$RANK_ID
+export DEVICE_INDEX=$RANK_ID
+export RANK_TABLE_FILE=rank_table.json
+export JOB_ID=123678
+export FUSION_TENSOR_SIZE=1000000000
+
+KERNEL_NUM=20
+PID_START=$((KERNEL_NUM * RANK_ID))
+PID_END=$((PID_START + KERNEL_NUM - 1))
+
+#sleep 5
+taskset -c  $PID_START-$PID_END python3 $3/train.py \
+--mode $4
+
+mkdir graph
+mv *.txt graph
+mv *.pbtxt graph
@@ -0,0 +1,109 @@
+{
+    "board_id": "0x002f",
+    "chip_info": "910",
+    "deploy_mode": "lab",
+    "group_count": "1",
+    "group_list": [
+        {
+            "device_num": "8",
+            "server_num": "1",
+            "group_name": "",
+            "instance_count": "8",
+            "instance_list": [
+                {
+                    "devices": [
+                        {
+                            "device_id": "0",
+                            "device_ip": "192.168.100.101"
+                        }
+                    ],
+                    "rank_id": "0",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "1",
+                            "device_ip": "192.168.101.101"
+                        }
+                    ],
+                    "rank_id": "1",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "2",
+                            "device_ip": "192.168.102.101"
+                        }
+                    ],
+                    "rank_id": "2",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "3",
+                            "device_ip": "192.168.103.101"
+                        }
+                    ],
+                    "rank_id": "3",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "4",
+                            "device_ip": "192.168.100.100"
+                        }
+                    ],
+                    "rank_id": "4",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "5",
+                            "device_ip": "192.168.101.100"
+                        }
+                    ],
+                    "rank_id": "5",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "6",
+                            "device_ip": "192.168.102.100"
+                        }
+                    ],
+                    "rank_id": "6",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "7",
+                            "device_ip": "192.168.103.100"
+                        }
+                    ],
+                    "rank_id": "7",
+                    "server_id": "0.0.0.0"
+                }
+            ]
+        }
+    ],
+    "para_plane_nic_location": "device",
+    "para_plane_nic_name": [
+        "eth0",
+        "eth1",
+        "eth2",
+        "eth3",
+        "eth4",
+        "eth5",
+        "eth6",
+        "eth7"
+    ],
+    "para_plane_nic_num": "8",
+    "status": "completed"
+}
@@ -0,0 +1,29 @@
+#!/bin/bash
+rm -rf Onnxgraph
+rm -rf Partition
+rm -rf OptimizeSubGraph
+rm -rf Aicpu_Optimized
+rm *txt
+rm -rf result_$RANK_ID
+
+
+
+export RANK_ID=$1
+export RANK_SIZE=$2
+export DEVICE_ID=$RANK_ID
+export DEVICE_INDEX=$RANK_ID
+export RANK_TABLE_FILE=rank_table.json
+export JOB_ID=123678
+export FUSION_TENSOR_SIZE=1000000000
+
+KERNEL_NUM=20
+PID_START=$((KERNEL_NUM * RANK_ID))
+PID_END=$((PID_START + KERNEL_NUM - 1))
+
+#sleep 5
+taskset -c  $PID_START-$PID_END python3 $3/train.py \
+--mode $4
+
+mkdir graph
+mv *.txt graph
+mv *.pbtxt graph
@@ -0,0 +1,109 @@
+{
+    "board_id": "0x002f",
+    "chip_info": "910",
+    "deploy_mode": "lab",
+    "group_count": "1",
+    "group_list": [
+        {
+            "device_num": "8",
+            "server_num": "1",
+            "group_name": "",
+            "instance_count": "8",
+            "instance_list": [
+                {
+                    "devices": [
+                        {
+                            "device_id": "0",
+                            "device_ip": "192.168.100.101"
+                        }
+                    ],
+                    "rank_id": "0",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "1",
+                            "device_ip": "192.168.101.101"
+                        }
+                    ],
+                    "rank_id": "1",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "2",
+                            "device_ip": "192.168.102.101"
+                        }
+                    ],
+                    "rank_id": "2",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "3",
+                            "device_ip": "192.168.103.101"
+                        }
+                    ],
+                    "rank_id": "3",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "4",
+                            "device_ip": "192.168.100.100"
+                        }
+                    ],
+                    "rank_id": "4",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "5",
+                            "device_ip": "192.168.101.100"
+                        }
+                    ],
+                    "rank_id": "5",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "6",
+                            "device_ip": "192.168.102.100"
+                        }
+                    ],
+                    "rank_id": "6",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "7",
+                            "device_ip": "192.168.103.100"
+                        }
+                    ],
+                    "rank_id": "7",
+                    "server_id": "0.0.0.0"
+                }
+            ]
+        }
+    ],
+    "para_plane_nic_location": "device",
+    "para_plane_nic_name": [
+        "eth0",
+        "eth1",
+        "eth2",
+        "eth3",
+        "eth4",
+        "eth5",
+        "eth6",
+        "eth7"
+    ],
+    "para_plane_nic_num": "8",
+    "status": "completed"
+}
@@ -0,0 +1,29 @@
+#!/bin/bash
+rm -rf Onnxgraph
+rm -rf Partition
+rm -rf OptimizeSubGraph
+rm -rf Aicpu_Optimized
+rm *txt
+rm -rf result_$RANK_ID
+
+
+
+export RANK_ID=$1
+export RANK_SIZE=$2
+export DEVICE_ID=$RANK_ID
+export DEVICE_INDEX=$RANK_ID
+export RANK_TABLE_FILE=rank_table.json
+export JOB_ID=123678
+export FUSION_TENSOR_SIZE=1000000000
+
+KERNEL_NUM=20
+PID_START=$((KERNEL_NUM * RANK_ID))
+PID_END=$((PID_START + KERNEL_NUM - 1))
+
+#sleep 5
+taskset -c  $PID_START-$PID_END python3 $3/train.py \
+--mode $4
+
+mkdir graph
+mv *.txt graph
+mv *.pbtxt graph
@@ -0,0 +1,109 @@
+{
+    "board_id": "0x002f",
+    "chip_info": "910",
+    "deploy_mode": "lab",
+    "group_count": "1",
+    "group_list": [
+        {
+            "device_num": "8",
+            "server_num": "1",
+            "group_name": "",
+            "instance_count": "8",
+            "instance_list": [
+                {
+                    "devices": [
+                        {
+                            "device_id": "0",
+                            "device_ip": "192.168.100.101"
+                        }
+                    ],
+                    "rank_id": "0",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "1",
+                            "device_ip": "192.168.101.101"
+                        }
+                    ],
+                    "rank_id": "1",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "2",
+                            "device_ip": "192.168.102.101"
+                        }
+                    ],
+                    "rank_id": "2",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "3",
+                            "device_ip": "192.168.103.101"
+                        }
+                    ],
+                    "rank_id": "3",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "4",
+                            "device_ip": "192.168.100.100"
+                        }
+                    ],
+                    "rank_id": "4",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "5",
+                            "device_ip": "192.168.101.100"
+                        }
+                    ],
+                    "rank_id": "5",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "6",
+                            "device_ip": "192.168.102.100"
+                        }
+                    ],
+                    "rank_id": "6",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "7",
+                            "device_ip": "192.168.103.100"
+                        }
+                    ],
+                    "rank_id": "7",
+                    "server_id": "0.0.0.0"
+                }
+            ]
+        }
+    ],
+    "para_plane_nic_location": "device",
+    "para_plane_nic_name": [
+        "eth0",
+        "eth1",
+        "eth2",
+        "eth3",
+        "eth4",
+        "eth5",
+        "eth6",
+        "eth7"
+    ],
+    "para_plane_nic_num": "8",
+    "status": "completed"
+}
@@ -0,0 +1,29 @@
+#!/bin/bash
+rm -rf Onnxgraph
+rm -rf Partition
+rm -rf OptimizeSubGraph
+rm -rf Aicpu_Optimized
+rm *txt
+rm -rf result_$RANK_ID
+
+
+
+export RANK_ID=$1
+export RANK_SIZE=$2
+export DEVICE_ID=$RANK_ID
+export DEVICE_INDEX=$RANK_ID
+export RANK_TABLE_FILE=rank_table.json
+export JOB_ID=123678
+export FUSION_TENSOR_SIZE=1000000000
+
+KERNEL_NUM=20
+PID_START=$((KERNEL_NUM * RANK_ID))
+PID_END=$((PID_START + KERNEL_NUM - 1))
+
+#sleep 5
+taskset -c  $PID_START-$PID_END python3 $3/train.py \
+--mode $4
+
+mkdir graph
+mv *.txt graph
+mv *.pbtxt graph
@@ -0,0 +1,109 @@
+{
+    "board_id": "0x002f",
+    "chip_info": "910",
+    "deploy_mode": "lab",
+    "group_count": "1",
+    "group_list": [
+        {
+            "device_num": "8",
+            "server_num": "1",
+            "group_name": "",
+            "instance_count": "8",
+            "instance_list": [
+                {
+                    "devices": [
+                        {
+                            "device_id": "0",
+                            "device_ip": "192.168.100.101"
+                        }
+                    ],
+                    "rank_id": "0",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "1",
+                            "device_ip": "192.168.101.101"
+                        }
+                    ],
+                    "rank_id": "1",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "2",
+                            "device_ip": "192.168.102.101"
+                        }
+                    ],
+                    "rank_id": "2",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "3",
+                            "device_ip": "192.168.103.101"
+                        }
+                    ],
+                    "rank_id": "3",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "4",
+                            "device_ip": "192.168.100.100"
+                        }
+                    ],
+                    "rank_id": "4",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "5",
+                            "device_ip": "192.168.101.100"
+                        }
+                    ],
+                    "rank_id": "5",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "6",
+                            "device_ip": "192.168.102.100"
+                        }
+                    ],
+                    "rank_id": "6",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "7",
+                            "device_ip": "192.168.103.100"
+                        }
+                    ],
+                    "rank_id": "7",
+                    "server_id": "0.0.0.0"
+                }
+            ]
+        }
+    ],
+    "para_plane_nic_location": "device",
+    "para_plane_nic_name": [
+        "eth0",
+        "eth1",
+        "eth2",
+        "eth3",
+        "eth4",
+        "eth5",
+        "eth6",
+        "eth7"
+    ],
+    "para_plane_nic_num": "8",
+    "status": "completed"
+}
@@ -0,0 +1,29 @@
+#!/bin/bash
+rm -rf Onnxgraph
+rm -rf Partition
+rm -rf OptimizeSubGraph
+rm -rf Aicpu_Optimized
+rm *txt
+rm -rf result_$RANK_ID
+
+
+
+export RANK_ID=$1
+export RANK_SIZE=$2
+export DEVICE_ID=$RANK_ID
+export DEVICE_INDEX=$RANK_ID
+export RANK_TABLE_FILE=rank_table.json
+export JOB_ID=123678
+export FUSION_TENSOR_SIZE=1000000000
+
+KERNEL_NUM=20
+PID_START=$((KERNEL_NUM * RANK_ID))
+PID_END=$((PID_START + KERNEL_NUM - 1))
+
+#sleep 5
+taskset -c  $PID_START-$PID_END python3 $3/train.py \
+--mode $4
+
+mkdir graph
+mv *.txt graph
+mv *.pbtxt graph
@@ -0,0 +1,109 @@
+{
+    "board_id": "0x002f",
+    "chip_info": "910",
+    "deploy_mode": "lab",
+    "group_count": "1",
+    "group_list": [
+        {
+            "device_num": "8",
+            "server_num": "1",
+            "group_name": "",
+            "instance_count": "8",
+            "instance_list": [
+                {
+                    "devices": [
+                        {
+                            "device_id": "0",
+                            "device_ip": "192.168.100.101"
+                        }
+                    ],
+                    "rank_id": "0",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "1",
+                            "device_ip": "192.168.101.101"
+                        }
+                    ],
+                    "rank_id": "1",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "2",
+                            "device_ip": "192.168.102.101"
+                        }
+                    ],
+                    "rank_id": "2",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "3",
+                            "device_ip": "192.168.103.101"
+                        }
+                    ],
+                    "rank_id": "3",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "4",
+                            "device_ip": "192.168.100.100"
+                        }
+                    ],
+                    "rank_id": "4",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "5",
+                            "device_ip": "192.168.101.100"
+                        }
+                    ],
+                    "rank_id": "5",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "6",
+                            "device_ip": "192.168.102.100"
+                        }
+                    ],
+                    "rank_id": "6",
+                    "server_id": "0.0.0.0"
+                },
+                {
+                    "devices": [
+                        {
+                            "device_id": "7",
+                            "device_ip": "192.168.103.100"
+                        }
+                    ],
+                    "rank_id": "7",
+                    "server_id": "0.0.0.0"
+                }
+            ]
+        }
+    ],
+    "para_plane_nic_location": "device",
+    "para_plane_nic_name": [
+        "eth0",
+        "eth1",
+        "eth2",
+        "eth3",
+        "eth4",
+        "eth5",
+        "eth6",
+        "eth7"
+    ],
+    "para_plane_nic_num": "8",
+    "status": "completed"
+}
@@ -0,0 +1,29 @@
+#!/bin/bash
+rm -rf Onnxgraph
+rm -rf Partition
+rm -rf OptimizeSubGraph
+rm -rf Aicpu_Optimized
+rm *txt
+rm -rf result_$RANK_ID
+
+
+
+export RANK_ID=$1
+export RANK_SIZE=$2
+export DEVICE_ID=$RANK_ID
+export DEVICE_INDEX=$RANK_ID
+export RANK_TABLE_FILE=rank_table.json
+export JOB_ID=123678
+export FUSION_TENSOR_SIZE=1000000000
+
+KERNEL_NUM=20
+PID_START=$((KERNEL_NUM * RANK_ID))
+PID_END=$((PID_START + KERNEL_NUM - 1))
+
+#sleep 5
+taskset -c  $PID_START-$PID_END python3 $3/train.py \
+--mode $4
+
+mkdir graph
+mv *.txt graph
+mv *.pbtxt graph
@@ -0,0 +1,450 @@
+# coding: utf-8
+# Part of this is taken from Gluon's repo:
+# https://github.com/dmlc/gluon-cv/blob/master/gluoncv/data/transforms/presets/yolo.py
+
+from __future__ import division, print_function
+
+import random
+import numpy as np
+import cv2
+# from matplotlib.colors import rgb_to_hsv, hsv_to_rgb
+
+
+def mix_up(img1, img2, bbox1, bbox2):
+    '''
+    return:
+        mix_img: HWC format mix up image
+        mix_bbox: [N, 5] shape mix up bbox, i.e. `x_min, y_min, x_max, y_mix, mixup_weight`.
+    '''
+    height = max(img1.shape[0], img2.shape[0])
+    width = max(img1.shape[1], img2.shape[1])
+
+    mix_img = np.zeros(shape=(height, width, 3), dtype='float32')
+
+    # rand_num = np.random.random()
+    rand_num = np.random.beta(1.5, 1.5)
+    rand_num = max(0, min(1, rand_num))
+    mix_img[:img1.shape[0], :img1.shape[1], :] = img1.astype('float32') * rand_num
+    mix_img[:img2.shape[0], :img2.shape[1], :] += img2.astype('float32') * (1. - rand_num)
+
+    mix_img = mix_img.astype('uint8')
+
+    # the last element of the 2nd dimention is the mix up weight
+    bbox1 = np.concatenate((bbox1, np.full(shape=(bbox1.shape[0], 1), fill_value=rand_num)), axis=-1)
+    bbox2 = np.concatenate((bbox2, np.full(shape=(bbox2.shape[0], 1), fill_value=1. - rand_num)), axis=-1)
+    mix_bbox = np.concatenate((bbox1, bbox2), axis=0)
+
+    return mix_img, mix_bbox
+
+
+def bbox_crop(bbox, crop_box=None, allow_outside_center=True):
+    """Crop bounding boxes according to slice area.
+    This method is mainly used with image cropping to ensure bonding boxes fit
+    within the cropped image.
+    Parameters
+    ----------
+    bbox : numpy.ndarray
+        Numpy.ndarray with shape (N, 4+) where N is the number of bounding boxes.
+        The second axis represents attributes of the bounding box.
+        Specifically, these are :math:`(x_{min}, y_{min}, x_{max}, y_{max})`,
+        we allow additional attributes other than coordinates, which stay intact
+        during bounding box transformations.
+    crop_box : tuple
+        Tuple of length 4. :math:`(x_{min}, y_{min}, width, height)`
+    allow_outside_center : bool
+        If `False`, remove bounding boxes which have centers outside cropping area.
+    Returns
+    -------
+    numpy.ndarray
+        Cropped bounding boxes with shape (M, 4+) where M <= N.
+    """
+    bbox = bbox.copy()
+    if crop_box is None:
+        return bbox
+    if not len(crop_box) == 4:
+        raise ValueError(
+            "Invalid crop_box parameter, requires length 4, given {}".format(str(crop_box)))
+    if sum([int(c is None) for c in crop_box]) == 4:
+        return bbox
+
+    l, t, w, h = crop_box
+
+    left = l if l else 0
+    top = t if t else 0
+    right = left + (w if w else np.inf)
+    bottom = top + (h if h else np.inf)
+    crop_bbox = np.array((left, top, right, bottom))
+
+    if allow_outside_center:
+        mask = np.ones(bbox.shape[0], dtype=bool)
+    else:
+        centers = (bbox[:, :2] + bbox[:, 2:4]) / 2
+        mask = np.logical_and(crop_bbox[:2] <= centers, centers < crop_bbox[2:]).all(axis=1)
+
+    # transform borders
+    bbox[:, :2] = np.maximum(bbox[:, :2], crop_bbox[:2])
+    bbox[:, 2:4] = np.minimum(bbox[:, 2:4], crop_bbox[2:4])
+    bbox[:, :2] -= crop_bbox[:2]
+    bbox[:, 2:4] -= crop_bbox[:2]
+
+    mask = np.logical_and(mask, (bbox[:, :2] < bbox[:, 2:4]).all(axis=1))
+    bbox = bbox[mask]
+    return bbox
+
+def bbox_iou(bbox_a, bbox_b, offset=0):
+    """Calculate Intersection-Over-Union(IOU) of two bounding boxes.
+    Parameters
+    ----------
+    bbox_a : numpy.ndarray
+        An ndarray with shape :math:`(N, 4)`.
+    bbox_b : numpy.ndarray
+        An ndarray with shape :math:`(M, 4)`.
+    offset : float or int, default is 0
+        The ``offset`` is used to control the whether the width(or height) is computed as
+        (right - left + ``offset``).
+        Note that the offset must be 0 for normalized bboxes, whose ranges are in ``[0, 1]``.
+    Returns
+    -------
+    numpy.ndarray
+        An ndarray with shape :math:`(N, M)` indicates IOU between each pairs of
+        bounding boxes in `bbox_a` and `bbox_b`.
+    """
+    if bbox_a.shape[1] < 4 or bbox_b.shape[1] < 4:
+        raise IndexError("Bounding boxes axis 1 must have at least length 4")
+
+    tl = np.maximum(bbox_a[:, None, :2], bbox_b[:, :2])
+    br = np.minimum(bbox_a[:, None, 2:4], bbox_b[:, 2:4])
+
+    area_i = np.prod(br - tl + offset, axis=2) * (tl < br).all(axis=2)
+    area_a = np.prod(bbox_a[:, 2:4] - bbox_a[:, :2] + offset, axis=1)
+    area_b = np.prod(bbox_b[:, 2:4] - bbox_b[:, :2] + offset, axis=1)
+    return area_i / (area_a[:, None] + area_b - area_i)
+
+
+def random_crop_with_constraints(bbox, size, min_scale=0.25, max_scale=1,
+                                 max_aspect_ratio=2, constraints=None,
+                                 max_trial=10):
+    """Crop an image randomly with bounding box constraints.
+    This data augmentation is used in training of
+    Single Shot Multibox Detector [#]_. More details can be found in
+    data augmentation section of the original paper.
+    .. [#] Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy,
+       Scott Reed, Cheng-Yang Fu, Alexander C. Berg.
+       SSD: Single Shot MultiBox Detector. ECCV 2016.
+    Parameters
+    ----------
+    bbox : numpy.ndarray
+        Numpy.ndarray with shape (N, 4+) where N is the number of bounding boxes.
+        The second axis represents attributes of the bounding box.
+        Specifically, these are :math:`(x_{min}, y_{min}, x_{max}, y_{max})`,
+        we allow additional attributes other than coordinates, which stay intact
+        during bounding box transformations.
+    size : tuple
+        Tuple of length 2 of image shape as (width, height).
+    min_scale : float
+        The minimum ratio between a cropped region and the original image.
+        The default value is :obj:`0.3`.
+    max_scale : float
+        The maximum ratio between a cropped region and the original image.
+        The default value is :obj:`1`.
+    max_aspect_ratio : float
+        The maximum aspect ratio of cropped region.
+        The default value is :obj:`2`.
+    constraints : iterable of tuples
+        An iterable of constraints.
+        Each constraint should be :obj:`(min_iou, max_iou)` format.
+        If means no constraint if set :obj:`min_iou` or :obj:`max_iou` to :obj:`None`.
+        If this argument defaults to :obj:`None`, :obj:`((0.1, None), (0.3, None),
+        (0.5, None), (0.7, None), (0.9, None), (None, 1))` will be used.
+    max_trial : int, default 40
+        Maximum number of trials for each constraint before exit no matter what.
+    Returns
+    -------
+    numpy.ndarray
+        Cropped bounding boxes with shape :obj:`(M, 4+)` where M <= N.
+    tuple
+        Tuple of length 4 as (x_offset, y_offset, new_width, new_height).
+    """
+    # default params in paper
+    if constraints is None:
+        constraints = (
+            # (0.1, None),
+            (0.3, None),
+            (0.5, None),
+            (0.7, None),
+            (0.9, None),
+            (None, 1),
+        )
+
+    w, h = size
+
+    candidates = [(0, 0, w, h)]
+    for min_iou, max_iou in constraints:
+        min_iou = -np.inf if min_iou is None else min_iou
+        max_iou = np.inf if max_iou is None else max_iou
+
+        for _ in range(max_trial):
+            scale = random.uniform(min_scale, max_scale)
+            aspect_ratio = random.uniform(
+                max(1 / max_aspect_ratio, scale * scale),
+                min(max_aspect_ratio, 1 / (scale * scale)))
+            crop_h = int(h * scale / np.sqrt(aspect_ratio))
+            crop_w = int(w * scale * np.sqrt(aspect_ratio))
+
+            crop_t = random.randrange(h - crop_h)
+            crop_l = random.randrange(w - crop_w)
+            crop_bb = np.array((crop_l, crop_t, crop_l + crop_w, crop_t + crop_h))
+
+            if len(bbox) == 0:
+                top, bottom = crop_t, crop_t + crop_h
+                left, right = crop_l, crop_l + crop_w
+                return bbox, (left, top, right-left, bottom-top)
+
+            iou = bbox_iou(bbox, crop_bb[np.newaxis])
+            if min_iou <= iou.min() and iou.max() <= max_iou:
+                top, bottom = crop_t, crop_t + crop_h
+                left, right = crop_l, crop_l + crop_w
+                candidates.append((left, top, right-left, bottom-top))
+                break
+
+    # random select one
+    while candidates:
+        crop = candidates.pop(np.random.randint(0, len(candidates)))
+        new_bbox = bbox_crop(bbox, crop, allow_outside_center=False)
+        if new_bbox.size < 1:
+            continue
+        new_crop = (crop[0], crop[1], crop[2], crop[3])
+        return new_bbox, new_crop
+    return bbox, (0, 0, w, h)
+
+def _rand(a=0., b=1.):
+    return np.random.rand() * (b - a) + a
+def random_color_distort(image_data, _hue=0.1, _sat=1.5, _val=1.5):
+    _hue = _rand(-_hue, _hue)
+    _sat = _rand(1, _sat) if _rand() < .5 else 1 / _rand(1, _sat)
+    _val = _rand(1, _val) if _rand() < .5 else 1 / _rand(1, _val)
+    x = rgb_to_hsv(image_data)
+    x[..., 0] += _hue
+    x[..., 0][x[..., 0] > 1] -= 1
+    x[..., 0][x[..., 0] < 0] += 1
+    x[..., 1] *= _sat
+    x[..., 2] *= _val
+    x[x > 1] = 1
+    x[x < 0] = 0
+    image_data = hsv_to_rgb(x)
+    image_data = image_data.astype(np.float32)
+    return image_data
+
+
+def random_color_distort_1(img, bgain=16, hgain=0.0138, sgain=0.678, vgain=0.36):
+    # brightness_delta = int(np.random.uniform(-bgain, bgain))
+    # img = np.clip(img + brightness_delta , 0, 255)
+    # img = img.astype(np.uint8)
+
+    r = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1  # random gains
+    hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))
+    dtype = img.dtype  # uint8
+
+    x = np.arange(0, 256, dtype=np.int16)
+    lut_hue = ((x * r[0]) % 180).astype(dtype)
+    lut_sat = np.clip(x * r[1], 0, 255).astype(dtype)
+    lut_val = np.clip(x * r[2], 0, 255).astype(dtype)
+
+    img_hsv = cv2.merge((cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val))).astype(dtype)
+    img = cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR)  # no return needed
+
+    return img
+
+
+
+def random_color_distort_raw(img, brightness_delta=16, hue_vari=0.01, sat_vari=0.15, val_vari=0.15, p=0.2):
+    '''
+    randomly distort image color. Adjust brightness, hue, saturation, value.
+    param:
+        img: a BGR uint8 format OpenCV image. HWC format.
+    '''
+
+    def random_hue(img_hsv, hue_vari, p=p):
+        if np.random.uniform(0, 1) > p:
+            hue_delta = np.random.randint(-hue_vari, hue_vari)
+            img_hsv[:, :, 0] = (img_hsv[:, :, 0] + hue_delta) % 180
+        return img_hsv
+
+    def random_saturation(img_hsv, sat_vari, p=p):
+        if np.random.uniform(0, 1) > p:
+            sat_mult = 1 + np.random.uniform(-sat_vari, sat_vari)
+            img_hsv[:, :, 1] *= sat_mult
+        return img_hsv
+
+    def random_value(img_hsv, val_vari, p=p):
+        if np.random.uniform(0, 1) > p:
+            val_mult = 1 + np.random.uniform(-val_vari, val_vari)
+            img_hsv[:, :, 2] *= val_mult
+        return img_hsv
+
+    def random_brightness(img, brightness_delta, p=p):
+        if np.random.uniform(0, 1) > p:
+            img = img.astype(np.float32)
+            brightness_delta = int(np.random.uniform(-brightness_delta, brightness_delta))
+            img = img + brightness_delta
+        return np.clip(img, 0, 255)
+
+    # brightness
+    img = random_brightness(img, brightness_delta)
+    img = img.astype(np.uint8)
+
+    # color jitter
+    img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV).astype(np.float32)
+
+    if np.random.randint(0, 2):
+        img_hsv = random_value(img_hsv, val_vari)
+        img_hsv = random_saturation(img_hsv, sat_vari)
+        img_hsv = random_hue(img_hsv, hue_vari)
+    else:
+        img_hsv = random_saturation(img_hsv, sat_vari)
+        img_hsv = random_hue(img_hsv, hue_vari)
+        img_hsv = random_value(img_hsv, val_vari)
+
+    img_hsv = np.clip(img_hsv, 0, 255)
+    img = cv2.cvtColor(img_hsv.astype(np.uint8), cv2.COLOR_HSV2BGR)
+
+    return img
+
+
+def letterbox_resize(img, new_width, new_height, interp=0):
+    '''
+    Letterbox resize. keep the original aspect ratio in the resized image.
+    '''
+    ori_height, ori_width = img.shape[:2]
+
+    resize_ratio = min(new_width / ori_width, new_height / ori_height)
+
+    resize_w = int(resize_ratio * ori_width)
+    resize_h = int(resize_ratio * ori_height)
+
+    img = cv2.resize(img, (resize_w, resize_h), interpolation=interp)
+    image_padded = np.full((new_height, new_width, 3), 128, np.uint8)
+
+    dw = int((new_width - resize_w) / 2)
+    dh = int((new_height - resize_h) / 2)
+
+    image_padded[dh: resize_h + dh, dw: resize_w + dw, :] = img
+
+    return image_padded, resize_ratio, dw, dh
+
+
+def resize_with_bbox(img, bbox, new_width, new_height, interp=0, letterbox=False):
+    '''
+    Resize the image and correct the bbox accordingly.
+    '''
+
+    if letterbox:
+        image_padded, resize_ratio, dw, dh = letterbox_resize(img, new_width, new_height, interp)
+
+        # xmin, xmax
+        bbox[:, [0, 2]] = bbox[:, [0, 2]] * resize_ratio + dw
+        # ymin, ymax
+        bbox[:, [1, 3]] = bbox[:, [1, 3]] * resize_ratio + dh
+
+        return image_padded, bbox
+    else:
+        ori_height, ori_width = img.shape[:2]
+
+        img = cv2.resize(img, (new_width, new_height), interpolation=interp)
+
+        # xmin, xmax
+        bbox[:, [0, 2]] = bbox[:, [0, 2]] / ori_width * new_width
+        # ymin, ymax
+        bbox[:, [1, 3]] = bbox[:, [1, 3]] / ori_height * new_height
+
+        return img, bbox
+
+
+def random_flip(img, bbox, px=0, py=0):
+    '''
+    Randomly flip the image and correct the bbox.
+    param:
+    px:
+        the probability of horizontal flip
+    py:
+        the probability of vertical flip
+    '''
+    height, width = img.shape[:2]
+    if np.random.uniform(0, 1) < px:
+        img = cv2.flip(img, 1)
+        xmax = width - bbox[:, 0]
+        xmin = width - bbox[:, 2]
+        bbox[:, 0] = xmin
+        bbox[:, 2] = xmax
+
+    if np.random.uniform(0, 1) < py:
+        img = cv2.flip(img, 0)
+        ymax = height - bbox[:, 1]
+        ymin = height - bbox[:, 3]
+        bbox[:, 1] = ymin
+        bbox[:, 3] = ymax
+    return img, bbox
+
+def random_resize(img, bbox, min_ratio=0.25, max_ratio=2, jitter=0.3):
+    '''
+    Random expand original image with borders, this is identical to placing
+    the original image on a larger canvas.
+    param:
+    max_ratio :
+        Maximum ratio of the output image on both direction(vertical and horizontal)
+    fill :
+        The value(s) for padded borders.
+    keep_ratio : bool
+        If `True`, will keep output image the same aspect ratio as input.
+    '''
+    h,w,c = img.shape
+    max_ratio_limited = 608 / max(h,w)
+    scale = random.uniform(min_ratio, max_ratio)
+    scale = min(max_ratio_limited, scale)
+
+    w_ratio = random.uniform(1 - jitter, 1 + jitter) * scale
+    h_ratio = random.uniform(1 - jitter, 1 + jitter) * scale
+
+    dst = cv2.resize(img, None, fx=w_ratio, fy=h_ratio)
+
+    # correct bbox
+    bbox[:, 0] *= w_ratio
+    bbox[:, 2] *= w_ratio
+    bbox[:, 1] *= h_ratio
+    bbox[:, 3] *= h_ratio
+
+    return dst, bbox
+
+
+def random_expand(img, bbox, max_ratio=2, fill=0, keep_ratio=True):
+    '''
+    Random expand original image with borders, this is identical to placing
+    the original image on a larger canvas.
+    param:
+    max_ratio :
+        Maximum ratio of the output image on both direction(vertical and horizontal)
+    fill :
+        The value(s) for padded borders.
+    keep_ratio : bool
+        If `True`, will keep output image the same aspect ratio as input.
+    '''
+    h, w, c = img.shape
+    ratio_x = random.uniform(1, max_ratio)
+    if keep_ratio:
+        ratio_y = ratio_x
+    else:
+        ratio_y = random.uniform(1, max_ratio)
+
+    oh, ow = int(h * ratio_y), int(w * ratio_x)
+    off_y = random.randint(0, oh - h)
+    off_x = random.randint(0, ow - w)
+
+    dst = np.full(shape=(oh, ow, c), fill_value=fill, dtype=img.dtype)
+
+    dst[off_y:off_y + h, off_x:off_x + w, :] = img
+
+    # correct bbox
+    bbox[:, :2] += (off_x, off_y)
+    bbox[:, 2:4] += (off_x, off_y)
+
+    return dst, bbox
@@ -0,0 +1,294 @@
+# coding: utf-8
+
+from __future__ import division, print_function
+
+import numpy as np
+import cv2
+import sys
+from utils.data_aug import *
+import random
+import tensorflow as tf
+
+PY_VERSION = sys.version_info[0]
+iter_cnt = 0
+IterControl = 50
+
+def color_jitter(image, brightness=0, contrast=0, saturation=0, hue=0):
+  """Distorts the color of the image.
+
+  Args:
+    image: The input image tensor.
+    brightness: A float, specifying the brightness for color jitter.
+    contrast: A float, specifying the contrast for color jitter.
+    saturation: A float, specifying the saturation for color jitter.
+    hue: A float, specifying the hue for color jitter.
+
+  Returns:
+    The distorted image tensor.
+  """
+  with tf.name_scope('distort_color'):
+    if brightness > 0:
+      image = tf.image.random_brightness(image, max_delta=brightness)
+    if contrast > 0:
+      image = tf.image.random_contrast(
+          image, lower=1-contrast, upper=1+contrast)
+    if saturation > 0:
+      image = tf.image.random_saturation(
+          image, lower=1-saturation, upper=1+saturation)
+    if hue > 0:
+      image = tf.image.random_hue(image, max_delta=hue)
+    return image
+
+def parse_line(line):
+    '''
+    Given a line from the training/test txt file, return parsed info.
+    line format: line_index, img_path, img_width, img_height, [box_info_1 (5 number)], ...
+    return:
+        line_idx: int32
+        pic_path: string.
+        boxes: shape [N, 4], N is the ground truth count, elements in the second
+            dimension are [x_min, y_min, x_max, y_max]
+        labels: shape [N]. class index.
+        img_width: int.
+        img_height: int
+    '''
+    if 'str' not in str(type(line)):
+        line = line.decode()
+    s = line.strip().split(' ')
+    assert len(
+        s) > 8, 'Annotation error! Please check your annotation file. Make sure there is at least one target object in each image.'
+    # line_idx = int(s[0])
+    pic_path = s[1]
+    img_width = int(s[2])
+    img_height = int(s[3])
+    s = s[4:]
+    assert len(
+        s) % 5 == 0, 'Annotation error! Please check your annotation file. Maybe partially missing some coordinates?'
+    box_cnt = len(s) // 5
+    boxes = []
+    labels = []
+    for i in range(box_cnt):
+        label, x_min, y_min, x_max, y_max = int(s[i * 5]), float(s[i * 5 + 1]), float(s[i * 5 + 2]), float(
+            s[i * 5 + 3]), float(s[i * 5 + 4])
+        boxes.append([x_min, y_min, x_max, y_max])
+        labels.append(label)
+    boxes = np.asarray(boxes, np.float32)
+    labels = np.asarray(labels, np.int32)
+    return pic_path, boxes, labels, img_width, img_height
+
+
+def process_box(boxes, labels, img_size, class_num, anchors):
+    '''
+    Generate the y_true label, i.e. the ground truth feature_maps in 3 different scales.
+    params:
+        boxes: [N, 5] shape, float32 dtype. `x_min, y_min, x_max, y_mix, mixup_weight`.
+        labels: [N] shape, int32 dtype.
+        class_num: int32 num.
+        anchors: [9, 4] shape, float32 dtype.
+    '''
+    anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
+
+    # boxes = np.random.shuffle()
+    # convert boxes form:
+    # shape: [N, 2]
+    # (x_center, y_center)
+    box_centers = (boxes[:, 0:2] + boxes[:, 2:4]) / 2
+    # (width, height)
+    box_sizes = boxes[:, 2:4] - boxes[:, 0:2]
+
+    # [13, 13, 3, 5+num_class+1] `5` means coords and labels. `1` means mix up weight. 
+    y_true_13 = np.zeros((img_size[1] // 32, img_size[0] // 32, 3, 6 + class_num), np.float32)
+    y_true_26 = np.zeros((img_size[1] // 16, img_size[0] // 16, 3, 6 + class_num), np.float32)
+    y_true_52 = np.zeros((img_size[1] // 8, img_size[0] // 8, 3, 6 + class_num), np.float32)
+
+    gt_box_13 = np.zeros((1, 32, 4), np.float32)
+    gt_box_26 = np.zeros((1, 64, 4), np.float32)
+    gt_box_52 = np.zeros((1, 128, 4), np.float32)
+    gt_box_list = [gt_box_13, gt_box_26, gt_box_52]
+
+    # mix up weight default to 1.
+    y_true_13[..., -1] = 1.
+    y_true_26[..., -1] = 1.
+    y_true_52[..., -1] = 1.
+
+    y_true = [y_true_13, y_true_26, y_true_52]
+
+    # [N, 1, 2]
+    box_sizes = np.expand_dims(box_sizes, 1)
+    # broadcast tricks
+    # [N, 1, 2] & [9, 2] ==> [N, 9, 2]
+    mins = np.maximum(- box_sizes / 2, - anchors / 2)
+    maxs = np.minimum(box_sizes / 2, anchors / 2)
+    # [N, 9, 2]
+    whs = maxs - mins
+
+    # [N, 9]
+    iou = (whs[:, :, 0] * whs[:, :, 1]) / (
+            box_sizes[:, :, 0] * box_sizes[:, :, 1] + anchors[:, 0] * anchors[:, 1] - whs[:, :, 0] * whs[:, :,
+                                                                                                     1] + 1e-10)
+    # [N]
+    best_match_idx = np.argmax(iou, axis=1)
+
+    ratio_dict = {1.: 8., 2.: 16., 3.: 32.}
+    index_dict = {0: 0, 1: 0, 2: 0}
+    for i, idx in enumerate(best_match_idx):
+        # idx: 0,1,2 ==> 2; 3,4,5 ==> 1; 6,7,8 ==> 0
+        feature_map_group = 2 - idx // 3
+        # scale ratio: 0,1,2 ==> 8; 3,4,5 ==> 16; 6,7,8 ==> 32
+        ratio = ratio_dict[np.ceil((idx + 1) / 3.)]
+        x = int(np.floor(box_centers[i, 0] / ratio))
+        y = int(np.floor(box_centers[i, 1] / ratio))
+        k = anchors_mask[feature_map_group].index(idx)
+        c = labels[i]
+        # print(feature_map_group, '|', y,x,k,c)
+
+        y_true[feature_map_group][y, x, k, :2] = box_centers[i]
+        y_true[feature_map_group][y, x, k, 2:4] = box_sizes[i]
+        y_true[feature_map_group][y, x, k, 4] = 1.
+        y_true[feature_map_group][y, x, k, 5 + c] = 1.
+        y_true[feature_map_group][y, x, k, -1] = boxes[i, -1]
+
+        if index_dict[feature_map_group] < gt_box_list[feature_map_group].shape[1]:
+            gt_box_list[feature_map_group][0, index_dict[feature_map_group], :2] = box_centers[i]
+            gt_box_list[feature_map_group][0, index_dict[feature_map_group], 2:4] = box_sizes[i]
+            index_dict[feature_map_group] += 1
+
+    return y_true_13, y_true_26, y_true_52, gt_box_13, gt_box_26, gt_box_52
+
+
+def parse_data(line, class_num, img_size, anchors, mode, letterbox_resize, multi_scale):
+    '''
+    param:
+        line: a line from the training/test txt file
+        class_num: totol class nums.
+        img_size: the size of image to be resized to. [width, height] format.
+        anchors: anchors.
+        mode: 'train' or 'val'. When set to 'train', data_augmentation will be applied.
+        letterbox_resize: whether to use the letterbox resize, i.e., keep the original aspect ratio in the resized image.
+    '''
+    if not isinstance(line, list):
+        print('###################### line')
+        pic_path, boxes, labels, _, _ = parse_line(line)
+        img = cv2.imread(pic_path)
+        # expand the 2nd dimension, mix up weight default to 1.
+        boxes = np.concatenate((boxes, np.full(shape=(boxes.shape[0], 1), fill_value=1., dtype=np.float32)), axis=-1)
+    else:
+        print('###################### mixup')
+        # the mix up case
+        pic_path1, boxes1, labels1, _, _ = parse_line(line[0])
+        img1 = cv2.imread(pic_path1)
+        pic_path2, boxes2, labels2, _, _ = parse_line(line[1])
+        img2 = cv2.imread(pic_path2)
+
+        img, boxes = mix_up(img1, img2, boxes1, boxes2)
+        labels = np.concatenate((labels1, labels2))
+
+    if mode == 'train':
+        img, boxes = random_resize(img, boxes, min_ratio=0.25, max_ratio=2, jitter=0.3)
+
+        # random expansion with prob 0.5
+        if np.random.uniform(0, 1) > 0.5:
+            img, boxes = random_expand(img, boxes, max_ratio=3, fill=128, keep_ratio=False)
+
+        # random cropping
+        h, w, _ = img.shape
+        boxes, crop = random_crop_with_constraints(boxes, (w, h))
+        x0, y0, w, h = crop
+        img = img[y0: y0 + h, x0: x0 + w]
+
+        # resize with random interpolation
+        h, w, _ = img.shape
+        interp = np.random.randint(0, 5)
+        img, boxes = resize_with_bbox(img, boxes, img_size[0], img_size[1], interp=interp, letterbox=letterbox_resize)
+
+        # random horizontal flip
+        h, w, _ = img.shape
+        img, boxes = random_flip(img, boxes, px=0.5)
+
+    else:
+        img, boxes = resize_with_bbox(img, boxes, img_size[0], img_size[1], interp=1, letterbox=letterbox_resize)
+
+    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
+
+    # the input of yolo_v3 should be in range 0~1
+    img = img / 255.
+
+    if mode == 'train' and iter_cnt >= IterControl and multi_scale:
+        cav = np.zeros((608, 608, 3), dtype=np.float32) + 0.5
+        true_h, true_w, c = img.shape
+        cav[:true_h, :true_w, :] = img
+        img = cav.astype(np.float32)
+        img_size = [608, 608]
+
+    y_true_13, y_true_26, y_true_52, gt_box_13, gt_box_26, gt_box_52 = process_box(boxes, labels, img_size, class_num,
+                                                                                   anchors)
+
+    return img, y_true_13, y_true_26, y_true_52, gt_box_13, gt_box_26, gt_box_52
+
+
+def get_batch_data(batch_line, class_num, img_size, anchors, mode, multi_scale=False, mix_up=False,
+                   letterbox_resize=True, interval=10):
+    '''
+    generate a batch of imgs and labels
+    param:
+        batch_line: a batch of lines from train/val.txt files
+        class_num: num of total classes.
+        img_size: the image size to be resized to. format: [width, height].
+        anchors: anchors. shape: [9, 2].
+        mode: 'train' or 'val'. if set to 'train', data augmentation will be applied.
+        multi_scale: whether to use multi_scale training, img_size varies from [320, 320] to [640, 640] by default. Note that it will take effect only when mode is set to 'train'.
+        letterbox_resize: whether to use the letterbox resize, i.e., keep the original aspect ratio in the resized image.
+        interval: change the scale of image every interval batches. Note that it's indeterministic because of the multi threading.
+    '''
+    if isinstance(mode, bytes):
+        mode = mode.decode()
+
+    global iter_cnt
+    # multi_scale training
+    if multi_scale and mode == 'train' and iter_cnt >= IterControl:
+        random.seed(iter_cnt // interval)
+        random_img_size = [[x * 32, x * 32] for x in range(10, 20)]
+        img_size = random.sample(random_img_size, 1)[0]
+        print('multi_scale iter: %d, img_size: %d,%d' % (iter_cnt, img_size[0], img_size[1]))
+    else:
+        print('single_scale iter: %d, img_size: %d,%d' % (iter_cnt, img_size[0], img_size[1]))
+    iter_cnt += 1
+
+    img_idx_batch, img_batch, y_true_13_batch, y_true_26_batch, y_true_52_batch = [], [], [], [], []
+    gt_box_13_batch, gt_box_26_batch, gt_box_52_batch = [], [], []
+
+    # mix up strategy
+    if mix_up and mode == 'train':
+        mix_lines = []
+        batch_line = batch_line.tolist()
+        for idx, line in enumerate(batch_line):
+            if np.random.uniform(0, 1) < 0.5:
+                mix_lines.append([line, random.sample(batch_line[:idx] + batch_line[idx + 1:], 1)[0]])
+            else:
+                mix_lines.append(line)
+        batch_line = mix_lines
+
+    for line in batch_line:
+        img, y_true_13, y_true_26, y_true_52, gt_box_13, gt_box_26, gt_box_52 = parse_data(line, class_num,
+                                                                                           img_size, anchors,
+                                                                                           mode,
+                                                                                           letterbox_resize,
+                                                                                           multi_scale)
+
+        img_batch.append(img)
+        y_true_13_batch.append(y_true_13)
+        y_true_26_batch.append(y_true_26)
+        y_true_52_batch.append(y_true_52)
+        gt_box_13_batch.append(gt_box_13)
+        gt_box_26_batch.append(gt_box_26)
+        gt_box_52_batch.append(gt_box_52)
+
+    img_batch, y_true_13_batch, y_true_26_batch, y_true_52_batch = np.asarray(img_batch, np.float32), np.asarray(
+        y_true_13_batch, np.float32), np.asarray(y_true_26_batch, np.float32), np.asarray(y_true_52_batch, np.float32)
+
+    gt_box_13_batch, gt_box_26_batch, gt_box_52_batch = \
+        np.asarray(gt_box_13_batch), np.asarray(gt_box_26_batch), np.asarray(gt_box_52_batch)
+
+    return img_batch, y_true_13_batch, y_true_26_batch, y_true_52_batch, \
+           gt_box_13_batch, gt_box_26_batch, gt_box_52_batch
+
@@ -0,0 +1,423 @@
+# coding: utf-8
+
+from __future__ import division, print_function
+
+import numpy as np
+import cv2
+from collections import Counter
+
+from utils.nms_utils import cpu_nms, gpu_nms
+from utils.data_utils import parse_line
+
+
+def calc_iou(pred_boxes, true_boxes):
+    '''
+    Compute the pairwise IoU matrix of two box sets with numpy broadcasting.
+    Columns 0:2 of a box are used as its min corner, columns 2:4 as its max corner.
+    shape_info: pred_boxes: [N, 4]
+                true_boxes: [V, 4]
+    return: IoU matrix: shape: [N, V]
+    '''
+
+    # [N, 4] -> [N, 1, 4] and [V, 4] -> [1, V, 4] so that every pair broadcasts
+    preds = np.expand_dims(pred_boxes, -2)
+    truths = np.expand_dims(true_boxes, 0)
+
+    pred_min, pred_max = preds[..., :2], preds[..., 2:]
+    true_min, true_max = truths[..., :2], truths[..., 2:]
+
+    # [N, V, 2]: overlap extent of every pair, clamped to zero for disjoint boxes
+    overlap_wh = np.maximum(np.minimum(pred_max, true_max) - np.maximum(pred_min, true_min), 0.)
+    # [N, V]
+    overlap_area = overlap_wh[..., 0] * overlap_wh[..., 1]
+
+    # [N, 1]
+    pred_wh = pred_max - pred_min
+    pred_area = pred_wh[..., 0] * pred_wh[..., 1]
+    # [1, V]
+    true_wh = true_max - true_min
+    true_area = true_wh[..., 0] * true_wh[..., 1]
+
+    # [N, V]; the 1e-10 term keeps the division defined when the union is zero
+    union_area = pred_area + true_area - overlap_area + 1e-10
+    return overlap_area / union_area
+
+
+def _collect_image_ground_truth(y_true, image_idx):
+    '''
+    Gather the ground truth of one image from the three feature-map targets.
+    return:
+        true_labels_list: list with the V class indices of the image.
+        true_boxes: array [V, 4] as (xmin, ymin, xmax, ymax); shape [0, 4] when
+                    the image has no ground truth object.
+    '''
+    true_labels_list, true_boxes_list = [], []
+    for j in range(3):  # three feature maps
+        target = y_true[j][image_idx]
+        # class scores: channels 5 up to (not including) the last one
+        true_probs_temp = target[..., 5:-1]
+        # (x_center, y_center, w, h)
+        true_boxes_temp = target[..., 0:4]
+
+        # cells/anchors that carry an object have a non-zero class vector
+        object_mask = true_probs_temp.sum(axis=-1) > 0
+
+        # [V], labels
+        true_labels_list += np.argmax(true_probs_temp[object_mask], axis=-1).tolist()
+        # [V, 4] (x_center, y_center, w, h)
+        true_boxes_list += true_boxes_temp[object_mask].tolist()
+
+    # reshape keeps the array 2-D even when the image has no object
+    centers_sizes = np.array(true_boxes_list).reshape(-1, 4)
+    box_mins = centers_sizes[:, 0:2] - centers_sizes[:, 2:4] / 2.
+    box_maxs = box_mins + centers_sizes[:, 2:4]
+    return true_labels_list, np.concatenate([box_mins, box_maxs], axis=-1)
+
+
+def _accumulate_matches(pred_boxes, pred_labels_list, true_boxes, true_labels_list,
+                        iou_thresh, pred_labels_dict, true_positive_dict):
+    '''
+    Update the per-class detection and true-positive counters for one image.
+    A ground truth box is counted as true positive at most once, when a detection
+    has it as best IoU match with IoU > iou_thresh and the same label.
+    '''
+    for label in pred_labels_list:
+        pred_labels_dict[label] += 1
+
+    if len(true_labels_list) == 0:
+        # nothing to match against: every detection stays a false positive
+        return
+
+    # [N, V]
+    iou_matrix = calc_iou(pred_boxes, true_boxes)
+    # [N]
+    max_iou_idx = np.argmax(iou_matrix, axis=-1)
+
+    matched_gt = set()
+    for k, label in enumerate(pred_labels_list):
+        match_idx = max_iou_idx[k]  # V level
+        if iou_matrix[k, match_idx] > iou_thresh and true_labels_list[match_idx] == label:
+            matched_gt.add(int(match_idx))
+
+    for t in matched_gt:
+        true_positive_dict[true_labels_list[t]] += 1
+
+
+def _finalize_counts(true_positive_dict, true_labels_dict, pred_labels_dict, calc_now):
+    '''
+    Return (recall, precision) when calc_now is true, otherwise the raw counters.
+    '''
+    if not calc_now:
+        return true_positive_dict, true_labels_dict, pred_labels_dict
+
+    tp_total = sum(true_positive_dict.values())
+    # the 1e-6 terms avoid a division by zero
+    recall = tp_total / (sum(true_labels_dict.values()) + 1e-6)
+    precision = tp_total / (sum(pred_labels_dict.values()) + 1e-6)
+    return recall, precision
+
+
+def evaluate_on_cpu(y_pred, y_true, num_classes, calc_now=True, max_boxes=50, score_thresh=0.5, iou_thresh=0.5):
+    '''
+    Given y_pred and y_true of a batch of data, get the recall and precision of the current batch.
+    NMS is done with cpu_nms.
+    param:
+        y_pred: (boxes, confs, probs), each indexed by image on the first axis.
+        y_true: the three feature-map targets, each indexed by image on the first axis.
+        num_classes: number of classes.
+        calc_now: return (recall, precision) if True, otherwise the three
+                  {class: count} dicts (true positives, ground truths, detections).
+        max_boxes, score_thresh, iou_thresh: forwarded to cpu_nms; iou_thresh is
+                  also the matching threshold.
+    Images without ground truth objects are supported: their detections only
+    add to the detection counts.
+    '''
+
+    num_images = y_true[0].shape[0]
+    true_labels_dict = {i: 0 for i in range(num_classes)}  # {class: count}
+    pred_labels_dict = {i: 0 for i in range(num_classes)}
+    true_positive_dict = {i: 0 for i in range(num_classes)}
+
+    for i in range(num_images):
+        true_labels_list, true_boxes = _collect_image_ground_truth(y_true, i)
+        for cls, count in Counter(true_labels_list).items():
+            true_labels_dict[cls] += count
+
+        # slices keep the leading dimension of size 1: [1, xxx, 4]
+        pred_boxes = y_pred[0][i:i + 1]
+        pred_confs = y_pred[1][i:i + 1]
+        pred_probs = y_pred[2][i:i + 1]
+
+        # pred_boxes: [N, 4], pred_labels: [N]
+        # N: Detected box number of the current image
+        pred_boxes, pred_confs, pred_labels = cpu_nms(pred_boxes, pred_confs * pred_probs, num_classes,
+                                                      max_boxes=max_boxes, score_thresh=score_thresh, iou_thresh=iou_thresh)
+
+        # cpu_nms returns None when nothing is detected
+        pred_labels_list = [] if pred_labels is None else pred_labels.tolist()
+        if not pred_labels_list:
+            continue
+
+        _accumulate_matches(pred_boxes, pred_labels_list, true_boxes, true_labels_list,
+                            iou_thresh, pred_labels_dict, true_positive_dict)
+
+    return _finalize_counts(true_positive_dict, true_labels_dict, pred_labels_dict, calc_now)
+
+
+def evaluate_on_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, y_pred, y_true, num_classes, iou_thresh=0.5, calc_now=True):
+    '''
+    Given y_pred and y_true of a batch of data, get the recall and precision of the current batch.
+    NMS is done by running gpu_nms_op in sess, feeding pred_boxes_flag and pred_scores_flag.
+    param:
+        y_pred: (boxes, confs, probs), each indexed by image on the first axis.
+        y_true: the three feature-map targets, each indexed by image on the first axis.
+        num_classes: number of classes.
+        iou_thresh: matching threshold between detections and ground truth.
+        calc_now: return (recall, precision) if True, otherwise the three
+                  {class: count} dicts (true positives, ground truths, detections).
+    Images without ground truth objects are supported: their detections only
+    add to the detection counts.
+    '''
+
+    num_images = y_true[0].shape[0]
+    true_labels_dict = {i: 0 for i in range(num_classes)}  # {class: count}
+    pred_labels_dict = {i: 0 for i in range(num_classes)}
+    true_positive_dict = {i: 0 for i in range(num_classes)}
+
+    for i in range(num_images):
+        true_labels_list, true_boxes = _collect_image_ground_truth(y_true, i)
+        for cls, count in Counter(true_labels_list).items():
+            true_labels_dict[cls] += count
+
+        # slices keep the leading dimension of size 1: [1, xxx, 4]
+        pred_boxes = y_pred[0][i:i + 1]
+        pred_confs = y_pred[1][i:i + 1]
+        pred_probs = y_pred[2][i:i + 1]
+
+        # pred_boxes: [N, 4], pred_labels: [N]
+        # N: Detected box number of the current image
+        pred_boxes, pred_confs, pred_labels = sess.run(gpu_nms_op,
+                                                       feed_dict={pred_boxes_flag: pred_boxes,
+                                                                  pred_scores_flag: pred_confs * pred_probs})
+
+        pred_labels_list = [] if pred_labels is None else pred_labels.tolist()
+        if not pred_labels_list:
+            continue
+
+        _accumulate_matches(pred_boxes, pred_labels_list, true_boxes, true_labels_list,
+                            iou_thresh, pred_labels_dict, true_positive_dict)
+
+    return _finalize_counts(true_positive_dict, true_labels_dict, pred_labels_dict, calc_now)
+
+
+def get_preds_gpu(sess, gpu_nms_op, pred_boxes_flag, pred_scores_flag, image_ids, y_pred):
+    '''
+    Run the NMS op on the first image of y_pred and list its detections.
+    Only image_ids[0] and the first entry of each y_pred element are used.
+    return:
+        pred_content: 2d list, one
+            [image_id, x_min, y_min, x_max, y_max, score, label] row per kept box.
+    '''
+    image_id = image_ids[0]
+
+    # slicing with 0:1 (instead of [0]) keeps the first dimension of size 1
+    feed = {pred_boxes_flag: y_pred[0][0:1],
+            pred_scores_flag: y_pred[1][0:1] * y_pred[2][0:1]}
+    boxes, scores, labels = sess.run(gpu_nms_op, feed_dict=feed)
+
+    pred_content = []
+    for idx, label in enumerate(labels):
+        x_min, y_min, x_max, y_max = boxes[idx]
+        pred_content.append([image_id, x_min, y_min, x_max, y_max, scores[idx], label])
+    return pred_content
+
+
+gt_dict = {}  # key: img_id, value: gt object list
+_gt_dict_source = None  # arguments of the parse_gt_rec call that filled gt_dict
+
+
+def parse_gt_rec(gt_filename, target_img_size, letterbox_resize=True):
+    '''
+    parse and re-organize the gt info, rescaled to the target image size.
+    The result is cached in the module level gt_dict; the cache is reused only
+    when the function is called again with the same arguments, otherwise the
+    file is parsed again into a fresh dict.
+    param:
+        gt_filename: annotation file, one line per image, parsed by parse_line.
+        target_img_size: (width, height) the boxes are rescaled to.
+        letterbox_resize: if True, keep the aspect ratio and shift the boxes by
+                          the padding offset; otherwise scale each axis independently.
+    return:
+        gt_dict: dict. Each key is a img_id, the value is the list of
+                 [x_min, y_min, x_max, y_max, label] in the corresponding img.
+    '''
+
+    global gt_dict, _gt_dict_source
+
+    source = (gt_filename, tuple(target_img_size), bool(letterbox_resize))
+    if gt_dict:
+        # a cache filled by other means (source unknown) is trusted as before
+        if _gt_dict_source is None or _gt_dict_source == source:
+            return gt_dict
+        # cached for different arguments: rebuild into a new dict so that
+        # dicts returned by earlier calls are left untouched
+        gt_dict = {}
+
+    new_width, new_height = target_img_size
+    with open(gt_filename, 'r') as f:
+        for line in f:
+            img_id, pic_path, boxes, labels, ori_width, ori_height = parse_line(line)
+
+            # the letterbox parameters only depend on the image, compute them once
+            if letterbox_resize and len(labels) > 0:
+                resize_ratio = min(new_width / ori_width, new_height / ori_height)
+
+                resize_w = int(resize_ratio * ori_width)
+                resize_h = int(resize_ratio * ori_height)
+
+                # padding added on the left / top side
+                dw = int((new_width - resize_w) / 2)
+                dh = int((new_height - resize_h) / 2)
+
+            objects = []
+            for i in range(len(labels)):
+                x_min, y_min, x_max, y_max = boxes[i]
+                label = labels[i]
+
+                if letterbox_resize:
+                    objects.append([x_min * resize_ratio + dw,
+                                    y_min * resize_ratio + dh,
+                                    x_max * resize_ratio + dw,
+                                    y_max * resize_ratio + dh,
+                                    label])
+                else:
+                    objects.append([x_min * new_width / ori_width,
+                                    y_min * new_height / ori_height,
+                                    x_max * new_width / ori_width,
+                                    y_max * new_height / ori_height,
+                                    label])
+            gt_dict[img_id] = objects
+
+    _gt_dict_source = source
+    return gt_dict
+
+
+# The following two functions are modified from FAIR's Detectron repo to calculate mAP:
+# https://github.com/facebookresearch/Detectron/blob/master/detectron/datasets/voc_eval.py
+def voc_ap(rec, prec, use_07_metric=False):
+    """Compute VOC AP from recall and precision arrays.
+
+    With use_07_metric the VOC 07 11-point method is used; otherwise
+    (default: False) AP is the area under the precision envelope.
+    """
+    if use_07_metric:
+        # 11 point metric: average the best precision reached at
+        # recall >= t for t = 0.0, 0.1, ..., 1.0
+        ap = 0.
+        for threshold in np.arange(0., 1.1, 0.1):
+            reached = rec >= threshold
+            best_prec = 0 if np.sum(reached) == 0 else np.max(prec[reached])
+            ap = ap + best_prec / 11.
+        return ap
+
+    # pad both curves with sentinel values
+    padded_rec = np.concatenate(([0.], rec, [1.]))
+    padded_prec = np.concatenate(([0.], prec, [0.]))
+
+    # precision envelope: each entry becomes the maximum of itself and
+    # everything to its right
+    padded_prec = np.maximum.accumulate(padded_prec[::-1])[::-1]
+
+    # the area only changes where the recall changes value
+    steps = np.where(padded_rec[1:] != padded_rec[:-1])[0]
+
+    # sum (\Delta recall) * prec
+    return np.sum((padded_rec[steps + 1] - padded_rec[steps]) * padded_prec[steps + 1])
+
+
+def voc_eval(gt_dict, val_preds, classidx, iou_thres=0.5, use_07_metric=False):
+    '''
+    Top level function that does the PASCAL VOC evaluation for one class.
+    param:
+        gt_dict: dict, img_id -> list of [x_min, y_min, x_max, y_max, label].
+        val_preds: list of [img_id, x_min, y_min, x_max, y_max, score, label].
+        classidx: the class to evaluate.
+        iou_thres: a detection can only be a TP when its best IoU is above this value.
+        use_07_metric: forwarded to voc_ap.
+    return:
+        (npos, nd, recall, precision, ap) with npos the number of gt boxes and
+        nd the number of detections of the class;
+        (1e-6, 1e-6, 0, 0, 0) when there is no detection of the class.
+    '''
+    # 1.obtain gt: extract all gt objects for this class
+    class_recs = {}
+    npos = 0
+    for img_id in gt_dict:
+        R = [obj for obj in gt_dict[img_id] if obj[-1] == classidx]
+        bbox = np.array([x[:4] for x in R])
+        det = [False] * len(R)
+        npos += len(R)
+        class_recs[img_id] = {'bbox': bbox, 'det': det}
+
+    # 2. obtain pred results
+    pred = [x for x in val_preds if x[-1] == classidx]
+    if not pred:
+        # explicit check instead of catching the indexing error of an empty array
+        print('no box, ignore')
+        return 1e-6, 1e-6, 0, 0, 0
+    img_ids = [x[0] for x in pred]
+    confidence = np.array([x[-2] for x in pred])
+    BB = np.array([[x[1], x[2], x[3], x[4]] for x in pred])
+
+    # 3. sort by confidence, highest first
+    sorted_ind = np.argsort(-confidence)
+    BB = BB[sorted_ind, :]
+    img_ids = [img_ids[x] for x in sorted_ind]
+
+    # 4. mark TPs and FPs
+    nd = len(img_ids)
+    tp = np.zeros(nd)
+    fp = np.zeros(nd)
+
+    for d in range(nd):
+        # all the gt info in some image
+        R = class_recs[img_ids[d]]
+        bb = BB[d, :]
+        # stays -inf (never above the threshold) when the image has no gt box
+        ovmax = -np.inf
+        BBGT = R['bbox']
+
+        if BBGT.size > 0:
+            # calc iou (the +1 offsets are applied to widths and heights of
+            # both the intersection and the boxes)
+            # intersection
+            ixmin = np.maximum(BBGT[:, 0], bb[0])
+            iymin = np.maximum(BBGT[:, 1], bb[1])
+            ixmax = np.minimum(BBGT[:, 2], bb[2])
+            iymax = np.minimum(BBGT[:, 3], bb[3])
+            iw = np.maximum(ixmax - ixmin + 1., 0.)
+            ih = np.maximum(iymax - iymin + 1., 0.)
+            inters = iw * ih
+
+            # union
+            uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + (BBGT[:, 2] - BBGT[:, 0] + 1.) * (
+                        BBGT[:, 3] - BBGT[:, 1] + 1.) - inters)
+
+            overlaps = inters / uni
+            ovmax = np.max(overlaps)
+            jmax = np.argmax(overlaps)
+
+        if ovmax > iou_thres:
+            # gt not matched yet
+            if not R['det'][jmax]:
+                tp[d] = 1.
+                R['det'][jmax] = 1
+            else:
+                # the gt box was already claimed by a more confident detection
+                fp[d] = 1.
+        else:
+            fp[d] = 1.
+
+    # compute precision recall
+    fp = np.cumsum(fp)
+    tp = np.cumsum(tp)
+    # NOTE(review): with npos == 0 the divisions by float(npos) yield nan/inf
+    rec = tp / float(npos)
+    # the eps floor keeps the precision division defined
+    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
+    ap = voc_ap(rec, prec, use_07_metric)
+
+    # return rec, prec, ap
+    return npos, nd, tp[-1] / float(npos), tp[-1] / float(nd), ap
@@ -0,0 +1,89 @@
+# coding: utf-8
+
+from __future__ import division, print_function
+
+import numpy as np
+import tensorflow as tf
+slim = tf.contrib.slim
+
+def conv2d(inputs, filters, kernel_size, strides=1):
+    """Apply slim.conv2d with a padding that does not depend on the input size.
+
+    With strides == 1 the convolution uses 'SAME' padding. Otherwise it uses
+    'VALID' padding and, for strides > 1, the two middle axes of the input are
+    first zero-padded by kernel_size - 1 in total.
+    """
+    if strides == 1:
+        return slim.conv2d(inputs, filters, kernel_size, stride=strides, padding='SAME')
+
+    if strides > 1:
+        # explicit padding: floor half before, the remainder after
+        total_pad = kernel_size - 1
+        before = total_pad // 2
+        after = total_pad - before
+        inputs = tf.pad(inputs, [[0, 0], [before, after],
+                                 [before, after], [0, 0]], mode='CONSTANT')
+
+    return slim.conv2d(inputs, filters, kernel_size, stride=strides, padding='VALID')
+
+def darknet53_body(inputs):
+    """Build the darknet53 backbone and return its three route feature maps.
+
+    The layers are created in this order: one 3x3 conv (32 filters), then five
+    stages, each made of a stride-2 3x3 conv followed by 1, 2, 8, 8 and 4
+    residual units. The outputs of the last three stages are returned.
+    """
+    def residual_unit(x, filters):
+        # 1x1 conv to `filters`, 3x3 conv to 2 * `filters`, plus the shortcut
+        squeezed = conv2d(x, filters * 1, 1)
+        expanded = conv2d(squeezed, filters * 2, 3)
+        return expanded + x
+
+    def stage(x, out_channels, unit_count):
+        # stride-2 conv, then `unit_count` residual units
+        x = conv2d(x, out_channels, 3, strides=2)
+        for _ in range(unit_count):
+            x = residual_unit(x, out_channels // 2)
+        return x
+
+    net = conv2d(inputs, 32, 3, strides=1)
+    net = stage(net, 64, 1)
+    net = stage(net, 128, 2)
+    route_1 = stage(net, 256, 8)
+    route_2 = stage(route_1, 512, 8)
+    route_3 = stage(route_2, 1024, 4)
+
+    return route_1, route_2, route_3
+
+
+def yolo_block(inputs, filters):
+    """Stack five alternating 1x1 / 3x3 convs plus a final 3x3 conv.
+
+    1x1 convs output `filters` channels, 3x3 convs output 2 * `filters`.
+    Returns (route, net): the output of the fifth conv and of the final conv.
+    """
+    net = inputs
+    for kernel_size in (1, 3, 1, 3, 1):
+        channels = filters * 1 if kernel_size == 1 else filters * 2
+        net = conv2d(net, channels, kernel_size)
+    route = net
+    net = conv2d(route, filters * 2, 3)
+    return route, net
+
+
+def upsample_layer(inputs, out_shape):
+    """Resize `inputs` with nearest neighbor to the height/width found in out_shape[1:3]."""
+    # NOTE: the target size is given as (height, width), height first
+    # TODO: Do we need to set `align_corners` as True?
+    target_size = (out_shape[1], out_shape[2])
+    return tf.image.resize_nearest_neighbor(inputs, target_size, name='upsampled')
+
+
@@ -0,0 +1,165 @@
+# coding: utf-8
+
+import numpy as np
+import tensorflow as tf
+import random
+
+from tensorflow.core.framework import summary_pb2
+
+
+def make_summary(name, val):
+    """Wrap a single value `val` under tag `name` into a Summary protobuf."""
+    entry = summary_pb2.Summary.Value(tag=name, simple_value=val)
+    return summary_pb2.Summary(value=[entry])
+
+
+class AverageMeter(object):
+    """Keep the last value, the weighted sum, the count and the running average of a series."""
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        """Set all statistics back to zero."""
+        self.val = self.average = self.sum = self.count = 0
+
+    def update(self, val, n=1):
+        """Record `val` as observed `n` times and refresh the average."""
+        self.val = val
+        self.count += n
+        self.sum += val * n
+        self.average = self.sum / float(self.count)
+
+
+def parse_anchors(anchor_path):
+    '''
+    parse anchors from a text file holding comma separated numbers.
+    param:
+        anchor_path: path of the anchor txt file.
+    returned data: shape [N, 2], dtype float32
+    '''
+    # the context manager closes the file, the previous version leaked the handle
+    with open(anchor_path, 'r') as f:
+        raw_values = f.read().split(',')
+    anchors = np.reshape(np.asarray(raw_values, np.float32), [-1, 2])
+    return anchors
+
+
+def read_class_names(class_name_path):
+    """Map each line index of the file to its line content, newline characters stripped."""
+    with open(class_name_path, 'r') as handle:
+        return {index: raw.strip('\n') for index, raw in enumerate(handle)}
+
+
+def shuffle_and_overwrite(file_name):
+    """Shuffle the lines of `file_name` in place (the file is rewritten).
+
+    The shuffle uses the global `random` state.
+    """
+    # close the read handle before the file is reopened for writing
+    with open(file_name, 'r') as f:
+        content = f.readlines()
+    random.shuffle(content)
+    with open(file_name, 'w') as f:
+        f.writelines(content)
+
+
+def update_dict(ori_dict, new_dict):
+    """Add the values of new_dict onto ori_dict, key by key, in place.
+
+    An empty (falsy) ori_dict means there is nothing to accumulate onto, so
+    new_dict itself is returned. Only the keys of ori_dict are visited.
+    """
+    if ori_dict:
+        for key in ori_dict:
+            ori_dict[key] += new_dict[key]
+        return ori_dict
+    return new_dict
+
+
+def list_add(ori_list, new_list):
+    """Add new_list onto ori_list element by element, in place, and return ori_list."""
+    for position, _ in enumerate(ori_list):
+        ori_list[position] += new_list[position]
+    return ori_list
+
+
+def load_weights(var_list, weights_file):
+    """
+    Loads and converts pre-trained weights: builds the assign ops that copy a
+    flat binary weights file into the network variables.
+
+    The file is read as five int32 header values (discarded) followed by
+    float32 values, which are consumed sequentially in the order the
+    variables are visited.
+    param:
+        var_list: list of network variables.
+        weights_file: name of the binary file.
+    return:
+        assign_ops: list of tf.assign ops, one per variable handled.
+    NOTE(review): any exception raised while walking var_list is swallowed by
+    the bare except below; the ops collected so far are then returned.
+    """
+    with open(weights_file, "rb") as fp:
+        # skip the 5 leading int32 values, the rest is the float32 payload
+        np.fromfile(fp, dtype=np.int32, count=5)
+        weights = np.fromfile(fp, dtype=np.float32)
+
+    ptr = 0  # read position inside `weights`
+    i = 0  # index of the variable being processed
+    assign_ops = []
+    try:
+        while i < len(var_list) - 1:
+            var1 = var_list[i]
+            var2 = var_list[i + 1]
+            # do something only if we process conv layer
+            # (the test looks at the second to last component of the variable name)
+            # NOTE(review): `i` only advances inside this branch, so a variable
+            # whose scope name lacks 'Conv' would be visited forever - confirm
+            # var_list never contains one.
+            if 'Conv' in var1.name.split('/')[-2]:
+                # check type of next layer
+                if 'BatchNorm' in var2.name.split('/')[-2]:
+                    # load batch norm params: the next four variables are taken
+                    # as gamma, beta, mean, var but filled in the order
+                    # beta, gamma, mean, var
+                    gamma, beta, mean, var = var_list[i + 1:i + 5]
+                    batch_norm_vars = [beta, gamma, mean, var]
+                    # note: the loop variable rebinds the name `var` unpacked above
+                    for var in batch_norm_vars:
+                        shape = var.shape.as_list()
+                        num_params = np.prod(shape)
+                        var_weights = weights[ptr:ptr + num_params].reshape(shape)
+                        ptr += num_params
+                        assign_ops.append(tf.assign(var, var_weights, validate_shape=True))
+                    # we move the pointer by 4, because we loaded 4 variables
+                    i += 4
+                elif 'Conv' in var2.name.split('/')[-2]:
+                    # load biases
+                    bias = var2
+                    bias_shape = bias.shape.as_list()
+                    bias_params = np.prod(bias_shape)
+                    bias_weights = weights[ptr:ptr +
+                                           bias_params].reshape(bias_shape)
+                    ptr += bias_params
+                    assign_ops.append(tf.assign(bias, bias_weights, validate_shape=True))
+                    # we loaded 1 variable
+                    i += 1
+                # we can load weights of conv layer
+                shape = var1.shape.as_list()
+                num_params = np.prod(shape)
+
+                # the values are read with the variable's axes permuted as
+                # (3, 2, 0, 1) ...
+                var_weights = weights[ptr:ptr + num_params].reshape(
+                    (shape[3], shape[2], shape[0], shape[1]))
+                # remember to transpose to column-major
+                # ... and transposed back so that the result has the variable's shape
+                var_weights = np.transpose(var_weights, (2, 3, 1, 0))
+                ptr += num_params
+                assign_ops.append(
+                    tf.assign(var1, var_weights, validate_shape=True))
+                i += 1
+    except:
+        # NOTE(review): silently stops loading on any error (presumably meant
+        # for weight files that cover only part of var_list - confirm)
+        pass
+    return assign_ops
+
+
+def config_learning_rate(args, global_step):
+    """Build the learning rate selected by args.lr_type.
+
+    Supported types: 'exponential', 'cosine_decay', 'cosine_decay_restart',
+    'fixed' and 'piecewise'. Any other value raises ValueError.
+    """
+    lr_type = args.lr_type
+
+    if lr_type == 'exponential':
+        decayed = tf.train.exponential_decay(args.learning_rate_init, global_step, args.lr_decay_freq,
+                                             args.lr_decay_factor, staircase=True, name='exponential_learning_rate')
+        # never go below the configured lower bound
+        return tf.maximum(decayed, args.lr_lower_bound)
+
+    if lr_type == 'cosine_decay':
+        # the warm up epochs are excluded from the decay length when warm up is used
+        warm_up_epochs = float(args.use_warm_up) * args.warm_up_epoch
+        train_steps = (args.total_epoches - warm_up_epochs) * args.train_batch_num
+        amplitude = 0.5 * (args.learning_rate_init - args.lr_lower_bound)
+        return args.lr_lower_bound + amplitude * \
+            (1 + tf.cos(global_step / train_steps * np.pi))
+
+    if lr_type == 'cosine_decay_restart':
+        return tf.train.cosine_decay_restarts(args.learning_rate_init, global_step,
+                                              args.lr_decay_freq, t_mul=2.0, m_mul=1.0,
+                                              name='cosine_decay_learning_rate_restart')
+
+    if lr_type == 'fixed':
+        return tf.convert_to_tensor(args.learning_rate_init, name='fixed_learning_rate')
+
+    if lr_type == 'piecewise':
+        return tf.train.piecewise_constant(global_step, boundaries=args.pw_boundaries, values=args.pw_values,
+                                           name='piecewise_learning_rate')
+
+    raise ValueError('Unsupported learning rate type!')
+
+
+def config_optimizer(optimizer_name, learning_rate, decay=0.9, momentum=0.9):
+    """Create the tf.train optimizer named by optimizer_name.
+
+    Supported names: 'momentum', 'nesterov', 'rmsprop', 'adam' and 'sgd'.
+    `momentum` is used by 'momentum', 'nesterov' and 'rmsprop'; `decay` only
+    by 'rmsprop'. Any other name raises ValueError.
+    """
+    if optimizer_name in ('momentum', 'nesterov'):
+        # both are the momentum optimizer, only the nesterov flag differs
+        return tf.train.MomentumOptimizer(learning_rate, momentum=momentum,
+                                          use_nesterov=(optimizer_name == 'nesterov'))
+    if optimizer_name == 'rmsprop':
+        return tf.train.RMSPropOptimizer(learning_rate, decay=decay, momentum=momentum)
+    if optimizer_name == 'adam':
+        return tf.train.AdamOptimizer(learning_rate)
+    if optimizer_name == 'sgd':
+        return tf.train.GradientDescentOptimizer(learning_rate)
+    raise ValueError('Unsupported optimizer type!')
@@ -0,0 +1,123 @@
+# coding: utf-8
+
+from __future__ import division, print_function
+
+import numpy as np
+import tensorflow as tf
+
+def gpu_nms(boxes, scores, num_classes, max_boxes=50, score_thresh=0.5, nms_thresh=0.5):
+    """
+    Build the TensorFlow ops that perform per-class NMS for a single image.
+
+    params:
+        boxes: tensor of shape [1, 10647, 4] # 10647=(13*13+26*26+52*52)*3, for input 416*416 image
+        scores: tensor of shape [1, 10647, num_classes], score=conf*prob
+        num_classes: total number of classes
+        max_boxes: integer, maximum number of boxes kept per class, default is 50
+        score_thresh: boxes whose score for a class is below this value are
+                      dropped for that class
+        nms_thresh: real value, "intersection over union" threshold used for NMS filtering
+    return:
+        (boxes, score, label) tensors: the kept boxes of all classes concatenated
+        class after class.
+    """
+
+    per_class_boxes, per_class_labels, per_class_scores = [], [], []
+    max_boxes = tf.constant(max_boxes, dtype='int32')
+
+    # NMS handles a single image, so the leading dimension is flattened away;
+    # '-1' because the exact number of boxes is not known
+    flat_boxes = tf.reshape(boxes, [-1, 4])
+    flat_scores = tf.reshape(scores, [-1, num_classes])
+
+    # boolean [num_boxes, num_classes]: which box passes the threshold for which class
+    keep_mask = tf.greater_equal(flat_scores, tf.constant(score_thresh))
+
+    for cls in range(num_classes):
+        # candidates of this class
+        cls_boxes = tf.boolean_mask(flat_boxes, keep_mask[:, cls])
+        cls_scores = tf.boolean_mask(flat_scores[:, cls], keep_mask[:, cls])
+        picked = tf.image.non_max_suppression(boxes=cls_boxes,
+                                              scores=cls_scores,
+                                              max_output_size=max_boxes,
+                                              iou_threshold=nms_thresh, name='nms_indices')
+        # one label (the class index) per picked box
+        per_class_labels.append(tf.ones_like(tf.gather(cls_scores, picked), 'int32') * cls)
+        per_class_boxes.append(tf.gather(cls_boxes, picked))
+        per_class_scores.append(tf.gather(cls_scores, picked))
+
+    boxes_out = tf.concat(per_class_boxes, axis=0)
+    scores_out = tf.concat(per_class_scores, axis=0)
+    labels_out = tf.concat(per_class_labels, axis=0)
+
+    return boxes_out, scores_out, labels_out
+
+
+def py_nms(boxes, scores, max_boxes=50, iou_thresh=0.5):
+    """
+    Pure Python NMS baseline.
+
+    Boxes are visited from the highest to the lowest score; a box is kept when
+    its IoU with every box kept before is <= iou_thresh.
+    The IoU is computed from plain (x2 - x1) * (y2 - y1) areas for both the
+    boxes and their intersection. The earlier version added a +1 offset to the
+    intersection only, which could produce IoU values above 1 or below 0 and
+    let identical small boxes survive.
+
+    Arguments: boxes: shape of [-1, 4], the value of '-1' means that dont know the
+                      exact number of boxes; columns are (x1, y1, x2, y2)
+               scores: shape of [-1,]
+               max_boxes: representing the maximum of boxes to be selected by non_max_suppression
+               iou_thresh: representing iou_threshold for deciding to keep boxes
+    Returns:   list with the indices (into boxes) of the kept boxes, at most max_boxes
+    """
+    assert boxes.shape[1] == 4 and len(scores.shape) == 1
+
+    x1 = boxes[:, 0]
+    y1 = boxes[:, 1]
+    x2 = boxes[:, 2]
+    y2 = boxes[:, 3]
+
+    areas = (x2 - x1) * (y2 - y1)
+    # indices sorted by descending score
+    order = scores.argsort()[::-1]
+
+    keep = []
+    while order.size > 0:
+        i = order[0]
+        keep.append(i)
+        rest = order[1:]
+
+        # intersection of the kept box with every remaining box
+        xx1 = np.maximum(x1[i], x1[rest])
+        yy1 = np.maximum(y1[i], y1[rest])
+        xx2 = np.minimum(x2[i], x2[rest])
+        yy2 = np.minimum(y2[i], y2[rest])
+
+        w = np.maximum(0.0, xx2 - xx1)
+        h = np.maximum(0.0, yy2 - yy1)
+        inter = w * h
+        union = areas[i] + areas[rest] - inter
+        # the floor keeps the division defined for zero-area boxes
+        ovr = inter / np.maximum(union, 1e-10)
+
+        # drop the remaining boxes that overlap the kept one too much
+        order = rest[np.where(ovr <= iou_thresh)[0]]
+
+    return keep[:max_boxes]
+
+
+def cpu_nms(boxes, scores, num_classes, max_boxes=50, score_thresh=0.5, iou_thresh=0.5):
+    """
+    Perform per-class NMS on CPU with py_nms.
+    Arguments:
+        boxes: shape [1, 10647, 4]
+        scores: shape [1, 10647, num_classes]
+    Returns:
+        (boxes, score, label) arrays with the kept boxes of all classes
+        concatenated class after class, or (None, None, None) when no box
+        reaches score_thresh for any class.
+    """
+
+    flat_boxes = boxes.reshape(-1, 4)
+    flat_scores = scores.reshape(-1, num_classes)
+    kept_boxes, kept_scores, kept_labels = [], [], []
+
+    for cls in range(num_classes):
+        cls_column = flat_scores[:, cls]
+        above = np.where(cls_column >= score_thresh)
+        cand_boxes = flat_boxes[above]
+        cand_scores = cls_column[above]
+        if len(cand_boxes) == 0:
+            # no candidate for this class
+            continue
+
+        chosen = py_nms(cand_boxes, cand_scores,
+                        max_boxes=max_boxes, iou_thresh=iou_thresh)
+        kept_boxes.append(cand_boxes[chosen])
+        kept_scores.append(cand_scores[chosen])
+        kept_labels.append(np.ones(len(chosen), dtype='int32') * cls)
+
+    if not kept_boxes:
+        return None, None, None
+
+    return (np.concatenate(kept_boxes, axis=0),
+            np.concatenate(kept_scores, axis=0),
+            np.concatenate(kept_labels, axis=0))
@@ -0,0 +1,35 @@
+# coding: utf-8
+
+from __future__ import division, print_function
+
+import cv2
+import random
+
+
+def get_color_table(class_num, seed=2):
+    '''
+    Build a {class index: [c0, c1, c2]} table of random 0-255 color triples.
+    The global `random` state is re-seeded with `seed`, so the same seed
+    always gives the same table.
+    '''
+    random.seed(seed)
+    return {cls: [random.randint(0, 255) for _ in range(3)] for cls in range(class_num)}
+
+
+def plot_one_box(img, coord, label=None, color=None, line_thickness=None):
+    '''
+    Draw one box, and optionally its label on a filled background, on img in place.
+    coord: [x_min, y_min, x_max, y_max] format coordinates.
+    img: img to plot on.
+    label: str. The label name.
+    color: color passed to cv2; a random 3-value color is drawn when not given.
+    line_thickness: int. rectangle line thickness; derived from the image size
+                    when not given.
+    '''
+    thickness = line_thickness or int(round(0.002 * max(img.shape[0:2])))
+    box_color = color or [random.randint(0, 255) for _ in range(3)]
+
+    top_left = (int(coord[0]), int(coord[1]))
+    bottom_right = (int(coord[2]), int(coord[3]))
+    cv2.rectangle(img, top_left, bottom_right, box_color, thickness=thickness)
+
+    if not label:
+        return
+
+    font_thickness = max(thickness - 1, 1)
+    font_scale = float(thickness) / 3
+    text_size = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
+    # filled background sitting right above the top-left corner of the box
+    background_corner = top_left[0] + text_size[0], top_left[1] - text_size[1] - 3
+    cv2.rectangle(img, top_left, background_corner, box_color, -1)
+    cv2.putText(img, label, (top_left[0], top_left[1] - 2), 0, font_scale, [0, 0, 0],
+                thickness=font_thickness, lineType=cv2.LINE_AA)
+
@@ -0,0 +1,102 @@
+# coding: utf-8
+
+from __future__ import division, print_function
+
+import tensorflow as tf
+import numpy as np
+import argparse
+import cv2
+import time
+
+from utils.misc_utils import parse_anchors, read_class_names
+from utils.nms_utils import gpu_nms
+from utils.plot_utils import get_color_table, plot_one_box
+from utils.data_aug import letterbox_resize
+
+from model import yolov3
+
+parser = argparse.ArgumentParser(description="YOLO-V3 video test procedure.")
+parser.add_argument("input_video", type=str,
+                    help="The path of the input video.")
+parser.add_argument("--anchor_path", type=str, default="./data/yolo_anchors.txt",
+                    help="The path of the anchor txt file.")
+parser.add_argument("--new_size", nargs='*', type=int, default=[416, 416],
+                    help="Resize the input image with `new_size`, size format: [width, height]")
+parser.add_argument("--letterbox_resize", type=lambda x: (str(x).lower() == 'true'), default=True,
+                    help="Whether to use the letterbox resize.")
+parser.add_argument("--class_name_path", type=str, default="./data/coco.names",
+                    help="The path of the class names.")
+parser.add_argument("--restore_path", type=str, default="./data/darknet_weights/yolov3.ckpt",
+                    help="The path of the weights to restore.")
+parser.add_argument("--save_video", type=lambda x: (str(x).lower() == 'true'), default=False,
+                    help="Whether to save the video detection results.")
+args = parser.parse_args()
+
+args.anchors = parse_anchors(args.anchor_path)
+args.classes = read_class_names(args.class_name_path)
+args.num_class = len(args.classes)
+
+color_table = get_color_table(args.num_class)
+
+vid = cv2.VideoCapture(args.input_video)
+video_frame_cnt = int(vid.get(7))
+video_width = int(vid.get(3))
+video_height = int(vid.get(4))
+video_fps = int(vid.get(5))
+
+if args.save_video:
+    fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
+    videoWriter = cv2.VideoWriter('video_result.mp4', fourcc, video_fps, (video_width, video_height))
+
+with tf.Session() as sess:
+    input_data = tf.placeholder(tf.float32, [1, args.new_size[1], args.new_size[0], 3], name='input_data')
+    yolo_model = yolov3(args.num_class, args.anchors)
+    with tf.variable_scope('yolov3'):
+        pred_feature_maps = yolo_model.forward(input_data, False)
+    pred_boxes, pred_confs, pred_probs = yolo_model.predict(pred_feature_maps)
+
+    pred_scores = pred_confs * pred_probs
+
+    boxes, scores, labels = gpu_nms(pred_boxes, pred_scores, args.num_class, max_boxes=200, score_thresh=0.3, nms_thresh=0.45)
+
+    saver = tf.train.Saver()
+    saver.restore(sess, args.restore_path)
+
+    for i in range(video_frame_cnt):
+        ret, img_ori = vid.read()
+        if args.letterbox_resize:
+            img, resize_ratio, dw, dh = letterbox_resize(img_ori, args.new_size[0], args.new_size[1])
+        else:
+            height_ori, width_ori = img_ori.shape[:2]
+            img = cv2.resize(img_ori, tuple(args.new_size))
+        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+        img = np.asarray(img, np.float32)
+        img = img[np.newaxis, :] / 255.
+
+        start_time = time.time()
+        boxes_, scores_, labels_ = sess.run([boxes, scores, labels], feed_dict={input_data: img})
+        end_time = time.time()
+
+        # rescale the coordinates to the original image
+        if args.letterbox_resize:
+            boxes_[:, [0, 2]] = (boxes_[:, [0, 2]] - dw) / resize_ratio
+            boxes_[:, [1, 3]] = (boxes_[:, [1, 3]] - dh) / resize_ratio
+        else:
+            boxes_[:, [0, 2]] *= (width_ori/float(args.new_size[0]))
+            boxes_[:, [1, 3]] *= (height_ori/float(args.new_size[1]))
+
+
+        for i in range(len(boxes_)):
+            x0, y0, x1, y1 = boxes_[i]
+            plot_one_box(img_ori, [x0, y0, x1, y1], label=args.classes[labels_[i]] + ', {:.2f}%'.format(scores_[i] * 100), color=color_table[labels_[i]])
+        cv2.putText(img_ori, '{:.2f}ms'.format((end_time - start_time) * 1000), (40, 40), 0,
+                    fontScale=1, color=(0, 255, 0), thickness=2)
+        cv2.imshow('image', img_ori)
+        if args.save_video:
+            videoWriter.write(img_ori)
+        if cv2.waitKey(1) & 0xFF == ord('q'):
+            break
+
+    vid.release()
+    if args.save_video:
+        videoWriter.release()
@@ -0,0 +1,9 @@
+{
+    "server_count": "1",
+    "server_list": [{
+        "device": [{devices}],
+        "server_id": "127.0.0.1"
+    }],
+    "status": "completed",
+    "version": "1.0"
+}
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Exports the library, Python and tool paths plus runtime flags used for
+# running on Ascend devices.
+# NOTE(review): presumably meant to be sourced by the launch scripts so the
+# exports reach the caller's shell - confirm against the callers.
+
+# main env
+# Use the nnae installation when its directory exists, otherwise fall back to
+# the ascend-toolkit installation.
+if [ -d /usr/local/Ascend/nnae/latest ];then
+
+	# NOTE: LD_LIBRARY_PATH is replaced, not extended; any value set by the
+	# caller is dropped. PYTHONPATH and PATH are appended to.
+	export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/Ascend/driver/tools/hccn_tool/:/usr/local/mpirun4.0/lib
+	export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages
+	export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
+	export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp
+else
+	# NOTE(review): only this branch appends $projectDir to PYTHONPATH; it
+	# expands to nothing unless the caller has set it - confirm this
+	# difference from the nnae branch is intended.
+	export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/local/mpirun4.0/lib
+	export PYTHONPATH=$PYTHONPATH:/usr/local/Ascend/tfplugin/latest/tfplugin/python/site-packages:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe:/usr/local/Ascend/ascend-toolkit/latest//fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/tfplugin/python/site-packages:$projectDir
+	export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:/usr/local/mpirun4.0/bin
+	export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/
+	
+fi
+
+# Runtime switches shared by both installations.
+export NEW_GE_FE_ID=1
+export GE_AICPU_FLAG=1
+export SOC_VERSION=Ascend910
+# Disabled switches kept for reference (presumably graph/model dump
+# debugging aids - TODO confirm before enabling).
+#export DUMP_GE_GRAPH=2
+#export DUMP_GRAPH_LEVEL=3
+#export PRINT_MODEL=1
+export SLOG_PRINT_TO_STDOUT=0
+export HCCL_CONNECT_TIMEOUT=600
+
+
+# system env
+# Remove the size limit on core dump files.
+ulimit -c unlimited
@@ -0,0 +1,53 @@
+
+# setting main path
+MAIN_PATH=$(dirname $(readlink -f $0))
+echo $MAIN_PATH
+
+DEVICE_NUM=$1
+ckpt_path=$2
+
+#echo $1
+#echo $2
+# set env
+export DDK_VERSION_FLAG=1.60.T49.0.B201
+export NEW_GE_FE_ID=1
+export GE_AICPU_FLAG=1
+export SOC_VERSION=Ascend910
+
+export JOB_ID=10087
+export FUSION_TENSOR_SIZE=1000000000
+
+
+export RANK_ID=yolo
+#echo "device_num is  $DEVICE_NUM"
+for((i=0;i<${DEVICE_NUM};i++));
+do
+
+export RANK_SIZE=$DEVICE_NUM
+export DEVICE_ID=$i
+export DEVICE_INDEX=$i
+
+#su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[debug]\" --device "$RANK_ID
+cd ${MAIN_PATH}/../result
+if [ x"${ckpt_path}" == x"" ];then
+    lastresult=$(ls -t | grep -E "Train*" | head -n 1)
+    RESTORE_PATH=${lastresult}/${i}/training/
+   
+else
+    lastresult=${ckpt_path}
+    RESTORE_PATH=${ckpt_path}/${i}/training/
+   
+fi
+echo $RESTORE_PATH
+ python3.7 ${MAIN_PATH}/../code/eval.py \
+--save_json True \
+--score_thresh 0.0001 \
+--nms_thresh 0.55 \
+--max_boxes 100 \
+--restore_path $RESTORE_PATH \
+--max_test 10000 \
+--save_json_path eval_res_D$DEVICE_NUM.json > ${lastresult}/eval_$i.out 2>&1
+
+done
+
+
@@ -0,0 +1,77 @@
+#!/bin/bash
+
+rank_size=$1
+yamlPath=$2
+toolsPath=$3
+if [ -f /.dockerenv ];then
+        CLUSTER=$4
+        MPIRUN_ALL_IP="$5"
+        export CLUSTER=${CLUSTER}
+fi
+currentDir=$(cd "$(dirname "$0")/.."; pwd)
+
+# 从 yaml 获取配置
+eval $(${toolsPath}/get_params_for_yaml.sh ${yamlPath} "tensorflow_config")
+source ${currentDir}/config/npu_set_env.sh
+
+if [ x"$runmode" != x"evaluate" ];then
+    currtime=`date +%Y%m%d%H%M%S`
+    mkdir -p ${currentDir%train*}/train/result/tf_yolov3/training_job_${currtime}/
+    train_job_dir=${currentDir%train*}/train/result/tf_yolov3/training_job_${currtime}/
+    echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] ${train_job_dir} &"
+fi
+
+
+# device 列表, 若无指定 device 根据 rank_size 顺序选择
+eval device_group=\$device_group_${rank_size}p
+if [ x"${device_group}" == x"" ] || [ ${rank_size} -ge 8 ];then
+    device_group="$(seq 0 "$(expr $rank_size - 1)")"
+fi
+
+# get last device id in device_group, hw log in performance from the dir named first_device_id
+device_group_str=`echo ${device_group} | sed 's/ //g'`
+first_device_id=`echo ${device_group_str: 0:1}`
+
+argsFilePath=${currentDir}/code/args_${mode}.py
+
+#echo "argsFilePath is "${argsFilePath}
+sed -i "0,/batch_size.*$/s//batch_size\ = ${batch_size}/g" ${argsFilePath}
+sed -i "s/save_epoch.*$/save_epoch\ = ${save_epoch}/g" ${argsFilePath}
+sed -i "s/total_epoches =.*$/total_epoches\ = ${total_epoches}/g" ${argsFilePath}
+sed -i 's/\r//g' ${argsFilePath}
+
+if [ x"${CLUSTER}" == x"True" ];then
+    # ln hw log
+    ln -snf ${train_job_dir}/0/hw_yolov3.log ${train_job_dir}
+    this_ip=$(hostname -I |awk '{print $1}')
+    for ip in $MPIRUN_ALL_IP;do
+        if [ x"$ip" != x"$this_ip" ];then
+            scp $yamlPath root@$ip:$yamlPath
+            scp $argsFilePath root@$ip:$argsFilePath
+        fi
+    done
+    export PATH=$PATH:/usr/local/mpirun4.0/bin
+    mpirun -H ${mpirun_ip} \
+    --bind-to none -map-by slot\
+    --allow-run-as-root \
+    --mca btl_tcp_if_exclude lo,docker0,endvnic,virbr0,vethf40501b,docker_gwbridge,br-f42ac38052b4\
+    --prefix /usr/local/mpirun4.0/ \
+    ${currentDir}/scripts/train.sh 0 $rank_size $yamlPath $currtime ${toolsPath} ${CLUSTER}
+elif [ $runmode == "train" ];then
+    ln -snf ${train_job_dir}/${first_device_id}/hw_yolov3.log ${train_job_dir}
+    rank_id=0
+    for device_id in $device_group;do
+      #echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] start: train ${device_id} & " >> ${currentDir}/result/main.log
+      ${currentDir}/scripts/train.sh $device_id $rank_size $yamlPath $currtime ${toolsPath} $rank_id&
+      let rank_id++
+    done
+else
+    echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] ${ckpt_path} &"
+    ln -snf ${train_job_dir}/${first_device_id}/hw_yolov3.log ${train_job_dir}
+    bash ${currentDir}/scripts/eval.sh ${rank_size} ${ckpt_path}
+fi
+
+wait
+
+#echo "[`date +%Y%m%d-%H:%M:%S`] [INFO] all train exit " >> ${currentDir}/result/main.log
+
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326`