[add]上传训练benchmark by z00560161

This commit is contained in:
liang_chaoming@huawei.com
2020-10-19 20:22:23 +08:00
parent 22b83024f5
commit 82522e2f61
1225 changed files with 345421 additions and 0 deletions
@@ -0,0 +1,281 @@
# Copyright 2018 Google. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""COCO-style evaluation metrics.
Implements the interface of COCO API and metric_fn in tf.TPUEstimator.
COCO API: github.com/cocodataset/cocoapi/
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import atexit
import tempfile
import time
from absl import flags
import numpy as np
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import six
#COCO = coco.COCO
#COCOeval = coco.COCOeval
import tensorflow as tf
import ssd_constants
# Shared absl flag registry; compute_map reads `val_json_file` from it when no
# ground-truth object is supplied.
FLAGS = flags.FLAGS
# https://github.com/cocodataset/cocoapi/issues/49
# Under Python 3, give the pycocotools.coco module a `unicode` attribute that
# aliases `str` (workaround for the issue linked above).
if six.PY3:
  import pycocotools.coco
  pycocotools.coco.unicode = str
def create_coco(val_json_file, use_cpp_extension=True):
  """Creates the COCO ground-truth helper object and returns it.

  Args:
    val_json_file: path to the COCO annotation JSON file. A path starting with
      'gs://' is first copied to a local temporary '.json' file, whose removal
      is registered with `atexit`.
    use_cpp_extension: kept for interface compatibility. This file binds no
      C++ `coco` module (only pycocotools is imported), so the pycocotools
      `COCO` class is used either way; a warning is logged when the C++
      extension was requested.

  Returns:
    A pycocotools `COCO` object built from the (local) annotation file.
  """
  if val_json_file.startswith('gs://'):
    import os  # pylint: disable=g-import-not-at-top
    fd, local_val_json = tempfile.mkstemp(suffix='.json')
    # mkstemp hands back an open OS-level descriptor; close it right away so it
    # is not leaked (only the generated path is needed).
    os.close(fd)
    # Remove the empty placeholder so the copy below creates the file itself.
    tf.gfile.Remove(local_val_json)
    tf.gfile.Copy(val_json_file, local_val_json)
    atexit.register(tf.gfile.Remove, local_val_json)
  else:
    local_val_json = val_json_file

  if use_cpp_extension:
    # The previous `coco.COCO(local_val_json, False)` call raised NameError
    # because no `coco` name exists in this module.
    tf.logging.warning(
        'C++ COCO extension is not available in this module; '
        'falling back to pycocotools.')
  return COCO(local_val_json)
def compute_map(labels_and_predictions,
                coco_gt,
                use_cpp_extension=True,
                nms_on_tpu=True):
  """Use model predictions to compute mAP.

  The evaluation code is largely copied from the MLPerf reference
  implementation. While it is possible to write the evaluation as a tensor
  metric and use Estimator.evaluate(), this approach was selected for simplicity
  and ease of testing.

  Args:
    labels_and_predictions: the prediction outputs. With `nms_on_tpu` it is an
      iterable of iterables whose items are converted to float32 arrays and
      flattened into rows of 7 values. Otherwise it is an iterable of dicts
      holding 'pred_box', 'pred_scores', 'indices' plus the
      ssd_constants.RAW_SHAPE / SOURCE_ID (and optional IS_PADDED) entries.
    coco_gt: ground truth COCO object. When None it is created from
      FLAGS.val_json_file.
    use_cpp_extension: request the C++ cocoeval API. It is only used when the
      ground-truth object actually exposes that API (`LoadRes`); otherwise the
      pycocotools API is used.
    nms_on_tpu: whether NMS was already done on device. When False, NMS is run
      here through `decode_single`.

  Returns:
    A dict mapping 'COCO/<metric name>' to the corresponding COCO statistic.
  """
  predictions = []
  tic = time.time()

  if nms_on_tpu:
    # Detections are already final; just flatten them into [-1, 7] rows.
    chunks = [np.array(j, dtype=np.float32)
              for i in labels_and_predictions for j in i]
    predictions = np.concatenate(chunks).reshape((-1, 7))
  else:
    k = 0
    for example in labels_and_predictions:
      # Skip examples flagged as batch padding.
      if ssd_constants.IS_PADDED in example and example[
          ssd_constants.IS_PADDED]:
        continue
      # Running count of evaluated examples, printed as progress output.
      print(k)
      k += 1

      htot, wtot, _ = example[ssd_constants.RAW_SHAPE]
      pred_box = example['pred_box']
      pred_scores = example['pred_scores']
      indices = example['indices']
      loc, label, prob = decode_single(
          pred_box, pred_scores, indices, ssd_constants.OVERLAP_CRITERIA,
          ssd_constants.MAX_NUM_EVAL_BOXES, ssd_constants.MAX_NUM_EVAL_BOXES)

      for loc_, label_, prob_ in zip(loc, label, prob):
        # Ordering convention differs, hence [1], [0] rather than [0], [1]
        predictions.append([
            int(example[ssd_constants.SOURCE_ID]),
            loc_[1] * wtot, loc_[0] * htot, (loc_[3] - loc_[1]) * wtot,
            (loc_[2] - loc_[0]) * htot, prob_,
            ssd_constants.CLASS_INV_MAP[label_]
        ])

  toc = time.time()
  tf.logging.info('Prepare predictions DONE (t={:0.2f}s).'.format(toc - tic))

  if coco_gt is None:
    coco_gt = create_coco(
        FLAGS.val_json_file, use_cpp_extension=use_cpp_extension)

  # A pycocotools ground truth has no `LoadRes`, so taking the C++ branch
  # purely on the flag (whose default is True) failed with AttributeError.
  # Pick the API from what the object really provides.
  if use_cpp_extension and hasattr(coco_gt, 'LoadRes'):
    # NOTE(review): `COCOeval` in this module is the pycocotools class; this
    # branch presumably needs the C++ extension's COCOeval -- confirm before
    # relying on it.
    coco_dt = coco_gt.LoadRes(np.array(predictions, dtype=np.float32))
    coco_eval = COCOeval(coco_gt, coco_dt, iou_type='bbox')
    coco_eval.Evaluate()
    coco_eval.Accumulate()
    coco_eval.Summarize()
    stats = coco_eval.GetStats()
  else:
    coco_dt = coco_gt.loadRes(np.array(predictions))
    coco_eval = COCOeval(coco_gt, coco_dt, iouType='bbox')
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    stats = coco_eval.stats

  print('Current AP: {:.5f}'.format(stats[0]))
  metric_names = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1',
                  'ARmax10', 'ARmax100', 'ARs', 'ARm', 'ARl']
  coco_time = time.time()
  tf.logging.info('COCO eval DONE (t={:0.2f}s).'.format(coco_time - toc))

  # Prefix with "COCO" to group in TensorBoard.
  return {'COCO/' + key: value for key, value in zip(metric_names, stats)}
def calc_iou(target, candidates):
  """Computes the IoU of one box against every row of a box array.

  Args:
    target: 1-D array of 4 values; the first two are the minimum corner and
      the last two the maximum corner.
    candidates: array with shape [K, 4] using the same layout.

  Returns:
    1-D array with K IoU values, one per candidate row.
  """
  # Give the single box a leading axis so it broadcasts against all rows.
  ref = target[np.newaxis, :]

  # Overlap rectangle: larger of the min corners, smaller of the max corners.
  overlap_min = np.maximum(ref[:, :2], candidates[:, :2])
  overlap_max = np.minimum(ref[:, 2:], candidates[:, 2:])
  # Negative extents mean the boxes do not overlap; clamp them to zero.
  overlap_extent = np.maximum(overlap_max - overlap_min, 0)
  overlap_area = overlap_extent[:, 0] * overlap_extent[:, 1]

  ref_extent = ref[:, 2:] - ref[:, :2]
  ref_area = ref_extent[:, 0] * ref_extent[:, 1]
  cand_extent = candidates[:, 2:] - candidates[:, :2]
  cand_area = cand_extent[:, 0] * cand_extent[:, 1]

  return overlap_area / (ref_area + cand_area - overlap_area)
def decode_single(bboxes_in,
                  scores_in,
                  indices,
                  criteria,
                  max_output,
                  max_num=200):
  """Implement Non-maximum suppression.

  Reference to https://github.com/amdegroot/ssd.pytorch

  Args:
    bboxes_in: a Tensor with shape [N, 4], which stacks box regression outputs
      on all feature levels. The N is the number of total anchors on all levels.
    scores_in: a Tensor with shape [ssd_constants.MAX_NUM_EVAL_BOXES,
      num_classes]. The top ssd_constants.MAX_NUM_EVAL_BOXES box scores for each
      class.
    indices: a Tensor with shape [ssd_constants.MAX_NUM_EVAL_BOXES,
      num_classes]. The indices for these top boxes for each class.
    criteria: a float number to specify the threshold of NMS.
    max_output: maximum output length.
    max_num: maximum number of boxes before NMS.

  Returns:
    boxes, labels and scores after NMS. When nothing passes the score
    threshold, a single dummy box with ssd_constants.DUMMY_SCORE is returned.
  """
  bboxes_out = []
  scores_out = []
  labels_out = []

  for i, score in enumerate(np.split(scores_in, scores_in.shape[1], 1)):
    # skip background
    if i == 0:
      continue

    # Indexing with an index array / boolean mask yields copies, so the
    # in-place clamping below never modifies `bboxes_in`.
    bboxes = bboxes_in[indices[:, i], :]
    score = np.squeeze(score, 1)

    mask = score > ssd_constants.MIN_SCORE
    if not np.any(mask):
      continue
    bboxes, score = bboxes[mask, :], score[mask]

    # Repair malformed boxes instead of discarding them: negative coordinates
    # are replaced by a tiny positive value, and a max corner that does not
    # exceed its min corner is pushed just past it.
    bboxes[bboxes < 0] = 0.00001
    for lo, hi in ((0, 2), (1, 3)):
      collapsed = bboxes[:, lo] >= bboxes[:, hi]
      bboxes[collapsed, hi] = bboxes[collapsed, lo] + 0.00001

    # Ascending score order; only the `max_num` best enter NMS.
    score_idx_sorted = np.argsort(score)[-max_num:]

    candidates = []
    # perform non-maximum suppression
    while len(score_idx_sorted):
      # The best remaining box is always the last entry.
      idx = score_idx_sorted[-1]
      iou = calc_iou(bboxes[idx, :], bboxes[score_idx_sorted, :])
      # Keep only boxes that overlap the winner less than the threshold; the
      # winner itself drops out here as well.
      score_idx_sorted = score_idx_sorted[iou < criteria]
      candidates.append(idx)

    bboxes_out.append(bboxes[candidates, :])
    scores_out.append(score[candidates])
    labels_out.extend([i] * len(candidates))

  if not scores_out:
    tf.logging.info("No objects detected. Returning dummy values.")
    return (
        np.zeros(shape=(1, 4), dtype=np.float32),
        np.zeros(shape=(1,), dtype=np.int32),
        np.ones(shape=(1,), dtype=np.float32) * ssd_constants.DUMMY_SCORE,
    )

  bboxes_out = np.concatenate(bboxes_out, axis=0)
  scores_out = np.concatenate(scores_out, axis=0)
  labels_out = np.array(labels_out)

  # Keep the `max_output` highest-scoring detections across all classes.
  max_ids = np.argsort(scores_out)[-max_output:]
  return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids]
@@ -0,0 +1,369 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Convert raw COCO dataset to TFRecord for object_detection.
Example usage:
python create_coco_tf_record.py --logtostderr \
--image_dir="${TRAIN_IMAGE_DIR}" \
--object_annotations_file="${TRAIN_ANNOTATIONS_FILE}" \
--caption_annotations_file="${CAPTION_ANNOTATIONS_FILE}" \
--output_file_prefix="${OUTPUT_DIR/FILE_PREFIX}" \
--num_shards=32
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import hashlib
import io
import json
import logging
import multiprocessing
import os
from absl import app
from absl import flags
import numpy as np
import PIL.Image
from pycocotools import mask
from research.object_detection.utils import dataset_util
from research.object_detection.utils import label_map_util
import tensorflow.compat.v1 as tf
# Command-line flags of the converter; main() reads them through FLAGS.
flags.DEFINE_boolean(
    'include_masks', False, 'Whether to include instance segmentations masks '
    '(PNG encoded) in the result. default: False.')
flags.DEFINE_string('image_dir', '', 'Directory containing images.')
flags.DEFINE_string(
    'image_info_file', '', 'File containing image information. '
    'Tf Examples in the output files correspond to the image '
    'info entries in this file. If this file is not provided '
    'object_annotations_file is used if present. Otherwise, '
    'caption_annotations_file is used to get image info.')
flags.DEFINE_string(
    'object_annotations_file', '', 'File containing object '
    'annotations - boxes and instance masks.')
flags.DEFINE_string('caption_annotations_file', '', 'File containing image '
                    'captions.')
flags.DEFINE_string('output_file_prefix', '/tmp/train', 'Path to output file')
flags.DEFINE_integer('num_shards', 32, 'Number of shards for output file.')
FLAGS = flags.FLAGS
# Set the TensorFlow logger to INFO at import time.
logger = tf.get_logger()
logger.setLevel(logging.INFO)
def create_tf_example(image,
                      image_dir,
                      bbox_annotations=None,
                      category_index=None,
                      caption_annotations=None,
                      include_masks=False):
  """Converts image and annotations to a tf.Example proto.

  Args:
    image: dict with keys: [u'license', u'file_name', u'coco_url', u'height',
      u'width', u'date_captured', u'flickr_url', u'id']
    image_dir: directory containing the image files.
    bbox_annotations:
      list of dicts with keys: [u'segmentation', u'area', u'iscrowd',
        u'image_id', u'bbox', u'category_id', u'id'] Notice that bounding box
        coordinates in the official COCO dataset are given as [x, y, width,
        height] tuples using absolute coordinates where x, y represent the
        top-left (0-indexed) corner. This function converts to the format
        expected by the Tensorflow Object Detection API (which is
        [ymin, xmin, ymax, xmax] with coordinates normalized relative to image
        size).
    category_index: a dict containing COCO category information keyed by the
      'id' field of each category. See the label_map_util.create_category_index
      function.
    caption_annotations:
      list of dict with keys: [u'id', u'image_id', u'str'].
    include_masks: Whether to include instance segmentations masks
      (PNG encoded) in the result. default: False.

  Returns:
    key: hex SHA-256 digest of the encoded image bytes.
    example: The converted tf.Example
    num_annotations_skipped: Number of (invalid) annotations that were ignored.

  Raises:
    Whatever `PIL.Image.open` raises when the file content cannot be opened
    as an image; no explicit format check is done here.
  """
  img_h = image['height']
  img_w = image['width']
  file_name = image['file_name']
  source_id = image['id']

  with tf.gfile.GFile(os.path.join(image_dir, file_name), 'rb') as reader:
    jpeg_bytes = reader.read()
  # The opened image is not used further; opening it only lets PIL reject
  # content it cannot identify.
  opened_image = PIL.Image.open(io.BytesIO(jpeg_bytes))
  key = hashlib.sha256(jpeg_bytes).hexdigest()

  feature_dict = {
      'image/height': dataset_util.int64_feature(img_h),
      'image/width': dataset_util.int64_feature(img_w),
      'image/filename': dataset_util.bytes_feature(file_name.encode('utf8')),
      'image/source_id':
          dataset_util.bytes_feature(str(source_id).encode('utf8')),
      'image/key/sha256': dataset_util.bytes_feature(key.encode('utf8')),
      'image/encoded': dataset_util.bytes_feature(jpeg_bytes),
      'image/format': dataset_util.bytes_feature('jpeg'.encode('utf8')),
  }

  num_annotations_skipped = 0
  if bbox_annotations:
    xmins, xmaxs, ymins, ymaxs = [], [], [], []
    crowd_flags, class_names, class_ids, areas = [], [], [], []
    mask_pngs = []

    for ann in bbox_annotations:
      x, y, box_w, box_h = tuple(ann['bbox'])
      # Drop boxes with no extent or reaching beyond the image.
      if (box_w <= 0 or box_h <= 0 or
          x + box_w > img_w or y + box_h > img_h):
        num_annotations_skipped += 1
        continue

      # Normalize absolute pixel coordinates by the image size.
      xmins.append(float(x) / img_w)
      xmaxs.append(float(x + box_w) / img_w)
      ymins.append(float(y) / img_h)
      ymaxs.append(float(y + box_h) / img_h)
      crowd_flags.append(ann['iscrowd'])
      class_id = int(ann['category_id'])
      class_ids.append(class_id)
      class_names.append(category_index[class_id]['name'].encode('utf8'))
      areas.append(ann['area'])

      if include_masks:
        rle = mask.frPyObjects(ann['segmentation'], img_h, img_w)
        binary_mask = mask.decode(rle)
        if not ann['iscrowd']:
          # Collapse the last axis of the decoded mask with a maximum.
          binary_mask = np.amax(binary_mask, axis=2)
        png_buffer = io.BytesIO()
        PIL.Image.fromarray(binary_mask).save(png_buffer, format='PNG')
        mask_pngs.append(png_buffer.getvalue())

    feature_dict['image/object/bbox/xmin'] = (
        dataset_util.float_list_feature(xmins))
    feature_dict['image/object/bbox/xmax'] = (
        dataset_util.float_list_feature(xmaxs))
    feature_dict['image/object/bbox/ymin'] = (
        dataset_util.float_list_feature(ymins))
    feature_dict['image/object/bbox/ymax'] = (
        dataset_util.float_list_feature(ymaxs))
    feature_dict['image/object/class/text'] = (
        dataset_util.bytes_list_feature(class_names))
    feature_dict['image/object/class/label'] = (
        dataset_util.int64_list_feature(class_ids))
    feature_dict['image/object/is_crowd'] = (
        dataset_util.int64_list_feature(crowd_flags))
    feature_dict['image/object/area'] = (
        dataset_util.float_list_feature(areas))
    if include_masks:
      feature_dict['image/object/mask'] = (
          dataset_util.bytes_list_feature(mask_pngs))

  if caption_annotations:
    captions = [entry['caption'].encode('utf8')
                for entry in caption_annotations]
    feature_dict['image/caption'] = dataset_util.bytes_list_feature(captions)

  example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
  return key, example, num_annotations_skipped
def _pool_create_tf_example(args):
  """Calls create_tf_example with one packed argument tuple.

  Exists so the worker pool can map over a single-argument callable.

  Args:
    args: sequence of positional arguments for create_tf_example.

  Returns:
    Whatever create_tf_example returns for those arguments.
  """
  result = create_tf_example(*args)
  return result
def _load_object_annotations(object_annotations_file):
  """Loads the object annotation JSON file and indexes it by image id.

  Args:
    object_annotations_file: path to a JSON file with 'images', 'categories'
      and 'annotations' entries.

  Returns:
    A tuple of (defaultdict mapping image id to its list of annotations,
    category index built by label_map_util.create_category_index).
  """
  with tf.gfile.GFile(object_annotations_file, 'r') as reader:
    payload = json.load(reader)

  image_entries = payload['images']
  category_index = label_map_util.create_category_index(payload['categories'])

  annotations_by_image = collections.defaultdict(list)
  logging.info('Building bounding box index.')
  for ann in payload['annotations']:
    annotations_by_image[ann['image_id']].append(ann)

  # Membership tests do not create defaultdict entries, so the index stays
  # limited to images that really have annotations.
  num_missing = sum(
      1 for entry in image_entries if entry['id'] not in annotations_by_image)
  logging.info('%d images are missing bboxes.', num_missing)

  return annotations_by_image, category_index
def _load_caption_annotations(caption_annotations_file):
  """Loads the caption annotation JSON file and indexes it by image id.

  Args:
    caption_annotations_file: path to a JSON file with 'annotations' and
      'images' entries.

  Returns:
    A defaultdict mapping image id to its list of caption annotations.
  """
  with tf.gfile.GFile(caption_annotations_file, 'r') as reader:
    payload = json.load(reader)

  captions_by_image = collections.defaultdict(list)
  logging.info('Building caption index.')
  for ann in payload['annotations']:
    captions_by_image[ann['image_id']].append(ann)

  # Count images that ended up without any caption (logged only).
  num_missing = sum(
      1 for entry in payload['images'] if entry['id'] not in captions_by_image)
  logging.info('%d images are missing captions.', num_missing)

  return captions_by_image
def _load_images_info(images_info_file):
  """Returns the 'images' entry of a JSON file.

  Args:
    images_info_file: path to a JSON file with an 'images' entry.

  Returns:
    The value stored under the 'images' key.
  """
  with tf.gfile.GFile(images_info_file, 'r') as reader:
    parsed = json.load(reader)
  image_entries = parsed['images']
  return image_entries
def _create_tf_record_from_coco_annotations(images_info_file,
                                            image_dir,
                                            output_path,
                                            num_shards,
                                            object_annotations_file=None,
                                            caption_annotations_file=None,
                                            include_masks=False):
  """Loads COCO annotation json files and converts to tf.Record format.

  Args:
    images_info_file: JSON file containing image info. The number of tf.Examples
      in the output tf Record files is exactly equal to the number of image info
      entries in this file. This can be any of train/val/test annotation json
      files Eg. 'image_info_test-dev2017.json',
      'instance_annotations_train2017.json',
      'caption_annotations_train2017.json', etc.
    image_dir: Directory containing the image files.
    output_path: Path to output tf.Record file.
    num_shards: Number of output files to create.
    object_annotations_file: JSON file containing bounding box annotations.
    caption_annotations_file: JSON file containing caption annotations.
    include_masks: Whether to include instance segmentations masks
      (PNG encoded) in the result. default: False.
  """
  logging.info('writing to output path: %s', output_path)
  writers = [
      tf.python_io.TFRecordWriter(
          output_path + '-%05d-of-%05d.tfrecord' % (i, num_shards))
      for i in range(num_shards)
  ]
  total_num_annotations_skipped = 0

  # Everything after opening the writers runs under try/finally so the shard
  # files are closed even when loading or conversion fails.
  try:
    images = _load_images_info(images_info_file)

    img_to_obj_annotation = None
    img_to_caption_annotation = None
    category_index = None
    if object_annotations_file:
      img_to_obj_annotation, category_index = (
          _load_object_annotations(object_annotations_file))
    if caption_annotations_file:
      img_to_caption_annotation = (
          _load_caption_annotations(caption_annotations_file))

    def _get_object_annotation(image_id):
      if img_to_obj_annotation:
        return img_to_obj_annotation[image_id]
      else:
        return None

    def _get_caption_annotation(image_id):
      if img_to_caption_annotation:
        return img_to_caption_annotation[image_id]
      else:
        return None

    pool = multiprocessing.Pool()
    try:
      # imap yields results in input order, so `idx` lines up with `images`.
      for idx, (_, tf_example, num_annotations_skipped) in enumerate(
          pool.imap(_pool_create_tf_example,
                    [(image, image_dir, _get_object_annotation(image['id']),
                      category_index, _get_caption_annotation(image['id']),
                      include_masks) for image in images])):
        if idx % 100 == 0:
          logging.info('On image %d of %d', idx, len(images))

        total_num_annotations_skipped += num_annotations_skipped
        # Round-robin the examples over the shard files.
        writers[idx % num_shards].write(tf_example.SerializeToString())
      pool.close()
    except BaseException:
      # Do not leave worker processes running when conversion is aborted.
      pool.terminate()
      raise
    finally:
      pool.join()
  finally:
    for writer in writers:
      writer.close()

  logging.info('Finished writing, skipped %d annotations.',
               total_num_annotations_skipped)
def main(_):
  """Validates the flags and runs the COCO to TFRecord conversion.

  The image list is taken from the first provided file among image_info_file,
  object_annotations_file and caption_annotations_file.

  Args:
    _: unused positional arguments passed by `app.run`.
  """
  assert FLAGS.image_dir, '`image_dir` missing.'
  assert (FLAGS.image_info_file or FLAGS.object_annotations_file or
          FLAGS.caption_annotations_file), ('All annotation files are '
                                            'missing.')

  if FLAGS.image_info_file:
    images_info_file = FLAGS.image_info_file
  elif FLAGS.object_annotations_file:
    images_info_file = FLAGS.object_annotations_file
  else:
    images_info_file = FLAGS.caption_annotations_file

  directory = os.path.dirname(FLAGS.output_file_prefix)
  # dirname() is '' when the prefix has no directory component; the shards then
  # go to the current directory and there is nothing to create.
  if directory and not tf.gfile.IsDirectory(directory):
    tf.gfile.MakeDirs(directory)

  _create_tf_record_from_coco_annotations(images_info_file, FLAGS.image_dir,
                                          FLAGS.output_file_prefix,
                                          FLAGS.num_shards,
                                          FLAGS.object_annotations_file,
                                          FLAGS.caption_annotations_file,
                                          FLAGS.include_masks)
# Script entry point: set the TensorFlow logger to INFO (the same calls are
# also made at module level) and hand control to absl, which parses the flags
# and calls main().
if __name__ == '__main__':
  logger = tf.get_logger()
  logger.setLevel(logging.INFO)
  app.run(main)
@@ -0,0 +1,436 @@
# Copyright 2018 Google. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Data loader and processing."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import itertools as it
import math
import os
import numpy as np
import tensorflow as tf
from object_detection import argmax_matcher
from object_detection import box_list
from object_detection import faster_rcnn_box_coder
from object_detection import preprocessor
from object_detection import region_similarity_calculator
from object_detection import target_assigner
from object_detection import tf_example_decoder
import ssd_constants
def get_rank_size():
  """Returns the RANK_SIZE environment variable as an int.

  Raises:
    KeyError: if RANK_SIZE is not set.
    ValueError: if its value is not an integer literal.
  """
  raw_value = os.environ['RANK_SIZE']
  return int(raw_value)
def get_rank_id():
  """Returns the DEVICE_ID environment variable as an int.

  Raises:
    KeyError: if DEVICE_ID is not set.
    ValueError: if its value is not an integer literal.
  """
  raw_value = os.environ['DEVICE_ID']
  return int(raw_value)
class DefaultBoxes(object):
  """Default bounding boxes for 300x300 5 layer SSD.

  Default bounding boxes generation follows the order of (W, H, anchor_sizes).
  Therefore, the tensor converted from DefaultBoxes has a shape of
  [anchor_sizes, H, W, 4]. The last dimension is the box coordinates; 'ltrb'
  is [ymin, xmin, ymax, xmax] while 'xywh' is [cy, cx, h, w].
  """

  def __init__(self):
    """Builds the box list from the ssd_constants configuration."""
    fk = ssd_constants.IMAGE_SIZE / np.array(ssd_constants.STEPS)

    self.default_boxes = []
    # size of feature and number of feature
    for idx, feature_size in enumerate(ssd_constants.FEATURE_SIZES):
      sk1 = ssd_constants.SCALES[idx] / ssd_constants.IMAGE_SIZE
      sk2 = ssd_constants.SCALES[idx+1] / ssd_constants.IMAGE_SIZE
      # Geometric mean of this level's scale and the next one.
      sk3 = math.sqrt(sk1*sk2)
      all_sizes = [(sk1, sk1), (sk3, sk3)]

      # Each aspect ratio contributes the box and its transposed twin.
      for alpha in ssd_constants.ASPECT_RATIOS[idx]:
        w, h = sk1 * math.sqrt(alpha), sk1 / math.sqrt(alpha)
        all_sizes.append((w, h))
        all_sizes.append((h, w))

      assert len(all_sizes) == ssd_constants.NUM_DEFAULTS[idx]

      for i, j in it.product(range(feature_size), repeat=2):
        # The cell centre is the same for every box shape of this cell.
        cx, cy = (j + 0.5) / fk[idx], (i + 0.5) / fk[idx]
        for w, h in all_sizes:
          box = tuple(np.clip(k, 0, 1) for k in (cy, cx, h, w))
          self.default_boxes.append(box)

    assert len(self.default_boxes) == ssd_constants.NUM_SSD_BOXES

    def to_ltrb(cy, cx, h, w):
      return cy - h / 2, cx - w / 2, cy + h / 2, cx + w / 2

    # For IoU calculation
    self.default_boxes_ltrb = tuple(to_ltrb(*i) for i in self.default_boxes)

  def __call__(self, order='ltrb'):
    """Returns the default boxes in the requested coordinate order.

    Args:
      order: 'ltrb' for corner form or 'xywh' for centre/size form.

    Returns:
      The stored boxes in the requested form.

    Raises:
      ValueError: if `order` is neither 'ltrb' nor 'xywh' (previously this
        silently returned None).
    """
    if order == 'ltrb':
      return self.default_boxes_ltrb
    if order == 'xywh':
      return self.default_boxes
    raise ValueError(
        "Unknown box order {!r}; expected 'ltrb' or 'xywh'.".format(order))
def calc_iou_tensor(box1, box2):
  """Computes pairwise IoU between two sets of boxes.

  Reference to https://github.com/kuangliu/pytorch-ssd

  Args:
    box1: tensor with shape (N, 4).
    box2: tensor with shape (M, 4).

  Returns:
    Tensor with shape (N, M) holding the IoU of every box1/box2 pair.
  """
  num_first = tf.shape(box1)[0]
  num_second = tf.shape(box2)[0]

  # Expand both sets to a common (N, M, 4) grid of box pairs.
  grid_first = tf.tile(tf.expand_dims(box1, axis=1), (1, num_second, 1))
  grid_second = tf.tile(tf.expand_dims(box2, axis=0), (num_first, 1, 1))

  # Overlap rectangle of each pair; non-overlapping pairs clamp to zero.
  overlap_min = tf.maximum(grid_first[:, :, :2], grid_second[:, :, :2])
  overlap_max = tf.minimum(grid_first[:, :, 2:], grid_second[:, :, 2:])
  overlap_extent = tf.maximum(overlap_max - overlap_min, 0)
  overlap_area = overlap_extent[:, :, 0] * overlap_extent[:, :, 1]

  extent_first = grid_first[:, :, 2:] - grid_first[:, :, :2]
  area_first = extent_first[:, :, 0] * extent_first[:, :, 1]
  extent_second = grid_second[:, :, 2:] - grid_second[:, :, :2]
  area_second = extent_second[:, :, 0] * extent_second[:, :, 1]

  return overlap_area / (area_first + area_second - overlap_area)
def ssd_crop(image, boxes, classes):
  """IoU biased random crop.

  Repeatedly proposes crops until one is accepted, then crops and resizes the
  image to ssd_constants.IMAGE_SIZE and keeps only the boxes (and their
  classes) whose centre lies inside the crop, re-expressed relative to it.

  Reference: https://github.com/chauhan-utk/ssd.DomainAdaptation

  Args:
    image: image tensor; a batch axis is added before crop_and_resize, so it
      is presumably rank 3 -- TODO confirm against the caller.
    boxes: tensor indexed as [num_boxes, 4].
    classes: tensor whose first axis matches `boxes`.

  Returns:
    A tuple (cropped_image, cropped_boxes, cropped_classes).
  """
  num_boxes = tf.shape(boxes)[0]

  def no_crop_check():
    # True when a uniform sample from [0, 1) falls below P_NO_CROP_PER_PASS.
    return (tf.random_uniform(shape=(), minval=0, maxval=1, dtype=tf.float32)
            < ssd_constants.P_NO_CROP_PER_PASS)

  def no_crop_proposal():
    # "Accepted" proposal covering the full [0, 0, 1, 1] range, all boxes kept.
    return (
        tf.ones((), tf.bool),
        tf.convert_to_tensor([0, 0, 1, 1], dtype=tf.float32),
        tf.ones((num_boxes,), tf.bool),
    )

  def crop_proposal():
    # Draws NUM_CROP_PASSES candidate crops at once, one per row.
    rand_vec = lambda minval, maxval: tf.random_uniform(
        shape=(ssd_constants.NUM_CROP_PASSES, 1), minval=minval, maxval=maxval,
        dtype=tf.float32)

    width, height = rand_vec(0.3, 1), rand_vec(0.3, 1)
    left, top = rand_vec(0, 1-width), rand_vec(0, 1-height)

    right = left + width
    bottom = top + height

    # NOTE(review): the left/top/right/bottom names look like labels only; the
    # DefaultBoxes docstring in this file uses 'ltrb' for
    # [ymin, xmin, ymax, xmax] -- confirm the axis convention of `boxes`.
    ltrb = tf.concat([left, top, right, bottom], axis=1)

    # One IoU threshold, picked at random, is shared by all candidates.
    min_iou = tf.random_shuffle(ssd_constants.CROP_MIN_IOU_CHOICES)[0]
    ious = calc_iou_tensor(ltrb, boxes)

    # discard any bboxes whose center not in the cropped image
    xc, yc = [tf.tile(0.5 * (boxes[:, i + 0] + boxes[:, i + 2])[tf.newaxis, :],
                      (ssd_constants.NUM_CROP_PASSES, 1)) for i in range(2)]

    masks = tf.reduce_all(tf.stack([
        tf.greater(xc, tf.tile(left, (1, num_boxes))),
        tf.less(xc, tf.tile(right, (1, num_boxes))),
        tf.greater(yc, tf.tile(top, (1, num_boxes))),
        tf.less(yc, tf.tile(bottom, (1, num_boxes))),
    ], axis=2), axis=2)

    # Checks of whether a crop is valid.
    valid_aspect = tf.logical_and(tf.less(height/width, 2),
                                  tf.less(width/height, 2))
    # Every box must overlap the candidate by more than min_iou ...
    valid_ious = tf.reduce_all(tf.greater(ious, min_iou), axis=1, keepdims=True)
    # ... and at least one box centre must fall inside it.
    valid_masks = tf.reduce_any(masks, axis=1, keepdims=True)

    valid_all = tf.cast(tf.reduce_all(tf.concat(
        [valid_aspect, valid_ious, valid_masks], axis=1), axis=1), tf.int32)

    # One indexed, as zero is needed for the case of no matches.
    index = tf.range(1, 1 + ssd_constants.NUM_CROP_PASSES, dtype=tf.int32)

    # Either one-hot, or zeros if there is no valid crop.
    # (The maximum picks the valid candidate with the highest index.)
    selection = tf.equal(tf.reduce_max(index * valid_all), index)

    use_crop = tf.reduce_any(selection)
    # Multiplying by the one-hot selection and summing extracts the chosen row.
    output_ltrb = tf.reduce_sum(tf.multiply(ltrb, tf.tile(tf.cast(
        selection, tf.float32)[:, tf.newaxis], (1, 4))), axis=0)
    output_masks = tf.reduce_any(tf.logical_and(masks, tf.tile(
        selection[:, tf.newaxis], (1, num_boxes))), axis=0)

    return use_crop, output_ltrb, output_masks

  def proposal(*args):
    # Loop body: ignores the previous loop state and draws a fresh proposal.
    return tf.cond(
        pred=no_crop_check(),
        true_fn=no_crop_proposal,
        false_fn=crop_proposal,
    )

  # Keep proposing until the first loop variable (proposal accepted) is True.
  _, crop_bounds, box_masks = tf.while_loop(
      cond=lambda x, *_: tf.logical_not(x),
      body=proposal,
      loop_vars=[tf.zeros((), tf.bool), tf.zeros((4,), tf.float32), tf.zeros((num_boxes,), tf.bool)],
  )

  filtered_boxes = tf.boolean_mask(boxes, box_masks, axis=0)

  # Clip boxes to the cropped region.
  filtered_boxes = tf.stack([
      tf.maximum(filtered_boxes[:, 0], crop_bounds[0]),
      tf.maximum(filtered_boxes[:, 1], crop_bounds[1]),
      tf.minimum(filtered_boxes[:, 2], crop_bounds[2]),
      tf.minimum(filtered_boxes[:, 3], crop_bounds[3]),
  ], axis=1)

  left = crop_bounds[0]
  top = crop_bounds[1]
  width = crop_bounds[2] - left
  height = crop_bounds[3] - top

  # Re-express the clipped boxes relative to the crop window.
  cropped_boxes = tf.stack([
      (filtered_boxes[:, 0] - left) / width,
      (filtered_boxes[:, 1] - top) / height,
      (filtered_boxes[:, 2] - left) / width,
      (filtered_boxes[:, 3] - top) / height,
  ], axis=1)

  # Crop the single image (batch of one) and resize it to the model input size.
  cropped_image = tf.image.crop_and_resize(
      image=image[tf.newaxis, :, :, :],
      boxes=crop_bounds[tf.newaxis, :],
      box_ind=tf.zeros((1,), tf.int32),
      crop_size=(ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE),
  )[0, :, :, :]

  cropped_classes = tf.boolean_mask(classes, box_masks, axis=0)

  return cropped_image, cropped_boxes, cropped_classes
def color_jitter(image, brightness=0, contrast=0, saturation=0, hue=0):
  """Randomly distorts the colors of an image.

  Each distortion is applied only when its strength is positive, always in
  the order brightness, contrast, saturation, hue.

  Args:
    image: The input image tensor.
    brightness: A float, maximum delta for the random brightness change.
    contrast: A float; the contrast factor range is
      [1 - contrast, 1 + contrast].
    saturation: A float; the saturation factor range is
      [1 - saturation, 1 + saturation].
    hue: A float, maximum delta for the random hue change.

  Returns:
    The distorted image tensor.
  """
  with tf.name_scope('distort_color'):
    distorted = image
    if brightness > 0:
      distorted = tf.image.random_brightness(distorted, max_delta=brightness)
    if contrast > 0:
      contrast_low, contrast_high = 1-contrast, 1+contrast
      distorted = tf.image.random_contrast(
          distorted, lower=contrast_low, upper=contrast_high)
    if saturation > 0:
      saturation_low, saturation_high = 1-saturation, 1+saturation
      distorted = tf.image.random_saturation(
          distorted, lower=saturation_low, upper=saturation_high)
    if hue > 0:
      distorted = tf.image.random_hue(distorted, max_delta=hue)
    return distorted
def encode_labels(gt_boxes, gt_labels):
  """Labels anchors with ground truth inputs.

  Args:
    gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
      For each row, it stores [y0, x0, y1, x1] for four corners of a box.
    gt_labels: A integer tensor with shape [N, 1] representing groundtruth
      classes.

  Returns:
    encoded_classes: a tensor with shape [num_anchors, 1].
    encoded_boxes: a tensor with shape [num_anchors, 4].
    num_positives: scalar tensor storing number of positives in an image.
  """
  iou_similarity = region_similarity_calculator.IouSimilarity()
  # The same threshold is used for matched and unmatched.
  anchor_matcher = argmax_matcher.ArgMaxMatcher(
      matched_threshold=ssd_constants.MATCH_THRESHOLD,
      unmatched_threshold=ssd_constants.MATCH_THRESHOLD,
      negatives_lower_than_unmatched=True,
      force_match_for_each_row=True)
  coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
      scale_factors=ssd_constants.BOX_CODER_SCALES)

  anchor_tensor = tf.convert_to_tensor(DefaultBoxes()('ltrb'))
  anchors = box_list.BoxList(anchor_tensor)
  groundtruth = box_list.BoxList(gt_boxes)

  assigner = target_assigner.TargetAssigner(
      iou_similarity, anchor_matcher, coder)
  encoded_classes, _, encoded_boxes, _, matches = assigner.assign(
      anchors, groundtruth, gt_labels)

  # Count the entries of match_results that differ from -1.
  is_matched = tf.not_equal(matches.match_results, -1)
  num_matched_boxes = tf.reduce_sum(tf.cast(is_matched, tf.float32))
  return encoded_classes, encoded_boxes, num_matched_boxes
class SSDInputReader(object):
"""Input reader for dataset."""
def __init__(self,
file_pattern,
transpose_input=False,
is_training=False,
distributed_eval=False,
count=-1):
self._file_pattern = file_pattern
self._transpose_input = transpose_input
self._is_training = is_training
self._distributed_eval = distributed_eval
self._count = count
def __call__(self, params):
"""Builds the tf.data.Dataset that feeds the SSD model.

Files matching `self._file_pattern` are listed, optionally sharded, read as
TFRecords, decoded, preprocessed by `_parse_example` and batched.

Args:
params: mapping; only `params['batch_size']` is read here.

Returns:
A batched `tf.data.Dataset`. When `self._is_training` is true each element
is an `(image, labels)` pair; otherwise each element is a dict keyed by
the `ssd_constants` names IMAGE, BOXES, CLASSES, SOURCE_ID and RAW_SHAPE.
"""
example_decoder = tf_example_decoder.TfExampleDecoder()
# Converts one decoded example into model inputs. The structure of the
# return value depends on self._is_training (see the two branches below).
def _parse_example(data):
with tf.name_scope('augmentation'):
source_id = data['source_id']
image = data['image'] # dtype uint8
# Shape of the image before any crop/resize; only returned on the eval path.
raw_shape = tf.shape(image)
boxes = data['groundtruth_boxes']
# Reshape to a column so each row holds one class id.
classes = tf.reshape(data['groundtruth_classes'], [-1, 1])
# Only 80 of the 90 COCO classes are used.
class_map = tf.convert_to_tensor(ssd_constants.CLASS_MAP)
classes = tf.gather(class_map, classes)
classes = tf.cast(classes, dtype=tf.float32)
if self._is_training:
# Training path: crop, scale to [0, 1], flip, color-jitter, then encode
# the ground truth into training targets.
image, boxes, classes = ssd_crop(image, boxes, classes)
# ssd_crop resizes and returns image of dtype float32 and does not
# change its range (i.e., value in between 0--255). Divide by 255.
# converts it to [0, 1] range. Not doing this before cropping to
# avoid dtype cast (which incurs additional memory copy).
image /= 255.0
# random_horizontal_flip() is hard coded to flip with 50% chance.
image, boxes = preprocessor.random_horizontal_flip(
image=image, boxes=boxes)
# TODO(shibow): Investigate the parameters for color jitter.
image = color_jitter(
image, brightness=0.125, contrast=0.5, saturation=0.5, hue=0.05)
# encode_labels is defined outside this view; presumably it matches the
# boxes against the SSD anchors -- confirm against its definition.
encoded_classes, encoded_boxes, num_matched_boxes = encode_labels(
boxes, classes)
# TODO(taylorrobie): Check that this cast is valid.
encoded_classes = tf.cast(encoded_classes, tf.int32)
labels = {
ssd_constants.NUM_MATCHED_BOXES: num_matched_boxes,
ssd_constants.BOXES: encoded_boxes,
ssd_constants.CLASSES: tf.squeeze(encoded_classes, axis=1),
}
return image, labels
else:
# Eval path: resize only (no augmentation) and keep the raw ground truth,
# trimmed/padded to a fixed number of boxes.
image = tf.image.resize_images(
image, size=(ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE))
# resize_image returns image of dtype float32 and does not change its
# range. Divide by 255 to convert image to [0, 1] range.
image /= 255.
def trim_and_pad(inp_tensor, dim_1):
"""Limit the number of boxes, and pad if necessary."""
# Keep at most MAX_NUM_EVAL_BOXES rows, zero-pad the rest, and reshape so
# the result has the static shape [MAX_NUM_EVAL_BOXES, dim_1].
inp_tensor = inp_tensor[:ssd_constants.MAX_NUM_EVAL_BOXES]
num_pad = ssd_constants.MAX_NUM_EVAL_BOXES - tf.shape(inp_tensor)[0]
inp_tensor = tf.pad(inp_tensor, [[0, num_pad], [0, 0]])
return tf.reshape(
inp_tensor, [ssd_constants.MAX_NUM_EVAL_BOXES, dim_1])
boxes, classes = trim_and_pad(boxes, 4), trim_and_pad(classes, 1)
sample = {
ssd_constants.IMAGE: image,
ssd_constants.BOXES: boxes,
ssd_constants.CLASSES: classes,
# source_id arrives as a string and is parsed into an int32.
ssd_constants.SOURCE_ID: tf.string_to_number(source_id, tf.int32),
ssd_constants.RAW_SHAPE: raw_shape,
}
return sample
batch_size = params['batch_size']
# shuffle=False keeps the file listing order fixed before sharding.
dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)
# get_rank_size()/get_rank_id() are defined outside this view; presumably
# the number of participating devices and this device's index -- confirm.
if self._is_training or self._distributed_eval:
if get_rank_size() == 1:
dataset = dataset.shard(1, 0)
else:
dataset = dataset.shard(get_rank_size(), get_rank_id())
if self._is_training:
# Shuffles the file names (not the records) with a buffer of 256.
dataset = dataset.shuffle( tf.to_int64(256))
# Prefetch data from files.
def _prefetch_dataset(filename):
dataset = tf.data.TFRecordDataset(filename).prefetch(1)
return dataset
# Reads up to 32 files concurrently; sloppy ordering is enabled only for
# training.
dataset = dataset.apply(
tf.data.experimental.parallel_interleave(
_prefetch_dataset, cycle_length=32, sloppy=self._is_training))
# Parse the fetched records to input tensors for model function.
dataset = dataset.map(example_decoder.decode, num_parallel_calls=64)
if self._is_training:
# Pair every example with a flag telling whether it has at least one
# ground-truth box, drop the examples without boxes, then strip the flag.
dataset = dataset.map(
# pylint: disable=g-long-lambda
lambda data: (data,
tf.greater(tf.shape(data['groundtruth_boxes'])[0], 0)),
num_parallel_calls=64)
dataset = dataset.filter(lambda data, pred: pred)
# Record-level shuffle (buffer of 64), repeated indefinitely.
dataset = dataset.shuffle(64).repeat()
dataset = dataset.map(lambda data, pred: data) # use the first value
dataset = dataset.map(_parse_example, num_parallel_calls=64)
# drop_remainder=True discards a final batch smaller than batch_size.
dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)
else:
dataset = dataset.prefetch(batch_size * 64)
dataset = dataset.map(_parse_example, num_parallel_calls=64)
# drop_remainder=True discards a final batch smaller than batch_size.
dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)
# AUTOTUNE lets tf.data choose the prefetch buffer size.
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
# Threading settings applied to this dataset's pipeline.
options = tf.data.Options()
options.experimental_threading.max_intra_op_parallelism = 1
options.experimental_threading.private_threadpool_size = 48
dataset = dataset.with_options(options)
return dataset
@@ -0,0 +1,24 @@
#!/bin/bash
# Launches ssd_main.py in train_and_eval mode for one device, then copies the
# NPU logs for that device into ./slog.
#
# Usage: <this script> RANK_ID RANK_SIZE CODE_DIR
#   $1 RANK_ID    exported as RANK_ID and reused as DEVICE_ID / DEVICE_INDEX
#   $2 RANK_SIZE  exported as RANK_SIZE
#   $3 CODE_DIR   directory that contains ssd_main.py
#
# The dataset, checkpoint and annotation paths passed below are placeholders
# and must be replaced with real locations before running.
export RANK_ID=$1
export RANK_SIZE=$2
export DEVICE_ID=$RANK_ID
export DEVICE_INDEX=$RANK_ID
export JOB_ID=990
export FUSION_TENSOR_SIZE=1000000000

# Quote the code directory so a path containing spaces is not word-split.
python3 "${3}/ssd_main.py" --mode=train_and_eval \
--train_batch_size=32 \
--training_file_pattern="train_tfrecord_path/train2017*" \
--resnet_checkpoint=resnet34_path/model.ckpt-28152 \
--validation_file_pattern="val_tfrecord_path/val2017*" \
--val_json_file="annotations_patah/instances_val2017.json" \
--eval_batch_size=32 \
--model_dir=result_npu

sleep 2
echo "**************** train finished ***************"

# cp with several source files requires the target to be an existing
# directory, so create it first.
mkdir -p ./slog
cp /var/log/npu/slog/host-0/* ./slog
cp "/var/log/npu/slog/device-${DEVICE_ID}"/* ./slog
cp "/var/log/npu/slog/device-os-${DEVICE_ID}"/* ./slog
@@ -0,0 +1,14 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
@@ -0,0 +1,199 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Argmax matcher implementation.
This class takes a similarity matrix and matches columns to rows based on the
maximum value per column. One can specify matched_threshold
to prevent columns from matching to rows (generally resulting in a negative
training example) and unmatched_threshold to ignore the match (generally
resulting in neither a positive nor a negative training example).
This matcher is used in Fast(er)-RCNN.
Note: matchers are used in TargetAssigners. There is a create_target_assigner
factory function for popular implementations.
"""
import tensorflow as tf
from object_detection import matcher
from object_detection import shape_utils
class ArgMaxMatcher(matcher.Matcher):
  """Matcher based on highest value.

  This class computes matches from a similarity matrix. Each column is matched
  to a single row.

  To support object detection target assignment this class enables setting both
  matched_threshold (upper threshold) and unmatched_threshold (lower threshold)
  defining three categories of similarity which define whether examples are
  positive, negative, or ignored:
  (1) similarity >= matched_threshold: Highest similarity. Matched/Positive!
  (2) matched_threshold > similarity >= unmatched_threshold: Medium similarity.
          Depending on negatives_lower_than_unmatched, this is either
          Unmatched/Negative OR Ignore.
  (3) unmatched_threshold > similarity: Lowest similarity. Depending on flag
          negatives_lower_than_unmatched, either Unmatched/Negative OR Ignore.
  For ignored matches this class sets the values in the Match object to -2;
  unmatched (negative) columns are set to -1.
  """

  def __init__(self,
               matched_threshold,
               unmatched_threshold=None,
               negatives_lower_than_unmatched=True,
               force_match_for_each_row=False):
    """Construct ArgMaxMatcher.

    Args:
      matched_threshold: Threshold for positive matches. Positive if
        sim >= matched_threshold, where sim is the maximum value of the
        similarity matrix for a given column. Set to None for no threshold.
      unmatched_threshold: Threshold for negative matches. Negative if
        sim < unmatched_threshold. Defaults to matched_threshold
        when set to None.
      negatives_lower_than_unmatched: Boolean which defaults to True. If True
        then negative matches are the ones below the unmatched_threshold,
        whereas ignored matches are in between the matched and unmatched
        threshold. If False, then negative matches are in between the matched
        and unmatched threshold, and everything lower than unmatched is ignored.
      force_match_for_each_row: If True, ensures that each row is matched to
        at least one column (which is not guaranteed otherwise if the
        matched_threshold is high). Defaults to False. See
        argmax_matcher_test.testMatcherForceMatch() for an example.

    Raises:
      ValueError: if unmatched_threshold is set but matched_threshold is not
        set, if unmatched_threshold > matched_threshold, or if
        negatives_lower_than_unmatched is False while both thresholds are
        equal.
    """
    if (matched_threshold is None) and (unmatched_threshold is not None):
      # Trailing space matters: adjacent literals are concatenated verbatim.
      raise ValueError('Need to also define matched_threshold when '
                       'unmatched_threshold is defined')
    self._matched_threshold = matched_threshold
    if unmatched_threshold is None:
      self._unmatched_threshold = matched_threshold
    else:
      if unmatched_threshold > matched_threshold:
        raise ValueError('unmatched_threshold needs to be smaller or equal '
                         'to matched_threshold')
      self._unmatched_threshold = unmatched_threshold
    if not negatives_lower_than_unmatched:
      # With equal thresholds the "in between" band is empty, so there would
      # be no negatives at all in this mode.
      if self._unmatched_threshold == self._matched_threshold:
        raise ValueError('When negatives are in between matched and '
                         'unmatched thresholds, these cannot be of equal '
                         'value. matched: %s, unmatched: %s' %
                         (self._matched_threshold, self._unmatched_threshold))
    self._force_match_for_each_row = force_match_for_each_row
    self._negatives_lower_than_unmatched = negatives_lower_than_unmatched

  def _match(self, similarity_matrix):
    """Tries to match each column of the similarity matrix to a row.

    Args:
      similarity_matrix: tensor of shape [N, M] representing any similarity
        metric.

    Returns:
      int32 tensor of shape [M] holding, for each column, the matched row
      index, -1 (unmatched) or -2 (ignored). The base class `match` method is
      expected to wrap it into a Match object.
    """

    def _match_when_rows_are_empty():
      """Performs matching when the rows of similarity matrix are empty.

      When the rows are empty, all detections are false positives. So we return
      a tensor of -1's to indicate that the columns do not match to any rows.

      Returns:
        matches: int32 tensor indicating the row each column matches to.
      """
      similarity_matrix_shape = shape_utils.combined_static_and_dynamic_shape(
          similarity_matrix)
      return -1 * tf.ones([similarity_matrix_shape[1]], dtype=tf.int32)

    def _match_when_rows_are_non_empty():
      """Performs matching when the rows of similarity matrix are non empty.

      Returns:
        matches: int32 tensor indicating the row each column matches to.
      """
      # Matches for each column
      matches = tf.argmax(similarity_matrix, 0, output_type=tf.int32)

      # Deal with matched and unmatched threshold
      if self._matched_threshold is not None:
        # Boolean masks over columns, derived from each column's best score.
        matched_vals = tf.reduce_max(similarity_matrix, 0)
        below_unmatched_threshold = tf.greater(self._unmatched_threshold,
                                               matched_vals)
        between_thresholds = tf.logical_and(
            tf.greater_equal(matched_vals, self._unmatched_threshold),
            tf.greater(self._matched_threshold, matched_vals))

        if self._negatives_lower_than_unmatched:
          # Lowest band -> negative (-1), middle band -> ignored (-2).
          matches = self._set_values_using_indicator(matches,
                                                     below_unmatched_threshold,
                                                     -1)
          matches = self._set_values_using_indicator(matches,
                                                     between_thresholds,
                                                     -2)
        else:
          # Lowest band -> ignored (-2), middle band -> negative (-1).
          matches = self._set_values_using_indicator(matches,
                                                     below_unmatched_threshold,
                                                     -2)
          matches = self._set_values_using_indicator(matches,
                                                     between_thresholds,
                                                     -1)

      if self._force_match_for_each_row:
        similarity_matrix_shape = shape_utils.combined_static_and_dynamic_shape(
            similarity_matrix)
        # For every row, the column with the highest similarity.
        force_match_column_ids = tf.argmax(similarity_matrix, 1,
                                           output_type=tf.int32)
        # [N, M] one-hot: entry (r, c) is 1 when column c is row r's best.
        force_match_column_indicators = tf.one_hot(
            force_match_column_ids, depth=similarity_matrix_shape[1])
        # For every column, a row that selected it (meaningful only where the
        # mask below is True).
        force_match_row_ids = tf.argmax(force_match_column_indicators, 0,
                                        output_type=tf.int32)
        # True for columns that are the best column of at least one row.
        force_match_column_mask = tf.cast(
            tf.reduce_max(force_match_column_indicators, 0), tf.bool)
        # Forced assignments override the threshold-based result.
        final_matches = tf.where(force_match_column_mask,
                                 force_match_row_ids, matches)
        return final_matches
      else:
        return matches

    # Decide statically when the shape is known; otherwise defer the
    # empty/non-empty decision to graph execution with tf.cond.
    if similarity_matrix.shape.is_fully_defined():
      if similarity_matrix.shape[0].value == 0:
        return _match_when_rows_are_empty()
      else:
        return _match_when_rows_are_non_empty()
    else:
      return tf.cond(
          tf.greater(tf.shape(similarity_matrix)[0], 0),
          _match_when_rows_are_non_empty, _match_when_rows_are_empty)

  def _set_values_using_indicator(self, x, indicator, val):
    """Set the indicated fields of x to val.

    Args:
      x: tensor.
      indicator: boolean with same shape as x.
      val: scalar with value to set.

    Returns:
      modified tensor.
    """
    # Arithmetic select: x where indicator is 0, val where indicator is 1.
    indicator = tf.cast(indicator, x.dtype)
    return tf.add(tf.multiply(x, 1 - indicator), val * indicator)
@@ -0,0 +1,151 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Base box coder.
Box coders convert between coordinate frames, namely image-centric
(with (0,0) on the top left of image) and anchor-centric (with (0,0) being
defined by a specific anchor).
Users of a BoxCoder can call two methods:
encode: which encodes a box with respect to a given anchor
(or rather, a tensor of boxes wrt a corresponding tensor of anchors) and
decode: which inverts this encoding with a decode operation.
In both cases, the arguments are assumed to be in 1-1 correspondence already;
it is not the job of a BoxCoder to perform matching.
"""
from abc import ABCMeta
from abc import abstractmethod
from abc import abstractproperty
import tensorflow as tf
# Box coder types.
# String identifiers naming box coder variants. Nothing in this module reads
# them; presumably they are used elsewhere to select a coder -- confirm.
FASTER_RCNN = 'faster_rcnn'
KEYPOINT = 'keypoint'
MEAN_STDDEV = 'mean_stddev'
SQUARE = 'square'
class BoxCoder(object):
  """Interface shared by all box coders.

  Subclasses supply `code_size`, `_encode` and `_decode`; callers use the
  public `encode` / `decode` wrappers, which only add a name scope.
  """
  # NOTE: `__metaclass__` is honored by Python 2 only. Under Python 3 this
  # attribute is inert, so the abstract members below are not enforced.
  __metaclass__ = ABCMeta

  @abstractproperty
  def code_size(self):
    """Number of values in one encoded box.

    The value is constant and must equal the last dimension of what `encode`
    produces (i.e. the codes have shape [N, code_size]). Implementations are
    expected to override this property.

    Returns:
      an integer constant
    """

  def encode(self, boxes, anchors):
    """Encodes boxes relative to anchors inside an 'Encode' name scope.

    Args:
      boxes: BoxList holding N boxes to be encoded
      anchors: BoxList of N anchors

    Returns:
      a tensor representing N relative-encoded boxes
    """
    with tf.name_scope('Encode'):
      rel_codes = self._encode(boxes, anchors)
      return rel_codes

  def decode(self, rel_codes, anchors):
    """Decodes anchor-relative codes inside a 'Decode' name scope.

    Args:
      rel_codes: a tensor representing N relative-encoded boxes
      anchors: BoxList of anchors

    Returns:
      boxlist: BoxList holding N boxes encoded in the ordinary way (i.e.,
        with corners y_min, x_min, y_max, x_max)
    """
    with tf.name_scope('Decode'):
      decoded = self._decode(rel_codes, anchors)
      return decoded

  @abstractmethod
  def _encode(self, boxes, anchors):
    """Encoding hook to be overridden by implementations.

    Args:
      boxes: BoxList holding N boxes to be encoded
      anchors: BoxList of N anchors

    Returns:
      a tensor representing N relative-encoded boxes
    """

  @abstractmethod
  def _decode(self, rel_codes, anchors):
    """Decoding hook to be overridden by implementations.

    Args:
      rel_codes: a tensor representing N relative-encoded boxes
      anchors: BoxList of anchors

    Returns:
      boxlist: BoxList holding N boxes encoded in the ordinary way (i.e.,
        with corners y_min, x_min, y_max, x_max)
    """
def batch_decode(encoded_boxes, box_coder, anchors):
  """Decodes every image's encoded boxes in a batch.

  The batch is split along its first dimension, each slice is decoded with
  `box_coder` against the same `anchors`, and the per-image results are
  stacked back together. Decoded boxes are in [y_min, x_min, y_max, x_max]
  order.

  Args:
    encoded_boxes: a float32 tensor of shape [batch_size, num_anchors,
      code_size] representing the location of the objects.
    box_coder: a BoxCoder object.
    anchors: a BoxList of anchors used to encode `encoded_boxes`.

  Returns:
    decoded_boxes: a float32 tensor of shape [batch_size, num_anchors,
      coder_size] representing the corners of the objects in the order
      of [y_min, x_min, y_max, x_max].

  Raises:
    ValueError: if `encoded_boxes` is not rank 3, or if its second dimension
      differs from the static number of anchors.
  """
  encoded_shape = encoded_boxes.get_shape()
  encoded_shape.assert_has_rank(3)
  num_encoded = encoded_shape[1].value
  num_anchors = anchors.num_boxes_static()
  if num_encoded != num_anchors:
    raise ValueError('The number of anchors inferred from encoded_boxes'
                     ' and anchors are inconsistent: shape[1] of encoded_boxes'
                     ' %s should be equal to the number of anchors: %s.' %
                     (num_encoded, num_anchors))
  per_image_boxes = []
  for image_codes in tf.unstack(encoded_boxes):
    per_image_boxes.append(box_coder.decode(image_codes, anchors).get())
  return tf.stack(per_image_boxes)
@@ -0,0 +1,207 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Bounding Box List definition.
BoxList represents a list of bounding boxes as tensorflow
tensors, where each bounding box is represented as a row of 4 numbers,
[y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes
within a given list correspond to a single image. See also
box_list_ops.py for common box related operations (such as area, iou, etc).
Optionally, users can add additional related fields (such as weights).
We assume the following things to be true about fields:
* they correspond to boxes in the box_list along the 0th dimension
* they have inferrable rank at graph construction time
* all dimensions except for possibly the 0th can be inferred
(i.e., not None) at graph construction time.
Some other notes:
* Following tensorflow conventions, we use height, width ordering,
and correspondingly, y,x (or ymin, xmin, ymax, xmax) ordering
* Tensors are always provided as (flat) [N, 4] tensors.
"""
import tensorflow as tf
class BoxList(object):
  """A set of boxes plus optional per-box fields, stored in a dict.

  The box corners live under the 'boxes' key of `self.data`; any other key is
  an extra field added by the caller.
  """

  def __init__(self, boxes):
    """Creates a box collection from a corner tensor.

    Args:
      boxes: a tensor of shape [N, 4] representing box corners

    Raises:
      ValueError: if invalid dimensions for bbox data or if bbox data is not in
          float32 format.
    """
    shape = boxes.get_shape()
    if len(shape) != 2 or shape[-1] != 4:
      raise ValueError('Invalid dimensions for box data.')
    if boxes.dtype != tf.float32:
      raise ValueError('Invalid tensor type: should be tf.float32')
    self.data = {'boxes': boxes}

  def num_boxes(self):
    """Returns the number of boxes as a tensor evaluated at run time.

    Returns:
      a tensor representing the number of boxes held in the collection.
    """
    box_tensor = self.data['boxes']
    return tf.shape(box_tensor)[0]

  def num_boxes_static(self):
    """Returns the number of boxes known at graph construction time.

    Returns:
      Number of boxes held in collection (integer) or None if this is not
        inferrable at graph construction time.
    """
    static_shape = self.data['boxes'].get_shape()
    return static_shape[0].value

  def get_all_fields(self):
    """Returns the keys of every stored field, 'boxes' included."""
    return self.data.keys()

  def get_extra_fields(self):
    """Returns a list of every field name other than 'boxes'."""
    return [name for name in self.data if name != 'boxes']

  def add_field(self, field, field_data):
    """Stores `field_data` under `field`, replacing any existing entry.

    Typical uses are per-box weights or labels.

    Args:
      field: a string key to access the data via `get_field`
      field_data: a tensor containing the data to store in the BoxList
    """
    self.data[field] = field_data

  def has_field(self, field):
    """Returns True when `field` is a key of the stored data."""
    return field in self.data

  def get(self):
    """Shortcut for `get_field('boxes')`.

    Returns:
      a tensor with shape [N, 4] representing box coordinates.
    """
    return self.get_field('boxes')

  def set(self, boxes):
    """Replaces the box coordinates.

    Unlike the constructor, only the dimensions are validated here.

    Args:
      boxes: a tensor of shape [N, 4] representing box corners

    Raises:
      ValueError: if invalid dimensions for bbox data
    """
    shape = boxes.get_shape()
    if len(shape) != 2 or shape[-1] != 4:
      raise ValueError('Invalid dimensions for box data.')
    self.data['boxes'] = boxes

  def get_field(self, field):
    """Returns the data stored under `field`.

    Args:
      field: name of the field to read; use 'boxes' for the coordinates.

    Returns:
      a tensor representing the box collection or an associated field.

    Raises:
      ValueError: if invalid field
    """
    if not self.has_field(field):
      raise ValueError('field ' + str(field) + ' does not exist')
    return self.data[field]

  def set_field(self, field, value):
    """Overwrites an existing field.

    Args:
      field: (string) name of the field to set value.
      value: the value to assign to the field.

    Raises:
      ValueError: if the box_list does not have specified field.
    """
    if not self.has_field(field):
      raise ValueError('field %s does not exist' % field)
    self.data[field] = value

  def get_center_coordinates_and_sizes(self, scope=None):
    """Converts the corner representation to centers and sizes.

    Args:
      scope: name scope of the function.

    Returns:
      a list of 4 1-D tensors [ycenter, xcenter, height, width].
    """
    with tf.name_scope(scope, 'get_center_coordinates_and_sizes'):
      # Transposing [N, 4] to [4, N] lets unstack yield one tensor per corner.
      corners_by_coordinate = tf.transpose(self.get())
      ymin, xmin, ymax, xmax = tf.unstack(corners_by_coordinate)
      width = xmax - xmin
      height = ymax - ymin
      ycenter = ymin + height / 2.
      xcenter = xmin + width / 2.
      return [ycenter, xcenter, height, width]

  def transpose_coordinates(self, scope=None):
    """Swaps the y/x order of the stored corners in place.

    Args:
      scope: name scope of the function.
    """
    with tf.name_scope(scope, 'transpose_coordinates'):
      y_min, x_min, y_max, x_max = tf.split(
          value=self.get(), num_or_size_splits=4, axis=1)
      swapped = tf.concat([x_min, y_min, x_max, y_max], 1)
      self.set(swapped)

  def as_tensor_dict(self, fields=None):
    """Copies the requested fields into a new dictionary.

    Args:
      fields: (optional) list of fields to return in the dictionary.
        If None (default), all fields are returned.

    Returns:
      tensor_dict: A dictionary of tensors specified by fields.

    Raises:
      ValueError: if specified field is not contained in boxlist.
    """
    requested = self.get_all_fields() if fields is None else fields
    tensor_dict = {}
    for name in requested:
      if not self.has_field(name):
        raise ValueError('boxlist must contain all specified fields')
      tensor_dict[name] = self.get_field(name)
    return tensor_dict
@@ -0,0 +1,118 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Faster RCNN box coder.
Faster RCNN box coder follows the coding schema described below:
ty = (y - ya) / ha
tx = (x - xa) / wa
th = log(h / ha)
tw = log(w / wa)
where x, y, w, h denote the box's center coordinates, width and height
respectively. Similarly, xa, ya, wa, ha denote the anchor's center
coordinates, width and height. tx, ty, tw and th denote the anchor-encoded
center, width and height respectively.
See http://arxiv.org/abs/1506.01497 for details.
"""
import tensorflow as tf
from object_detection import box_coder
from object_detection import box_list
# Added to box and anchor heights/widths in FasterRcnnBoxCoder._encode so the
# divisions and logs there never operate on a zero size.
EPSILON = 1e-8
class FasterRcnnBoxCoder(box_coder.BoxCoder):
  """Box coder using center offsets and log size ratios w.r.t. anchors."""

  def __init__(self, scale_factors=None):
    """Stores the optional scale factors after validating them.

    Args:
      scale_factors: List of 4 positive scalars to scale ty, tx, th and tw.
        If set to None, does not perform scaling. For Faster RCNN,
        the open-source implementation recommends using [10.0, 10.0, 5.0, 5.0].
    """
    if scale_factors:
      assert len(scale_factors) == 4
      for factor in scale_factors:
        assert factor > 0
    self._scale_factors = scale_factors

  @property
  def code_size(self):
    """Each box is encoded as 4 values: [ty, tx, th, tw]."""
    return 4

  def _encode(self, boxes, anchors):
    """Encodes boxes as offsets and log size ratios relative to anchors.

    Args:
      boxes: BoxList holding N boxes to be encoded.
      anchors: BoxList of anchors.

    Returns:
      a tensor representing N anchor-encoded boxes of the format
      [ty, tx, th, tw].
    """
    # Both box sets are expressed as (center, size) before taking differences.
    anchor_yc, anchor_xc, anchor_h, anchor_w = (
        anchors.get_center_coordinates_and_sizes())
    box_yc, box_xc, box_h, box_w = boxes.get_center_coordinates_and_sizes()

    # Keep sizes strictly positive so the divisions and logs stay finite.
    anchor_h += EPSILON
    anchor_w += EPSILON
    box_h += EPSILON
    box_w += EPSILON

    tx = (box_xc - anchor_xc) / anchor_w
    ty = (box_yc - anchor_yc) / anchor_h
    tw = tf.log(box_w / anchor_w)
    th = tf.log(box_h / anchor_h)

    # Optional per-component scaling of the targets.
    scales = self._scale_factors
    if scales:
      ty *= scales[0]
      tx *= scales[1]
      th *= scales[2]
      tw *= scales[3]

    # Stack gives [4, N]; transpose to the [N, 4] code layout.
    stacked_codes = tf.stack([ty, tx, th, tw])
    return tf.transpose(stacked_codes)

  def _decode(self, rel_codes, anchors):
    """Inverts `_encode`, turning relative codes back into corner boxes.

    Args:
      rel_codes: a tensor representing N anchor-encoded boxes.
      anchors: BoxList of anchors.

    Returns:
      boxes: BoxList holding N bounding boxes.
    """
    anchor_yc, anchor_xc, anchor_h, anchor_w = (
        anchors.get_center_coordinates_and_sizes())
    ty, tx, th, tw = tf.unstack(tf.transpose(rel_codes))

    # Undo the optional target scaling first.
    scales = self._scale_factors
    if scales:
      ty /= scales[0]
      tx /= scales[1]
      th /= scales[2]
      tw /= scales[3]

    box_w = tf.exp(tw) * anchor_w
    box_h = tf.exp(th) * anchor_h
    box_yc = ty * anchor_h + anchor_yc
    box_xc = tx * anchor_w + anchor_xc

    # Convert (center, size) back to corners.
    ymin = box_yc - box_h / 2.
    xmin = box_xc - box_w / 2.
    ymax = box_yc + box_h / 2.
    xmax = box_xc + box_w / 2.
    corners = tf.transpose(tf.stack([ymin, xmin, ymax, xmax]))
    return box_list.BoxList(corners)
@@ -0,0 +1,241 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Matcher interface and Match class.
This module defines the Matcher interface and the Match object. The job of the
matcher is to match row and column indices based on the similarity matrix and
other optional parameters. Each column is matched to at most one row. There
are three possibilities for the matching:
1) match: A column matches a row.
2) no_match: A column does not match any row.
3) ignore: A column that is neither 'match' nor no_match.
The ignore case is regularly encountered in object detection: when an anchor has
a relatively small overlap with a ground-truth box, one neither wants to
consider this box a positive example (match) nor a negative example (no match).
The Match class is used to store the match results and it provides simple apis
to query the results.
"""
from abc import ABCMeta
from abc import abstractmethod
import tensorflow as tf
class Match(object):
  """Class to store results from the matcher.

  This class is used to store the results from the matcher. It provides
  convenient methods to query the matching results.
  """

  def __init__(self, match_results):
    """Constructs a Match object.

    Args:
      match_results: int32 tensor of shape [N] with (1) match_results[i]>=0,
        meaning that column i is matched with row match_results[i].
        (2) match_results[i]=-1, meaning that column i is not matched.
        (3) match_results[i]=-2, meaning that column i is ignored.

    Raises:
      ValueError: if match_results does not have rank 1 or is not an int32
        tensor
    """
    if match_results.shape.ndims != 1:
      raise ValueError('match_results should have rank 1')
    if match_results.dtype != tf.int32:
      # Only int32 passes this check, so the message must not promise int64.
      raise ValueError('match_results should be an int32 tensor')
    self._match_results = match_results

  @property
  def match_results(self):
    """The accessor for match results.

    Returns:
      the tensor which encodes the match results.
    """
    return self._match_results

  def matched_column_indices(self):
    """Returns column indices that match to some row.

    The indices returned by this op are always sorted in increasing order.

    Returns:
      column_indices: int32 tensor of shape [K] with column indices.
    """
    return self._reshape_and_cast(tf.where(tf.greater(self._match_results, -1)))

  def matched_column_indicator(self):
    """Returns a boolean column indicator where True means matched.

    Returns:
      column_indicator: boolean vector which is True for all matched column
      indices.
    """
    return tf.greater_equal(self._match_results, 0)

  def num_matched_columns(self):
    """Returns number (int32 scalar tensor) of matched columns."""
    return tf.size(self.matched_column_indices())

  def unmatched_column_indices(self):
    """Returns column indices that do not match any row.

    The indices returned by this op are always sorted in increasing order.

    Returns:
      column_indices: int32 tensor of shape [K] with column indices.
    """
    return self._reshape_and_cast(tf.where(tf.equal(self._match_results, -1)))

  def unmatched_column_indicator(self):
    """Returns a boolean column indicator where True means unmatched.

    Returns:
      column_indicator: boolean vector which is True for all unmatched column
      indices.
    """
    return tf.equal(self._match_results, -1)

  def num_unmatched_columns(self):
    """Returns number (int32 scalar tensor) of unmatched columns."""
    return tf.size(self.unmatched_column_indices())

  def ignored_column_indices(self):
    """Returns column indices that are ignored (neither Matched nor Unmatched).

    The indices returned by this op are always sorted in increasing order.

    Returns:
      column_indices: int32 tensor of shape [K] with column indices.
    """
    return self._reshape_and_cast(tf.where(self.ignored_column_indicator()))

  def ignored_column_indicator(self):
    """Returns boolean column indicator where True means the column is ignored.

    Returns:
      column_indicator: boolean vector which is True for all ignored column
      indices.
    """
    return tf.equal(self._match_results, -2)

  def num_ignored_columns(self):
    """Returns number (int32 scalar tensor) of ignored columns."""
    return tf.size(self.ignored_column_indices())

  def unmatched_or_ignored_column_indices(self):
    """Returns column indices that are unmatched or ignored.

    The indices returned by this op are always sorted in increasing order.

    Returns:
      column_indices: int32 tensor of shape [K] with column indices.
    """
    return self._reshape_and_cast(tf.where(tf.greater(0, self._match_results)))

  def matched_row_indices(self):
    """Returns row indices that match some column.

    The indices returned by this op are ordered so as to be in correspondence
    with the output of matched_column_indices(). For example if
    self.matched_column_indices() is [0,2], and self.matched_row_indices() is
    [7, 3], then we know that column 0 was matched to row 7 and column 2 was
    matched to row 3.

    Returns:
      row_indices: int32 tensor of shape [K] with row indices.
    """
    return self._reshape_and_cast(
        tf.gather(self._match_results, self.matched_column_indices()))

  def _reshape_and_cast(self, t):
    """Flattens `t` to rank 1 and casts it to int32."""
    return tf.cast(tf.reshape(t, [-1]), tf.int32)

  def gather_based_on_match(self, input_tensor, unmatched_value,
                            ignored_value):
    """Gathers elements from `input_tensor` based on match results.

    For columns that are matched to a row, gathered_tensor[col] is set to
    input_tensor[match_results[col]]. For columns that are unmatched,
    gathered_tensor[col] is set to unmatched_value. Finally, for columns that
    are ignored gathered_tensor[col] is set to ignored_value.

    Note that the input_tensor.shape[1:] must match with unmatched_value.shape
    and ignored_value.shape

    Args:
      input_tensor: Tensor to gather values from.
      unmatched_value: Constant tensor value for unmatched columns.
      ignored_value: Constant tensor value for ignored columns.

    Returns:
      gathered_tensor: A tensor containing values gathered from input_tensor.
        The shape of the gathered tensor is [match_results.shape[0]] +
        input_tensor.shape[1:].
    """
    # Prepend two rows so that, after shifting the match results by +2,
    # index 0 is the ignored value (-2), index 1 the unmatched value (-1) and
    # row r of the original input sits at index r + 2.
    input_tensor = tf.concat([tf.stack([ignored_value, unmatched_value]),
                              input_tensor], axis=0)
    # Clamp at 0 so any result below -2 also maps to the ignored value.
    gather_indices = tf.maximum(self.match_results + 2, 0)
    gathered_tensor = tf.gather(input_tensor, gather_indices)
    return gathered_tensor
class Matcher(object):
  """Interface for matchers; subclasses implement `_match`."""
  # NOTE: `__metaclass__` is honored by Python 2 only. Under Python 3 this
  # attribute is inert, so `_match` is not enforced as abstract.
  __metaclass__ = ABCMeta

  def match(self, similarity_matrix, scope=None, **params):
    """Runs `_match` inside a name scope and wraps the result in a Match.

    Args:
      similarity_matrix: Float tensor of shape [N, M] with pairwise similarity
        where higher value means more similar.
      scope: Op scope name. Defaults to 'Match' if None.
      **params: Additional keyword arguments for specific implementations of
        the Matcher.

    Returns:
      A Match object with the results of matching.
    """
    with tf.name_scope(scope, 'Match', [similarity_matrix, params]):
      raw_results = self._match(similarity_matrix, **params)
      return Match(raw_results)

  @abstractmethod
  def _match(self, similarity_matrix, **params):
    """Matching hook to be overridden by implementations.

    Args:
      similarity_matrix: Float tensor of shape [N, M] with pairwise similarity
        where higher value means more similar.
      **params: Additional keyword arguments for specific implementations of
        the Matcher.

    Returns:
      match_results: Integer tensor of shape [M]: match_results[i]>=0 means
        that column i is matched to row match_results[i], match_results[i]=-1
        means that the column is not matched. match_results[i]=-2 means that
        the column is ignored (usually this happens when there is a very weak
        match which one neither wants as positive nor negative example).
    """
@@ -0,0 +1,442 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Preprocess images and bounding boxes for detection.
We perform two sets of operations in preprocessing stage:
(a) operations that are applied to both training and testing data,
(b) operations that are applied only to training data for the purpose of
data augmentation.
A preprocessing function receives a set of inputs,
e.g. an image and bounding boxes,
performs an operation on them, and returns them.
Some examples are: randomly cropping the image, randomly mirroring the image,
randomly changing the brightness, contrast, hue and
randomly jittering the bounding boxes.
The image is a rank 4 tensor: [1, height, width, channels] with
dtype=tf.float32. The groundtruth_boxes is a rank 2 tensor: [N, 4] where
in each row there is a box with [ymin xmin ymax xmax].
Boxes are in normalized coordinates meaning
their coordinate values range in [0, 1]
Important Note: In tensor_dict, images is a rank 4 tensor, but preprocessing
functions receive a rank 3 tensor for processing the image. Thus, inside the
preprocess function we squeeze the image to become a rank 3 tensor and then
we pass it to the functions. At the end of the preprocess we expand the image
back to rank 4.
"""
import tensorflow as tf
from object_detection import box_list
def _flip_boxes_left_right(boxes):
  """Mirrors normalized boxes about the vertical image axis.

  Args:
    boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4].
      Boxes are in normalized form meaning their coordinates vary
      between [0, 1].
      Each row is in the form of [ymin, xmin, ymax, xmax].

  Returns:
    Flipped boxes, in the same [ymin, xmin, ymax, xmax] layout.
  """
  top, left, bottom, right = tf.split(
      value=boxes, num_or_size_splits=4, axis=1)
  # After mirroring, the old right edge becomes the new left edge and vice
  # versa; y coordinates are unaffected.
  mirrored_left = tf.subtract(1.0, right)
  mirrored_right = tf.subtract(1.0, left)
  return tf.concat([top, mirrored_left, bottom, mirrored_right], 1)
def _flip_masks_left_right(masks):
"""Left-right flip masks.
Args:
masks: rank 3 float32 tensor with shape
[num_instances, height, width] representing instance masks.
Returns:
flipped masks: rank 3 float32 tensor with shape
[num_instances, height, width] representing instance masks.
"""
return masks[:, :, ::-1]
def keypoint_flip_horizontal(keypoints, flip_point, flip_permutation,
                             scope=None):
  """Flips the keypoints horizontally around the flip_point.

  The x coordinate of every keypoint is reflected around flip_point
  (x -> 2 * flip_point - x) and the keypoints are reordered along the
  keypoint axis as specified by flip_permutation.

  Args:
    keypoints: a tensor of shape [num_instances, num_keypoints, 2]
    flip_point: (float) scalar tensor representing the x coordinate to flip the
      keypoints around.
    flip_permutation: rank 1 int32 tensor containing the keypoint flip
      permutation. This specifies the mapping from original keypoint indices
      to the flipped keypoint indices. This is used primarily for keypoints
      that are not reflection invariant. E.g. Suppose there are 3 keypoints
      representing ['head', 'right_eye', 'left_eye'], then a logical choice for
      flip_permutation might be [0, 2, 1] since we want to swap the 'left_eye'
      and 'right_eye' after a horizontal flip.
    scope: name scope.

  Returns:
    new_keypoints: a tensor of shape [num_instances, num_keypoints, 2]
  """
  with tf.name_scope(scope, 'FlipHorizontal'):
    # Bring the keypoint axis to the front so tf.gather permutes keypoints.
    keypoint_major = tf.transpose(keypoints, [1, 0, 2])
    permuted = tf.gather(keypoint_major, flip_permutation)
    y_coords, x_coords = tf.split(
        value=permuted, num_or_size_splits=2, axis=2)
    reflected_x = flip_point * 2.0 - x_coords
    flipped = tf.concat([y_coords, reflected_x], 2)
    # Restore the original [num_instances, num_keypoints, 2] layout.
    return tf.transpose(flipped, [1, 0, 2])
def random_horizontal_flip(image,
                           boxes=None,
                           masks=None,
                           keypoints=None,
                           keypoint_flip_permutation=None,
                           seed=None):
  """Randomly flips the image and detections horizontally.

  A single uniform random draw decides whether to flip; the flip happens when
  the draw is greater than 0.5, and the same decision is applied to the image
  and to every annotation that was provided.

  Args:
    image: rank 3 float32 tensor with shape [height, width, channels].
    boxes: (optional) rank 2 float32 tensor with shape [N, 4]
      containing the bounding boxes.
      Boxes are in normalized form meaning their coordinates vary
      between [0, 1].
      Each row is in the form of [ymin, xmin, ymax, xmax].
    masks: (optional) rank 3 float32 tensor with shape
      [num_instances, height, width] containing instance masks. The masks
      are of the same height, width as the input `image`.
    keypoints: (optional) rank 3 float32 tensor with shape
      [num_instances, num_keypoints, 2]. The keypoints are in y-x
      normalized coordinates.
    keypoint_flip_permutation: rank 1 int32 tensor containing the keypoint flip
      permutation.
    seed: random seed

  Returns:
    A tuple whose first element is the (possibly flipped) image, with the same
    shape as the input image. For each of boxes, masks and keypoints that is
    not None, the corresponding (possibly flipped) tensor follows, in that
    order:
    boxes: rank 2 float32 tensor containing the bounding boxes -> [N, 4].
      Boxes are in normalized form meaning their coordinates vary
      between [0, 1].
    masks: rank 3 float32 tensor with shape [num_instances, height, width]
      containing instance masks.
    keypoints: rank 3 float32 tensor with shape
      [num_instances, num_keypoints, 2]

  Raises:
    ValueError: if keypoints are provided but keypoint_flip_permutation is not.
  """
  if keypoints is not None and keypoint_flip_permutation is None:
    raise ValueError(
        'keypoints are provided but keypoints_flip_permutation is not provided')

  with tf.name_scope('RandomHorizontalFlip', values=[image, boxes]):
    # One shared predicate keeps image and annotations consistent.
    should_flip = tf.greater(tf.random_uniform([], seed=seed), 0.5)

    outputs = [
        tf.cond(should_flip,
                lambda: tf.image.flip_left_right(image),
                lambda: image)
    ]

    if boxes is not None:
      outputs.append(
          tf.cond(should_flip,
                  lambda: _flip_boxes_left_right(boxes),
                  lambda: boxes))

    if masks is not None:
      outputs.append(
          tf.cond(should_flip,
                  lambda: _flip_masks_left_right(masks),
                  lambda: masks))

    # The guard above guarantees a permutation exists whenever keypoints do.
    if keypoints is not None:
      outputs.append(
          tf.cond(
              should_flip,
              lambda: keypoint_flip_horizontal(
                  keypoints, 0.5, keypoint_flip_permutation),
              lambda: keypoints))

    return tuple(outputs)
def _compute_new_static_size(image, min_dimension, max_dimension):
  """Computes the new static shape for the resize_to_range method.

  Args:
    image: tensor whose static shape supplies height, width and channels
      (first three entries of image.get_shape().as_list()).
    min_dimension: target size for the smaller of height and width.
    max_dimension: upper bound for the larger of height and width; a falsy
      value disables the bound.

  Returns:
    A constant tensor holding [new_height, new_width, num_channels].
  """
  static_shape = image.get_shape().as_list()
  height = static_shape[0]
  width = static_shape[1]
  channels = static_shape[2]

  def _scaled_hw(factor):
    # Rounding to the nearest integer absorbs floating point error in the
    # scaled dimensions for reasonably-sized images.
    return [int(round(height * factor)), int(round(width * factor))]

  # Candidate 1: the smaller side becomes min_dimension.
  size_from_min = _scaled_hw(min_dimension / float(min(height, width)))
  if not max_dimension:
    return tf.constant(size_from_min + [channels])

  # Candidate 2: the larger side becomes max_dimension. Used only when
  # candidate 1 would exceed max_dimension.
  size_from_max = _scaled_hw(max_dimension / float(max(height, width)))
  if max(size_from_min) > max_dimension:
    chosen_size = size_from_max
  else:
    chosen_size = size_from_min
  return tf.constant(chosen_size + [channels])
def _compute_new_dynamic_size(image, min_dimension, max_dimension):
  """Computes the new dynamic shape for the resize_to_range method.

  Args:
    image: tensor whose runtime shape supplies height, width and channels
      (first three entries of tf.shape(image)).
    min_dimension: target size for the smaller of height and width.
    max_dimension: upper bound for the larger of height and width; a falsy
      value disables the bound.

  Returns:
    A rank 1 tensor holding [new_height, new_width, num_channels].
  """
  runtime_shape = tf.shape(image)
  height = tf.to_float(runtime_shape[0])
  width = tf.to_float(runtime_shape[1])
  channels = runtime_shape[2]
  shorter_side = tf.minimum(height, width)

  def _scaled_hw(factor):
    # Rounding to the nearest integer absorbs floating point error in the
    # scaled dimensions for reasonably-sized images.
    scaled_height = tf.to_int32(tf.round(height * factor))
    scaled_width = tf.to_int32(tf.round(width * factor))
    return tf.stack([scaled_height, scaled_width])

  # Candidate 1: the smaller side becomes min_dimension.
  min_dim = tf.constant(min_dimension, dtype=tf.float32)
  size_from_min = _scaled_hw(min_dim / shorter_side)

  if max_dimension:
    # Candidate 2: the larger side becomes max_dimension. Selected at run
    # time only when candidate 1 would exceed max_dimension.
    longer_side = tf.maximum(height, width)
    max_dim = tf.constant(max_dimension, dtype=tf.float32)
    size_from_max = _scaled_hw(max_dim / longer_side)
    exceeds_max = tf.to_float(tf.reduce_max(size_from_min)) > max_dim
    chosen_size = tf.cond(
        exceeds_max, lambda: size_from_max, lambda: size_from_min)
  else:
    chosen_size = size_from_min
  return tf.stack(tf.unstack(chosen_size) + [channels])
def resize_to_range(image,
                    masks=None,
                    min_dimension=None,
                    max_dimension=None,
                    method=tf.image.ResizeMethod.BILINEAR,
                    align_corners=False,
                    pad_to_max_dimension=False):
  """Resizes an image so its dimensions are within the provided value.

  The output size can be described by two cases:
  1. If the image can be rescaled so its minimum dimension is equal to the
     provided value without the other dimension exceeding max_dimension,
     then do so.
  2. Otherwise, resize so the largest dimension is equal to max_dimension.

  Args:
    image: A 3D tensor of shape [height, width, channels]
    masks: (optional) rank 3 float32 tensor with shape
      [num_instances, height, width] containing instance masks.
    min_dimension: (scalar) desired size of the smaller image dimension. It is
      used as a numerator when computing the scale factor, so a number must
      be supplied even though the default is None.
    max_dimension: (optional) (scalar) maximum allowed size
      of the larger image dimension. It is also the padded size, so it must
      be supplied when pad_to_max_dimension is True.
    method: (optional) interpolation method used in resizing. Defaults to
      BILINEAR.
    align_corners: bool. If true, exactly align all 4 corners of the input
      and output. Defaults to False.
    pad_to_max_dimension: Whether to resize the image and pad it with zeros
      so the resulting image is of the spatial size
      [max_dimension, max_dimension]. If masks are included they are padded
      similarly.

  Returns:
    A list. Note that the position of the resized_image_shape changes based
    on whether masks are present.
    resized_image: A 3D tensor of shape [new_height, new_width, channels],
      where the image has been resized (with the requested interpolation
      method) so that min(new_height, new_width) == min_dimension or
      max(new_height, new_width) == max_dimension. When pad_to_max_dimension
      is set the spatial size is [max_dimension, max_dimension].
    resized_masks: If masks is not None, also outputs masks. A 3D tensor of
      shape [num_instances, new_height, new_width] (or
      [num_instances, max_dimension, max_dimension] when padded).
    resized_image_shape: A 1D tensor of shape [3] containing the shape of the
      resized image as computed before any padding.

  Raises:
    ValueError: if the image is not a 3D tensor.
  """
  if len(image.get_shape()) != 3:
    raise ValueError('Image should be 3D tensor')

  with tf.name_scope('ResizeToRange', values=[image, min_dimension]):
    # Prefer the static computation so the output keeps a static shape.
    if image.get_shape().is_fully_defined():
      new_size = _compute_new_static_size(image, min_dimension, max_dimension)
    else:
      new_size = _compute_new_dynamic_size(image, min_dimension, max_dimension)
    new_image = tf.image.resize_images(
        image, new_size[:-1], method=method, align_corners=align_corners)

    if pad_to_max_dimension:
      new_image = tf.image.pad_to_bounding_box(
          new_image, 0, 0, max_dimension, max_dimension)

    result = [new_image]
    if masks is not None:
      # Add a trailing channel axis so the masks form a batch of
      # single-channel images: [num_instances, height, width, 1].
      new_masks = tf.expand_dims(masks, 3)
      new_masks = tf.image.resize_images(
          new_masks,
          new_size[:-1],
          method=tf.image.ResizeMethod.NEAREST_NEIGHBOR,
          align_corners=align_corners)
      if pad_to_max_dimension:
        # Pad while the tensor is still rank 4. pad_to_bounding_box treats a
        # rank 3 input as [height, width, channels], so padding after the
        # squeeze would pad the instance and height axes instead of the
        # height and width axes.
        new_masks = tf.image.pad_to_bounding_box(
            new_masks, 0, 0, max_dimension, max_dimension)
      new_masks = tf.squeeze(new_masks, 3)
      result.append(new_masks)

    result.append(new_size)
    return result
def _copy_extra_fields(boxlist_to_copy_to, boxlist_to_copy_from):
"""Copies the extra fields of boxlist_to_copy_from to boxlist_to_copy_to.
Args:
boxlist_to_copy_to: BoxList to which extra fields are copied.
boxlist_to_copy_from: BoxList from which fields are copied.
Returns:
boxlist_to_copy_to with extra fields.
"""
for field in boxlist_to_copy_from.get_extra_fields():
boxlist_to_copy_to.add_field(field, boxlist_to_copy_from.get_field(field))
return boxlist_to_copy_to
def box_list_scale(boxlist, y_scale, x_scale, scope=None):
  """Scales box coordinates in x and y dimensions.

  The scales are cast to float32 before being applied. Extra fields of the
  input BoxList are copied onto the returned BoxList.

  Args:
    boxlist: BoxList holding N boxes
    y_scale: (float) scalar tensor
    x_scale: (float) scalar tensor
    scope: name scope.

  Returns:
    boxlist: BoxList holding N boxes
  """
  with tf.name_scope(scope, 'Scale'):
    y_factor = tf.cast(y_scale, tf.float32)
    x_factor = tf.cast(x_scale, tf.float32)
    top, left, bottom, right = tf.split(
        value=boxlist.get(), num_or_size_splits=4, axis=1)
    top = y_factor * top
    bottom = y_factor * bottom
    left = x_factor * left
    right = x_factor * right
    scaled_coordinates = tf.concat([top, left, bottom, right], 1)
    result = box_list.BoxList(scaled_coordinates)
    return _copy_extra_fields(result, boxlist)
def keypoint_scale(keypoints, y_scale, x_scale, scope=None):
  """Scales keypoint coordinates in x and y dimensions.

  The scales are cast to float32 and multiplied into the last axis of
  `keypoints` in [y_scale, x_scale] order.

  Args:
    keypoints: a tensor of shape [num_instances, num_keypoints, 2]
    y_scale: (float) scalar tensor
    x_scale: (float) scalar tensor
    scope: name scope.

  Returns:
    new_keypoints: a tensor of shape [num_instances, num_keypoints, 2]
  """
  with tf.name_scope(scope, 'Scale'):
    y_factor = tf.cast(y_scale, tf.float32)
    x_factor = tf.cast(x_scale, tf.float32)
    # Shape [1, 1, 2] so the factors broadcast over instances and keypoints.
    yx_factors = [[[y_factor, x_factor]]]
    return keypoints * yx_factors
def scale_boxes_to_pixel_coordinates(image, boxes, keypoints=None):
  """Scales boxes from normalized to pixel coordinates.

  Args:
    image: A 3D float32 tensor of shape [height, width, channels].
    boxes: A 2D float32 tensor of shape [num_boxes, 4] containing the bounding
      boxes in normalized coordinates. Each row is of the form
      [ymin, xmin, ymax, xmax].
    keypoints: (optional) rank 3 float32 tensor with shape
      [num_instances, num_keypoints, 2]. The keypoints are in y-x normalized
      coordinates.

  Returns:
    A tuple (image, scaled_boxes), extended with scaled_keypoints when
    keypoints is not None:
    image: unchanged input image.
    scaled_boxes: a 2D float32 tensor of shape [num_boxes, 4] containing the
      bounding boxes in pixel coordinates.
    scaled_keypoints: a 3D float32 tensor with shape
      [num_instances, num_keypoints, 2] containing the keypoints in pixel
      coordinates.
  """
  normalized_boxlist = box_list.BoxList(boxes)
  # The runtime image size supplies the y and x scale factors.
  height = tf.shape(image)[0]
  width = tf.shape(image)[1]
  pixel_boxes = box_list_scale(normalized_boxlist, height, width).get()

  outputs = (image, pixel_boxes)
  if keypoints is None:
    return outputs
  pixel_keypoints = keypoint_scale(keypoints, height, width)
  return outputs + (pixel_keypoints,)
@@ -0,0 +1,135 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Region Similarity Calculators for BoxLists.
Region Similarity Calculators compare a pairwise measure of similarity
between the boxes in two BoxLists.
"""
from abc import ABCMeta
from abc import abstractmethod
import tensorflow as tf
def area(boxlist, scope=None):
  """Computes the area of every box in a BoxList.

  Args:
    boxlist: BoxList holding N boxes
    scope: name scope.

  Returns:
    a tensor with shape [N] representing box areas.
  """
  with tf.name_scope(scope, 'Area'):
    top, left, bottom, right = tf.split(
        value=boxlist.get(), num_or_size_splits=4, axis=1)
    heights = bottom - top
    widths = right - left
    # Each split keeps a trailing axis of size 1; drop it to return [N].
    return tf.squeeze(heights * widths, [1])
def intersection(boxlist1, boxlist2, scope=None):
  """Computes pairwise intersection areas between boxes.

  Args:
    boxlist1: BoxList holding N boxes
    boxlist2: BoxList holding M boxes
    scope: name scope.

  Returns:
    a tensor with shape [N, M] representing pairwise intersections
  """
  with tf.name_scope(scope, 'Intersection'):
    top_a, left_a, bottom_a, right_a = tf.split(
        value=boxlist1.get(), num_or_size_splits=4, axis=1)
    top_b, left_b, bottom_b, right_b = tf.split(
        value=boxlist2.get(), num_or_size_splits=4, axis=1)

    # Broadcasting an [N, 1] column against a transposed [1, M] row yields
    # all N x M pairs. Negative overlaps (disjoint boxes) are clamped to 0.
    overlap_bottom = tf.minimum(bottom_a, tf.transpose(bottom_b))
    overlap_top = tf.maximum(top_a, tf.transpose(top_b))
    overlap_heights = tf.maximum(0.0, overlap_bottom - overlap_top)

    overlap_right = tf.minimum(right_a, tf.transpose(right_b))
    overlap_left = tf.maximum(left_a, tf.transpose(left_b))
    overlap_widths = tf.maximum(0.0, overlap_right - overlap_left)

    return overlap_heights * overlap_widths
def iou(boxlist1, boxlist2, scope=None):
  """Computes pairwise intersection-over-union between box collections.

  Pairs whose intersection is exactly 0 get an IOU of 0 instead of the
  result of the division.

  Args:
    boxlist1: BoxList holding N boxes
    boxlist2: BoxList holding M boxes
    scope: name scope.

  Returns:
    a tensor with shape [N, M] representing pairwise iou scores.
  """
  with tf.name_scope(scope, 'IOU'):
    overlap = intersection(boxlist1, boxlist2)
    areas_a = area(boxlist1)
    areas_b = area(boxlist2)
    # union = area_a + area_b - overlap, broadcast to [N, M].
    summed_areas = tf.expand_dims(areas_a, 1) + tf.expand_dims(areas_b, 0)
    union = summed_areas - overlap
    no_overlap = tf.equal(overlap, 0.0)
    zero_scores = tf.zeros_like(overlap)
    ratio = tf.truediv(overlap, union)
    return tf.where(no_overlap, zero_scores, ratio)
class RegionSimilarityCalculator(object):
  """Abstract base class for region similarity calculators.

  Subclasses implement `_compare`; callers use `compare`.
  """
  __metaclass__ = ABCMeta

  def compare(self, boxlist1, boxlist2, scope=None):
    """Computes matrix of pairwise similarity between BoxLists.

    Delegates to `_compare` (to be overridden), which measures pairwise
    similarity between the boxes in the given BoxLists. Higher values
    indicate more similarity.

    Note that this method simply measures similarity and does not explicitly
    perform a matching.

    Args:
      boxlist1: BoxList holding N boxes.
      boxlist2: BoxList holding M boxes.
      scope: Op scope name. Defaults to 'Compare' if None.

    Returns:
      a (float32) tensor of shape [N, M] with pairwise similarity score.
    """
    with tf.name_scope(scope, 'Compare', [boxlist1, boxlist2]):
      similarity = self._compare(boxlist1, boxlist2)
    return similarity

  @abstractmethod
  def _compare(self, boxlist1, boxlist2):
    """Similarity hook to be overridden; receives the two BoxLists."""
class IouSimilarity(RegionSimilarityCalculator):
  """Similarity calculator based on the Intersection over Union (IOU) metric.

  Pairwise similarity between two BoxLists is their pairwise IOU.
  """

  def _compare(self, boxlist1, boxlist2):
    """Computes pairwise IOU similarity between the two BoxLists.

    Args:
      boxlist1: BoxList holding N boxes.
      boxlist2: BoxList holding M boxes.

    Returns:
      A tensor with shape [N, M] representing pairwise iou scores.
    """
    pairwise_iou = iou(boxlist1, boxlist2)
    return pairwise_iou
@@ -0,0 +1,70 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utils used to manipulate tensor shapes."""
import tensorflow as tf
def assert_shape_equal(shape_a, shape_b):
  """Asserts that shape_a and shape_b are equal.

  A shape counts as static when every one of its dimensions is a Python int.
  If both shapes are static, a mismatch raises a ValueError immediately.
  Otherwise the comparison is deferred to a tf.assert_equal op.

  Args:
    shape_a: a list containing shape of the first tensor.
    shape_b: a list containing shape of the second tensor.

  Returns:
    Either a tf.no_op() when shapes are all static and a tf.assert_equal() op
    when the shapes are dynamic.

  Raises:
    ValueError: When shapes are both static and unequal.
  """
  def _is_static(shape):
    return all(isinstance(dim, int) for dim in shape)

  if not (_is_static(shape_a) and _is_static(shape_b)):
    # At least one dimension is only known at run time.
    return tf.assert_equal(shape_a, shape_b)
  if shape_a != shape_b:
    raise ValueError('Unequal shapes {}, {}'.format(shape_a, shape_b))
  return tf.no_op()
def combined_static_and_dynamic_shape(tensor):
  """Returns a list containing static and dynamic values for the dimensions.

  Each dimension is reported as its static integer value when that is known,
  and as a slice of tf.shape(tensor) otherwise. This is useful to preserve
  static shapes when available in reshape operation.

  Args:
    tensor: A tensor of any type.

  Returns:
    A list of size tensor.shape.ndims containing integers or a scalar tensor.
  """
  static_dims = tensor.shape.as_list()
  dynamic_dims = tf.shape(tensor)
  return [
      dynamic_dims[axis] if static_dim is None else static_dim
      for axis, static_dim in enumerate(static_dims)
  ]
@@ -0,0 +1,310 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Base target assigner module.
The job of a TargetAssigner is, for a given set of anchors (bounding boxes) and
groundtruth detections (bounding boxes), to assign classification and regression
targets to each anchor as well as weights to each anchor (specifying, e.g.,
which anchors should not contribute to training loss).
It assigns classification/regression targets by performing the following steps:
1) Computing pairwise similarity between anchors and groundtruth boxes using a
provided RegionSimilarity Calculator
2) Computing a matching based on the similarity matrix using a provided Matcher
3) Assigning regression targets based on the matching and a provided BoxCoder
4) Assigning classification targets based on the matching and groundtruth labels
Note that TargetAssigners only operate on detections from a single
image at a time, so any logic for applying a TargetAssigner to multiple
images must be handled externally.
"""
import tensorflow as tf
from object_detection import box_list
from object_detection import shape_utils
KEYPOINTS_FIELD_NAME = 'keypoints'
class TargetAssigner(object):
"""Target assigner to compute classification and regression targets."""
def __init__(self, similarity_calc, matcher, box_coder,
             negative_class_weight=1.0, unmatched_cls_target=None):
  """Constructs an object detection target assigner.

  The arguments are stored as given; no type validation is performed here.

  Args:
    similarity_calc: a RegionSimilarityCalculator
    matcher: Matcher used to match groundtruth to anchors.
    box_coder: BoxCoder used to encode matching groundtruth boxes with
      respect to anchors.
    negative_class_weight: classification weight to be associated to negative
      anchors (default: 1.0). The weight must be in [0., 1.].
    unmatched_cls_target: a float32 tensor with shape [d_1, d_2, ..., d_k]
      which is consistent with the classification target for each
      anchor (and can be empty for scalar targets). This shape must thus be
      compatible with the groundtruth labels that are passed to the "assign"
      function (which have shape [num_gt_boxes, d_1, d_2, ..., d_k]).
      If set to None, unmatched_cls_target is set to be [0] for each anchor.
  """
  if unmatched_cls_target is None:
    unmatched_cls_target = tf.constant([0], tf.float32)
  self._similarity_calc = similarity_calc
  self._matcher = matcher
  self._box_coder = box_coder
  self._negative_class_weight = negative_class_weight
  self._unmatched_cls_target = unmatched_cls_target
@property
def box_coder(self):
  """Returns the box coder passed to the constructor."""
  coder = self._box_coder
  return coder
def assign(self, anchors, groundtruth_boxes, groundtruth_labels=None,
           groundtruth_weights=None, **params):
  """Assigns classification and regression targets to each anchor.

  For a given set of anchors and groundtruth detections, match anchors
  to groundtruth_boxes and assign classification and regression targets to
  each anchor as well as weights based on the resulting match (specifying,
  e.g., which anchors should not contribute to training loss).

  Anchors that are not matched to anything are given a classification target
  of self._unmatched_cls_target which can be specified via the constructor.

  Args:
    anchors: a BoxList representing N anchors
    groundtruth_boxes: a BoxList representing M groundtruth boxes
    groundtruth_labels: a tensor of shape [M, d_1, ... d_k]
      with labels for each of the ground_truth boxes. The subshape
      [d_1, ... d_k] can be empty (corresponding to scalar inputs). When set
      to None, groundtruth_labels assumes a binary problem where all
      ground_truth boxes get a positive label (of 1).
    groundtruth_weights: a float tensor of shape [M] indicating the weight to
      assign to all anchors match to a particular groundtruth box. The weights
      must be in [0., 1.]. If None, all weights are set to 1.
    **params: Additional keyword arguments for specific implementations of
      the Matcher.

  Returns:
    cls_targets: a float32 tensor with shape [num_anchors, d_1, d_2 ... d_k],
      where the subshape [d_1, ..., d_k] is compatible with groundtruth_labels
      which has shape [num_gt_boxes, d_1, d_2, ... d_k].
    cls_weights: a float32 tensor with shape [num_anchors]
    reg_targets: a float32 tensor with shape [num_anchors, box_code_dimension]
    reg_weights: a float32 tensor with shape [num_anchors]
    match: a matcher.Match object encoding the match between anchors and
      groundtruth boxes, with rows corresponding to groundtruth boxes
      and columns corresponding to anchors.

  Raises:
    ValueError: if anchors or groundtruth_boxes are not of type
      box_list.BoxList
  """
  if not isinstance(anchors, box_list.BoxList):
    raise ValueError('anchors must be an BoxList')
  if not isinstance(groundtruth_boxes, box_list.BoxList):
    raise ValueError('groundtruth_boxes must be an BoxList')

  if groundtruth_labels is None:
    # Binary problem: every groundtruth box gets the label [1].
    all_ones = tf.ones(tf.expand_dims(groundtruth_boxes.num_boxes(), 0))
    groundtruth_labels = tf.expand_dims(all_ones, -1)

  label_shape = shape_utils.combined_static_and_dynamic_shape(
      groundtruth_labels)
  # The per-box label shape must equal the shape of the unmatched target.
  per_box_label_shape_check = shape_utils.assert_shape_equal(
      label_shape[1:],
      shape_utils.combined_static_and_dynamic_shape(
          self._unmatched_cls_target))
  # There must be exactly one label per groundtruth box.
  label_count_check = shape_utils.assert_shape_equal(
      shape_utils.combined_static_and_dynamic_shape(
          groundtruth_labels)[:1],
      shape_utils.combined_static_and_dynamic_shape(
          groundtruth_boxes.get())[:1])

  if groundtruth_weights is None:
    # Use the static box count when it is truthy, else the dynamic count.
    gt_box_count = (groundtruth_boxes.num_boxes_static() or
                    groundtruth_boxes.num_boxes())
    groundtruth_weights = tf.ones([gt_box_count], dtype=tf.float32)

  with tf.control_dependencies(
      [per_box_label_shape_check, label_count_check]):
    similarity = self._similarity_calc.compare(groundtruth_boxes, anchors)
    match = self._matcher.match(similarity, **params)
    reg_targets = self._create_regression_targets(
        anchors, groundtruth_boxes, match)
    cls_targets = self._create_classification_targets(
        groundtruth_labels, match)
    reg_weights = self._create_regression_weights(match, groundtruth_weights)
    cls_weights = self._create_classification_weights(
        match, groundtruth_weights)

  static_anchor_count = anchors.num_boxes_static()
  if static_anchor_count is not None:
    # Restore the static leading dimension on every output.
    reg_targets, cls_targets, reg_weights, cls_weights = [
        self._reset_target_shape(target, static_anchor_count)
        for target in (reg_targets, cls_targets, reg_weights, cls_weights)
    ]

  return cls_targets, cls_weights, reg_targets, reg_weights, match
def _reset_target_shape(self, target, num_anchors):
"""Sets the static shape of the target.
Args:
target: the target tensor. Its first dimension will be overwritten.
num_anchors: the number of anchors, which is used to override the target's
first dimension.
Returns:
A tensor with the shape info filled in.
"""
target_shape = target.get_shape().as_list()
target_shape[0] = num_anchors
target.set_shape(target_shape)
return target
def _create_regression_targets(self, anchors, groundtruth_boxes, match):
  """Returns a regression target for each anchor.

  Matched anchors get their matched groundtruth box (and keypoints, if the
  groundtruth carries that field) encoded by the box coder relative to the
  anchors. Anchors that are not matched get the default regression target.

  Args:
    anchors: a BoxList representing N anchors
    groundtruth_boxes: a BoxList representing M groundtruth_boxes
    match: a matcher.Match object

  Returns:
    reg_targets: a float32 tensor with shape [N, box_code_dimension]
  """
  gt_boxes_per_anchor = match.gather_based_on_match(
      groundtruth_boxes.get(),
      unmatched_value=tf.zeros(4),
      ignored_value=tf.zeros(4))
  gt_boxlist_per_anchor = box_list.BoxList(gt_boxes_per_anchor)

  if groundtruth_boxes.has_field(KEYPOINTS_FIELD_NAME):
    gt_keypoints = groundtruth_boxes.get_field(KEYPOINTS_FIELD_NAME)
    per_box_keypoint_shape = gt_keypoints.get_shape()[1:]
    keypoints_per_anchor = match.gather_based_on_match(
        gt_keypoints,
        unmatched_value=tf.zeros(per_box_keypoint_shape),
        ignored_value=tf.zeros(per_box_keypoint_shape))
    gt_boxlist_per_anchor.add_field(KEYPOINTS_FIELD_NAME,
                                    keypoints_per_anchor)

  encoded_targets = self._box_coder.encode(gt_boxlist_per_anchor, anchors)

  # Zero out the unmatched and ignored regression targets by tiling the
  # default target once per anchor and selecting it where nothing matched.
  num_columns = shape_utils.combined_static_and_dynamic_shape(
      match.match_results)[0]
  default_targets = tf.tile(
      self._default_regression_target(), [num_columns, 1])
  is_matched = match.matched_column_indicator()
  return tf.where(is_matched, encoded_targets, default_targets)
def _default_regression_target(self):
  """Returns the all-zero target used for anchors without a matched box.

  The row length comes from the box coder's `code_size`.

  Returns:
    default_target: a float32 tensor with shape [1, box_code_dimension]
  """
  zero_row = [0] * self._box_coder.code_size
  return tf.constant([zero_row], tf.float32)
def _create_classification_targets(self, groundtruth_labels, match):
  """Builds the per-anchor classification targets.

  Each matched anchor takes the label of its groundtruth box. Unmatched and
  ignored anchors both take `self._unmatched_cls_target`.

  Args:
    groundtruth_labels: a tensor of shape [num_gt_boxes, d_1, ... d_k]
      with labels for each of the ground_truth boxes. The subshape
      [d_1, ... d_k] can be empty (corresponding to scalar labels).
    match: a matcher.Match object that provides a matching between anchors
      and groundtruth boxes.

  Returns:
    The tensor produced by `match.gather_based_on_match`; per the original
    contract a float32 tensor with shape [num_anchors, d_1, d_2 ... d_k].
  """
  fallback_label = self._unmatched_cls_target
  return match.gather_based_on_match(
      groundtruth_labels,
      unmatched_value=fallback_label,
      ignored_value=fallback_label)
def _create_regression_weights(self, match, groundtruth_weights):
  """Builds the per-anchor regression weights.

  A matched anchor takes the weight of its groundtruth box; unmatched and
  ignored anchors get a weight of 0, so they do not contribute to the
  regression loss.

  Args:
    match: a matcher.Match object that provides a matching between anchors
      and groundtruth boxes.
    groundtruth_weights: a float tensor of shape [M] indicating the weight to
      assign to all anchors match to a particular groundtruth box.

  Returns:
    a float32 tensor with shape [num_anchors] representing regression weights.
  """
  return match.gather_based_on_match(
      groundtruth_weights,
      unmatched_value=0.,
      ignored_value=0.)
def _create_classification_weights(self, match, groundtruth_weights):
  """Builds the per-anchor classification weights.

  A matched anchor takes the weight of its groundtruth box, an unmatched
  (negative) anchor takes `self._negative_class_weight`, and an ignored
  anchor gets 0.

  Args:
    match: a matcher.Match object that provides a matching between anchors
      and groundtruth boxes.
    groundtruth_weights: a float tensor of shape [M] indicating the weight to
      assign to all anchors match to a particular groundtruth box.

  Returns:
    a float32 tensor with shape [num_anchors] representing classification
    weights.
  """
  negative_weight = self._negative_class_weight
  return match.gather_based_on_match(
      groundtruth_weights,
      unmatched_value=negative_weight,
      ignored_value=0.)
def get_box_coder(self):
  """Returns the BoxCoder held by this TargetAssigner.

  Returns:
    The object stored in `self._box_coder`.
  """
  box_coder = self._box_coder
  return box_coder
@@ -0,0 +1,210 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tensorflow Example proto decoder for object detection.
A decoder to decode string tensors containing serialized tensorflow.Example
protos for object detection.
"""
import tensorflow as tf
slim_example_decoder = tf.contrib.slim.tfexample_decoder
class TfExampleDecoder(object):
  """Decodes serialized tf.Example protos into detection groundtruth tensors."""

  def __init__(self):
    """Sets up `keys_to_features` (parse spec) and `items_to_handlers`."""
    # Raw features read from the proto. Scalars are FixedLenFeature with a
    # default; per-object lists are VarLenFeature.
    self.keys_to_features = {
        'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format': tf.FixedLenFeature(
            (), tf.string, default_value='jpeg'),
        'image/filename': tf.FixedLenFeature((), tf.string, default_value=''),
        'image/key/sha256': tf.FixedLenFeature(
            (), tf.string, default_value=''),
        'image/source_id': tf.FixedLenFeature(
            (), tf.string, default_value=''),
        'image/height': tf.FixedLenFeature((), tf.int64, 1),
        'image/width': tf.FixedLenFeature((), tf.int64, 1),
        # Object boxes and classes.
        'image/object/bbox/xmin': tf.VarLenFeature(tf.float32),
        'image/object/bbox/xmax': tf.VarLenFeature(tf.float32),
        'image/object/bbox/ymin': tf.VarLenFeature(tf.float32),
        'image/object/bbox/ymax': tf.VarLenFeature(tf.float32),
        'image/object/class/label': tf.VarLenFeature(tf.int64),
        'image/object/class/text': tf.VarLenFeature(tf.string),
        'image/object/area': tf.VarLenFeature(tf.float32),
        'image/object/is_crowd': tf.VarLenFeature(tf.int64),
        'image/object/difficult': tf.VarLenFeature(tf.int64),
        'image/object/group_of': tf.VarLenFeature(tf.int64),
        'image/object/weight': tf.VarLenFeature(tf.float32),
    }

    # Output item name -> handler that assembles it from the raw features.
    self.items_to_handlers = {
        'image': slim_example_decoder.Image(
            image_key='image/encoded',
            format_key='image/format',
            channels=3),
        'source_id': slim_example_decoder.Tensor('image/source_id'),
        'key': slim_example_decoder.Tensor('image/key/sha256'),
        'filename': slim_example_decoder.Tensor('image/filename'),
        # Object boxes and classes.
        'groundtruth_boxes': slim_example_decoder.BoundingBox(
            ['ymin', 'xmin', 'ymax', 'xmax'], 'image/object/bbox/'),
        'groundtruth_area': slim_example_decoder.Tensor('image/object/area'),
        'groundtruth_is_crowd': slim_example_decoder.Tensor(
            'image/object/is_crowd'),
        'groundtruth_difficult': slim_example_decoder.Tensor(
            'image/object/difficult'),
        'groundtruth_group_of': slim_example_decoder.Tensor(
            'image/object/group_of'),
        'groundtruth_weights': slim_example_decoder.Tensor(
            'image/object/weight'),
        'groundtruth_classes': slim_example_decoder.Tensor(
            'image/object/class/label'),
    }

  def decode(self, tf_example_string_tensor):
    """Decodes serialized tensorflow example and returns a tensor dictionary.

    Args:
      tf_example_string_tensor: a string tensor holding a serialized
        tensorflow example proto.

    Returns:
      A dictionary with one entry per key of `items_to_handlers`:
        image - 3D uint8 tensor of shape [None, None, 3] containing image.
        source_id - string tensor containing original image id.
        key - string tensor with unique sha256 hash key.
        filename - string tensor with original dataset filename.
        groundtruth_boxes - 2D float32 tensor of shape [None, 4] containing
          box corners (handler coordinate order: ymin, xmin, ymax, xmax).
        groundtruth_classes - 1D int64 tensor of shape [None] containing
          classes for the boxes.
        groundtruth_weights - 1D float32 tensor of shape [None] indicating
          the weights of groundtruth boxes; replaced by all ones (one per
          box) when the example stores no weights.
        groundtruth_area - 1D float32 tensor of shape [None] containing
          object mask area in pixel squared.
        groundtruth_is_crowd - 1D bool tensor of shape [None] indicating if
          the boxes enclose a crowd (cast to bool here).
        groundtruth_difficult - 1D tensor of shape [None] flagging
          `difficult` instances; parsed from an int64 feature and not cast
          in this method.
        groundtruth_group_of - 1D tensor of shape [None] flagging `group_of`
          instances; parsed from an int64 feature and not cast in this
          method.
    """
    example = tf.reshape(tf_example_string_tensor, shape=[])
    decoder = slim_example_decoder.TFExampleDecoder(
        self.keys_to_features, self.items_to_handlers)

    # Decode every registered item in a deterministic (sorted) order.
    item_names = sorted(decoder.list_items())
    decoded = dict(
        zip(item_names, decoder.decode(example, items=item_names)))

    crowd_key = 'groundtruth_is_crowd'
    decoded[crowd_key] = tf.cast(decoded[crowd_key], dtype=tf.bool)
    decoded['image'].set_shape([None, None, 3])

    def stored_weights():
      return decoded['groundtruth_weights']

    def unit_weights():
      num_boxes = tf.shape(decoded['groundtruth_boxes'])[0]
      return tf.ones(num_boxes, dtype=tf.float32)

    # Fall back to a weight of 1 per box when no weights were serialized.
    has_stored_weights = tf.greater(
        tf.shape(decoded['groundtruth_weights'])[0], 0)
    decoded['groundtruth_weights'] = tf.cond(
        has_stored_weights, stored_weights, unit_weights)
    return decoded
class TfExampleSegmentationDecoder(object):
  """Decodes serialized tf.Example protos holding an image and a label map."""

  def __init__(self):
    """Constructor sets keys_to_features and items_to_handlers."""
    # Raw scalar features read from the proto, each with a default value.
    self.keys_to_features = {
        'image/encoded':
            tf.FixedLenFeature((), tf.string, default_value=''),
        'image/filename':
            tf.FixedLenFeature((), tf.string, default_value=''),
        'image/format':
            tf.FixedLenFeature((), tf.string, default_value='jpeg'),
        'image/height':
            tf.FixedLenFeature((), tf.int64, default_value=0),
        'image/width':
            tf.FixedLenFeature((), tf.int64, default_value=0),
        'image/segmentation/class/encoded':
            tf.FixedLenFeature((), tf.string, default_value=''),
        'image/segmentation/class/format':
            tf.FixedLenFeature((), tf.string, default_value='png'),
    }
    # Both outputs are decoded images: the RGB input and the 1-channel labels.
    self.items_to_handlers = {
        'image': slim_example_decoder.Image(
            image_key='image/encoded', format_key='image/format', channels=3),
        'labels_class': slim_example_decoder.Image(
            image_key='image/segmentation/class/encoded',
            format_key='image/segmentation/class/format',
            channels=1)
    }

  def decode(self, tf_example_string_tensor):
    """Decodes serialized tensorflow example and returns a tensor dictionary.

    Args:
      tf_example_string_tensor: a string tensor holding a serialized
        tensorflow example proto.

    Returns:
      A dictionary of the following tensors.
      image - 3D uint8 tensor of shape [None, None, 3] containing image.
      labels_class - uint8 tensor containing pixel-wise class labels, decoded
        with channels=1. NOTE(review): previously documented as 2D
        [None, None]; confirm whether a trailing channel axis is present.
    """
    serialized_example = tf.reshape(tf_example_string_tensor, shape=[])
    decoder = slim_example_decoder.TFExampleDecoder(self.keys_to_features,
                                                    self.items_to_handlers)
    # Fixed item order; the former sorted(decoder.list_items()) result was
    # overwritten before use and has been dropped.
    keys = ['image', 'labels_class']
    tensors = decoder.decode(serialized_example, items=keys)
    tensor_dict = dict(zip(keys, tensors))
    tensor_dict['image'].set_shape([None, None, 3])
    return tensor_dict
@@ -0,0 +1,44 @@
#!/bin/bash
# Launches exec_main.sh once per rank (a single rank here), each in its own
# working directory under $SAVE_PATH, with its stdout written to a log file.

# Clean slog files left over from previous runs.
rm -rf /var/log/npu/slog/host-0/*.log
rm -rf /var/log/npu/slog/device-*/*.log

# Set env.
export PYTHONPATH=/usr/local/Ascend/ops/op_impl/built-in/ai_core/tbe
export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/x86_64-linux-gnu
# Append each extra directory once (the old form appended $PATH to itself).
export PATH="$PATH:$HOME/bin:/usr/local/Ascend/fwkacllib/ccec_compiler/bin"
export ASCEND_OPP_PATH=/usr/local/Ascend/opp
export DDK_VERSION_FLAG=1.71.T5.0.B060
export NEW_GE_FE_ID=1
export GE_AICPU_FLAG=1
export SOC_VERSION=Ascend910
export DUMP_GE_GRAPH=1
export DUMP_GRAPH_LEVEL=3
export PRINT_MODEL=1
export SLOG_PRINT_TO_STDOUT=1

export RANK_SIZE=1
RANK_ID_START=1
SAVE_PATH=training
BASE_PATH=$(pwd)
echo "$BASE_PATH"

for ((RANK_ID = RANK_ID_START; RANK_ID < RANK_SIZE + RANK_ID_START; RANK_ID++)); do
    echo
    su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device ${RANK_ID}"
    TMP_PATH="$SAVE_PATH/D$RANK_ID"
    mkdir -p "$TMP_PATH"
    cp exec_main.sh "$TMP_PATH/"
    # Abort if the rank directory is unusable, so exec_main.sh is never
    # started from the wrong working directory.
    cd "$TMP_PATH" || exit 1
    # NOTE: only stdout is captured; stderr still goes to the caller.
    bash exec_main.sh "$RANK_ID" "$RANK_SIZE" "$BASE_PATH" > "train_$RANK_ID.log" &
    cd - || exit 1
done
@@ -0,0 +1,41 @@
#!/bin/bash
# Launches exec_main.sh once per rank (RANK_SIZE ranks), each in its own
# working directory under $SAVE_PATH, detached with nohup and logged to a file.

# Clean slog files left over from previous runs.
rm -rf /var/log/npu/slog/host-0/*.log
rm -rf /var/log/npu/slog/device-*/*.log

# Set env.
export PYTHONPATH=/usr/local/Ascend/ops/op_impl/built-in/ai_core/tbe/
export LD_LIBRARY_PATH=/usr/local/lib/:/usr/lib/:/usr/local/Ascend/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/x86_64-linux-gnu
# Append each extra directory once (the old form appended $PATH to itself).
export PATH="$PATH:$HOME/bin:/usr/local/Ascend/fwkacllib/ccec_compiler/bin"
export ASCEND_OPP_PATH=/usr/local/Ascend/opp
export DDK_VERSION_FLAG=1.71.T5.0.B060
export NEW_GE_FE_ID=1
export GE_AICPU_FLAG=1
export SOC_VERSION=Ascend910
export DUMP_GE_GRAPH=1
export DUMP_GRAPH_LEVEL=3
export PRINT_MODEL=1
export SLOG_PRINT_TO_STDOUT=1

export RANK_SIZE=8
# Rank table is looked up relative to the directory the script is run from.
export RANK_TABLE_FILE="${PWD}/npu_config/${RANK_SIZE}p.json"
RANK_ID_START=0
BASE_PATH=$(pwd)
SAVE_PATH=training

for ((RANK_ID = RANK_ID_START; RANK_ID < RANK_SIZE + RANK_ID_START; RANK_ID++)); do
    echo
    su HwHiAiUser -c "adc --host 0.0.0.0:22118 --log \"SetLogLevel(0)[error]\" --device ${RANK_ID}"
    TMP_PATH="$SAVE_PATH/D$RANK_ID"
    mkdir -p "$TMP_PATH"
    cp exec_main.sh "$TMP_PATH/"
    # Abort if the rank directory is unusable, so exec_main.sh is never
    # started from the wrong working directory.
    cd "$TMP_PATH" || exit 1
    nohup bash exec_main.sh "$RANK_ID" "$RANK_SIZE" "$BASE_PATH" > "train_$RANK_ID.log" &
    cd - || exit 1
done
@@ -0,0 +1,484 @@
# Copyright 2018 Google. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SSD (via ResNet34) model definition.
Defines the SSD model and loss functions from this paper:
https://arxiv.org/abs/1512.02325
Uses the ResNet model as a basis.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import ssd_constants
def batch_norm_relu(inputs,
                    is_training_bn,
                    params,
                    relu=True,
                    init_zero=False,
                    data_format='channels_last',
                    name=None):
  """Applies batch normalization, optionally followed by a ReLU.

  Args:
    inputs: input `Tensor`; the channel axis is 1 for "channels_first" and 3
      otherwise.
    is_training_bn: `bool` forwarded as the `training` argument of the batch
      normalization layer.
    params: params of the model, a dict. Not read by this function.
    relu: `bool` if False, omits the ReLU operation.
    init_zero: `bool` if True, initializes scale parameter of batch
      normalization with 0 instead of 1 (default).
    data_format: `str` either "channels_first" for `[batch, channels, height,
      width]` or "channels_last for `[batch, height, width, channels]`.
    name: the name of the batch normalization layer

  Returns:
    A normalized `Tensor` with the same `data_format`.
  """
  gamma_initializer = (
      tf.zeros_initializer() if init_zero else tf.ones_initializer())
  channel_axis = 1 if data_format == 'channels_first' else 3

  normalized = tf.layers.batch_normalization(
      inputs=inputs,
      axis=channel_axis,
      momentum=ssd_constants.BATCH_NORM_DECAY,
      epsilon=ssd_constants.BATCH_NORM_EPSILON,
      center=True,
      scale=True,
      training=is_training_bn,
      fused=True,
      gamma_initializer=gamma_initializer,
      name=name)

  if not relu:
    return normalized
  return tf.nn.relu(normalized)
def fixed_padding(inputs, kernel_size, data_format='channels_last'):
  """Zero-pads the two spatial dimensions by an amount set by `kernel_size`.

  The total padding per spatial dimension is `kernel_size - 1`, split so the
  leading side gets the floor half and the trailing side the remainder.

  Args:
    inputs: `Tensor` of size `[batch, channels, height, width]` or
      `[batch, height, width, channels]` depending on `data_format`.
    kernel_size: `int` kernel size to be used for `conv2d` or max_pool2d`
      operations. Should be a positive integer.
    data_format: `str` either "channels_first" for `[batch, channels, height,
      width]` or "channels_last for `[batch, height, width, channels]`.

  Returns:
    A padded `Tensor` of the same `data_format` with size either intact
    (if `kernel_size == 1`) or padded (if `kernel_size > 1`).
  """
  total = kernel_size - 1
  leading = total // 2
  trailing = total - leading

  no_pad = [0, 0]
  spatial_pad = [leading, trailing]
  if data_format == 'channels_first':
    paddings = [no_pad, no_pad, spatial_pad, spatial_pad]
  else:
    paddings = [no_pad, spatial_pad, spatial_pad, no_pad]
  return tf.pad(inputs, paddings)
def conv2d_fixed_padding(inputs,
                         filters,
                         kernel_size,
                         strides,
                         data_format='channels_last'):
  """Bias-free 2-D convolution whose padding depends only on `kernel_size`.

  For `strides > 1` the input is padded explicitly with `fixed_padding`; the
  convolution then uses 'SAME' padding only when `strides == 1` and 'VALID'
  otherwise.

  Args:
    inputs: input `Tensor`, laid out according to `data_format`.
    filters: `int` number of filters in the convolution.
    kernel_size: `int` size of the kernel to be used in the convolution.
    strides: `int` strides of the convolution.
    data_format: `str` either "channels_first" for `[batch, channels, height,
      width]` or "channels_last for `[batch, height, width, channels]`.

  Returns:
    The convolved `Tensor`, in the same `data_format`.
  """
  if strides > 1:
    inputs = fixed_padding(inputs, kernel_size, data_format=data_format)

  conv_padding = 'SAME' if strides == 1 else 'VALID'
  return tf.layers.conv2d(
      inputs=inputs,
      filters=filters,
      kernel_size=kernel_size,
      strides=strides,
      padding=conv_padding,
      use_bias=False,
      kernel_initializer=tf.variance_scaling_initializer(),
      data_format=data_format)
def residual_block(inputs,
                   filters,
                   is_training_bn,
                   strides,
                   params,
                   use_projection=False,
                   data_format='channels_last'):
  """Two-convolution residual block with batch norm after each convolution.

  Args:
    inputs: input `Tensor`, laid out according to `data_format`.
    filters: `int` number of filters used by both 3x3 convolutions (and by
      the 1x1 projection shortcut, when enabled).
    is_training_bn: `bool` for whether the model is in training.
    strides: `int` block stride, applied to the first 3x3 convolution and to
      the projection shortcut. If greater than 1, this block will ultimately
      downsample the input.
    params: params of the model, a dict.
    use_projection: `bool` for whether this block should use a projection
      shortcut (versus the default identity shortcut).
    data_format: `str` either "channels_first" for `[batch, channels, height,
      width]` or "channels_last for `[batch, height, width, channels]`.

  Returns:
    The output `Tensor` of the block.
  """
  if use_projection:
    # 1x1 projection so the shortcut matches the main path's filters/stride.
    projected = conv2d_fixed_padding(
        inputs=inputs,
        filters=filters,
        kernel_size=1,
        strides=strides,
        data_format=data_format)
    shortcut = batch_norm_relu(
        projected, is_training_bn, params, relu=False,
        data_format=data_format)
  else:
    shortcut = inputs

  net = conv2d_fixed_padding(
      inputs=inputs,
      filters=filters,
      kernel_size=3,
      strides=strides,
      data_format=data_format)
  net = batch_norm_relu(net, is_training_bn, params, data_format=data_format)

  net = conv2d_fixed_padding(
      inputs=net,
      filters=filters,
      kernel_size=3,
      strides=1,
      data_format=data_format)
  # The last batch norm starts with a zero scale and has no ReLU of its own;
  # the ReLU is applied after the shortcut is added.
  net = batch_norm_relu(
      net,
      is_training_bn,
      params,
      relu=False,
      init_zero=True,
      data_format=data_format)

  return tf.nn.relu(net + shortcut)
def block_group(inputs,
                filters,
                block_fn,
                blocks,
                strides,
                is_training_bn,
                name,
                params,
                data_format='channels_last',
                use_projection=True):
  """Stacks `blocks` calls of `block_fn` into one named group.

  Args:
    inputs: input `Tensor`, laid out according to `data_format`.
    filters: `int` number of filters passed to every block.
    block_fn: `function` for the block to use within the model
    blocks: `int` number of blocks contained in the layer.
    strides: `int` stride passed to the first block of the group. If
      greater than 1, this layer will downsample the input.
    is_training_bn: `bool` for whether the model is training.
    name: `str`name for the Tensor output of the block layer.
    params: params of the model, a dict.
    data_format: `str` either "channels_first" for `[batch, channels, height,
      width]` or "channels_last for `[batch, height, width, channels]`.
    use_projection: `bool` forwarded to the first block only; the remaining
      blocks use `block_fn`'s own default.

  Returns:
    The output `Tensor` of the block layer.
  """
  # The first block is the only one that receives `strides` and
  # `use_projection`.
  net = block_fn(
      inputs,
      filters,
      is_training_bn,
      strides,
      params,
      use_projection=use_projection,
      data_format=data_format)

  remaining_blocks = blocks - 1
  for _ in range(remaining_blocks):
    net = block_fn(
        net, filters, is_training_bn, 1, params, data_format=data_format)

  return tf.identity(net, name)
def resnet_v1_generator(block_fn, layers, params, data_format='channels_last'):
  """Builds a ResNet v1 feature extractor without classification layers.

  The generated model runs the 7x7 stem, a max pool and three block groups,
  and returns the outputs of those groups (c2, c3, c4). Only `layers[0]`,
  `layers[1]` and `layers[2]` are read, and the third group is built with
  `strides=1`.

  Args:
    block_fn: `function` for the block to use within the model.
    layers: list of `int`s denoting the number of blocks to include in each
      block group.
    params: params of the model, a dict.
    data_format: `str` either "channels_first" for `[batch, channels, height,
      width]` or "channels_last for `[batch, height, width, channels]`.

  Returns:
    Model `function` that takes in `inputs` and `is_training_bn` and returns
    the tuple `(c2, c3, c4)`.
  """

  def model(inputs, is_training_bn=False):
    """Creation of the model graph."""
    stem = conv2d_fixed_padding(
        inputs=inputs,
        filters=64,
        kernel_size=7,
        strides=2,
        data_format=data_format)
    stem = tf.identity(stem, 'initial_conv')
    stem = batch_norm_relu(
        stem, is_training_bn, params, data_format=data_format)

    stem = tf.layers.max_pooling2d(
        inputs=stem,
        pool_size=3,
        strides=2,
        padding='SAME',
        data_format=data_format)
    stem = tf.identity(stem, 'initial_max_pool')

    # Arguments that every block group receives unchanged.
    shared_kwargs = dict(
        block_fn=block_fn,
        is_training_bn=is_training_bn,
        params=params,
        data_format=data_format)

    c2 = block_group(
        inputs=stem,
        filters=64,
        blocks=layers[0],
        strides=1,
        name='block_group1',
        use_projection=False,
        **shared_kwargs)
    c3 = block_group(
        inputs=c2,
        filters=128,
        blocks=layers[1],
        strides=2,
        name='block_group2',
        **shared_kwargs)
    c4 = block_group(
        inputs=c3,
        filters=256,
        blocks=layers[2],
        strides=1,
        name='block_group3',
        **shared_kwargs)
    return c2, c3, c4

  return model
def resnet_v1(resnet_depth, params, data_format='channels_last'):
  """Returns the ResNet feature-extractor function for a given depth.

  Args:
    resnet_depth: `int` depth of the network; only 34 is supported.
    params: params of the model, a dict, forwarded to the generator.
    data_format: `str` either "channels_first" or "channels_last".

  Returns:
    The model function produced by `resnet_v1_generator`.

  Raises:
    ValueError: if `resnet_depth` is not a supported depth.
  """
  model_params = {
      34: {'block': residual_block, 'layers': [3, 4, 6, 3]}
  }
  if resnet_depth not in model_params:
    # Format one message string; passing the depth as a second exception
    # argument made the error render as a tuple.
    raise ValueError(
        'Not a valid resnet_depth: {} (supported: {})'.format(
            resnet_depth, sorted(model_params)))
  resnet_params = model_params[resnet_depth]
  return resnet_v1_generator(resnet_params['block'], resnet_params['layers'],
                             params, data_format)
def class_net(images, level, num_classes):
  """Class prediction head for one SSD feature level.

  Args:
    images: feature `Tensor` for this level.
    level: `int` key into `ssd_constants.NUM_DEFAULTS_BY_LEVEL`; also used in
      the layer name.
    num_classes: `int` number of classes predicted per default box.

  Returns:
    Output of a 3x3 'same' convolution with
    `num_classes * NUM_DEFAULTS_BY_LEVEL[level]` filters and no activation.
  """
  defaults_at_level = ssd_constants.NUM_DEFAULTS_BY_LEVEL[level]
  layer_name = 'class-%d' % (level)
  return tf.layers.conv2d(
      images,
      num_classes * defaults_at_level,
      kernel_size=(3, 3),
      padding='same',
      activation=None,
      name=layer_name)
def box_net(images, level):
  """Box regression head for one SSD feature level.

  Args:
    images: feature `Tensor` for this level.
    level: `int` key into `ssd_constants.NUM_DEFAULTS_BY_LEVEL`; also used in
      the layer name.

  Returns:
    Output of a 3x3 'same' convolution with
    `4 * NUM_DEFAULTS_BY_LEVEL[level]` filters and no activation.
  """
  defaults_at_level = ssd_constants.NUM_DEFAULTS_BY_LEVEL[level]
  layer_name = 'box-%d' % (level)
  return tf.layers.conv2d(
      images,
      4 * defaults_at_level,
      kernel_size=(3, 3),
      padding='same',
      activation=None,
      name=layer_name)
def ssd(features, params, is_training_bn=False):
  """SSD classification and regression model.

  Runs the ResNet backbone, stacks five extra 1x1 -> 3x3 convolution pairs on
  top of its last output, and attaches a class head and a box head to each
  feature level.

  Args:
    features: input image `Tensor` fed to the backbone.
    params: params of the model, a dict, forwarded to the backbone.
    is_training_bn: `bool` forwarded to the backbone's batch normalization.

  Returns:
    A tuple `(class_outputs, box_outputs)` of dicts keyed by level, covering
    `ssd_constants.MIN_LEVEL` through `ssd_constants.MAX_LEVEL` inclusive.
  """
  # One row per extra feature level:
  # (level, block id used in layer names, 1x1 filters, 3x3 filters,
  #  3x3 strides, 3x3 padding). Layer names must stay 'block<N>-conv1x1' and
  # 'block<N>-conv3x3' so that existing variable names are preserved.
  extra_layer_specs = (
      (4, 7, 256, 512, (2, 2), 'same'),
      (5, 8, 256, 512, (2, 2), 'same'),
      (6, 9, 128, 256, (2, 2), 'same'),
      (7, 10, 128, 256, (1, 1), 'valid'),
      (8, 11, 128, 256, (1, 1), 'valid'),
  )

  # upward layers
  with tf.variable_scope(
      'resnet%s' % ssd_constants.RESNET_DEPTH, reuse=tf.AUTO_REUSE):
    resnet_fn = resnet_v1(ssd_constants.RESNET_DEPTH, params)
    _, _, u4 = resnet_fn(features, is_training_bn)

  with tf.variable_scope('ssd', reuse=tf.AUTO_REUSE):
    feats = {3: u4}
    for (level, block_id, squeeze_filters, out_filters, strides,
         padding) in extra_layer_specs:
      squeezed = tf.layers.conv2d(
          feats[level - 1],
          filters=squeeze_filters,
          kernel_size=(1, 1),
          padding='same',
          activation=tf.nn.relu,
          name='block%d-conv1x1' % block_id)
      feats[level] = tf.layers.conv2d(
          squeezed,
          filters=out_filters,
          strides=strides,
          kernel_size=(3, 3),
          padding=padding,
          activation=tf.nn.relu,
          name='block%d-conv3x3' % block_id)

    class_outputs = {}
    box_outputs = {}
    min_level = ssd_constants.MIN_LEVEL
    max_level = ssd_constants.MAX_LEVEL
    num_classes = ssd_constants.NUM_CLASSES

    # NOTE(review): the head scopes are kept nested under the 'ssd' scope;
    # confirm against the variable names stored in existing checkpoints.
    with tf.variable_scope('class_net', reuse=tf.AUTO_REUSE):
      for level in range(min_level, max_level + 1):
        class_outputs[level] = class_net(
            feats[level], level, num_classes)

    with tf.variable_scope('box_net', reuse=tf.AUTO_REUSE):
      for level in range(min_level, max_level + 1):
        box_outputs[level] = box_net(
            feats[level], level)

  return class_outputs, box_outputs
@@ -0,0 +1,122 @@
# Copyright 2018 Google. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Central location for all constants related to MLPerf SSD."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# ==============================================================================
# == Model =====================================================================
# ==============================================================================
IMAGE_SIZE = 300
SPACE_TO_DEPTH_BLOCK_SIZE = 2
# TODO(taylorrobie): MLPerf uses 80, but COCO documents 90. (RetinaNet uses 90)
# Update(taylorrobie): Labels > 81 show up in the pipeline. This will need to
# be resolved.
NUM_CLASSES = 81 # Including "no class". Not all COCO classes are used.
# Note: Zero is special. (Background class) CLASS_INV_MAP[0] must be zero.
# CLASS_INV_MAP[i] is the original label id stored at contiguous index i.
CLASS_INV_MAP = (
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21,
22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 67, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87,
88, 89, 90)
# _MAP inverts CLASS_INV_MAP (original label id -> contiguous index).
_MAP = {j: i for i, j in enumerate(CLASS_INV_MAP)}
# CLASS_MAP is indexed by original label id and holds the contiguous index,
# or -1 for ids that do not appear in CLASS_INV_MAP.
CLASS_MAP = tuple(_MAP.get(i, -1) for i in range(max(CLASS_INV_MAP) + 1))
# Equals the sum over levels of FEATURE_SIZES[i]**2 * NUM_DEFAULTS[i] for the
# values defined below.
NUM_SSD_BOXES = 8732
RESNET_DEPTH = 34
"""SSD specific"""
MIN_LEVEL = 3
MAX_LEVEL = 8
FEATURE_SIZES = (38, 19, 10, 5, 3, 1)
STEPS = (8, 16, 32, 64, 100, 300)
# https://github.com/amdegroot/ssd.pytorch/blob/master/data/config.py
SCALES = (21, 45, 99, 153, 207, 261, 315)
ASPECT_RATIOS = ((2,), (2, 3), (2, 3), (2, 3), (2,), (2,))
NUM_DEFAULTS = (4, 6, 6, 6, 4, 4)
# Same counts as NUM_DEFAULTS, keyed by level (MIN_LEVEL..MAX_LEVEL).
NUM_DEFAULTS_BY_LEVEL = {3: 4, 4: 6, 5: 6, 6: 6, 7: 4, 8: 4}
SCALE_XY = 0.1
SCALE_HW = 0.2
BOX_CODER_SCALES = (1 / SCALE_XY, 1 / SCALE_XY, 1 / SCALE_HW, 1 / SCALE_HW)
MATCH_THRESHOLD = 0.5
# https://discuss.pytorch.org/t/how-to-preprocess-input-for-pre-trained-networks/683
NORMALIZATION_MEAN = (0.485, 0.456, 0.406)
NORMALIZATION_STD = (0.229, 0.224, 0.225)
# SSD Cropping
NUM_CROP_PASSES = 50
CROP_MIN_IOU_CHOICES = (0, 0.1, 0.3, 0.5, 0.7, 0.9)
# One "no crop" option in addition to the IoU choices above.
P_NO_CROP_PER_PASS = 1 / (len(CROP_MIN_IOU_CHOICES) + 1)
# Hard example mining
NEGS_PER_POSITIVE = 3
# Batch normalization
BATCH_NORM_DECAY = 0.9
BATCH_NORM_EPSILON = 1e-5
# ==============================================================================
# == Optimizer =================================================================
# ==============================================================================
BASE_LEARNING_RATE = 3.0e-3
FIRST_LR_DROP_STEP = 160000 # NOTE: this parameter has no effect.
SECOND_LR_DROP_STEP = 200000 # NOTE: this parameter has no effect.
MOMENTUM = 0.9
WEIGHT_DECAY = 5e-4
DEFAULT_BATCH_SIZE = 32.0
# ==============================================================================
# == Keys ======================================================================
# ==============================================================================
BOXES = "boxes"
CLASSES = "classes"
NUM_MATCHED_BOXES = "num_matched_boxes"
IMAGE = "image"
SOURCE_ID = "source_id"
RAW_SHAPE = "raw_shape"
IS_PADDED = "is_padded"
# ==============================================================================
# == Evaluation ================================================================
# ==============================================================================
# Note: This is based on a batch size of 32
# https://github.com/mlperf/reference/blob/master/single_stage_detector/ssd/train.py#L21-L37 # pylint: disable=line-too-long
EVAL_SAMPLES = 5000
CHECKPOINT_FREQUENCY = 5000
MAX_NUM_EVAL_BOXES = 200
OVERLAP_CRITERIA = 0.5 # Used for non-max suppression
MIN_SCORE = 0.05 # Minimum score to be considered during evaluation.
DUMMY_SCORE = -1e5 # If no boxes are matched.
# Eval step intervals starting from 0
# Previous setting, kept for reference (ten intervals of 24000 steps):
#EVAL_STEPS = (24000, 24000, 24000, 24000, 24000,24000, 24000, 24000, 24000, 24000)
EVAL_STEPS = (432000,)
# Target COCO/AP for mlperf.
EVAL_TARGET = 0.24
# For multiprocessing.
QUEUE_SIZE = 24
WORKER_COUNT = 10
@@ -0,0 +1,309 @@
# Copyright 2018 Google. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Training script for SSD.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import multiprocessing
import os
import sys
sys.path.append(os.path.join(os.path.abspath(os.path.dirname(__file__)),'../../../../utils/atlasboost'))
import threading
from absl import app
import numpy as np
import tensorflow as tf
from npu_bridge.estimator import npu_ops
from tensorflow.core.protobuf import rewriter_config_pb2
from npu_bridge.estimator.npu.npu_config import NPURunConfig
from npu_bridge.estimator.npu.npu_estimator import NPUEstimator
import coco_metric
import dataloader
import ssd_constants
import ssd_model
def get_rank_size():
  """Returns the `RANK_SIZE` environment variable parsed as an int.

  Raises:
    KeyError: if `RANK_SIZE` is not set in the environment.
    ValueError: if its value is not a valid integer literal.
  """
  raw_rank_size = os.environ['RANK_SIZE']
  return int(raw_rank_size)
from benchmark_log import hwlog
from benchmark_log.basic_utils import get_environment_info
from benchmark_log.basic_utils import get_model_parameter
# Command-line flags of the SSD training / evaluation script.

# Directory handed to the estimator as model_dir; eval summaries are written
# to its 'eval' sub-directory.
tf.flags.DEFINE_string('model_dir', None, 'Location of model_dir')
# Optional backbone checkpoint; the empty default means "no checkpoint".
tf.flags.DEFINE_string('resnet_checkpoint', '',
'Location of the ResNet checkpoint to use for model '
'initialization.')
# Batch sizes stored as params['batch_size'] for the train / eval estimators.
tf.flags.DEFINE_integer('train_batch_size', 64, 'training batch size')
tf.flags.DEFINE_integer('eval_batch_size', 1, 'evaluation batch size')
tf.flags.DEFINE_integer('eval_samples', 5000, 'The number of samples for '
'evaluation.')
# Input data locations. main() rejects a run whose mode needs one of these
# while it is still None.
tf.flags.DEFINE_string(
'training_file_pattern', None,
'Glob for training data files (e.g., COCO train - minival set)')
tf.flags.DEFINE_string(
'validation_file_pattern', None,
'Glob for evaluation tfrecords (e.g., COCO val2017 set)')
tf.flags.DEFINE_string(
'val_json_file',
None,
'COCO validation JSON containing golden bounding boxes.')
# Together with the batch size and RANK_SIZE these determine the number of
# training steps in 'train' mode.
tf.flags.DEFINE_integer('num_examples_per_epoch', 120000,
'Number of examples in one epoch')
tf.flags.DEFINE_float('num_epochs', 58, 'Number of epochs for training')
tf.flags.DEFINE_string('mode', 'train_and_eval',
'Mode to run: train_and_eval, train, eval')
tf.flags.DEFINE_integer(
'keep_checkpoint_max', 32,
'Maximum number of checkpoints to keep.')
# Flag values object; the flags are parsed when app.run() starts main().
FLAGS = tf.flags.FLAGS
# Set to True by coco_eval() once 'COCO/AP' reaches ssd_constants.EVAL_TARGET;
# main() reads it to leave the train_and_eval loop.
SUCCESS = False
def construct_run_config():
    """Build the NPU run config and the model parameter dictionary.

    Returns:
      A ``(run_config, params)`` tuple. ``run_config`` is an ``NPURunConfig``
      built from the command-line flags and
      ``ssd_constants.CHECKPOINT_FREQUENCY``; ``params`` is a dict holding the
      default hyperparameters overlaid with the flag-derived entries.
    """
    # Start from the model defaults, then overlay the flag-controlled values.
    params = dict(ssd_model.default_hparams().values())
    params.update(
        num_examples_per_epoch=FLAGS.num_examples_per_epoch,
        resnet_checkpoint=FLAGS.resnet_checkpoint,
        val_json_file=FLAGS.val_json_file,
        mode=FLAGS.mode,
        model_dir=FLAGS.model_dir,
        eval_samples=FLAGS.eval_samples,
    )

    run_config = NPURunConfig(
        model_dir=FLAGS.model_dir,
        session_config=tf.ConfigProto(),
        keep_checkpoint_max=FLAGS.keep_checkpoint_max,
        save_checkpoints_steps=ssd_constants.CHECKPOINT_FREQUENCY,
        enable_data_pre_proc=True,
        save_summary_steps=100,
        iterations_per_loop=100,
        precision_mode='allow_mix_precision')
    return run_config, params
def coco_eval(predictions,
              current_step,
              summary_writer,
              coco_gt,
              use_cpp_extension=True,
              nms_on_tpu=True):
    """Compute COCO metrics for `predictions` and write them as summaries.

    Args:
      predictions: the predictions handed to ``coco_metric.compute_map``.
      current_step: step value attached to the written summary.
      summary_writer: writer whose ``add_summary`` receives the metrics.
      coco_gt: ground-truth object handed to ``coco_metric.compute_map``.
      use_cpp_extension: forwarded to ``coco_metric.compute_map``.
      nms_on_tpu: forwarded to ``coco_metric.compute_map``.

    Side effects:
      Sets the module-level ``SUCCESS`` flag to True the first time the
      'COCO/AP' metric reaches ``ssd_constants.EVAL_TARGET``, and logs the
      metrics through ``tf.logging`` and ``hwlog``.
    """
    global SUCCESS
    eval_results = coco_metric.compute_map(
        predictions,
        coco_gt,
        use_cpp_extension=use_cpp_extension,
        nms_on_tpu=nms_on_tpu)

    target_reached = eval_results['COCO/AP'] >= ssd_constants.EVAL_TARGET
    if target_reached and not SUCCESS:
        SUCCESS = True

    tf.logging.info('Eval results: %s' % eval_results)
    hwlog.remark_print(key=hwlog.EVAL_RESULTS, value=eval_results)

    # Emit one scalar summary value per metric for this checkpoint.
    with tf.Graph().as_default():
        metric_values = [
            tf.Summary.Value(tag=name, simple_value=eval_results[name])
            for name in eval_results
        ]
        summary_writer.add_summary(
            tf.Summary(value=metric_values), current_step)
def init_npu():
    """Create the session and op needed to initialise the NPU manually.

    Returns:
      A tuple ``(init_sess, npu_init)``: a ``tf.Session`` configured with the
      "NpuOptimizer" custom optimizer, and the op returned by
      ``npu_ops.initialize_system()``. The caller is expected to run the op in
      the session.
    """
    npu_init = npu_ops.initialize_system()

    session_config = tf.ConfigProto()
    rewrite_options = session_config.graph_options.rewrite_options
    # Turn the grappler remapping pass off and register the NPU optimizer.
    rewrite_options.remapping = rewriter_config_pb2.RewriterConfig.OFF
    npu_optimizer = rewrite_options.custom_optimizers.add()
    npu_optimizer.name = "NpuOptimizer"
    npu_optimizer.parameter_map["use_off_line"].b = True

    return tf.Session(config=session_config), npu_init
def _validate_flags():
    """Check that --mode is known and that the flags it needs are set.

    Raises:
      ValueError: if --mode is not one of train, train_and_eval, eval.
      RuntimeError: if a data flag required by the selected mode is missing.
    """
    if FLAGS.mode not in ('train', 'train_and_eval', 'eval'):
        raise ValueError(
            'Unknown --mode %r; expected train_and_eval, train or eval.' %
            FLAGS.mode)
    if (FLAGS.mode in ('train', 'train_and_eval')
            and FLAGS.training_file_pattern is None):
        raise RuntimeError(
            'You must specify --training_file_pattern for training.')
    if FLAGS.mode in ('train_and_eval', 'eval'):
        if FLAGS.validation_file_pattern is None:
            raise RuntimeError('You must specify --validation_file_pattern '
                               'for evaluation.')
        if FLAGS.val_json_file is None:
            raise RuntimeError(
                'You must specify --val_json_file for evaluation.')


def _build_estimator(run_config, params, batch_size):
    """Return an NPUEstimator for the SSD model with `batch_size` in its params."""
    estimator_params = dict(params)
    estimator_params['batch_size'] = batch_size
    return NPUEstimator(
        model_fn=ssd_model.ssd_model_fn,
        model_dir=FLAGS.model_dir,
        config=run_config,
        params=estimator_params)


def _train(run_config, params, steps):
    """Train the SSD model for `steps` steps on --training_file_pattern."""
    hwlog.remark_print(key=hwlog.CURRENT_EPOCH,
                       value=params['num_examples_per_epoch'])
    train_estimator = _build_estimator(
        run_config, params, FLAGS.train_batch_size)
    tf.logging.info(params)
    train_estimator.train(
        input_fn=dataloader.SSDInputReader(
            FLAGS.training_file_pattern,
            params['transpose_input'],
            is_training=True),
        steps=steps)


def _predict(run_config, params, checkpoint_path=None):
    """Run prediction over --validation_file_pattern and return the results as a list."""
    eval_estimator = _build_estimator(
        run_config, params, FLAGS.eval_batch_size)
    return list(
        eval_estimator.predict(
            checkpoint_path=checkpoint_path,
            input_fn=dataloader.SSDInputReader(
                FLAGS.validation_file_pattern,
                is_training=False)))


def _train_and_eval(params):
    """Alternate training and COCO evaluation over ssd_constants.EVAL_STEPS."""
    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    # Summary writer writes out eval metrics.
    summary_writer = tf.summary.FileWriter(output_dir)
    try:
        current_step = 0
        coco_gt = coco_metric.create_coco(
            FLAGS.val_json_file, use_cpp_extension=params['use_cocoeval_cc'])
        for eval_step in ssd_constants.EVAL_STEPS:
            # Compute the actual number of steps for this cycle from the
            # actual train_batch_size and the rank size.
            steps = int(eval_step / get_rank_size() *
                        ssd_constants.DEFAULT_BATCH_SIZE /
                        FLAGS.train_batch_size)
            print('###################################', steps)
            tf.logging.info('Starting training cycle for %d steps.' % steps)
            run_config, params = construct_run_config()
            _train(run_config, params, steps)
            # SUCCESS is set by coco_eval() at the end of a previous cycle.
            # NOTE(review): checking it here means one more training cycle
            # runs after the target is reached -- kept as in the original.
            if SUCCESS:
                break
            current_step = current_step + steps
            tf.logging.info(
                'Starting evaluation cycle at step %d.' % current_step)
            # Run evaluation at the given step.
            predictions = _predict(run_config, params)
            coco_eval(predictions, current_step, summary_writer, coco_gt,
                      params['use_cocoeval_cc'], False)
    finally:
        # Close the writer even when training or evaluation fails.
        summary_writer.close()


def _eval_latest_checkpoint(run_config, params):
    """Evaluate the newest checkpoint found in --model_dir.

    Raises:
      RuntimeError: if --model_dir contains no checkpoint.
    """
    ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
    if ckpt is None:
        # Fail before the (long) prediction pass instead of crashing later
        # while parsing the step out of a missing checkpoint path.
        raise RuntimeError(
            'No checkpoint found in --model_dir %r; nothing to evaluate.' %
            FLAGS.model_dir)
    coco_gt = coco_metric.create_coco(
        FLAGS.val_json_file, use_cpp_extension=params['use_cocoeval_cc'])
    output_dir = os.path.join(FLAGS.model_dir, 'eval')
    tf.gfile.MakeDirs(output_dir)
    # Summary writer writes out eval metrics.
    summary_writer = tf.summary.FileWriter(output_dir)
    try:
        tf.logging.info('Starting to evaluate on newest checkpoint.')
        predictions = _predict(run_config, params, checkpoint_path=ckpt)
        tf.logging.info('Starting to cal coco ap.')
        # The step is the suffix after the last '-' of the checkpoint name.
        current_step = int(os.path.basename(ckpt).rsplit('-', 1)[-1])
        coco_eval(predictions, current_step, summary_writer, coco_gt,
                  params['use_cocoeval_cc'], False)
        tf.logging.info('end to evaluate.')
    finally:
        summary_writer.close()


def main(argv):
    """Entry point: run training and/or evaluation according to --mode.

    Validates the flags, initialises the NPU, dispatches on ``FLAGS.mode``
    ('train', 'train_and_eval' or 'eval'), then shuts the NPU down and closes
    the init session -- also when the selected mode raises.

    Args:
      argv: unused positional command-line arguments passed by ``app.run``.

    Raises:
      ValueError: if --mode is not a supported value.
      RuntimeError: if a required data flag is missing, or if 'eval' mode
        finds no checkpoint in --model_dir.
    """
    del argv  # Unused.
    # Reject bad flags before touching the device.
    _validate_flags()

    init_sess, npu_init = init_npu()
    try:
        init_sess.run(npu_init)
        try:
            run_config, params = construct_run_config()
            if FLAGS.mode == 'train':
                steps = int((FLAGS.num_epochs * FLAGS.num_examples_per_epoch) /
                            FLAGS.train_batch_size / get_rank_size())
                _train(run_config, params, steps)
            elif FLAGS.mode == 'train_and_eval':
                _train_and_eval(params)
            else:  # 'eval'; any other value was rejected by _validate_flags.
                _eval_latest_checkpoint(run_config, params)
        finally:
            # Only reached once the NPU was initialised successfully.
            init_sess.run(npu_ops.shutdown_system())
    finally:
        init_sess.close()
if __name__ == '__main__':
    # Log the environment and the static run description, then start main().
    hwlog.ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
    (cpu_info, npu_info, framework_info, os_info,
     benchmark_version) = get_environment_info("tensorflow")
    config_info = get_model_parameter("tensorflow_config")
    initial_data = {
        "base_lr": 0.01,
        "dataset": "imagenet1024",
        "optimizer": "SGD",
        "loss_scale": 512,
    }
    # NOTE(review): "batchsize" is not a key of initial_data, so the
    # INPUT_BATCH_SIZE entry below is logged as None -- confirm the intended
    # value before changing it.
    startup_records = (
        (hwlog.CPU_INFO, cpu_info),
        (hwlog.NPU_INFO, npu_info),
        (hwlog.OS_INFO, os_info),
        (hwlog.FRAMEWORK_INFO, framework_info),
        (hwlog.BENCHMARK_VERSION, benchmark_version),
        (hwlog.CONFIG_INFO, config_info),
        (hwlog.BASE_LR, initial_data.get("base_lr")),
        (hwlog.DATASET, initial_data.get("dataset")),
        (hwlog.OPT_NAME, initial_data.get("optimizer")),
        (hwlog.LOSS_SCALE, initial_data.get("loss_scale")),
        (hwlog.INPUT_BATCH_SIZE, initial_data.get("batchsize")),
    )
    for record_key, record_value in startup_records:
        hwlog.remark_print(key=record_key, value=record_value)
    tf.logging.set_verbosity(tf.logging.INFO)
    app.run(main)
@@ -0,0 +1,500 @@
# Copyright 2018 Google. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Model definition for the SSD model.
Defines model_fn of SSD for TF Estimator. The model_fn includes SSD
model architecture, loss function, learning rate schedule, and evaluation
procedure.
T.-Y. Lin, P. Goyal, R. Girshick, K. He, and P. Dollar
Focal Loss for Dense Object Detection. arXiv:1708.02002
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import tensorflow as tf
from object_detection import box_coder
from object_detection import box_list
from object_detection import faster_rcnn_box_coder
from tensorflow.python.estimator import model_fn as model_fn_lib
import dataloader
import ssd_architecture
import ssd_constants
from benchmark_log import hwlog
from benchmark_log.basic_utils import get_environment_info
from benchmark_log.basic_utils import get_model_parameter
def get_rank_size():
    """Return the ``RANK_SIZE`` environment variable parsed as an int.

    Raises:
      KeyError: if ``RANK_SIZE`` is not set in the environment.
      ValueError: if its value cannot be parsed as an integer.
    """
    rank_size_text = os.environ['RANK_SIZE']
    return int(rank_size_text)
def select_top_k_scores(scores_in, pre_nms_num_detections=5000):
    """Pick the highest scores, and their anchor indices, for every class.

    Args:
      scores_in: a Tensor with shape [batch_size, N, num_classes] stacking the
        class outputs of all feature levels, where N is the total number of
        anchors and num_classes the number of predicted classes.
      pre_nms_num_detections: number of candidates to keep per class before
        NMS.

    Returns:
      A ``(scores, indices)`` pair of Tensors, each with shape
      [batch_size, pre_nms_num_detections, num_classes].
    """
    # top_k works on the last axis, so move the anchor axis there first.
    swap_last_two = [0, 2, 1]
    anchors_last = tf.transpose(scores_in, perm=swap_last_two)
    best_scores, best_indices = tf.nn.top_k(
        anchors_last, k=pre_nms_num_detections, sorted=True)
    scores_out = tf.transpose(best_scores, swap_last_two)
    indices_out = tf.transpose(best_indices, swap_last_two)
    return scores_out, indices_out
def concat_outputs(cls_outputs, box_outputs):
    """Flatten the per-level predictions and join them along the anchor axis.

    Takes the dicts of class and box prediction tensors and turns each into a
    single tensor that can be compared with the ground-truth class labels and
    boxes.

    Args:
      cls_outputs: an OrderedDict keyed by level whose values are logits with
        shape [batch_size, height, width, num_anchors * num_classes].
      box_outputs: an OrderedDict keyed by level whose values are box
        regression outputs with shape
        [batch_size, height, width, num_anchors * 4].

    Returns:
      A ``(cls, box)`` pair: the concatenated class outputs with last
      dimension ``ssd_constants.NUM_CLASSES`` and the concatenated box outputs
      with last dimension 4, both joined on axis 1.
    """
    assert set(cls_outputs.keys()) == set(box_outputs.keys())

    # This sort matters. The labels assume a certain order based on
    # ssd_constants.FEATURE_SIZES, and this sort matches that convention.
    keys = sorted(cls_outputs.keys())
    batch_size = int(cls_outputs[keys[0]].shape[0])

    def _flatten_level(level_output, scale, num_defaults, last_dim):
        """Reshape one level to [batch, scale^2 * num_defaults, last_dim]."""
        assert level_output.shape[3] == num_defaults * last_dim
        # e.g. (32, 38, 38, 4, 81) for classes, (32, 19, 19, 6, 4) for boxes.
        per_cell_shape = (batch_size, scale, scale, num_defaults, last_dim)
        # e.g. (32, 38^2 * 4, 81) for classes, (32, 19^2 * 6, 4) for boxes.
        flat_shape = (batch_size, scale ** 2 * num_defaults, last_dim)
        # TODO(taylorrobie): confirm that this reshape, transpose,
        # reshape is correct.
        return tf.reshape(
            tf.reshape(level_output, per_cell_shape), flat_shape)

    flat_cls = []
    flat_box = []
    for level_index, key in enumerate(keys):
        # Spatial size of this feature level (the original note lists
        # 38, 19, 10, 5, 3, 1) and its number of default boxes per cell.
        scale = ssd_constants.FEATURE_SIZES[level_index]
        num_defaults = ssd_constants.NUM_DEFAULTS[level_index]
        flat_cls.append(_flatten_level(
            cls_outputs[key], scale, num_defaults, ssd_constants.NUM_CLASSES))
        flat_box.append(_flatten_level(
            box_outputs[key], scale, num_defaults, 4))

    return tf.concat(flat_cls, axis=1), tf.concat(flat_box, axis=1)
def _localization_loss(pred_locs, gt_locs, gt_labels, num_matched_boxes):
    """Compute the box regression loss with a Huber (smooth L1) penalty.

    Only anchors whose ground-truth label is greater than 0 contribute.

    Args:
      pred_locs: a dict from index to tensor of predicted locations. The shape
        of each tensor is [batch_size, num_anchors, 4].
      gt_locs: a list of tensors representing box regression targets in
        [batch_size, num_anchors, 4].
      gt_labels: a list of tensors that represents the classification
        groundtruth targets. The shape is [batch_size, num_anchors, 1].
      num_matched_boxes: the number of anchors that are matched to groundtruth
        targets, used as the loss normalizer. The shape is [batch_size].

    Returns:
      box_loss: a float32 representing total box regression loss.
    """
    total_box_loss = 0
    for idx, key in enumerate(sorted(pred_locs.keys())):
        level_labels = gt_labels[idx]
        level_targets = gt_locs[idx]
        level_preds = tf.reshape(pred_locs[key], level_targets.shape)

        positive_mask = tf.cast(tf.greater(level_labels, 0), tf.float32)
        # Sum the element-wise Huber loss over the box coordinates.
        per_anchor_loss = tf.reduce_sum(
            tf.losses.huber_loss(
                level_targets, level_preds,
                reduction=tf.losses.Reduction.NONE),
            axis=-1)
        masked_loss = tf.multiply(per_anchor_loss, positive_mask)

        # Reduce over every axis except the batch axis.
        non_batch_axes = list(range(1, masked_loss.shape.ndims))
        total_box_loss = total_box_loss + tf.reduce_sum(
            masked_loss, axis=non_batch_axes)

    # TODO(taylorrobie): Confirm that normalizing by the number of boxes
    # matches reference
    return tf.reduce_mean(total_box_loss / num_matched_boxes)
@tf.custom_gradient
def _softmax_cross_entropy(logits, label):
    """Softmax cross entropy over the last axis with a hand-written gradient.

    Args:
      logits: class logits; the class axis is the last one.
      label: integer class ids, one-hot encoded here with depth
        ``ssd_constants.NUM_CLASSES``.

    Returns:
      The ``(loss, grad)`` pair expected by ``tf.custom_gradient``.
    """
    # Subtract the per-row maximum before exponentiating.
    row_max = tf.expand_dims(tf.reduce_max(logits, -1), -1)
    centered_logits = logits - row_max
    exp_centered = tf.math.exp(centered_logits)
    normalizer = tf.reduce_sum(exp_centered, -1)
    log_normalizer = tf.math.log(normalizer)

    target_one_hot = tf.one_hot(label, ssd_constants.NUM_CLASSES)
    target_logit = tf.reduce_sum(centered_logits * target_one_hot, -1)
    loss = log_normalizer - target_logit

    def grad(dy):
        # d(loss)/d(logits) = softmax(logits) - one_hot(label), scaled by dy.
        softmax = exp_centered / tf.expand_dims(normalizer, -1)
        logits_grad = (softmax - target_one_hot) * tf.expand_dims(dy, -1)
        # The second entry fills the gradient slot of `label`; `dy` is passed
        # through unchanged there.
        return logits_grad, dy

    return loss, grad
def _classification_loss(pred_labels, gt_labels, num_matched_boxes):
    """Compute the classification loss with hard negative mining.

    Args:
      pred_labels: a dict from index to tensor of predicted class. The shape
        of the tensor is [batch_size, num_anchors, num_classes].
      gt_labels: a list of tensor that represents the classification
        groundtruth targets. The shape is [batch_size, num_anchors, 1].
      num_matched_boxes: the number of anchors that are matched to groundtruth
        targets. This is used as the loss normalizer.

    Returns:
      class_loss: a float32 representing total classification loss.
    """
    keys = sorted(pred_labels.keys())
    batch_size = gt_labels[0].shape[0]

    per_level_ce = []
    for idx, key in enumerate(keys):
        level_gt = gt_labels[idx]
        logits_shape = (
            level_gt.get_shape().as_list() + [ssd_constants.NUM_CLASSES])
        level_logits = tf.reshape(pred_labels[key], logits_shape)
        level_ce = _softmax_cross_entropy(level_logits, level_gt)
        per_level_ce.append(tf.reshape(level_ce, [batch_size, -1]))

    cross_entropy = tf.concat(per_level_ce, 1)
    flat_gt = tf.concat(
        [tf.reshape(level, [batch_size, -1]) for level in gt_labels], 1)
    positive_mask = tf.cast(tf.greater(flat_gt, 0), tf.float32)

    # Hard example mining: rank the cross entropy of the non-positive anchors.
    negative_ce = cross_entropy * (1 - positive_mask)

    # The descending ranking is assembled from three top_k calls
    # (4096 + 4096 + 540 = 8732 values); each later call only sees the entries
    # strictly below the smallest value picked by the previous one.
    # NOTE(review): presumably split up to keep each top_k call small --
    # confirm the reason for the 4096 chunk size.
    first_chunk, _ = tf.math.top_k(negative_ce, k=4096)
    first_floor = tf.reduce_min(first_chunk, 1, keepdims=True)
    below_first = tf.cast(tf.less(negative_ce, first_floor), tf.float32)

    second_chunk, _ = tf.math.top_k(
        tf.multiply(negative_ce, below_first), k=4096)
    second_floor = tf.reduce_min(second_chunk, 1, keepdims=True)
    below_second = tf.cast(tf.less(negative_ce, second_floor), tf.float32)

    third_chunk, _ = tf.math.top_k(
        tf.multiply(negative_ce, below_second), k=540)
    ranked_negative_ce = tf.concat(
        [first_chunk, second_chunk, third_chunk], axis=1)

    # Index of the cut-off value, clipped to the last valid position (8731).
    num_neg_boxes = tf.minimum(
        tf.to_int32(num_matched_boxes) * ssd_constants.NEGS_PER_POSITIVE,
        8731)
    cutoff_ce = tf.batch_gather(
        ranked_negative_ce, num_neg_boxes[:, tf.newaxis])
    hard_negative_mask = tf.cast(
        tf.greater_equal(negative_ce, cutoff_ce), tf.float32)

    class_loss = tf.reduce_sum(
        tf.multiply(cross_entropy, positive_mask + hard_negative_mask),
        axis=1)

    # TODO(taylorrobie): Confirm that normalizing by the number of boxes
    # matches reference
    return tf.reduce_mean(class_loss / num_matched_boxes)
def detection_loss(cls_outputs, box_outputs, labels):
    """Compute the total detection loss (class loss plus box loss).

    Args:
      cls_outputs: an OrderedDict with keys representing levels and values
        representing the class logits of that level.
      box_outputs: an OrderedDict with keys representing levels and values
        representing box regression outputs in
        [batch_size, height, width, num_anchors * 4].
      labels: the dictionary returned from the dataloader that holds the
        groundtruth targets under ``ssd_constants.BOXES``,
        ``ssd_constants.CLASSES`` and ``ssd_constants.NUM_MATCHED_BOXES``.

    Returns:
      total_loss: a float32 representing the sum of class and box losses.
      cls_loss: a float32 representing total class loss.
      box_loss: a float32 representing total box regression loss.
    """
    boxes_entry = labels[ssd_constants.BOXES]
    if isinstance(boxes_entry, dict):
        # Per-level targets: use the dict values in their stored order.
        gt_boxes = list(boxes_entry.values())
        gt_classes = list(labels[ssd_constants.CLASSES].values())
    else:
        gt_boxes = [boxes_entry]
        gt_classes = [labels[ssd_constants.CLASSES]]

    # Both losses work on a single flattened level named 'flatten'.
    flat_cls, flat_box = concat_outputs(cls_outputs, box_outputs)
    num_matched = labels[ssd_constants.NUM_MATCHED_BOXES]

    box_loss = _localization_loss(
        {'flatten': flat_box}, gt_boxes, gt_classes, num_matched)
    class_loss = _classification_loss(
        {'flatten': flat_cls}, gt_classes, num_matched)
    return class_loss + box_loss, class_loss, box_loss
def update_learning_rate_schedule_parameters(params):
    """Convert the epoch-based schedule settings in `params` to step counts.

    Writes ``params['lr_warmup_step']`` and ``params['cos_decay_step']`` in
    place.

    Args:
      params: a parameter dictionary that includes batch_size,
        num_examples_per_epoch, lr_warmup_epoch and cos_decay_epoch.
    """
    batch_size = params['batch_size']
    # Steps one rank runs per epoch: examples / batch size, floor-divided by
    # the rank size.
    steps_per_epoch = (
        params['num_examples_per_epoch'] / batch_size // get_rank_size())

    epoch_to_step_keys = (
        ('lr_warmup_epoch', 'lr_warmup_step'),
        ('cos_decay_epoch', 'cos_decay_step'),
    )
    for epoch_key, step_key in epoch_to_step_keys:
        params[step_key] = int(params[epoch_key] * steps_per_epoch)
def learning_rate_schedule(params, global_step):
    """Build the learning rate: linear warmup followed by cosine decay.

    The base learning rate is scaled by the global batch size
    (rank size * batch size) relative to ``ssd_constants.DEFAULT_BATCH_SIZE``.

    Args:
      params: A dictionary that defines hyperparameters of model; reads
        base_learning_rate, lr_warmup_step, cos_decay_step and batch_size.
      global_step: A tensor representing current global step.

    Returns:
      A tensor representing current learning rate.
    """
    warmup_steps = params['lr_warmup_step']
    decay_steps = params['cos_decay_step']

    batch_scale = (get_rank_size() * params['batch_size'] /
                   ssd_constants.DEFAULT_BATCH_SIZE)
    peak_learning_rate = params['base_learning_rate'] * batch_scale

    # Linear ramp from 0 to the peak rate over the warmup steps.
    warmup_rate = (tf.cast(global_step, dtype=tf.float32) /
                   warmup_steps) * peak_learning_rate
    in_warmup = global_step < warmup_steps
    decayed_rate = tf.train.cosine_decay(
        peak_learning_rate, global_step, decay_steps, alpha=0.01)
    return tf.where(in_warmup, warmup_rate, decayed_rate)
class ExamplesPerSecondHook(tf.train.SessionRunHook):
    """Session hook that periodically logs throughput, learning rate and loss.

    Throughput is logged as 'FPS' through ``tf.logging`` and ``hwlog`` and is
    computed as steps per second times `batch_size`.
    """

    def __init__(
            self,
            batch_size,
            lr=0,
            loss=0,
            every_n_steps=100,
            every_n_secs=None,):
        """Create the hook.

        Args:
          batch_size: number of examples counted for every global step.
          lr: fetch evaluated on every run and logged as the learning rate.
          loss: fetch evaluated on every run and logged as the loss.
          every_n_steps: log once every this many steps.
          every_n_secs: log once every this many seconds.

        Raises:
          ValueError: unless exactly one of every_n_steps / every_n_secs is
            given.
        """
        steps_given = every_n_steps is not None
        secs_given = every_n_secs is not None
        if steps_given == secs_given:
            raise ValueError('exactly one of every_n_steps'
                             ' and every_n_secs should be provided.')
        self._timer = tf.train.SecondOrStepTimer(
            every_steps=every_n_steps, every_secs=every_n_secs)
        self._step_train_time = 0
        self._total_steps = 0
        self._batch_size = batch_size
        self._lr = lr
        self._loss = loss

    def begin(self):
        """Look up the global step tensor; it must already exist."""
        self._global_step_tensor = tf.compat.v1.train.get_global_step()
        if self._global_step_tensor is None:
            raise RuntimeError(
                'Global step should be created to use StepCounterHook.')

    def before_run(self, run_context):  # pylint: disable=unused-argument
        """Ask the session to also fetch global step, learning rate and loss."""
        fetches = [self._global_step_tensor, self._lr, self._loss]
        return tf.train.SessionRunArgs(fetches)

    def after_run(self, run_context, run_values):
        """Log throughput whenever the timer triggers for the current step."""
        del run_context  # Unused.
        global_step, lr, loss = run_values.results
        if not self._timer.should_trigger_for_step(global_step):
            return
        elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(
            global_step)
        if elapsed_time is None:
            # Nothing to report when the timer returns no elapsed time.
            return
        steps_per_sec = elapsed_steps / elapsed_time
        self._step_train_time += elapsed_time
        self._total_steps += elapsed_steps
        current_examples_per_sec = steps_per_sec * self._batch_size
        tf.logging.info('%s: %g, %s: %s, %s: %s', 'FPS', current_examples_per_sec, 'learning rate', lr, 'loss', loss)
        hwlog.remark_print(
            key=hwlog.FPS, value='%7.1f' % current_examples_per_sec)
def _model_fn(features, labels, mode, params, model):
    """Model definition for the SSD model based on ResNet-50.

    Args:
      features: the input image tensor with shape
        [batch_size, height, width, 3]. The height and width are fixed and
        equal. In PREDICT mode this is instead a dict that holds the image
        under the 'image' key; the remaining entries are passed through to
        the predictions.
      labels: the input labels in a dictionary. The labels include class
        targets and box targets which are dense label maps. The labels are
        generated from get_input_fn function in data/dataloader.py
      mode: the estimator mode: TRAIN, EVAL or PREDICT.
      params: the dictionary defines hyperparameters of model. The default
        settings are in default_hparams function in this file. The learning
        rate schedule step counts are written into it.
      model: the SSD model outputs class logits and box regression outputs.

    Returns:
      spec: the EstimatorSpec to run training or prediction.

    Raises:
      NotImplementedError: if `mode` is EVAL.
    """
    if mode == tf.estimator.ModeKeys.PREDICT:
        # Everything arrives in `features`; split the image from the rest.
        labels = features
        features = labels.pop('image')

    features -= tf.constant(
        ssd_constants.NORMALIZATION_MEAN, shape=[1, 1, 3],
        dtype=features.dtype)
    features /= tf.constant(
        ssd_constants.NORMALIZATION_STD, shape=[1, 1, 3],
        dtype=features.dtype)

    def _model_outputs():
        return model(
            features, params,
            is_training_bn=(mode == tf.estimator.ModeKeys.TRAIN))

    cls_outputs, box_outputs = _model_outputs()

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        flattened_cls, flattened_box = concat_outputs(cls_outputs, box_outputs)
        ssd_box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
            scale_factors=ssd_constants.BOX_CODER_SCALES)

        anchors = box_list.BoxList(
            tf.convert_to_tensor(dataloader.DefaultBoxes()('ltrb')))

        decoded_boxes = box_coder.batch_decode(
            encoded_boxes=flattened_box, box_coder=ssd_box_coder,
            anchors=anchors)

        pred_scores = tf.nn.softmax(flattened_cls, axis=2)
        pred_scores, indices = select_top_k_scores(
            pred_scores, ssd_constants.MAX_NUM_EVAL_BOXES)

        predictions = dict(
            labels,
            indices=indices,
            pred_scores=pred_scores,
            pred_box=decoded_boxes,
        )
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Load pretrained model from checkpoint.
    if params['resnet_checkpoint'] and mode == tf.estimator.ModeKeys.TRAIN:

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            tf.train.init_from_checkpoint(params['resnet_checkpoint'], {
                '/': 'resnet%s/' % ssd_constants.RESNET_DEPTH,
            })
            return tf.train.Scaffold()
    else:
        scaffold_fn = None

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)
    # cls_loss and box_loss are for logging. only total_loss is optimized.
    total_loss, cls_loss, box_loss = detection_loss(
        cls_outputs, box_outputs, labels)

    total_loss += params['weight_decay'] * tf.add_n(
        [tf.nn.l2_loss(v) for v in tf.trainable_variables()])

    if mode == tf.estimator.ModeKeys.TRAIN:
        total_loss_t = tf.reduce_mean(tf.reshape(total_loss, [1]))
        cls_loss_t = tf.reduce_mean(tf.reshape(cls_loss, [1]))
        box_loss_t = tf.reduce_mean(tf.reshape(box_loss, [1]))
        learning_rate_t = tf.reduce_mean(tf.reshape(learning_rate, [1]))
        tf.summary.scalar('total_loss', total_loss_t)
        tf.summary.scalar('cls_loss_t', cls_loss_t)
        tf.summary.scalar('box_loss_t', box_loss_t)
        tf.summary.scalar('learning_rate_t', learning_rate_t)

        optimizer = tf.train.MomentumOptimizer(
            learning_rate, momentum=ssd_constants.MOMENTUM)
        from npu_bridge.estimator.npu.npu_optimizer import NPUDistributedOptimizer
        # Wrap the optimizer for NPU distributed training (gradient updates).
        optimizer = NPUDistributedOptimizer(optimizer)

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        # Report throughput for the batch size actually in use (rank size
        # times per-rank batch) rather than a fixed 32 per rank.
        examples_sec_hook = ExamplesPerSecondHook(
            get_rank_size() * params['batch_size'], learning_rate, total_loss)
        train_op = tf.group(optimizer.minimize(total_loss, global_step),
                            update_ops)
        # scaffold_fn is None when no ResNet checkpoint was given; calling it
        # unconditionally would raise TypeError, so fall back to no scaffold.
        scaffold = scaffold_fn() if scaffold_fn is not None else None
        return model_fn_lib.EstimatorSpec(
            mode=mode, loss=total_loss, train_op=train_op, scaffold=scaffold,
            training_hooks=[examples_sec_hook])

    if mode == tf.estimator.ModeKeys.EVAL:
        raise NotImplementedError
def ssd_model_fn(features, labels, mode, params):
    """Estimator model_fn for SSD; delegates to _model_fn with the SSD network."""
    ssd_network = ssd_architecture.ssd
    return _model_fn(features, labels, mode, params, model=ssd_network)
def default_hparams():
    """Return the default SSD hyperparameters as a ``tf.contrib.training.HParams``."""
    # TODO(taylorrobie): replace params usages with global constants.
    defaults = dict(
        num_examples_per_epoch=120000,
        lr_warmup_epoch=0.8,
        cos_decay_epoch=106,
        weight_decay=ssd_constants.WEIGHT_DECAY,
        base_learning_rate=ssd_constants.BASE_LEARNING_RATE,
        eval_every_checkpoint=False,
        transpose_input=False,
        use_cocoeval_cc=False,
    )
    return tf.contrib.training.HParams(**defaults)