# Files
# 2020-10-14 08:55:07 +08:00
#
# 703 lines
# 25 KiB
# Python

from __future__ import division
import tensorflow as tf
import numpy as np
from onnx_tf.common import pooling_helper
from onnx_tf.common.tf_helper import tf_shape
from onnx_tf.common.tf_helper import tf_product
class DilatedPooling(object):
    """
    N-D max / average / Lp pooling over channels-last input
    ([batch, spatial..., channels]).

    This class implements two main methods:
        dilated_pool:
            calculates a max, average or Lp pool over the input
        dilated_maxpool_with_argmax:
            calculates a maxpool over the input and returns the
            indices/argmax of the selected values

    In addition to the standard features of pooling operations in
    Tensorflow, these methods support dilations, ceil mode, SAME_LOWER and
    explicit padding.

    Dilations are partly supported in Tensorflow in `tf.nn.pool` and
    `tf.nn.dilation2d`. The code will try to use the Tensorflow built-in
    functions as much as possible.

    In cases not supported by Tensorflow there is a custom algorithm of
    dilated pooling: `_remove_dilations`.

    The idea behind `_remove_dilations` is to transform the input N-D data
    into a supported input for the standard tf.nn.pool operation.
    This is achieved by calculating N-D indices for the values which will
    be selected from the input when applying the dilations and
    then extracting the values using tf.gather_nd. Next step is to execute
    `tf.nn.pool` on this new input data with **strides=kernel_shape** and
    no dilations. The resulting pool will be the result we are looking for.

    In case of `dilated_maxpool_with_argmax` an additional step is needed
    to recalculate the resulting indices back into the original
    data indices. It is done with `_calc_orig_argmax`.

    Here is a simple example of how the algorithm works:

        kernel_shape = [3]
        strides = [2]
        dilations = [3]

        Input 1D data:

        x-----x-----x-----x-----x-----x-----x-----x-----x-----x-----x
        |  *  |     | **  |  *  |     | **  |  *  |     | **  |     |
        | 10  |  9  | 30  |  7  |  6  | 15  | 16  | 17  | 18  | 19  |
        x-----x-----x-----x-----x-----x-----x-----x-----x-----x-----x
          (0)   (1)   (2)   (3)   (4)   (5)   (6)   (7)   (8)   (9)

        where * represents the values selected during the first sliding
        window step and ** during the second sliding window step

        the resulting indices will be:

            [0, 3, 6, 2, 5, 8]
             |     |  |     |
              First    Second
              step     step

        after tf.gather_nd operation we get a new input data with
        removed dilations:

            [10, 7, 16, 30, 15, 18]

        and applying tf.nn.maxpool (or avgpool) with
        strides = kernel_shape = 3 will result into:

            [16, 30]

        which is the result of the dilated maxpooling.

    Here is pseudo code of the algorithm with comments:

        FUNCTION _remove_dilations:
            /* Calculate N-D index of the values to be selected by the
               dilations and strides */

            /* Do a loop over the input spatial dimensions starting from
               the last (most internal) going up to the first dimension

               On every step of the loop calculate the input indices and
               "combine" them with the already calculated indices from the
               previous dimensions using cartesian product.
            */
            // Initialize empty gather_nd index
            gather_ind = []
            LOOP with **dimension** from **dimensions_count** to **0**:

                // Calculate the output size for the current dimension
                dim_filter_size = (dim_kernel_size - 1) * dim_dilations + 1
                dim_output_size = (((dim_input_size - dim_filter_size) //
                                    dim_strides) + 1) * dim_kernel_size

                /* For every output index, calculate the corresponding
                   index into the input data */
                dim_input_indices = range(0, dim_output_size)
                dim_input_indices = calculate_input_indices(
                    dim_input_indices)

                /* combine the calculated indices with the previous
                   dimensions */
                gather_ind = cartesian_product(dim_input_indices,
                                               gather_ind)
            END LOOP

            /* For example for 2D input the resulting gather_ind will
               look like this:

               [[y1, x1], [y2, x2], ..., [yn, xm]]

               where:
                   n is the height
                   m is the width and
                   [yi, xi] are the 2D indices in the input data
            */

            new_data = tf.gather_nd(input, gather_ind)

            reshape new_data to the correct output shape

            RETURN new_data

    Before executing _remove_dilations the code will apply paddings to the
    input data if needed. Padding is done using tf.pad with the minimum
    value of the input dtype for max pooling and with 0 otherwise.

    Check `_remove_dilations` code for a more detailed explanation of the
    implementation.

    In case of dilated_maxpool_with_argmax the returned indices from
    tf.nn.max_pool_with_argmax will point into our "no dilations" data.
    That is why they need to be mapped back to the original input data.
    It is done with `_calc_orig_argmax` function which will apply the same
    calculations that are used in _remove_dilations when calculating the
    input data indices from output indices (check `_calc_orig_argmax` for
    detailed inline comments explaining the calculations).
    """

    def __init__(self,
                 input,
                 kernel_shape,
                 strides,
                 dilations,
                 padding="VALID",
                 ceil_mode=False,
                 count_include_pad=False,
                 pooling_type="MAX",
                 p=2):
        """
        Store the pooling configuration and derive shape information.

        Args:
            input:             value accepted by tf.convert_to_tensor, laid
                               out as [batch, spatial..., channels]
            kernel_shape:      kernel size per spatial axis; its length
                               defines the number of spatial dimensions
            strides:           stride per spatial axis
            dilations:         dilation per spatial axis
            padding:           either a list of explicit pads ordered as
                               [begin_0, ..., begin_n, end_0, ..., end_n]
                               or an auto padding string ("VALID",
                               "SAME_UPPER" or "SAME_LOWER")
            ceil_mode:         when True, `_calc_pads` adds extra end
                               padding for non-SAME paddings
            count_include_pad: consulted by `is_supported` and
                               `dilated_pool` for average pooling
            pooling_type:      "MAX", "AVG" or "LP" (case-insensitive;
                               any value starting with "MAX" is treated as
                               max pooling)
            p:                 exponent used by LP pooling
        """
        self.input = tf.convert_to_tensor(input)
        # Normalise to plain lists: the dispatch logic compares these
        # against `[1] * spatial_size`, which never matches a tuple and is
        # ambiguous for a numpy array.
        self.kernel_shape = list(kernel_shape)
        self.strides = list(strides)
        self.dilations = list(dilations)
        self.padding = padding
        self.is_explicit_padding = isinstance(padding, list)
        self.ceil_mode = ceil_mode
        self.count_include_pad = count_include_pad
        self.pooling_type = pooling_type.upper()
        self.p = p

        self.is_known_shape = self.input.shape.is_fully_defined()
        self.spatial_size = len(self.kernel_shape)
        self.input_rank = self.spatial_size + 2

        # if the rank is not defined, set it to the calculated input_rank
        # rank should be known for ops like tf.gather_nd
        if self.input.shape.rank is None:
            self.input.set_shape([None] * self.input_rank)
        self.orig_input_shape = tf_shape(self.input)
        self.input_shape = self.orig_input_shape

        # Use the normalised (upper-cased) pooling type so that "max" and
        # "MAX" pick the same padding constant.
        if self.pooling_type.startswith("MAX"):
            self.padding_constant = self.input.dtype.min
        else:
            self.padding_constant = 0

    def _calc_input_ind(self, output_ind, kernel, dilation, stride):
        """
        This function maps index from the output of _remove_dilations
        to index from the original input along single axis. It calculates
        the index inside the input data from the index of the output.
        It is used to generate the correct indexes of the values to be
        extracted by gather_nd.

        Args:
            output_ind: vector with indices from the output to be mapped
            kernel:     kernel size along the axis
            dilation:   dilations along the axis
            stride:     strides along the axis
        Return:
            input_ind: calculated indices

        The formula is:
            input_ind = (output_ind // kernel) * stride +
                        (output_ind % kernel) * dilation

        The code evaluates the algebraically equivalent form
            (output_ind // kernel) * (stride - kernel * dilation) +
            output_ind * dilation
        which does not need a modulo operation.

        Example:
          If we have following 2D input to _remove_dilations:
                 [[  0,  1,  2,  3],
                  [  4,  5,  6,  7],
                  [  8,  9, 10, 11],
                  [ 12, 13, 14, 15]]
          and Kernel = [2, 2], Dilations: [2, 2], Strides: [1, 1]

          the output of _remove_dilations will have shape [4, 4] and
          _calc_input_ind will be called twice for the two axis 0 (along
          height) and axis 1 (along width) with

              output_ind = [0, 1, 2, 3]

          which will result in:

              input_ind = [0, 2, 1, 3]
        """
        window_ind = output_ind // kernel
        return window_ind * (stride - kernel * dilation) + \
            output_ind * dilation

    def _calc_orig_argmax(self, ind):
        """
        Map result argmax to the original input indices.

        Maps indices generated by maxpool_with_argmax on top of the
        dilation reduced input to the original input indices.

        Reads `self.output_shape` (set by `_remove_dilations`) and
        `self.pads` (set by `_pad_input`), so both must have run before.

        Args:
            ind: flattened indices into the dilation reduced input
        Return:
            flattened indices into the original (unpadded) input
        """
        in_width = self.orig_input_shape[2]
        num_channels = self.orig_input_shape[3]
        output_width = self.output_shape[2]

        # mod_floor op is not implemented on GPU
        # implement it using: a % b = a - (a // b) * b

        # inRow = (ind // num_channels) // output_width
        # inCol = (ind // num_channels) % output_width
        # ind_channel = ind % num_channels

        ind_nhw = ind // num_channels

        in_row = ind_nhw // output_width
        in_col = ind_nhw - in_row * output_width

        ind_channel = ind - ind_nhw * num_channels

        # translate the position inside the dilation reduced data into a
        # position inside the padded input, then remove the begin padding
        row = self._calc_input_ind(in_row, self.kernel_shape[0],
                                   self.dilations[0],
                                   self.strides[0]) - self.pads[0]
        col = self._calc_input_ind(in_col, self.kernel_shape[1],
                                   self.dilations[1],
                                   self.strides[1]) - self.pads[2]

        new_ind = num_channels * (row * in_width + col) + ind_channel
        return new_ind

    def _remove_dilations(self):
        """
        This method removes the dilations by extracting the values from
        the input for every sliding window according to the dilations,
        strides and kernel size and generates output that can be used by
        pooling operations with strides = kernel_shape to accomplish
        dilated pooling.

        As a side effect `self.output_shape` is set to the shape of the
        returned tensor.

        Example:
          Input:     [[  0,  1,  2,  3],
                      [  4,  5,  6,  7],
                      [  8,  9, 10, 11],
                      [ 12, 13, 14, 15]]

          Kernel:    [2, 2]
          Dilations: [2, 2]
          Strides:   [1, 1]

          Will return:
                     [[  0,  2,  1,  3],
                      [  8, 10,  9, 11],
                      [  4,  6,  5,  7],
                      [ 12, 14, 13, 15]]

          After max_pool2d with kernel_shape = strides = [2, 2]
          the result is:
                     [[ 10, 11],
                      [ 14, 15]]
        """
        input_shape = tf_shape(self.input)
        in_spatial_shape = input_shape[1:self.spatial_size + 1]

        channels_count = input_shape[self.spatial_size + 1]
        # Initialize gather_ind with the range of channels
        # e.g. [0 1]
        gather_ind = tf.range(channels_count, dtype=tf.int64)
        # convert the vector to column vector
        # in the following logic we use column vectors
        gather_ind = tf.expand_dims(gather_ind, 1)

        # initialize the output_shape with zeros
        # self.output_shape will contain the shape of the
        # output tensor after the loop below is executed
        self.output_shape = [0] * (self.spatial_size + 2)
        self.output_shape[0] = input_shape[0]

        # Loop over the input spatial dimensions starting from the
        # last (most internal) going up to the first dimension
        #
        # On every step of the loop calculate the output indices and
        # map them to the input indices using `_calc_input_ind`,
        # then "combine" with the already calculated indices from the
        # previous dimensions using cartesian product.
        #
        # For the following example input:
        #
        #   Input:     [[  0,  1,  2,  3],
        #               [  4,  5,  6,  7],
        #               [  8,  9, 10, 11],
        #               [ 12, 13, 14, 15]]
        #
        #   Kernel:    [2, 2]
        #   Dilations: [2, 2]
        #   Strides:   [1, 1]
        #
        # these are the steps that will be executed:
        #
        # 1. Initialize gather_ind = [[0]]     # we have only 1 channel
        #
        # 2. Loop step 0 (axis 1):
        #       filter_size = 3
        #       output_size = 4
        #       dim_ind = [[0]
        #                  [2]
        #                  [1]
        #                  [3]]
        #
        #       gather_ind = [[0 0]
        #                     [2 0]
        #                     [1 0]
        #                     [3 0]]
        #
        # 3. Loop step 1 (axis 0):
        #       filter_size = 3
        #       output_size = 4
        #       dim_ind = [[0]
        #                  [2]
        #                  [1]
        #                  [3]]
        #
        #       gather_ind = [[0 0 0]
        #                     [0 2 0]
        #                     [0 1 0]
        #                     [0 3 0]
        #                     [2 0 0]
        #                     [2 2 0]
        #                     [2 1 0]
        #                     [2 3 0]
        #                     [1 0 0]
        #                     [1 2 0]
        #                     [1 1 0]
        #                     [1 3 0]
        #                     [3 0 0]
        #                     [3 2 0]
        #                     [3 1 0]
        #                     [3 3 0]]
        #
        # These are the indices used for gather_nd operation to collect
        # the values from the input data.
        for dim in range(self.spatial_size - 1, -1, -1):
            filter_size = (self.kernel_shape[dim] - 1) * \
                self.dilations[dim] + 1
            output_size = ((
                (in_spatial_shape[dim] - filter_size) // self.strides[dim]
            ) + 1) * self.kernel_shape[dim]
            self.output_shape[dim + 1] = output_size

            # initialize the output dimension index with the range of the
            # dimension output size (e.g. 4): [0, 1, 2, 3]
            dim_ind = tf.range(output_size)

            # calculate the matching indices in the input data
            # [0, 1, 2, 3] will calculate to [0, 2, 1, 3]
            # from the above example
            dim_ind = self._calc_input_ind(dim_ind, self.kernel_shape[dim],
                                           self.dilations[dim],
                                           self.strides[dim])
            # convert to column vector
            dim_ind = tf.expand_dims(dim_ind, 1)

            # "combine" current dimension indices with the previous
            # dimensions using cartesian product
            gather_ind = tf_product(dim_ind, gather_ind)

        # The result from the above loop for 2D data will be:
        # [[y1, x1, c], [y2, x2, c], ..., [yn, xm, c]] where n is the
        # height, m is the width and c is the channel number.

        # set the channels count in the output_shape
        self.output_shape[self.spatial_size + 1] = channels_count

        # expand the dimensions to match the input dimensions + 1
        for _ in range(self.spatial_size):
            gather_ind = tf.expand_dims(gather_ind, 0)
        # duplicate the indices for every batch
        gather_ind = tf.tile(gather_ind,
                             [input_shape[0]] + [1] * (self.spatial_size + 1))

        # extract the selected values from the input
        output = tf.gather_nd(self.input, gather_ind, batch_dims=1)
        # reshape the output to the correct shape calculated earlier
        output = tf.reshape(output, self.output_shape)

        return output

    def _calc_pads_same(self, in_spatial_shape):
        """
        Calculate SAME_* paddings.

        Delegates to pooling_helper.calc_pads_same, using the numpy based
        ops when the input shape is fully defined and the Tensorflow based
        ops otherwise.
        """
        if self.is_known_shape:
            pad_ops = pooling_helper.pad_numpy_ops
        else:
            pad_ops = pooling_helper.pad_tf_ops

        return pooling_helper.calc_pads_same(in_spatial_shape,
                                             self.kernel_shape,
                                             self.strides, self.dilations,
                                             self.padding, pad_ops, 2)

    def _calc_pads_explicit(self):
        """
        Calculate explicit padding.

        Reorders `self.padding` from
            [begin_0, ..., begin_n, end_0, ..., end_n]
        to
            [begin_0, end_0, ..., begin_n, end_n]
        """
        assert isinstance(self.padding, list)

        pads = []
        for i in range(self.spatial_size):
            pads += [self.padding[i], self.padding[i + self.spatial_size]]
        return pads

    def _calc_pads_ceil_mode(self, in_spatial_shape):
        """
        Calculate padding in ceil_mode.

        For every spatial axis the end padding is one stride when
        (dim_size - filter_size) is not a multiple of the stride and 0
        otherwise. The begin padding is always 0.

        Args:
            in_spatial_shape: spatial sizes (already including any other
                              padding)
        Return:
            list [0, end_0, 0, end_1, ...]
        """
        pads = []
        for i in range(self.spatial_size):
            dim_size = in_spatial_shape[i]
            filter_size = (self.kernel_shape[i] - 1) * self.dilations[i] + 1
            out_size = (dim_size - filter_size) / self.strides[i]
            # ceil - floor is 1 only when out_size has a fractional part
            if self.is_known_shape:
                pad_size = (np.ceil(out_size) -
                            np.floor(out_size)).astype(np.int64)
            else:
                pad_size = tf.cast(
                    tf.math.ceil(out_size) - tf.math.floor(out_size),
                    tf.int64)

            pads += [0, pad_size * self.strides[i]]
        return pads

    def _calc_pads(self, in_spatial_shape):
        """
        Calculate the total padding as [begin_0, end_0, begin_1, end_1, ...]

        Combines explicit or SAME_* padding with the extra ceil_mode
        padding. The result is a numpy array when the input shape is fully
        defined and a tensor otherwise.
        """
        if self.is_known_shape:
            pads = np.zeros([self.spatial_size * 2], np.int64)
        else:
            pads = tf.zeros([self.spatial_size * 2], tf.int64)

        explicit = isinstance(self.padding, list)
        same = (not explicit) and self.padding.lower().startswith("same")

        if explicit:
            pads += self._calc_pads_explicit()
        elif same:
            pads += self._calc_pads_same(in_spatial_shape)

        # when padding is set to SAME, ceil_mode will not do anything
        # because output sizes will be multiple of the strides
        if self.ceil_mode and not same:
            new_spatial_shape = [
                in_spatial_shape[i] + pads[i * 2] + pads[i * 2 + 1]
                for i in range(self.spatial_size)
            ]
            pads += self._calc_pads_ceil_mode(new_spatial_shape)
        return pads

    def _pad_input(self):
        """
        Pad the input according to the parameters.

        Updates `self.input`, `self.input_shape` and `self.pads`.

        Return:
            tuple (self.input, self.pads)
        """
        # check if we need to do any padding at all
        if isinstance(self.padding, list):
            nothing_to_pad = \
                self.padding == [0] * (self.spatial_size * 2)
        else:
            nothing_to_pad = self.padding.upper() == "VALID"

        if not self.ceil_mode and nothing_to_pad:
            self.pads = np.array([0] * self.spatial_size * 2)
            return (self.input, self.pads)

        in_spatial_shape = self.input_shape[1:self.spatial_size + 1]
        pads = self._calc_pads(in_spatial_shape)

        if self.is_known_shape and np.count_nonzero(pads) == 0:
            self.pads = pads
            return (self.input, pads)

        # no padding on the batch (first) and channel (last) axes
        tf_paddings = [[0, 0]]
        for i in range(self.spatial_size):
            tf_paddings += [[pads[i * 2], pads[i * 2 + 1]]]
        tf_paddings += [[0, 0]]

        self.input = tf.pad(
            self.input,
            tf_paddings,
            mode='CONSTANT',
            constant_values=self.padding_constant)
        # update input shape and pads values
        self.input_shape = tf_shape(self.input)
        self.pads = pads
        return (self.input, self.pads)

    def _calc_argmax_without_padding(self, ind):
        """
        Calculate the original indices as they would be without padding.

        Args:
            ind: flattened indices into the padded input
        Return:
            flattened indices into the original (unpadded) input
        """
        in_width = self.orig_input_shape[2]
        padded_width = self.input_shape[2]
        num_channels = self.input_shape[3]

        # mod_floor op is not implemented on GPU
        # implement it using: a % b = a - (a // b) * b

        # ind_nhw = ind // num_channels
        # ind_channel = ind % num_channels

        ind_nhw = ind // num_channels
        ind_channel = ind - ind_nhw * num_channels

        # every padded row in front of the element contributes
        # (pads[2] + pads[3]) padding columns that have to be removed
        row_padding = (ind_nhw // padded_width) * \
            (self.pads[2] + self.pads[3])
        new_ind = ind_nhw - row_padding - \
            self.pads[0] * in_width - self.pads[2]
        new_ind = num_channels * new_ind + ind_channel
        return new_ind

    def dilated_maxpool_with_argmax(self, force_custom_impl=False):
        """
        Do a dilated maxpool and return indices/argmax.

        Args:
            force_custom_impl: use the `_remove_dilations` based
                               implementation even when dilations are 1
        Return:
            tuple (pooled, indices) where indices refer to the original
            (unpadded) input
        """
        # Tensorflow does not support maxpool_with_argmax on
        # spatial_size != 2
        assert self.spatial_size == 2

        if list(self.dilations) != [1] * self.spatial_size or \
                force_custom_impl:
            # pad the input
            self._pad_input()

            new_input = self._remove_dilations()
            kernel_shape = [1] + list(self.kernel_shape) + [1]
            pooled, new_ind = tf.nn.max_pool_with_argmax(
                new_input, ksize=kernel_shape, strides=kernel_shape,
                padding="VALID")
            new_ind = self._calc_orig_argmax(new_ind)
        else:
            self.pads = np.array([0] * self.spatial_size * 2)
            if isinstance(self.padding, list) or \
                    self.padding.lower() == "same_lower":
                # pad the input
                self._pad_input()

                padding_ = "VALID"
            elif self.padding.lower() == "same_upper":
                padding_ = "SAME"
            else:
                padding_ = self.padding

            strides = [1] + list(self.strides) + [1]
            kernel_shape = [1] + list(self.kernel_shape) + [1]
            pooled, new_ind = tf.nn.max_pool_with_argmax(
                self.input, ksize=kernel_shape, strides=strides,
                padding=padding_)
            # if there was padding, recalculate the returned index
            # to exclude the padding
            if np.count_nonzero(self.pads) != 0:
                new_ind = self._calc_argmax_without_padding(new_ind)

        return (pooled, new_ind)

    def _lp_pool(self, input, ksize, strides, padding):
        """
        Lp pooling built on top of average pooling.

        The average of |x|^p over a window multiplied by the window size
        is the sum of |x|^p, whose p-th root is the Lp norm of the window.
        Note that this op takes no dilations.
        """
        window_size = np.prod(ksize)
        powered = tf.math.pow(tf.math.abs(input), self.p) * window_size
        pooled = tf.nn.avg_pool_v2(powered, ksize=ksize, strides=strides,
                                   padding=padding)
        pooled = tf.math.pow(pooled, 1.0 / self.p)
        return pooled

    def dilated_pool(self, force_custom_impl=False):
        """
        Does N-D dilated max/avg/Lp pooling.

        The input is padded with `_pad_input` (which also applies
        ceil_mode) when explicit or SAME_LOWER padding is provided, when
        SAME_UPPER is combined with count_include_pad, or for LP pooling.

        Args:
            force_custom_impl: always use the `_remove_dilations` based
                               implementation
        Return:
            the pooled tensor
        """
        assert self.is_supported()

        unit = [1] * self.spatial_size
        unit_strides = list(self.strides) == unit
        unit_dilations = list(self.dilations) == unit
        is_max = self.pooling_type.startswith("MAX")
        is_lp = self.pooling_type == "LP"

        explicit = isinstance(self.padding, list)
        auto_pad = None if explicit else self.padding.lower()

        if explicit or auto_pad == "same_lower" \
                or (auto_pad == "same_upper" and self.count_include_pad) \
                or is_lp:
            # pad the input
            self._pad_input()

            padding_ = "VALID"
        elif auto_pad == "same_upper":
            padding_ = "SAME"
        else:
            padding_ = self.padding

        # if maxpool op with dilations != 1 and spatial_size == 2
        # we can use tf.nn.dilation2d directly
        if self.spatial_size == 2 and is_max and not unit_dilations \
                and not force_custom_impl:
            strides = [1] + list(self.strides) + [1]
            dilations = [1] + list(self.dilations) + [1]

            # an all-zero filter turns the morphological dilation into a
            # plain max over the dilated window
            zero_filter = tf.zeros(
                [self.kernel_shape[0], self.kernel_shape[1],
                 self.input_shape[3]],
                self.input.dtype)
            pooled = tf.nn.dilation2d(
                input=self.input,
                filters=zero_filter,
                strides=strides,
                dilations=dilations,
                padding=padding_)
        # if spatial_size < 4 and strides == 1 or dilation == 1 use the
        # Tensorflow pooling ops. LP pooling is only allowed here with
        # dilation == 1 because `_lp_pool` takes no dilations; LP pooling
        # with dilations falls through to the custom implementation.
        elif self.spatial_size < 4 and not force_custom_impl and \
                (unit_dilations or (unit_strides and not is_lp)):
            # if strides == 1 and not LpPool use tf.nn.pool directly
            if unit_strides and not is_lp:
                pooled = tf.nn.pool(
                    self.input,
                    window_shape=self.kernel_shape,
                    dilations=self.dilations,
                    strides=self.strides,
                    padding=padding_,
                    pooling_type=self.pooling_type)
            else:
                # otherwise (dilations == 1) check the pooling_type and
                # use the correct op
                if is_max:
                    op = tf.nn.max_pool_v2
                elif self.pooling_type == "AVG":
                    op = tf.nn.avg_pool_v2
                elif is_lp:
                    op = self._lp_pool
                else:
                    raise ValueError("%d-D %s pooling is not supported." %
                                     (self.spatial_size, self.pooling_type))
                pooled = op(self.input, ksize=self.kernel_shape,
                            strides=self.strides, padding=padding_)
        # in any other case we use custom implementation _remove_dilations
        # to reduce atrous/dilated pooling into regular pooling and
        # selecting only the values of the input that should have been
        # selected by applying the strides and dilations. Then use
        # tf.nn.pool with strides = kernel_shape and no dilations
        else:
            if padding_ == "SAME":
                # pad the input
                self._pad_input()
            input_ = self._remove_dilations()
            if is_lp:
                pooled = self._lp_pool(
                    input_,
                    ksize=self.kernel_shape,
                    strides=self.kernel_shape,
                    padding="VALID")
            else:
                pooled = tf.nn.pool(
                    input_,
                    window_shape=self.kernel_shape,
                    strides=self.kernel_shape,
                    padding="VALID",
                    pooling_type=self.pooling_type)
        return pooled

    def is_supported(self):
        """
        Function to check if the current set of arguments are
        supported for average pool.

        Max and LP pooling are always reported as supported.
        """
        # check for maxpool / lppool
        if self.pooling_type.startswith("MAX") or self.pooling_type == "LP":
            return True
        # if count_include_pad is true it is fully supported
        if self.count_include_pad:
            return True
        # ceil mode is not supported
        if self.ceil_mode:
            return False
        # explicit padding with padding values set to 0 is supported
        if isinstance(self.padding, list):
            return self.padding == [0] * (self.spatial_size * 2)
        # "valid" and "same_upper" auto padding is supported,
        # any other case is not supported
        return self.padding.lower() in ["valid", "same_upper"]