add pt2tf tool

2020-09-23 09:09:49 +08:00
parent 7f7b7df65d
commit 18aefa4dd0
407 changed files with 16211 additions and 0 deletions
@@ -0,0 +1,702 @@
+from __future__ import division
+
+import tensorflow as tf
+import numpy as np
+
+from onnx_tf.common import pooling_helper
+from onnx_tf.common.tf_helper import tf_shape
+from onnx_tf.common.tf_helper import tf_product
+
+
+class DilatedPooling(object):
+  """
+        This class implements two main methods:
+            dilated_pool:
+                calculates a max or average pool over the input
+
+            dilated_maxpool_with_argmax:
+                calculates a maxpool over the input and returns the
+                indices/argmax of the selected values
+
+        In addition to the standard features of pooling operations in
+        Tensorflow, these methods support dilations, ceil mode, SAME_LOWER and
+        explicit padding.
+
+        Dilations are partly supported in Tensorflow in `tf.nn.pool` and
+        `tf.nn.dilation2d`. The code will try to use the Tensorflow built-in
+        functions as much as possible.
+
+        In cases not supported by Tensorflow there is a custom algorithm for
+        dilated pooling, `_remove_dilations`.
+
+        The idea behind `_remove_dilations` is to transform the input N-D data
+        into a supported input for the standard tf.nn.pool operation.
+        This is achieved by calculating N-D indices for the values which will
+        be selected from the input when applying the dilations and
+        then extracting the values using tf.gather_nd. Next step is to execute
+        `tf.nn.pool` on this new input data with **strides=kernel_shape** and
+        no dilations. The resulting pool will be the result we are looking for.
+
+        In case of `dilated_maxpool_with_argmax` an additional step is needed
+        to recalculate the resulting indices back into the original
+        data indices. It is done with `_calc_orig_argmax`.
+
+        Here is a simple example of how the algorithm works:
+
+        kernel_shape = [3]
+        strides = [2]
+        dilations = [3]
+
+        Input 1D data:
+
+            x-----x-----x-----x-----x-----x-----x-----x-----x-----x-----x
+            |  *  |     | **  |  *  |     | **  |  *  |     | **  |     |
+            | 10  |  9  | 30  |  7  |  6  | 15  | 16  | 17  | 18  | 19  |
+            x-----x-----x-----x-----x-----x-----x-----x-----x-----x-----x
+              (0)   (1)   (2)   (3)   (4)   (5)   (6)   (7)   (8)   (9)
+
+        where * represents the values selected during the first sliding window
+        step and ** during the second sliding window step
+
+        the resulting indices will be:
+
+            [0, 3, 6, 2, 5, 8]
+             |     |  |     |
+              First    Second
+              step     step
+
+        after tf.gather_nd operation we get a new input data with
+        removed dilations:
+
+            [10, 7, 16, 30, 15, 18]
+
+        and applying tf.nn.maxpool (or avgpool) with strides = kernel_shape = 3
+        will result in:
+
+            [16, 30]
+
+        which is the result of the dilated maxpooling.
+
+        Here is pseudo code of the algorithm with comments:
+
+        FUNCTION _remove_dilations:
+            /* Calculate N-D index of the values to be selected by the
+               dilations and strides */
+
+            /* Do a loop over the input spatial dimensions starting from the
+               last (most internal) going up to the first dimension
+
+               On every step of the loop calculate the input indices and
+               "combine" them with the already calculated indices from the
+               previous dimensions using cartesian product.
+            */
+            LOOP with **dimension** from **dimensions_count** to **0**:
+
+                // Initialize empty gather_nd index
+                gather_ind = []
+
+                // Calculate the output size for the current dimension
+                dim_filter_size = (dim_kernel_size - 1) * dim_dilations + 1
+                dim_output_size = (((dim_input_size - dim_filter_size) //
+                                   dim_strides) + 1) * dim_kernel_size
+
+                /* For every output index, calculate the corresponding index
+                   into the input data */
+                dim_input_indices = range(0, dim_output_size)
+                dim_input_indices = calculate_input_indices(dim_input_indices)
+
+                /* combine the calculated indices with the previous dimensions
+                */
+                gather_ind = cartesian_product(dim_input_indices, gather_ind)
+            END LOOP
+
+            /* For example for 2D input the resulting gather_ind will
+               look like this:
+
+               [[y1, x1], [y2, x2], ..., [yn, xm]]
+
+               where:
+               n is the height
+               m is the width and
+               [yi, xi] are the 2D indices in the input data
+            */
+
+            new_data = tf.gather_nd(input, gather_ind)
+
+            reshape new_data to the correct output shape
+
+            RETURN new_data
+
+
+        Before executing _remove_dilations the code will apply paddings to the
+        input data if needed. Padding is done using tf.pad with the minimum
+        value of the input dtype for max pooling and with 0 otherwise.
+        Check `_remove_dilations` code for a more detailed explanation of the
+        implementation.
+
+        In case of dilated_maxpool_with_argmax the returned indices from
+        tf.nn.max_pool_with_argmax will point into our "no dilations" data.
+        That is why they need to be mapped back to the original input data.
+        It is done with `_calc_orig_argmax` function which will apply the same
+        calculations, that are used in _remove_dilations when calculating the
+        input data indices from output indices (check `_calc_orig_argmax` for
+        detailed inline comments explaining the calculations)
+
+    """
+
+  def __init__(self,
+               input,
+               kernel_shape,
+               strides,
+               dilations,
+               padding="VALID",
+               ceil_mode=False,
+               count_include_pad=False,
+               pooling_type="MAX",
+               p=2):
+    self.input = tf.convert_to_tensor(input)
+
+    self.kernel_shape = kernel_shape
+    self.strides = strides
+    self.dilations = dilations
+    self.padding = padding
+    self.is_explicit_padding = type(padding) is list
+    self.ceil_mode = ceil_mode
+    self.count_include_pad = count_include_pad
+    self.pooling_type = pooling_type.upper()
+    self.p = p
+
+    self.is_known_shape = self.input.shape.is_fully_defined()
+    self.spatial_size = len(kernel_shape)
+    self.input_rank = self.spatial_size + 2
+
+    # if the rank is not defined, set it to the calculated input_rank
+    # rank should be known for ops like tf.gather_nd
+    if not input.shape.rank:
+      input.set_shape([None] * self.input_rank)
+    self.orig_input_shape = tf_shape(input)
+    self.input_shape = self.orig_input_shape
+
+    if pooling_type.startswith("MAX"):
+      self.padding_constant = input.dtype.min
+    else:
+      self.padding_constant = 0
+
+  def _calc_input_ind(self, output_ind, kernel, dilation, stride):
+    """
+            This function maps index from the output of _remove_dilations
+            to index from the original input along single axis. It calculates
+            the index inside the input data from the index of the output.
+            It is used to generate the correct indexes of the values to be
+            extracted by gather_nd.
+
+            Args:
+                output_ind: vector with indices from the output to be mapped
+                kernel:     kernel size along the axis
+                dilation:   dilations along the axis
+                stride:     strides along the axis
+            Return:
+                input_ind: calculated indices
+
+            The formula is:
+                input_ind = (output_ind // kernel) * stride +
+                            (output_ind % kernel) * dilation
+
+            Example:
+              If we have following 2D input to _remove_dilations:
+                         [[  0,  1,  2,  3],
+                          [  4,  5,  6,  7],
+                          [  8,  9, 10, 11],
+                          [ 12, 13, 14, 15]]
+              and Kernel = [2, 2], Dilations: [2, 2], Strides: [1, 1]
+
+              the output of _remove_dilations will have shape [4, 4] and
+              _calc_input_ind will be called twice for the two axis 0 (along
+              height) and axis 1 (along width) with
+
+                  output_ind = [0, 1, 2, 3]
+
+              which will result in:
+
+                  input_ind = [0, 2, 1, 3]
+        """
+    return (output_ind // kernel) * (stride - kernel * dilation) + \
+        output_ind * dilation
+
+  def _calc_orig_argmax(self, ind):
+    """
+            Map result argxmax to the original input indices
+
+            Maps indices generated by maxpool_with_argmax on top of the
+            dilation reduced input to the orignal input indices
+        """
+
+    in_width = self.orig_input_shape[2]
+    num_channels = self.orig_input_shape[3]
+    output_width = self.output_shape[2]
+
+    # mod_floor op is not implemented on GPU
+    # implement it using: a % b = a - (a // b) * b
+
+    # inRow = (ind // num_channels) // output_width
+    # inCol = (ind // num_channels) % output_width
+    # ind_channel = ind % num_channels
+
+    ind_nhw = ind // num_channels
+
+    inRow = ind_nhw // output_width
+    inCol = ind_nhw - (ind_nhw // output_width) * output_width
+
+    ind_channel = ind - ind_nhw * num_channels
+
+    row = self._calc_input_ind(inRow, self.kernel_shape[0], self.dilations[0],
+                               self.strides[0]) - self.pads[0]
+    col = self._calc_input_ind(inCol, self.kernel_shape[1], self.dilations[1],
+                               self.strides[1]) - self.pads[2]
+
+    new_ind = num_channels * (row * in_width + col) + ind_channel
+    return new_ind
+
+  def _remove_dilations(self):
+    """
+            This method removes the dilations by extracting the values from
+            self.input for every sliding window according to the dilations,
+            strides and kernel size and generates output that can be used by
+            pooling operations with strides = kernel_shape to accomplish
+            dilated pooling.
+
+            The method reads self.input as it is; callers in this class pad
+            it with `_pad_input` beforehand when padding is required.
+            Axis 0 is treated as the batch, the following `spatial_size`
+            axes as spatial axes and the last axis as the channels.
+
+            Side effect:
+                self.output_shape is set to the shape of the returned tensor
+
+            Return:
+                tensor with the gathered values, reshaped to
+                self.output_shape
+
+            Example:
+              Input:     [[  0,  1,  2,  3],
+                          [  4,  5,  6,  7],
+                          [  8,  9, 10, 11],
+                          [ 12, 13, 14, 15]]
+
+              Kernel:    [2, 2]
+              Dilations: [2, 2]
+              Strides:   [1, 1]
+
+              Will return:
+                         [[  0,  2,  1,  3],
+                          [  8, 10,  9, 11],
+                          [  4,  6,  5,  7],
+                          [ 12, 14, 13, 15]]
+
+              After max_pool2d with kernel_shape = strides = [2, 2]
+              the result is:
+                         [[ 10, 11],
+                          [ 14, 15]]
+        """
+
+    input_shape = tf_shape(self.input)
+    in_spatial_shape = input_shape[1:self.spatial_size + 1]
+
+    channels_count = input_shape[self.spatial_size + 1]
+    # Initialize gather_ind with the range of channels
+    # e.g. [0 1]
+    gather_ind = tf.range(channels_count, dtype=tf.int64)
+    # convert the vector to column vector
+    # in the following logic we use column vectors
+    gather_ind = tf.expand_dims(gather_ind, 1)
+
+    # initialize the output_shape with zeros
+    # self.output_shape will contain the shape of the
+    # output tensor after the loop below is executed
+    self.output_shape = [0] * (self.spatial_size + 2)
+    self.output_shape[0] = input_shape[0]
+    """
+            Loop over the input spatial dimensions starting from the
+            last (most internal) going up to the first dimension
+
+            On every step of the loop calculate the output indices and
+            map them to the input indices using `_calc_input_ind`,
+            then "combine" with the already calculated indices from the
+            previous dimensions using cartesian product.
+
+            For the following example input:
+
+              Input:     [[  0,  1,  2,  3],
+                          [  4,  5,  6,  7],
+                          [  8,  9, 10, 11],
+                          [ 12, 13, 14, 15]]
+
+              Kernel:    [2, 2]
+              Dilations: [2, 2]
+              Strides:   [1, 1]
+
+            these are the steps that will be executed:
+
+            1. Initilize gather_ind = [[0]]     # we have only 1 channel
+
+            2. Loop step 0 (axis 1):
+                  filter_size = 3
+                  output_size = 4
+                  dim_ind = [[0]
+                             [2]
+                             [1]
+                             [3]]
+
+                  gather_ind = [[0 0]
+                                [2 0]
+                                [1 0]
+                                [3 0]]
+
+            3. Loop step 1 (axis 0):
+                  filter_size = 3
+                  output_size = 4
+                  dim_ind = [[0]
+                             [2]
+                             [1]
+                             [3]]
+
+                  gather_ind = [[0 0 0]
+                                [0 2 0]
+                                [0 1 0]
+                                [0 3 0]
+                                [2 0 0]
+                                [2 2 0]
+                                [2 1 0]
+                                [2 3 0]
+                                [1 0 0]
+                                [1 2 0]
+                                [1 1 0]
+                                [1 3 0]
+                                [3 0 0]
+                                [3 2 0]
+                                [3 1 0]
+                                [3 3 0]]
+
+            These are the indices used for gather_nd operation to collect
+            the values from the input data.
+        """
+
+    for dim in range(self.spatial_size - 1, -1, -1):
+      # extent of one dilated kernel along this axis
+      filter_size = (self.kernel_shape[dim] - 1) * \
+                     self.dilations[dim] + 1
+      # number of sliding window positions times the kernel size: every
+      # window contributes kernel_shape[dim] gathered values
+      output_size = ((
+          (in_spatial_shape[dim] - filter_size) // self.strides[dim]) + 1
+                    ) * self.kernel_shape[dim]
+      self.output_shape[dim + 1] = output_size
+
+      # initialize the output dimension index with the range of the
+      # dimension output size (e.g. 4): [0, 1, 2, 3]
+      dim_ind = tf.range(output_size)
+
+      # calculate the matching indices in the input data
+      # [0, 1, 2, 3] will calculate to [0, 2, 1, 3]
+      # from the above example
+      dim_ind = self._calc_input_ind(dim_ind, self.kernel_shape[dim],
+                                     self.dilations[dim], self.strides[dim])
+      # convert to column vector
+      dim_ind = tf.expand_dims(dim_ind, 1)
+
+      # "combine" current dimension indices with the previous dimensions
+      # using cartesian product
+      gather_ind = tf_product(dim_ind, gather_ind)
+
+    # The result from the above loop for 2D data will be:
+    # [[y1, x1, c], [y2, x2, c], ..., [yn, xm, c]] where n is the height,
+    # m is the width and c is the channel number.
+
+    # set the channels count in the output_shape
+    self.output_shape[self.spatial_size + 1] = channels_count
+
+    # expand the dimensions to match the input dimensions + 1
+    for x in range(self.spatial_size):
+      gather_ind = tf.expand_dims(gather_ind, 0)
+    # duplicate the indices for every batch
+    gather_ind = tf.tile(gather_ind,
+                         [input_shape[0]] + [1] * (self.spatial_size + 1))
+
+    # extract the selected values from the input
+    # batch_dims=1: the same indices are applied to every batch entry
+    output = tf.gather_nd(self.input, gather_ind, batch_dims=1)
+    # reshape the output to the correct shape calculated earlier
+    output = tf.reshape(output, self.output_shape)
+
+    return output
+
+  def _calc_pads_same(self, in_spatial_shape):
+    """
+            Calculate SAME_* paddings.
+        """
+
+    pad_ops = pooling_helper.pad_numpy_ops if self.is_known_shape else \
+        pooling_helper.pad_tf_ops
+
+    return pooling_helper.calc_pads_same(in_spatial_shape, self.kernel_shape,
+                                         self.strides, self.dilations,
+                                         self.padding, pad_ops, 2)
+
+  def _calc_pads_explicit(self):
+    """
+            Calculate explicit padding
+        """
+    assert type(self.padding) is list
+
+    pads = []
+    for i in range(self.spatial_size):
+      pads += [self.padding[i], self.padding[i + self.spatial_size]]
+    return pads
+
+  def _calc_pads_ceil_mode(self, in_spatial_shape):
+    """
+            Calculate padding in ceil_mode
+        """
+
+    pads = []
+    for i in range(self.spatial_size):
+      dim_size = in_spatial_shape[i]
+      filter_size = (self.kernel_shape[i] - 1) * self.dilations[i] + 1
+      out_size = (dim_size - filter_size) / self.strides[i]
+      if self.is_known_shape:
+        pad_size = (np.ceil(out_size) - np.floor(out_size)).astype(np.int64)
+      else:
+        pad_size = tf.cast(
+            tf.math.ceil(out_size) - tf.math.floor(out_size), tf.int64)
+
+      pads += [0, pad_size * self.strides[i]]
+    return pads
+
+  def _calc_pads(self, in_spatial_shape):
+    if self.is_known_shape:
+      pads = np.zeros([self.spatial_size * 2], np.int64)
+    else:
+      pads = tf.zeros([self.spatial_size * 2], tf.int64)
+
+    # check for explicit padding
+    if type(self.padding) is list:
+      pads += self._calc_pads_explicit()
+    elif self.padding.lower().startswith("same"):
+      pads += self._calc_pads_same(in_spatial_shape)
+
+    # when padding is set to SAME, ceil_mode will not do anything
+    # because output sizes will be multiple of the strides
+    if self.ceil_mode and (type(self.padding) is list or
+                           not self.padding.lower().startswith("same")):
+      new_spatial_shape = [
+          in_spatial_shape[i] + pads[i * 2] + pads[i * 2 + 1]
+          for i in range(self.spatial_size)
+      ]
+      pads += self._calc_pads_ceil_mode(new_spatial_shape)
+    return pads
+
+  def _pad_input(self):
+    """
+            Pad the input according to the parameters
+        """
+    # check if we need to do any padding at all
+    if not self.ceil_mode and ((type(self.padding) is list and
+                                self.padding == [0] * self.spatial_size * 2) or
+                               self.padding == "VALID"):
+      self.pads = np.array([0] * self.spatial_size * 2)
+      return (self.input, self.pads)
+
+    in_spatial_shape = self.input_shape[1:self.spatial_size + 1]
+    pads = self._calc_pads(in_spatial_shape)
+
+    if self.is_known_shape and np.count_nonzero(pads) == 0:
+      self.pads = pads
+      return (self.input, pads)
+
+    tf_paddings = [[0, 0]]
+    for i in range(self.spatial_size):
+      tf_paddings += [[pads[i * 2], pads[i * 2 + 1]]]
+    tf_paddings += [[0, 0]]
+
+    self.input = tf.pad(
+        self.input,
+        tf_paddings,
+        mode='CONSTANT',
+        constant_values=self.padding_constant)
+    # update input shape and pads values
+    self.input_shape = tf_shape(self.input)
+    self.pads = pads
+
+  def _calc_argmax_without_padding(self, ind):
+    """
+            Calculate the original indices as they would be without padding
+        """
+    in_width = self.orig_input_shape[2]
+    padded_width = self.input_shape[2]
+    num_channels = self.input_shape[3]
+
+    # mod_floor op is not implemented on GPU
+    # implement it using: a % b = a - (a // b) * b
+
+    # ind_nhw = ind // num_channels
+    # ind_channel = ind % num_channels
+
+    ind_nhw = ind // num_channels
+    ind_channel = ind - ind_nhw * num_channels
+
+    new_ind = (ind_nhw // padded_width) * (self.pads[2] + self.pads[3])
+    new_ind = ind_nhw - new_ind - self.pads[0] * in_width - self.pads[2]
+    new_ind = num_channels * new_ind + ind_channel
+    return new_ind
+
+  def dilated_maxpool_with_argmax(self, force_custom_impl=False):
+    """
+            Do a dilated maxpool and return indices/argmax
+        """
+    # Tensorflow does not support maxpool_with_argmax on
+    # spatial_size != 2
+    assert self.spatial_size == 2
+
+    if list(self.dilations) != [1] * self.spatial_size or \
+       force_custom_impl:
+      # pad the input
+      self._pad_input()
+
+      new_input = self._remove_dilations()
+      kernel_shape = [1] + list(self.kernel_shape) + [1]
+      pooled, new_ind = tf.nn.max_pool_with_argmax(
+          new_input, ksize=kernel_shape, strides=kernel_shape, padding="VALID")
+      new_ind = self._calc_orig_argmax(new_ind)
+    else:
+      self.pads = np.array([0] * self.spatial_size * 2)
+      if type(self.padding) is list or \
+        self.padding.lower() == "same_lower":
+        # pad the input
+        self._pad_input()
+
+        padding_ = "VALID"
+      elif self.padding.lower() == "same_upper":
+        padding_ = "SAME"
+      else:
+        padding_ = self.padding
+
+      strides = [1] + list(self.strides) + [1]
+      kernel_shape = [1] + list(self.kernel_shape) + [1]
+      pooled, new_ind = tf.nn.max_pool_with_argmax(
+          self.input, ksize=kernel_shape, strides=strides, padding=padding_)
+      # if there was padding, recalculate the returned index
+      # to exclude the padding
+      if np.count_nonzero(self.pads) != 0:
+        new_ind = self._calc_argmax_without_padding(new_ind)
+
+    return (pooled, new_ind)
+
+  def _lp_pool(self, input, ksize, strides, padding):
+    window_size = np.prod(ksize)
+
+    input = tf.math.pow(tf.math.abs(input), self.p) * window_size
+    pooled = tf.nn.avg_pool_v2(input, ksize=ksize, strides=strides,
+                               padding=padding)
+    pooled = tf.math.pow(pooled, 1.0 / self.p)
+
+    return pooled
+
+  def dilated_pool(self, force_custom_impl=False):
+    """
+            Does N-D dilated max/avg pooling. Pads the input if explicit or
+            SAME_* padding is provided or ceil_mode is True
+        """
+
+    assert self.is_supported()
+
+    if self.is_explicit_padding or self.padding.lower() == "same_lower" \
+            or (self.padding.lower() == "same_upper" and
+                self.count_include_pad) or self.pooling_type.upper() == "LP":
+      # pad the input
+      self._pad_input()
+
+      padding_ = "VALID"
+    elif self.padding.lower() == "same_upper":
+      padding_ = "SAME"
+    else:
+      padding_ = self.padding
+
+    # if maxpool op with dialtions != 1 and spatial_size == 2
+    # we can use tf.nn.dilation2d directly
+    if self.spatial_size == 2 and self.pooling_type.startswith("MAX") \
+            and self.dilations != [1] * self.spatial_size and \
+            not force_custom_impl:
+      strides = [1] + list(self.strides) + [1]
+      dilations = [1] + list(self.dilations) + [1]
+
+      filter = tf.zeros(
+          [self.kernel_shape[0], self.kernel_shape[1], self.input_shape[3]],
+          self.input.dtype)
+      pooled = tf.nn.dilation2d(
+          input=self.input,
+          filters=filter,
+          strides=strides,
+          dilations=dilations,
+          padding=padding_)
+    # if spatial_size < 4 and strides == 1 or dilation == 1 use tf.nn.pool
+    elif self.spatial_size < 4 and (self.strides == [1] * self.spatial_size or
+            self.dilations == [1] * self.spatial_size) and \
+            not force_custom_impl:
+      # if strides == 1 and not LpPool use tf.nn.pool directly
+      if self.strides == [1] * self.spatial_size and self.pooling_type != "LP":
+        pooled = tf.nn.pool(
+            self.input,
+            window_shape=self.kernel_shape,
+            dilations=self.dilations,
+            strides=self.strides,
+            padding=padding_,
+            pooling_type=self.pooling_type)
+      else:
+        # othwerwise check the pooling_type and use the correct op
+        if self.pooling_type.startswith("MAX"):
+          op = tf.nn.max_pool_v2
+        elif self.pooling_type == "AVG":
+          op = tf.nn.avg_pool_v2
+        elif self.pooling_type == "LP":
+          op = self._lp_pool
+        else:
+          raise ValueError("%d-D %s pooling is not supported." %
+                           (self.spatial_size, self.pooling_type))
+        pooled = op(self.input, ksize=self.kernel_shape, strides=self.strides,
+                    padding=padding_)
+    # in any other case we use custom implementation _remove_dilations
+    # to reduce atrous/dilated pooling into regular pooling and selecting
+    # only the values of the input that should have been selected by
+    # applying the strides and dilations. Then use tf.nn.pool with
+    # strides = kernel_shape and no dilations
+    else:
+      if padding_ == "SAME":
+        # pad the input
+        self._pad_input()
+      input_ = self._remove_dilations()
+      if self.pooling_type=="LP":
+        pooled = self._lp_pool(
+            input_,
+            ksize=self.kernel_shape,
+            strides=self.kernel_shape,
+            padding="VALID")
+
+      else:
+        pooled = tf.nn.pool(
+            input_,
+            window_shape=self.kernel_shape,
+            strides=self.kernel_shape,
+            padding="VALID",
+            pooling_type=self.pooling_type)
+    return pooled
+
+  def is_supported(self):
+    """
+            Function to check if the current set of arguments are
+            supported for average pool
+        """
+    # check for maxpool
+    if self.pooling_type.startswith("MAX") or \
+       self.pooling_type=="LP":
+      return True
+    else:
+      # if count_include_pad is true it is fully supported
+      if self.count_include_pad:
+        return True
+      # ceil mode is not supported
+      elif self.ceil_mode:
+        return False
+      # explicit padding with padding values set to 0 is supported
+      elif (self.is_explicit_padding and
+            self.padding == [0] * self.spatial_size * 2):
+        return True
+      # "valid" and "same_upper" auto padding is supported
+      elif (not self.is_explicit_padding and
+            self.padding.lower() in ["valid", "same_upper"]):
+        return True
+      # any other case is not supported
+      else:
+        return False