LCOV - code coverage report
Current view: top level - nntrainer/layers - conv2d_transpose_layer.cpp (source / functions)
Test: coverage_filtered.info
Test Date: 2025-12-14 20:38:17
Coverage: Lines: 0.0 % (0 of 228 hit) | Functions: 0.0 % (0 of 14 hit)

Source code:
// SPDX-License-Identifier: Apache-2.0
/**
 * Copyright (C) 2024 UGyeong Song <thddnrud@snu.ac.kr>
 *
 * @file   conv2d_transpose_layer.cpp
 * @date   13 October 2024
 * @see    https://github.com/nnstreamer/nntrainer
 * @author UGyeong Song <thddnrud@snu.ac.kr>
 * @bug    No known bugs except for NYI items
 * @brief  This is the Transposed Convolution Layer class for neural networks
 *
 */
#include <algorithm>
#include <cstring>
#include <limits>
#include <string>

#include <conv2d_transpose_layer.h>
#include <cpu_backend.h>
#include <layer_context.h>
#include <lazy_tensor.h>
#include <nntr_threads.h>
#include <nntrainer_error.h>
#include <nntrainer_log.h>
#include <node_exporter.h>
#include <profiler.h>
#include <tensor_dim.h>
#include <thread>
#include <util_func.h>

namespace nntrainer {

static constexpr size_t SINGLE_INOUT_IDX = 0;

namespace {

static TensorDim calcCol2ImOutputDim(const TensorDim &out,
                                     const TensorDim &kdim) {

  return TensorDim({kdim.getFeatureLen(), out.width() * out.height()});
} // [in_channel * kernel_h * kernel_w, out_h * out_w]
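
// Illustrative example (not in the original source): for an output of
// 1x4x9x9 and a kernel dim of 4x2x3x3, getFeatureLen() = 2 * 3 * 3 = 18 and
// width() * height() = 81, so the returned dim is [18, 81].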

/**
 * @brief     reconstruct image data from a 2D column matrix
 *
 * @param[in] col_matrix input column matrix
 * @param[in] kdim kernel dimension that defines the number of rows
 * @param[in] padding padding information
 * @param[in] mstride stride value : x, y direction
 * @param[in] dilation kernel dilation factor : x, y each
 * @param[out] image image tensor to put the result in
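 *
 * Illustrative example (an assumption for clarity, not in the original
 * source): for a 4x4 image, a 3x3 kernel, stride 2, dilation 1 and no
 * padding, the column matrix is [C * 3 * 3, 9 * 9] and each of its entries
 * is scattered back onto the 4x4 image position it originated from.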
 */
static void col2im_transpose(
  const Tensor &col_matrix, const TensorDim &kdim,
  const std::array<unsigned, 4> &padding,
  const std::array<props::Stride, CONV2D_TRANSPOSE_DIM> &mstride,
  const std::array<props::Dilation, CONV2D_TRANSPOSE_DIM> &dilation,
  Tensor &image) {
  auto [pt, pb, pl, pr] = padding;

  int in_height = image.height();
  int in_width = image.width();

  unsigned int k_height = kdim.height();
  unsigned int k_width = kdim.width();

  /// effective kernel height considering dilation
  unsigned int eff_k_height = (k_height - 1) * dilation[0] + 1;
  /// effective kernel width considering dilation
  unsigned int eff_k_width = (k_width - 1) * dilation[1] + 1;

  unsigned int height = (in_height - 1) * mstride[0] + eff_k_height;
  unsigned int width = (in_width - 1) * mstride[1] + eff_k_width;

  unsigned int out_height = height - pt - pb; // rows of the full output
  unsigned int out_width = width - pl - pr;   // columns of the full output

  image.setZero();

  /// iterate over kernel-sized patches
  unsigned int H = k_height;
  unsigned int W = k_width;
  unsigned int C = image.channel();

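  /// Gather formulation (explanatory note, not in the original source):
  /// image(h, w) accumulates col_matrix(row = (c, r, s), col = (oh, ow))
  /// exactly when oh + pt == h * stride_h + r * dilation_h and
  /// ow + pl == w * stride_w + s * dilation_w; the divisibility tests and
  /// divisions below recover (h, w) from (oh, ow, r, s).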
  int out_i = -1;
  for (unsigned int oh = 0; oh < out_height; ++oh) {
    for (unsigned int ow = 0; ow < out_width; ++ow) {
      out_i++;
      int out_j = -1;
      for (unsigned int c = 0; c < C; ++c) {
        for (unsigned int r = 0; r < H; ++r) {
          for (unsigned int s = 0; s < W; ++s) {
            out_j++;
            if ((oh - (r * dilation[0] - pt)) % mstride[0] != 0)
              continue;
            if ((ow - (s * dilation[1] - pl)) % mstride[1] != 0)
              continue;
            unsigned int h = (oh - (r * dilation[0] - pt)) / mstride[0];
            unsigned int w = (ow - (s * dilation[1] - pl)) / mstride[1];
            if (h >= static_cast<unsigned int>(in_height) ||
                w >= static_cast<unsigned int>(in_width))
              continue;
            float *val = image.getAddress<float>(0, c, h, w);
            /// col_matrix is [C * k_height * k_width, out_height * out_width],
            /// so the row index is out_j and the column index is out_i
            *val += col_matrix.getValue<float>(0, 0, out_j, out_i);
          }
        }
      }
    }
  }
}

/**
 * @brief       reform the input data into a 2D matrix
 * A region is sampled considering @a padding, @a mstride and @a kdim;
 * each output position is mapped to one row of the matrix.
 *
 * @param [in] in input data
 * @param [in] kdim kernel dimension that defines the number of columns
 * @param [in] padding padding information
 * @param [in] mstride stride value : x, y direction
 * @param [in] dilation kernel dilation factor : x, y each
 * @param [out] out out tensor
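 *
 * Illustrative shape example (not from the original source): for a 1x1x4x4
 * input, a 3x3 kernel, stride 2, dilation 1 and no padding, the output is
 * 9x9, so @a out is reshaped to [9 * 9, 1 * 3 * 3] = [81, 9].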
 */
static void im2col_transpose(
  const Tensor &in, const TensorDim &kdim,
  const std::array<unsigned int, 4> &padding,
  const std::array<props::Stride, CONV2D_TRANSPOSE_DIM> &mstride,
  const std::array<props::Dilation, CONV2D_TRANSPOSE_DIM> &dilation,
  Tensor &out) {
  auto [pt, pb, pl, pr] = padding;

  int in_height = in.height();
  int in_width = in.width();

  unsigned int k_height = kdim.height();
  unsigned int k_width = kdim.width();

  /// effective kernel height considering dilation
  unsigned int eff_k_height = (k_height - 1) * dilation[0] + 1;
  /// effective kernel width considering dilation
  unsigned int eff_k_width = (k_width - 1) * dilation[1] + 1;

  unsigned int height = (in_height - 1) * mstride[0] + eff_k_height;
  unsigned int width = (in_width - 1) * mstride[1] + eff_k_width;

  unsigned int out_height = height - pt - pb;
  unsigned int out_width = width - pl - pr;

  out.reshape(
    TensorDim({out_height * out_width, in.channel() * k_height * k_width}));
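  /// row index = output position (oh * out_width + ow); column index =
  /// flattened (channel, kernel row, kernel column) offset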

  /// zero the whole matrix first: entries skipped by the stride tests below
  /// must remain zero, and the buffer is reused across batch slices
  out.setZero();

  float *out_data = out.getData();

  /// iterate over kernel-sized patches
  unsigned int owidth = out.width();

  unsigned int H = k_height;
  unsigned int W = k_width;
  unsigned int C = in.channel();

  int out_i = -1;
  for (unsigned int oh = 0; oh < out_height; ++oh) {
    for (unsigned int ow = 0; ow < out_width; ++ow) {
      out_i++;
      int out_j = -1;
      for (unsigned int c = 0; c < C; ++c) {
        for (unsigned int r = 0; r < H; ++r) {
          for (unsigned int s = 0; s < W; ++s) {
            out_j++;
            if ((oh - (r * dilation[0] - pt)) % mstride[0] != 0)
              continue;
            if ((ow - (s * dilation[1] - pl)) % mstride[1] != 0)
              continue;
            unsigned int h = (oh - (r * dilation[0] - pt)) / mstride[0];
            unsigned int w = (ow - (s * dilation[1] - pl)) / mstride[1];
            if (h >= static_cast<unsigned int>(in_height) ||
                w >= static_cast<unsigned int>(in_width))
              continue;
            out_data[(out_i)*owidth + out_j] += in.getValue<float>(0, c, h, w);
          }
        }
      }
    }
  }
}

} // namespace

enum ConvParams { weight, bias };

Conv2DTransposeLayer::Conv2DTransposeLayer(
  const std::array<unsigned int, CONV2D_TRANSPOSE_DIM * 2> &padding_) :
  LayerImpl(),
  padding(padding_),
  conv_props(
    props::FilterSize(), std::array<props::KernelSize, CONV2D_TRANSPOSE_DIM>(),
    std::array<props::Stride, CONV2D_TRANSPOSE_DIM>(), props::Padding2D(),
    std::array<props::Dilation, CONV2D_TRANSPOSE_DIM>()) {
  wt_idx.fill(std::numeric_limits<unsigned>::max());
}

void Conv2DTransposeLayer::finalize(InitLayerContext &context) {
  NNTR_THROW_IF(context.getNumInputs() != 1, std::invalid_argument)
    << "Transposed convolution layer takes only one input";

  const TensorDim &in_dim = context.getInputDimensions()[0];

  auto &weight_regularizer =
    std::get<props::WeightRegularizer>(*layer_impl_props);
  auto &weight_regularizer_constant =
    std::get<props::WeightRegularizerConstant>(*layer_impl_props);
  auto &weight_initializer =
    std::get<props::WeightInitializer>(*layer_impl_props);
  auto &weight_decay = std::get<props::WeightDecay>(*layer_impl_props);
  auto &bias_decay = std::get<props::BiasDecay>(*layer_impl_props);
  auto &bias_initializer = std::get<props::BiasInitializer>(*layer_impl_props);
  auto &disable_bias = std::get<props::DisableBias>(*layer_impl_props);

  unsigned int filter_size = std::get<props::FilterSize>(conv_props);
  auto &kernel_size =
    std::get<std::array<props::KernelSize, CONV2D_TRANSPOSE_DIM>>(conv_props);
  auto &stride =
    std::get<std::array<props::Stride, CONV2D_TRANSPOSE_DIM>>(conv_props);
  auto &dilation =
    std::get<std::array<props::Dilation, CONV2D_TRANSPOSE_DIM>>(conv_props);

  TensorDim kernel_dim =
    TensorDim(filter_size, in_dim.channel(), kernel_size[0], kernel_size[1]);
  TensorDim bias_dim = TensorDim(1, filter_size, 1, 1);

  padding = std::get<props::Padding2D>(conv_props)
              .compute(in_dim, kernel_dim, {stride[0], stride[1]},
                       {dilation[0], dilation[1]});

  wt_idx[ConvParams::weight] = context.requestWeight(
    kernel_dim, weight_initializer, weight_regularizer,
    weight_regularizer_constant, weight_decay, "filter", true, 0);

  if (disable_bias.empty() || disable_bias.get() == false) {
    wt_idx[ConvParams::bias] =
      context.requestWeight(bias_dim, bias_initializer, WeightRegularizer::NONE,
                            1.0f, bias_decay, "bias", true, 0);
  }

  auto [pt, pb, pl, pr] = padding;

  int in_height = in_dim.height();
  int in_width = in_dim.width();

  unsigned int k_height = kernel_size[0];
  unsigned int k_width = kernel_size[1];

  /// effective kernel height considering dilation
  unsigned int eff_k_height = (k_height - 1) * dilation[0] + 1;
  /// effective kernel width considering dilation
  unsigned int eff_k_width = (k_width - 1) * dilation[1] + 1;

  unsigned int height = (in_height - 1) * stride[0] + eff_k_height;
  unsigned int width = (in_width - 1) * stride[1] + eff_k_width;

  unsigned int out_height = height - pt - pb;
  unsigned int out_width = width - pl - pr;
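
  // Worked example (an illustration, not from the original source): for an
  // 8x8 input, a 3x3 kernel, stride 2, dilation 1 and no padding,
  // height = (8 - 1) * 2 + 3 = 17, so the layer emits a 17x17 feature map.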

  TensorDim out_dim;
  out_dim.batch(in_dim.batch());
  out_dim.channel(filter_size);
  out_dim.height(out_height);
  out_dim.width(out_width);
  context.setOutputDimensions({out_dim});

  NNTR_THROW_IF(height < kernel_size[0] || width < kernel_size[1],
                std::invalid_argument)
    << "Failed to initialize: input size + padding is smaller than the "
       "effective kernel";

  unsigned int IM = std::numeric_limits<int>::max();

  NNTR_THROW_IF(height - padding[0] - kernel_size[0] > IM ||
                  width - padding[2] - kernel_size[1] > IM,
                std::invalid_argument)
    << "Failed to initialize: calculated patch end is over int max";
}

void Conv2DTransposeLayer::forwarding(RunLayerContext &context, bool training) {
  int status = ML_ERROR_NONE;

  unsigned int filter_size = std::get<props::FilterSize>(conv_props);
  auto &stride =
    std::get<std::array<props::Stride, CONV2D_TRANSPOSE_DIM>>(conv_props);
  auto &dilation =
    std::get<std::array<props::Dilation, CONV2D_TRANSPOSE_DIM>>(conv_props);

  Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
  Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX);

  Tensor &filter_kernel = context.getWeight(wt_idx[ConvParams::weight]);

  /** Calculate Convolution 2D Transpose
   *
   * This is the 2D Matrix Shape [ height ] x [ width ]
   *   . Height : filter_size
   *   . Width  : Input Channel * Kernel_size[0] * Kernel_size[1]
   *
   *                              imKernel
   *                        +------|------|------+
   *                        |------|------|------|
   * [filter_size (height)] |------|------|------|
   *                        |------|------|------|
   *                        +------|------|------+
   *                     [Input Channel * Kernel_size[0]
   *                       * Kernel_size[1] (width)]
   *
   *
   * After im2col_transpose (in : input)
   *
   * This is the 2D Matrix Shape [ height ] x [ width ]
   *   . Height : Input Channel * Kernel_size[0] * Kernel_size[1]
   *   . Width  : output_dim.height * output_dim.width
   *
   *                      +-|-|-|-|      |-|-|-|-+
   *   [Input Channel     | | | | |      | | | | |
   *   * Kernel_size[0]   |_|_|_|_|      |_|_|_|_|
   *   * Kernel_size[1]   | | | | | .... | | | | |
   *    (height)]         |_|_|_|_|      |_|_|_|_|
   *                      | | | | |      | | | | |
   *                      +_|_|_|_|      |_|_|_|_+
   *                     [ output_dim.height
   *                      * output_dim.width (width) ]
   *
   * Output Dimension
   *   -> [Channel ( = filter_size = output_dim.channel )]
   *       x [output_dim.height x output_dim.width]
   */
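
  // Concrete shape walk-through (an illustration, not from the original
  // source): with 4 input channels, a 3x3 kernel, 8 filters and a 17x17
  // output, the GEMM below is [8, 4 * 3 * 3] x [4 * 3 * 3, 17 * 17]
  // (filter_kernel times the transposed column matrix) = [8, 289].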
  const TensorDim &in_dim = input_.getDim();
  const TensorDim &out_dim = hidden_.getDim();
  const TensorDim &filter_dim = filter_kernel.getDim();
  TensorDim filter_dim_squeezed{filter_kernel.batch(),
                                filter_kernel.getDim().getFeatureLen()};

  filter_kernel.reshape(filter_dim_squeezed);

  /**
   * The temporary column matrix is zero-filled up front; it is faster to do
   * it this way than to selectively zero the padded area.
   */
  auto forwarding_job = [&](unsigned int s, unsigned int e, unsigned int pid,
                            void *user_data) {
    Tensor result = Tensor(
      calcCol2ImOutputDim(out_dim, filter_dim)); // result is temporary data
    result.setZero();
    for (unsigned int b = s; b < e; ++b) {
      Tensor out = hidden_.getBatchSlice(b, 1);
      out.reshape({filter_size, out_dim.width() * out_dim.height()});
      Tensor in_sub = input_.getBatchSlice(b, 1);

      im2col_transpose(in_sub, filter_dim, padding, stride, dilation, result);
      filter_kernel.dot(result, out, false, true);
    }
    result.deallocate();
  };

  auto workers = ParallelBatch(forwarding_job, in_dim.batch(), nullptr);

  if (workers.getNumWorkers() > 1) {
    workers.run();
  } else {
    forwarding_job(0, in_dim.batch(), 0, nullptr);
  }

  filter_kernel.reshape(filter_dim);
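  // The bias is a [1, filter_size, 1, 1] tensor; add_i broadcasts it over
  // the batch and spatial dimensions of the output.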
  if (auto &disable_bias = std::get<props::DisableBias>(*layer_impl_props);
      disable_bias.empty() || disable_bias.get() == false) {
    Tensor &bias_kernel = context.getWeight(wt_idx[ConvParams::bias]);
    status = hidden_.add_i(bias_kernel);
    if (status != ML_ERROR_NONE) {
      throw std::invalid_argument("[Conv2DTranspose] adding bias failed");
    }
  }
}

void Conv2DTransposeLayer::calcDerivative(RunLayerContext &context) {
  unsigned int filter_size = std::get<props::FilterSize>(conv_props);
  auto &stride =
    std::get<std::array<props::Stride, CONV2D_TRANSPOSE_DIM>>(conv_props);
  auto &dilation =
    std::get<std::array<props::Dilation, CONV2D_TRANSPOSE_DIM>>(conv_props);

  const Tensor &derivative = context.getIncomingDerivative(SINGLE_INOUT_IDX);
  Tensor &input_derivative = context.getOutgoingDerivative(SINGLE_INOUT_IDX);
  Tensor &filter_kernel = context.getWeight(wt_idx[ConvParams::weight]);

  TensorDim filter_dim = filter_kernel.getDim();
  TensorDim filter_dim_squeezed{filter_kernel.batch(),
                                filter_kernel.getDim().getFeatureLen()};

  filter_kernel.reshape(filter_dim_squeezed);

  /// for each batch
  /// filter_kernel^T X derivative  -> column matrix
  /// col2im(column matrix) to reconstruct the original image
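  ///
  /// Shape walk-through (an illustration, not from the original source):
  /// with 8 filters, 4 input channels, a 3x3 kernel and a 17x17 derivative,
  /// [8, 4 * 3 * 3]^T x [8, 17 * 17] gives the [4 * 3 * 3, 17 * 17] column
  /// matrix that col2im_transpose scatters back onto the input gradient.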

  auto compute_derivative = [&](unsigned int s, unsigned int e,
                                unsigned int pid, void *user_data) {
    Tensor result =
      Tensor(calcCol2ImOutputDim(derivative.getDim(), filter_dim));

    for (unsigned int b = s; b < e; ++b) {
      Tensor deriv_sub = derivative.getBatchSlice(b, 1);
      Tensor in_deriv_sub = input_derivative.getBatchSlice(b, 1);
      deriv_sub.reshape(
        {filter_size, derivative.width() * derivative.height()});
      filter_kernel.dot(deriv_sub, result, true, false);
      col2im_transpose(result, filter_dim, padding, stride, dilation,
                       in_deriv_sub);
    }
    result.deallocate();
  };

  auto workers = ParallelBatch(compute_derivative, derivative.batch(), nullptr);

  if (workers.getNumWorkers() > 1) {
    workers.run();
  } else {
    compute_derivative(0, derivative.batch(), 0, nullptr);
  }

  filter_kernel.reshape(filter_dim);
}

void Conv2DTransposeLayer::calcGradient(RunLayerContext &context) {
  unsigned int filter_size = std::get<props::FilterSize>(conv_props);
  auto &stride =
    std::get<std::array<props::Stride, CONV2D_TRANSPOSE_DIM>>(conv_props);
  auto &dilation =
    std::get<std::array<props::Dilation, CONV2D_TRANSPOSE_DIM>>(conv_props);

  const Tensor &derivative = context.getIncomingDerivative(SINGLE_INOUT_IDX);
  Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);

  Tensor &delK = context.getWeightGrad(wt_idx[ConvParams::weight]);
  delK.setZero();

  TensorDim filter_dim = delK.getDim();
  TensorDim filter_dim_squeezed{filter_dim.batch(), filter_dim.getFeatureLen()};

  delK.reshape(filter_dim_squeezed);

  /**
   * no need to set result to zero here: im2col_transpose zero-fills its
   * output before scattering values into it
   */

  TensorDim out_dim_squeezed{filter_size,
                             derivative.width() * derivative.height()};
  auto workers = ParallelBatch(input_.batch());
  /// input -(im2col_transpose)-> column matrix, and
  /// output = filter x column_matrix^T, so delK = dy x column_matrix
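
  // Shape walk-through (an illustration, not from the original source): with
  // dy as [8, 17 * 17] and the column matrix as [17 * 17, 4 * 3 * 3], the
  // dot below yields the [8, 4 * 3 * 3] filter gradient.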
  if (workers.getNumWorkers() > 1) {

    TensorDim delK_ext = filter_dim_squeezed;
    delK_ext.batch(input_.batch());

    Tensor delK_par = Tensor(delK_ext);
    delK_par.setZero();

    auto calc_grad_job = [&](unsigned int s, unsigned int e, unsigned int pid,
                             void *user_data) {
      Tensor result =
        Tensor(calcCol2ImOutputDim(derivative.getDim(), filter_dim));
      result.setZero();
      for (unsigned int b = s; b < e; ++b) {
        Tensor deriv_sub = derivative.getBatchSlice(b, 1);
        Tensor delK_sub = delK_par.getBatchSlice(b, 1);
        deriv_sub.reshape(out_dim_squeezed);

        Tensor in_sub = input_.getBatchSlice(b, 1);

        /**
         * @todo this result can be cached from the forward iteration at the
         * expense of memory. In this case, memory of im2col_result must be
         * saved for the whole batch. try this while benchmarking.
         */
        im2col_transpose(in_sub, filter_dim, padding, stride, dilation, result);
        deriv_sub.dot(result, delK_sub, false, false);
      }
      result.deallocate();
    };

    workers.setCallback(calc_grad_job, nullptr);

    workers.run();

    for (unsigned int b = 0; b < input_.batch(); ++b) {
      Tensor delK_sub = delK_par.getBatchSlice(b, 1);
      delK.add_i(delK_sub);
    }

  } else {
    Tensor result =
      Tensor(calcCol2ImOutputDim(derivative.getDim(), filter_dim));
    result.setZero();

    for (unsigned int b = 0; b < input_.batch(); ++b) {
      Tensor deriv_sub = derivative.getBatchSlice(b, 1);
      deriv_sub.reshape(out_dim_squeezed);

      Tensor in_sub = input_.getBatchSlice(b, 1);

      /**
       * @todo this result can be cached from the forward iteration at the
       * expense of memory. In this case, memory of im2col_result must be saved
       * for the whole batch. try this while benchmarking.
       */
      im2col_transpose(in_sub, filter_dim, padding, stride, dilation, result);
      deriv_sub.dot(result, delK, false, false, b == 0 ? 0.0f : 1.0f);
    }
    result.deallocate();
  }
  delK.reshape(filter_dim);
  if (auto &disable_bias = std::get<props::DisableBias>(*layer_impl_props);
      disable_bias.empty() || disable_bias.get() == false) {
    Tensor &delBias = context.getWeightGrad(wt_idx[ConvParams::bias]);
    derivative.sum({0, 2, 3}, delBias);
  }
}

void Conv2DTransposeLayer::exportTo(
  Exporter &exporter, const ml::train::ExportMethods &method) const {
  LayerImpl::exportTo(exporter, method);
  exporter.saveResult(conv_props, method, this);
}

void Conv2DTransposeLayer::setProperty(const std::vector<std::string> &values) {
  auto remain_props = loadProperties(values, conv_props);
  LayerImpl::setProperty(remain_props);
}

} /* namespace nntrainer */
        

Generated by: LCOV version 2.0-1