LCOV - code coverage report
Current view: top level - nntrainer/layers - conv2d_layer.cpp (source / functions)
Test: coverage_filtered.info
Test Date: 2025-12-14 20:38:17

                Coverage    Total    Hit
Lines:          88.3 %      247      218
Functions:      87.5 %      16       14

            Line data    Source code
       1              : // SPDX-License-Identifier: Apache-2.0
       2              : /**
       3              :  * Copyright (C) 2020 Jijoong Moon <jijoong.moon@samsung.com>
       4              :  *
       5              :  * @file   conv2d_layer.cpp
       6              :  * @date   02 June 2020
       7              :  * @see    https://github.com/nnstreamer/nntrainer
       8              :  * @author Jijoong Moon <jijoong.moon@samsung.com>
       9              :  * @author Jihoon Lee <jhoon.it.lee@samsung.com>
      10              :  * @bug    No known bugs except for NYI items
      11              :  * @brief  This is Convolution Layer Class for Neural Network
      12              :  *
      13              :  */
      14              : #include <algorithm>
      15              : #include <cstring>
      16              : #include <limits>
      17              : #include <string>
      18              : 
      19              : #include <conv2d_layer.h>
      20              : #include <cpu_backend.h>
      21              : #include <layer_context.h>
      22              : #include <lazy_tensor.h>
      23              : #include <nntr_threads.h>
      24              : #include <nntrainer_error.h>
      25              : #include <nntrainer_log.h>
      26              : #include <node_exporter.h>
      27              : #include <profiler.h>
      28              : #include <tensor_dim.h>
      29              : #include <thread>
      30              : #include <util_func.h>
      31              : 
      32              : namespace nntrainer {
      33              : 
      34              : static constexpr size_t SINGLE_INOUT_IDX = 0;
      35              : 
      36              : namespace {
      37              : 
      38          906 : static TensorDim calcCol2ImOutputDim(const TensorDim &out,
      39              :                                      const TensorDim &kdim) {
      40              : 
      41         1812 :   return TensorDim({kdim.getFeatureLen(), out.width() * out.height()},
      42         1812 :                    out.getTensorType());
      43              : }
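// Editorial example (not part of the original source; shapes assumed for
// illustration): for a filter kernel dim of (8, 3, 3, 3) and a 32x32 output,
// getFeatureLen() is 3 * 3 * 3 = 27, so the column matrix dim above is
// (27, 32 * 32) = (27, 1024).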
      44              : 
      45              : /**
      46              :  * @brief     reconstruct image data from 2d column matrix
      47              :  *
       48              :  * @param[in] col_matrix input 2d column matrix
       49              :  * @param[in] kdim kernel dimension used to define the number of rows
      50              :  * @param[in] padding padding information
      51              :  * @param[in] mstride stride value : x, y direction
      52              :  * @param[in] dilation kernel dilation factor : x, y each
       53              :  * @param[out] image image tensor to write the result into
      54              :  */
      55          961 : static void col2im(const Tensor &col_matrix, const TensorDim &kdim,
      56              :                    const std::array<unsigned, 4> &padding,
      57              :                    const std::array<props::Stride, CONV2D_DIM> &mstride,
      58              :                    const std::array<props::Dilation, CONV2D_DIM> &dilation,
      59              :                    Tensor &image) {
      60              : 
      61          961 :   auto pt = padding[0];
      62          961 :   auto pb = padding[1];
      63          961 :   auto pl = padding[2];
      64          961 :   auto pr = padding[3];
      65              : 
      66          961 :   unsigned k_height = kdim.height();
      67          961 :   unsigned k_width = kdim.width();
      68              : 
      69              :   /// effective kernel height considering dilation
      70          961 :   unsigned eff_k_height = (k_height - 1) * dilation[0] + 1;
      71              :   /// effective kernel width considering dilation
      72          961 :   unsigned eff_k_width = (k_width - 1) * dilation[1] + 1;
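  // Editorial worked example (not part of the original source): a 3x3 kernel
  // with dilation {2, 2} covers an effective 5x5 window, since
  // (3 - 1) * 2 + 1 = 5.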
      73              : 
      74          961 :   unsigned im_channel = image.channel();
      75          961 :   int im_height = image.height();
      76          961 :   int im_width = image.width();
      77              : 
      78          961 :   unsigned hstride = mstride[0];
      79          961 :   unsigned wstride = mstride[1];
      80              : 
      81          961 :   unsigned hdilation = dilation[0];
      82          961 :   unsigned wdilation = dilation[1];
      83              : 
      84              :   /// image considering padding
      85          961 :   unsigned im_eff_height = im_height + pt + pb;
      86          961 :   unsigned im_eff_width = im_width + pl + pr;
      87          961 :   image.setZero();
      88              : 
      89          961 :   int h_stride_end = im_eff_height - eff_k_height - pt;
      90          961 :   int w_stride_end = im_eff_width - eff_k_width - pl;
      91              : 
       92              :   /** @todo We need to implement a way for this kind of function to work
       93              :    * inside of Tensor. Then we could remove the accesses to getData or
       94              :    * getValue, which depend on the data type.
       95              :    */
      96          961 :   auto apply_data = [&]<typename T>(T *val) {
      97              :     unsigned col_w = 0;
      98         6094 :     for (int hs = -(int)pt; hs <= h_stride_end; hs += hstride) {
      99        40918 :       for (int ws = -(int)pl; ws <= w_stride_end; ws += wstride) {
     100              :         unsigned col_h = 0;
     101        35785 :         int patch_height_end = hs + eff_k_height;
     102        35785 :         int patch_width_end = ws + eff_k_width;
     103       278217 :         for (unsigned c = 0; c < im_channel; c++) {
     104      1330534 :           for (int h = hs; h < patch_height_end; h += hdilation) {
     105      1088102 :             if (h < 0 || im_height <= h) {
     106         5876 :               col_h += k_width;
     107         5876 :               continue;
     108              :             }
     109      6165092 :             for (int w = ws; w < patch_width_end; w += wdilation) {
     110      5082866 :               if (w < 0 || im_width <= w) {
     111        17124 :                 col_h++;
     112        17124 :                 continue;
     113              :               }
     114              : 
     115      5065742 :               val = image.getAddress<T>(0, c, h, w);
     116      5065742 :               *val += col_matrix.getValue<T>(0, 0, col_h, col_w);
     117      5065742 :               col_h++;
     118              :             }
     119              :           }
     120              :         }
     121        35785 :         col_w++;
     122              :       }
     123              :     }
     124         1922 :   };
     125              : 
     126          961 :   if (image.getDataType() == nntrainer::Tdatatype::FP32) {
     127              :     float val;
     128          961 :     apply_data(&val);
     129              :   }
     130              : #ifdef ENABLE_FP16
     131              :   else if (image.getDataType() == nntrainer::Tdatatype::FP16) {
     132              :     _FP16 val;
     133              :     apply_data(&val);
     134              :   }
     135              : #endif
     136              :   else {
     137            0 :     throw std::runtime_error("Not supported datatype");
     138              :   }
     139          961 : }
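
// Editorial sketch (not part of the original source): a minimal, standalone
// col2im over plain float arrays, assuming a single channel, stride 1, no
// padding and no dilation, to illustrate the accumulation pattern used above.
// The column matrix is (k_h * k_w) x (out_h * out_w); each column holds one
// kernel-sized patch, and overlapping patch entries are summed into the image.
#include <algorithm>
#include <cstdio>
#include <vector>

static void col2im_sketch(const std::vector<float> &col, int k_h, int k_w,
                          int im_h, int im_w, std::vector<float> &image) {
  int out_h = im_h - k_h + 1;
  int out_w = im_w - k_w + 1;
  int n_cols = out_h * out_w;
  std::fill(image.begin(), image.end(), 0.0f);
  int col_w = 0; // one column per output position
  for (int hs = 0; hs < out_h; ++hs) {
    for (int ws = 0; ws < out_w; ++ws) {
      int col_h = 0; // one row per kernel element
      for (int h = hs; h < hs + k_h; ++h) {
        for (int w = ws; w < ws + k_w; ++w) {
          image[h * im_w + w] += col[col_h * n_cols + col_w];
          ++col_h;
        }
      }
      ++col_w;
    }
  }
}

int main() {
  // A 2x2 kernel over a 3x3 image gives a 4 x 4 column matrix (all ones here).
  std::vector<float> col(4 * 4, 1.0f), image(3 * 3, 0.0f);
  col2im_sketch(col, 2, 2, 3, 3, image);
  for (int h = 0; h < 3; ++h)
    std::printf("%g %g %g\n", image[h * 3], image[h * 3 + 1], image[h * 3 + 2]);
  // The center pixel is covered by all four 2x2 patches, so it accumulates 4;
  // edge pixels accumulate 2 and corners 1.
  return 0;
}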
     140              : 
     141              : /**
      142              :  * @brief     reform the input data into a 2d matrix
      143              :  * a region of unit @a kdim is sampled considering @a padding and @a mstride
      144              :  * Each region is mapped to one column,
      145              :  * if channel mode, the kernel channel is considered part of the kernel feature
      146              :  * if not, the kernel channel is considered part of the output dimension
     147              :  *
     148              :  * @param[in] in input data
      149              :  * @param[in] kdim kernel dimension used to define the number of rows
     150              :  * @param[in] padding padding information
     151              :  * @param[in] mstride stride value : x, y direction
     152              :  * @param[in] dilation kernel dilation factor : x, y each
      153              :  * @param[out] out output tensor; padding is set on every call for now
      154              :  * @note if out is an initialized tensor, setting the padding is skipped.
     155              :  */
     156         4330 : static void im2col(const Tensor &in, const TensorDim &kdim,
     157              :                    const std::array<unsigned int, 4> &padding,
     158              :                    const std::array<props::Stride, CONV2D_DIM> &mstride,
     159              :                    const std::array<props::Dilation, CONV2D_DIM> &dilation,
     160              :                    Tensor &out) {
      161              :   /// for channel-last mode; this is deprecated for now, left here on
      162              :   /// purpose.
     163              :   /** @code
     164              :   //   ================ initialize part ====================
     165              :   //   out_height -= 2;
     166              :   //   out =
     167              :   //     Tensor(k_height * k_width, in.channel() * (out_height) *
     168              :   //     (out_width));
     169              :   //   unsigned int im_w = 0;
     170              :   //   ================ loop part ====================
     171              :   //   if (eff_k_height > height || eff_k_width > width)
     172              :   //     throw std::runtime_error("Kernel shape bigger than input shape");
     173              : 
     174              :   //   for (unsigned int c = 0; c < channel; ++c) {
     175              :   //     for (unsigned int hs = 0; hs <= height - eff_k_height; hs +=
     176              :   //     mstride[0]) {
     177              :   //       for (unsigned int ws = 0; ws <= width - eff_k_width; ws +=
     178              :   //       mstride[1]) {
     179              :   //         unsigned int im_h = 0;
     180              :   //         unsigned int patch_height_end = eff_k_height + hs;
     181              :   //         unsigned int patch_width_end = eff_k_width + ws;
     182              : 
     183              :   //         for (unsigned int h = hs; h < patch_height_end; h += dilation[0]) {
     184              :   //           if (h < ph || in_height + ph <= h) {
     185              :   //             im_h += k_width;
     186              :   //             continue;
     187              :   //           }
     188              : 
     189              :   //           for (unsigned int w = ws; w < patch_width_end; w += dilation[1])
     190              :   //           {
     191              :   //             if (w < pw || in_width + pw <= w) {
     192              :   //               im_h++;
     193              :   //               continue;
     194              :   //             }
     195              : 
     196              :   //             float val = in.getValue(0, c, h - ph, w - pw);
     197              :   //             out.setValue(0, 0, im_h, im_w, val);
     198              :   //             im_h++;
     199              :   //           }
     200              :   //         }
     201              :   //         im_w++;
     202              :   //       }
     203              :   //     }
     204              :   //   }
     205              :   */
     206              : 
     207         4330 :   auto pt = padding[0];
     208         4330 :   auto pb = padding[1];
     209         4330 :   auto pl = padding[2];
     210         4330 :   auto pr = padding[3];
     211              : 
     212         4330 :   unsigned int channel = in.channel();
     213         4330 :   int in_height = in.height();
     214         4330 :   int in_width = in.width();
     215         4330 :   unsigned int height = in_height + pt + pb;
     216         4330 :   unsigned int width = in_width + pl + pr;
     217         4330 :   unsigned int k_height = kdim.height();
     218         4330 :   unsigned int k_width = kdim.width();
     219              : 
     220              :   /// effective kernel height considering dilation
     221         4330 :   unsigned int eff_k_height = (k_height - 1) * dilation[0] + 1;
     222              :   /// effective kernel width considering dilation
     223         4330 :   unsigned int eff_k_width = (k_width - 1) * dilation[1] + 1;
     224              : 
     225         4330 :   unsigned int out_height = (height - eff_k_height) / mstride[0] + 1;
     226         4330 :   unsigned int out_width = (width - eff_k_width) / mstride[1] + 1;
     227              : 
     228         4330 :   out.reshape(
     229         4330 :     TensorDim({out_height * out_width, in.channel() * k_height * k_width},
     230              :               in.getTensorType()));
     231              :   // float *out_data = out.getData();
     232              : 
     233         4330 :   auto apply_data = [&]<typename T>(T *out_data) {
     234         4330 :     int h_stride_end = height - eff_k_height - pt;
     235         4330 :     int w_stride_end = width - eff_k_width - pl;
     236              : 
     237              :     /// get a patch, size of kernel
     238              :     /// hs is height_strided, ws is width_strided
     239         4330 :     unsigned int owidth = out.width();
     240              :     unsigned int base_im_w = 0;
     241        54395 :     for (int hs = -(int)pt; hs <= h_stride_end; hs += mstride[0]) {
     242              :       unsigned int base_im_h = 0;
     243        50065 :       int patch_height_end = eff_k_height + hs;
     244              :       /// map the patch to a single line looping through channel
      245              :       // We need to optimize this padding & copy. Maybe use multi-threading,
      246              :       // or SIMD
     247       173239 :       for (unsigned int c = 0; c < channel; ++c) {
     248       694664 :         for (int h = hs; h < patch_height_end; h += dilation[0]) {
     249       571490 :           if (h < 0 || in_height <= h) {
     250         2682 :             base_im_h += k_width;
     251         2682 :             continue;
     252              :           }
     253              : 
     254              :           unsigned int im_w = base_im_w;
     255      8010544 :           for (int ws = -(int)pl; ws <= w_stride_end; ws += mstride[1]) {
     256              :             unsigned int im_h = base_im_h;
     257      7441736 :             int patch_width_end = eff_k_width + ws;
     258              : 
     259     43589352 :             for (int w = ws; w < patch_width_end; w += dilation[1]) {
     260     36147616 :               if (w < 0 || in_width <= w) {
     261        57852 :                 im_h++;
     262        57852 :                 continue;
     263              :               }
     264     36089764 :               out_data[im_w * owidth + im_h] = in.getValue<T>(0, c, h, w);
     265     36089764 :               im_h++;
     266              :             }
     267      7441736 :             im_w++;
     268              :           }
     269       568808 :           base_im_h += k_width;
     270              :         }
     271              :       }
     272        50065 :       base_im_w += out_width;
     273              :     }
     274         8660 :   };
     275              : 
     276         4330 :   if (out.getDataType() == nntrainer::Tdatatype::FP32) {
     277              :     float *out_data = out.getData<float>();
     278         4330 :     apply_data(out_data);
     279              :   }
     280              : #ifdef ENABLE_FP16
     281              :   else if (out.getDataType() == nntrainer::Tdatatype::FP16) {
     282              :     _FP16 *out_data = out.getData<_FP16>();
     283              :     apply_data(out_data);
     284              :   }
     285              : #endif
     286              :   else {
     287            0 :     throw std::runtime_error("Not supported datatype");
     288              :   }
     289         4330 : }
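
// Editorial sketch (not part of the original source): a minimal, standalone
// im2col over plain float arrays, assuming a single channel, stride 1, no
// padding and no dilation. Mirroring the layout used above, each kernel-sized
// patch becomes one row of the output, so the result is
// (out_h * out_w) x (k_h * k_w) and convolution reduces to a matrix product.
#include <cstdio>
#include <vector>

static void im2col_sketch(const std::vector<float> &image, int im_h, int im_w,
                          int k_h, int k_w, std::vector<float> &out) {
  int out_h = im_h - k_h + 1;
  int out_w = im_w - k_w + 1;
  int patch_len = k_h * k_w;
  out.assign(out_h * out_w * patch_len, 0.0f);
  int row = 0; // one row per output position
  for (int hs = 0; hs < out_h; ++hs) {
    for (int ws = 0; ws < out_w; ++ws) {
      int col = 0; // one column per kernel element
      for (int h = hs; h < hs + k_h; ++h) {
        for (int w = ws; w < ws + k_w; ++w) {
          out[row * patch_len + col] = image[h * im_w + w];
          ++col;
        }
      }
      ++row;
    }
  }
}

int main() {
  // A 3x3 image with values 1..9 and a 2x2 kernel give 4 patches of length 4.
  std::vector<float> image = {1, 2, 3, 4, 5, 6, 7, 8, 9}, out;
  im2col_sketch(image, 3, 3, 2, 2, out);
  for (int r = 0; r < 4; ++r)
    std::printf("%g %g %g %g\n", out[r * 4], out[r * 4 + 1], out[r * 4 + 2],
                out[r * 4 + 3]);
  // Expected rows: {1,2,4,5}, {2,3,5,6}, {4,5,7,8}, {5,6,8,9}.
  return 0;
}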
     290              : } // namespace
     291              : 
     292              : enum ConvParams { weight, bias };
     293              : 
     294          192 : Conv2DLayer::Conv2DLayer(
     295          192 :   const std::array<unsigned int, CONV2D_DIM * 2> &padding_) :
     296              :   LayerImpl(),
     297          192 :   padding(padding_),
     298          576 :   conv_props(props::FilterSize(), std::array<props::KernelSize, CONV2D_DIM>(),
     299          768 :              std::array<props::Stride, CONV2D_DIM>(), props::Padding2D(),
     300          384 :              std::array<props::Dilation, CONV2D_DIM>()) {
     301              :   wt_idx.fill(std::numeric_limits<unsigned>::max());
     302          192 : }
     303              : 
     304          149 : void Conv2DLayer::finalize(InitLayerContext &context) {
     305          149 :   NNTR_THROW_IF(context.getNumInputs() != 1, std::invalid_argument)
     306              :     << "Convolution layer takes only one input";
     307              : 
     308              :   const TensorDim &in_dim = context.getInputDimensions()[0];
     309              : 
     310              :   auto &weight_regularizer =
     311              :     std::get<props::WeightRegularizer>(*layer_impl_props);
     312              :   auto &weight_regularizer_constant =
     313              :     std::get<props::WeightRegularizerConstant>(*layer_impl_props);
     314              :   auto &weight_initializer =
     315              :     std::get<props::WeightInitializer>(*layer_impl_props);
     316              :   auto &weight_decay = std::get<props::WeightDecay>(*layer_impl_props);
     317              :   auto &bias_decay = std::get<props::BiasDecay>(*layer_impl_props);
     318              :   auto &bias_initializer = std::get<props::BiasInitializer>(*layer_impl_props);
     319              :   auto &disable_bias = std::get<props::DisableBias>(*layer_impl_props);
     320              : 
     321          149 :   unsigned int filter_size = std::get<props::FilterSize>(conv_props);
     322              :   auto &kernel_size =
     323              :     std::get<std::array<props::KernelSize, CONV2D_DIM>>(conv_props);
     324              :   auto &stride = std::get<std::array<props::Stride, CONV2D_DIM>>(conv_props);
     325              :   auto &dilation =
     326              :     std::get<std::array<props::Dilation, CONV2D_DIM>>(conv_props);
     327              : 
     328              :   auto in_t_type = in_dim.getTensorType();
     329          149 :   in_t_type.data_type = context.getWeightDataType();
     330              : 
     331              :   TensorDim kernel_dim = TensorDim(filter_size, in_dim.channel(),
     332          149 :                                    kernel_size[0], kernel_size[1], in_t_type);
     333              : 
     334          149 :   TensorDim bias_dim = TensorDim(1, filter_size, 1, 1, in_t_type);
     335              : 
     336              :   padding = std::get<props::Padding2D>(conv_props)
     337          149 :               .compute(in_dim, kernel_dim, {stride[0], stride[1]},
     338              :                        {dilation[0], dilation[1]});
     339              : 
     340          149 :   wt_idx[ConvParams::weight] = context.requestWeight(
     341              :     kernel_dim, weight_initializer, weight_regularizer,
     342              :     weight_regularizer_constant, weight_decay, "filter", true, 0);
     343              : 
     344          149 :   if (disable_bias.empty() || disable_bias.get() == false) {
     345          149 :     wt_idx[ConvParams::bias] =
     346          298 :       context.requestWeight(bias_dim, bias_initializer, WeightRegularizer::NONE,
     347              :                             1.0f, bias_decay, "bias", true, 0);
     348              :   }
     349              : 
      350              :   // this output_dim must be the same as the dimension of hidden
     351          149 :   unsigned int eff_in_height = in_dim.height() + padding[0] + padding[1];
     352          149 :   unsigned int eff_in_width = in_dim.width() + padding[2] + padding[3];
     353              : 
     354          149 :   unsigned int eff_k_height = (kernel_size[0] - 1) * dilation[0] + 1;
     355          149 :   unsigned int eff_k_width = (kernel_size[1] - 1) * dilation[1] + 1;
     356              : 
     357          149 :   TensorDim out_dim;
     358          149 :   out_dim.batch(in_dim.batch());
     359          149 :   out_dim.channel(filter_size);
     360          149 :   out_dim.height((eff_in_height - eff_k_height) / stride[0] + 1);
     361          149 :   out_dim.width((eff_in_width - eff_k_width) / stride[1] + 1);
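  // Editorial worked example (not part of the original source): an input of
  // height 32 with padding {1, 1}, a 3x3 kernel, dilation 1 and stride 1 gives
  // out height = (32 + 1 + 1 - 3) / 1 + 1 = 32.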
     362              : 
     363              :   out_dim.setTensorType(in_dim.getTensorType());
     364              : 
     365          149 :   context.setOutputDimensions({out_dim});
     366              : 
     367          298 :   NNTR_THROW_IF(eff_in_height < kernel_size[0] || eff_in_width < kernel_size[1],
     368              :                 std::invalid_argument)
     369              :     << "Failed to initialize: in size + padding is smaller than effective "
     370              :        "kernel";
     371              : 
     372              :   unsigned int IM = std::numeric_limits<int>::max();
     373              : 
     374          146 :   NNTR_THROW_IF(eff_in_height - padding[0] - kernel_size[0] > IM ||
     375              :                   eff_in_width - padding[2] - kernel_size[1] > IM,
     376              :                 std::invalid_argument)
     377              :     << "Failed to initialize: Calculated patch end is over int max";
     378          146 : }
     379              : 
     380          480 : void Conv2DLayer::forwarding(RunLayerContext &context, bool training) {
     381              :   int status = ML_ERROR_NONE;
     382              : 
     383          480 :   unsigned int filter_size = std::get<props::FilterSize>(conv_props);
     384              :   auto &stride = std::get<std::array<props::Stride, CONV2D_DIM>>(conv_props);
     385              :   auto &dilation =
     386              :     std::get<std::array<props::Dilation, CONV2D_DIM>>(conv_props);
     387              : 
     388          480 :   Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
     389          480 :   Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX);
     390              : 
     391          480 :   Tensor &filter_kernel = context.getWeight(wt_idx[ConvParams::weight]);
     392              : 
     393              :   /** Calculate Convolution 2D
     394              :    *
     395              :    * This is the 2D Matrix Shape [ height ] x [ width ]
     396              :    *   . Height : filter_size
     397              :    *   . Width  : Input Channel * Kernel_size[0] * Kernel_size[1]
     398              :    *
     399              :    *                              imKernel
     400              :    *                        +------|------|------+
     401              :    *                        |------|------|------|
     402              :    * [filter_size (height)] |------|------|------|
     403              :    *                        |------|------|------|
     404              :    *                        +------|------|------+
     405              :    *                     [Input Channel * Kernel_size[0]
     406              :    *                       * Kernel_size[1] (width)]
     407              :    *
     408              :    *
     409              :    * After im2Col with channel_mode true (in : input)
     410              :    *
     411              :    * This is the 2D Matrix Shape [ height ] x [ width ]
     412              :    *   . Height : Input Channel * Kernel_size[0] * Kernel_size[1]
     413              :    *   . Width  : output_dim.height * output_dim.width
     414              :    *
     415              :    *                      +-|-|-|-|      |-|-|-|-+
     416              :    *   [Input Channel     | | | | |      | | | | |
     417              :    *   * Kernel_size[0]   |_|_|_|_|      |_|_|_|_|
      418              :  *  * Kernel_size[1]    | | | | | .... | | | | |
     419              :    *    (height)]         |_|_|_|_|      |_|_|_|_|
     420              :    *                      | | | | |      | | | | |
     421              :    *                      +_|_|_|_|      |_|_|_|_+
     422              :    *                     [ output_dim.height
     423              :    *                      * output_dim.width (width) ]
     424              :    *
      425              :    * Output Dimension
     426              :    *   -> [Channel ( = filter_size = output_dim.channel )]
     427              :    *       x [output_dim.height x output_dim.width]
     428              :    */
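  // Editorial worked example (not part of the original source): for an input
  // of shape (1, 3, 32, 32) with a 3x3 kernel, filter_size = 8, stride 1 and
  // padding 1, the flattened filter is (8, 3 * 3 * 3) = (8, 27), im2col yields
  // 32 * 32 = 1024 patches of length 27, and the matrix product gives
  // (8, 1024), i.e. the (8, 32, 32) output.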
     429          480 :   const TensorDim &in_dim = input_.getDim();
     430          480 :   const TensorDim &out_dim = hidden_.getDim();
     431          480 :   const TensorDim &filter_dim = filter_kernel.getDim();
     432          480 :   TensorDim filter_dim_squeezed{filter_kernel.batch(),
     433          480 :                                 filter_kernel.getDim().getFeatureLen()};
     434              : 
     435          480 :   filter_dim_squeezed.setTensorType(filter_kernel.getTensorType());
     436              : 
     437          480 :   filter_kernel.reshape(filter_dim_squeezed);
     438              : 
     439              :   /**
     440              :    * Below sets the pad area values to zero
      441              :    * it is faster to do it this way than setting only the selective area to zero
     442              :    */
     443          480 :   auto forwarding_job = [&](unsigned int s, unsigned int e, unsigned int pid,
     444              :                             void *user_data) {
     445          480 :     Tensor result = Tensor(calcCol2ImOutputDim(out_dim, filter_dim));
     446          480 :     result.setZero();
     447         3368 :     for (unsigned int b = s; b < e; ++b) {
     448         2888 :       Tensor out = hidden_.getBatchSlice(b, 1);
     449         2888 :       out.reshape({filter_size, out_dim.width() * out_dim.height()});
     450         2888 :       Tensor in_sub = input_.getBatchSlice(b, 1);
     451              : 
     452         2888 :       im2col(in_sub, filter_dim, padding, stride, dilation, result);
     453              :       // filter kernel is (K, CRS), result is (CRS, OH*OW)
     454         2888 :       filter_kernel.dot(result, out, false, true);
     455         2888 :     }
     456          480 :     result.deallocate();
     457          480 :   };
     458              : 
     459          960 :   auto workers = ParallelBatch(forwarding_job, in_dim.batch(), nullptr);
     460              : 
     461          480 :   if (workers.getNumWorkers() > 1) {
     462            0 :     workers.run();
     463              :   } else {
     464          480 :     forwarding_job(0, in_dim.batch(), 0, nullptr);
     465              :   }
     466              : 
     467          480 :   filter_kernel.reshape(filter_dim);
     468              :   if (auto &disable_bias = std::get<props::DisableBias>(*layer_impl_props);
     469          480 :       disable_bias.empty() || disable_bias.get() == false) {
     470          480 :     Tensor &bias_kernel = context.getWeight(wt_idx[ConvParams::bias]);
     471          480 :     status = hidden_.add_i(bias_kernel);
     472          480 :     if (status != ML_ERROR_NONE) {
     473            0 :       throw std::invalid_argument("[Conv2D] adding bias failed");
     474              :     }
     475              :   }
     476          480 : }
     477              : 
     478          205 : void Conv2DLayer::calcDerivative(RunLayerContext &context) {
     479          205 :   unsigned int filter_size = std::get<props::FilterSize>(conv_props);
     480              :   auto &stride = std::get<std::array<props::Stride, CONV2D_DIM>>(conv_props);
     481              :   auto &dilation =
     482              :     std::get<std::array<props::Dilation, CONV2D_DIM>>(conv_props);
     483              : 
     484          205 :   const Tensor &derivative = context.getIncomingDerivative(SINGLE_INOUT_IDX);
     485          205 :   Tensor &input_derivative = context.getOutgoingDerivative(SINGLE_INOUT_IDX);
     486          205 :   Tensor &filter_kernel = context.getWeight(wt_idx[ConvParams::weight]);
     487              : 
     488          205 :   TensorDim filter_dim = filter_kernel.getDim();
     489          205 :   TensorDim filter_dim_squeezed{filter_kernel.batch(),
     490          205 :                                 filter_kernel.getDim().getFeatureLen()};
     491              : 
     492          205 :   filter_kernel.reshape(filter_dim_squeezed);
     493              : 
     494              :   /// for each batch
     495              :   /// filter_kernel^T X derivaitive  -> column matrix
     496              :   /// col2im(column matrix) to reconstruct the original image
     497              : 
     498          205 :   auto compute_derivative = [&](unsigned int s, unsigned int e,
     499              :                                 unsigned int pid, void *user_data) {
     500              :     Tensor result =
     501          205 :       Tensor(calcCol2ImOutputDim(derivative.getDim(), filter_dim));
     502              : 
     503         1166 :     for (unsigned int b = s; b < e; ++b) {
     504          961 :       Tensor deriv_sub = derivative.getBatchSlice(b, 1);
     505          961 :       Tensor in_deriv_sub = input_derivative.getBatchSlice(b, 1);
     506          961 :       deriv_sub.reshape(
     507          961 :         {filter_size, derivative.width() * derivative.height()});
     508              :       // filter_kernel is (K, CRS), deriv_sub is (K, OH*OW), result is (CRS,
     509              :       // OH*OW)
     510          961 :       filter_kernel.dot(deriv_sub, result, true, false);
     511          961 :       col2im(result, filter_dim, padding, stride, dilation, in_deriv_sub);
      512              :       // in_deriv_sub is (C,H,W)
     513          961 :     }
     514          205 :     result.deallocate();
     515          205 :   };
     516              : 
     517          410 :   auto workers = ParallelBatch(compute_derivative, derivative.batch(), nullptr);
     518              : 
     519          205 :   if (workers.getNumWorkers() > 1) {
     520            0 :     workers.run();
     521              :   } else {
     522          205 :     compute_derivative(0, derivative.batch(), 0, nullptr);
     523              :   }
     524              : 
     525          205 :   filter_kernel.reshape(filter_dim);
     526          205 : }
     527              : 
     528          221 : void Conv2DLayer::calcGradient(RunLayerContext &context) {
     529          221 :   unsigned int filter_size = std::get<props::FilterSize>(conv_props);
     530              :   auto &stride = std::get<std::array<props::Stride, CONV2D_DIM>>(conv_props);
     531              :   auto &dilation =
     532              :     std::get<std::array<props::Dilation, CONV2D_DIM>>(conv_props);
     533              : 
     534          221 :   const Tensor &derivative = context.getIncomingDerivative(SINGLE_INOUT_IDX);
     535          221 :   Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
     536              : 
     537          221 :   Tensor &delK = context.getWeightGrad(wt_idx[ConvParams::weight]);
     538          221 :   delK.setZero();
     539              : 
     540          221 :   TensorDim filter_dim = delK.getDim();
     541          221 :   TensorDim filter_dim_squeezed{filter_dim.batch(), filter_dim.getFeatureLen()};
     542              : 
     543          221 :   delK.reshape(filter_dim_squeezed);
     544              : 
     545              :   /**
     546              :    * no need to set zero for im2col_result, as its lifespan is ITERATION,
     547              :    * so its zero padded values will still be zero
     548              :    */
     549              : 
     550          221 :   TensorDim out_dim_squeezed{filter_size,
     551          221 :                              derivative.width() * derivative.height()};
     552          221 :   auto workers = ParallelBatch(input_.batch());
     553              :   /// input -(im2col)-> column_matrix -> filter x (column_matrix) = output
     554              :   /// so delK = dy x column_matrix ^ T;
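  // Editorial worked example (not part of the original source): with the same
  // assumed shapes (filter_size = 8, 3x3 kernel, 3 channels, 32x32 output),
  // dy for one sample is (8, 1024) and the im2col column matrix is laid out as
  // (1024, 27), so dy x column_matrix accumulates an (8, 27) gradient that is
  // reshaped back to the (8, 3, 3, 3) filter gradient.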
     555          221 :   if (workers.getNumWorkers() > 1) {
     556              : 
     557            0 :     TensorDim delK_ext = filter_dim_squeezed;
     558            0 :     delK_ext.batch(input_.batch());
     559              : 
     560            0 :     Tensor delK_par = Tensor(delK_ext);
     561            0 :     delK_par.setZero();
     562              : 
     563            0 :     auto calc_grad_job = [&](unsigned int s, unsigned int e, unsigned int pid,
     564              :                              void *user_data) {
     565              :       Tensor result =
     566            0 :         Tensor(calcCol2ImOutputDim(derivative.getDim(), filter_dim));
     567            0 :       result.setZero();
     568            0 :       for (unsigned int b = s; b < e; ++b) {
     569            0 :         Tensor deriv_sub = derivative.getBatchSlice(b, 1);
     570            0 :         Tensor delK_sub = delK_par.getBatchSlice(b, 1);
     571            0 :         deriv_sub.reshape(out_dim_squeezed);
     572              : 
     573            0 :         Tensor in_sub = input_.getBatchSlice(b, 1);
     574              : 
     575              :         /**
     576              :          * @todo this result can be cached from the forward iteration at the
     577              :          * expense of memory. In this case, memory of im2col_result must be
     578              :          * saved for the whole batch. try this while benchmarking.
     579              :          */
     580              :         // deriv_sub is (K, OH*OW) and result is (CRS, OH*OW)
     581            0 :         im2col(in_sub, filter_dim, padding, stride, dilation, result);
     582            0 :         deriv_sub.dot(result, delK_sub, false, false);
     583            0 :       }
     584            0 :       result.deallocate();
     585            0 :     };
     586              : 
     587            0 :     workers.setCallback(calc_grad_job, nullptr);
     588              : 
     589            0 :     workers.run();
     590              : 
     591            0 :     for (unsigned int b = 0; b < input_.batch(); ++b) {
     592            0 :       Tensor delK_sub = delK_par.getBatchSlice(b, 1);
     593            0 :       delK.add_i(delK_sub);
     594            0 :     }
     595              : 
     596            0 :   } else {
     597              :     Tensor result =
     598          221 :       Tensor(calcCol2ImOutputDim(derivative.getDim(), filter_dim));
     599          221 :     result.setZero();
     600              : 
     601         1663 :     for (unsigned int b = 0; b < input_.batch(); ++b) {
     602         1442 :       Tensor deriv_sub = derivative.getBatchSlice(b, 1);
     603         1442 :       deriv_sub.reshape(out_dim_squeezed);
     604              : 
     605         1442 :       Tensor in_sub = input_.getBatchSlice(b, 1);
     606              : 
     607              :       /**
     608              :        * @todo this result can be cached from the forward iteration at the
     609              :        * expense of memory. In this case, memory of im2col_result must be saved
     610              :        * for the whole batch. try this while benchmarking.
     611              :        */
     612         1442 :       im2col(in_sub, filter_dim, padding, stride, dilation, result);
     613         2663 :       deriv_sub.dot(result, delK, false, false, b == 0 ? 0.0f : 1.0f);
     614         1442 :     }
     615          221 :     result.deallocate();
     616          221 :   }
     617          221 :   delK.reshape(filter_dim);
     618              :   if (auto &disable_bias = std::get<props::DisableBias>(*layer_impl_props);
     619          221 :       disable_bias.empty() || disable_bias.get() == false) {
     620          221 :     Tensor &delBias = context.getWeightGrad(wt_idx[ConvParams::bias]);
     621          221 :     delBias.setZero();
     622          221 :     derivative.sum({0, 2, 3}, delBias);
     623              :   }
     624          221 : }
     625              : 
     626           42 : void Conv2DLayer::exportTo(Exporter &exporter,
     627              :                            const ml::train::ExportMethods &method) const {
     628           42 :   LayerImpl::exportTo(exporter, method);
     629           42 :   exporter.saveResult(conv_props, method, this);
     630           42 : }
     631              : 
     632          811 : void Conv2DLayer::setProperty(const std::vector<std::string> &values) {
     633          811 :   auto remain_props = loadProperties(values, conv_props);
     634          809 :   LayerImpl::setProperty(remain_props);
     635          809 : }
     636              : 
     637              : } /* namespace nntrainer */
        

Generated by: LCOV version 2.0-1