Line data Source code
1 : // SPDX-License-Identifier: Apache-2.0
2 : /**
3 : * Copyright (C) 2020 Jijoong Moon <jijoong.moon@samsung.com>
4 : *
5 : * @file conv2d_layer.h
6 : * @date 02 June 2020
7 : * @see https://github.com/nnstreamer/nntrainer
8 : * @author Jijoong Moon <jijoong.moon@samsung.com>
9 : * @author Jihoon Lee <jhoon.it.lee@samsung.com>
10 : * @bug No known bugs except for NYI items
11 : * @brief This is Convolution Layer Class for Neural Network
12 : *
13 : */
14 : #include <algorithm>
15 : #include <cstring>
16 : #include <limits>
17 : #include <string>
18 :
19 : #include <conv2d_layer.h>
20 : #include <cpu_backend.h>
21 : #include <layer_context.h>
22 : #include <lazy_tensor.h>
23 : #include <nntr_threads.h>
24 : #include <nntrainer_error.h>
25 : #include <nntrainer_log.h>
26 : #include <node_exporter.h>
27 : #include <profiler.h>
28 : #include <tensor_dim.h>
29 : #include <thread>
30 : #include <util_func.h>
31 :
32 : namespace nntrainer {
33 :
34 : static constexpr size_t SINGLE_INOUT_IDX = 0;
35 :
36 : namespace {
37 :
38 906 : static TensorDim calcCol2ImOutputDim(const TensorDim &out,
39 : const TensorDim &kdim) {
40 :
41 1812 : return TensorDim({kdim.getFeatureLen(), out.width() * out.height()},
42 1812 : out.getTensorType());
43 : }
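// Shape note (illustrative; the example numbers are hypothetical): the
// dimension returned above is {C * K_h * K_w, OH * OW}, since
// kdim.getFeatureLen() is the kernel's channel * height * width. For example,
// a 3x3 kernel over 4 input channels with an output map of height 5 and
// width 7 yields {4 * 3 * 3, 7 * 5} = {36, 35}.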
44 :
45 : /**
46 : * @brief reconstruct image data from 2d column matrix
47 : *
48 : * @param[in] col_matrix input 2d column matrix
49 : * @param[in] kdim kernel dimension defining the number of rows
50 : * @param[in] padding padding information
51 : * @param[in] mstride stride value : x, y direction
52 : * @param[in] dilation kernel dilation factor : x, y each
53 : * @param[out] image image tensor to put the result in
54 : */
55 961 : static void col2im(const Tensor &col_matrix, const TensorDim &kdim,
56 : const std::array<unsigned, 4> &padding,
57 : const std::array<props::Stride, CONV2D_DIM> &mstride,
58 : const std::array<props::Dilation, CONV2D_DIM> &dilation,
59 : Tensor &image) {
60 :
61 961 : auto pt = padding[0];
62 961 : auto pb = padding[1];
63 961 : auto pl = padding[2];
64 961 : auto pr = padding[3];
65 :
66 961 : unsigned k_height = kdim.height();
67 961 : unsigned k_width = kdim.width();
68 :
69 : /// effective kernel height considering dilation
70 961 : unsigned eff_k_height = (k_height - 1) * dilation[0] + 1;
71 : /// effective kernel width considering dilation
72 961 : unsigned eff_k_width = (k_width - 1) * dilation[1] + 1;
73 :
74 961 : unsigned im_channel = image.channel();
75 961 : int im_height = image.height();
76 961 : int im_width = image.width();
77 :
78 961 : unsigned hstride = mstride[0];
79 961 : unsigned wstride = mstride[1];
80 :
81 961 : unsigned hdilation = dilation[0];
82 961 : unsigned wdilation = dilation[1];
83 :
84 : /// image considering padding
85 961 : unsigned im_eff_height = im_height + pt + pb;
86 961 : unsigned im_eff_width = im_width + pl + pr;
87 961 : image.setZero();
88 :
89 961 : int h_stride_end = im_eff_height - eff_k_height - pt;
90 961 : int w_stride_end = im_eff_width - eff_k_width - pl;
91 :
92 : /** @todo We need to implement a way to make this kind of function work
93 : * inside Tensor. Then we could remove the getData / getValue accesses,
94 : * which depend on the data type.
95 : */
96 961 : auto apply_data = [&]<typename T>(T *val) {
97 : unsigned col_w = 0;
98 6094 : for (int hs = -(int)pt; hs <= h_stride_end; hs += hstride) {
99 40918 : for (int ws = -(int)pl; ws <= w_stride_end; ws += wstride) {
100 : unsigned col_h = 0;
101 35785 : int patch_height_end = hs + eff_k_height;
102 35785 : int patch_width_end = ws + eff_k_width;
103 278217 : for (unsigned c = 0; c < im_channel; c++) {
104 1330534 : for (int h = hs; h < patch_height_end; h += hdilation) {
105 1088102 : if (h < 0 || im_height <= h) {
106 5876 : col_h += k_width;
107 5876 : continue;
108 : }
109 6165092 : for (int w = ws; w < patch_width_end; w += wdilation) {
110 5082866 : if (w < 0 || im_width <= w) {
111 17124 : col_h++;
112 17124 : continue;
113 : }
114 :
115 5065742 : val = image.getAddress<T>(0, c, h, w);
116 5065742 : *val += col_matrix.getValue<T>(0, 0, col_h, col_w);
117 5065742 : col_h++;
118 : }
119 : }
120 : }
121 35785 : col_w++;
122 : }
123 : }
124 1922 : };
125 :
126 961 : if (image.getDataType() == nntrainer::Tdatatype::FP32) {
127 : float val;
128 961 : apply_data(&val);
129 : }
130 : #ifdef ENABLE_FP16
131 : else if (image.getDataType() == nntrainer::Tdatatype::FP16) {
132 : _FP16 val;
133 : apply_data(&val);
134 : }
135 : #endif
136 : else {
137 0 : throw std::runtime_error("Not supported datatype");
138 : }
139 961 : }
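/*
 * Behaviour sketch for col2im above (the concrete numbers are hypothetical,
 * for illustration only): the input is a (C * K_h * K_w) x (OH * OW) column
 * matrix with one column per output position; each column is scattered back
 * into the C x H x W image at the window that produced it, and overlapping
 * contributions are accumulated, which is why image.setZero() is called
 * first. With C = 2, a 3x3 kernel, stride 1 and a 2x2 output map, the column
 * matrix is 18 x 4 and each interior pixel of a 4x4 image receives up to 4
 * accumulated values.
 */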
140 :
141 : /**
142 : * @brief reform the data to a 2d matrix
143 : * a region of unit @a kdim is sampled considering @a padding and @a mstride.
144 : * Each region is mapped to one row of @a out;
145 : * in channel mode, the kernel channel is considered part of the kernel
146 : * feature, otherwise it is considered part of the output dimension
147 : *
148 : * @param[in] in input data
149 : * @param[in] kdim kernel dimension defining the sampled patch size
150 : * @param[in] padding padding information
151 : * @param[in] mstride stride value : x, y direction
152 : * @param[in] dilation kernel dilation factor : x, y each
153 : * @param[out] out output tensor; padding is set each time for now
154 : * @note if out is an initialized tensor, setting the padding is skipped.
155 : */
156 4330 : static void im2col(const Tensor &in, const TensorDim &kdim,
157 : const std::array<unsigned int, 4> &padding,
158 : const std::array<props::Stride, CONV2D_DIM> &mstride,
159 : const std::array<props::Dilation, CONV2D_DIM> &dilation,
160 : Tensor &out) {
161 : /// for channel-last mode; this is deprecated for now and left here on
162 : /// purpose.
163 : /** @code
164 : // ================ initialize part ====================
165 : // out_height -= 2;
166 : // out =
167 : // Tensor(k_height * k_width, in.channel() * (out_height) *
168 : // (out_width));
169 : // unsigned int im_w = 0;
170 : // ================ loop part ====================
171 : // if (eff_k_height > height || eff_k_width > width)
172 : // throw std::runtime_error("Kernel shape bigger than input shape");
173 :
174 : // for (unsigned int c = 0; c < channel; ++c) {
175 : // for (unsigned int hs = 0; hs <= height - eff_k_height; hs +=
176 : // mstride[0]) {
177 : // for (unsigned int ws = 0; ws <= width - eff_k_width; ws +=
178 : // mstride[1]) {
179 : // unsigned int im_h = 0;
180 : // unsigned int patch_height_end = eff_k_height + hs;
181 : // unsigned int patch_width_end = eff_k_width + ws;
182 :
183 : // for (unsigned int h = hs; h < patch_height_end; h += dilation[0]) {
184 : // if (h < ph || in_height + ph <= h) {
185 : // im_h += k_width;
186 : // continue;
187 : // }
188 :
189 : // for (unsigned int w = ws; w < patch_width_end; w += dilation[1])
190 : // {
191 : // if (w < pw || in_width + pw <= w) {
192 : // im_h++;
193 : // continue;
194 : // }
195 :
196 : // float val = in.getValue(0, c, h - ph, w - pw);
197 : // out.setValue(0, 0, im_h, im_w, val);
198 : // im_h++;
199 : // }
200 : // }
201 : // im_w++;
202 : // }
203 : // }
204 : // }
205 : */
206 :
207 4330 : auto pt = padding[0];
208 4330 : auto pb = padding[1];
209 4330 : auto pl = padding[2];
210 4330 : auto pr = padding[3];
211 :
212 4330 : unsigned int channel = in.channel();
213 4330 : int in_height = in.height();
214 4330 : int in_width = in.width();
215 4330 : unsigned int height = in_height + pt + pb;
216 4330 : unsigned int width = in_width + pl + pr;
217 4330 : unsigned int k_height = kdim.height();
218 4330 : unsigned int k_width = kdim.width();
219 :
220 : /// effective kernel height considering dilation
221 4330 : unsigned int eff_k_height = (k_height - 1) * dilation[0] + 1;
222 : /// effective kernel width considering dilation
223 4330 : unsigned int eff_k_width = (k_width - 1) * dilation[1] + 1;
224 :
225 4330 : unsigned int out_height = (height - eff_k_height) / mstride[0] + 1;
226 4330 : unsigned int out_width = (width - eff_k_width) / mstride[1] + 1;
227 :
228 4330 : out.reshape(
229 4330 : TensorDim({out_height * out_width, in.channel() * k_height * k_width},
230 : in.getTensorType()));
231 : // float *out_data = out.getData();
232 :
233 4330 : auto apply_data = [&]<typename T>(T *out_data) {
234 4330 : int h_stride_end = height - eff_k_height - pt;
235 4330 : int w_stride_end = width - eff_k_width - pl;
236 :
237 : /// get a patch, size of kernel
238 : /// hs is height_strided, ws is width_strided
239 4330 : unsigned int owidth = out.width();
240 : unsigned int base_im_w = 0;
241 54395 : for (int hs = -(int)pt; hs <= h_stride_end; hs += mstride[0]) {
242 : unsigned int base_im_h = 0;
243 50065 : int patch_height_end = eff_k_height + hs;
244 : /// map the patch to a single line looping through channel
245 : // We need to optimize this padding & copy. Maybe use multiple threads or
246 : // SIMD.
247 173239 : for (unsigned int c = 0; c < channel; ++c) {
248 694664 : for (int h = hs; h < patch_height_end; h += dilation[0]) {
249 571490 : if (h < 0 || in_height <= h) {
250 2682 : base_im_h += k_width;
251 2682 : continue;
252 : }
253 :
254 : unsigned int im_w = base_im_w;
255 8010544 : for (int ws = -(int)pl; ws <= w_stride_end; ws += mstride[1]) {
256 : unsigned int im_h = base_im_h;
257 7441736 : int patch_width_end = eff_k_width + ws;
258 :
259 43589352 : for (int w = ws; w < patch_width_end; w += dilation[1]) {
260 36147616 : if (w < 0 || in_width <= w) {
261 57852 : im_h++;
262 57852 : continue;
263 : }
264 36089764 : out_data[im_w * owidth + im_h] = in.getValue<T>(0, c, h, w);
265 36089764 : im_h++;
266 : }
267 7441736 : im_w++;
268 : }
269 568808 : base_im_h += k_width;
270 : }
271 : }
272 50065 : base_im_w += out_width;
273 : }
274 8660 : };
275 :
276 4330 : if (out.getDataType() == nntrainer::Tdatatype::FP32) {
277 : float *out_data = out.getData<float>();
278 4330 : apply_data(out_data);
279 : }
280 : #ifdef ENABLE_FP16
281 : else if (out.getDataType() == nntrainer::Tdatatype::FP16) {
282 : _FP16 *out_data = out.getData<_FP16>();
283 : apply_data(out_data);
284 : }
285 : #endif
286 : else {
287 0 : throw std::runtime_error("Not supported datatype");
288 : }
289 4330 : }
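/*
 * Shape sketch for im2col above (the concrete numbers are hypothetical, for
 * illustration only): with in = (1, C, H, W) = (1, 2, 4, 4), a 3x3 kernel,
 * stride 1, dilation 1 and no padding,
 *   out_height = (4 - 3) / 1 + 1 = 2,  out_width = (4 - 3) / 1 + 1 = 2,
 * so `out` is reshaped to {out_height * out_width, C * k_height * k_width}
 * = {4, 18}: one row per output position, one column per kernel element.
 * The forward path then multiplies filter (K, CRS) by out transposed to
 * obtain (K, OH*OW).
 */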
290 : } // namespace
291 :
292 : enum ConvParams { weight, bias };
293 :
294 192 : Conv2DLayer::Conv2DLayer(
295 192 : const std::array<unsigned int, CONV2D_DIM * 2> &padding_) :
296 : LayerImpl(),
297 192 : padding(padding_),
298 576 : conv_props(props::FilterSize(), std::array<props::KernelSize, CONV2D_DIM>(),
299 768 : std::array<props::Stride, CONV2D_DIM>(), props::Padding2D(),
300 384 : std::array<props::Dilation, CONV2D_DIM>()) {
301 : wt_idx.fill(std::numeric_limits<unsigned>::max());
302 192 : }
303 :
304 149 : void Conv2DLayer::finalize(InitLayerContext &context) {
305 149 : NNTR_THROW_IF(context.getNumInputs() != 1, std::invalid_argument)
306 : << "Convolution layer takes only one input";
307 :
308 : const TensorDim &in_dim = context.getInputDimensions()[0];
309 :
310 : auto &weight_regularizer =
311 : std::get<props::WeightRegularizer>(*layer_impl_props);
312 : auto &weight_regularizer_constant =
313 : std::get<props::WeightRegularizerConstant>(*layer_impl_props);
314 : auto &weight_initializer =
315 : std::get<props::WeightInitializer>(*layer_impl_props);
316 : auto &weight_decay = std::get<props::WeightDecay>(*layer_impl_props);
317 : auto &bias_decay = std::get<props::BiasDecay>(*layer_impl_props);
318 : auto &bias_initializer = std::get<props::BiasInitializer>(*layer_impl_props);
319 : auto &disable_bias = std::get<props::DisableBias>(*layer_impl_props);
320 :
321 149 : unsigned int filter_size = std::get<props::FilterSize>(conv_props);
322 : auto &kernel_size =
323 : std::get<std::array<props::KernelSize, CONV2D_DIM>>(conv_props);
324 : auto &stride = std::get<std::array<props::Stride, CONV2D_DIM>>(conv_props);
325 : auto &dilation =
326 : std::get<std::array<props::Dilation, CONV2D_DIM>>(conv_props);
327 :
328 : auto in_t_type = in_dim.getTensorType();
329 149 : in_t_type.data_type = context.getWeightDataType();
330 :
331 : TensorDim kernel_dim = TensorDim(filter_size, in_dim.channel(),
332 149 : kernel_size[0], kernel_size[1], in_t_type);
333 :
334 149 : TensorDim bias_dim = TensorDim(1, filter_size, 1, 1, in_t_type);
335 :
336 : padding = std::get<props::Padding2D>(conv_props)
337 149 : .compute(in_dim, kernel_dim, {stride[0], stride[1]},
338 : {dilation[0], dilation[1]});
339 :
340 149 : wt_idx[ConvParams::weight] = context.requestWeight(
341 : kernel_dim, weight_initializer, weight_regularizer,
342 : weight_regularizer_constant, weight_decay, "filter", true, 0);
343 :
344 149 : if (disable_bias.empty() || disable_bias.get() == false) {
345 149 : wt_idx[ConvParams::bias] =
346 298 : context.requestWeight(bias_dim, bias_initializer, WeightRegularizer::NONE,
347 : 1.0f, bias_decay, "bias", true, 0);
348 : }
349 :
350 : // this output_dim must be the same as the dimension of hidden
351 149 : unsigned int eff_in_height = in_dim.height() + padding[0] + padding[1];
352 149 : unsigned int eff_in_width = in_dim.width() + padding[2] + padding[3];
353 :
354 149 : unsigned int eff_k_height = (kernel_size[0] - 1) * dilation[0] + 1;
355 149 : unsigned int eff_k_width = (kernel_size[1] - 1) * dilation[1] + 1;
356 :
357 149 : TensorDim out_dim;
358 149 : out_dim.batch(in_dim.batch());
359 149 : out_dim.channel(filter_size);
360 149 : out_dim.height((eff_in_height - eff_k_height) / stride[0] + 1);
361 149 : out_dim.width((eff_in_width - eff_k_width) / stride[1] + 1);
362 :
363 : out_dim.setTensorType(in_dim.getTensorType());
364 :
365 149 : context.setOutputDimensions({out_dim});
366 :
367 298 : NNTR_THROW_IF(eff_in_height < kernel_size[0] || eff_in_width < kernel_size[1],
368 : std::invalid_argument)
369 : << "Failed to initialize: in size + padding is smaller than effective "
370 : "kernel";
371 :
372 : unsigned int IM = std::numeric_limits<int>::max();
373 :
374 146 : NNTR_THROW_IF(eff_in_height - padding[0] - kernel_size[0] > IM ||
375 : eff_in_width - padding[2] - kernel_size[1] > IM,
376 : std::invalid_argument)
377 : << "Failed to initialize: Calculated patch end is over int max";
378 146 : }
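/*
 * Worked example of the output-size arithmetic in finalize() above (the
 * numbers are hypothetical, for illustration only): for a 28x28 input, 3x3
 * kernel, stride 2, dilation 1 and padding {1, 1, 1, 1},
 *   eff_k_height = (3 - 1) * 1 + 1 = 3
 *   out_height   = (28 + 1 + 1 - 3) / 2 + 1 = 14
 *   out_width    = (28 + 1 + 1 - 3) / 2 + 1 = 14
 * so the output dimension becomes (batch, filter_size, 14, 14).
 */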
379 :
380 480 : void Conv2DLayer::forwarding(RunLayerContext &context, bool training) {
381 : int status = ML_ERROR_NONE;
382 :
383 480 : unsigned int filter_size = std::get<props::FilterSize>(conv_props);
384 : auto &stride = std::get<std::array<props::Stride, CONV2D_DIM>>(conv_props);
385 : auto &dilation =
386 : std::get<std::array<props::Dilation, CONV2D_DIM>>(conv_props);
387 :
388 480 : Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
389 480 : Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX);
390 :
391 480 : Tensor &filter_kernel = context.getWeight(wt_idx[ConvParams::weight]);
392 :
393 : /** Calculate Convolution 2D
394 : *
395 : * This is the 2D Matrix Shape [ height ] x [ width ]
396 : * . Height : filter_size
397 : * . Width : Input Channel * Kernel_size[0] * Kernel_size[1]
398 : *
399 : * imKernel
400 : * +------|------|------+
401 : * |------|------|------|
402 : * [filter_size (height)] |------|------|------|
403 : * |------|------|------|
404 : * +------|------|------+
405 : * [Input Channel * Kernel_size[0]
406 : * * Kernel_size[1] (width)]
407 : *
408 : *
409 : * After im2Col with channel_mode true (in : input)
410 : *
411 : * This is the 2D Matrix Shape [ height ] x [ width ]
412 : * . Height : Input Channel * Kernel_size[0] * Kernel_size[1]
413 : * . Width : output_dim.height * output_dim.width
414 : *
415 : * +-|-|-|-| |-|-|-|-+
416 : * [Input Channel | | | | | | | | | |
417 : * * Kernel_size[0] |_|_|_|_| |_|_|_|_|
418 : * * Kernel_size[1] | | | | | .... | | | | |
419 : * (height)] |_|_|_|_| |_|_|_|_|
420 : * | | | | | | | | | |
421 : * +_|_|_|_| |_|_|_|_+
422 : * [ output_dim.height
423 : * * output_dim.width (width) ]
424 : *
425 : * Output Dimension
426 : * -> [Channel ( = filter_size = output_dim.channel )]
427 : * x [output_dim.height x output_dim.width]
428 : */
429 480 : const TensorDim &in_dim = input_.getDim();
430 480 : const TensorDim &out_dim = hidden_.getDim();
431 480 : const TensorDim &filter_dim = filter_kernel.getDim();
432 480 : TensorDim filter_dim_squeezed{filter_kernel.batch(),
433 480 : filter_kernel.getDim().getFeatureLen()};
434 :
435 480 : filter_dim_squeezed.setTensorType(filter_kernel.getTensorType());
436 :
437 480 : filter_kernel.reshape(filter_dim_squeezed);
438 :
439 : /**
440 : * Below sets the pad area values to zero
441 : * it is faster to do it this way than setting the selective area to zero
442 : */
443 480 : auto forwarding_job = [&](unsigned int s, unsigned int e, unsigned int pid,
444 : void *user_data) {
445 480 : Tensor result = Tensor(calcCol2ImOutputDim(out_dim, filter_dim));
446 480 : result.setZero();
447 3368 : for (unsigned int b = s; b < e; ++b) {
448 2888 : Tensor out = hidden_.getBatchSlice(b, 1);
449 2888 : out.reshape({filter_size, out_dim.width() * out_dim.height()});
450 2888 : Tensor in_sub = input_.getBatchSlice(b, 1);
451 :
452 2888 : im2col(in_sub, filter_dim, padding, stride, dilation, result);
453 : // filter kernel is (K, CRS), result is (OH*OW, CRS); dot uses result transposed
454 2888 : filter_kernel.dot(result, out, false, true);
455 2888 : }
456 480 : result.deallocate();
457 480 : };
458 :
459 960 : auto workers = ParallelBatch(forwarding_job, in_dim.batch(), nullptr);
460 :
461 480 : if (workers.getNumWorkers() > 1) {
462 0 : workers.run();
463 : } else {
464 480 : forwarding_job(0, in_dim.batch(), 0, nullptr);
465 : }
466 :
467 480 : filter_kernel.reshape(filter_dim);
468 : if (auto &disable_bias = std::get<props::DisableBias>(*layer_impl_props);
469 480 : disable_bias.empty() || disable_bias.get() == false) {
470 480 : Tensor &bias_kernel = context.getWeight(wt_idx[ConvParams::bias]);
471 480 : status = hidden_.add_i(bias_kernel);
472 480 : if (status != ML_ERROR_NONE) {
473 0 : throw std::invalid_argument("[Conv2D] adding bias failed");
474 : }
475 : }
476 480 : }
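/*
 * Forward-path shape summary, restating the code above per batch slice:
 *   im2col(in_sub, ...)          -> result : (OH*OW, C*K_h*K_w)
 *   filter_kernel (squeezed)               : (K, C*K_h*K_w)
 *   filter_kernel.dot(result, out, false, true)   // result used transposed
 *                                -> out    : (K, OH*OW),
 * where out is a reshaped batch slice of hidden_, and a broadcast bias add
 * follows when bias is enabled.
 */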
477 :
478 205 : void Conv2DLayer::calcDerivative(RunLayerContext &context) {
479 205 : unsigned int filter_size = std::get<props::FilterSize>(conv_props);
480 : auto &stride = std::get<std::array<props::Stride, CONV2D_DIM>>(conv_props);
481 : auto &dilation =
482 : std::get<std::array<props::Dilation, CONV2D_DIM>>(conv_props);
483 :
484 205 : const Tensor &derivative = context.getIncomingDerivative(SINGLE_INOUT_IDX);
485 205 : Tensor &input_derivative = context.getOutgoingDerivative(SINGLE_INOUT_IDX);
486 205 : Tensor &filter_kernel = context.getWeight(wt_idx[ConvParams::weight]);
487 :
488 205 : TensorDim filter_dim = filter_kernel.getDim();
489 205 : TensorDim filter_dim_squeezed{filter_kernel.batch(),
490 205 : filter_kernel.getDim().getFeatureLen()};
491 :
492 205 : filter_kernel.reshape(filter_dim_squeezed);
493 :
494 : /// for each batch
495 : /// filter_kernel^T X derivative -> column matrix
496 : /// col2im(column matrix) to reconstruct the original image
497 :
498 205 : auto compute_derivative = [&](unsigned int s, unsigned int e,
499 : unsigned int pid, void *user_data) {
500 : Tensor result =
501 205 : Tensor(calcCol2ImOutputDim(derivative.getDim(), filter_dim));
502 :
503 1166 : for (unsigned int b = s; b < e; ++b) {
504 961 : Tensor deriv_sub = derivative.getBatchSlice(b, 1);
505 961 : Tensor in_deriv_sub = input_derivative.getBatchSlice(b, 1);
506 961 : deriv_sub.reshape(
507 961 : {filter_size, derivative.width() * derivative.height()});
508 : // filter_kernel is (K, CRS), deriv_sub is (K, OH*OW), result is (CRS,
509 : // OH*OW)
510 961 : filter_kernel.dot(deriv_sub, result, true, false);
511 961 : col2im(result, filter_dim, padding, stride, dilation, in_deriv_sub);
512 : // in_deriv_sub is (C, H, W)
513 961 : }
514 205 : result.deallocate();
515 205 : };
516 :
517 410 : auto workers = ParallelBatch(compute_derivative, derivative.batch(), nullptr);
518 :
519 205 : if (workers.getNumWorkers() > 1) {
520 0 : workers.run();
521 : } else {
522 205 : compute_derivative(0, derivative.batch(), 0, nullptr);
523 : }
524 :
525 205 : filter_kernel.reshape(filter_dim);
526 205 : }
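/*
 * Backward (input-derivative) shape summary, restating the code above per
 * batch slice:
 *   deriv_sub (reshaped)                   : (K, OH*OW)
 *   filter_kernel (squeezed)               : (K, C*K_h*K_w)
 *   filter_kernel.dot(deriv_sub, result, true, false)
 *                                -> result : (C*K_h*K_w, OH*OW)
 *   col2im(result, ...)    -> in_deriv_sub : (C, H, W),
 * with overlapping window contributions accumulated by col2im.
 */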
527 :
528 221 : void Conv2DLayer::calcGradient(RunLayerContext &context) {
529 221 : unsigned int filter_size = std::get<props::FilterSize>(conv_props);
530 : auto &stride = std::get<std::array<props::Stride, CONV2D_DIM>>(conv_props);
531 : auto &dilation =
532 : std::get<std::array<props::Dilation, CONV2D_DIM>>(conv_props);
533 :
534 221 : const Tensor &derivative = context.getIncomingDerivative(SINGLE_INOUT_IDX);
535 221 : Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
536 :
537 221 : Tensor &delK = context.getWeightGrad(wt_idx[ConvParams::weight]);
538 221 : delK.setZero();
539 :
540 221 : TensorDim filter_dim = delK.getDim();
541 221 : TensorDim filter_dim_squeezed{filter_dim.batch(), filter_dim.getFeatureLen()};
542 :
543 221 : delK.reshape(filter_dim_squeezed);
544 :
545 : /**
546 : * no need to set zero for im2col_result, as its lifespan is ITERATION,
547 : * so its zero padded values will still be zero
548 : */
549 :
550 221 : TensorDim out_dim_squeezed{filter_size,
551 221 : derivative.width() * derivative.height()};
552 221 : auto workers = ParallelBatch(input_.batch());
553 : /// input -(im2col)-> column_matrix -> filter x (column_matrix) = output
554 : /// so delK = dy x column_matrix ^ T;
555 221 : if (workers.getNumWorkers() > 1) {
556 :
557 0 : TensorDim delK_ext = filter_dim_squeezed;
558 0 : delK_ext.batch(input_.batch());
559 :
560 0 : Tensor delK_par = Tensor(delK_ext);
561 0 : delK_par.setZero();
562 :
563 0 : auto calc_grad_job = [&](unsigned int s, unsigned int e, unsigned int pid,
564 : void *user_data) {
565 : Tensor result =
566 0 : Tensor(calcCol2ImOutputDim(derivative.getDim(), filter_dim));
567 0 : result.setZero();
568 0 : for (unsigned int b = s; b < e; ++b) {
569 0 : Tensor deriv_sub = derivative.getBatchSlice(b, 1);
570 0 : Tensor delK_sub = delK_par.getBatchSlice(b, 1);
571 0 : deriv_sub.reshape(out_dim_squeezed);
572 :
573 0 : Tensor in_sub = input_.getBatchSlice(b, 1);
574 :
575 : /**
576 : * @todo this result can be cached from the forward iteration at the
577 : * expense of memory. In this case, memory of im2col_result must be
578 : * saved for the whole batch. Try this while benchmarking.
579 : */
580 : // deriv_sub is (K, OH*OW) and result is (OH*OW, CRS)
581 0 : im2col(in_sub, filter_dim, padding, stride, dilation, result);
582 0 : deriv_sub.dot(result, delK_sub, false, false);
583 0 : }
584 0 : result.deallocate();
585 0 : };
586 :
587 0 : workers.setCallback(calc_grad_job, nullptr);
588 :
589 0 : workers.run();
590 :
591 0 : for (unsigned int b = 0; b < input_.batch(); ++b) {
592 0 : Tensor delK_sub = delK_par.getBatchSlice(b, 1);
593 0 : delK.add_i(delK_sub);
594 0 : }
595 :
596 0 : } else {
597 : Tensor result =
598 221 : Tensor(calcCol2ImOutputDim(derivative.getDim(), filter_dim));
599 221 : result.setZero();
600 :
601 1663 : for (unsigned int b = 0; b < input_.batch(); ++b) {
602 1442 : Tensor deriv_sub = derivative.getBatchSlice(b, 1);
603 1442 : deriv_sub.reshape(out_dim_squeezed);
604 :
605 1442 : Tensor in_sub = input_.getBatchSlice(b, 1);
606 :
607 : /**
608 : * @todo this result can be cached from the forward iteration at the
609 : * expense of memory. In this case, memory of im2col_result must be saved
610 : * for the whole batch. Try this while benchmarking.
611 : */
612 1442 : im2col(in_sub, filter_dim, padding, stride, dilation, result);
613 2663 : deriv_sub.dot(result, delK, false, false, b == 0 ? 0.0f : 1.0f);
614 1442 : }
615 221 : result.deallocate();
616 221 : }
617 221 : delK.reshape(filter_dim);
618 : if (auto &disable_bias = std::get<props::DisableBias>(*layer_impl_props);
619 221 : disable_bias.empty() || disable_bias.get() == false) {
620 221 : Tensor &delBias = context.getWeightGrad(wt_idx[ConvParams::bias]);
621 221 : delBias.setZero();
622 221 : derivative.sum({0, 2, 3}, delBias);
623 : }
624 221 : }
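/*
 * Gradient shape summary, restating the code above:
 *   im2col(in_sub, ...)          -> result : (OH*OW, C*K_h*K_w)
 *   deriv_sub (reshaped)                   : (K, OH*OW)
 *   deriv_sub.dot(result, delK, false, false, beta)
 *                                -> delK   : (K, C*K_h*K_w),
 * accumulated over the batch (beta is 0.0f for the first slice, 1.0f after),
 * while delBias = derivative.sum({0, 2, 3}) sums the incoming derivative over
 * batch, height and width for each filter.
 */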
625 :
626 42 : void Conv2DLayer::exportTo(Exporter &exporter,
627 : const ml::train::ExportMethods &method) const {
628 42 : LayerImpl::exportTo(exporter, method);
629 42 : exporter.saveResult(conv_props, method, this);
630 42 : }
631 :
632 811 : void Conv2DLayer::setProperty(const std::vector<std::string> &values) {
633 811 : auto remain_props = loadProperties(values, conv_props);
634 809 : LayerImpl::setProperty(remain_props);
635 809 : }
636 :
637 : } /* namespace nntrainer */