// SPDX-License-Identifier: Apache-2.0
/**
 * Copyright (C) 2020 Jijoong Moon <jijoong.moon@samsung.com>
 *
 * @file time_dist.cpp
 * @date 01 April 2021
 * @brief This is the Time Distributed Layer class of the neural network
 * @see https://github.com/nnstreamer/nntrainer
 * @author Jijoong Moon <jijoong.moon@samsung.com>
 * @bug No known bugs except for NYI items
 *
 */

#include <layer_context.h>
#include <nntrainer_error.h>
#include <nntrainer_log.h>
#include <time_dist.h>
#include <util_func.h>
#include <weight.h>

namespace nntrainer {

static constexpr size_t SINGLE_INOUT_IDX = 0;

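/**
 * @brief swap the batch and height dimensions of @a m in place: a pure
 *        metadata reshape from [b, c, h, w] to [h, c, b, w]; the underlying
 *        data is not reordered
 */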
static void reshape(Tensor &m) {
  TensorDim d = m.getDim();
  m.reshape({d[2], d[1], d[0], d[3]});
}

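/**
 * @brief cache the raw data pointers of the input/output variables and their
 *        derivatives; transposeInOut() compares tensors against these cached
 *        pointers to tell when two of them share the same memory, so a shared
 *        buffer is not transposed twice
 */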
void TimeDistLayer::setPosition(RunLayerContext &context) {
  positions[0] = context.getInput(SINGLE_INOUT_IDX).getData();
  positions[2] = context.getOutput(SINGLE_INOUT_IDX).getData();
  /** TODO: use mode of execution here */
  try {
    positions[1] = context.getOutgoingDerivative(SINGLE_INOUT_IDX).getData();
    positions[3] =
      (float *)context.getIncomingDerivative(SINGLE_INOUT_IDX).getData();
  } catch (...) {
    /** in case of inference, these tensors will not exist */
  }
}

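/**
 * @brief transpose the input, output and derivative tensors in place from
 *        [b, 1, h, w] to [h, 1, b, w]; tensors that share memory with one
 *        already transposed (detected via the cached positions) are only
 *        reshaped
 */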
void TimeDistLayer::transposeInOut(RunLayerContext &context) {
  // Position[0] : net_input.variable
  Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
  input_.copy(transposeTensor(input_));

  // Position[1] : net_input.gradient
  Tensor &ret_ = context.getOutgoingDerivative(SINGLE_INOUT_IDX);
  if (ret_.getData() != positions[0]) {
    ret_.copy(transposeTensor(ret_));
  } else {
    reshape(ret_);
  }

  // Position[2] : net_hidden.variable
  Tensor &hval_ = context.getOutput(SINGLE_INOUT_IDX);
  if (hval_.getData() != positions[0] && hval_.getData() != positions[1]) {
    hval_.copy(transposeTensor(hval_));
  } else {
    reshape(hval_);
  }

  // Position[3] : net_hidden.gradient
  bool trans = true;

  /// @fixme: the below is probably wrong as this changes the incoming
  /// derivative; other layers referring to it will have wrong output grad
  /// information.
  Tensor &derivative_ = context.getOutputGradUnsafe(SINGLE_INOUT_IDX);
  for (unsigned int i = 0; i < 3; ++i) {
    if (derivative_.getData() == positions[i]) {
      trans = false;
      break;
    }
  }
  if (trans)
    derivative_.copy(transposeTensor(derivative_));
  else
    reshape(derivative_);
}

Tensor TimeDistLayer::transposeTensor(Tensor &m) {
  TensorDim dim = m.getDim();
  // Assume the channel is 1. The time dimension is h. This transposes
  // [b, 1, h, w] to [h, 1, b, w], and nntrainer only supports transposing the
  // last three axes (channel, height, width). So we reshape to [1, b, h, w]
  // first.
  // TODO:
  // If we reshape to {1, dim[0]*dim[1], dim[2], dim[3]} and transpose to
  // {1, dim[2], dim[0]*dim[1], dim[3]}, then reshape to
  // {dim[2], dim[0], dim[1], dim[3]}, we could support the case where dim[1]
  // is not 1. But some other parts of the code would need to change to
  // support it.
  //
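  // Worked example with b = 4, h = 3, w = 5: [4, 1, 3, 5] is reshaped to
  // [1, 4, 3, 5], transposed ("1:0:2") to [1, 3, 4, 5], and finally reshaped
  // to the returned [3, 1, 4, 5]; m itself is restored to its original shape.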
  if (dim[1] != 1)
    throw std::invalid_argument(
      "Channel of Time distributed layer must be 1 for now");

  m.reshape({dim[1], dim[0], dim[2], dim[3]});
  Tensor in = m.transpose("1:0:2");
  in.reshape({dim[2], dim[1], dim[0], dim[3]});
  m.reshape(dim);
  in.setName(m.getName() + "_trans");

  return in;
}

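/**
 * @brief finalize the wrapped dist_layer on a single time step: the input is
 *        validated to have one channel, dist_layer is finalized with height 1,
 *        and the resulting output dimension is expanded back to the full
 *        number of time steps before the simulated context's tensor/weight
 *        requests are replayed onto the real context
 */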
void TimeDistLayer::finalize(InitLayerContext &context) {
  NNTR_THROW_IF(context.getNumInputs() != 1, std::invalid_argument)
    << "Time distributed layer takes only one input";

  if (!dist_layer) {
    throw std::invalid_argument("distributed layer is not set properly");
  }

  const TensorDim &input_dim = context.getInputDimensions()[0];
  if (input_dim.channel() != 1) {
    throw std::invalid_argument(
      "only 1 channel is allowed for time distributed layer");
  }

  /**
   * simulate an InitLayerContext, and then replicate its effect onto the
   * actual context
   */
  TensorDim dist_dim = input_dim;
  dist_dim.height(1);
  InitLayerContext dist_context({dist_dim}, {}, context.getInPlace(),
                                context.getName());

  // During forwarding and backwarding, this layer sets the input and output
  // buffers of dist_layer properly.
  // dist_layer will use forwarding_with_val and backwarding_with_val.
  dist_layer->finalize(dist_context);

  TensorDim output_dim = dist_context.getOutSpecs()[0].variable_spec.dim;
  // input_dim.height is the number of time iterations
  output_dim.height(input_dim.height());
  context.setOutputDimensions({output_dim});

  /** real setting of context */
  fillLayerInitContext(context, dist_context);
}

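/**
 * @brief wrap the weights (and their gradients, when present) owned by
 *        @a context into weights_wrapper so they can be handed to the
 *        temporary RunLayerContext built for dist_layer on every time step
 */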
void TimeDistLayer::fillWeightsFromContext(RunLayerContext &context) {
  weights_wrapper.resize(context.getNumWeights());

  /** create weights */
  for (unsigned int idx = 0; idx < context.getNumWeights(); idx++) {
    if (context.weightHasGradient(idx)) {
      weights_wrapper[idx] =
        Weight(context.getWeight(idx), context.getWeightGrad(idx),
               context.getWeightName(idx));
    } else {
      weights_wrapper[idx] =
        Weight(context.getWeight(idx), Tensor(), context.getWeightName(idx));
    }
  }
}

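/**
 * @brief wrap the requested tensors (and their gradients, when present) owned
 *        by @a context into tensors_wrapper, mirroring
 *        fillWeightsFromContext() for Var_Grad tensors
 */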
void TimeDistLayer::fillTensorsFromContext(RunLayerContext &context) {
  tensors_wrapper.resize(context.getNumTensors());

  /** create tensors */
  for (unsigned int idx = 0; idx < context.getNumTensors(); idx++) {
    if (context.tensorHasGradient(idx)) {
      tensors_wrapper[idx] =
        Var_Grad(context.getTensor(idx), context.getTensorGrad(idx),
                 context.getTensorName(idx));
    } else {
      tensors_wrapper[idx] =
        Var_Grad(context.getTensor(idx), Tensor(), context.getTensorName(idx));
    }
  }
}

std::vector<Weight *> TimeDistLayer::getWeightsForContext() {
  /** create weights for context */
  std::vector<Weight *> weights_for_context;
  for (auto &w : weights_wrapper)
    weights_for_context.push_back(&w);

  return weights_for_context;
}

std::vector<Var_Grad *> TimeDistLayer::getTensorsForContext() {
  /** create tensors for context */
  std::vector<Var_Grad *> tensors_for_context;
  for (auto &t : tensors_wrapper)
    tensors_for_context.push_back(&t);

  return tensors_for_context;
}

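/**
 * @brief forward time step by time step: the [b, 1, h, w] input is transposed
 *        to [h, 1, b, w], sliced along the time axis, and each [b, 1, 1, w]
 *        slice is forwarded through dist_layer via a temporary
 *        RunLayerContext; the collected output is transposed back into the
 *        [b, 1, h, w] hidden tensor
 */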
void TimeDistLayer::forwarding(RunLayerContext &context, bool training) {
  setPosition(context);

  Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX);
  Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
  // input_.dim = [ b, 1, h, w ]

  Tensor h_g;

  const TensorDim &ho_dim = hidden_.getDim();
  const TensorDim &in_dim = input_.getDim();

  // TODO: this transposed input tensor could be reused for backwarding
  Tensor in = transposeTensor(input_);

  Tensor out = Tensor({ho_dim[2], 1, ho_dim[0], ho_dim[3]}, true,
                      Initializer::NONE, context.getName() + ":inter_output");

  TensorDim i_dim = in_dim;
  i_dim.channel(1);
  i_dim.height(1);

  TensorDim h_dim = ho_dim;
  h_dim.channel(1);
  h_dim.height(1);

  if (dist_layer->requireLabel() &&
      context.isLabelAvailable(SINGLE_INOUT_IDX)) {
    Tensor &hidden_g = context.getLabel(SINGLE_INOUT_IDX);
    h_g = transposeTensor(hidden_g);
  }

  Var_Grad in_var(i_dim, Initializer::NONE, false, false, "input");
  Var_Grad out_var(h_dim, Initializer::NONE,
                   dist_layer->requireLabel() &&
                     context.isLabelAvailable(SINGLE_INOUT_IDX),
                   false, "output");

  fillWeightsFromContext(context);
  fillTensorsFromContext(context);

  for (unsigned int i = 0; i < in_dim.height(); ++i) {
    //
    // Iterate along the height (time) direction. The per-step slice of the
    // input has dimension [ b, 1, 1, width ], and the per-step slice of the
    // output has dimension [ b, 1, 1, width ].
    //
    Tensor label_iter;

    Tensor in_iter = in.getSharedDataTensor(
      i_dim, i * in_dim.batch() * in_dim.width(), true, in.getName());
    Tensor out_iter = out.getSharedDataTensor(
      h_dim, i * ho_dim.batch() * ho_dim.width(), true, out.getName());

    in_var.initializeVariable(in_iter);
    out_var.initializeVariable(out_iter);

    if (dist_layer->requireLabel() &&
        context.isLabelAvailable(SINGLE_INOUT_IDX)) {
      label_iter = h_g.getSharedDataTensor(
        h_dim, i * ho_dim.batch() * ho_dim.width(), true, h_g.getName());
      out_var.initializeGradient(label_iter);
    }

    RunLayerContext dist_context(
      context.getName(), context.getTrainable(), context.getLoss(),
      context.getInPlace(), context.getLossScale(), context.getContextData(),
      false, getWeightsForContext(), {&in_var}, {&out_var},
      getTensorsForContext());

    dist_layer->forwarding(dist_context, training);
  }

  hidden_.copy(transposeTensor(out));
  clearFromContext();
}

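/**
 * @brief compute the derivative w.r.t. the input time step by time step; the
 *        tensors are expected to be already transposed by calcGradient() via
 *        transposeInOut(), so only the outgoing derivative is transposed back
 *        at the end while the remaining tensors are merely reshaped
 */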
void TimeDistLayer::calcDerivative(RunLayerContext &context) {
  /// @fixme: this is probably wrong as it mutates the incoming derivative;
  /// the layer will need to copy instead of transposing and overriding in
  /// place
  Tensor &derivative_ = context.getOutputGradUnsafe(SINGLE_INOUT_IDX);
  Tensor &hval_ = context.getOutput(SINGLE_INOUT_IDX);
  Tensor &ret_ = context.getOutgoingDerivative(SINGLE_INOUT_IDX);
  Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);

  TensorDim der_dim = derivative_.getDim();
  TensorDim ret_dim = ret_.getDim();

  TensorDim r_dim = {ret_dim[2], 1, 1, ret_dim[3]};
  TensorDim d_dim = {der_dim[2], 1, 1, der_dim[3]};

  Var_Grad in_var(r_dim, Initializer::NONE, true, false, "input");
  Var_Grad out_var(d_dim, Initializer::NONE, true, false, "output");

  fillWeightsFromContext(context);
  fillTensorsFromContext(context);

  for (unsigned int i = 0; i < der_dim[0]; ++i) {
    Tensor ret_iter = ret_.getSharedDataTensor(
      r_dim, i * r_dim.batch() * r_dim.width(), true, ret_.getName());
    Tensor in_iter = input_.getSharedDataTensor(
      r_dim, i * r_dim.batch() * r_dim.width(), true, input_.getName());
    Tensor d_iter = derivative_.getSharedDataTensor(
      d_dim, i * d_dim.batch() * d_dim.width(), true, derivative_.getName());
    Tensor hval_iter = hval_.getSharedDataTensor(
      d_dim, i * d_dim.batch() * d_dim.width(), true, hval_.getName());

    in_var.initializeGradient(ret_iter);
    in_var.initializeVariable(in_iter);
    out_var.initializeGradient(d_iter);
    out_var.initializeVariable(hval_iter);

    RunLayerContext dist_context(
      context.getName(), context.getTrainable(), context.getLoss(),
      context.getInPlace(), context.getLossScale(), context.getContextData(),
      false, getWeightsForContext(), {&in_var}, {&out_var},
      getTensorsForContext());

    dist_layer->calcDerivative(dist_context);
  }

  ret_.copy(transposeTensor(ret_));
  // We are not going to transpose the data back; it is not used anymore and
  // will be overwritten at the next iteration.
  // Just reshape the tensors.
  hval_.reshape({der_dim[2], 1, der_dim[0], der_dim[3]});
  derivative_.reshape({der_dim[2], 1, der_dim[0], der_dim[3]});
  input_.reshape({ret_dim[2], 1, ret_dim[0], ret_dim[3]});
  clearFromContext();
}

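/**
 * @brief transpose the attached tensors into time-major layout and, if the
 *        layer has weights, run dist_layer's calcGradient() once per time
 *        step on per-step slices of the (already transposed) input and
 *        incoming derivative
 */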
void TimeDistLayer::calcGradient(RunLayerContext &context) {
  // Even if dist_layer->getNumWeights() == 0, we transpose here for the
  // derivative calculation and overwrite the original tensors, then use them
  // in calcDerivative() without transposing again.
  transposeInOut(context);

  if (context.getNumWeights() == 0)
    return;

  Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
  const Tensor &derivative_ = context.getIncomingDerivative(SINGLE_INOUT_IDX);

  TensorDim der_dim = derivative_.getDim();
  TensorDim in_dim = input_.getDim();

  TensorDim i_dim = {in_dim[2], 1, 1, in_dim[3]};
  TensorDim d_dim = {der_dim[2], 1, 1, der_dim[3]};

  fillWeightsFromContext(context);
  fillTensorsFromContext(context);

  for (unsigned int i = 0; i < der_dim[0]; ++i) {
    Tensor in_iter = input_.getSharedDataTensor(
      i_dim, i * i_dim.batch() * i_dim.width(), true, input_.getName());
    Tensor d_iter = derivative_.getSharedDataTensor(
      d_dim, i * d_dim.batch() * d_dim.width(), true, derivative_.getName());

    Var_Grad in_var(i_dim, Initializer::NONE, true, false, "input");
    Var_Grad out_var(d_dim, Initializer::NONE, true, false, "output");

    in_var.initializeVariable(in_iter);
    out_var.initializeGradient(d_iter);

    RunLayerContext dist_context(
      context.getName(), context.getTrainable(), context.getLoss(),
      context.getInPlace(), context.getLossScale(), context.getContextData(),
      false, getWeightsForContext(), {&in_var}, {&out_var},
      getTensorsForContext());

    dist_layer->calcGradient(dist_context);
  }
  clearFromContext();
}

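/**
 * @brief replicate the simulated dist_context onto the real context: copy the
 *        dynamic/effective dimension flags of the inputs and re-issue every
 *        tensor and weight request collected during dist_layer->finalize()
 */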
void TimeDistLayer::fillLayerInitContext(InitLayerContext &context,
                                         const InitLayerContext &dist_context) {
  /** real setting of the input flags */
  auto const &input_dims = context.getInputDimensions();
  for (unsigned int idx = 0; idx < dist_context.getNumInputs(); idx++) {
    context.setDynDimFlagInputDimension(idx, input_dims[idx].getDynDimFlag());
    context.setEffDimFlagInputDimension(idx, input_dims[idx].getEffDimFlag());
  }

  /** real request of tensors */
  for (auto const &ts : dist_context.getTensorsSpec())
    context.requestTensor(ts);

  /** real request of weights */
  for (auto const &ws : dist_context.getWeightsSpec())
    context.requestWeight(ws);
}

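/**
 * @brief propagate a batch size change to dist_layer through a temporary
 *        single-time-step RunLayerContext, then update each tensor requested
 *        in the outer context with the batch size dist_layer decided on
 */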
void TimeDistLayer::setBatch(RunLayerContext &context, unsigned int batch) {
  if (context.getNumTensors() > 0) {
    const TensorDim &out_dim = context.getOutput(SINGLE_INOUT_IDX).getDim();
    const TensorDim &in_dim = context.getInput(SINGLE_INOUT_IDX).getDim();

    TensorDim i_dim = {in_dim[2], 1, 1, in_dim[3]};
    TensorDim o_dim = {out_dim[2], 1, 1, out_dim[3]};

    Var_Grad in_var(i_dim, Initializer::NONE, true, false, "input");
    Var_Grad out_var(o_dim, Initializer::NONE, true, false, "output");

    fillWeightsFromContext(context);
    fillTensorsFromContext(context);

    RunLayerContext dist_context(
      context.getName(), context.getTrainable(), context.getLoss(),
      context.getInPlace(), context.getLossScale(), context.getContextData(),
      false, getWeightsForContext(), {&in_var}, {&out_var},
      getTensorsForContext());

    dist_layer->setBatch(dist_context, batch);

    for (unsigned int idx = 0; idx < dist_context.getNumTensors(); idx++) {
      context.updateTensor(idx, dist_context.getTensor(idx).getDim().batch());
    }

    clearFromContext();
  }
}

} /* namespace nntrainer */