Line data Source code
1 : // SPDX-License-Identifier: Apache-2.0
2 : /**
3 : * Copyright (C) 2020 Parichay Kapoor <pk.kapoor@samsung.com>
4 : *
5 : * @file manager.cpp
6 : * @date 2 Dec 2020
7 : * @brief This is the NNtrainer manager for all weights, i/o and intermediate
8 : * tensors
9 : * @see https://github.com/nnstreamer/nntrainer
10 : * @author Parichay Kapoor <pk.kapoor@samsung.com>
11 : * @author Jihoon Lee <jhoon.it.lee@samsung.com>
12 : * @bug No known bugs except for NYI items
13 : *
14 : */
15 :
16 : #ifdef __ANDROID__
17 : #include <android/sharedmem.h>
18 : #endif
19 :
20 : #ifdef DEBUG
21 : #include <cassert>
22 : #endif
23 : #include <fcntl.h>
24 : #include <functional>
25 : #include <limits>
26 : #include <stdexcept>
27 : #include <sys/stat.h>
28 : #include <vector>
29 :
30 : #if !defined(_WIN32)
31 : #include <sys/mman.h>
32 : #include <unistd.h>
33 : #endif
34 :
35 : #include <activation_layer.h>
36 : #include <basic_planner.h>
37 : #include <bn_layer.h>
38 : #include <graph_node.h>
39 : #include <grucell.h>
40 : #include <layer_node.h>
41 : #include <layer_normalization_layer.h>
42 : #include <loss/cross_entropy_sigmoid_loss_layer.h>
43 : #include <loss/cross_entropy_softmax_loss_layer.h>
44 : #include <loss/mse_loss_layer.h>
45 : #include <manager.h>
46 : #include <multiout_layer.h>
47 : #include <nntrainer_log.h>
48 : #include <optimized_v1_planner.h>
49 : #include <optimized_v2_planner.h>
50 : #include <optimized_v3_planner.h>
51 : #include <tensor_pool.h>
52 : #include <tensor_wrap_specs.h>
53 : #include <util_func.h>
54 : #include <var_grad.h>
55 :
56 : #include "utils/mman_windows.h"
57 :
58 : namespace nntrainer {
59 :
60 0 : MMapedMemory::MMapedMemory(size_t size, bool allocate_fd_) :
61 0 : fd(-1), buf(nullptr), buf_size(0), allocate_fd(allocate_fd_) {
62 :
63 : #ifndef __ANDROID__
64 0 : if (allocate_fd) {
65 : /// @todo create a file in tmpfs and bind to memfs
66 : /// memfd_create is not available on a number of platforms, so this is
67 : /// commented out
68 : // auto fd_ = memfd_create("", 0);
69 : // if (fd_ < 0) {
70 : // throw std::runtime_error("[Manager] creating mem fd failed");
71 : // }
72 : // if (ftruncate(fd_, size) < 0) {
73 : // throw std::runtime_error("[Manager] truncating fd failed");
74 : // }
75 0 : ml_logi("[MMapedMemory] fd creation is not supported on this platform");
76 0 : allocate_fd = false;
77 : }
78 : #endif
79 : int fd_ = -1;
80 : void *buf_ = nullptr;
81 :
82 0 : if (allocate_fd) {
83 : #ifdef __ANDROID__
84 : /// unfortunately, memfd_create is not supported before Android API level 30
85 : fd_ = ASharedMemory_create("", size);
86 : if (fd_ < 0) {
87 : throw std::runtime_error("[MMapedMemory] creating mem fd failed");
88 : }
89 :
90 : if (ASharedMemory_setProt(fd_, PROT_READ | PROT_WRITE) < 0) {
91 : // unlink / close the given fd here
92 : close(fd_);
93 : throw std::runtime_error("[MMapedMemory] Setting prot failed");
94 : }
95 :
96 : buf_ = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0);
97 : #endif
98 : } else {
99 0 : buf_ = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
100 : fd_, 0);
101 : }
102 :
103 0 : if (buf_ == MAP_FAILED) {
104 : #ifdef __ANDROID__
105 : if (fd_ != -1) {
106 : // unlink / close the given fd here
107 : close(fd_);
108 : }
109 : #endif
110 :
111 0 : throw std::runtime_error("[MMapedMemory] mmap failed");
112 : }
113 :
114 0 : fd = fd_;
115 0 : buf = buf_;
116 0 : buf_size = size;
117 :
118 0 : ml_logd("[MMapedMemory] memory acquired size: %zu, fd: %d, addr: %p",
119 : buf_size, fd, buf);
120 0 : }
121 :
122 0 : MMapedMemory::~MMapedMemory() noexcept {
123 : #ifdef DEBUG
124 : assert(buf_size > 0 && fd > 0);
125 : #endif
126 :
127 0 : if (fd != -1) {
128 0 : if (close(fd) < 0) {
129 0 : ml_logw("[MMapedMemory] closing fd failed on destruction, please check");
130 : }
131 : }
132 :
133 0 : if (buf != nullptr) {
134 0 : if (munmap(buf, buf_size) < 0) {
135 0 : ml_logw("[MMapedMemory] munmap failed on destruction, please check");
136 : }
137 : }
138 :
139 : /// keeping the invariant although this is not necessary as of now
140 0 : fd = -1;
141 0 : buf = nullptr;
142 0 : buf_size = 0;
143 0 : ml_logd("[MMapedMemory] buf released");
144 0 : }
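
When an fd-backed mapping is unavailable, the constructor above falls back to a private anonymous mapping. Below is a minimal standalone sketch of that fallback path using plain POSIX calls only (the Android ASharedMemory branch and the MMapedMemory class itself are deliberately not used; the helper names are hypothetical), showing the acquire/release pairing that the constructor and destructor implement.

  #include <cstddef>
  #include <stdexcept>
  #include <sys/mman.h>

  // Map an anonymous read/write buffer, mirroring the MAP_PRIVATE |
  // MAP_ANONYMOUS branch of the constructor above.
  void *acquire_anonymous_buffer(size_t size) {
    void *buf = mmap(NULL, size, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (buf == MAP_FAILED)
      throw std::runtime_error("mmap failed");
    return buf;
  }

  // Release the buffer; as in the destructor, a failing munmap is treated
  // as non-fatal (the real code logs a warning and continues).
  void release_anonymous_buffer(void *buf, size_t size) {
    (void)munmap(buf, size);
  }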
145 :
146 1 : void Manager::reinitialize() {
147 : inputs_v2.clear();
148 : outputs_v2.clear();
149 : tensors_v2.clear();
150 1 : tensor_pool.reinitialize();
151 1 : }
152 :
153 1300 : void Manager::allocateWeights(unsigned int max_exec_order_, bool init) {
154 1300 : max_exec_order = max_exec_order_;
155 1300 : if (!weight_pool.isAllocated()) {
156 618 : finalizeTensorPool(weight_pool, 0, max_exec_order_);
157 618 : weight_pool.allocate(init);
158 : }
159 1300 : }
160 :
161 1534 : void Manager::deallocateWeights() { weight_pool.deallocate(); }
162 :
163 21671 : static Tensor *requestTensor_(const TensorSpecV2 &spec,
164 : const GraphNode::ExecutionOrder &exec_order,
165 : const std::string &scope, TensorPool &tp,
166 : bool expose, bool trainable) {
167 : using RT = TensorSpecV2::RequestType;
168 : using LS = TensorLifespan;
169 21671 : NNTR_THROW_IF(spec.request_type == RT::MAYBE_MODIFYING_VIEW,
170 : std::invalid_argument)
171 : << "Modifying view cannot be requested, the request type has to be "
172 : "delegated to either view or unique";
173 :
174 21671 : auto [forward, calc_grad, calc_deriv, apply_grad] = exec_order;
175 :
176 21671 : std::vector<unsigned> order = spec.additional_exec_order;
177 21671 : if (expose) {
178 0 : order.push_back(TensorPool::PERSIST_END_ORDER);
179 : }
180 :
181 21671 : const auto name = scope + ":" + spec.name;
182 21671 : if (enum_class_or(spec.ls, LS::FORWARD_FUNC_LIFESPAN) == spec.ls) {
183 10908 : order.push_back(forward);
184 : }
185 21671 : if (enum_class_or(spec.ls, LS::CALC_GRAD_LIFESPAN) == spec.ls) {
186 9307 : order.push_back(calc_grad);
187 : }
188 21671 : if (enum_class_or(spec.ls, LS::CALC_DERIV_LIFESPAN) == spec.ls) {
189 12226 : order.push_back(calc_deriv);
190 : }
191 21671 : if (enum_class_or(spec.ls, LS::CALC_AGRAD_LIFESPAN) == spec.ls) {
192 0 : order.push_back(apply_grad);
193 : }
194 :
195 21671 : switch (spec.request_type) {
196 1604 : case RT::PLACEHOLDER:
197 1604 : return tp.placeholder(name, spec.dim);
198 8416 : case RT::UNIQUE:
199 8416 : return tp.request(name, spec.dim, order, spec.ls, spec.initializer);
200 0 : case RT::SHARED:
201 0 : return tp.requestOrExtend(name, spec.dim, order, spec.ls, spec.initializer);
202 11651 : case RT::READ_ONLY_VIEW:
203 11651 : return tp.view(name, spec.reference_name, spec.dim, order, spec.ls);
204 0 : case RT::MAYBE_MODIFYING_VIEW:
205 : default:
206 0 : throw std::logic_error("requestTensor_ should not reach here");
207 : }
208 :
209 : return nullptr;
210 21671 : }
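
The lifespan checks in requestTensor_ treat TensorLifespan as a bit set: enum_class_or(spec.ls, FLAG) == spec.ls holds exactly when all bits of FLAG are already contained in spec.ls, and each contained flag contributes one execution order to the request. The following self-contained illustration of that subset test uses invented enum values (not the real TensorLifespan layout); it only demonstrates the pattern.

  #include <cstdint>

  enum class Lifespan : std::uint8_t {
    FORWARD = 0b001,
    CALC_GRAD = 0b010,
    CALC_DERIV = 0b100,
    MAX = 0b111, // union of all phases
  };

  constexpr Lifespan operator|(Lifespan a, Lifespan b) {
    return static_cast<Lifespan>(static_cast<std::uint8_t>(a) |
                                 static_cast<std::uint8_t>(b));
  }

  // True when every bit of flag is already set in ls; the same
  // "or(ls, flag) == ls" pattern decides above which exec orders a
  // tensor must cover.
  constexpr bool covers(Lifespan ls, Lifespan flag) { return (ls | flag) == ls; }

  static_assert(covers(Lifespan::MAX, Lifespan::CALC_GRAD),
                "a MAX lifespan covers the gradient phase");
  static_assert(!covers(Lifespan::FORWARD, Lifespan::CALC_DERIV),
                "a forward-only lifespan does not cover the derivative phase");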
211 :
212 5352 : Var_Grad *Manager::requestTensor(const VarGradSpecV2 &spec,
213 : TensorGroupType identify_as,
214 : const GraphNode::ExecutionOrder &exec_order,
215 : const std::string &scope, bool expose_var,
216 : bool expose_grad) {
217 5352 : NNTR_THROW_IF(identify_as == TensorGroupType::WEIGHT, std::invalid_argument)
218 : << "requestTensor with var grad spec cannot be identified as weights, use "
219 : "requestTensor with weight spec instead";
220 :
221 5352 : NNTR_THROW_IF(identify_as == TensorGroupType::INPUT or
222 : identify_as == TensorGroupType::TENSORS,
223 : nntrainer::exception::not_supported)
224 : << "Currently, input and tensors group type is not yet implemented, use "
225 : "requestInputs() requestTensors() instead";
226 :
227 5352 : bool is_train_mode = (exec_mode == ExecutionMode::TRAIN) ? true : false;
228 :
229 5352 : Tensor *var = requestTensor_(spec.variable_spec, exec_order, scope,
230 5352 : tensor_pool, expose_var, false);
231 5216 : Tensor *grad = (spec.gradient_spec && is_train_mode)
232 10564 : ? requestTensor_(*spec.gradient_spec, exec_order, scope,
233 : tensor_pool, expose_grad, false)
234 5352 : : nullptr;
235 :
236 : /// @note as only identify_as == TensorGroupType::output is supported, this
237 : /// only saves to outputs for now
238 10704 : outputs_v2.push_back(std::make_unique<Var_Grad>(var, grad));
239 :
240 5352 : return outputs_v2.back().get();
241 : }
242 :
243 4435 : std::vector<Var_Grad *> Manager::requestTensors(
244 : const std::vector<VarGradSpecV2> &specs, TensorGroupType identify_as,
245 : const GraphNode::ExecutionOrder &exec_order, const std::string &scope,
246 : bool expose_var, bool expose_grad) {
247 : std::vector<Var_Grad *> ret;
248 4435 : ret.reserve(specs.size());
249 9787 : for (auto &spec : specs) {
250 5352 : ret.push_back(requestTensor(spec, identify_as, exec_order, scope,
251 : expose_var, expose_grad));
252 : }
253 :
254 4435 : return ret;
255 0 : }
256 :
257 : /**
258 : * @brief Allocate memory for all the managed tensors
259 : */
260 683 : void Manager::allocateTensors(unsigned int max_exec_order_) {
261 683 : allocateWeights(max_exec_order_);
262 :
263 683 : if (!tensor_pool.isAllocated()) {
264 683 : finalizeTensorPool(tensor_pool, 0, max_exec_order_);
265 683 : tensor_pool.allocate();
266 : }
267 683 : }
268 :
269 : /**
270 : * @brief Deallocate memory for all the managed tensors
271 : */
272 2484 : void Manager::deallocateTensors(bool dealloc_weights) {
273 2484 : if (dealloc_weights) {
274 1534 : deallocateWeights();
275 : }
276 :
277 2484 : tensor_pool.deallocate();
278 2484 : }
279 :
280 : #ifdef LAYER_V1
281 : void Manager::initializeTensorsInference(unsigned int max_exec_order_) {
282 : /**
283 : * A single buffer (shared_inout) provides memory for the inputs and outputs of
284 : * a layer. Further, the output of layer i shares memory with the input of layer
285 : * i+1. So, alternate layers allocate memory from either the start or the end of
286 : * the buffer, and use_first_last tracks which end is in use (see the standalone sketch after this #ifdef block).
287 : *
288 : * @note Label for the last layer is not initialized in inference.
289 : * @note Input for the first layer is not initialized in inference.
290 : */
291 : // Initialize shared input/output memory for inference
292 : // @note Memory for the label is not allocated here as inference does not use labels
293 : if (enable_inference_inout_memory_opt)
294 : shared_inout = Tensor(TensorDim({max_shared_inout}), false);
295 :
296 : bool use_first_last = 0;
297 : for (unsigned int idx = 0; idx < in_outs.size(); idx++) {
298 : auto &l_io = in_outs[idx];
299 : unsigned int offset = 0;
300 : bool is_first_layer = idx == 0;
301 :
302 : // For flatten layer, do not assign new memory
303 : if (idx > 0 && is_flat_type[idx])
304 : use_first_last = 1 - use_first_last;
305 :
306 : // In inference mode, do not allocate the memory for the input of the
307 : // first layer. This is the first entry in in_outs. Inference() will
308 : // override input tensors of the first layer
309 : if (is_first_layer)
310 : continue;
311 :
312 : for (auto &io : l_io) {
313 : Tensor shared_inout_cur = Tensor();
314 : if (enable_inference_inout_memory_opt) {
315 : // if optimized
316 : if (use_first_last) {
317 : // Create tensor from the front of the shared tensor
318 : shared_inout_cur =
319 : shared_inout.getSharedDataTensor(io->getDim(), offset);
320 : } else {
321 : // Create tensor from the back of the shared tensor
322 : shared_inout_cur = shared_inout.getSharedDataTensor(
323 : io->getDim(),
324 : max_shared_inout - io->getDim().getDataLen() - offset);
325 : }
326 : offset += io->getDim().getDataLen();
327 : }
328 : io->initialize(shared_inout_cur, Tensor(), false);
329 : }
330 : use_first_last = 1 - use_first_last;
331 : }
332 : }
333 :
334 : void Manager::initializeTensorsTrain(unsigned int max_exec_order_) {
335 : // Initialize gradients
336 : initializeGradients();
337 :
338 : // Initialize shared derivative memory
339 : if (max_derivative_size > 0 && enable_activation_memory_opt)
340 : shared_deriv = Tensor(TensorDim({max_derivative_size}), false);
341 : for (unsigned int idx = 0; idx < in_outs.size(); idx++) {
342 : auto &l_io = in_outs[idx];
343 : unsigned int offset = 0;
344 : bool is_last_layer = idx == in_outs.size() - 1;
345 :
346 : for (auto &io : l_io) {
347 : // Last layer requires separate memory allocations for output and label
348 : // (deriv)
349 : if (enable_derivative_memory_opt && !is_last_layer) {
350 : // Training Mode with optimizations
351 : if (enable_activation_memory_opt &&
352 : (is_rnn_type[idx] || is_act_type[idx])) {
353 : io->initialize(
354 : Tensor(), shared_deriv.getSharedDataTensor(io->getDim(), offset));
355 : offset += io->getDim().getDataLen();
356 : } else {
357 : io->initializeShared();
358 : }
359 :
360 : } else {
361 : // Training Mode without optimizations
362 : io->initialize(Tensor(), Tensor(), true);
363 : }
364 : }
365 : }
366 : }
367 : #endif
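
The comment at the top of initializeTensorsInference above describes a ping-pong placement: alternating layers carve their i/o tensors from opposite ends of the single shared_inout buffer, so one layer's outputs never overlap the next layer's inputs. A simplified standalone sketch of that offset computation follows (plain sizes instead of Tensor views; the helper is hypothetical and not part of the legacy API).

  #include <cstddef>
  #include <vector>

  // Offsets for one layer's i/o buffers inside a shared buffer of length
  // shared_len. When use_front is true the views grow upward from 0;
  // otherwise they grow downward from the end. The caller flips use_front
  // for the next layer, as the loop above does with use_first_last.
  std::vector<size_t> plan_layer_offsets(const std::vector<size_t> &io_sizes,
                                         size_t shared_len, bool use_front) {
    std::vector<size_t> offsets;
    size_t offset = 0;
    for (size_t len : io_sizes) {
      offsets.push_back(use_front ? offset : shared_len - len - offset);
      offset += len;
    }
    return offsets;
  }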
368 :
369 : /**
370 : * @brief Create weights with the given spec
371 : *
372 : */
373 4407 : std::vector<Weight *> Manager::requestWeights(
374 : const GraphNode &node, const std::vector<Weight::Spec> &weights_spec,
375 : bool trainable, const std::vector<std::string> &shared_names) {
376 : const auto [forwarding_order, calcGradient_order, calcDerivative_order,
377 4407 : applyGradient_order] = node.getExecutionOrder();
378 :
379 : std::vector<unsigned int> default_var_exec_order(
380 4407 : {forwarding_order, calcDerivative_order});
381 :
382 : /**
383 : * TODO: This needs to be fixed. calcDerivative does not need the gradient.
384 : * However, the current implementation of loss needs the gradient computation,
385 : * and therefore, if we remove the calcDerivative order, tests fail.
386 : */
387 : TensorLifespan var_ls;
388 4407 : if (exec_mode != ExecutionMode::INFERENCE) {
389 : var_ls = TensorLifespan::MAX_LIFESPAN;
390 : } else {
391 5 : if (enable_fsu) {
392 : var_ls = TensorLifespan::FORWARD_FUNC_LIFESPAN;
393 : } else {
394 : var_ls = TensorLifespan::FORWARD_INFER_LIFESPAN;
395 : }
396 : }
397 :
398 : TensorLifespan grad_ls = TensorLifespan::BACKWARD_FUNC_LIFESPAN;
399 :
400 : std::vector<Weight *> ret;
401 : size_t current_size = weights_v2.size();
402 :
403 9249 : for (unsigned int i = 0; i < weights_spec.size(); ++i) {
404 : auto &[dim_v, dim_g, t_initializer, w_reg, w_reg_const, decay,
405 : clip_by_global_norm, need_gradient, name, axis, loss_scale, is_mixed,
406 : is_virtual] = weights_spec.at(i);
407 :
408 : std::vector<unsigned int> var_exec_order;
409 14510 : for (auto order : default_var_exec_order) {
410 9676 : var_exec_order.push_back(order);
411 9676 : if (exec_mode == ExecutionMode::INFERENCE)
412 : break;
413 : }
414 : // auto var_exec_order = default_var_exec_order;
415 : std::vector<unsigned int> grad_exec_order;
416 :
417 4842 : if (trainable) {
418 4802 : var_exec_order.reserve(var_exec_order.size() + 2);
419 4802 : var_exec_order.push_back(calcGradient_order);
420 4802 : var_exec_order.push_back(applyGradient_order);
421 4802 : grad_exec_order.push_back(calcGradient_order);
422 4802 : grad_exec_order.push_back(applyGradient_order);
423 : }
424 :
425 : /**
426 : * If the weight is supposed to be clipped by global norm, extend its exec
427 : * order with the max exec order where it will be used for clipping and then
428 : * applied to the weight.
429 : */
430 9668 : if (Weight::isGradientClipByGlobalNorm(clip_by_global_norm) ||
431 4826 : isMixedPrecision()) {
432 16 : grad_exec_order.push_back(TensorPool::PERSIST_END_ORDER);
433 : // TODO: We need to double-check whether it is OK not to add PERSIST_END_ORDER
434 : // here or add other conditions
435 : // var_exec_order.push_back(TensorPool::PERSIST_END_ORDER);
436 : }
437 :
438 4842 : Tensor *var = nullptr, *grad = nullptr, *var32 = nullptr;
439 4842 : bool is_dependent = !shared_names.empty();
440 4842 : if (is_dependent) {
441 : /// shared_name is used and the original name is discarded
442 : const auto &shared_name = shared_names.at(i);
443 : /** case when shared names are given */
444 1648 : var = weight_pool.requestOrExtend(shared_name, dim_v, var_exec_order,
445 : var_ls, t_initializer);
446 1648 : if (trainable && need_gradient) {
447 : /** We cannot use tensor scheduling for the weight gradient if the
448 : * weight is shared. Weight sharing means the gradient is no longer
449 : * temporary for each layer and cannot simply be overwritten.
450 : */
451 1216 : grad = tensor_pool.requestOrExtend(shared_name + Var_Grad::grad_suffix,
452 : dim_g, grad_exec_order, grad_ls,
453 1216 : Initializer::ZEROS);
454 :
455 1216 : if (var->getDataType() != ml::train::TensorDim::DataType::FP32) {
456 0 : TensorDim var32_dim(dim_v);
457 : var32_dim.setDataType(ml::train::TensorDim::DataType::FP32);
458 : std::vector<unsigned int> var32_exec_order;
459 0 : var32_exec_order.push_back(TensorPool::PERSIST_END_ORDER);
460 :
461 0 : var32 = weight_pool.requestOrExtend(shared_name + ":var32", var32_dim,
462 : var32_exec_order, var_ls,
463 0 : Initializer::ZEROS);
464 0 : }
465 : }
466 : } else {
467 : /** case requesting fresh weights */
468 3194 : if (exec_mode == ExecutionMode::INFERENCE && enable_fsu) {
469 0 : for (unsigned int i = 0; i < fsu_lookahead; ++i) {
470 0 : int lah_order = (forwarding_order - (fsu_lookahead - i));
471 0 : var_exec_order.push_back(std::max(lah_order, 0));
472 : }
473 : }
474 3194 : if (is_virtual) {
475 0 : var = weight_pool.request(name, dim_v, var_exec_order,
476 : TensorLifespan::VIRTUAL, t_initializer);
477 : } else {
478 3194 : var = weight_pool.request(name, dim_v, var_exec_order, var_ls,
479 : t_initializer);
480 : }
481 : // }
482 :
483 3194 : if (trainable && need_gradient) {
484 : /** is_wgrad is a flag which is true when this is the gradient tensor
485 : * of a weight. If it is true, the memory planner schedules based on it to
486 : * reduce memory usage.
487 : */
488 : bool is_wgrad = true;
489 : // if (Weight::isGradientClipByGlobalNorm(clip_by_global_norm))
490 : // is_wgrad = false;
491 3116 : grad = tensor_pool.request(name + Var_Grad::grad_suffix, dim_g,
492 3116 : grad_exec_order, grad_ls, Initializer::ZEROS,
493 : is_wgrad);
494 3116 : if (var->getDataType() != ml::train::TensorDim::DataType::FP32) {
495 0 : TensorDim var32_dim(dim_v);
496 : var32_dim.setDataType(ml::train::TensorDim::DataType::FP32);
497 : std::vector<unsigned int> var32_exec_order;
498 0 : var32_exec_order.push_back(TensorPool::PERSIST_END_ORDER);
499 0 : var32 =
500 0 : weight_pool.request(name + ":var32", var32_dim, var32_exec_order,
501 0 : var_ls, Initializer::ZEROS);
502 0 : }
503 : }
504 : }
505 :
506 4842 : weights_v2.emplace_back(std::make_unique<Weight>(
507 : var, grad, var32, w_reg, w_reg_const, decay, is_dependent,
508 : clip_by_global_norm, axis, loss_scale, is_mixed));
509 4842 : }
510 :
511 : std::transform(weights_v2.begin() + current_size, weights_v2.end(),
512 : std::back_inserter(ret),
513 : [](auto const &elem) { return elem.get(); });
514 4407 : return ret;
515 4407 : }
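
The PERSIST_END_ORDER extension above exists because gradient clipping by global norm needs every participating gradient to be alive at the same time: the combined norm is computed across all of them before any is scaled. A minimal standalone sketch of that clipping step (plain float vectors rather than nntrainer tensors; the helper name is hypothetical):

  #include <cmath>
  #include <vector>

  // Scale all gradients by max_norm / global_norm when the combined L2 norm
  // exceeds max_norm. Every gradient must still exist at this point, which
  // is why their lifespans are extended to PERSIST_END_ORDER above.
  void clip_by_global_norm(std::vector<std::vector<float>> &grads,
                           float max_norm) {
    double sq_sum = 0.0;
    for (const auto &g : grads)
      for (float v : g)
        sq_sum += static_cast<double>(v) * v;
    const double global_norm = std::sqrt(sq_sum);
    if (global_norm <= max_norm)
      return;
    const float scale = static_cast<float>(max_norm / global_norm);
    for (auto &g : grads)
      for (float &v : g)
        v *= scale;
  }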
516 :
517 : /**
518 : * @brief Create tensors with the given spec
519 : *
520 : */
521 4435 : std::vector<Var_Grad *> Manager::requestTensors(
522 : const GraphNode &node, const std::vector<Var_Grad::Spec> &tensors_spec,
523 : bool trainable, const std::vector<std::string> &shared_names) {
524 : const auto [forwarding_order, calcGradient_order, calcDerivative_order,
525 4435 : applyGradient_order] = node.getExecutionOrder();
526 :
527 : std::vector<Var_Grad *> ret;
528 : size_t current_size = tensors_v2.size();
529 4435 : bool is_train_mode = (exec_mode == ExecutionMode::TRAIN) ? true : false;
530 :
531 7074 : for (unsigned int i = 0; i < tensors_spec.size(); ++i) {
532 : auto const &[dim, t_init, need_grad, name, tspan, t_engine] =
533 : tensors_spec.at(i);
534 :
535 : std::vector<unsigned int> var_exec_order;
536 : std::vector<unsigned int> grad_exec_order;
537 :
538 : /** usage for tensors */
539 2639 : if (enum_class_logical_and(tspan, TensorLifespan::FORWARD_FUNC_LIFESPAN))
540 2292 : var_exec_order.push_back(forwarding_order);
541 :
542 : /** usage for tensors gradient in backwarding */
543 2639 : if (trainable && is_train_mode &&
544 2637 : enum_class_logical_and(tspan, TensorLifespan::CALC_GRAD_LIFESPAN)) {
545 2215 : var_exec_order.push_back(calcGradient_order);
546 2215 : grad_exec_order.push_back(calcGradient_order);
547 : }
548 :
549 2639 : if (is_train_mode &&
550 2639 : enum_class_logical_and(tspan, TensorLifespan::CALC_DERIV_LIFESPAN)) {
551 2583 : var_exec_order.push_back(calcDerivative_order);
552 2583 : grad_exec_order.push_back(calcDerivative_order);
553 : }
554 :
555 2639 : if (trainable && is_train_mode &&
556 2637 : enum_class_logical_and(tspan, TensorLifespan::CALC_AGRAD_LIFESPAN)) {
557 2215 : var_exec_order.push_back(applyGradient_order);
558 2215 : grad_exec_order.push_back(applyGradient_order);
559 : }
560 :
561 : bool is_dependent = !shared_names.empty();
562 2639 : Tensor *var = nullptr, *grad = nullptr;
563 2639 : if (is_dependent) {
564 : const auto &shared_name = shared_names.at(i);
565 488 : var = tensor_pool.requestOrExtend(shared_name, dim, var_exec_order, tspan,
566 : t_init);
567 488 : if (need_grad && tspan > TensorLifespan::FORWARD_FUNC_LIFESPAN) {
568 488 : grad = tensor_pool.requestOrExtend(shared_name + Var_Grad::grad_suffix,
569 : dim, grad_exec_order, tspan,
570 976 : Initializer::ZEROS);
571 : }
572 : } else {
573 2151 : var = tensor_pool.request(name, dim, var_exec_order, tspan, t_init);
574 2151 : if (is_train_mode && need_grad &&
575 1088 : tspan > TensorLifespan::FORWARD_FUNC_LIFESPAN) {
576 1088 : grad = tensor_pool.request(name + Var_Grad::grad_suffix, /// name
577 : dim, grad_exec_order, tspan,
578 2176 : Initializer::ZEROS /// tensor initializer
579 : );
580 : }
581 : }
582 :
583 2639 : tensors_v2.emplace_back(std::make_unique<Var_Grad>(var, grad));
584 2639 : }
585 :
586 : std::transform(tensors_v2.begin() + current_size, tensors_v2.end(),
587 : std::back_inserter(ret),
588 : [](auto const &elem) { return elem.get(); });
589 4435 : return ret;
590 0 : }
591 :
592 : /**
593 : * @brief Create tensors with the given spec
594 : */
595 : std::vector<Var_Grad *>
596 4435 : Manager::requestInputs(const GraphNode &node,
597 : const std::vector<TensorDim> &inputs_dim,
598 : const std::vector<std::string> &outputs_name) {
599 : using RT = TensorSpecV2::RequestType;
600 :
601 4435 : bool is_train_mode = exec_mode == ExecutionMode::TRAIN;
602 :
603 4435 : TensorSpecV2 var_common_spec, grad_common_spec;
604 4435 : if (is_train_mode) {
605 4430 : var_common_spec.ls = TensorLifespan::FORWARD_GRAD_LIFESPAN;
606 : } else {
607 5 : var_common_spec.ls = TensorLifespan::FORWARD_FUNC_LIFESPAN;
608 : }
609 :
610 4435 : grad_common_spec.ls = TensorLifespan::CALC_DERIV_LIFESPAN;
611 : /// @todo handle this inside layer
612 8870 : if (node.getType() == ActivationLayer::type or
613 8466 : node.getType() == MultiOutLayer::type or
614 8082 : node.getType() == BatchNormalizationLayer::type or
615 12283 : node.getType() == LayerNormalizationLayer::type or !node.getTrainable())
616 996 : var_common_spec.ls = TensorLifespan::FORWARD_FUNC_LIFESPAN;
617 :
618 8870 : if (node.getType() == MSELossLayer::type or
619 8870 : node.getType() == CrossEntropySoftmaxLossLayer::type or
620 8256 : node.getType() == CrossEntropySigmoidLossLayer::type)
621 627 : var_common_spec.ls = TensorLifespan::FORWARD_DERIV_LIFESPAN;
622 :
623 4435 : if (node.getType() == GRUCellLayer::type) {
624 32 : grad_common_spec.ls = TensorLifespan::CALC_GRAD_DERIV_LIFESPAN;
625 : }
626 :
627 : std::vector<Var_Grad *> ret;
628 : size_t current_size = inputs_v2.size();
629 :
630 9991 : for (unsigned int idx = 0; idx < inputs_dim.size(); idx++) {
631 5556 : TensorSpecV2 var_spec = var_common_spec, grad_spec = grad_common_spec;
632 :
633 11112 : var_spec.name = std::string("input") + std::to_string(idx);
634 5556 : var_spec.dim = inputs_dim[idx];
635 :
636 11112 : grad_spec.name = var_spec.name + Var_Grad::grad_suffix;
637 5556 : grad_spec.dim = inputs_dim[idx];
638 :
639 5556 : if (!outputs_name.empty()) {
640 4583 : grad_spec.request_type = var_spec.request_type = RT::READ_ONLY_VIEW;
641 : var_spec.reference_name = outputs_name[idx];
642 9166 : grad_spec.reference_name = outputs_name[idx] + Var_Grad::grad_suffix;
643 973 : } else if (!node.getInputConnections().empty()) {
644 0 : grad_spec.request_type = var_spec.request_type = RT::UNIQUE;
645 : } else {
646 973 : var_spec.request_type = RT::PLACEHOLDER;
647 :
648 : #ifdef ENABLE_TEST
649 973 : grad_spec.request_type = RT::UNIQUE;
650 : #else
651 : grad_spec.request_type = RT::PLACEHOLDER;
652 : #endif
653 : }
654 5556 : inputs_v2.emplace_back(std::make_unique<Var_Grad>(
655 11112 : requestTensor_(var_spec, node.getExecutionOrder(), node.getName(),
656 5556 : tensor_pool, false, node.getTrainable()),
657 : is_train_mode
658 16663 : ? requestTensor_(grad_spec, node.getExecutionOrder(), node.getName(),
659 5551 : tensor_pool, false, node.getTrainable())
660 : : nullptr));
661 5556 : }
662 :
663 4435 : ret.reserve(inputs_dim.size());
664 : std::transform(inputs_v2.begin() + current_size, inputs_v2.end(),
665 : std::back_inserter(ret),
666 : [](auto const &elem) { return elem.get(); });
667 :
668 4435 : return ret;
669 4435 : }
670 :
671 : std::vector<unsigned int>
672 112 : Manager::getTensorExecutionOrders(const std::string &name, bool is_weight) {
673 :
674 24 : return is_weight ? weight_pool.getExecutionOrder(name)
675 136 : : tensor_pool.getExecutionOrder(name);
676 : }
677 :
678 : std::pair<unsigned int, unsigned int>
679 13426 : Manager::getMinMaxTensorExecutionOrder(const std::string &name,
680 : bool is_weight) {
681 :
682 4762 : auto orders = is_weight ? weight_pool.getExecutionOrder(name)
683 18188 : : tensor_pool.getExecutionOrder(name);
684 13426 : auto [min_, max_] = std::minmax_element(orders.begin(), orders.end());
685 26852 : return {*min_, *max_};
686 13426 : }
687 :
688 16 : unsigned int Manager::getSecondMaxTensorExecutionOrder(const std::string &name,
689 : bool is_weight) {
690 :
691 0 : auto orders = is_weight ? weight_pool.getExecutionOrder(name)
692 16 : : tensor_pool.getExecutionOrder(name);
693 16 : if (orders.size() < 2)
694 : throw std::runtime_error(
695 0 : "Requesting second last access with less than 2 exec orders");
696 : /** the tensor pool can record the same exec order multiple times */
697 16 : std::sort(orders.begin(), orders.end());
698 16 : orders.erase(std::unique(orders.begin(), orders.end()), orders.end());
699 16 : return orders[orders.size() - 2];
700 16 : }
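
Because the pool can record the same exec order several times, the second-to-last access is taken over distinct orders only. A tiny standalone illustration of the sort + unique idiom used above (hypothetical input values):

  #include <algorithm>
  #include <vector>

  // Returns the second-largest distinct value; the caller guarantees at
  // least two distinct values, matching the check in the function above.
  unsigned second_largest_distinct(std::vector<unsigned> orders) {
    std::sort(orders.begin(), orders.end());
    orders.erase(std::unique(orders.begin(), orders.end()), orders.end());
    return orders[orders.size() - 2];
  }
  // e.g. {3, 7, 7, 9} -> distinct {3, 7, 9} -> 7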
701 :
702 4860 : bool Manager::isFirstAccess(const std::string &name, unsigned current_execution,
703 : bool is_weight) {
704 : /// @todo add cache mechanism, e.g., sort when finalizing requests
705 4860 : return getMinMaxTensorExecutionOrder(name, is_weight).first ==
706 4860 : current_execution;
707 : }
708 :
709 4860 : bool Manager::isLastAccess(const std::string &name, unsigned current_execution,
710 : bool is_weight) {
711 : /// @todo add cache mechanism, e.g., sort when finalizing requests
712 4860 : return getMinMaxTensorExecutionOrder(name, is_weight).second ==
713 4860 : current_execution;
714 : }
715 :
716 16 : bool Manager::isSecondLastAccess(const std::string &name,
717 : unsigned current_execution, bool is_weight) {
718 : /// @todo add cache mechanism, e.g., sort when finalizing requests
719 16 : return getSecondMaxTensorExecutionOrder(name, is_weight) == current_execution;
720 : }
721 :
722 : /**
723 : * @brief Create tensors with the given spec
724 : *
725 : */
726 3722 : std::vector<Tensor *> Manager::requestWeightOptimizerVariables(
727 : const std::vector<TensorDim> &dims, const std::string &name,
728 : const std::string &suffix, const TensorLifespan &lifespan, bool is_grad_clip,
729 : bool is_mixed_precision, Initializer initializer) {
730 :
731 : std::vector<Tensor *> ret;
732 3722 : ret.reserve(dims.size());
733 :
734 : std::vector<unsigned int> exec;
735 3722 : exec.reserve(1);
736 3722 : if (is_grad_clip || is_mixed_precision) {
737 16 : exec.emplace_back(TensorPool::PERSIST_END_ORDER);
738 : } else {
739 3706 : exec.emplace_back(getMinMaxTensorExecutionOrder(name, true).second);
740 : }
741 :
742 : /// @note this assumes weight optimizer variables are treated as weights; if
743 : /// not, there is room to optimize the behavior below
744 4158 : for (unsigned int idx = 0; idx < dims.size(); idx++)
745 1308 : ret.push_back(weight_pool.request(name + suffix + std::to_string(idx),
746 : dims[idx], exec, lifespan, initializer));
747 :
748 3722 : return ret;
749 3722 : }
750 :
751 : std::vector<Weight *>
752 1235 : Manager::getWeights(const std::function<bool(const Weight *)> &condition) {
753 : std::vector<Weight *> conditional_weights;
754 :
755 10909 : for (auto &w : weights_v2) {
756 14516 : if (!condition || condition(w.get()))
757 4848 : conditional_weights.push_back(w.get());
758 : }
759 1235 : return conditional_weights;
760 0 : }
761 :
762 5400 : void Manager::flushCache() {
763 5400 : if (!fsu_lookahead) {
764 5400 : weight_pool.flushCache();
765 5400 : tensor_pool.flushCache();
766 : }
767 5400 : }
768 :
769 0 : bool Manager::checkLoadComplete(unsigned int order) {
770 :
771 : auto checkLoadCompleteAtPool = [](TensorPool &pool, unsigned int order) {
772 0 : return pool.checkLoadComplete(order);
773 : };
774 :
775 0 : if (exec_mode == ExecutionMode::TRAIN) {
776 0 : return checkLoadCompleteAtPool(weight_pool, order) &&
777 0 : checkLoadCompleteAtPool(tensor_pool, order);
778 : } else {
779 0 : return checkLoadCompleteAtPool(weight_pool, order);
780 : }
781 : }
782 :
783 0 : bool Manager::checkUnloadComplete(unsigned int order) {
784 : if (async_unload_tensor.count(order)) {
785 0 : auto &tasks = async_unload_tensor[order];
786 0 : std::unique_lock<std::mutex> lock(completed_unload_mutex);
787 0 : if (exec_mode == ExecutionMode::TRAIN) {
788 0 : auto w_fut = completed_unload_tensor[std::get<0>(tasks)].get_future();
789 0 : auto t_fut = completed_unload_tensor[std::get<1>(tasks)].get_future();
790 0 : lock.unlock();
791 0 : if (std::get<0>(tasks) != 0)
792 0 : w_fut.wait();
793 0 : if (std::get<1>(tasks) != 0)
794 0 : t_fut.wait();
795 : } else {
796 0 : auto w_fut = completed_unload_tensor[std::get<0>(tasks)].get_future();
797 0 : lock.unlock();
798 0 : if (std::get<0>(tasks) != 0)
799 0 : w_fut.wait();
800 : }
801 : async_unload_tensor.erase(order);
802 : }
803 0 : return true;
804 : }
805 :
806 0 : void Manager::LoadTensors(unsigned int order,
807 : unsigned int remainder_lookahead) {
808 :
809 0 : auto loadTensorsAsync = [&](TensorPool &pool, unsigned int order) {
810 0 : return pool.loadCacheExecAsync(
811 0 : order, [&](int id, TaskExecutor::CompleteStatus status,
812 : std::future<TaskExecutor::CompleteStatus> fut) {
813 0 : std::scoped_lock<std::mutex> lock(completed_load_mutex);
814 0 : completed_load_fut[id] = std::move(fut);
815 0 : });
816 0 : };
817 :
818 0 : auto enqueTasks = [&](unsigned int o) {
819 0 : auto load_weight = loadTensorsAsync(weight_pool, o);
820 0 : ml_logd("load weight is requested in LoadTensors with order - %d", o);
821 : int load_tensor = 0;
822 :
823 0 : if (exec_mode != ml::train::ExecutionMode::INFERENCE) {
824 0 : load_tensor = loadTensorsAsync(tensor_pool, o);
825 0 : ml_logd("load tensor is requested in LoadTensors with order - %d", o);
826 : }
827 0 : NNTR_THROW_IF(load_weight < 0 || load_tensor < 0, std::runtime_error)
828 : << "Fail to launch task";
829 0 : };
830 :
831 0 : if (order <= max_exec_order) {
832 0 : enqueTasks(order);
833 : }
834 0 : }
835 :
836 0 : void Manager::UnloadTensors(unsigned int order) {
837 :
838 0 : auto unloadTensorsAsync = [&](TensorPool &pool, unsigned int order) {
839 0 : return pool.flushCacheExecAsync(
840 0 : order, [&](int id, TaskExecutor::CompleteStatus status,
841 : std::future<TaskExecutor::CompleteStatus> fut) {
842 0 : std::scoped_lock<std::mutex> lock(completed_unload_mutex);
843 0 : completed_unload_tensor[id].set_value(true);
844 0 : });
845 0 : };
846 :
847 0 : auto enqueTasks = [&](unsigned int o) {
848 0 : if (async_unload_tensor.count(o)) {
849 0 : ml_logd("Task unloadTensors (%d) is in progress", o);
850 : return;
851 : }
852 0 : auto unload_weight = unloadTensorsAsync(weight_pool, o);
853 0 : ml_logd("unload weight is requested in UnLoadTensors with order - %d", o);
854 : int unload_tensor = 0;
855 0 : if (exec_mode != ml::train::ExecutionMode::INFERENCE) {
856 0 : unload_tensor = unloadTensorsAsync(tensor_pool, o);
857 0 : ml_logd("unload tensor is requested in UnLoadTensors with order - %d", o);
858 : }
859 0 : NNTR_THROW_IF(unload_weight < 0 || unload_tensor < 0, std::runtime_error)
860 : << "Faile to launch task";
861 0 : async_unload_tensor[o] = std::make_tuple(unload_weight, unload_tensor);
862 0 : };
863 :
864 0 : enqueTasks(order);
865 0 : }
866 :
867 94179 : void Manager::flushCacheExcept(unsigned int order) {
868 0 : auto loadAsync = [&](TensorPool &pool, unsigned int order) {
869 0 : return pool.loadCacheExecAsync(
870 :
871 0 : order, [&](int id, TaskExecutor::CompleteStatus status,
872 : std::future<TaskExecutor::CompleteStatus> fu) {
873 0 : std::scoped_lock<std::mutex> lock(completed_mutex);
874 0 : completed[id].set_value(true);
875 0 : });
876 94179 : };
877 :
878 0 : auto waitComplete = [&](unsigned int o) {
879 0 : auto &tasks = async_task_eos[o];
880 :
881 0 : std::unique_lock<std::mutex> lock(completed_mutex);
882 0 : auto w_fut = completed[std::get<0>(tasks)].get_future();
883 0 : auto t_fut = completed[std::get<1>(tasks)].get_future();
884 0 : lock.unlock();
885 :
886 0 : w_fut.wait();
887 0 : t_fut.wait();
888 :
889 0 : async_task_eos.erase(o);
890 0 : };
891 :
892 : // TODO: support for lookahead > 1 is required.
893 94179 : if (fsu_lookahead == 1) {
894 : if (async_task_eos.count(order) == 1)
895 0 : waitComplete(order);
896 :
897 0 : auto load_weight = loadAsync(weight_pool, order + 1);
898 0 : auto load_tensor = loadAsync(tensor_pool, order + 1);
899 :
900 0 : NNTR_THROW_IF(load_weight < 0 || load_tensor < 0, std::runtime_error)
901 : << "Failed to launch preloading task";
902 0 : async_task_eos[order + 1] = std::make_tuple(load_weight, load_tensor);
903 : } else {
904 94179 : weight_pool.flushCacheExcept(order);
905 94179 : tensor_pool.flushCacheExcept(order);
906 : }
907 94179 : }
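
The asynchronous load/unload paths above all follow the same completion-signaling pattern: the async request registers a callback keyed by a task id, the callback fulfills a promise under a mutex, and a later wait step blocks on the matching future before the order is reused. A generic sketch of just that pattern (hypothetical names, no nntrainer types; each id is assumed to complete exactly once):

  #include <future>
  #include <map>
  #include <mutex>

  std::mutex completed_mutex_example;
  std::map<int, std::promise<bool>> completed_example;

  // Completion callback: fulfill the promise for the finished task id.
  void on_task_complete(int id) {
    std::scoped_lock<std::mutex> lock(completed_mutex_example);
    completed_example[id].set_value(true);
  }

  // Before reusing resources tied to this id, block until it has completed.
  void wait_task_complete(int id) {
    std::unique_lock<std::mutex> lock(completed_mutex_example);
    std::future<bool> fut = completed_example[id].get_future();
    lock.unlock();
    fut.wait();
  }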
908 :
909 1301 : void Manager::finalizeTensorPool(TensorPool &pool, unsigned int start,
910 : unsigned int end) {
911 1301 : if (enable_optimizations) {
912 408 : if (exec_mode == ExecutionMode::INFERENCE && enable_fsu) {
913 : // @todo change to V3 and validate
914 0 : pool.finalize(OptimizedV1Planner(), start, end);
915 : } else {
916 408 : pool.finalize(OptimizedV1Planner(), start, end);
917 : }
918 : } else {
919 893 : pool.finalize(BasicPlanner(), start, end);
920 : }
921 1301 : }
922 :
923 0 : unsigned int Manager::inActive(unsigned int order) {
924 0 : return weight_pool.inActive(order);
925 : }
926 :
927 : } // namespace nntrainer
|