LCOV - code coverage report
Current view: top level - nntrainer/tensor - manager.cpp
Test:      coverage_filtered.info
Test Date: 2025-12-14 20:38:17

                 Coverage    Total    Hit
Lines:           61.5 %      361      222
Functions:       57.9 %      38       22

            Line data    Source code
       1              : // SPDX-License-Identifier: Apache-2.0
       2              : /**
       3              :  * Copyright (C) 2020 Parichay Kapoor <pk.kapoor@samsung.com>
       4              :  *
       5              :  * @file   manager.cpp
       6              :  * @date   2 Dec 2020
       7              :  * @brief  This is the NNtrainer manager for all weights, i/o and intermediate
       8              :  * tensors
       9              :  * @see    https://github.com/nnstreamer/nntrainer
      10              :  * @author Parichay Kapoor <pk.kapoor@samsung.com>
      11              :  * @author Jihoon Lee <jhoon.it.lee@samsung.com>
      12              :  * @bug    No known bugs except for NYI items
      13              :  *
      14              :  */
      15              : 
      16              : #ifdef __ANDROID__
      17              : #include <android/sharedmem.h>
      18              : #endif
      19              : 
      20              : #ifdef DEBUG
      21              : #include <cassert>
      22              : #endif
      23              : #include <fcntl.h>
      24              : #include <functional>
      25              : #include <limits>
      26              : #include <stdexcept>
      27              : #include <sys/stat.h>
      28              : #include <vector>
      29              : 
      30              : #if !defined(_WIN32)
      31              : #include <sys/mman.h>
      32              : #include <unistd.h>
      33              : #endif
      34              : 
      35              : #include <activation_layer.h>
      36              : #include <basic_planner.h>
      37              : #include <bn_layer.h>
      38              : #include <graph_node.h>
      39              : #include <grucell.h>
      40              : #include <layer_node.h>
      41              : #include <layer_normalization_layer.h>
      42              : #include <loss/cross_entropy_sigmoid_loss_layer.h>
      43              : #include <loss/cross_entropy_softmax_loss_layer.h>
      44              : #include <loss/mse_loss_layer.h>
      45              : #include <manager.h>
      46              : #include <multiout_layer.h>
      47              : #include <nntrainer_log.h>
      48              : #include <optimized_v1_planner.h>
      49              : #include <optimized_v2_planner.h>
      50              : #include <optimized_v3_planner.h>
      51              : #include <tensor_pool.h>
      52              : #include <tensor_wrap_specs.h>
      53              : #include <util_func.h>
      54              : #include <var_grad.h>
      55              : 
      56              : #include "utils/mman_windows.h"
      57              : 
      58              : namespace nntrainer {
      59              : 
      60            0 : MMapedMemory::MMapedMemory(size_t size, bool allocate_fd_) :
      61            0 :   fd(-1), buf(nullptr), buf_size(0), allocate_fd(allocate_fd_) {
      62              : 
      63              : #ifndef __ANDROID__
      64            0 :   if (allocate_fd) {
      65              :     /// @todo create a file in tmpfs and bind to memfs
       66              :     /// memfd_create is not available on a number of platforms, so this is
      67              :     /// commented
      68              :     // auto fd_ = memfd_create("", 0);
      69              :     // if (fd_ < 0) {
      70              :     //   throw std::runtime_error("[Manager] creating mem fd failed");
      71              :     // }
      72              :     // if (ftruncate(fd_, size) < 0) {
      73              :     //   throw std::runtime_error("[Manager] truncating fd failed");
      74              :     // }
      75            0 :     ml_logi("[MMapedMemory] fd creation is not supported in this platform");
      76            0 :     allocate_fd = false;
      77              :   }
      78              : #endif
      79              :   int fd_ = -1;
      80              :   void *buf_ = nullptr;
      81              : 
      82            0 :   if (allocate_fd) {
      83              : #ifdef __ANDROID__
       84              :     /// unfortunately, memfd_create is not supported before Android API level 30
      85              :     fd_ = ASharedMemory_create("", size);
      86              :     if (fd_ < 0) {
      87              :       throw std::runtime_error("[MMapedMemory] creating mem fd failed");
      88              :     }
      89              : 
      90              :     if (ASharedMemory_setProt(fd_, PROT_READ | PROT_WRITE) < 0) {
      91              :       // unlink / close the given fd here
      92              :       close(fd_);
      93              :       throw std::runtime_error("[MMapedMemory] Setting prot failed");
      94              :     }
      95              : 
      96              :     buf_ = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 0);
      97              : #endif
      98              :   } else {
      99            0 :     buf_ = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS,
     100              :                 fd_, 0);
     101              :   }
     102              : 
     103            0 :   if (buf_ == MAP_FAILED) {
     104              : #ifdef __ANDROID__
     105              :     if (fd_ != -1) {
     106              :       // unlink / close the given fd here
     107              :       close(fd_);
     108              :     }
     109              : #endif
     110              : 
     111            0 :     throw std::runtime_error("[MMapedMemory] mmap failed");
     112              :   }
     113              : 
     114            0 :   fd = fd_;
     115            0 :   buf = buf_;
     116            0 :   buf_size = size;
     117              : 
     118            0 :   ml_logd("[MMapedMemory] memory acquired size: %zu, fd: %d, addr: %p",
     119              :           buf_size, fd, buf);
     120            0 : }
     121              : 
     122            0 : MMapedMemory::~MMapedMemory() noexcept {
     123              : #ifdef DEBUG
     124              :   assert(buf_size > 0 && fd > 0);
     125              : #endif
     126              : 
     127            0 :   if (fd != -1) {
     128            0 :     if (close(fd) < 0) {
      129            0 :       ml_logw("[MMapedMemory] closing fd failed on destruction; please check");
     130              :     }
     131              :   }
     132              : 
     133            0 :   if (buf != nullptr) {
     134            0 :     if (munmap(buf, buf_size) < 0) {
      135            0 :       ml_logw("[MMapedMemory] munmap failed on destruction; please check");
     136              :     }
     137              :   }
     138              : 
     139              :   /// keeping the invariant although this is not necessary as of now
     140            0 :   fd = -1;
     141            0 :   buf = nullptr;
     142            0 :   buf_size = 0;
     143            0 :   ml_logd("[MMapedMemory] buf released");
     144            0 : }
     145              : 
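
  The pair of functions above makes MMapedMemory an RAII wrapper around an mmap
  region: the constructor maps either an fd-backed shared region (Android only)
  or an anonymous private region, and the destructor closes the fd and unmaps
  the buffer. A minimal usage sketch, assuming nothing beyond the constructor
  and destructor shown above (the call site itself is hypothetical):

      {
        // 4 KiB scratch region; with allocate_fd_ == false this falls back to an
        // anonymous MAP_PRIVATE | MAP_ANONYMOUS mapping on non-Android platforms
        nntrainer::MMapedMemory scratch(4096, /*allocate_fd_=*/false);
        // ... use the mapping while `scratch` is in scope ...
      } // ~MMapedMemory() closes the fd (if any) and munmaps the buffer
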
     146            1 : void Manager::reinitialize() {
     147              :   inputs_v2.clear();
     148              :   outputs_v2.clear();
     149              :   tensors_v2.clear();
     150            1 :   tensor_pool.reinitialize();
     151            1 : }
     152              : 
     153         1300 : void Manager::allocateWeights(unsigned int max_exec_order_, bool init) {
     154         1300 :   max_exec_order = max_exec_order_;
     155         1300 :   if (!weight_pool.isAllocated()) {
     156          618 :     finalizeTensorPool(weight_pool, 0, max_exec_order_);
     157          618 :     weight_pool.allocate(init);
     158              :   }
     159         1300 : }
     160              : 
     161         1534 : void Manager::deallocateWeights() { weight_pool.deallocate(); }
     162              : 
     163        21671 : static Tensor *requestTensor_(const TensorSpecV2 &spec,
     164              :                               const GraphNode::ExecutionOrder &exec_order,
     165              :                               const std::string &scope, TensorPool &tp,
     166              :                               bool expose, bool trainable) {
     167              :   using RT = TensorSpecV2::RequestType;
     168              :   using LS = TensorLifespan;
     169        21671 :   NNTR_THROW_IF(spec.request_type == RT::MAYBE_MODIFYING_VIEW,
     170              :                 std::invalid_argument)
     171              :     << "Modifying view cannot be requested, the request type has to be "
     172              :        "delegated to either view or unique";
     173              : 
     174        21671 :   auto [forward, calc_grad, calc_deriv, apply_grad] = exec_order;
     175              : 
     176        21671 :   std::vector<unsigned> order = spec.additional_exec_order;
     177        21671 :   if (expose) {
     178            0 :     order.push_back(TensorPool::PERSIST_END_ORDER);
     179              :   }
     180              : 
     181        21671 :   const auto name = scope + ":" + spec.name;
     182        21671 :   if (enum_class_or(spec.ls, LS::FORWARD_FUNC_LIFESPAN) == spec.ls) {
     183        10908 :     order.push_back(forward);
     184              :   }
     185        21671 :   if (enum_class_or(spec.ls, LS::CALC_GRAD_LIFESPAN) == spec.ls) {
     186         9307 :     order.push_back(calc_grad);
     187              :   }
     188        21671 :   if (enum_class_or(spec.ls, LS::CALC_DERIV_LIFESPAN) == spec.ls) {
     189        12226 :     order.push_back(calc_deriv);
     190              :   }
     191        21671 :   if (enum_class_or(spec.ls, LS::CALC_AGRAD_LIFESPAN) == spec.ls) {
     192            0 :     order.push_back(apply_grad);
     193              :   }
     194              : 
     195        21671 :   switch (spec.request_type) {
     196         1604 :   case RT::PLACEHOLDER:
     197         1604 :     return tp.placeholder(name, spec.dim);
     198         8416 :   case RT::UNIQUE:
     199         8416 :     return tp.request(name, spec.dim, order, spec.ls, spec.initializer);
     200            0 :   case RT::SHARED:
     201            0 :     return tp.requestOrExtend(name, spec.dim, order, spec.ls, spec.initializer);
     202        11651 :   case RT::READ_ONLY_VIEW:
     203        11651 :     return tp.view(name, spec.reference_name, spec.dim, order, spec.ls);
     204            0 :   case RT::MAYBE_MODIFYING_VIEW:
     205              :   default:
     206            0 :     throw std::logic_error("requestTensor_ should not reach here");
     207              :   }
     208              : 
     209              :   return nullptr;
     210        21671 : }
     211              : 
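
  A note on the lifespan checks in requestTensor_ above: enum_class_or(spec.ls, X)
  == spec.ls holds exactly when lifespan X is fully contained in the requested
  lifespan spec.ls, so each matching branch appends the corresponding execution
  order before the tensor is requested from the pool. A hypothetical helper
  expressing the same containment test (the name `covers` is illustrative only,
  not part of the codebase):

      // Hypothetical helper mirroring the lifespan checks in requestTensor_.
      static bool covers(TensorLifespan ls, TensorLifespan span) {
        // true when every bit of `span` is already set in `ls`
        return enum_class_or(ls, span) == ls;
      }
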
     212         5352 : Var_Grad *Manager::requestTensor(const VarGradSpecV2 &spec,
     213              :                                  TensorGroupType identify_as,
     214              :                                  const GraphNode::ExecutionOrder &exec_order,
     215              :                                  const std::string &scope, bool expose_var,
     216              :                                  bool expose_grad) {
     217         5352 :   NNTR_THROW_IF(identify_as == TensorGroupType::WEIGHT, std::invalid_argument)
     218              :     << "requestTensor with var grad spec cannot be identified as weights, use "
     219              :        "requestTensor with weight spec instead";
     220              : 
     221         5352 :   NNTR_THROW_IF(identify_as == TensorGroupType::INPUT or
     222              :                   identify_as == TensorGroupType::TENSORS,
     223              :                 nntrainer::exception::not_supported)
      224              :     << "Currently, input and tensors group types are not yet implemented; use "
      225              :        "requestInputs() / requestTensors() instead";
     226              : 
     227         5352 :   bool is_train_mode = (exec_mode == ExecutionMode::TRAIN) ? true : false;
     228              : 
     229         5352 :   Tensor *var = requestTensor_(spec.variable_spec, exec_order, scope,
     230         5352 :                                tensor_pool, expose_var, false);
     231         5216 :   Tensor *grad = (spec.gradient_spec && is_train_mode)
     232        10564 :                    ? requestTensor_(*spec.gradient_spec, exec_order, scope,
     233              :                                     tensor_pool, expose_grad, false)
     234         5352 :                    : nullptr;
     235              : 
      236              :   /// @note as only identify_as == TensorGroupType::output is supported, this
      237              :   /// only saves to outputs for now
     238        10704 :   outputs_v2.push_back(std::make_unique<Var_Grad>(var, grad));
     239              : 
     240         5352 :   return outputs_v2.back().get();
     241              : }
     242              : 
     243         4435 : std::vector<Var_Grad *> Manager::requestTensors(
     244              :   const std::vector<VarGradSpecV2> &specs, TensorGroupType identify_as,
     245              :   const GraphNode::ExecutionOrder &exec_order, const std::string &scope,
     246              :   bool expose_var, bool expose_grad) {
     247              :   std::vector<Var_Grad *> ret;
     248         4435 :   ret.reserve(specs.size());
     249         9787 :   for (auto &spec : specs) {
     250         5352 :     ret.push_back(requestTensor(spec, identify_as, exec_order, scope,
     251              :                                 expose_var, expose_grad));
     252              :   }
     253              : 
     254         4435 :   return ret;
     255            0 : }
     256              : 
     257              : /**
     258              :  * @brief Allocate memory for all the managed tensors
     259              :  */
     260          683 : void Manager::allocateTensors(unsigned int max_exec_order_) {
     261          683 :   allocateWeights(max_exec_order_);
     262              : 
     263          683 :   if (!tensor_pool.isAllocated()) {
     264          683 :     finalizeTensorPool(tensor_pool, 0, max_exec_order_);
     265          683 :     tensor_pool.allocate();
     266              :   }
     267          683 : }
     268              : 
     269              : /**
     270              :  * @brief Deallocate memory for all the managed tensors
     271              :  */
     272         2484 : void Manager::deallocateTensors(bool dealloc_weights) {
     273         2484 :   if (dealloc_weights) {
     274         1534 :     deallocateWeights();
     275              :   }
     276              : 
     277         2484 :   tensor_pool.deallocate();
     278         2484 : }
     279              : 
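
  Taken together, allocateTensors()/deallocateTensors() above form the pool
  lifecycle used by the network graph: finalize the memory plan once, allocate,
  run, then release. A hypothetical call site illustrating that order (the
  `manager` and `max_exec_order` names are placeholders):

      // Illustrative lifecycle sketch, not from this file.
      manager.allocateTensors(max_exec_order);              // plans + allocates weight and tensor pools
      // ... execute forward / backward passes up to max_exec_order ...
      manager.deallocateTensors(/*dealloc_weights=*/false); // keep weights resident between runs
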
     280              : #ifdef LAYER_V1
     281              : void Manager::initializeTensorsInference(unsigned int max_exec_order_) {
     282              :   /**
     283              :    * A single buffer (shared_inout) provides memory for inputs and outputs of a
      284              :    * layer. Further, the output of layer i shares memory with the input of layer
      285              :    * i+1. So, alternate layers allocate memory from either the start or the end
      286              :    * of the buffer, and use_first_last tracks this.
     287              :    *
     288              :    * @note Label for the last layer is not initialized in inference.
     289              :    * @note Input for the first layer is not initialized in inference.
     290              :    */
     291              :   // Initialize shared input/output memory for inference
      292              :   // @note Memory for the label is not allocated here as inference does not use labels
     293              :   if (enable_inference_inout_memory_opt)
     294              :     shared_inout = Tensor(TensorDim({max_shared_inout}), false);
     295              : 
     296              :   bool use_first_last = 0;
     297              :   for (unsigned int idx = 0; idx < in_outs.size(); idx++) {
     298              :     auto &l_io = in_outs[idx];
     299              :     unsigned int offset = 0;
     300              :     bool is_first_layer = idx == 0;
      301              :     // first layer. This is the first entry in in_outs. Inference() will
      302              :     // override the input tensors of the first layer.
     303              :     if (idx > 0 && is_flat_type[idx])
     304              :       use_first_last = 1 - use_first_last;
     305              : 
     306              :     // In inference mode, do not allocate the memory for the input of the
     307              :     // first layer. These is the first entry in the in_outs. Inference() will
     308              :     // override input tensors of the first layer
     309              :     if (is_first_layer)
     310              :       continue;
     311              : 
     312              :     for (auto &io : l_io) {
     313              :       Tensor shared_inout_cur = Tensor();
     314              :       if (enable_inference_inout_memory_opt) {
     315              :         // if optimized
     316              :         if (use_first_last) {
      317              :           // Create tensor from the front of the shared tensor
     318              :           shared_inout_cur =
     319              :             shared_inout.getSharedDataTensor(io->getDim(), offset);
     320              :         } else {
      321              :           // Create tensor from the back of the shared tensor
     322              :           shared_inout_cur = shared_inout.getSharedDataTensor(
     323              :             io->getDim(),
     324              :             max_shared_inout - io->getDim().getDataLen() - offset);
     325              :         }
     326              :         offset += io->getDim().getDataLen();
     327              :       }
     328              :       io->initialize(shared_inout_cur, Tensor(), false);
     329              :     }
     330              :     use_first_last = 1 - use_first_last;
     331              :   }
     332              : }
     333              : 
     334              : void Manager::initializeTensorsTrain(unsigned int max_exec_order_) {
     335              :   // Initialize gradients
     336              :   initializeGradients();
     337              : 
     338              :   // Initialize shared derivative memory
     339              :   if (max_derivative_size > 0 && enable_activation_memory_opt)
     340              :     shared_deriv = Tensor(TensorDim({max_derivative_size}), false);
     341              :   for (unsigned int idx = 0; idx < in_outs.size(); idx++) {
     342              :     auto &l_io = in_outs[idx];
     343              :     unsigned int offset = 0;
     344              :     bool is_last_layer = idx == in_outs.size() - 1;
     345              : 
     346              :     for (auto &io : l_io) {
     347              :       // Last layer requires separate memory allocations for output and label
     348              :       // (deriv)
     349              :       if (enable_derivative_memory_opt && !is_last_layer) {
     350              :         // Training Mode with optimizations
     351              :         if (enable_activation_memory_opt &&
     352              :             (is_rnn_type[idx] || is_act_type[idx])) {
     353              :           io->initialize(
     354              :             Tensor(), shared_deriv.getSharedDataTensor(io->getDim(), offset));
     355              :           offset += io->getDim().getDataLen();
     356              :         } else {
     357              :           io->initializeShared();
     358              :         }
     359              : 
     360              :       } else {
     361              :         // Training Mode without optimizations
     362              :         io->initialize(Tensor(), Tensor(), true);
     363              :       }
     364              :     }
     365              :   }
     366              : }
     367              : #endif
     368              : 
     369              : /**
     370              :  * @brief     Create weights with the given spec
     371              :  *
     372              :  */
     373         4407 : std::vector<Weight *> Manager::requestWeights(
     374              :   const GraphNode &node, const std::vector<Weight::Spec> &weights_spec,
     375              :   bool trainable, const std::vector<std::string> &shared_names) {
     376              :   const auto [forwarding_order, calcGradient_order, calcDerivative_order,
     377         4407 :               applyGradient_order] = node.getExecutionOrder();
     378              : 
     379              :   std::vector<unsigned int> default_var_exec_order(
     380         4407 :     {forwarding_order, calcDerivative_order});
     381              : 
     382              :   /**
      383              :    *  TODO: This needs to be fixed. calcDerivative does not need the gradient.
      384              :    *  However, the current implementation of loss needs the gradient computation,
      385              :    *  and therefore, if we remove the calcDerivative order, tests fail.
     386              :    */
     387              :   TensorLifespan var_ls;
     388         4407 :   if (exec_mode != ExecutionMode::INFERENCE) {
     389              :     var_ls = TensorLifespan::MAX_LIFESPAN;
     390              :   } else {
     391            5 :     if (enable_fsu) {
     392              :       var_ls = TensorLifespan::FORWARD_FUNC_LIFESPAN;
     393              :     } else {
     394              :       var_ls = TensorLifespan::FORWARD_INFER_LIFESPAN;
     395              :     }
     396              :   }
     397              : 
     398              :   TensorLifespan grad_ls = TensorLifespan::BACKWARD_FUNC_LIFESPAN;
     399              : 
     400              :   std::vector<Weight *> ret;
     401              :   size_t current_size = weights_v2.size();
     402              : 
     403         9249 :   for (unsigned int i = 0; i < weights_spec.size(); ++i) {
     404              :     auto &[dim_v, dim_g, t_initializer, w_reg, w_reg_const, decay,
     405              :            clip_by_global_norm, need_gradient, name, axis, loss_scale, is_mixed,
     406              :            is_virtual] = weights_spec.at(i);
     407              : 
     408              :     std::vector<unsigned int> var_exec_order;
     409        14510 :     for (auto order : default_var_exec_order) {
     410         9676 :       var_exec_order.push_back(order);
     411         9676 :       if (exec_mode == ExecutionMode::INFERENCE)
     412              :         break;
     413              :     }
     414              :     // auto var_exec_order = default_var_exec_order;
     415              :     std::vector<unsigned int> grad_exec_order;
     416              : 
     417         4842 :     if (trainable) {
     418         4802 :       var_exec_order.reserve(var_exec_order.size() + 2);
     419         4802 :       var_exec_order.push_back(calcGradient_order);
     420         4802 :       var_exec_order.push_back(applyGradient_order);
     421         4802 :       grad_exec_order.push_back(calcGradient_order);
     422         4802 :       grad_exec_order.push_back(applyGradient_order);
     423              :     }
     424              : 
     425              :     /**
      426              :      * If the weight is supposed to be clipped by global norm, extend its exec
      427              :      * order with the max exec order, where it will be used for clipping and then
      428              :      * applied to the weight.
     429              :      */
     430         9668 :     if (Weight::isGradientClipByGlobalNorm(clip_by_global_norm) ||
     431         4826 :         isMixedPrecision()) {
     432           16 :       grad_exec_order.push_back(TensorPool::PERSIST_END_ORDER);
     433              :       // TODO: We need double check if it is OK not to add PERSIST_END_ORDER
     434              :       // here or add other conditions
     435              :       // var_exec_order.push_back(TensorPool::PERSIST_END_ORDER);
     436              :     }
     437              : 
     438         4842 :     Tensor *var = nullptr, *grad = nullptr, *var32 = nullptr;
     439         4842 :     bool is_dependent = !shared_names.empty();
     440         4842 :     if (is_dependent) {
     441              :       /// shared_name is used and the original name is discarded
     442              :       const auto &shared_name = shared_names.at(i);
     443              :       /** case when shared names are given */
     444         1648 :       var = weight_pool.requestOrExtend(shared_name, dim_v, var_exec_order,
     445              :                                         var_ls, t_initializer);
     446         1648 :       if (trainable && need_gradient) {
      447              :         /** We cannot use tensor scheduling for the weight gradient if the
      448              :          * weight is shared. Weight sharing means the gradient is no longer
      449              :          * temporary for each layer, so it cannot simply be overwritten.
     450              :          */
     451         1216 :         grad = tensor_pool.requestOrExtend(shared_name + Var_Grad::grad_suffix,
     452              :                                            dim_g, grad_exec_order, grad_ls,
     453         1216 :                                            Initializer::ZEROS);
     454              : 
     455         1216 :         if (var->getDataType() != ml::train::TensorDim::DataType::FP32) {
     456            0 :           TensorDim var32_dim(dim_v);
     457              :           var32_dim.setDataType(ml::train::TensorDim::DataType::FP32);
     458              :           std::vector<unsigned int> var32_exec_order;
     459            0 :           var32_exec_order.push_back(TensorPool::PERSIST_END_ORDER);
     460              : 
     461            0 :           var32 = weight_pool.requestOrExtend(shared_name + ":var32", var32_dim,
     462              :                                               var32_exec_order, var_ls,
     463            0 :                                               Initializer::ZEROS);
     464            0 :         }
     465              :       }
     466              :     } else {
     467              :       /** case requesting fresh weights */
     468         3194 :       if (exec_mode == ExecutionMode::INFERENCE && enable_fsu) {
     469            0 :         for (unsigned int i = 0; i < fsu_lookahead; ++i) {
     470            0 :           int lah_order = (forwarding_order - (fsu_lookahead - i));
     471            0 :           var_exec_order.push_back(std::max(lah_order, 0));
     472              :         }
     473              :       }
     474         3194 :       if (is_virtual) {
     475            0 :         var = weight_pool.request(name, dim_v, var_exec_order,
     476              :                                   TensorLifespan::VIRTUAL, t_initializer);
     477              :       } else {
     478         3194 :         var = weight_pool.request(name, dim_v, var_exec_order, var_ls,
     479              :                                   t_initializer);
     480              :       }
     481              :       // }
     482              : 
     483         3194 :       if (trainable && need_gradient) {
      484              :         /** is_wgrad is a flag that is true when this is the gradient tensor
      485              :          * of a weight. If it is true, the memory planner schedules based on it
      486              :          * to reduce memory.
     487              :          */
     488              :         bool is_wgrad = true;
     489              :         //        if (Weight::isGradientClipByGlobalNorm(clip_by_global_norm))
     490              :         //          is_wgrad = false;
     491         3116 :         grad = tensor_pool.request(name + Var_Grad::grad_suffix, dim_g,
     492         3116 :                                    grad_exec_order, grad_ls, Initializer::ZEROS,
     493              :                                    is_wgrad);
     494         3116 :         if (var->getDataType() != ml::train::TensorDim::DataType::FP32) {
     495            0 :           TensorDim var32_dim(dim_v);
     496              :           var32_dim.setDataType(ml::train::TensorDim::DataType::FP32);
     497              :           std::vector<unsigned int> var32_exec_order;
     498            0 :           var32_exec_order.push_back(TensorPool::PERSIST_END_ORDER);
     499            0 :           var32 =
     500            0 :             weight_pool.request(name + ":var32", var32_dim, var32_exec_order,
     501            0 :                                 var_ls, Initializer::ZEROS);
     502            0 :         }
     503              :       }
     504              :     }
     505              : 
     506         4842 :     weights_v2.emplace_back(std::make_unique<Weight>(
     507              :       var, grad, var32, w_reg, w_reg_const, decay, is_dependent,
     508              :       clip_by_global_norm, axis, loss_scale, is_mixed));
     509         4842 :   }
     510              : 
     511              :   std::transform(weights_v2.begin() + current_size, weights_v2.end(),
     512              :                  std::back_inserter(ret),
     513              :                  [](auto const &elem) { return elem.get(); });
     514         4407 :   return ret;
     515         4407 : }
     516              : 
     517              : /**
     518              :  * @brief     Create tensors with the given spec
     519              :  *
     520              :  */
     521         4435 : std::vector<Var_Grad *> Manager::requestTensors(
     522              :   const GraphNode &node, const std::vector<Var_Grad::Spec> &tensors_spec,
     523              :   bool trainable, const std::vector<std::string> &shared_names) {
     524              :   const auto [forwarding_order, calcGradient_order, calcDerivative_order,
     525         4435 :               applyGradient_order] = node.getExecutionOrder();
     526              : 
     527              :   std::vector<Var_Grad *> ret;
     528              :   size_t current_size = tensors_v2.size();
     529         4435 :   bool is_train_mode = (exec_mode == ExecutionMode::TRAIN) ? true : false;
     530              : 
     531         7074 :   for (unsigned int i = 0; i < tensors_spec.size(); ++i) {
     532              :     auto const &[dim, t_init, need_grad, name, tspan, t_engine] =
     533              :       tensors_spec.at(i);
     534              : 
     535              :     std::vector<unsigned int> var_exec_order;
     536              :     std::vector<unsigned int> grad_exec_order;
     537              : 
     538              :     /** usage for tensors */
     539         2639 :     if (enum_class_logical_and(tspan, TensorLifespan::FORWARD_FUNC_LIFESPAN))
     540         2292 :       var_exec_order.push_back(forwarding_order);
     541              : 
     542              :     /** usage for tensors gradient in backwarding */
     543         2639 :     if (trainable && is_train_mode &&
     544         2637 :         enum_class_logical_and(tspan, TensorLifespan::CALC_GRAD_LIFESPAN)) {
     545         2215 :       var_exec_order.push_back(calcGradient_order);
     546         2215 :       grad_exec_order.push_back(calcGradient_order);
     547              :     }
     548              : 
     549         2639 :     if (is_train_mode &&
     550         2639 :         enum_class_logical_and(tspan, TensorLifespan::CALC_DERIV_LIFESPAN)) {
     551         2583 :       var_exec_order.push_back(calcDerivative_order);
     552         2583 :       grad_exec_order.push_back(calcDerivative_order);
     553              :     }
     554              : 
     555         2639 :     if (trainable && is_train_mode &&
     556         2637 :         enum_class_logical_and(tspan, TensorLifespan::CALC_AGRAD_LIFESPAN)) {
     557         2215 :       var_exec_order.push_back(applyGradient_order);
     558         2215 :       grad_exec_order.push_back(applyGradient_order);
     559              :     }
     560              : 
     561              :     bool is_dependent = !shared_names.empty();
     562         2639 :     Tensor *var = nullptr, *grad = nullptr;
     563         2639 :     if (is_dependent) {
     564              :       const auto &shared_name = shared_names.at(i);
     565          488 :       var = tensor_pool.requestOrExtend(shared_name, dim, var_exec_order, tspan,
     566              :                                         t_init);
     567          488 :       if (need_grad && tspan > TensorLifespan::FORWARD_FUNC_LIFESPAN) {
     568          488 :         grad = tensor_pool.requestOrExtend(shared_name + Var_Grad::grad_suffix,
     569              :                                            dim, grad_exec_order, tspan,
     570          976 :                                            Initializer::ZEROS);
     571              :       }
     572              :     } else {
     573         2151 :       var = tensor_pool.request(name, dim, var_exec_order, tspan, t_init);
     574         2151 :       if (is_train_mode && need_grad &&
     575         1088 :           tspan > TensorLifespan::FORWARD_FUNC_LIFESPAN) {
     576         1088 :         grad = tensor_pool.request(name + Var_Grad::grad_suffix, /// name
     577              :                                    dim, grad_exec_order, tspan,
     578         2176 :                                    Initializer::ZEROS /// tensor initializer
     579              :         );
     580              :       }
     581              :     }
     582              : 
     583         2639 :     tensors_v2.emplace_back(std::make_unique<Var_Grad>(var, grad));
     584         2639 :   }
     585              : 
     586              :   std::transform(tensors_v2.begin() + current_size, tensors_v2.end(),
     587              :                  std::back_inserter(ret),
     588              :                  [](auto const &elem) { return elem.get(); });
     589         4435 :   return ret;
     590            0 : }
     591              : 
     592              : /**
     593              :  * @brief     Create tensors with the given spec
     594              :  */
     595              : std::vector<Var_Grad *>
     596         4435 : Manager::requestInputs(const GraphNode &node,
     597              :                        const std::vector<TensorDim> &inputs_dim,
     598              :                        const std::vector<std::string> &outputs_name) {
     599              :   using RT = TensorSpecV2::RequestType;
     600              : 
     601         4435 :   bool is_train_mode = exec_mode == ExecutionMode::TRAIN;
     602              : 
     603         4435 :   TensorSpecV2 var_common_spec, grad_common_spec;
     604         4435 :   if (is_train_mode) {
     605         4430 :     var_common_spec.ls = TensorLifespan::FORWARD_GRAD_LIFESPAN;
     606              :   } else {
     607            5 :     var_common_spec.ls = TensorLifespan::FORWARD_FUNC_LIFESPAN;
     608              :   }
     609              : 
     610         4435 :   grad_common_spec.ls = TensorLifespan::CALC_DERIV_LIFESPAN;
     611              :   /// @todo handle this inside layer
     612         8870 :   if (node.getType() == ActivationLayer::type or
     613         8466 :       node.getType() == MultiOutLayer::type or
     614         8082 :       node.getType() == BatchNormalizationLayer::type or
     615        12283 :       node.getType() == LayerNormalizationLayer::type or !node.getTrainable())
     616          996 :     var_common_spec.ls = TensorLifespan::FORWARD_FUNC_LIFESPAN;
     617              : 
     618         8870 :   if (node.getType() == MSELossLayer::type or
     619         8870 :       node.getType() == CrossEntropySoftmaxLossLayer::type or
     620         8256 :       node.getType() == CrossEntropySigmoidLossLayer::type)
     621          627 :     var_common_spec.ls = TensorLifespan::FORWARD_DERIV_LIFESPAN;
     622              : 
     623         4435 :   if (node.getType() == GRUCellLayer::type) {
     624           32 :     grad_common_spec.ls = TensorLifespan::CALC_GRAD_DERIV_LIFESPAN;
     625              :   }
     626              : 
     627              :   std::vector<Var_Grad *> ret;
     628              :   size_t current_size = inputs_v2.size();
     629              : 
     630         9991 :   for (unsigned int idx = 0; idx < inputs_dim.size(); idx++) {
     631         5556 :     TensorSpecV2 var_spec = var_common_spec, grad_spec = grad_common_spec;
     632              : 
     633        11112 :     var_spec.name = std::string("input") + std::to_string(idx);
     634         5556 :     var_spec.dim = inputs_dim[idx];
     635              : 
     636        11112 :     grad_spec.name = var_spec.name + Var_Grad::grad_suffix;
     637         5556 :     grad_spec.dim = inputs_dim[idx];
     638              : 
     639         5556 :     if (!outputs_name.empty()) {
     640         4583 :       grad_spec.request_type = var_spec.request_type = RT::READ_ONLY_VIEW;
     641              :       var_spec.reference_name = outputs_name[idx];
     642         9166 :       grad_spec.reference_name = outputs_name[idx] + Var_Grad::grad_suffix;
     643          973 :     } else if (!node.getInputConnections().empty()) {
     644            0 :       grad_spec.request_type = var_spec.request_type = RT::UNIQUE;
     645              :     } else {
     646          973 :       var_spec.request_type = RT::PLACEHOLDER;
     647              : 
     648              : #ifdef ENABLE_TEST
     649          973 :       grad_spec.request_type = RT::UNIQUE;
     650              : #else
     651              :       grad_spec.request_type = RT::PLACEHOLDER;
     652              : #endif
     653              :     }
     654         5556 :     inputs_v2.emplace_back(std::make_unique<Var_Grad>(
     655        11112 :       requestTensor_(var_spec, node.getExecutionOrder(), node.getName(),
     656         5556 :                      tensor_pool, false, node.getTrainable()),
     657              :       is_train_mode
     658        16663 :         ? requestTensor_(grad_spec, node.getExecutionOrder(), node.getName(),
     659         5551 :                          tensor_pool, false, node.getTrainable())
     660              :         : nullptr));
     661         5556 :   }
     662              : 
     663         4435 :   ret.reserve(inputs_dim.size());
     664              :   std::transform(inputs_v2.begin() + current_size, inputs_v2.end(),
     665              :                  std::back_inserter(ret),
     666              :                  [](auto const &elem) { return elem.get(); });
     667              : 
     668         4435 :   return ret;
     669         4435 : }
     670              : 
     671              : std::vector<unsigned int>
     672          112 : Manager::getTensorExecutionOrders(const std::string &name, bool is_weight) {
     673              : 
     674           24 :   return is_weight ? weight_pool.getExecutionOrder(name)
     675          136 :                    : tensor_pool.getExecutionOrder(name);
     676              : }
     677              : 
     678              : std::pair<unsigned int, unsigned int>
     679        13426 : Manager::getMinMaxTensorExecutionOrder(const std::string &name,
     680              :                                        bool is_weight) {
     681              : 
     682         4762 :   auto orders = is_weight ? weight_pool.getExecutionOrder(name)
     683        18188 :                           : tensor_pool.getExecutionOrder(name);
     684        13426 :   auto [min_, max_] = std::minmax_element(orders.begin(), orders.end());
     685        26852 :   return {*min_, *max_};
     686        13426 : }
     687              : 
     688           16 : unsigned int Manager::getSecondMaxTensorExecutionOrder(const std::string &name,
     689              :                                                        bool is_weight) {
     690              : 
     691            0 :   auto orders = is_weight ? weight_pool.getExecutionOrder(name)
     692           16 :                           : tensor_pool.getExecutionOrder(name);
     693           16 :   if (orders.size() < 2)
     694              :     throw std::runtime_error(
     695            0 :       "Requesting second last access with less than 2 exec orders");
      696              :   /** the tensor pool exec orders can contain the same exec order multiple times */
     697           16 :   std::sort(orders.begin(), orders.end());
     698           16 :   orders.erase(std::unique(orders.begin(), orders.end()), orders.end());
     699           16 :   return orders[orders.size() - 2];
     700           16 : }
     701              : 
     702         4860 : bool Manager::isFirstAccess(const std::string &name, unsigned current_execution,
     703              :                             bool is_weight) {
     704              :   /// @todo add cache mechanism, eg) sort at finalizing requesting
     705         4860 :   return getMinMaxTensorExecutionOrder(name, is_weight).first ==
     706         4860 :          current_execution;
     707              : }
     708              : 
     709         4860 : bool Manager::isLastAccess(const std::string &name, unsigned current_execution,
     710              :                            bool is_weight) {
     711              :   /// @todo add cache mechanism, eg) sort at finalizing requesting
     712         4860 :   return getMinMaxTensorExecutionOrder(name, is_weight).second ==
     713         4860 :          current_execution;
     714              : }
     715              : 
     716           16 : bool Manager::isSecondLastAccess(const std::string &name,
     717              :                                  unsigned current_execution, bool is_weight) {
     718              :   /// @todo add cache mechanism, eg) sort at finalizing requesting
     719           16 :   return getSecondMaxTensorExecutionOrder(name, is_weight) == current_execution;
     720              : }
     721              : 
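
  The three predicates above are thin wrappers over the min/max (or second-max)
  execution order of a pooled tensor. A hypothetical call site showing how they
  are typically consulted while stepping through execution orders (the tensor
  name and exec_order below are placeholders):

      // Illustrative sketch, not from this file.
      if (manager.isFirstAccess("fc1:weight", exec_order, /*is_weight=*/true)) {
        // first touch at this order: make sure the tensor is resident
      }
      if (manager.isLastAccess("fc1:weight", exec_order, /*is_weight=*/true)) {
        // last touch at this order: the tensor may be flushed or offloaded afterwards
      }
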
     722              : /**
     723              :  * @brief     Create tensors with the given spec
     724              :  *
     725              :  */
     726         3722 : std::vector<Tensor *> Manager::requestWeightOptimizerVariables(
     727              :   const std::vector<TensorDim> &dims, const std::string &name,
     728              :   const std::string &suffix, const TensorLifespan &lifespan, bool is_grad_clip,
     729              :   bool is_mixed_precision, Initializer initializer) {
     730              : 
     731              :   std::vector<Tensor *> ret;
     732         3722 :   ret.reserve(dims.size());
     733              : 
     734              :   std::vector<unsigned int> exec;
     735         3722 :   exec.reserve(1);
     736         3722 :   if (is_grad_clip || is_mixed_precision) {
     737           16 :     exec.emplace_back(TensorPool::PERSIST_END_ORDER);
     738              :   } else {
     739         3706 :     exec.emplace_back(getMinMaxTensorExecutionOrder(name, true).second);
     740              :   }
     741              : 
      742              :   /// @note this assumes weight optimizer variables are treated as weights; if
      743              :   /// not, there is room to optimize the behavior below
     744         4158 :   for (unsigned int idx = 0; idx < dims.size(); idx++)
     745         1308 :     ret.push_back(weight_pool.request(name + suffix + std::to_string(idx),
     746              :                                       dims[idx], exec, lifespan, initializer));
     747              : 
     748         3722 :   return ret;
     749         3722 : }
     750              : 
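
  requestWeightOptimizerVariables() above registers one pooled tensor per
  requested dimension, scheduled at the owning weight's last execution order
  (or persisted when gradient clipping or mixed precision is in play). A
  hypothetical request, e.g. for Adam-style first/second moments; m_dim, v_dim,
  the weight name and the suffix are placeholders:

      // Illustrative sketch, not from this file.
      std::vector<Tensor *> opt_vars = manager.requestWeightOptimizerVariables(
        {m_dim, v_dim},                 // one TensorDim per optimizer variable
        "fc1:weight",                   // pooled name of the owning weight
        ":opt",                         // suffix; the variable index is appended
        TensorLifespan::MAX_LIFESPAN,
        /*is_grad_clip=*/false, /*is_mixed_precision=*/false, Initializer::ZEROS);
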
     751              : std::vector<Weight *>
     752         1235 : Manager::getWeights(const std::function<bool(const Weight *)> &condition) {
     753              :   std::vector<Weight *> conditional_weights;
     754              : 
     755        10909 :   for (auto &w : weights_v2) {
     756        14516 :     if (!condition || condition(w.get()))
     757         4848 :       conditional_weights.push_back(w.get());
     758              :   }
     759         1235 :   return conditional_weights;
     760            0 : }
     761              : 
     762         5400 : void Manager::flushCache() {
     763         5400 :   if (!fsu_lookahead) {
     764         5400 :     weight_pool.flushCache();
     765         5400 :     tensor_pool.flushCache();
     766              :   }
     767         5400 : }
     768              : 
     769            0 : bool Manager::checkLoadComplete(unsigned int order) {
     770              : 
     771              :   auto checkLoadCompleteAtPool = [](TensorPool &pool, unsigned int order) {
     772            0 :     return pool.checkLoadComplete(order);
     773              :   };
     774              : 
     775            0 :   if (exec_mode == ExecutionMode::TRAIN) {
     776            0 :     return checkLoadCompleteAtPool(weight_pool, order) &&
     777            0 :            checkLoadCompleteAtPool(tensor_pool, order);
     778              :   } else {
     779            0 :     return checkLoadCompleteAtPool(weight_pool, order);
     780              :   }
     781              : }
     782              : 
     783            0 : bool Manager::checkUnloadComplete(unsigned int order) {
     784              :   if (async_unload_tensor.count(order)) {
     785            0 :     auto &tasks = async_unload_tensor[order];
     786            0 :     std::unique_lock<std::mutex> lock(completed_unload_mutex);
     787            0 :     if (exec_mode == ExecutionMode::TRAIN) {
     788            0 :       auto w_fut = completed_unload_tensor[std::get<0>(tasks)].get_future();
     789            0 :       auto t_fut = completed_unload_tensor[std::get<1>(tasks)].get_future();
     790            0 :       lock.unlock();
     791            0 :       if (std::get<0>(tasks) != 0)
     792            0 :         w_fut.wait();
     793            0 :       if (std::get<1>(tasks) != 0)
     794            0 :         t_fut.wait();
     795              :     } else {
     796            0 :       auto w_fut = completed_unload_tensor[std::get<0>(tasks)].get_future();
     797            0 :       lock.unlock();
     798            0 :       if (std::get<0>(tasks) != 0)
     799            0 :         w_fut.wait();
     800              :     }
     801              :     async_unload_tensor.erase(order);
     802              :   }
     803            0 :   return true;
     804              : }
     805              : 
     806            0 : void Manager::LoadTensors(unsigned int order,
     807              :                           unsigned int remainder_lookahead) {
     808              : 
     809            0 :   auto loadTensorsAsync = [&](TensorPool &pool, unsigned int order) {
     810            0 :     return pool.loadCacheExecAsync(
     811            0 :       order, [&](int id, TaskExecutor::CompleteStatus status,
     812              :                  std::future<TaskExecutor::CompleteStatus> fut) {
     813            0 :         std::scoped_lock<std::mutex> lock(completed_load_mutex);
     814            0 :         completed_load_fut[id] = std::move(fut);
     815            0 :       });
     816            0 :   };
     817              : 
     818            0 :   auto enqueTasks = [&](unsigned int o) {
     819            0 :     auto load_weight = loadTensorsAsync(weight_pool, o);
     820            0 :     ml_logd("load weight is requested in LoadTensors with order - %d", o);
     821              :     int load_tensor = 0;
     822              : 
     823            0 :     if (exec_mode != ml::train::ExecutionMode::INFERENCE) {
     824            0 :       load_tensor = loadTensorsAsync(tensor_pool, o);
     825            0 :       ml_logd("load tensor is requested in LoadTensors with order - %d", o);
     826              :     }
     827            0 :     NNTR_THROW_IF(load_weight < 0 || load_tensor < 0, std::runtime_error)
      828              :       << "Failed to launch task";
     829            0 :   };
     830              : 
     831            0 :   if (order <= max_exec_order) {
     832            0 :     enqueTasks(order);
     833              :   }
     834            0 : }
     835              : 
     836            0 : void Manager::UnloadTensors(unsigned int order) {
     837              : 
     838            0 :   auto unloadTensorsAsync = [&](TensorPool &pool, unsigned int order) {
     839            0 :     return pool.flushCacheExecAsync(
     840            0 :       order, [&](int id, TaskExecutor::CompleteStatus status,
     841              :                  std::future<TaskExecutor::CompleteStatus> fut) {
     842            0 :         std::scoped_lock<std::mutex> lock(completed_unload_mutex);
     843            0 :         completed_unload_tensor[id].set_value(true);
     844            0 :       });
     845            0 :   };
     846              : 
     847            0 :   auto enqueTasks = [&](unsigned int o) {
     848            0 :     if (async_unload_tensor.count(o)) {
     849            0 :       ml_logd("Task unloadTensors (%d) is in progress", o);
     850              :       return;
     851              :     }
     852            0 :     auto unload_weight = unloadTensorsAsync(weight_pool, o);
     853            0 :     ml_logd("unload weight is requested in UnLoadTensors with order - %d", o);
     854              :     int unload_tensor = 0;
     855            0 :     if (exec_mode != ml::train::ExecutionMode::INFERENCE) {
     856            0 :       unload_tensor = unloadTensorsAsync(tensor_pool, o);
     857            0 :       ml_logd("unload tensor is requested in UnLoadTensors with order - %d", o);
     858              :     }
     859            0 :     NNTR_THROW_IF(unload_weight < 0 || unload_tensor < 0, std::runtime_error)
      860              :       << "Failed to launch task";
     861            0 :     async_unload_tensor[o] = std::make_tuple(unload_weight, unload_tensor);
     862            0 :   };
     863              : 
     864            0 :   enqueTasks(order);
     865            0 : }
     866              : 
     867        94179 : void Manager::flushCacheExcept(unsigned int order) {
     868            0 :   auto loadAsync = [&](TensorPool &pool, unsigned int order) {
     869            0 :     return pool.loadCacheExecAsync(
     870              : 
     871            0 :       order, [&](int id, TaskExecutor::CompleteStatus status,
     872              :                  std::future<TaskExecutor::CompleteStatus> fu) {
     873            0 :         std::scoped_lock<std::mutex> lock(completed_mutex);
     874            0 :         completed[id].set_value(true);
     875            0 :       });
     876        94179 :   };
     877              : 
     878            0 :   auto waitComplete = [&](unsigned int o) {
     879            0 :     auto &tasks = async_task_eos[o];
     880              : 
     881            0 :     std::unique_lock<std::mutex> lock(completed_mutex);
     882            0 :     auto w_fut = completed[std::get<0>(tasks)].get_future();
     883            0 :     auto t_fut = completed[std::get<1>(tasks)].get_future();
     884            0 :     lock.unlock();
     885              : 
     886            0 :     w_fut.wait();
     887            0 :     t_fut.wait();
     888              : 
     889            0 :     async_task_eos.erase(o);
     890            0 :   };
     891              : 
     892              :   // TODO: lookahead > 1 is required.
     893        94179 :   if (fsu_lookahead == 1) {
     894              :     if (async_task_eos.count(order) == 1)
     895            0 :       waitComplete(order);
     896              : 
     897            0 :     auto load_weight = loadAsync(weight_pool, order + 1);
     898            0 :     auto load_tensor = loadAsync(tensor_pool, order + 1);
     899              : 
     900            0 :     NNTR_THROW_IF(load_weight < 0 || load_tensor < 0, std::runtime_error)
     901              :       << "Failed to launch preloading task";
     902            0 :     async_task_eos[order + 1] = std::make_tuple(load_weight, load_tensor);
     903              :   } else {
     904        94179 :     weight_pool.flushCacheExcept(order);
     905        94179 :     tensor_pool.flushCacheExcept(order);
     906              :   }
     907        94179 : }
     908              : 
     909         1301 : void Manager::finalizeTensorPool(TensorPool &pool, unsigned int start,
     910              :                                  unsigned int end) {
     911         1301 :   if (enable_optimizations) {
     912          408 :     if (exec_mode == ExecutionMode::INFERENCE && enable_fsu) {
      913              :       // @todo change to V3 and validate
     914            0 :       pool.finalize(OptimizedV1Planner(), start, end);
     915              :     } else {
     916          408 :       pool.finalize(OptimizedV1Planner(), start, end);
     917              :     }
     918              :   } else {
     919          893 :     pool.finalize(BasicPlanner(), start, end);
     920              :   }
     921         1301 : }
     922              : 
     923            0 : unsigned int Manager::inActive(unsigned int order) {
     924            0 :   return weight_pool.inActive(order);
     925              : }
     926              : 
     927              : } // namespace nntrainer
        

Generated by: LCOV version 2.0-1