LCOV - coverage_filtered.info - nntrainer/graph/network

LCOV - code coverage report

Current view:	top level - nntrainer/graph - network_graph.cpp (source / functions)		Coverage	Total	Hit
Test:	coverage_filtered.info	Lines:	78.6 %	658	517
Test Date:	2025-12-14 20:38:17	Functions:	80.4 %	51	41

            Line data    Source code

       1              : // SPDX-License-Identifier: Apache-2.0
       2              : /**
       3              :  * Copyright (C) 2020 Jijoong Moon <jijoong.moon@samsung.com>
       4              :  *
       5              :  * @file    network_graph.h
       6              :  * @date    19 Oct 2020
       7              :  * @see     https://github.com/nnstreamer/nntrainer
       8              :  * @author  Jijoong Moon <jijoong.moon@samsung.com>
       9              :  * @bug     No known bugs except for NYI items
      10              :  * @brief   This is Network Graph Class for Neural Network
      11              :  *
      12              :  * @todo    Support multi-input graph.
      13              :  */
      14              : 
      15              : #include <activation_layer.h>
      16              : #include <addition_layer.h>
      17              : #include <bn_layer.h>
      18              : #include <concat_layer.h>
      19              : #include <connection.h>
      20              : #include <cross_entropy_loss_layer.h>
      21              : #include <cross_entropy_sigmoid_loss_layer.h>
      22              : #include <cross_entropy_softmax_loss_layer.h>
      23              : #include <engine.h>
      24              : #include <flatten_layer.h>
      25              : #include <grucell.h>
      26              : #include <identity_layer.h>
      27              : #include <input_layer.h>
      28              : #include <layer_node.h>
      29              : #include <layer_normalization_layer.h>
      30              : #include <lstmcell.h>
      31              : #include <multiout_layer.h>
      32              : #include <network_graph.h>
      33              : #include <nntrainer_error.h>
      34              : #include <nntrainer_log.h>
      35              : #include <profiler.h>
      36              : #include <rnn.h>
      37              : #include <rnncell.h>
      38              : #include <split_layer.h>
      39              : #include <tensor_layer.h>
      40              : #include <time_dist.h>
      41              : #include <tracer.h>
      42              : #include <util_func.h>
      43              : #include <weight_layer.h>
      44              : 
      45              : #include <cmath>
      46              : #include <iostream>
      47              : #include <stdexcept>
      48              : #include <string>
      49              : 
      50              : #include "graph_node.h"
      51              : #include "tensor.h"
      52              : 
      53              : #define LNODE(x) std::static_pointer_cast<LayerNode>(x)
      54              : 
      55              : namespace nntrainer {
      56          703 : int NetworkGraph::compile(const std::string &loss_type) {
      57              :   int status = ML_ERROR_NONE;
      58              : 
      59          703 :   status = isCompilable();
      60          703 :   NN_RETURN_STATUS();
      61              : 
      62              :   try {
      63          642 :     setOutputConnections();
      64            0 :   } catch (std::exception &e) {
      65            0 :     ml_loge("setting output layer failed, reason: %s", e.what());
      66              :     return ML_ERROR_INVALID_PARAMETER;
      67            0 :   }
      68              : 
      69          642 :   graph.realizeInputOutputNode();
      70              : 
      71          642 :   if (exec_mode != ExecutionMode::INFERENCE) {
      72              :     try {
      73              :       /// @todo realize loss beforehand
      74          641 :       status = addLossLayer(loss_type);
      75          632 :       NN_RETURN_STATUS();
      76            9 :     } catch (const std::exception &e) {
      77            9 :       ml_loge("%s", e.what());
      78              :       status = ML_ERROR_INVALID_PARAMETER;
      79              :       NN_RETURN_STATUS();
      80            9 :     }
      81              :   } else {
      82            1 :     if (!loss_type.empty()) {
      83            0 :       ml_loge("Warning : Loss type is given in inference mode. Ignoring loss "
      84              :               "type.");
      85              :     }
      86              :   }
      87              : 
      88          633 :   graph.topologicalSort();
      89              : 
      90          633 :   setExecutionOrder();
      91          633 :   forward_iter_end = (*(cend() - 1)).get();
      92              : 
      93          633 :   inPlaceOptimize();
      94              : 
      95          633 :   status = checkCompiledGraph();
      96          633 :   NN_RETURN_STATUS();
      97              : 
      98          627 :   compiled = true;
      99              : 
     100          627 :   return status;
     101              : }
     102              : 
     103          633 : void NetworkGraph::setExecutionOrder() {
     104              :   auto backward_order = graph.size();
     105         5102 :   for (auto iter = getBackwardingBeginIter(); iter != getBackwardingEndIter();
     106              :        iter++) {
     107              :     auto &node = *iter;
     108         4469 :     auto order_idx = getBackwardingEndIter() - iter - 1;
     109              :     auto forward_order = order_idx;
     110              :     auto calc_gradient_order = backward_order;
     111         4469 :     if (node->getTrainable())
     112         4451 :       backward_order++;
     113              :     auto calc_derivative_order = backward_order;
     114         4469 :     if (node->getTrainable())
     115         4451 :       backward_order++;
     116         4469 :     auto apply_gradient_order = backward_order++;
     117              : 
     118              :     node->setExecutionOrder({forward_order, calc_gradient_order,
     119              :                              calc_derivative_order, apply_gradient_order});
     120              :   }
     121              : 
     122              :   /**
     123              :    * This sets max execution order temporarily till model is initialized.
     124              :    * This set max execution order is used to extend gradient exec orders for
     125              :    * clipping.
     126              :    */
     127          633 :   graph_exec_end = std::get<3>((*(cbegin()))->getExecutionOrder());
     128          633 : }
     129              : 
     130            0 : void NetworkGraph::addLayerNode(std::unique_ptr<Layer> layer) {
     131            0 :   graph.addNode(std::make_unique<LayerNode>(std::move(layer)));
     132            0 : }
     133              : 
     134          641 : int NetworkGraph::addLossLayer(const std::string &loss_type_) {
     135         1292 :   for (unsigned int i = 0; i < graph.getNumOutputNodes(); ++i) {
     136              :     auto output_layer_node = LNODE(graph.getOutputNode(i));
     137              :     std::string loss_type = loss_type_;
     138              : 
     139          660 :     if (output_layer_node->requireLabel())
     140          338 :       continue;
     141              : 
     142          322 :     if (loss_type.empty())
     143           13 :       continue;
     144              : 
     145              :     auto second_to_last_layer_node = output_layer_node;
     146              :     bool is_cross_entropy_loss =
     147          309 :       istrequal(loss_type, CrossEntropyLossLayer::type);
     148          309 :     if (is_cross_entropy_loss) {
     149          189 :       auto type = output_layer_node->getType();
     150              : 
     151          189 :       if (type != ActivationLayer::type) {
     152              :         throw exception::not_supported(
     153              :           "Error: Cross Entropy need last layer to have softmax or sigmoid"
     154            0 :           "activation.");
     155              :       }
     156              : 
     157          189 :       switch (output_layer_node->getActivationType()) {
     158              :       case ActivationType::ACT_SIGMOID:
     159              :         loss_type = CrossEntropySigmoidLossLayer::type;
     160              :         break;
     161              :       case ActivationType::ACT_SOFTMAX:
     162              :         loss_type = CrossEntropySoftmaxLossLayer::type;
     163              :         break;
     164            8 :       default:
     165              :         throw exception::not_supported(
     166           16 :           "Error: Cross Entropy not supported without softmax or sigmoid.");
     167              :       }
     168              : 
     169              :       second_to_last_layer_node =
     170          370 :         LNODE(graph.getNode(output_layer_node->getInputConnectionName(0)));
     171              :     }
     172              : 
     173          610 :     std::shared_ptr<LayerNode> lnode = createLayerNode(loss_type);
     174          600 :     graph.ensureName(*lnode);
     175              : 
     176          300 :     if (second_to_last_layer_node->getDistribute()) {
     177            1 :       lnode->setProperty({"distribute=true"});
     178              :     }
     179              : 
     180              :     /// @todo remove this by add loss at realization
     181          600 :     second_to_last_layer_node->setOutputLayers({lnode->getName()});
     182          600 :     lnode->setProperty(
     183          300 :       {"input_layers=" + second_to_last_layer_node->getName()});
     184              : 
     185          300 :     if (is_cross_entropy_loss) {
     186          543 :       graph.replaceNode(output_layer_node, lnode);
     187              :     } else {
     188          238 :       graph.addNode(lnode, false);
     189              :     }
     190          300 :     graph.replaceOutputNode(i, lnode);
     191              :   }
     192              : 
     193          632 :   return ML_ERROR_NONE;
     194          600 : }
     195              : 
     196          642 : void NetworkGraph::setOutputConnections() {
     197         5027 :   for (auto layer_iter = cbegin(); layer_iter != cend(); layer_iter++) {
     198              :     const auto &node = *layer_iter;
     199        13283 :     for (auto i = 0u, num_inode = node->getNumInputConnections(); i < num_inode;
     200              :          ++i) {
     201         4513 :       const auto &name = node->getInputConnectionName(i);
     202         4513 :       const auto &idx = node->getInputConnectionIndex(i);
     203              : 
     204         4513 :       auto node_setting_output = getLayerNode(name);
     205         9026 :       node_setting_output->setOutputConnection(idx, node->getName(), i);
     206              :     }
     207              :   }
     208          642 : }
     209              : 
     210          703 : int NetworkGraph::isCompilable() {
     211          703 :   if (compiled) {
     212            0 :     ml_loge("Graph is already compiled");
     213            0 :     return ML_ERROR_NOT_SUPPORTED;
     214              :   }
     215              : 
     216          703 :   if (graph.empty()) {
     217           61 :     ml_loge("Graph is empty");
     218           61 :     return ML_ERROR_INVALID_PARAMETER;
     219              :   }
     220              : 
     221              :   return ML_ERROR_NONE;
     222              : }
     223              : 
     224          633 : int NetworkGraph::checkCompiledGraph() {
     225              :   /** Dimension of input layers must be known */
     226         5054 :   for (auto iter = cbegin(); iter != cend(); iter++) {
     227              :     auto lnode = (*iter);
     228         4427 :     if (lnode->getNumInputConnections() == 0) {
     229          984 :       if (!lnode->hasInputShapeProperty()) {
     230           12 :         ml_loge("Layer with no inbound connection need input_shape property");
     231              :         return ML_ERROR_INVALID_PARAMETER;
     232              :       }
     233              :     }
     234              :   }
     235              : 
     236              :   return ML_ERROR_NONE;
     237              : }
     238              : 
     239          619 : void NetworkGraph::markNodesForBackwarding() {
     240              :   /** accumulate all the nodes which must support backwarding */
     241              :   std::unordered_set<std::string> must_support_backwarding;
     242          619 :   if (exec_mode == ExecutionMode::INFERENCE) {
     243            6 :     for (auto iter = cbegin(); iter != cend(); iter++) {
     244              :       auto lnode = (*iter);
     245              :       lnode->needsCalcGradient(false);
     246            5 :       lnode->needsCalcDerivative(false);
     247              :     }
     248              :     return;
     249              :   }
     250              : 
     251              :   /**
     252              :    * if a node is trainable, then all the nodes ahead of it must support
     253              :    * backwarding operation
     254              :    */
     255         5018 :   for (auto iter = cbegin(); iter != cend(); iter++) {
     256              :     auto lnode = (*iter);
     257         7363 :     if (lnode->getTrainable() ||
     258         7363 :         must_support_backwarding.find(lnode->getName()) !=
     259              :           must_support_backwarding.end()) {
     260         3187 :       if (lnode->getTrainable()) {
     261              :         lnode->needsCalcGradient(true);
     262              :       }
     263              : #ifdef ENABLE_TEST
     264         3187 :       if (lnode->supportBackwarding() && !optimize_memory) {
     265         1406 :         lnode->needsCalcDerivative(true);
     266              :       }
     267              : #endif
     268              : 
     269         6363 :       for (auto i = 0u, num_node = lnode->getNumOutputConnections();
     270         6363 :            i < num_node; ++i) {
     271         3176 :         auto conn = lnode->getOutputConnection(i);
     272         3176 :         if (!conn) {
     273            8 :           continue;
     274              :         }
     275              : 
     276              :         must_support_backwarding.insert(conn->getName());
     277              :       }
     278              :     }
     279              :   }
     280              : 
     281              :   /** mark all the required nodes support backwarding */
     282         3179 :   for (auto const &node_name : must_support_backwarding) {
     283         2561 :     auto ln = LNODE(graph.getNode(node_name)).get();
     284         2561 :     ln->needsCalcDerivative(true);
     285              :   }
     286              : }
     287              : 
     288          645 : void NetworkGraph::setBatchSize(unsigned int batch_size) {
     289          645 :   if (batch_size == this->batch_size)
     290              :     return;
     291              : 
     292          625 :   this->batch_size = batch_size;
     293          815 :   if (!input_list.empty() && getInputDimension()[0].batch() == batch_size)
     294              :     return;
     295              : 
     296              :   auto allocated = tensor_manager->isAllocated();
     297              : 
     298          435 :   if (allocated)
     299              :     deallocateTensors();
     300              : 
     301         3497 :   for (auto iter = cbegin(); iter != cend(); iter++) {
     302         3062 :     if ((*iter)->isFinalized()) {
     303              :       /// resize tensors spec
     304              :       /// @todo remove below, if custom tensor needs to change dimension
     305              :       /// according to the tensor, it must be done explicitly, or at least have
     306              :       /// a property to control the behavior
     307         3062 :       const RunLayerContext &context = (*iter)->getRunContext();
     308         5099 :       for (unsigned int idx = 0; idx < context.getNumTensors(); idx++) {
     309         2037 :         auto const &ts = context.getTensor(idx);
     310         2037 :         tensor_manager->setBatchSize(ts.getName(), ts.getDim().batch());
     311         2037 :         if (context.tensorHasGradient(idx)) {
     312         1016 :           auto const &ts_grad = context.getTensorGrad(idx);
     313         1016 :           tensor_manager->setBatchSize(ts_grad.getName(),
     314         2032 :                                        ts_grad.getDim().batch());
     315              :         }
     316              :       }
     317              :       /// override setting batch as per request
     318         6124 :       (*iter)->setBatch(batch_size);
     319              :     }
     320              :   }
     321              :   /// resize input and output spec
     322          435 :   tensor_manager->setBatchSize(batch_size);
     323              : 
     324          435 :   if (allocated)
     325            0 :     allocateTensors(exec_mode);
     326              : 
     327              :   /** update input and label dimensions */
     328         1005 :   for (unsigned int idx = 0; idx < input_list.size(); idx++)
     329          570 :     input_dims_[idx] = tensor_manager->getTensor(input_list[idx])->getDim();
     330          883 :   for (unsigned int idx = 0; idx < label_list.size(); idx++)
     331          448 :     label_dims_[idx] = tensor_manager->getTensor(label_list[idx])->getDim();
     332              : }
     333              : 
     334            0 : void NetworkGraph::resetInputDimension(std::vector<TensorDim> dims) {
     335              :   auto allocated = tensor_manager->isAllocated();
     336              : 
     337            0 :   if (allocated)
     338              :     deallocateTensors();
     339              : 
     340            0 :   for (auto iter = cbegin(); iter != cend(); iter++) {
     341            0 :     if ((*iter)->isFinalized()) {
     342            0 :       (*iter)->updateTensorsByInputDimensions(dims);
     343              :     }
     344              :   }
     345              : 
     346            0 :   if (allocated)
     347            0 :     allocateTensors(exec_mode);
     348              : 
     349              :   /** update input and label dimensions */
     350            0 :   for (unsigned int idx = 0; idx < input_list.size(); idx++)
     351            0 :     input_dims_[idx] = tensor_manager->getTensor(input_list[idx])->getDim();
     352            0 :   for (unsigned int idx = 0; idx < label_list.size(); idx++)
     353            0 :     label_dims_[idx] = tensor_manager->getTensor(label_list[idx])->getDim();
     354            0 : }
     355              : 
     356        22292 : void NetworkGraph::applyGradients(
     357              :   LayerNode *node, const std::function<void(Weight &)> &apply_func) {
     358        22292 :   if (!node->getTrainable())
     359              :     return;
     360              : 
     361        14934 :   TRACE_MEMORY() << node->getName() + ": AG";
     362        14934 :   TRACE_TIME() << node->getName() + ": AG";
     363              : 
     364         7467 :   auto &rc = node->getRunContext();
     365         7467 :   auto num_weight = rc.getNumWeights();
     366        23960 :   for (unsigned i = 0; i < num_weight; ++i) {
     367        16493 :     if (!rc.weightHasGradient(i)) {
     368          408 :       continue;
     369              :     }
     370              : 
     371        16085 :     if (!rc.isGradientLastAccess(i)) {
     372              :       /// @note instead of checking the last access of the weight, checking
     373              :       /// if weights are dependent to others to minimize overhead.
     374              :       /// this logic assume that the source of the dependent weight must be
     375              :       /// prior to the dependent.
     376          458 :       continue;
     377              :     }
     378              : 
     379        15627 :     if (rc.isGradientClipByGlobalNorm(i) || rc.isMixedPrecision(i)) {
     380              :       /**
     381              :        * @note the weights whose gradient are to be clipped by global norm will
     382              :        * be clipped at once at the end of iteration and applied then.
     383              :        * For those weights where mixed precision is uesed, their gradient
     384              :        * updates might be delayed until they confirm whether their loss scales
     385              :        * are appropeiate.
     386              :        */
     387           44 :       continue;
     388              :     }
     389              : 
     390        15583 :     apply_func(rc.getWeightObject(i));
     391              :   }
     392              : }
     393              : 
     394         6821 : sharedConstTensors NetworkGraph::forwarding(
     395              :   bool training,
     396              :   std::function<void(std::shared_ptr<LayerNode>, bool)> forwarding_op,
     397              :   std::function<bool(void *userdata)> stop_cb, void *userdata) {
     398        61427 :   for (auto iter = cbegin(); iter != cend() && !stop_cb(userdata); iter++) {
     399              :     auto &ln = *iter;
     400              :     PROFILE_TIME_START(profile_keys.at(ln->getType()));
     401        27303 :     forwarding_op(*iter, training);
     402              :     PROFILE_TIME_END(profile_keys.at(ln->getType()));
     403              :   }
     404              : 
     405              :   sharedConstTensors out;
     406        13666 :   for (unsigned int i = 0; i < graph.getNumOutputNodes(); ++i) {
     407              :     auto const &output_layer_node = LNODE(graph.getOutputNode(i));
     408        13690 :     for (unsigned int j = 0; j < output_layer_node->getNumOutputs(); ++j) {
     409              :       // @todo we should determine what type to return
     410              :       // out.push_back(MAKE_SHARED_TENSOR(
     411              :       //   output_layer_node->getOutput(j).clone(TensorDim::DataType::FP32)));
     412        13690 :       out.push_back(MAKE_SHARED_TENSOR(output_layer_node->getOutput(j)));
     413              :     }
     414              :   }
     415              : 
     416         6821 :   return out;
     417            0 : }
     418              : 
     419            0 : sharedConstTensors NetworkGraph::incremental_forwarding(
     420              :   unsigned int from, unsigned int to, bool training,
     421              :   std::function<void(std::shared_ptr<LayerNode>, bool)> forwarding_op,
     422              :   std::function<bool(void *userdata)> stop_cb, void *userdata) {
     423            0 :   for (auto iter = cbegin(); iter != cend() && !stop_cb(userdata); iter++) {
     424              :     auto &ln = *iter;
     425              :     PROFILE_TIME_START(profile_keys.at(ln->getType()));
     426            0 :     forwarding_op(*iter, training);
     427              :     PROFILE_TIME_END(profile_keys.at(ln->getType()));
     428              :   }
     429              : 
     430              :   sharedConstTensors out;
     431            0 :   for (unsigned int i = 0; i < graph.getNumOutputNodes(); ++i) {
     432              :     auto const &output_layer_node = LNODE(graph.getOutputNode(i));
     433            0 :     for (unsigned int j = 0; j < output_layer_node->getNumOutputs(); ++j) {
     434            0 :       out.push_back(MAKE_SHARED_TENSOR(output_layer_node->getOutput(j)));
     435              :     }
     436              :   }
     437              : 
     438            0 :   return out;
     439            0 : }
     440              : 
     441         6132 : bool NetworkGraph::backwarding(
     442              :   int iteration,
     443              :   std::function<void(std::shared_ptr<LayerNode>, bool)> &forwarding_op,
     444              :   std::function<bool(std::shared_ptr<LayerNode>, int)> &backwarding_op,
     445              :   std::function<void(Weight &, int)> &lazy_apply_grad_op,
     446              :   std::function<bool(void *userdata)> stop_cb, void *userdata) {
     447              :   /**
     448              :    * last layer backwarding is run out of this loop
     449              :    */
     450              :   auto iter_begin = getBackwardingBeginIter();
     451              :   auto iter_end = getBackwardingEndIter();
     452              :   bool is_valid = true;
     453              : 
     454              :   /// there is no layer to train, so backwarding is essentially noop
     455         6132 :   if (iter_begin == iter_end) {
     456              :     return true;
     457              :   }
     458              : 
     459              :   auto const &lptr_begin = (*iter_begin);
     460              :   // graph_const_reverse_iterator
     461              :   auto iter_ = iter_begin;
     462              : 
     463         6132 :   if (lptr_begin->requireLabel() == false)
     464              :     throw std::runtime_error(
     465            0 :       "Error: last layer does not accept label, we can't train");
     466              : 
     467        50716 :   for (iter_ = iter_begin; iter_ != iter_end && !stop_cb(userdata); iter_++) {
     468              :     auto &ln = *iter_;
     469              :     PROFILE_TIME_START(profile_keys.at(ln->getType()));
     470        22292 :     is_valid = backwarding_op(ln, iteration);
     471              :     PROFILE_TIME_END(profile_keys.at(ln->getType()));
     472              : 
     473        22292 :     if (!is_valid) {
     474              :       break;
     475              :     }
     476              :   }
     477              : 
     478         6132 :   if (!is_valid) {
     479              :     /** if has NaN
     480              :      * 1. reset the loss scale. : @todo Backoff_factor : default --> 0.5
     481              :      * 2. run forwarding from cur_iter to cend() && !stop_cb(userdata);
     482              :      * 3. return false --> run backwarding again;
     483              :      */
     484            0 :     float scale = (*iter_)->getRunContext().getLossScale();
     485              : 
     486            0 :     NNTR_THROW_IF(scale - 1.0f < 10e-6, std::invalid_argument)
     487              :       << "Loss Scale Factor is 1.0f";
     488              : 
     489            0 :     float s = scale > 1.5f ? scale * 0.5f : 1.0f;
     490              : 
     491            0 :     resetLossScale(s);
     492              : 
     493            0 :     auto f_iter = cbegin() + graph.getSortedNodeIdx((*iter_)->getName());
     494              : 
     495            0 :     for (auto iter = f_iter; iter != cend() && !stop_cb(userdata); iter++) {
     496              :       auto &ln = *iter;
     497              :       ln->reStoreData(true);
     498              :     }
     499              : 
     500            0 :     for (auto iter = f_iter; iter != cend() && !stop_cb(userdata); iter++) {
     501              :       auto &ln = *iter;
     502              :       PROFILE_TIME_START(profile_keys.at(ln->getType()));
     503            0 :       forwarding_op(*iter, true);
     504              :       PROFILE_TIME_END(profile_keys.at(ln->getType()));
     505              :     }
     506              : 
     507              :     return false;
     508              :   }
     509              : 
     510              :   /** perform clipping of the gradients by global norm if any */
     511         6132 :   if (lazy_weights.empty())
     512              :     return true;
     513              : 
     514           11 :   if (is_clip_grad) {
     515              :     /** calculate the global norm */
     516              :     Tensor global_norm_t(
     517           11 :       TensorDim({1u, 1u, 1u, (unsigned int)lazy_weights.size()}));
     518              :     float *global_norm_data = global_norm_t.getData();
     519              : 
     520           55 :     for (unsigned int idx = 0; idx < lazy_weights.size(); idx++) {
     521              :       auto const &w = lazy_weights[idx];
     522              : 
     523           44 :       if (isMixedPrecision()) {
     524              :         Tensor scaled_grad =
     525            0 :           w->getGradientRef().clone(TensorDim::DataType::FP32);
     526            0 :         scaled_grad.divide_i(loss_scale);
     527            0 :         global_norm_data[idx] = scaled_grad.l2norm();
     528            0 :       } else {
     529           44 :         global_norm_data[idx] = w->getGradientNorm();
     530              :       }
     531              :     }
     532           11 :     float global_norm = global_norm_t.l2norm();
     533              :     /** apply the gradient with the above global norm */
     534           55 :     for (auto w : lazy_weights) {
     535              :       w->clipGradientByGlobalNorm(global_norm);
     536              :     }
     537           11 :   }
     538              :   /** apply the gradient with the above global norm */
     539           55 :   for (auto w : lazy_weights) {
     540           44 :     lazy_apply_grad_op(*w, iteration);
     541              :   }
     542           11 :   nan_count++;
     543              : 
     544              :   /** @todo : handle as property : growth_interval : default --> 2000 */
     545           11 :   if (nan_count > 2000) {
     546            0 :     float scale = (*iter_)->getRunContext().getLossScale();
     547              :     /** @todo growth_factor : default --> 2.0 */
     548            0 :     float s = scale * 2.0f;
     549            0 :     resetLossScale(s);
     550            0 :     nan_count = 0;
     551              :   }
     552              : 
     553              :   return true;
     554              : }
     555              : 
     556          619 : LayerNode *NetworkGraph::computeBackwardEnd() {
     557              :   int max_exec_order = -1;
     558              :   LayerNode *node = nullptr;
     559              : 
     560          619 :   if (!optimize_memory) {
     561          241 :     return (*cbegin()).get();
     562              :   }
     563              : 
     564         2879 :   for (auto iter = getBackwardingBeginIter(); iter != getBackwardingEndIter();
     565              :        iter++) {
     566              :     auto &ln = *iter;
     567              :     const auto &exec_order = ln->getExecutionOrder();
     568         2501 :     int cur_order = std::get<0>(exec_order);
     569         2501 :     if (ln->needsCalcDerivative() || ln->needsCalcGradient()) {
     570              : #ifdef ENABLE_TEST
     571         1781 :       cur_order = std::get<2>(exec_order);
     572              : #else
     573              :       cur_order = std::get<1>(exec_order);
     574              : #endif
     575              :     }
     576              : 
     577         2501 :     NNTR_THROW_IF(max_exec_order == cur_order, std::invalid_argument)
     578            0 :       << "layer node: " << ln->getName()
     579              :       << " has duplicated max_exec_order, this should not happen, current "
     580              :          "execution order: "
     581              :       << max_exec_order;
     582              : 
     583         2501 :     if (max_exec_order < cur_order) {
     584              :       max_exec_order = cur_order;
     585              :       node = ln.get();
     586              :     }
     587              :   }
     588              : 
     589              :   return node;
     590              : }
     591              : 
     592              : /**
     593              :  * @brief Allocate memory for all the managed tensors
     594              :  */
     595          683 : void NetworkGraph::allocateTensors(ExecutionMode exec_mode_) {
     596          683 :   exec_mode = exec_mode_;
     597          683 :   if (exec_mode == ExecutionMode::INFERENCE)
     598              :     /**
     599              :      * get the order of execution/usage order for the forwarding of the last
     600              :      * layer and pass that as the max_exec_order ensuring that all tensors
     601              :      * with usage less than the max_exec_order are allocated.
     602              :      */
     603          420 :     tensor_manager->allocateTensors(
     604          420 :       std::get<0>((*(cend() - 1))->getExecutionOrder()));
     605              :   else {
     606              :     /**
     607              :      * get the order of execution/usage order for the backwarding of the first
     608              :      * layer (as that will be the last layer to executed in the backwarding)
     609              :      * and pass that as the max_exec_order ensuring that all tensors with
     610              :      * usage less than the max_exec_order are allocated.
     611              :      * @todo if model is gradient clipping, we have to add last execution order
     612              :      * + 1
     613              :      */
     614          263 :     tensor_manager->allocateTensors(
     615          263 :       std::get<3>(backward_iter_end->getExecutionOrder()));
     616              :   }
     617          683 : }
     618              : 
     619         1313 : std::vector<TensorDim> NetworkGraph::getInputDimension() const {
     620         1313 :   NNTR_THROW_IF(input_dims_.empty(), std::invalid_argument)
     621              :     << "[NetworkGraph] the graph has no node identified as input!";
     622         1313 :   return input_dims_;
     623              : }
     624              : 
     625         1563 : unsigned int NetworkGraph::getBatchSize() const { return batch_size; }
     626              : 
     627          166 : std::vector<TensorDim> NetworkGraph::getOutputDimension() const {
     628          166 :   NNTR_THROW_IF(label_dims_.empty(), std::invalid_argument)
     629              :     << "[NetworkGraph] the graph has no node identified as output!";
     630              :   /// for now, outputting label_dims_ works, later label dim will be different
     631              :   /// from output dimension
     632          166 :   return label_dims_;
     633              : }
     634              : 
     635              : std::vector<std::shared_ptr<LayerNode>>
     636            0 : NetworkGraph::getUnsortedLayers(const std::string &input_layer,
     637              :                                 const std::string &output_layer) const {
     638              :   /// @fixme: this won't work if input, output layers are not in order
     639              :   /// Further, this function must be removed. There should be rather
     640              :   /// getAllNames and getLayerByName instead of getUnsortedLayers.
     641              : 
     642              :   /** count layers after output layer */
     643              :   unsigned int num_layers_remove_end = 0;
     644            0 :   if (!output_layer.empty()) {
     645            0 :     for (auto iter = graph.crbegin(); iter != graph.crend(); iter++) {
     646            0 :       if ((*iter)->getName() != output_layer)
     647            0 :         num_layers_remove_end++;
     648              :       else
     649              :         break;
     650              :     }
     651              :   }
     652              : 
     653            0 :   if (num_layers_remove_end == graph.size())
     654            0 :     return {};
     655              : 
     656              :   /** count layers before input layer */
     657              :   unsigned int num_layers_remove_start = 0;
     658            0 :   if (!input_layer.empty()) {
     659              :     for (auto iter = graph.cbegin();
     660            0 :          iter != graph.cend() - num_layers_remove_end; iter++) {
     661            0 :       if ((*iter)->getName() != input_layer)
     662            0 :         num_layers_remove_start++;
     663              :       else
     664              :         break;
     665              :     }
     666              :   }
     667              : 
     668              :   /** copy the graph and return */
     669              :   std::vector<std::shared_ptr<LayerNode>> ret;
     670            0 :   std::transform(graph.cbegin() + num_layers_remove_start,
     671              :                  graph.cend() - num_layers_remove_end, std::back_inserter(ret),
     672              :                  [](auto const &elem) { return LNODE(elem); });
     673              : 
     674              :   return ret;
     675            0 : }
     676              : 
     677           15 : std::vector<std::shared_ptr<LayerNode>> NetworkGraph::getLayerNodes() const {
     678           15 :   return std::vector<std::shared_ptr<LayerNode>>(cbegin(), cend());
     679              : }
     680              : 
     681         8497 : void NetworkGraph::addLayer(std::shared_ptr<LayerNode> layer) {
     682         8497 :   if (compiled)
     683            1 :     throw std::runtime_error("Cannot modify graph after compile");
     684              : 
     685              :   /** Insert the layer to the graph */
     686        16992 :   graph.addNode(layer);
     687         8496 : }
     688              : 
     689              : InPlaceType
     690         2565 : NetworkGraph::canExecuteInPlace(const std::shared_ptr<LayerNode> &lnode) {
     691         2565 :   InPlaceType inplace_type = lnode->initializeInPlace();
     692              : 
     693         2565 :   if (inplace_type == InPlaceType::NONE) {
     694              :     return inplace_type;
     695              :   }
     696              : 
     697         2958 :   if (lnode->getType() == InputLayer::type &&
     698         2819 :       !istrequal(getTensorType()[2], "FP32")) {
     699              :     return InPlaceType::NONE;
     700              :   }
     701              : 
     702         1211 :   if (lnode->getType() == MultiOutLayer::type) {
     703              :     return InPlaceType::RESTRICTING;
     704              :   }
     705              : 
     706              :   /** A case where it can operate in-place even if there is a multi-out type
     707              :    * input connection. */
     708         1017 :   if (inplace_type == InPlaceType::RESTRICTING) {
     709          194 :     for (size_t i = 0, num_node = lnode->getNumInputConnections(); i < num_node;
     710              :          ++i) {
     711           97 :       const std::string &input_name = lnode->getInputConnectionName(i);
     712          194 :       if (getLayerNode(input_name)->getInPlaceType() ==
     713              :           InPlaceType::RESTRICTING)
     714              :         return inplace_type;
     715              :     }
     716              :     return InPlaceType::NON_RESTRICTING;
     717              :   }
     718              :   /** A case where it cannot operate in-place if there is a multi-out type
     719              :    * input connection. */
     720              :   else {
     721              :     /** condition: NON_RESTRICTING */
     722         1300 :     for (size_t i = 0, num_node = lnode->getNumInputConnections(); i < num_node;
     723              :          ++i) {
     724          384 :       const std::string &input_name = lnode->getInputConnectionName(i);
     725          768 :       if (getLayerNode(input_name)->getInPlaceType() ==
     726              :           InPlaceType::RESTRICTING)
     727              :         return InPlaceType::NONE;
     728              :     }
     729              :     return inplace_type;
     730              :   }
     731              : }
     732              : 
     733          633 : void NetworkGraph::inPlaceOptimize() {
     734          633 :   if (optimize_memory) {
     735         2957 :     for (unsigned int idx = 0; idx < graph.size(); ++idx) {
     736         2565 :       auto const &lnode = getSortedLayerNode(idx);
     737         2565 :       lnode->setInPlaceType(canExecuteInPlace(lnode));
     738              :     }
     739              :   }
     740          633 : }
     741              : 
     742              : /**
     743              :  * @brief Set the Inplace Shared Memory Config By Layer object
     744              :  *
     745              :  * @param lnode layer node object
     746              :  * @param shared_var if the variable should be shared
     747              :  * @param shared_grad if the gradient should be shared
     748              :  */
     749              : static void
     750         1196 : setInplaceSharedMemoryConfigByLayer(const std::shared_ptr<LayerNode> &lnode,
     751              :                                     bool &shared_var, bool &shared_grad) {
     752              :   /** for multiout layer, variables are shared but gradients are not */
     753         1196 :   if (lnode->getType() == MultiOutLayer::type) {
     754          196 :     shared_var = true;
     755          196 :     shared_grad = false;
     756              :   } else {
     757         1000 :     shared_var = true;
     758         1000 :     shared_grad = true;
     759              :   }
     760              : 
     761              :   /**
     762              :    * @todo for layers which support in-place, both variables and gradients
     763              :    * will be shared.
     764              :    */
     765         1196 : }
     766              : 
     767              : std::vector<Var_Grad *>
     768         4410 : NetworkGraph::finalizeContext(const std::shared_ptr<LayerNode> &lnode,
     769              :                               const std::vector<Var_Grad *> &prev_inputs) {
     770         4410 :   const GraphNode &gnode = *lnode.get();
     771              :   std::vector<TensorDim> input_dims;
     772         4410 :   input_dims.reserve(prev_inputs.size());
     773         4410 :   std::transform(prev_inputs.begin(), prev_inputs.end(),
     774              :                  std::back_inserter(input_dims),
     775              :                  [](const Var_Grad *vg) { return vg->getDim(); });
     776              : 
     777              :   /** finalize the layer and get the final context */
     778         4410 :   auto init_context = lnode->finalize(input_dims, getTensorType(), exec_mode);
     779         4407 :   const auto &ct_engine = nntrainer::Engine::Global();
     780              : 
     781              :   /**
     782              :    * Request manager for either a pre-allocated output as input or a newly
     783              :    * allocated output. This is necessary for manager to know when this
     784              :    * output node is going to be used.
     785              :    */
     786              :   std::vector<std::string> input_names;
     787         4407 :   input_names.reserve(prev_inputs.size());
     788         4407 :   std::transform(
     789              :     prev_inputs.begin(), prev_inputs.end(), std::back_inserter(input_names),
     790         4554 :     [](auto const &vg) -> const auto & { return vg->getName(); });
     791              :   const std::vector<Var_Grad *> &inputs = tensor_manager->requestInputs(
     792         4407 :     gnode, init_context.getInputDimensions(), input_names);
     793              : 
     794              :   /** In-Place optimizations */
     795              :   /**
     796              :    * Request manager for either a pre-allocated input as output or a newly
     797              :    * allocated output. This is necessary for manager to know when this
     798              :    * output node is going to be used with in-place optimizations.
     799              :    */
     800         4407 :   auto out_specs = init_context.getOutSpecs();
     801              : 
     802              :   /// @note try move inplace control to finalize
     803         4407 :   bool shared_var = false, shared_grad = false;
     804              : 
     805         4407 :   if (lnode->getInPlaceType() != InPlaceType::NONE && lnode->supportInPlace()) {
     806         1182 :     setInplaceSharedMemoryConfigByLayer(lnode, shared_var, shared_grad);
     807              : 
     808         2662 :     for (unsigned int i = 0; i < out_specs.size(); ++i) {
     809              :       auto &s = out_specs.at(i);
     810         1480 :       if (shared_var) {
     811         1480 :         s.variable_spec.request_type =
     812              :           TensorSpecV2::RequestType::READ_ONLY_VIEW;
     813         1480 :         if (lnode->getType() == IdentityLayer::type) {
     814           50 :           s.variable_spec.reference_name = inputs[i]->getName();
     815           50 :           s.variable_spec.dim.setFormat(inputs[i]->getDim().getFormat());
     816         1430 :         } else if (lnode->getInPlaceDirection() == InPlaceDirection::RIGHT) {
     817            0 :           s.variable_spec.reference_name = inputs[1]->getName();
     818            0 :           s.variable_spec.dim.setFormat(inputs[1]->getDim().getFormat());
     819         1430 :         } else if (lnode->getType() == WeightLayer::type) {
     820              :           WeightSpec w_spec = init_context.getWeightsSpec()[i];
     821            0 :           s.variable_spec.reference_name = std::get<8>(w_spec);
     822              :           s.variable_spec.dim.setFormat(std::get<0>(w_spec).getFormat());
     823         1430 :         } else if (lnode->getType() == TensorLayer::type) {
     824              :           InitLayerContext::TensorSpec t_spec =
     825              :             init_context.getTensorsSpec()[i];
     826            0 :           s.variable_spec.reference_name = std::get<3>(t_spec);
     827              :           s.variable_spec.dim.setFormat(std::get<0>(t_spec).getFormat());
     828              :         } else {
     829         1430 :           s.variable_spec.reference_name = inputs[0]->getName();
     830         1430 :           s.variable_spec.dim.setFormat(inputs[0]->getDim().getFormat());
     831              :         }
     832              :       }
     833         1480 :       if (shared_grad && s.gradient_spec) {
     834          984 :         s.gradient_spec->request_type =
     835              :           TensorSpecV2::RequestType::READ_ONLY_VIEW;
     836          984 :         if (lnode->getType() == IdentityLayer::type) {
     837           50 :           s.gradient_spec->reference_name = inputs[i]->getGradientName();
     838           50 :           s.gradient_spec->dim.setFormat(inputs[i]->getDim().getFormat());
     839          934 :         } else if (lnode->getInPlaceDirection() == InPlaceDirection::RIGHT) {
     840            0 :           s.gradient_spec->reference_name = inputs[1]->getGradientName();
     841            0 :           s.gradient_spec->dim.setFormat(inputs[1]->getDim().getFormat());
     842          934 :         } else if (lnode->getType() == WeightLayer::type) {
     843              :           WeightSpec w_spec = init_context.getWeightsSpec()[i];
     844              :           s.gradient_spec->reference_name =
     845            0 :             std::get<8>(w_spec) + Var_Grad::grad_suffix;
     846              :           s.gradient_spec->dim.setFormat(std::get<0>(w_spec).getFormat());
     847          934 :         } else if (lnode->getType() == TensorLayer::type) {
     848              :           InitLayerContext::TensorSpec t_spec =
     849              :             init_context.getTensorsSpec()[i];
     850              :           s.gradient_spec->reference_name =
     851            0 :             std::get<3>(t_spec) + Var_Grad::grad_suffix;
     852              :           s.gradient_spec->dim.setFormat(std::get<0>(t_spec).getFormat());
     853              :         } else {
     854          934 :           s.gradient_spec->reference_name = inputs[0]->getGradientName();
     855          934 :           s.gradient_spec->dim.setFormat(inputs[0]->getDim().getFormat());
     856              :         }
     857              :       }
     858              :     }
     859              :   }
     860         4407 :   if (lnode->requireLabel()) {
     861          630 :     NNTR_THROW_IF(out_specs.size() != 1, std::invalid_argument)
     862              :       << "out specification size must be 1 for label layer for now, "
     863            0 :       << lnode->getName() << " out spec size: " << out_specs.size();
     864          630 :     NNTR_THROW_IF(out_specs[0].gradient_spec == nullptr, std::invalid_argument)
     865            0 :       << "label space does not exist for " << lnode->getName();
     866          630 :     out_specs[0].gradient_spec->request_type =
     867              :       TensorSpecV2::RequestType::PLACEHOLDER;
     868              :   }
     869              : 
     870              :   /// @note below needs to be enabled only for inference mode, but need
     871              :   /// decision if we are going to separate inference initialization from
     872              :   /// train initialization this might not worth optimize because in general
     873              :   /// output of a neuralnet is very small
     874         4407 :   if (lnode->getOutputConnections().size() == 0u) {
     875              :     std::for_each(out_specs.begin(), out_specs.end(),
     876              :                   [this](VarGradSpecV2 &spec) {
     877          638 :                     spec.variable_spec.additional_exec_order.push_back(
     878          638 :                       std::get<0>(forward_iter_end->getExecutionOrder()));
     879              :                   });
     880              :   }
     881              : 
     882         8814 :   if (lnode->getType() == RNNCellLayer::type or
     883         8814 :       lnode->getType() == LSTMCellLayer::type or
     884         8766 :       lnode->getType() == GRUCellLayer::type) {
     885              :     std::for_each(out_specs.begin(), out_specs.end(), [](VarGradSpecV2 &spec) {
     886          104 :       spec.variable_spec.ls = TensorLifespan::FORWARD_GRAD_LIFESPAN;
     887              :     });
     888              :   }
     889              : 
     890              :   const std::vector<Var_Grad *> &outputs = tensor_manager->requestTensors(
     891         4407 :     out_specs, Manager::TensorGroupType::OUTPUT, lnode->getExecutionOrder(),
     892         8814 :     lnode->getName());
     893              : 
     894              :   /** create shared weight names if requested */
     895              :   std::vector<std::string> shared_weight_names;
     896              :   std::vector<std::string> shared_tensor_names;
     897         4407 :   if (auto shared_node_str = lnode->getSharedFrom(); !shared_node_str.empty()) {
     898              :     /// @note below is commented but kept from quick fix to be referenced
     899              :     /// for later(#1707)
     900              :     // auto shared_node = getLayerNode(shared_node_str).get();
     901              :     // NNTR_THROW_IF(shared_node == nullptr, std::invalid_argument)
     902              :     //   << "shared_node requested but it is not registered in the graph,
     903              :     //   name:
     904              :     //   "
     905              :     //   << shared_node_str << " requested from " << lnode->getName();
     906              :     // NNTR_THROW_IF(shared_node->getType() != lnode->getType(),
     907              :     //               std::invalid_argument)
     908              :     //   << " shared_node and lnode type mismatch, source node type: "
     909              :     //   << shared_node->getType() << " depedent node type: " <<
     910              :     //   lnode->getType()
     911              :     //   << " depedent node name: " << lnode->getName();
     912              :     // NNTR_THROW_IF(!shared_node->isFinalized(), std::invalid_argument)
     913              :     //   << "shared node must be prior to the dependent node and it should
     914              :     //   be
     915              :     //   "
     916              :     //      "finalized beforehand, shared node name: "
     917              :     //   << shared_node_str << " dependent node name: " << lnode->getName();
     918              :     // auto num_weight = shared_node->getNumWeights();
     919              :     // shared_weight_names.reserve(num_weight);
     920              :     // for (auto i = 0u; i < num_weight; ++i) {
     921              :     //   shared_weight_names.emplace_back(shared_node->getWeightName(i));
     922              :     // }
     923              :     // auto &rc = node->getRunContext();
     924              : 
     925              :     /// @fixme tensor should be only shared if context explicitly requested
     926              :     /// to do so. This has to be added to the part of tensor spec, other
     927              :     /// wise it will break many things
     928              :     const auto &t_specs = init_context.getTensorsSpec();
     929          800 :     for (auto i = 0u; i < t_specs.size(); ++i) {
     930          488 :       shared_tensor_names.emplace_back(std::get<3>(t_specs.at(i)));
     931              :     }
     932              : 
     933              :     const auto &w_specs = init_context.getWeightsSpec();
     934         1960 :     for (auto i = 0u; i < w_specs.size(); ++i) {
     935         1648 :       shared_weight_names.emplace_back(std::get<8>(w_specs.at(i)));
     936              :     }
     937              :   }
     938              :   lnode->setDataType(init_context.getWeightDataType(),
     939              :                      init_context.getActivationDataType());
     940         4407 :   bool trainable = lnode->getTrainable();
     941         4407 :   if (exec_mode == ExecutionMode::INFERENCE)
     942              :     trainable = false;
     943              : 
     944         8814 :   auto context = ct_engine.getRegisteredContext(lnode->getComputeEngineType());
     945              : 
     946              :   auto ct_data = context->getContextData();
     947              : 
     948         4407 :   lnode->configureRunContext(
     949              :     // TODO: update weights spec for trainable based on layer trainable prop
     950         8814 :     tensor_manager->requestWeights(gnode, init_context.getWeightsSpec(),
     951              :                                    trainable, shared_weight_names),
     952              :     inputs, outputs,
     953         8814 :     tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(),
     954              :                                    trainable, shared_tensor_names),
     955              :     init_context.getLossScale(), ct_data);
     956              : 
     957         8814 :   return outputs;
     958         4410 : }
     959              : 
     960              : std::vector<Var_Grad *>
     961           28 : NetworkGraph::refinalizeContext(const std::shared_ptr<LayerNode> &lnode,
     962              :                                 const std::vector<Var_Grad *> &prev_inputs) {
     963           28 :   const GraphNode &gnode = *lnode.get();
     964              :   std::vector<TensorDim> input_dims;
     965           28 :   input_dims.reserve(prev_inputs.size());
     966           28 :   std::transform(prev_inputs.begin(), prev_inputs.end(),
     967              :                  std::back_inserter(input_dims),
     968              :                  [](const Var_Grad *vg) { return vg->getDim(); });
     969              : 
     970              :   /** refinalize the layer and get the final context */
     971           28 :   auto init_context = lnode->refinalize(input_dims);
     972           28 :   const auto &ct_engine = nntrainer::Engine::Global();
     973              : 
     974              :   /**
     975              :    * Request manager for either a pre-allocated output as input or a newly
     976              :    * allocated output. This is necessary for manager to know when this
     977              :    * output node is going to be used.
     978              :    */
     979              :   std::vector<std::string> input_names;
     980           28 :   input_names.reserve(prev_inputs.size());
     981           28 :   std::transform(
     982              :     prev_inputs.begin(), prev_inputs.end(), std::back_inserter(input_names),
     983           29 :     [](auto const &vg) -> const auto & { return vg->getName(); });
     984              :   const std::vector<Var_Grad *> &inputs = tensor_manager->requestInputs(
     985           28 :     gnode, init_context.getInputDimensions(), input_names);
     986              : 
     987              :   /** In-Place optimizations */
     988              :   /**
     989              :    * Request manager for either a pre-allocated input as output or a newly
     990              :    * allocated output. This is necessary for manager to know when this
     991              :    * output node is going to be used with in-place optimizations.
     992              :    */
     993           28 :   auto out_specs = init_context.getOutSpecs();
     994              :   /// @note try move inplace control to finalize
     995           28 :   bool shared_var = false, shared_grad = false;
     996           28 :   if (lnode->getInPlaceType() != InPlaceType::NONE) {
     997           14 :     setInplaceSharedMemoryConfigByLayer(lnode, shared_var, shared_grad);
     998           30 :     for (unsigned int i = 0; i < out_specs.size(); ++i) {
     999              :       auto &s = out_specs.at(i);
    1000           16 :       if (shared_var) {
    1001           16 :         s.variable_spec.request_type =
    1002              :           TensorSpecV2::RequestType::READ_ONLY_VIEW;
    1003           16 :         if (lnode->getType() == IdentityLayer::type) {
    1004            0 :           s.variable_spec.reference_name = inputs[i]->getName();
    1005           16 :         } else if (lnode->getInPlaceDirection() == InPlaceDirection::RIGHT) {
    1006            0 :           s.variable_spec.reference_name = inputs[1]->getName();
    1007              :         } else {
    1008           16 :           s.variable_spec.reference_name = inputs[0]->getName();
    1009              :         }
    1010              :       }
    1011           16 :       if (shared_grad && s.gradient_spec) {
    1012           10 :         s.gradient_spec->request_type =
    1013              :           TensorSpecV2::RequestType::READ_ONLY_VIEW;
    1014           10 :         if (lnode->getType() == IdentityLayer::type) {
    1015            0 :           s.gradient_spec->reference_name = inputs[i]->getGradientName();
    1016           10 :         } else if (lnode->getInPlaceDirection() == InPlaceDirection::RIGHT) {
    1017              :           // @note With binary inputs, inputs[0] represents the left input
    1018              :           // tensor while inputs[1] represents the right input tensor. As a
    1019              :           // result, if the in-place direction is set to right, the in-place
    1020              :           // memory is assigned to inputs[1].
    1021            0 :           s.gradient_spec->reference_name = inputs[1]->getGradientName();
    1022              :         } else {
    1023           10 :           s.gradient_spec->reference_name = inputs[0]->getGradientName();
    1024              :         }
    1025              :       }
    1026              :     }
    1027              :   }
    1028           28 :   if (lnode->requireLabel()) {
    1029            1 :     NNTR_THROW_IF(out_specs.size() != 1, std::invalid_argument)
    1030              :       << "out specification size must be 1 for label layer for now, "
    1031            0 :       << lnode->getName() << " out spec size: " << out_specs.size();
    1032            1 :     NNTR_THROW_IF(out_specs[0].gradient_spec == nullptr, std::invalid_argument)
    1033            0 :       << "label space does not exist for " << lnode->getName();
    1034            1 :     out_specs[0].gradient_spec->request_type =
    1035              :       TensorSpecV2::RequestType::PLACEHOLDER;
    1036              :   }
    1037              : 
    1038              :   /// @note below needs to be enabled only for inference mode, but need
    1039              :   /// decision if we are going to separate inference initialization from
    1040              :   /// train initialization this might not worth optimize because in general
    1041              :   /// output of a neuralnet is very small
    1042           28 :   if (lnode->getOutputConnections().size() == 0u) {
    1043              :     std::for_each(out_specs.begin(), out_specs.end(),
    1044              :                   [this](VarGradSpecV2 &spec) {
    1045            1 :                     spec.variable_spec.additional_exec_order.push_back(
    1046            1 :                       std::get<0>(forward_iter_end->getExecutionOrder()));
    1047              :                   });
    1048              :   }
    1049              : 
    1050           56 :   if (lnode->getType() == RNNCellLayer::type or
    1051           56 :       lnode->getType() == LSTMCellLayer::type or
    1052           56 :       lnode->getType() == GRUCellLayer::type) {
    1053              :     std::for_each(out_specs.begin(), out_specs.end(), [](VarGradSpecV2 &spec) {
    1054            0 :       spec.variable_spec.ls = TensorLifespan::FORWARD_GRAD_LIFESPAN;
    1055              :     });
    1056              :   }
    1057              : 
    1058              :   const std::vector<Var_Grad *> &outputs = tensor_manager->requestTensors(
    1059           28 :     out_specs, Manager::TensorGroupType::OUTPUT, lnode->getExecutionOrder(),
    1060           56 :     lnode->getName());
    1061              : 
    1062              :   /** create shared weight names if requested */
    1063              :   std::vector<std::string> shared_weight_names;
    1064              :   std::vector<std::string> shared_tensor_names;
    1065           28 :   if (auto shared_node_str = lnode->getSharedFrom(); !shared_node_str.empty()) {
    1066              :     /// @note below is commented but kept from quick fix to be referenced
    1067              :     /// for later(#1707)
    1068              :     // auto shared_node = getLayerNode(shared_node_str).get();
    1069              :     // NNTR_THROW_IF(shared_node == nullptr, std::invalid_argument)
    1070              :     //   << "shared_node requested but it is not registered in the graph,
    1071              :     //   name:
    1072              :     //   "
    1073              :     //   << shared_node_str << " requested from " << lnode->getName();
    1074              :     // NNTR_THROW_IF(shared_node->getType() != lnode->getType(),
    1075              :     //               std::invalid_argument)
    1076              :     //   << " shared_node and lnode type mismatch, source node type: "
    1077              :     //   << shared_node->getType() << " depedent node type: " <<
    1078              :     //   lnode->getType()
    1079              :     //   << " depedent node name: " << lnode->getName();
    1080              :     // NNTR_THROW_IF(!shared_node->isFinalized(), std::invalid_argument)
    1081              :     //   << "shared node must be prior to the dependent node and it should
    1082              :     //   be
    1083              :     //   "
    1084              :     //      "finalized beforehand, shared node name: "
    1085              :     //   << shared_node_str << " dependent node name: " << lnode->getName();
    1086              :     // auto num_weight = shared_node->getNumWeights();
    1087              :     // shared_weight_names.reserve(num_weight);
    1088              :     // for (auto i = 0u; i < num_weight; ++i) {
    1089              :     //   shared_weight_names.emplace_back(shared_node->getWeightName(i));
    1090              :     // }
    1091              :     // auto &rc = node->getRunContext();
    1092              : 
    1093              :     /// @fixme tensor should be only shared if context explicitly requested
    1094              :     /// to do so. This has to be added to the part of tensor spec, other
    1095              :     /// wise it will break many things
    1096              :     const auto &t_specs = init_context.getTensorsSpec();
    1097            0 :     for (auto i = 0u; i < t_specs.size(); ++i) {
    1098            0 :       shared_tensor_names.emplace_back(std::get<3>(t_specs.at(i)));
    1099              :     }
    1100              : 
    1101              :     const auto &w_specs = init_context.getWeightsSpec();
    1102            0 :     for (auto i = 0u; i < w_specs.size(); ++i) {
    1103            0 :       shared_weight_names.emplace_back(std::get<8>(w_specs.at(i)));
    1104              :     }
    1105              :   }
    1106              : 
    1107           28 :   auto weights = lnode->getRunContext().getWeights();
    1108              : 
    1109           56 :   auto context = ct_engine.getRegisteredContext(lnode->getComputeEngineType());
    1110              : 
    1111              :   auto ct_data = context->getContextData();
    1112              : 
    1113           28 :   lnode->configureRunContext(
    1114              :     // TODO: update weights spec for trainable based on layer trainable prop
    1115              :     weights, inputs, outputs,
    1116           56 :     tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(),
    1117           28 :                                    lnode->getTrainable(), shared_tensor_names),
    1118              :     init_context.getLossScale(), ct_data);
    1119              : 
    1120           56 :   return outputs;
    1121           28 : }
    1122              : 
    1123              : #ifdef ENABLE_TEST
    1124              : 
    1125              : std::map<std::string, std::vector<unsigned int>>
    1126           32 : NetworkGraph::getLayerExecutionOrders(const std::shared_ptr<LayerNode> &lnode) {
    1127           32 :   const auto &init_context = lnode->getInitContext();
    1128           32 :   auto out_specs = init_context.getOutSpecs();
    1129           32 :   auto weight_specs = init_context.getWeightsSpec();
    1130           32 :   auto tensor_specs = init_context.getTensorsSpec();
    1131              : 
    1132              :   std::map<std::string, std::vector<unsigned int>> exec_orders;
    1133              : 
    1134           64 :   for (auto &spec : out_specs) {
    1135           64 :     const auto &name = lnode->getName() + ":" + spec.variable_spec.name;
    1136           32 :     auto orders = tensor_manager->getTensorExecutionOrders(name, false);
    1137           64 :     exec_orders.insert({name, orders});
    1138              :     try {
    1139              :       auto orders_grad =
    1140           32 :         tensor_manager->getTensorExecutionOrders(name + ":grad", false);
    1141           32 :       exec_orders.insert({name + ":grad", orders_grad});
    1142           32 :     } catch (const std::exception &e) {
    1143            0 :       ml_logi("Cannot find grad tensor for %s:grad", name.c_str());
    1144              :       continue;
    1145            0 :     }
    1146           32 :   }
    1147              : 
    1148           56 :   for (auto &spec : weight_specs) {
    1149              :     const auto &name = std::get<const std::string>(spec);
    1150           24 :     auto orders = tensor_manager->getTensorExecutionOrders(name, true);
    1151           48 :     exec_orders.insert({name, orders});
    1152              :     try {
    1153              :       auto orders_grad =
    1154           24 :         tensor_manager->getTensorExecutionOrders(name + ":grad", false);
    1155           18 :       exec_orders.insert({name + ":grad", orders_grad});
    1156           24 :     } catch (const std::exception &e) {
    1157            6 :       ml_logi("Cannot find grad tensor for %s:grad", name.c_str());
    1158              :       continue;
    1159            6 :     }
    1160           24 :   }
    1161              : 
    1162           32 :   for (auto &spec : tensor_specs) {
    1163              :     const auto &name = std::get<const std::string>(spec);
    1164            0 :     auto orders = tensor_manager->getTensorExecutionOrders(name, false);
    1165            0 :     exec_orders.insert({name, orders});
    1166              :     try {
    1167              :       auto orders_grad =
    1168            0 :         tensor_manager->getTensorExecutionOrders(name + ":grad", false);
    1169            0 :       exec_orders.insert({name + ":grad", orders_grad});
    1170            0 :     } catch (const std::exception &e) {
    1171            0 :       ml_logi("Cannot find grad tensor for %s:grad", name.c_str());
    1172              :       continue;
    1173            0 :     }
    1174            0 :   }
    1175              : 
    1176           32 :   return exec_orders;
    1177           32 : }
    1178              : 
    1179              : #endif // ENABLE_TEST
    1180              : 
    1181          622 : int NetworkGraph::initialize(ExecutionMode mode,
    1182              :                              const std::vector<Connection> &model_input_names,
    1183              :                              const std::vector<Connection> &model_label_names) {
    1184          622 :   exec_mode = mode;
    1185              :   tensor_manager->setExecutionMode(mode);
    1186              :   /**
    1187              :    * this contains the map from node name to its input tensor names
    1188              :    * @note: these input tensors have already been allocated
    1189              :    */
    1190              :   std::unordered_map<std::string, std::vector<Var_Grad *>> input_map;
    1191              : 
    1192              :   /** check if the given config of node is of input node */
    1193              :   auto is_input_node = [](const LayerNode *node) -> bool {
    1194         9321 :     return node->getInputConnections().empty();
    1195              :   };
    1196              : 
    1197         4408 :   for (unsigned int idx = 0; idx < graph.size(); ++idx) {
    1198              :     std::vector<Var_Grad *> inputs = {};
    1199         4408 :     auto const &lnode = getSortedLayerNode(idx);
    1200         8816 :     if (profile_keys.find(lnode->getType()) == profile_keys.end()) {
    1201              :       int event_key = 0;
    1202              :       PROFILE_TIME_REGISTER_EVENT(event_key, lnode->getType());
    1203         5870 :       profile_keys[lnode->getType()] = event_key;
    1204              :     }
    1205              : 
    1206              :     /**
    1207              :      * Set input dimension for all the layers.
    1208              :      * For input layer, as input dimension is known, set input tensor.
    1209              :      */
    1210         4408 :     if (!is_input_node(lnode.get())) {
    1211         6870 :       if (input_map.find(lnode->getName()) == input_map.end())
    1212            0 :         throw std::runtime_error("Cannot find input buffers for the node");
    1213        10308 :       inputs = input_map.at(lnode->getName());
    1214              :     }
    1215              : 
    1216              :     /**
    1217              :      * Initialize all the layers, allocate output tensors for each layer
    1218              :      * init2and add optimizer related weights for the layer
    1219              :      */
    1220         4408 :     const std::vector<Var_Grad *> &outputs = finalizeContext(lnode, inputs);
    1221              : 
    1222              :     /** no need to update input_map for the last layer */
    1223         4405 :     if (idx == graph.size() - 1)
    1224              :       break;
    1225              : 
    1226         8348 :     for (auto i = 0u, num_node = lnode->getNumOutputConnections(); i < num_node;
    1227              :          ++i) {
    1228         4562 :       auto conn = lnode->getOutputConnection(i);
    1229         4562 :       if (!conn) {
    1230           16 :         ml_logi("out connection not defined for  %s, %u",
    1231              :                 lnode->getName().c_str(), i);
    1232            8 :         continue;
    1233              :       }
    1234              : 
    1235         4554 :       auto sink_node = getLayerNode(conn->getName());
    1236              :       [[maybe_unused]] auto [it, b] =
    1237         9108 :         input_map.try_emplace({sink_node->getName(), {}});
    1238              : 
    1239         9108 :       NNTR_THROW_IF(sink_node->getInputConnectionName(conn->getIndex()) !=
    1240              :                       lnode->getName(),
    1241              :                     std::invalid_argument)
    1242            0 :         << "node pair does not match between " << lnode->getName() << ' '
    1243            0 :         << sink_node->getName();
    1244              : 
    1245         4554 :       auto &sink_tensors = it->second;
    1246         4554 :       sink_tensors.resize(sink_node->getNumInputConnections());
    1247         4554 :       sink_tensors[conn->getIndex()] = outputs[i];
    1248              :     }
    1249         8813 :   }
    1250              : 
    1251         5024 :   for (unsigned int idx = 0; idx < graph.size(); ++idx) {
    1252         4405 :     auto const &lnode = getSortedLayerNode(idx);
    1253         4405 :     auto &rc = lnode->getRunContext();
    1254              :     auto first_grad_access = std::get<1>(lnode->getExecutionOrder());
    1255              :     auto last_grad_access = std::get<3>(lnode->getExecutionOrder());
    1256         9247 :     for (unsigned i = 0; i < rc.getNumWeights(); ++i) {
    1257         4842 :       if (!rc.weightHasGradient(i)) {
    1258              :         /// @todo this is duck taping that MUST BE REMOVED. We will need to
    1259              :         /// have, is weight first access kind of concept.
    1260          510 :         if (tensor_manager->isFirstAccess(
    1261          510 :               rc.getWeight(i).getName(),
    1262              :               std::get<0>(lnode->getExecutionOrder()), true)) {
    1263          294 :           rc.getWeightObject(i).setAsGradientFirstAccess();
    1264              :         }
    1265          510 :         if (tensor_manager->isLastAccess(rc.getWeight(i).getName(),
    1266              :                                          last_grad_access, true)) {
    1267          286 :           rc.getWeightObject(i).setAsGradientLastAccess();
    1268              :         }
    1269              :       } else {
    1270         4332 :         if (tensor_manager->isFirstAccess(rc.getWeightGrad(i).getName(),
    1271              :                                           first_grad_access)) {
    1272         3724 :           rc.getWeightObject(i).setAsGradientFirstAccess();
    1273              :         }
    1274              :         /**
    1275              :          * if the gradient is to be clipped by global norm, then the last
    1276              :          * access is by clipping itself. However, as clipping is not a layer
    1277              :          * and does not contain any weights, such weights never get assigned
    1278              :          * gradient_last_access. This is a quick hotfix.
    1279              :          * TODO: make an independent clipping layer which will execute at
    1280              :          * the end, and will share ownership of weights which it will clip.
    1281              :          * This will remove this hot fix, and also remove the checks of if
    1282              :          * weights require clipping.
    1283              :          */
    1284         4332 :         if (tensor_manager->isLastAccess(rc.getWeightGrad(i).getName(),
    1285         4956 :                                          last_grad_access) ||
    1286         1248 :             ((rc.isGradientClipByGlobalNorm(i) || rc.isMixedPrecision(i)) &&
    1287           16 :              tensor_manager->isSecondLastAccess(rc.getWeightGrad(i).getName(),
    1288              :                                                 last_grad_access))) {
    1289         3724 :           rc.getWeightObject(i).setAsGradientLastAccess();
    1290              :         }
    1291              :       }
    1292              :     }
    1293              :   }
    1294              : 
    1295              :   /**** identify model input / output to be set externally later ****/
    1296          970 :   auto identify_as_model_input = [this](LayerNode *node) {
    1297          970 :     auto num_input = node->getNumInputs();
    1298          970 :     NNTR_THROW_IF(num_input != 1, std::invalid_argument)
    1299              :       << "Input layer is supposed to have exactly one input, but more then "
    1300              :          "one input detected, num inputs: "
    1301              :       << num_input;
    1302              : 
    1303          970 :     input_list.push_back(node->getInput(0).getName());
    1304          970 :     input_dims_.push_back(node->getInputDimensions()[0]);
    1305          970 :   };
    1306              : 
    1307         4489 :   auto is_label_node = [](LayerNode *node) { return node->requireLabel(); };
    1308              : 
    1309          630 :   auto identify_as_model_label = [this](LayerNode *node) {
    1310              :     /// @todo change this as lnode->getNumLabels of sorts
    1311          630 :     auto num_label = node->getNumOutputs();
    1312          630 :     NNTR_THROW_IF(!node->getOutputConnections().empty(), std::invalid_argument)
    1313              :       << "label layer is supposed to be a leaf for now";
    1314          630 :     NNTR_THROW_IF(num_label != 1, std::invalid_argument)
    1315              :       << "label layer is supposed to have exactly one label, but more then "
    1316              :          "one label detected, num labels: "
    1317              :       << num_label;
    1318              : 
    1319              :     /// @todo implement and use getLabel(0) instead.
    1320          630 :     output_list.push_back(node->getOutput(0).getName());
    1321          630 :     label_list.push_back(node->getOutputGrad(0).getName());
    1322          630 :     label_dims_.push_back(node->getOutputDimensions()[0]);
    1323          630 :   };
    1324              : 
    1325         1238 :   auto identify_external_tensors = [this](const std::vector<Connection> &conns,
    1326              :                                           auto &&pred, auto &&identify) {
    1327         1238 :     if (conns.empty()) {
    1328         6568 :       for (unsigned int i = 0; i < graph.size(); ++i) {
    1329        11116 :         auto lnode = getSortedLayerNode(i).get();
    1330         5558 :         if (!pred(lnode)) {
    1331         4550 :           continue;
    1332              :         }
    1333              :         /// when name is empty, we identify everything as the node, all of
    1334              :         /// them must be having identical dimensions
    1335         1008 :         identify(lnode);
    1336              :       }
    1337              :     } else {
    1338          820 :       for (auto &conn : conns) {
    1339         1184 :         auto lnode = getLayerNode(conn.getName()).get();
    1340          592 :         NNTR_THROW_IF(!pred(lnode), std::invalid_argument)
    1341              :           << "given node is not of that kind, name: " << conn.getName();
    1342          592 :         identify(lnode);
    1343              :       }
    1344              :       unsigned int num_node_of_kind = 0;
    1345         3480 :       for (unsigned int i = 0; i < graph.size(); ++i) {
    1346         6504 :         auto lnode = getSortedLayerNode(i).get();
    1347         3252 :         if (!pred(lnode)) {
    1348         2660 :           continue;
    1349              :         }
    1350          592 :         num_node_of_kind++;
    1351              :       }
    1352          228 :       NNTR_THROW_IF(num_node_of_kind != conns.size(), std::invalid_argument)
    1353              :         << "conns given but there are not identified node of the kind, num "
    1354              :            "node of kind: "
    1355              :         << num_node_of_kind << " identifier size: " << conns.size();
    1356              :     }
    1357         1857 :   };
    1358              : 
    1359          619 :   identify_external_tensors(model_input_names, is_input_node,
    1360              :                             identify_as_model_input);
    1361          619 :   identify_external_tensors(model_label_names, is_label_node,
    1362              :                             identify_as_model_label);
    1363              :   /** mark the nodes which will be backwarded during the graph operation */
    1364              :   try {
    1365          619 :     markNodesForBackwarding();
    1366          619 :     backward_iter_end = computeBackwardEnd();
    1367            0 :   } catch (std::exception &e) {
    1368            0 :     ml_loge("Backwarding required from layer which doesn't support "
    1369              :             "backwarding: %s",
    1370              :             e.what());
    1371              :     return ML_ERROR_INVALID_PARAMETER;
    1372            0 :   }
    1373              : 
    1374              :   /** select weights which would require clipping of the gradients by global
    1375              :    * norm if any */
    1376         1241 :   lazy_weights = tensor_manager->getWeights([](const Weight *w) {
    1377         4842 :     return w->hasGradient() && w->isGradientLastAccess() &&
    1378              :            (w->isGradientClipByGlobalNorm() || w->isMixedPrecision());
    1379          619 :   });
    1380              : 
    1381          619 :   is_clip_grad = false;
    1382          619 :   for (auto w : lazy_weights) {
    1383            4 :     if (w->isGradientClipByGlobalNorm()) {
    1384            4 :       is_clip_grad = true;
    1385            4 :       break;
    1386              :     }
    1387              :   }
    1388              :   return ML_ERROR_NONE;
    1389              : }
    1390              : 
    1391            1 : int NetworkGraph::reinitialize(
    1392              :   const std::vector<Connection> &model_input_names,
    1393              :   const std::vector<Connection> &model_label_names) {
    1394              :   input_dims_.clear();
    1395              :   label_dims_.clear();
    1396            1 :   tensor_manager->reinitialize();
    1397              : 
    1398              :   /**
    1399              :    * this contains the map from node name to its input tensor names
    1400              :    * @note: these input tensors have already been allocated
    1401              :    */
    1402              :   std::unordered_map<std::string, std::vector<Var_Grad *>> input_map;
    1403              : 
    1404              :   /** check if the given config of node is of input node */
    1405              :   auto is_input_node = [](const LayerNode *node) -> bool {
    1406           56 :     return node->getInputConnections().empty();
    1407              :   };
    1408              : 
    1409           28 :   for (unsigned int idx = 0; idx < graph.size(); ++idx) {
    1410              :     std::vector<Var_Grad *> inputs = {};
    1411           28 :     auto const &lnode = getSortedLayerNode(idx);
    1412              : 
    1413           56 :     if (profile_keys.find(lnode->getType()) == profile_keys.end()) {
    1414              :       int event_key = 0;
    1415              :       PROFILE_TIME_REGISTER_EVENT(event_key, lnode->getType());
    1416            0 :       profile_keys[lnode->getType()] = event_key;
    1417              :     }
    1418              : 
    1419              :     /**
    1420              :      * Set input dimension for all the layers.
    1421              :      * For input layer, as input dimension is known, set input tensor.
    1422              :      */
    1423           28 :     if (!is_input_node(lnode.get())) {
    1424           54 :       if (input_map.find(lnode->getName()) == input_map.end())
    1425            0 :         throw std::runtime_error("Cannot find input buffers for the node");
    1426           81 :       inputs = input_map.at(lnode->getName());
    1427              :     }
    1428              : 
    1429              :     /**
    1430              :      * Reinitialize all the layers, allocate output tensors for each layer
    1431              :      * init2and add optimizer related weights for the layer
    1432              :      */
    1433           28 :     const std::vector<Var_Grad *> &outputs = refinalizeContext(lnode, inputs);
    1434              : 
    1435              :     /** no need to update input_map for the last layer */
    1436           28 :     if (idx == graph.size() - 1)
    1437              :       break;
    1438              : 
    1439           56 :     for (auto i = 0u, num_node = lnode->getNumOutputConnections(); i < num_node;
    1440              :          ++i) {
    1441           29 :       auto conn = lnode->getOutputConnection(i);
    1442           29 :       if (!conn) {
    1443            0 :         ml_logi("out connection not defined for  %s, %u",
    1444              :                 lnode->getName().c_str(), i);
    1445            0 :         continue;
    1446              :       }
    1447              : 
    1448           29 :       auto sink_node = getLayerNode(conn->getName());
    1449              :       [[maybe_unused]] auto [it, b] =
    1450           58 :         input_map.try_emplace({sink_node->getName(), {}});
    1451              : 
    1452           58 :       NNTR_THROW_IF(sink_node->getInputConnectionName(conn->getIndex()) !=
    1453              :                       lnode->getName(),
    1454              :                     std::invalid_argument)
    1455            0 :         << "node pair does not match between " << lnode->getName() << ' '
    1456            0 :         << sink_node->getName();
    1457              : 
    1458           29 :       auto &sink_tensors = it->second;
    1459           29 :       sink_tensors.resize(sink_node->getNumInputConnections());
    1460           29 :       sink_tensors[conn->getIndex()] = outputs[i];
    1461              :     }
    1462           56 :   }
    1463              : 
    1464           29 :   for (unsigned int idx = 0; idx < graph.size(); ++idx) {
    1465           28 :     auto const &lnode = getSortedLayerNode(idx);
    1466           28 :     auto &rc = lnode->getRunContext();
    1467              :     auto first_grad_access = std::get<1>(lnode->getExecutionOrder());
    1468              :     auto last_grad_access = std::get<3>(lnode->getExecutionOrder());
    1469           46 :     for (unsigned i = 0; i < rc.getNumWeights(); ++i) {
    1470           18 :       if (!rc.weightHasGradient(i)) {
    1471              :         /// @todo this is duck taping that MUST BE REMOVED. We will need to
    1472              :         /// have, is weight first access kind of concept.
    1473           18 :         if (tensor_manager->isFirstAccess(
    1474           18 :               rc.getWeight(i).getName(),
    1475              :               std::get<0>(lnode->getExecutionOrder()), true)) {
    1476           18 :           rc.getWeightObject(i).setAsGradientFirstAccess();
    1477              :         }
    1478           18 :         if (tensor_manager->isLastAccess(rc.getWeight(i).getName(),
    1479              :                                          last_grad_access, true)) {
    1480           18 :           rc.getWeightObject(i).setAsGradientLastAccess();
    1481              :         }
    1482              :       } else {
    1483            0 :         if (tensor_manager->isFirstAccess(rc.getWeightGrad(i).getName(),
    1484              :                                           first_grad_access)) {
    1485            0 :           rc.getWeightObject(i).setAsGradientFirstAccess();
    1486              :         }
    1487              :         /**
    1488              :          * if the gradient is to be clipped by global norm, then the last
    1489              :          * access is by clipping itself. However, as clipping is not a layer
    1490              :          * and does not contain any weights, such weights never get assigned
    1491              :          * gradient_last_access. This is a quick hotfix.
    1492              :          * TODO: make an independent clipping layer which will execute at
    1493              :          * the end, and will share ownership of weights which it will clip.
    1494              :          * This will remove this hot fix, and also remove the checks of if
    1495              :          * weights require clipping.
    1496              :          */
    1497            0 :         if (tensor_manager->isLastAccess(rc.getWeightGrad(i).getName(),
    1498            0 :                                          last_grad_access) ||
    1499            0 :             (rc.isGradientClipByGlobalNorm(i) &&
    1500            0 :              tensor_manager->isSecondLastAccess(rc.getWeightGrad(i).getName(),
    1501              :                                                 last_grad_access))) {
    1502            0 :           rc.getWeightObject(i).setAsGradientLastAccess();
    1503              :         }
    1504              :       }
    1505              :     }
    1506              :   }
    1507              :   /**** identify model input / output to be set externally later ****/
    1508            1 :   auto identify_as_model_input = [this](LayerNode *node) {
    1509            1 :     auto num_input = node->getNumInputs();
    1510            1 :     NNTR_THROW_IF(num_input != 1, std::invalid_argument)
    1511              :       << "Input layer is supposed to have exactly one input, but more then "
    1512              :          "one input detected, num inputs: "
    1513              :       << num_input;
    1514              : 
    1515              :     // input_list.push_back(node->getInput(0).getName());
    1516            1 :     input_dims_.push_back(node->getInputDimensions()[0]);
    1517            1 :   };
    1518              : 
    1519           28 :   auto is_label_node = [](LayerNode *node) { return node->requireLabel(); };
    1520              : 
    1521            1 :   auto identify_as_model_label = [this](LayerNode *node) {
    1522              :     /// @todo change this as lnode->getNumLabels of sorts
    1523            1 :     auto num_label = node->getNumOutputs();
    1524            1 :     NNTR_THROW_IF(!node->getOutputConnections().empty(), std::invalid_argument)
    1525              :       << "label layer is supposed to be a leaf for now";
    1526            1 :     NNTR_THROW_IF(num_label != 1, std::invalid_argument)
    1527              :       << "label layer is supposed to have exactly one label, but more then "
    1528              :          "one label detected, num labels: "
    1529              :       << num_label;
    1530              : 
    1531              :     /// @todo implement and use getLabel(0) instead.
    1532              :     // output_list.push_back(node->getOutput(0).getName());
    1533              :     // label_list.push_back(node->getOutputGrad(0).getName());
    1534            1 :     label_dims_.push_back(node->getOutputDimensions()[0]);
    1535            1 :   };
    1536              : 
    1537            2 :   auto identify_external_tensors = [this](const std::vector<Connection> &conns,
    1538              :                                           auto &&pred, auto &&identify) {
    1539            2 :     if (conns.empty()) {
    1540           58 :       for (unsigned int i = 0; i < graph.size(); ++i) {
    1541          112 :         auto lnode = getSortedLayerNode(i).get();
    1542           56 :         if (!pred(lnode)) {
    1543           54 :           continue;
    1544              :         }
    1545              :         /// when name is empty, we identify everything as the node, all of
    1546              :         /// them must be having identical dimensions
    1547            2 :         identify(lnode);
    1548              :       }
    1549              :     } else {
    1550            0 :       for (auto &conn : conns) {
    1551            0 :         auto lnode = getLayerNode(conn.getName()).get();
    1552            0 :         NNTR_THROW_IF(!pred(lnode), std::invalid_argument)
    1553              :           << "given node is not of that kind, name: " << conn.getName();
    1554            0 :         identify(lnode);
    1555              :       }
    1556              :       unsigned int num_node_of_kind = 0;
    1557            0 :       for (unsigned int i = 0; i < graph.size(); ++i) {
    1558            0 :         auto lnode = getSortedLayerNode(i).get();
    1559            0 :         if (!pred(lnode)) {
    1560            0 :           continue;
    1561              :         }
    1562            0 :         num_node_of_kind++;
    1563              :       }
    1564            0 :       NNTR_THROW_IF(num_node_of_kind != conns.size(), std::invalid_argument)
    1565              :         << "conns given but there are not identified node of the kind, num "
    1566              :            "node of kind: "
    1567              :         << num_node_of_kind << " identifier size: " << conns.size();
    1568              :     }
    1569            3 :   };
    1570              : 
    1571            1 :   identify_external_tensors(model_input_names, is_input_node,
    1572              :                             identify_as_model_input);
    1573            1 :   identify_external_tensors(model_label_names, is_label_node,
    1574              :                             identify_as_model_label);
    1575              : 
    1576            1 :   return ML_ERROR_NONE;
    1577              : }
    1578              : 
    1579        14518 : void NetworkGraph::setExternalTensors(const std::vector<Tensor> &data,
    1580              :                                       const std::vector<std::string> names) {
    1581              :   /// feed or clear label
    1582        29853 :   for (unsigned int idx = 0; idx < names.size(); idx++) {
    1583        15335 :     if (data.empty())
    1584         4725 :       tensor_manager->fillPlaceholder(names[idx], Tensor());
    1585        13760 :     else if (data.size() == 1)
    1586              :       tensor_manager->fillPlaceholder(names[idx], data[0]);
    1587              :     else
    1588              :       tensor_manager->fillPlaceholder(names[idx], data[idx]);
    1589              :   }
    1590        14518 : }
    1591              : 
    1592         7259 : void NetworkGraph::setInputsLabels(const std::vector<Tensor> &inputs,
    1593              :                                    const std::vector<Tensor> &labels) {
    1594         7259 :   NNTR_THROW_IF(labels.size() > 1 && labels.size() != label_list.size(),
    1595              :                 std::invalid_argument)
    1596              :     << "label size does not match with the network requirements"
    1597              :     << " label size: " << labels.size()
    1598              :     << " requirements size: " << label_list.size();
    1599              : 
    1600         7259 :   NNTR_THROW_IF(inputs.size() > 1 && inputs.size() != input_list.size(),
    1601              :                 std::invalid_argument)
    1602              :     << "input size does not match with the network requirements"
    1603              :     << " input size: " << inputs.size()
    1604              :     << " requirements size: " << input_list.size();
    1605              : 
    1606         7259 :   setExternalTensors(inputs, input_list);
    1607         7259 :   setExternalTensors(labels, label_list);
    1608         7259 : }
    1609              : 
    1610         1148 : void NetworkGraph::setInputsLabels(sharedConstTensors &inputs,
    1611              :                                    sharedConstTensors &labels) {
    1612              :   std::vector<Tensor> ins;
    1613              :   std::transform(
    1614              :     inputs.begin(), inputs.end(), std::back_inserter(ins),
    1615              :     [](auto const &val) -> const auto & { return *val.get(); });
    1616              : 
    1617              :   std::vector<Tensor> labs;
    1618              :   std::transform(
    1619              :     labels.begin(), labels.end(), std::back_inserter(labs),
    1620              :     [](auto const &val) -> const auto & { return *val.get(); });
    1621              : 
    1622         1148 :   setInputsLabels(ins, labs);
    1623         1148 : }
    1624              : 
    1625           23 : std::vector<Tensor> NetworkGraph::getOutputTensors() const {
    1626              :   std::vector<Tensor> output_tensors;
    1627           23 :   output_tensors.reserve(output_list.size());
    1628              : 
    1629           46 :   for (auto const &name : output_list)
    1630           23 :     output_tensors.push_back(*tensor_manager->getTensor(name));
    1631              : 
    1632           23 :   return output_tensors;
    1633            0 : }
    1634              : 
    1635         5400 : void NetworkGraph::flushCache() { tensor_manager->flushCache(); }
    1636              : 
    1637        94179 : void NetworkGraph::flushCacheExcept(unsigned int order) {
    1638        94179 :   tensor_manager->flushCacheExcept(order);
    1639        94179 : }
    1640              : 
    1641            0 : void NetworkGraph::LoadTensors(unsigned int order, unsigned int lookahead) {
    1642            0 :   tensor_manager->LoadTensors(order, lookahead);
    1643            0 : }
    1644              : 
    1645            0 : bool NetworkGraph::checkLoadComplete(unsigned int order) {
    1646            0 :   return tensor_manager->checkLoadComplete(order);
    1647              : }
    1648              : 
    1649            0 : bool NetworkGraph::inActive(unsigned int order) {
    1650            0 :   return tensor_manager->inActive(order);
    1651              : }
    1652              : 
    1653            0 : bool NetworkGraph::checkUnloadComplete(unsigned int order) {
    1654            0 :   return tensor_manager->checkUnloadComplete(order);
    1655              : }
    1656              : 
    1657            0 : void NetworkGraph::UnloadTensors(unsigned int order) {
    1658            0 :   tensor_manager->UnloadTensors(order);
    1659            0 : }
    1660              : 
    1661          616 : void NetworkGraph::requestOptimizerVariable(
    1662              :   std::function<std::vector<TensorDim>(const TensorDim &)> cb,
    1663              :   bool request_only_trainable) {
    1664         6064 :   for (auto const &w : tensor_manager->getWeights()) {
    1665         4832 :     if (w->isGradientLastAccess() && w->hasGradient()) {
    1666         7444 :       const TensorDim &dim = w->getDim();
    1667              :       std::vector<TensorDim> dims = cb(dim);
    1668        11166 :       w->setOptimizerVariables(tensor_manager->requestWeightOptimizerVariables(
    1669         3722 :         dims, w->getName(), ":opt", TensorLifespan::MAX_LIFESPAN,
    1670              :         w->isGradientClipByGlobalNorm(), w->isMixedPrecision(),
    1671              :         Initializer::ZEROS));
    1672         3722 :     }
    1673          616 :   }
    1674          616 : }
    1675              : 
    1676            0 : void NetworkGraph::resetLossScale(float scale) {
    1677            0 :   loss_scale = scale;
    1678            0 :   for (auto iter = cbegin(); iter != cend(); iter++) {
    1679              :     auto &ln = *iter;
    1680            0 :     ln->getRunContext().setLossScale(scale);
    1681              :   }
    1682            0 : }
    1683              : 
    1684              : } /* namespace nntrainer */

Generated by: LCOV version 2.0-1