LCOV - code coverage report
Current view: top level - nntrainer/models - neuralnet.cpp (source / functions)
Test:      coverage_filtered.info
Test Date: 2025-12-14 20:38:17

            Coverage   Total   Hit
Lines:        66.7 %     765   510
Functions:    78.6 %      70    55

            Line data    Source code
       1              : /**
       2              :  * Copyright (C) 2019 Samsung Electronics Co., Ltd. All Rights Reserved.
       3              :  *
       4              :  * Licensed under the Apache License, Version 2.0 (the "License");
       5              :  * you may not use this file except in compliance with the License.
       6              :  * You may obtain a copy of the License at
       7              :  *   http://www.apache.org/licenses/LICENSE-2.0
       8              :  * Unless required by applicable law or agreed to in writing, software
       9              :  * distributed under the License is distributed on an "AS IS" BASIS,
      10              :  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      11              :  * See the License for the specific language governing permissions and
      12              :  * limitations under the License.
      13              :  *
      14              :  *
      15              :  * @file        neuralnet.cpp
      16              :  * @date        04 December 2019
       17              :  * @brief       This is the Neural Network class
      18              :  * @see         https://github.com/nnstreamer/nntrainer
      19              :  * @author      Jijoong Moon <jijoong.moon@samsung.com>
      20              :  * @bug         No known bugs except for NYI items
      21              :  *
      22              :  */
      23              : 
      24              : #include "layer_context.h"
      25              : #include "model.h"
      26              : #include "model_common_properties.h"
      27              : #include <cmath>
      28              : #include <cstring>
      29              : #include <fstream>
      30              : #include <future>
      31              : #include <iomanip>
      32              : #include <sstream>
      33              : 
      34              : #include <activation_realizer.h>
      35              : #include <adamw.h>
      36              : #include <common_properties.h>
      37              : #include <databuffer.h>
      38              : #include <flatten_realizer.h>
      39              : #include <ini_interpreter.h>
      40              : #include <ini_wrapper.h>
      41              : #include <input_realizer.h>
      42              : #include <model_loader.h>
      43              : #include <multiout_realizer.h>
      44              : #include <neuralnet.h>
      45              : #include <nntrainer_error.h>
      46              : #include <nntrainer_log.h>
      47              : #include <node_exporter.h>
      48              : #include <optimizer_context.h>
      49              : #include <optional>
      50              : #include <previous_input_realizer.h>
      51              : #include <profiler.h>
      52              : #include <recurrent_realizer.h>
      53              : #include <remap_realizer.h>
      54              : #include <slice_realizer.h>
      55              : #include <util_func.h>
      56              : 
      57              : #ifdef ENABLE_TFLITE_INTERPRETER
      58              : #include <tflite_interpreter.h>
      59              : #endif
      60              : 
      61              : /**
      62              :  * @brief Internal enum values for nntrainer to summarize model accuracy & loss
      63              :  */
      64              : #define ML_TRAIN_SUMMARY_MODEL_TRAIN_LOSS 101
      65              : #define ML_TRAIN_SUMMARY_MODEL_VALID_LOSS 102
      66              : #define ML_TRAIN_SUMMARY_MODEL_VALID_ACCURACY 103
      67              : 
      68              : namespace nntrainer {
      69              : 
      70          835 : NeuralNetwork::NeuralNetwork() :
      71          835 :   model_props(props::LossType(), {}, {}, props::ClipGradByGlobalNorm(),
      72         1670 :               props::LossScale()),
      73         2505 :   model_flex_props(props::Epochs(), props::TrainingBatchSize(),
      74         1670 :                    props::SavePath(), props::ContinueTrain(),
      75         1670 :                    props::SaveBestPath(), props::MemoryOptimization(),
      76         2505 :                    props::Fsu(), props::FsuPath(), props::FsuLookahead(),
      77         1670 :                    props::TensorFormat(), props::ModelTensorDataType()),
      78              :   load_path(std::string()),
      79          835 :   epoch_idx(0),
      80          835 :   iter(0),
      81          835 :   loss(0.0f),
      82          835 :   data_buffers({nullptr, nullptr, nullptr}),
      83          835 :   initialized(false),
      84          835 :   compiled(false),
      85          835 :   loadedFromConfig(false),
      86          835 :   exec_mode(ExecutionMode::TRAIN),
      87         1670 :   ct_engine(&Engine::Global()) {}
      88              : 
      89            1 : NeuralNetwork::NeuralNetwork(const Engine *ct_engine_) :
      90            1 :   model_props(props::LossType(), {}, {}, props::ClipGradByGlobalNorm(),
      91            2 :               props::LossScale()),
      92            3 :   model_flex_props(props::Epochs(), props::TrainingBatchSize(),
      93            2 :                    props::SavePath(), props::ContinueTrain(),
      94            2 :                    props::SaveBestPath(), props::MemoryOptimization(),
      95            3 :                    props::Fsu(), props::FsuPath(), props::FsuLookahead(),
      96            2 :                    props::TensorFormat(), props::ModelTensorDataType()),
      97              :   load_path(std::string()),
      98            1 :   epoch_idx(0),
      99            1 :   iter(0),
     100            1 :   loss(0.0f),
     101            1 :   data_buffers({nullptr, nullptr, nullptr}),
     102            1 :   initialized(false),
     103            1 :   compiled(false),
     104            1 :   loadedFromConfig(false),
     105            1 :   exec_mode(ExecutionMode::TRAIN),
     106            2 :   ct_engine(ct_engine_) {}
     107              : 
     108          696 : int NeuralNetwork::loadFromConfig(const std::string &config) {
     109          696 :   if (loadedFromConfig == true) {
      110           30 :     ml_loge("cannot do loadFromConfig twice");
     111           30 :     return ML_ERROR_INVALID_PARAMETER;
     112              :   }
     113              : 
     114          666 :   ModelLoader loader(ct_engine);
     115          666 :   NeuralNetwork tempNet(*this);
     116              : 
     117          666 :   int status = loader.loadFromContext(tempNet);
     118          666 :   if (status != ML_ERROR_NONE) {
     119              :     return status;
     120              :   }
     121              : 
     122          666 :   status = loader.loadFromConfig(config, tempNet);
     123          666 :   if (status != ML_ERROR_NONE) {
     124              :     return status;
     125              :   }
     126              : 
     127          540 :   tempNet.loadedFromConfig = true;
     128          540 :   swap(tempNet, *this);
     129              : 
     130              :   return ML_ERROR_NONE;
     131          666 : }
     132              : 
     133            0 : unsigned int NeuralNetwork::getCurrentEpoch() {
     134              : #ifdef DEBUG
     135              :   ml_logd("[NNTrainer] Current epoch: %d", epoch_idx);
     136              : #endif
     137            0 :   return epoch_idx;
     138              : };
     139              : 
     140         1490 : void NeuralNetwork::setProperty(const std::vector<std::string> &values) {
     141         1490 :   auto left_props = loadProperties(values, model_props);
     142         1487 :   setTrainConfig(left_props);
     143         1487 : }
     144              : 
     145         1514 : void NeuralNetwork::setTrainConfig(const std::vector<std::string> &values) {
     146         1514 :   auto left_props = loadProperties(values, model_flex_props);
     147         1512 :   NNTR_THROW_IF(left_props.size(), std::invalid_argument)
     148              :     << "Model has unparsed properties, size: " << left_props.size()
     149              :     << " of first element: " << left_props.front();
     150         1511 : }
     151              : 
     152          697 : int NeuralNetwork::compile(ExecutionMode mode) {
     153              : 
     154          697 :   exec_mode = mode;
     155              : 
     156              :   std::string loss_type = std::get<props::LossType>(model_props).empty()
     157          697 :                             ? std::string()
     158          311 :                             : std::get<props::LossType>(model_props);
     159              : 
     160              :   auto &input_conn = std::get<std::vector<props::InputConnection>>(model_props);
      161              :   /// @note label layer might need to be treated in a similar way as well
     162              : 
      163              :   /// @todo compile NetworkGraph at construction instead of calling
      164              :   /// graph.compile(); NeuralNetwork keeps ownership of the list of layer
      165              :   /// nodes, which will be passed at compile time.
     166              : 
     167              :   std::vector<std::unique_ptr<GraphRealizer>> realizers;
     168              : 
     169          697 :   realizers.emplace_back(new PreviousInputRealizer(
     170         1394 :     std::vector<Connection>(input_conn.begin(), input_conn.end())));
     171          697 :   realizers.emplace_back(new MultioutRealizer());
     172          697 :   realizers.emplace_back(new FlattenRealizer());
     173          697 :   realizers.emplace_back(new ActivationRealizer());
     174              : 
     175         3475 :   for (auto &realizer : realizers) {
     176         2781 :     graph_representation = realizer->realize(graph_representation);
     177              :   }
     178              : 
     179          694 :   bool fsu = std::get<props::Fsu>(model_flex_props);
     180          694 :   const std::string fsu_path = std::get<props::FsuPath>(model_flex_props);
     181          694 :   unsigned int lookahead = std::get<props::FsuLookahead>(model_flex_props);
     182              : 
     183              :   const std::string tensor_format =
     184              :     to_string(std::get<props::TensorFormat>(model_flex_props));
     185              : 
     186              :   const std::string tensor_type =
     187              :     to_string(std::get<props::ModelTensorDataType>(model_flex_props));
     188              : 
     189              :   model_graph =
     190          694 :     NetworkGraph(fsu, mode, fsu_path, lookahead, tensor_format, tensor_type);
     191              : 
     192          694 :   model_graph.setMemoryOptimizations(
     193              :     std::get<props::MemoryOptimization>(model_flex_props));
     194         5027 :   for (auto &node : graph_representation) {
     195         4333 :     if (auto &prop = std::get<props::ClipGradByGlobalNorm>(model_props);
     196              :         !prop.empty()) {
     197            0 :       node->setProperty({"clip_grad_by_norm=" + to_string(prop)});
     198              :     }
     199              :     if (auto &prop = std::get<props::LossScale>(model_props); !prop.empty()) {
     200        12999 :       node->setProperty({"loss_scale=" + to_string(prop)});
     201              :     }
     202         8666 :     model_graph.addLayer(node);
     203              :   }
     204              : 
     205          694 :   int status = model_graph.compile(loss_type);
     206          694 :   NN_RETURN_STATUS();
     207              : 
     208          624 :   compiled = true;
     209              : 
     210          624 :   return status;
     211          697 : }
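
A usage sketch for the compile path above (illustrative only; the property keys "loss", "batch_size", and "epochs" are assumptions about nntrainer's model property names, and layer setup is elided):

// Hedged usage sketch of configure-then-compile; property keys are assumed.
#include <neuralnet.h>
#include <nntrainer_error.h>
#include <stdexcept>

void build_and_compile() {
  nntrainer::NeuralNetwork net;
  net.setProperty({"loss=cross", "batch_size=32", "epochs=10"});
  // ... layers are added to the graph representation elsewhere ...
  if (net.compile(ml::train::ExecutionMode::TRAIN) != ML_ERROR_NONE)
    throw std::runtime_error("compile failed");
}
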
     212              : 
     213          839 : int NeuralNetwork::initialize(ExecutionMode mode) {
     214              :   int status = ML_ERROR_NONE;
     215              : 
     216          839 :   if (mode != exec_mode) {
     217            0 :     if (mode == ExecutionMode::INFERENCE) {
     218            0 :       ml_logd("Execution mode mismatch : train mode @compile & inference mode "
     219              :               "@ initialize");
     220            0 :       exec_mode = mode;
     221              :     } else {
     222            0 :       NNTR_THROW_IF(exec_mode == ExecutionMode::TRAIN, std::invalid_argument)
     223              :         << "Execution mode mismatch : trying to train with compiled for "
     224              :            "inference";
     225              :     }
     226              :   }
     227              : 
     228          839 :   if (initialized) {
     229           81 :     ml_loge("Error: Initializing the model again");
     230           81 :     return ML_ERROR_NOT_SUPPORTED;
     231              :   }
     232              : 
     233          758 :   if (!compiled) {
     234          137 :     ml_loge("Error: Need to compile first");
     235          137 :     return ML_ERROR_NOT_SUPPORTED;
     236              :   }
     237              : 
     238              :   unsigned int n_layers = (unsigned int)model_graph.size();
     239              : 
     240         1242 :   ml_logd("initializing neural network, layer size: %d", n_layers);
     241              :   PROFILE_MEM_ANNOTATE("Initialize");
     242              : 
     243              :   auto &input_conn_prop =
     244              :     std::get<std::vector<props::InputConnection>>(model_props);
     245              :   auto &label_layer_prop =
     246              :     std::get<std::vector<props::LabelLayer>>(model_props);
     247              : 
     248              :   std::vector<Connection> input_conn(input_conn_prop.begin(),
     249          621 :                                      input_conn_prop.end());
     250              :   std::vector<std::string> label_layers;
     251              : 
     252          621 :   if (!label_layer_prop.empty()) {
     253          136 :     label_layers = std::vector<std::string>(label_layer_prop.begin(),
     254           68 :                                             label_layer_prop.end());
     255              :   }
     256              : 
     257          621 :   status = model_graph.initialize(
     258              :     exec_mode, input_conn,
     259         1242 :     std::vector<Connection>(label_layers.begin(), label_layers.end()));
     260          618 :   NN_RETURN_STATUS();
     261              : 
     262          618 :   model_graph.setBatchSize(
     263              :     std::get<props::TrainingBatchSize>(model_flex_props));
     264              : 
      265              :   // If the execution mode is `train`, the optimizer and its relevant
      266              :   // variables are initialized. Returns an error if the optimizer is not set
      267              :   // for training; otherwise, the optimizer and its variables are initialized.
     268          618 :   if (exec_mode == ExecutionMode::TRAIN) {
     269              : 
     270          617 :     if (!opt) {
     271            1 :       ml_loge("Optimizer should be set before initialization for training.");
     272            1 :       return ML_ERROR_INVALID_PARAMETER;
     273              :     }
      274              :     /** TODO: update the optimizer's request to be of the same format as
      275              :      * Layer::requestTensor */
     276          616 :     opt->finalize();
     277              :     std::function<std::vector<TensorDim>(const TensorDim &)> cb =
     278              :       [this](const TensorDim &dim) {
     279         3722 :         return opt->getOptimizerVariableDim(dim);
     280              :       };
     281         1232 :     model_graph.requestOptimizerVariable(cb, true);
     282              :   }
     283              : 
     284              :   // Allocate weights
     285          617 :   model_graph.allocateWeights(exec_mode != ExecutionMode::INFERENCE);
     286              :   // enable this to save initialized weights for INFERENCE
     287              :   // model_graph.allocateWeights(true);
     288              : 
     289          617 :   initialized = true;
     290              : 
     291          617 :   if (!load_path.empty()) {
     292            0 :     load(load_path, ml::train::ModelFormat::MODEL_FORMAT_BIN);
     293              :   }
     294              : 
     295              :   return status;
     296          624 : }
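
The callback registered above asks the optimizer, per weight dimension, for the dims of its internal tensors, which the graph then allocates alongside the weight. A hedged sketch of what an Adam-style optimizer might report (not the actual nntrainer implementation; assumes the nntrainer namespace):

// Hypothetical Adam-style optimizer variable request: one dim per internal
// moment (m and v), each shaped like the weight itself.
std::vector<TensorDim> getOptimizerVariableDim(const TensorDim &dim) {
  return {dim, dim}; // first and second moment estimates
}
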
     297              : 
     298            1 : int NeuralNetwork::reinitialize() {
     299              :   int status = ML_ERROR_NONE;
     300              : 
     301            1 :   if (!initialized) {
     302            0 :     ml_loge("Error: Need to initialize first");
     303            0 :     return ML_ERROR_NOT_SUPPORTED;
     304              :   }
     305              : 
     306              :   unsigned int n_layers = (unsigned int)model_graph.size();
     307              : 
     308            2 :   ml_logd("reinitializing neural network, layer size: %d", n_layers);
     309              :   PROFILE_MEM_ANNOTATE("Reinitialize");
     310              : 
     311              :   auto &input_conn_prop =
     312              :     std::get<std::vector<props::InputConnection>>(model_props);
     313              :   auto &label_layer_prop =
     314              :     std::get<std::vector<props::LabelLayer>>(model_props);
     315              : 
     316              :   std::vector<Connection> input_conn(input_conn_prop.begin(),
     317            1 :                                      input_conn_prop.end());
     318              :   std::vector<std::string> label_layers;
     319              : 
     320            1 :   if (!label_layer_prop.empty()) {
     321            0 :     label_layers = std::vector<std::string>(label_layer_prop.begin(),
     322            0 :                                             label_layer_prop.end());
     323              :   }
     324              : 
     325            1 :   status = model_graph.reinitialize(
     326              :     input_conn,
     327            2 :     std::vector<Connection>(label_layers.begin(), label_layers.end()));
     328              :   NN_RETURN_STATUS();
     329              : 
     330              :   return status;
     331            1 : }
     332              : 
     333              : /**
      334              :  * @brief     destructor; deallocates resources and closes any open model file fd
     335              :  */
     336         2086 : NeuralNetwork::~NeuralNetwork() {
     337              :   try {
     338         1534 :     deallocate();
     339            0 :   } catch (const std::runtime_error &e) {
     340            0 :     std::cerr << "Error occurred during destroying NeuralNetwork: " << e.what()
     341              :               << std::endl;
     342            0 :   }
     343              : 
      344              :   /** close the fd if neuralnet opened one */
     345         1534 :   if (model_file_fd != -1)
     346            0 :     close(model_file_fd);
     347         5154 : }
     348              : 
     349              : /**
      350              :  * @brief     forward propagation through the layer nodes of the model graph
     351              :  */
     352         6821 : sharedConstTensors NeuralNetwork::forwarding(
     353              :   bool training, std::function<bool(void *userdata)> stop_cb, void *userdata) {
     354              : 
     355         6821 :   unsigned int lookahead = std::get<props::FsuLookahead>(model_flex_props);
     356         6821 :   bool fsu_mode = std::get<props::Fsu>(model_flex_props);
     357         6821 :   if (fsu_mode) {
     358            0 :     for (unsigned int i = 0; i < lookahead; ++i) {
     359            0 :       model_graph.LoadTensors(i);
     360              :     }
     361              :   }
     362              :   std::function<void(std::shared_ptr<LayerNode>, bool)> forwarding_op =
     363        61408 :     [this, stop_cb, lookahead, fsu_mode](std::shared_ptr<LayerNode> node,
     364              :                                          bool training) -> void {
     365              :     (void)this;
     366              :     PROFILE_MEM_ANNOTATE("Forwarding for layer: " + node->getName());
     367              : 
     368              :     auto f = std::get<0>(node->getExecutionOrder());
     369        27303 :     if (exec_mode == ExecutionMode::TRAIN or
     370            5 :         (exec_mode == ExecutionMode::INFERENCE and !fsu_mode)) {
     371        27303 :       model_graph.flushCacheExcept(f);
     372        27303 :       node->forwarding(training);
     373              :     } else {
     374              :       /**
      375              :          Currently, it supports FSU async mode for inference. The
      376              :          procedure of FSU is as follows:
      377              : 
      378              :          Prerequisite: this function is called node by node from the
      379              :          forwarding function of the network graph.
      380              : 
      381              :          Step 1. If the execution order is the first (f == 0), try to
      382              :          load the tensors used by layer 0.
      383              : 
      384              :          Step 2. Check whether the tensors from Step 1 are loaded, then
      385              :          do the forwarding of the first node.
      386              : 
      387              :          Step 3. Check the lookahead, which says how many layers'
      388              :          weights must be loaded ahead of execution to hide the overhead
      389              :          due to FSU.
      390              : 
      391              :          Step 4. Request the tensors for the upcoming layers; the
      392              :          loading is done by a thread pool.
      393              : 
      394              :          Step 5. Release the weights whose execution order is less than
      395              :          f.
      396              : 
      397              :          Step n. Repeat for the next layer: check that its tensors are
      398              :          loaded and, once they are, run its forwarding. Each time a
      399              :          forwarding finishes, request tensor loads for the next n layers.
     400              :       **/
     401            0 :       model_graph.checkLoadComplete(f);
     402            0 :       node->forwarding(training);
     403            0 :       model_graph.inActive(f);
     404            0 :       model_graph.LoadTensors(f + lookahead);
     405              :     }
     406         6821 :   };
     407              : 
     408        27284 :   return model_graph.forwarding(training, forwarding_op, stop_cb, userdata);
     409              : }
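
Read sequentially, the FSU inference procedure described in the comment above reduces to the per-node loop below (a simplified sketch; the real code runs it as a callback inside model_graph.forwarding, with loads overlapped by a thread pool, and model_graph, lookahead are the in-scope names from above):

// Simplified sequential rendering of the FSU inference steps; the graph
// hooks (checkLoadComplete, inActive, LoadTensors) are the ones used above.
for (auto it = model_graph.cbegin(); it != model_graph.cend(); ++it) {
  auto node = *it;
  auto f = std::get<0>(node->getExecutionOrder());
  model_graph.checkLoadComplete(f);       // Step 2: block until weights of f arrive
  node->forwarding(false);                // run the layer (inference)
  model_graph.inActive(f);                // Step 5: release weights already consumed
  model_graph.LoadTensors(f + lookahead); // Steps 3-4: prefetch a future layer
}
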
     410              : 
     411              : /**
      412              :  * @brief     forward propagation through the layer nodes of the model graph
     413              :  */
     414         1148 : sharedConstTensors NeuralNetwork::forwarding(sharedConstTensors input,
     415              :                                              sharedConstTensors label,
     416              :                                              bool training) {
     417         1148 :   auto current_batch = model_graph.getBatchSize();
     418         1148 :   if (current_batch != input[0]->batch()) {
     419            0 :     model_graph.setBatchSize(input[0]->batch());
     420            0 :     current_batch = model_graph.getBatchSize();
     421              :   }
     422              : 
     423         1148 :   NNTR_THROW_IF(input[0]->batch() != current_batch ||
     424              :                   (!label.empty() && label[0]->batch() != current_batch),
     425              :                 std::logic_error)
     426              :     << "Error: mismatch in batchsize for data and model."
     427            0 :     << " input_batch: " << input[0]->batch()
     428            0 :     << " label_batch: " << label[0]->batch()
     429              :     << " target_batch: " << current_batch;
     430              : 
     431         1148 :   model_graph.setInputsLabels(input, label);
     432              : 
     433         2296 :   return forwarding(training);
     434              : }
     435              : 
     436            0 : sharedConstTensors NeuralNetwork::incremental_forwarding(
     437              :   unsigned int from, unsigned int to, bool training,
     438              :   std::function<bool(void *userdata)> stop_cb, void *userdata) {
     439              : 
     440            0 :   unsigned int lookahead = std::get<props::FsuLookahead>(model_flex_props);
     441            0 :   bool fsu_mode = std::get<props::Fsu>(model_flex_props);
     442              : 
     443            0 :   if (fsu_mode) {
     444            0 :     for (unsigned int i = 0; i < lookahead; ++i) {
     445            0 :       model_graph.LoadTensors(i);
     446              :     }
     447              :   }
     448              : 
     449              :   std::function<void(std::shared_ptr<LayerNode>, bool)> forwarding_op =
     450            0 :     [this, from, to, stop_cb, fsu_mode,
     451            0 :      lookahead](std::shared_ptr<LayerNode> node, bool training) -> void {
     452              :     PROFILE_MEM_ANNOTATE("Forwarding for layer: " + node->getName());
     453              : 
     454              :     auto f = std::get<0>(node->getExecutionOrder());
     455            0 :     if (exec_mode == ExecutionMode::TRAIN or
     456            0 :         (exec_mode == ExecutionMode::INFERENCE and !fsu_mode)) {
     457              :       // auto start_layer =
     458              :       //      std::chrono::high_resolution_clock::now(); // log the
     459              :       //      start_prefill time
     460            0 :       model_graph.flushCacheExcept(f);
     461            0 :       node->incremental_forwarding(from, to, training);
      462              :       // auto end_layer =
      463              :       //   std::chrono::high_resolution_clock::now(); // log the end time
      464              :       // auto duration_ = std::chrono::duration_cast<
      465              :       //   std::chrono::nanoseconds>(end_layer - start_layer);
      466              :       // std::cout << node->getName() << " : " << duration_.count()
      467              :       //           << " ns" << std::endl;
     468              :     } else {
     469            0 :       model_graph.checkLoadComplete(f);
     470            0 :       node->incremental_forwarding(from, to, training);
     471            0 :       model_graph.inActive(f);
     472            0 :       model_graph.LoadTensors(f + lookahead);
     473              :     }
     474            0 :   };
     475              : 
     476              :   return model_graph.incremental_forwarding(from, to, training, forwarding_op,
     477            0 :                                             stop_cb, userdata);
     478              : }
     479              : 
     480              : sharedConstTensors
     481            0 : NeuralNetwork::incremental_forwarding(unsigned int from, unsigned int to,
     482              :                                       sharedConstTensors input,
     483              :                                       sharedConstTensors label, bool training) {
     484            0 :   auto current_batch = model_graph.getBatchSize();
     485            0 :   NNTR_THROW_IF(input[0]->batch() != current_batch ||
     486              :                   (!label.empty() && label[0]->batch() != current_batch),
     487              :                 std::logic_error)
     488              :     << "Error: mismatch in batchsize for data and model."
     489            0 :     << " input_batch: " << input[0]->batch()
     490            0 :     << " label_batch: " << label[0]->batch()
     491              :     << " target_batch: " << current_batch;
     492              : 
     493            0 :   model_graph.setInputsLabels(input, label);
     494              : 
     495            0 :   return incremental_forwarding(from, to, training);
     496              : }
     497              : 
     498              : /**
      499              :  * @brief     back propagation
      500              :  *            Calls the backwarding function of each layer in reverse order.
      501              :  *            No need to call it for the first input layer (no data to update).
     502              :  */
     503         6132 : void NeuralNetwork::backwarding(int iteration,
     504              :                                 std::function<bool(void *userdata)> stop_cb,
     505              :                                 void *userdata) {
     506              : 
     507              : #ifdef DEBUG
     508              :   NNTR_THROW_IF(!opt, std::invalid_argument) << "optimizer is null!";
     509              : #endif
     510              : 
     511              :   std::function<void(std::shared_ptr<LayerNode>, bool)> forwarding_op =
     512        18396 :     [this, stop_cb](std::shared_ptr<LayerNode> node, bool training) -> void {
     513              :     (void)this;
     514              :     PROFILE_MEM_ANNOTATE("Forwarding for layer: " + node->getName());
     515              : 
     516              :     auto f = std::get<0>(node->getExecutionOrder());
     517            0 :     model_graph.flushCacheExcept(f);
     518              : 
     519            0 :     node->forwarding(training);
     520         6132 :   };
     521              : 
     522              :   std::function<bool(std::shared_ptr<LayerNode>, int)> backwarding_op =
     523        24528 :     [this, stop_cb, userdata](std::shared_ptr<LayerNode> node,
     524              :                               int iteration) -> bool {
     525              :     /**
     526              :      * Do not change this order:
     527              :      * 1. calcGradient
     528              :      * 2. calcDerivative
     529              :      * 3. applyGradient
     530              :      * 4. gradientClippingOnLastAccess
     531              :      */
     532              : 
     533        22292 :     model_graph.flushCacheExcept(std::get<1>(node->getExecutionOrder()));
     534              :     PROFILE_MEM_ANNOTATE("CalcGradient: " + node->getName());
     535              : 
     536              :     bool apply_gradient = true;
     537        22292 :     if (node->getTrainable()) {
     538              :       /** If gradient optimization mode, then calculate gradient first */
     539         7467 :       if (dynamic_training_opt.isGradientMode())
     540            0 :         node->calcGradient();
     541              : 
     542              :       /**
      543              :        * If optimization is off, or the gradient must be applied, then this
      544              :        * will be true.
      545              :        * @todo This apply_gradient flag should be passed to each weight and
      546              :        * later queried when updating the gradients at once (after moving
      547              :        * apply_gradient out of this function).
     548              :        *
     549              :        */
     550              :       // auto &layer = node->getObject();
     551              :       // apply_gradient = dynamic_training_opt.checkIfApply(
     552              :       //   layer->getWeightsRef(), layer->net_input[0], layer->net_hidden[0],
     553              :       //   opt, iteration);
     554              : 
      555              :       /** If the gradient must be applied and it is not gradient mode,
      556              :        * calculate the gradient.
     557              :        */
     558         7467 :       if (!dynamic_training_opt.isGradientMode() && apply_gradient) {
     559         7467 :         node->calcGradient();
     560              : 
     561         7467 :         RunLayerContext &rc = node->getRunContext();
     562         7467 :         if (model_graph.isMixedPrecision()) {
     563            0 :           for (auto w : rc.getWeights()) {
     564            0 :             if (w->hasGradient())
     565            0 :               if (!w->getGradientRef().isValid())
     566              :                 return false;
     567            0 :           }
     568              :         }
     569              :       }
     570              :     }
     571              : 
     572        22292 :     model_graph.flushCacheExcept(std::get<2>(node->getExecutionOrder()));
     573              :     PROFILE_MEM_ANNOTATE("CalcDerivative: " + node->getName());
     574              : 
     575        44584 :     if (stop_cb(userdata)) {
     576              :       return true;
     577              :     }
     578              : 
     579        22292 :     if (node->needsCalcDerivative()) {
     580         9774 :       node->calcDerivative();
     581              :     }
     582              : 
     583        22292 :     model_graph.flushCacheExcept(std::get<3>(node->getExecutionOrder()));
     584              :     PROFILE_MEM_ANNOTATE("ApplyGradient: " + node->getName());
     585              : 
     586              :     if (apply_gradient) {
     587              :       /// Apply gradient only at the end of the last shared weight access
     588        22292 :       model_graph.applyGradients(
     589        37875 :         node.get(), [iteration, opt_ = opt.get()](Weight &w) {
     590              :           w.calcRegularizationGradient();
     591        31166 :           if (opt_->getType() != AdamW::type) {
     592              :             w.calcWeightDecayGradient();
     593              :           }
     594              :           RunOptimizerContext opt_context(&w, iteration,
     595        15583 :                                           opt_->getLearningRate(iteration));
     596        15583 :           opt_->applyGradient(opt_context);
     597        15583 :         });
     598              :     }
     599        22292 :     return true;
     600         6132 :   };
     601              : 
     602              :   std::function<void(Weight &, int)> lazy_apply_grad_op =
     603           44 :     [opt_ = opt.get()](Weight &w, int iteration) -> void {
     604              :     w.calcRegularizationGradient();
     605              :     w.calcWeightDecayGradient();
     606              :     RunOptimizerContext opt_context(&w, iteration,
     607           44 :                                     opt_->getLearningRate(iteration));
     608           44 :     opt_->applyGradient(opt_context);
     609           44 :   };
     610              : 
      611              :   // model_graph.backwarding returns false when a gradient is invalid; retry
     612              :   bool ret = false;
     613              : 
     614        12264 :   while (!ret) {
     615        12264 :     ret = model_graph.backwarding(iteration, forwarding_op, backwarding_op,
     616              :                                   lazy_apply_grad_op, stop_cb, userdata);
     617              :   }
     618         6132 : }
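
The fixed per-node ordering that backwarding_op enforces can be summarized in isolation as follows (an illustrative sketch using the in-scope node, opt, and iteration; in the code above the optimizer step actually runs through model_graph.applyGradients, weight decay is skipped for AdamW, and getWeights() is assumed to yield raw Weight pointers):

// Per-node backward order, as a sketch: 1. calcGradient, 2. calcDerivative,
// 3. applyGradient via the optimizer.
node->calcGradient();                        // gradients w.r.t. weights
if (node->needsCalcDerivative())
  node->calcDerivative();                    // gradients w.r.t. inputs
for (auto w : node->getRunContext().getWeights()) {
  w->calcRegularizationGradient();
  w->calcWeightDecayGradient();              // skipped when the optimizer is AdamW
  RunOptimizerContext ctx(w, iteration, opt->getLearningRate(iteration));
  opt->applyGradient(ctx);                   // optimizer update
}
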
     619              : 
     620         1263 : void NeuralNetwork::save(const std::string &file_path,
     621              :                          ml::train::ModelFormat format) {
     622         1265 :   NNTR_THROW_IF(!initialized, std::runtime_error)
     623              :     << "Cannot save model if not initialized yet, path: " << file_path
     624            2 :     << " format: " << static_cast<unsigned>(format);
     625              : 
     626              :   /// @todo this switch case should be delegating the function call only. It's
     627              :   /// not delegating for now as required logics are manageable for now.
     628         1261 :   switch (format) {
     629         1021 :   case ml::train::ModelFormat::MODEL_FORMAT_BIN: {
     630              :     auto model_file = checkedOpenStream<std::ofstream>(
     631         1021 :       file_path, std::ios::out | std::ios::binary | std::ios::trunc);
     632              : 
     633         4088 :     for (auto iter = model_graph.cbegin(); iter != model_graph.cend(); iter++) {
     634         6134 :       (*iter)->save(model_file, false, exec_mode);
     635              :     }
     636              : 
     637         3063 :     if (opt && istrequal(opt->getType(), "adam")) {
     638           21 :       std::string adam = "adam";
     639           21 :       model_file.write(adam.c_str(), 4);
     640           88 :       for (auto iter = model_graph.cbegin(); iter != model_graph.cend();
     641              :            iter++) {
     642          134 :         (*iter)->save(model_file, true);
     643              :       }
     644              :     }
     645              : 
     646         1021 :     if (exec_mode == ml::train::ExecutionMode::TRAIN) {
     647         1021 :       model_file.write((char *)&epoch_idx, sizeof(epoch_idx));
     648         1021 :       model_file.write((char *)&iter, sizeof(iter));
     649              :     }
     650              : 
     651         1021 :     model_file.close();
     652              :     break;
     653         1021 :   }
     654          240 :   case ml::train::ModelFormat::MODEL_FORMAT_INI:
     655          240 :     saveModelIni(file_path);
     656          240 :     break;
     657            0 :   case ml::train::ModelFormat::MODEL_FORMAT_INI_WITH_BIN: {
     658              :     auto old_save_path = std::get<props::SavePath>(model_flex_props);
     659              :     auto bin_file_name =
     660            0 :       file_path.substr(0, file_path.find_last_of('.')) + ".bin";
     661              : 
     662            0 :     std::get<props::SavePath>(model_flex_props).set(bin_file_name);
     663            0 :     save(file_path, ml::train::ModelFormat::MODEL_FORMAT_INI);
     664            0 :     save(bin_file_name, ml::train::ModelFormat::MODEL_FORMAT_BIN);
     665              :     std::get<props::SavePath>(model_flex_props) = old_save_path;
     666              :     break;
     667              :   }
     668            0 :   case ml::train::ModelFormat::MODEL_FORMAT_ONNX: {
     669              :     throw nntrainer::exception::not_supported(
     670            0 :       "saving with ONNX format is not supported yet.");
     671              :     break;
     672              :   }
     673            0 :   default:
     674              :     throw nntrainer::exception::not_supported(
     675            0 :       "saving with given format is not supported yet");
     676              :   }
     677         1261 : }
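
For reference, the MODEL_FORMAT_BIN stream written above is laid out as: each layer's weights in graph order, an optional 4-byte "adam" tag followed by per-layer optimizer variables, then epoch_idx and iter when saved in TRAIN mode. A hypothetical reader sketch (not an nntrainer API; the widths of epoch_idx and iter are assumptions):

#include <cstdint>
#include <cstring>
#include <fstream>

void read_model_bin(const char *path) {
  std::ifstream model_file(path, std::ios::binary);
  // ... each layer reads its weights here, in graph order ...
  char tag[4];
  if (model_file.read(tag, 4) && std::strncmp(tag, "adam", 4) == 0) {
    // ... each layer reads its optimizer variables here ...
  }
  unsigned int epoch_idx = 0; // type assumed
  uint64_t iter = 0;          // width assumed
  model_file.read(reinterpret_cast<char *>(&epoch_idx), sizeof(epoch_idx));
  model_file.read(reinterpret_cast<char *>(&iter), sizeof(iter));
}
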
     678              : 
     679          345 : void NeuralNetwork::load(const std::string &file_path,
     680              :                          ml::train::ModelFormat format) {
     681              :   /// @todo this switch case should be delegating the function call only. It's
     682              :   /// not delegating for now as required logics are manageable for now.
     683              : 
     684          345 :   bool fsu_mode = std::get<props::Fsu>(model_flex_props);
     685              : 
     686          345 :   const std::regex reg_("\\s*\\;\\s*");
     687          345 :   auto v = split(file_path, reg_);
     688              : 
     689              :   size_t start_from = 0;
     690              :   std::vector<std::pair<size_t, size_t>> file_offset;
     691          345 :   for (auto iter = model_graph.cbegin(); iter != model_graph.cend(); iter++) {
     692            4 :     auto weights = (*iter)->getRunContext().getWeights();
     693            0 :     for (auto weight : weights) {
     694            0 :       size_t size = weight->getVariable().getMemoryBytes();
     695            0 :       auto tensor_data_type = weight->getDim().getDataType();
     696            0 :       weight->getVariableRef().setFileOffset(start_from);
      697              :       ///@todo instead of checking the data type, we may need to create a
      698              :       /// common parent class for quantized tensors, requiring the qparam to
      699              :       /// be saved, and a common interface to check whether a qparam is
      700              :       /// needed; this kind of explicit type checking should be avoided
      701              :       /// here.
     702            0 :       if (tensor_data_type != TensorDim::DataType::FP32 &&
     703            0 :           tensor_data_type != TensorDim::DataType::FP16 &&
     704            0 :           tensor_data_type != TensorDim::DataType::Q6_K &&
     705              :           tensor_data_type != TensorDim::DataType::Q4_0) {
     706              :         // for tensor with qparam
     707            0 :         size += sizeof(uint16_t);
     708              :       }
     709            0 :       file_offset.emplace_back(std::make_pair(start_from, size));
     710            0 :       start_from += size;
     711              :     }
     712            0 :   }
     713              : 
     714          343 :   if (exec_mode == ExecutionMode::INFERENCE && fsu_mode) {
     715            0 :     model_graph.setFsuWeightPath((v.size() == 2) ? v[1] : v[0]);
     716            0 :     model_graph.setWeightOffset(file_offset);
     717              :   }
     718              : 
     719          343 :   switch (format) {
     720            0 :   case ml::train::ModelFormat::MODEL_FORMAT_BIN: {
     721            0 :     NNTR_THROW_IF(!initialized, std::runtime_error)
     722              :       << "Cannot load if not initialized yet, path: " << file_path
     723              :       << " format: " << static_cast<unsigned>(format);
     724            0 :     auto f_path = (v.size() == 2) ? v[1] : v[0];
     725              : 
     726              :     auto model_file =
     727            0 :       checkedOpenStream<std::ifstream>(f_path, std::ios::in | std::ios::binary);
     728              : 
     729              : #if defined(_WIN32)
     730              :     HANDLE hFile, hMap;
     731              : #endif
     732              : 
     733            0 :     if (exec_mode == ml::train::ExecutionMode::INFERENCE) {
     734              :       if (!MMAP_READ) {
     735              :         ///@note for slim-tensor. This should be removed.
     736              :         model_file_fd = open(f_path.c_str(), O_RDONLY);
     737              :         NNTR_THROW_IF((model_file_fd == -1), std::invalid_argument)
     738              :           << "Cannot open file : " << f_path;
     739              :       }
     740              :       // std::vector<std::future<void>> futures;
     741              :       std::vector<std::thread> threads;
     742            0 :       threads.reserve(model_graph.size());
     743            0 :       for (auto iter = model_graph.cbegin(); iter != model_graph.cend();
     744              :            ++iter) {
     745              :         auto node = *iter;
     746            0 :         auto exec_order = std::get<0>((*iter)->getExecutionOrder());
     747              : 
     748            0 :         threads.emplace_back([&, node]() {
     749              :           if (!MMAP_READ) {
     750              :             auto local_model_file = checkedOpenStream<std::ifstream>(
     751              :               (v.size() == 2) ? v[1] : v[0], std::ios::in | std::ios::binary);
     752              :             node->read(local_model_file, false, exec_mode, fsu_mode,
     753              :                        std::numeric_limits<size_t>::max(), true, model_file_fd);
     754              :           } else {
     755              : #if defined(_WIN32)
      756              :             // Map per-task, then unmap immediately after: enables early
      757              :             // release of pages
     758              :             HANDLE hFile =
     759              :               CreateFileA(f_path.c_str(), GENERIC_READ, FILE_SHARE_READ, NULL,
     760              :                           OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
     761              :             NNTR_THROW_IF((hFile == INVALID_HANDLE_VALUE), std::runtime_error)
     762              :               << "CreateFileA failed";
     763              : 
     764              :             HANDLE hMap =
     765              :               CreateFileMapping(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
     766              :             NNTR_THROW_IF((hMap == NULL), std::runtime_error)
     767              :               << "CreateFileMapping failed";
     768              : 
     769              :             char *view =
     770              :               static_cast<char *>(MapViewOfFile(hMap, FILE_MAP_READ, 0, 0, 0));
     771              :             NNTR_THROW_IF((view == nullptr), std::runtime_error)
     772              :               << "MapViewOfFile failed";
     773              : 
     774              :             node->read(view, false, exec_mode, fsu_mode,
     775              :                        std::numeric_limits<size_t>::max(), true);
     776              : 
     777              :             // Early unmap: let the OS reclaim the working set ASAP
     778              :             UnmapViewOfFile(view);
     779              :             CloseHandle(hMap);
     780              :             CloseHandle(hFile);
     781              : #else
     782              :             // POSIX: map per-task, advise kernel, drop pages, unmap
     783            0 :             int fd = ::open(f_path.c_str(), O_RDONLY);
     784            0 :             NNTR_THROW_IF((fd == -1), std::invalid_argument)
     785            0 :               << "Cannot open file : " << f_path;
     786              : 
     787            0 :             struct stat st {};
     788            0 :             NNTR_THROW_IF((::fstat(fd, &st) == -1), std::invalid_argument)
     789            0 :               << "Cannot get file info (fstat): " << f_path;
     790              : 
     791            0 :             size_t f_size = static_cast<size_t>(st.st_size);
     792              :             void *mmap_ptr =
     793            0 :               ::mmap(nullptr, f_size, PROT_READ, MAP_PRIVATE, fd, 0);
     794            0 :             ::close(fd); // fd not needed after mmap
     795            0 :             NNTR_THROW_IF((mmap_ptr == MAP_FAILED), std::runtime_error)
     796              :               << "mmap failed";
     797              : 
     798              :             // Hint: many model loads touch scattered regions -> RANDOM helps
     799              :             // reduce readahead
     800            0 :             (void)::posix_madvise(mmap_ptr, f_size, POSIX_MADV_RANDOM);
     801              : 
     802              :             char *view = static_cast<char *>(mmap_ptr);
     803            0 :             node->read(view, false, exec_mode, fsu_mode,
     804              :                        std::numeric_limits<size_t>::max(), true);
     805              : 
     806              :             // Early drop: pages no longer needed; helps lower peak RSS during
     807              :             // overlap
     808            0 :             (void)::posix_madvise(mmap_ptr, f_size, POSIX_MADV_DONTNEED);
     809              : 
     810            0 :             ::munmap(mmap_ptr, f_size);
     811              : #endif
     812              :           }
     813            0 :         });
     814              :       }
     815            0 :       for (auto &t : threads) {
     816            0 :         if (t.joinable())
     817            0 :           t.join();
     818              :       }
     819            0 :     } else {
     820            0 :       for (auto iter = model_graph.cbegin(); iter != model_graph.cend();
     821              :            ++iter) {
     822            0 :         (*iter)->read(model_file, false, exec_mode, fsu_mode);
     823              :       }
     824              : 
     825              :       try {
     826              :         /// this is assuming that the failure is allowed at the end of the file
      827              :         /// this assumes that a failure is allowed at the end of the file
      828              :         /// read; after this point, no additional read should be attempted
     829              :           std::string opt_type;
     830              :           opt_type.resize(4);
     831            0 :           model_file.read((char *)&opt_type[0], 4);
     832              : 
     833            0 :           if (istrequal(opt_type, "adam")) {
     834            0 :             for (auto iter = model_graph.cbegin(); iter != model_graph.cend();
     835              :                  iter++) {
     836            0 :               (*iter)->read(model_file, true, exec_mode);
     837              :             }
     838              :           }
     839              :         }
     840              : 
     841            0 :         if (!fsu_mode && exec_mode == ml::train::ExecutionMode::TRAIN) {
     842              : 
     843            0 :           checkedRead(model_file, (char *)&epoch_idx, sizeof(epoch_idx),
     844              :                       "[NeuralNetwork::readModel] failed to read epoch_idx");
     845            0 :           checkedRead(model_file, (char *)&iter, sizeof(iter),
     846              :                       "[NeuralNetwork::readModel] failed to read iteration");
     847              :         }
     848            0 :       } catch (...) {
     849              :         std::cerr << "failed to read additional data like optimizer variable, "
     850            0 :                      "iteration, proceeding with default\n";
     851            0 :       }
     852              :     }
     853              : 
     854            0 :     ml_logi("read modelfile: %s",
     855              :             (v.size() == 2) ? v[1].c_str() : v[0].c_str());
     856              :     break;
     857            0 :   }
     858              : 
     859            1 :   case ml::train::ModelFormat::MODEL_FORMAT_INI_WITH_BIN: {
     860            1 :     int ret = loadFromConfig((v.size() == 2) ? v[1] : v[0]);
     861            1 :     throw_status(ret);
     862              :     auto &save_path = std::get<props::SavePath>(model_flex_props);
     863            1 :     if (!save_path.empty()) {
     864            0 :       checkedOpenStream<std::ifstream>(save_path,
     865              :                                        std::ios::in | std::ios::binary);
     866            0 :       load_path = save_path;
     867              :     }
     868              :     break;
     869              :   }
     870          342 :   case ml::train::ModelFormat::MODEL_FORMAT_INI: {
     871          342 :     int ret = loadFromConfig((v.size() == 2) ? v[1] : v[0]);
     872          342 :     throw_status(ret);
     873              :     break;
     874              :   }
     875              :   case ml::train::ModelFormat::MODEL_FORMAT_FLATBUFFER: {
     876              :     break;
     877              :   }
     878              : 
     879            0 :   case ml::train::ModelFormat::MODEL_FORMAT_ONNX: {
     880            0 :     int ret = loadFromConfig((v.size() == 2) ? v[1] : v[0]);
     881            0 :     throw_status(ret);
     882              :     break;
     883              :   }
     884              : 
     885            0 :   case ml::train::ModelFormat::MODEL_FORMAT_QNN: {
      886              :       // for now, we only support the QNN binary format for inference mode.
      887              :       // the file paths for the QNN bin and the NNTrainer bin are expected to
      888              :       // be separated by ";" - QNN bin (graph) ; NNTrainer bin (weights)
     889            0 :     NNTR_THROW_IF(exec_mode != ExecutionMode::INFERENCE, std::invalid_argument)
     890              :       << "Only support QNN biarny for Infernece";
     891            0 :     NNTR_THROW_IF(!isFileExist(props::FilePath(v[0])), std::invalid_argument)
     892              :       << "Cannot open QNN context bin file";
     893              : 
     894            2 :     std::thread qnn_load([this, &v]() {
     895              :       int ret =
     896            0 :         ct_engine->getRegisteredContext("qnn")->load(props::FilePath(v[0]));
     897            0 :       throw_status(ret);
     898            0 :     });
     899              : 
     900            0 :     if (!fsu_mode && v.size() > 1) {
     901            0 :       NNTR_THROW_IF(!isFileExist(props::FilePath(v[1])), std::invalid_argument)
     902              :         << "Cannot open weight bin file";
     903            0 :       load(props::FilePath(v[1]), ml::train::ModelFormat::MODEL_FORMAT_BIN);
     904            0 :     } else if (fsu_mode) {
     905            0 :       NNTR_THROW_IF(v.size() <= 1, std::invalid_argument)
      906              :         << "FSU (swap) mode requires a weight bin file to load";
     907            0 :       NNTR_THROW_IF(!isFileExist(props::FilePath(v[1])), std::invalid_argument)
     908              :         << "Cannot open weight bin file";
     909              :       // model_graph.setFsuWeightPath(v[1]);
     910              :     }
     911              : 
     912            0 :     qnn_load.join();
     913              :     break;
     914              :   }
     915            0 :   default:
     916              :     throw nntrainer::exception::not_supported(
     917            0 :       "loading with given format is not supported yet");
     918              :   }
     919          349 : }
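
The POSIX branch above follows a map-per-task pattern: map read-only, hint random access, consume, drop pages, unmap. A standalone sketch of the same pattern (read_fn is a placeholder for node->read(), not an nntrainer API):

#include <fcntl.h>
#include <stdexcept>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

void with_mapped_file(const char *path, void (*read_fn)(const char *, size_t)) {
  int fd = ::open(path, O_RDONLY);
  if (fd == -1)
    throw std::invalid_argument("cannot open file");
  struct stat st {};
  if (::fstat(fd, &st) == -1) {
    ::close(fd);
    throw std::invalid_argument("fstat failed");
  }
  size_t size = static_cast<size_t>(st.st_size);
  void *ptr = ::mmap(nullptr, size, PROT_READ, MAP_PRIVATE, fd, 0);
  ::close(fd); // the mapping keeps the file referenced; fd is no longer needed
  if (ptr == MAP_FAILED)
    throw std::runtime_error("mmap failed");
  (void)::posix_madvise(ptr, size, POSIX_MADV_RANDOM);   // scattered reads
  read_fn(static_cast<const char *>(ptr), size);         // consume the bytes
  (void)::posix_madvise(ptr, size, POSIX_MADV_DONTNEED); // release pages early
  ::munmap(ptr, size);
}
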
     920              : 
     921        11073 : float NeuralNetwork::getLoss() {
     922        11073 :   loss = 0.0f;
     923              : 
     924        47747 :   for (auto iter = model_graph.cbegin(); iter != model_graph.cend(); iter++) {
     925        73348 :     loss += (*iter)->getLoss();
     926              :   }
     927        11073 :   return loss;
     928              : }
     929              : 
     930            0 : void NeuralNetwork::setLoss(float l) { loss = l; }
     931              : 
     932            0 : NeuralNetwork &NeuralNetwork::copy(NeuralNetwork &from) {
     933            0 :   if (this != &from) {
     934              :     model_props = from.model_props;
     935              :     model_flex_props = from.model_flex_props;
     936            0 :     loss = from.loss;
     937              :     opt = from.opt;
     938              : 
     939              :     model_graph.copy(from.model_graph);
     940              :   }
     941            0 :   return *this;
     942              : }
     943              : 
     944          240 : void NeuralNetwork::saveModelIni(const std::string &file_path) {
     945          480 :   NNTR_THROW_IF(isFileExist(file_path), std::invalid_argument)
      946              :     << "There is already a file; overwriting the existing file is not "
      947              :        "permitted, path: "
     948              :     << file_path;
     949              : 
     950              :   std::vector<IniSection> sections;
     951              : 
     952          240 :   IniSection model_section = IniSection::FromExportable("model", *this);
     953          480 :   model_section.setEntry("type", "NeuralNetwork");
     954          240 :   sections.push_back(model_section);
     955              : 
     956          960 :   auto add_section_if_any = [&sections](const std::string &section_name,
     957              :                                         auto obj_ptr, auto pred) {
     958          240 :     if (pred(obj_ptr)) {
     959          242 :       IniSection s = IniSection::FromExportable(section_name, *obj_ptr);
     960          484 :       s.setEntry("type", obj_ptr->getType());
     961          242 :       sections.push_back(s);
     962              :     }
     963         1200 :   };
     964              : 
     965          720 :   add_section_if_any("optimizer", opt,
     966              :                      [](const auto &obj) { return static_cast<bool>(obj); });
     967              : 
     968              :   auto &[train_buffer, valid_buffer, test_buffer] = data_buffers;
     969              :   auto data_buffer_valid = [](const auto &buffer) {
     970          722 :     return buffer && buffer->isSerializable(
     971          722 :                        ml::train::ExportMethods::METHOD_STRINGVECTOR);
     972              :   };
     973              : 
     974          481 :   add_section_if_any("train_set", train_buffer, data_buffer_valid);
     975          481 :   add_section_if_any("valid_set", valid_buffer, data_buffer_valid);
     976          480 :   add_section_if_any("test_set", test_buffer, data_buffer_valid);
     977              : 
     978          240 :   IniWrapper wrapper("model_saver", sections);
     979          240 :   wrapper.save_ini(file_path);
     980              : 
     981          240 :   IniGraphInterpreter interpreter;
     982          240 :   interpreter.serialize(graph_representation, file_path);
     983          480 : }
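
The resulting file starts with the sections assembled above and is followed by one section per layer appended by IniGraphInterpreter::serialize. An illustrative layout (the section names and the "type" entries come from the code above; every other key/value is a hypothetical example):

# sketch of a file written by saveModelIni (keys besides "type" are examples)
[model]
type = NeuralNetwork
epochs = 1
batch_size = 32

[optimizer]
type = adam

[train_set]
type = file

# ...one section per layer, appended by IniGraphInterpreter
[fc1]
type = fully_connected
unit = 10
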
     984              : 
     985          415 : bool NeuralNetwork::validateInput(sharedConstTensors X) {
     986          415 :   auto input_dim = getInputDimension();
     987          415 :   if (X.size() != input_dim.size()) {
     988            0 :     ml_loge("Error: provided number of inputs %d, required %d", (int)X.size(),
     989              :             (int)input_dim.size());
     990            0 :     return false;
     991              :   }
     992              : 
     993         1506 :   for (unsigned int dim = 0; dim < input_dim.size(); dim++) {
     994          676 :     if (input_dim[dim] != X[dim]->getDim()) {
     995            0 :       ml_loge("Error: provided input shape does not match required shape");
     996            0 :       std::stringstream ss;
     997            0 :       ss << X[dim]->getDim();
     998            0 :       ml_loge("Provided tensor summary : %s", ss.str().c_str());
     999              : 
    1000            0 :       ss.str(std::string());
    1001            0 :       ss << input_dim[dim];
    1002            0 :       ml_loge("Required tensor summary : %s", ss.str().c_str());
    1003              :       return false;
    1004            0 :     }
    1005              :   }
    1006              : 
    1007              :   return true;
    1008          415 : }
    1009              : 
    1010          415 : sharedConstTensors NeuralNetwork::inference(sharedConstTensors X,
    1011              :                                             bool free_mem) {
    1012          415 :   return inference(X, {}, free_mem);
    1013              : }
    1014              : 
    1015          415 : sharedConstTensors NeuralNetwork::inference(sharedConstTensors X,
    1016              :                                             sharedConstTensors label,
    1017              :                                             bool free_mem) {
    1018          415 :   if (model_graph.getBatchSize() != X[0]->batch()) {
    1019            0 :     model_graph.setBatchSize(X[0]->batch());
    1020              :   }
    1021              : 
    1022              :   sharedConstTensors out;
    1023          415 :   if (!validateInput(X))
    1024            0 :     throw std::invalid_argument("Input validation failed.");
    1025              : 
    1026          415 :   allocate(ExecutionMode::INFERENCE);
    1027              : 
     1028              :   int nn_forward;
     1029              :   PROFILE_TIME_REGISTER_EVENT(nn_forward, "nn_forward");
     1030              :   PROFILE_TIME_START(nn_forward);
     1031          415 :   out = forwarding(X, label, false);
     1032              :   PROFILE_TIME_END(nn_forward);
    1033              : 
    1034          415 :   if (free_mem)
    1035              :     /**
    1036              :      * Free the memory needed for training before exiting.
    1037              :      * Note that this does not free the weights for the model.
    1038              :      * Weights of the model will be freed when the model is destroyed.
    1039              :      */
    1040              :     model_graph.deallocateTensors(false);
    1041              : 
    1042              :   /** Clear the set inputs and labels */
    1043          415 :   model_graph.setInputsLabels({}, {});
    1044              : 
    1045          415 :   return out;
    1046            0 : }
    1047              : 
    1048              : std::vector<float *>
    1049            5 : NeuralNetwork::inference(unsigned int batch_size,
    1050              :                          const std::vector<float *> &input,
    1051              :                          const std::vector<float *> &label) {
    1052              :   sharedConstTensors input_tensors, output_tensors;
    1053            5 :   auto in_dim = getInputDimension();
    1054              : 
    1055            5 :   input_tensors.reserve(input.size());
    1056           10 :   for (unsigned int idx = 0; idx < in_dim.size(); idx++) {
    1057            5 :     in_dim[idx].batch(batch_size);
    1058           15 :     input_tensors.emplace_back(MAKE_SHARED_TENSOR(Tensor::Map(
    1059              :       input[idx], in_dim[idx].getDataLen() * sizeof(float), in_dim[idx], 0)));
    1060              :   }
    1061              : 
    1062            5 :   if (!label.empty()) {
    1063              :     sharedConstTensors label_tensors;
    1064            0 :     auto label_dim = getOutputDimension();
    1065            0 :     label_tensors.reserve(label.size());
    1066            0 :     for (unsigned int idx = 0; idx < label_dim.size(); idx++) {
    1067            0 :       label_dim[idx].batch(batch_size);
    1068            0 :       label_tensors.emplace_back(MAKE_SHARED_TENSOR(
    1069              :         Tensor::Map(label[idx], label_dim[idx].getDataLen() * sizeof(float),
    1070              :                     label_dim[idx], 0)));
    1071              :     }
    1072            0 :     output_tensors = inference(input_tensors, label_tensors, false);
    1073            0 :   } else {
    1074            5 :     output_tensors = inference(input_tensors, false);
    1075              :   }
    1076              : 
    1077              :   std::vector<float *> output;
    1078            5 :   output.reserve(output_tensors.size());
    1079              : 
    1080           10 :   for (auto &out : output_tensors) {
    1081            5 :     auto out_t = *out.get();
    1082            5 :     output.push_back(out_t.getData());
    1083            5 :   }
    1084              : 
    1085            5 :   return output;
    1086            5 : }
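
A caller-side sketch of this raw-pointer overload (a minimal sketch, assuming a single-input model; the buffer plumbing is hypothetical). Note that the returned pointers alias the model's internal output tensors (output.push_back(out_t.getData()) above), so they must be consumed before the next forward pass or deallocation:

// Minimal sketch: single-input inference through the float* overload.
#include <neuralnet.h>
#include <vector>

std::vector<float *> run_once(nntrainer::NeuralNetwork &net, float *in_buf) {
  std::vector<float *> inputs = {in_buf}; // one buffer per model input
  std::vector<float *> labels;            // empty: plain inference
  // Returned pointers point into internal output tensors; copy the data
  // out if it must outlive the next forward pass.
  return net.inference(/*batch_size=*/1, inputs, labels);
}
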
    1087              : 
    1088              : sharedConstTensors
    1089            0 : NeuralNetwork::incremental_inference(sharedConstTensors X,
    1090              :                                      unsigned int init_seq_len,
    1091              :                                      unsigned int from, unsigned int to) {
    1092            0 :   return incremental_inference(X, {}, init_seq_len, from, to);
    1093              : }
    1094              : 
    1095            0 : sharedConstTensors NeuralNetwork::incremental_inference(
    1096              :   sharedConstTensors X, sharedConstTensors label, unsigned int init_seq_len,
    1097              :   unsigned int from, unsigned int to) {
    1098            0 :   if (model_graph.getBatchSize() != X[0]->batch()) {
    1099            0 :     model_graph.setBatchSize(X[0]->batch());
    1100              :   }
    1101              : 
    1102              :   sharedConstTensors out;
    1103            0 :   if (!validateInput(X))
    1104            0 :     throw std::invalid_argument("Input validation failed.");
    1105              : 
    1106            0 :   if (!from) {
    1107            0 :     model_graph.allocateTensors(ExecutionMode::INFERENCE);
    1108              :   }
    1109              : 
     1110              :   int nn_forward;
     1111              :   PROFILE_TIME_REGISTER_EVENT(nn_forward, "nn_forward");
     1112              :   PROFILE_TIME_START(nn_forward);
     1113              : 
     1114            0 :   out = incremental_forwarding(from, to, X, label, false);
     1115              : 
     1116              :   PROFILE_TIME_END(nn_forward);
    1117              : 
     1118              :   /** @todo deallocate tensors after incremental inference */
    1119              :   /** Clear the set inputs and labels */
    1120            0 :   model_graph.setInputsLabels({}, {});
    1121              : 
    1122            0 :   return out;
    1123            0 : }
    1124              : 
    1125            0 : std::vector<float *> NeuralNetwork::incremental_inference(
    1126              :   unsigned int batch_size, const std::vector<float *> &input,
    1127              :   const std::vector<float *> &label, unsigned int init_seq_len,
    1128              :   unsigned int from, unsigned int to, bool output_hidden_state) {
    1129              : 
    1130              :   // auto start_in_neuralnet = std::chrono::high_resolution_clock::now();
    1131              : 
    1132              :   sharedConstTensors input_tensors, output_tensors;
    1133            0 :   auto in_dim = getInputDimension();
    1134              : 
    1135            0 :   input_tensors.reserve(input.size());
    1136            0 :   for (unsigned int idx = 0; idx < in_dim.size(); idx++) {
    1137            0 :     in_dim[idx].batch(batch_size);
    1138            0 :     input_tensors.emplace_back(MAKE_SHARED_TENSOR(Tensor::Map(
    1139              :       input[idx], in_dim[idx].getDataLen() * sizeof(float), in_dim[idx], 0)));
    1140              :   }
    1141              : 
    1142              :   // auto start_increment = std::chrono::high_resolution_clock::now();
    1143            0 :   if (!label.empty()) {
    1144              :     sharedConstTensors label_tensors;
    1145            0 :     auto label_dim = getOutputDimension();
    1146            0 :     label_tensors.reserve(label.size());
    1147            0 :     for (unsigned int idx = 0; idx < label_dim.size(); idx++) {
    1148            0 :       label_dim[idx].batch(batch_size);
    1149            0 :       label_tensors.emplace_back(MAKE_SHARED_TENSOR(
    1150              :         Tensor::Map(label[idx], label_dim[idx].getDataLen() * sizeof(float),
    1151              :                     label_dim[idx], 0)));
    1152              :     }
    1153            0 :     output_tensors = incremental_inference(input_tensors, label_tensors,
    1154            0 :                                            init_seq_len, from, to);
    1155            0 :   } else {
    1156              :     output_tensors =
    1157            0 :       incremental_inference(input_tensors, init_seq_len, from, to);
    1158              :   }
    1159              :   // auto end_increment = std::chrono::high_resolution_clock::now();
    1160              :   std::vector<float *> output;
    1161              : 
    1162            0 :   unsigned int step = ((to - from) == 0) ? 0 : (to - from) - 1;
    1163              : 
    1164            0 :   for (auto &out : output_tensors) {
    1165            0 :     auto out_t = *out.get();
    1166              :     float *last_out_buf_data;
    1167              : 
    1168            0 :     if (output_hidden_state) {
    1169            0 :       last_out_buf_data = out_t.getData();
    1170              :     } else {
    1171            0 :       last_out_buf_data = new float[batch_size * out_t.width()];
    1172              : 
    1173            0 :       for (unsigned int batch = 0; batch < batch_size; ++batch) {
    1174            0 :         if (out->getDataType() == ml::train::TensorDim::DataType::FP16) {
    1175              : #ifdef ENABLE_FP16
    1176              :           const _FP16 *out_t_batch_ptr =
    1177              :             out_t.getData<_FP16>() + batch * out_t.getDim().getFeatureLen() +
    1178              :             step * out_t.width();
    1179              :           scopy(out_t.width(), out_t_batch_ptr, 1,
    1180              :                 last_out_buf_data + batch * out_t.width(), 1);
    1181              : 
    1182              : #else
    1183            0 :           throw std::invalid_argument("Error: enable-fp16 is not set");
    1184              : #endif
    1185            0 :         } else if (out->getDataType() == ml::train::TensorDim::DataType::FP32) {
    1186              :           const float *out_t_batch_ptr =
    1187            0 :             out_t.getData() + batch * out_t.getDim().getFeatureLen() +
    1188            0 :             step * out_t.width();
    1189              :           // std::memcpy( last_out_buf_data + batch * out_t.width(),
    1190              :           // out_t_batch_ptr, out_t.width()*sizeof(float));
    1191            0 :           scopy(out_t.width(), out_t_batch_ptr, 1,
    1192            0 :                 last_out_buf_data + batch * out_t.width(), 1);
    1193              :         }
    1194              :       }
    1195              :     }
    1196              : 
    1197            0 :     output.push_back(last_out_buf_data);
    1198            0 :   }
    1199              :   // auto end_net_inference = std::chrono::high_resolution_clock::now();
    1200              :   // auto prepare =
    1201              :   // std::chrono::duration_cast<std::chrono::nanoseconds>(start_increment-start_in_neuralnet);
    1202              :   // auto run_inf =
    1203              :   // std::chrono::duration_cast<std::chrono::nanoseconds>(end_increment-start_increment);;
    1204              :   // auto out_gen =
    1205              :   // std::chrono::duration_cast<std::chrono::nanoseconds>(end_net_inference-end_increment);;
    1206              :   // auto net_gen =
    1207              :   // std::chrono::duration_cast<std::chrono::nanoseconds>(end_net_inference-start_in_neuralnet);
    1208              : 
    1209              :   // std::cout <<"prepare : "<< prepare.count() << " run_inf : "<<
    1210              :   // run_inf.count() << " out_gen : "<< out_gen.count()<<std::endl; std::cout <<
    1211              :   // "-------- net_inference: "<< net_gen.count() << std::endl;
    1212              : 
    1213            0 :   return output;
    1214            0 : }
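
Taken together with the window logic above (only the last step of the [from, to) range is copied out unless output_hidden_state is set), a typical autoregressive caller looks roughly like this (a sketch only; feeding generated tokens back into the input buffers is elided and hypothetical):

// Minimal sketch: prefill then step-by-step decode via this overload.
#include <neuralnet.h>
#include <vector>

void decode(nntrainer::NeuralNetwork &net, std::vector<float *> &input,
            unsigned int init_len, unsigned int max_len) {
  // Prefill the whole prompt in one call: window [0, init_len).
  auto out = net.incremental_inference(1, input, {}, init_len, 0, init_len,
                                       false);
  for (unsigned int pos = init_len; pos < max_len; ++pos) {
    // ...sample from `out` and write the next token into `input` here...
    // With output_hidden_state == false these buffers are fresh float[]
    // allocations (see `new float[...]` above), owned by the caller.
    for (auto *buf : out)
      delete[] buf;
    // Advance the window by one step: [pos, pos + 1).
    out = net.incremental_inference(1, input, {}, init_len, pos, pos + 1,
                                    false);
  }
  for (auto *buf : out)
    delete[] buf;
}
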
    1215              : 
    1216            0 : void NeuralNetwork::resetInputDimension(std::vector<TensorDim> dims) {
    1217            0 :   model_graph.resetInputDimension(dims);
    1218            0 : }
    1219              : 
    1220           30 : int NeuralNetwork::setDataset(const DatasetModeType &mode,
    1221              :                               std::shared_ptr<ml::train::Dataset> dataset) {
    1222           60 :   return setDataBuffer(mode, std::static_pointer_cast<DataBuffer>(dataset));
    1223              : }
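
For context, a sketch of attaching a dataset through the public factory (a minimal sketch, assuming the ccapi createDataset factory; "train.dat" is a hypothetical path):

// Minimal sketch: attach a file-backed training dataset.
#include <dataset.h>
#include <model.h>

void attach_train_set(ml::train::Model &model) {
  auto dataset =
    ml::train::createDataset(ml::train::DatasetType::FILE, "train.dat");
  model.setDataset(ml::train::DatasetModeType::MODE_TRAIN,
                   std::move(dataset));
}
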
    1224              : 
    1225          678 : int NeuralNetwork::allocate(ExecutionMode mode) {
    1226              :   model_graph.deallocateTensors();
    1227          678 :   model_graph.allocateTensors(mode);
    1228              : 
    1229          678 :   return ML_ERROR_NONE;
    1230              : }
    1231              : 
    1232         1534 : int NeuralNetwork::deallocate() {
    1233              :   try {
    1234              :     model_graph.deallocateTensors(true);
    1235              :     return ML_ERROR_NONE;
    1236            0 :   } catch (const std::exception &e) {
    1237              :     std::cerr << "Error occurred during deallocation of NeuralNetwork: "
    1238            0 :               << e.what() << std::endl;
    1239              :     return ML_ERROR_UNKNOWN;
    1240            0 :   }
    1241              : }
    1242              : 
    1243           28 : int NeuralNetwork::train(const std::vector<std::string> &values,
    1244              :                          std::function<bool(void *)> stop_cb,
    1245              :                          void *stop_user_data,
    1246              :                          std::function<void(void *)> epoch_complete_cb,
    1247              :                          void *epoch_user_data) {
    1248              :   int status = ML_ERROR_NONE;
    1249              : 
    1250           28 :   if (data_buffers[static_cast<int>(DatasetModeType::MODE_TRAIN)] == nullptr) {
    1251            0 :     ml_loge("Cannot initialize the model without the train data buffer.");
    1252            0 :     return ML_ERROR_INVALID_PARAMETER;
    1253              :   }
    1254              : 
    1255           28 :   if (!opt) {
    1256            1 :     ml_loge("Cannot train network without optimizer.");
    1257            1 :     return ML_ERROR_INVALID_PARAMETER;
    1258              :   }
    1259              : 
    1260           27 :   setTrainConfig(values);
    1261              : 
    1262              :   /** set batch size just before training */
    1263           23 :   model_graph.setBatchSize(
    1264              :     std::get<props::TrainingBatchSize>(model_flex_props));
    1265              : 
    1266           23 :   status = allocate(ExecutionMode::TRAIN);
    1267           23 :   NN_RETURN_STATUS();
    1268              : 
    1269              :   status =
    1270           46 :     train_run(stop_cb, stop_user_data, epoch_complete_cb, epoch_user_data);
    1271           23 :   NN_RETURN_STATUS();
    1272              : 
    1273              :   /**
    1274              :    * Free the memory needed for training before exiting.
    1275              :    * Note that this does not free the weights for the model.
    1276              :    * Weights of the model will be freed when the model is destroyed.
    1277              :    */
    1278              :   model_graph.deallocateTensors(false);
    1279           23 :   return status;
    1280              : }
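
A sketch of driving this entry point with its two callbacks (illustrative; the property strings and the stop-flag plumbing are hypothetical examples):

// Minimal sketch: train with a cooperative stop flag and an epoch hook.
#include <neuralnet.h>

int train_with_callbacks(nntrainer::NeuralNetwork &net, bool *stop_flag) {
  auto stop_cb = [](void *data) -> bool {
    // Returning true stops training before the next iteration/epoch.
    return *static_cast<bool *>(data);
  };
  auto epoch_cb = [](void * /*user_data*/) {
    // e.g. log metrics or checkpoint once per epoch
  };
  return net.train({"epochs=10", "batch_size=32"}, stop_cb, stop_flag,
                   epoch_cb, nullptr);
}
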
    1281              : 
    1282              : /**
    1283              :  * @brief     Run NeuralNetwork train with callback function by user
    1284              :  */
    1285           23 : int NeuralNetwork::train_run(
    1286              :   std::function<bool(void *userdata)> stop_cb, void *stop_user_data,
    1287              :   std::function<void(void *userdata)> epoch_complete_cb,
    1288              :   void *epoch_user_data) {
    1289              :   int status = ML_ERROR_NONE;
    1290              : 
    1291           23 :   if (!std::get<props::ContinueTrain>(model_flex_props)) {
    1292           23 :     epoch_idx = 0;
    1293           23 :     iter = 0;
    1294          115 :     for (auto iter = model_graph.cbegin(); iter != model_graph.cend(); iter++) {
    1295          184 :       (*iter)->clearOptVar();
    1296              :     }
    1297              :   }
    1298              : 
    1299              :   auto batch_size = std::get<props::TrainingBatchSize>(model_flex_props);
    1300              : 
    1301           23 :   auto const &outputs = model_graph.getOutputTensors();
    1302           23 :   auto in_dims = model_graph.getInputDimension();
    1303           23 :   auto label_dims = model_graph.getOutputDimension();
    1304              : 
    1305              :   auto &[train_buffer, valid_buffer, test_buffer] = data_buffers;
    1306              : 
    1307           23 :   if (train_buffer == nullptr) {
    1308            0 :     ml_loge("[NeuralNetworks] there is no train dataset!");
    1309            0 :     return ML_ERROR_INVALID_PARAMETER;
    1310              :   }
    1311              : 
    1312              :   /**
     1313              :    * @brief run a single epoch with given callback; @a auto is used instead
     1314              :    * of std::function to avoid its call overhead
    1315              :    * @param buffer buffer to run
    1316              :    * @param shuffle whether to shuffle or not
    1317              :    * @param on_iteration_fetch function that will receive reference to stat,
    1318              :    * buffer which will be called every time data is fetched and set
    1319              :    * @param on_epoch_end function that will receive reference to stat,
    1320              :    * buffer which will be called on the epoch end
    1321              :    */
    1322         1387 :   auto run_epoch = [this, &in_dims, &label_dims, &outputs, batch_size](
    1323              :                      DataBuffer *buffer, bool shuffle,
    1324              :                      auto &&on_iteration_fetch, auto &&on_iteration_update_stat,
    1325              :                      auto &&on_epoch_end, RunStats &stat) {
     1326              :     /// @todo metric management should be handled here as well; for now it
     1327              :     /// is handled in the individual callbacks
    1328              :     // RunStats stat;
    1329              : 
    1330         1364 :     stat.accuracy = 0.0;
    1331         1364 :     stat.loss = 0.0;
    1332         1364 :     stat.num_iterations = 0;
    1333         1364 :     stat.num_correct_predictions = 0;
    1334         1364 :     stat.max_epoch = getEpochs();
    1335         1364 :     stat.epoch_idx = epoch_idx;
    1336              : 
    1337         1364 :     std::future<std::shared_ptr<IterationQueue>> future_iq =
    1338              :       buffer->startFetchWorker(in_dims, label_dims, shuffle);
    1339         1637 :     while (true) {
    1340         8297 :       ScopedView<Iteration> iter_view = buffer->fetch();
    1341         8297 :       if (iter_view.isEmpty()) {
    1342              :         break;
    1343              :       }
    1344              :       auto &iteration = iter_view.get();
    1345         6933 :       if (iteration.batch() != static_cast<unsigned int>(batch_size)) {
    1346              :         /// @todo support partial batch
    1347              :         continue;
    1348              :       }
    1349              : 
    1350              :       auto const &labels = iteration.getLabelsRef();
    1351              :       auto const &inputs = iteration.getInputsRef();
    1352         5673 :       model_graph.setInputsLabels(inputs, labels);
    1353              : 
    1354         5673 :       on_iteration_fetch(stat, *buffer);
    1355          273 :       on_iteration_update_stat(stat, outputs, labels);
    1356              :     }
    1357         1364 :     future_iq.get();
    1358         1364 :     on_epoch_end(stat, *buffer);
    1359              : 
    1360         1364 :     if (stat.num_iterations == 0) {
    1361            0 :       throw std::runtime_error("No data came while buffer ran");
    1362              :     }
    1363              : 
    1364         1364 :     return stat;
    1365           23 :   };
    1366              : 
    1367              :   auto train_for_iteration =
    1368           23 :     [this, stop_cb, stop_user_data](RunStats &stat, DataBuffer &buffer) {
    1369         5400 :       ml_logi("train for iteration");
    1370         5400 :       forwarding(true, stop_cb, stop_user_data);
    1371         5400 :       backwarding(iter++, stop_cb, stop_user_data);
    1372              : 
     1373              :       // Clear the cache to avoid unintended memory growth
    1374         5400 :       model_graph.flushCache();
    1375              : 
    1376        10800 :       if (!stop_cb(stop_user_data)) {
    1377        10800 :         std::cout << "#" << epoch_idx << "/" << getEpochs();
    1378         5400 :         ml_logi("# %d / %d", epoch_idx, getEpochs());
    1379         5400 :         auto loss = getLoss();
    1380         5400 :         buffer.displayProgress(stat.num_iterations, loss);
    1381              :       }
    1382         5423 :     };
    1383              : 
    1384              :   auto update_train_stat = [this](RunStats &stat,
    1385              :                                   const std::vector<Tensor> &outputs,
    1386              :                                   const std::vector<Tensor> &labels) {
    1387         5400 :     stat.loss += getLoss();
    1388         5400 :     stat.num_iterations++;
    1389           23 :   };
    1390              : 
    1391           23 :   auto train_epoch_end = [this, stop_cb, stop_user_data](RunStats &stat,
    1392              :                                                          DataBuffer &buffer) {
    1393         1233 :     if (stat.num_iterations != 0) {
    1394         1233 :       stat.loss /= static_cast<float>(stat.num_iterations);
    1395              :     } else {
    1396              :       std::cerr << "stat.num_iterations is 0" << std::endl;
    1397            0 :       return;
    1398              :     }
    1399         1233 :     auto &save_path = std::get<props::SavePath>(model_flex_props);
    1400         2466 :     if (!stop_cb(stop_user_data)) {
    1401         1233 :       if (!save_path.empty()) {
    1402         1021 :         save(save_path, ml::train::ModelFormat::MODEL_FORMAT_BIN);
    1403              :       }
    1404              : 
    1405         2466 :       std::cout << "#" << epoch_idx << "/" << getEpochs()
    1406         1233 :                 << " - Training Loss: " << stat.loss;
    1407         1233 :       ml_logi("# %d / %d - Training Loss: %f", epoch_idx, getEpochs(),
    1408              :               stat.loss);
    1409         2466 :       ml_logd("[NNTrainer] Training epoch %d / %d finished successfully.",
    1410              :               epoch_idx, getEpochs());
    1411              :     } else {
    1412            0 :       ml_logd("[NNTrainer] Training stopped by stop callback function during "
    1413              :               "epoch %d.",
    1414              :               epoch_idx);
    1415              :     }
    1416           23 :   };
    1417              : 
    1418          273 :   auto eval_for_iteration = [this, batch_size, stop_cb, stop_user_data](
    1419              :                               RunStats &stat, DataBuffer &buffer) {
    1420          273 :     forwarding(false, stop_cb, stop_user_data);
    1421          296 :   };
    1422              : 
    1423          273 :   auto update_eval_stat = [batch_size, &update_train_stat](
    1424              :                             RunStats &stat, const std::vector<Tensor> &outputs,
    1425              :                             const std::vector<Tensor> &labels) {
    1426          273 :     auto model_out = outputs[0].argmax();
    1427          273 :     auto label_out = labels[0].argmax();
    1428              : 
    1429         3996 :     for (unsigned int b = 0; b < batch_size; b++) {
    1430         3723 :       if (model_out[b] == label_out[b])
    1431         2498 :         stat.num_correct_predictions++;
    1432              :     }
    1433              : 
    1434          273 :     update_train_stat(stat, outputs, labels);
    1435          273 :   };
    1436              : 
    1437          154 :   auto eval_epoch_end = [this, batch_size, max_acc = 0.0f,
    1438           23 :                          min_loss = std::numeric_limits<float>::max()](
    1439              :                           RunStats &stat, DataBuffer &buffer) mutable {
    1440          131 :     if (stat.num_iterations != 0) {
    1441          131 :       stat.loss /= static_cast<float>(stat.num_iterations);
    1442              :     } else {
    1443              :       std::cerr << "stat.num_iterations is 0" << std::endl;
    1444            0 :       return;
    1445              :     }
    1446          262 :     stat.accuracy = stat.num_correct_predictions /
    1447          131 :                     static_cast<float>(stat.num_iterations * batch_size) *
    1448              :                     100.0f;
    1449              : 
    1450          131 :     if (stat.accuracy > max_acc ||
    1451          104 :         (stat.accuracy == max_acc && stat.loss < min_loss)) {
    1452           48 :       max_acc = stat.accuracy;
     1453              :       /// @note this is not the global minimum loss over training; it is
     1454              :       /// only updated when the best-accuracy record changes
    1455           48 :       min_loss = stat.loss;
    1456           48 :       auto &save_best_path = std::get<props::SaveBestPath>(model_flex_props);
    1457           48 :       if (!save_best_path.empty()) {
    1458            0 :         save(save_best_path);
    1459              :       }
    1460              :     }
    1461          131 :     std::cout << " >> [ Accuracy: " << stat.accuracy
     1462          131 :               << "% - Validation Loss: " << stat.loss << " ]";
     1463          262 :     ml_logi("[ Accuracy: %.2f %% - Validation Loss: %.5f ]", stat.accuracy,
    1464              :             stat.loss);
    1465           23 :   };
    1466              : 
    1467              :   PROFILE_MEM_ANNOTATE("TRAIN START");
    1468              :   auto epochs = getEpochs();
    1469           23 :   ml_logd("[NNTrainer] Starts training. Current epoch: %d. Total epochs: %d.",
    1470              :           epoch_idx + 1, getEpochs());
    1471         1256 :   for (epoch_idx = epoch_idx + 1; epoch_idx <= epochs; ++epoch_idx) {
    1472         1233 :     if (stop_cb(stop_user_data)) {
    1473            0 :       --epoch_idx;
    1474            0 :       break;
    1475              :     }
    1476         1233 :     training = run_epoch(train_buffer.get(), true, train_for_iteration,
    1477         1233 :                          update_train_stat, train_epoch_end, training);
    1478         1233 :     if (valid_buffer) {
    1479          131 :       validation = run_epoch(valid_buffer.get(), false, eval_for_iteration,
    1480          131 :                              update_eval_stat, eval_epoch_end, validation);
    1481              :     }
    1482         1233 :     std::cout << '\n';
    1483         1233 :     epoch_complete_cb(epoch_user_data);
    1484              :   }
    1485              :   PROFILE_MEM_ANNOTATE("TRAIN END");
    1486              : 
    1487           23 :   if (test_buffer) {
    1488            0 :     std::cout << "Evaluation with test data...\n";
    1489            0 :     testing = run_epoch(test_buffer.get(), false, eval_for_iteration,
    1490            0 :                         update_eval_stat, eval_epoch_end, testing);
    1491              :   }
    1492              : 
    1493              :   /** Clear the set inputs and labels */
    1494           23 :   model_graph.setInputsLabels({}, {});
    1495              : 
    1496              :   return status;
    1497           46 : }
    1498              : 
    1499          540 : void swap(NeuralNetwork &lhs, NeuralNetwork &rhs) {
    1500              :   {
    1501              :     using std::swap;
    1502              : 
    1503              :     swap(lhs.model_props, rhs.model_props);
    1504              :     swap(lhs.model_flex_props, rhs.model_flex_props);
    1505          540 :     swap(lhs.load_path, rhs.load_path);
    1506              :     swap(lhs.epoch_idx, rhs.epoch_idx);
    1507              :     swap(lhs.iter, rhs.iter);
    1508              :     swap(lhs.loss, rhs.loss);
    1509              :     swap(lhs.opt, rhs.opt);
    1510              :     swap(lhs.data_buffers, rhs.data_buffers);
    1511              :     swap(lhs.initialized, rhs.initialized);
    1512              :     swap(lhs.model_graph, rhs.model_graph);
    1513              :     swap(lhs.graph_representation, rhs.graph_representation);
    1514              :     swap(lhs.compiled, rhs.compiled);
    1515              :     swap(lhs.loadedFromConfig, rhs.loadedFromConfig);
    1516              :   }
    1517          540 : }
    1518              : 
    1519         4113 : int NeuralNetwork::addLayer(NodeType layer) {
    1520              :   int status = ML_ERROR_NONE;
    1521              : 
    1522         4113 :   if (initialized) {
    1523              :     return ML_ERROR_NOT_SUPPORTED;
    1524              :   }
    1525              : 
    1526              :   /** Insert the layer to the graph */
    1527         4112 :   model_graph.addLayer(layer);
    1528         4111 :   graph_representation.push_back(layer);
    1529              : 
    1530         4111 :   return status;
    1531              : }
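
A sketch of the usual construction sequence this method guards (illustrative; the layer types and properties are examples, and the ccapi createLayer factory is assumed):

// Minimal sketch: add layers before the model is initialized.
#include <layer.h>
#include <model.h>

void build_graph(ml::train::Model &model) {
  model.addLayer(
    ml::train::createLayer("input", {"name=in", "input_shape=1:1:784"}));
  model.addLayer(ml::train::createLayer(
    "fully_connected", {"name=fc", "unit=10", "activation=softmax"}));
  // After initialization, addLayer() returns ML_ERROR_NOT_SUPPORTED.
}
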
    1532              : 
    1533            3 : NeuralNetwork &NeuralNetwork::copyConfiguration(NeuralNetwork &from) {
    1534            3 :   if (this != &from) {
    1535              :     model_props = from.model_props;
    1536              :     model_flex_props = from.model_flex_props;
    1537            3 :     loss = from.loss;
    1538              :     opt = from.opt;
    1539              : 
    1540              :     NetworkGraph f_graph = from.getNetworkGraph();
    1541            7 :     for (auto &l_node : f_graph.getLayerNodes()) {
    1542            9 :       addLayer(static_cast<std::shared_ptr<ml::train::Layer>>(
    1543            9 :         l_node->cloneConfiguration()));
    1544            3 :     }
    1545            3 :   }
    1546            2 :   return *this;
    1547              : }
    1548              : 
    1549              : NeuralNetwork::GraphType
    1550            0 : NeuralNetwork::getUnsortedLayers(const std::string &input_layer,
    1551              :                                  const std::string &output_layer) {
    1552            0 :   return model_graph.getUnsortedLayers(input_layer, output_layer);
    1553              : }
    1554              : 
    1555          741 : int NeuralNetwork::setOptimizer(
    1556              :   std::shared_ptr<ml::train::Optimizer> optimizer) {
    1557          741 :   if (initialized) {
    1558            0 :     ml_loge("Cannot set optimizer if already initialized");
    1559            0 :     return ML_ERROR_NOT_SUPPORTED;
    1560              :   }
    1561              : 
    1562          741 :   opt = std::static_pointer_cast<OptimizerWrapped>(optimizer);
    1563              : 
    1564          741 :   return ML_ERROR_NONE;
    1565              : }
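
And the matching caller-side sketch (illustrative; the optimizer type and learning-rate value are arbitrary examples, and the ccapi createOptimizer factory is assumed):

// Minimal sketch: set an optimizer before compile/initialize.
#include <model.h>
#include <optimizer.h>

void set_optimizer(ml::train::Model &model) {
  auto opt = ml::train::createOptimizer("adam", {"learning_rate=0.001"});
  model.setOptimizer(std::move(opt)); // rejected once initialized (above)
}
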
    1566              : 
    1567           30 : int NeuralNetwork::setDataBuffer(const DatasetModeType &mode,
    1568              :                                  std::shared_ptr<DataBuffer> data_buffer) {
    1569           30 :   if (data_buffer == nullptr) {
    1570              :     return ML_ERROR_INVALID_PARAMETER;
    1571              :   }
    1572              : 
    1573           30 :   this->data_buffers[static_cast<int>(mode)] = data_buffer;
    1574              : 
    1575           30 :   return ML_ERROR_NONE;
    1576              : }
    1577              : 
    1578           12 : int NeuralNetwork::getLayer(const char *name,
    1579              :                             std::shared_ptr<ml::train::Layer> *layer) {
     1580              :   // Layer modification through this API is permitted at the user's own risk.
    1581              :   //
    1582              :   // if (compiled) {
    1583              :   //   ml_loge("Cannot get compiled layer.");
    1584              :   //   return ML_ERROR_NOT_SUPPORTED;
    1585              :   // }
    1586              : 
    1587            9 :   *layer = std::static_pointer_cast<ml::train::Layer>(
    1588           12 :     model_graph.getLayerNode(std::string(name)));
    1589            9 :   return ML_ERROR_NONE;
    1590              : }
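
A sketch of per-layer access through this hook (illustrative; the layer name and the "trainable" property are hypothetical examples):

// Minimal sketch: look up a node by name and adjust it in place.
#include <layer.h>
#include <model.h>

void freeze_fc(ml::train::Model &model) {
  std::shared_ptr<ml::train::Layer> layer;
  if (model.getLayer("fc", &layer) == 0 /* ML_ERROR_NONE */) {
    // As the comment above notes, post-compile edits are the caller's
    // responsibility.
    layer->setProperty({"trainable=false"});
  }
}
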
    1591              : 
    1592           19 : void NeuralNetwork::printMetrics(std::ostream &out, unsigned int flags) {
    1593           19 :   switch (flags) {
    1594            4 :   case ML_TRAIN_SUMMARY_MODEL_TRAIN_LOSS:
    1595            4 :     out << training.loss << std::endl;
    1596              :     break;
    1597              : 
    1598            4 :   case ML_TRAIN_SUMMARY_MODEL_VALID_LOSS:
    1599            4 :     out << validation.loss << std::endl;
    1600              :     break;
    1601              : 
    1602            4 :   case ML_TRAIN_SUMMARY_MODEL_VALID_ACCURACY:
    1603            4 :     out << validation.accuracy << std::endl;
    1604              :     break;
    1605              : 
    1606              :   default:
    1607              :     break;
    1608              :   }
    1609           19 : }
    1610              : 
    1611           19 : void NeuralNetwork::printPreset(std::ostream &out, unsigned int preset) {
    1612              :   /** print neuralnet metrics */
    1613           19 :   printMetrics(out, preset);
    1614           19 :   if (preset > ML_TRAIN_SUMMARY_TENSOR)
    1615              :     return;
    1616              : 
    1617              :   LayerNode::PrintPreset layer_preset = LayerNode::PrintPreset::PRINT_NONE;
    1618              : 
    1619              :   ///@todo match flags with preset
    1620              :   unsigned int flags = PRINT_INST_INFO | PRINT_GRAPH_INFO | PRINT_PROP |
    1621              :                        PRINT_OPTIMIZER | PRINT_METRIC;
    1622              : 
    1623            7 :   switch (preset) {
    1624            1 :   case ML_TRAIN_SUMMARY_TENSOR:
    1625              :     layer_preset = LayerNode::PrintPreset::PRINT_ALL;
    1626            1 :     break;
    1627            0 :   case ML_TRAIN_SUMMARY_LAYER:
    1628            0 :     layer_preset = initialized ? LayerNode::PrintPreset::PRINT_SUMMARY
    1629              :                                : LayerNode::PrintPreset::PRINT_SUMMARY_META;
    1630              :     break;
    1631              :   case ML_TRAIN_SUMMARY_MODEL:
    1632              :     break;
    1633              :   default:
    1634              :     throw std::invalid_argument("given verbosity is invalid");
    1635              :   }
    1636              : 
    1637            7 :   print(out, flags, layer_preset);
    1638              : }
    1639              : 
    1640            2 : void NeuralNetwork::addWithReferenceLayers(
    1641              :   const std::vector<std::shared_ptr<ml::train::Layer>> &reference,
    1642              :   const std::string &scope, const std::vector<std::string> &input_layers,
    1643              :   const std::vector<std::string> &start_layers,
    1644              :   const std::vector<std::string> &end_layers,
    1645              :   ml::train::ReferenceLayersType type,
    1646              :   const std::vector<std::string> &type_properties) {
    1647              :   std::vector<NodeType> casted_reference;
    1648            2 :   casted_reference.reserve(reference.size());
    1649            6 :   for (auto &node : reference) {
    1650            8 :     casted_reference.emplace_back(std::static_pointer_cast<LayerNode>(node));
    1651              :   }
    1652              : 
    1653            2 :   addWithReferenceLayers(casted_reference, scope, input_layers, start_layers,
    1654              :                          end_layers, type, type_properties);
    1655            2 : }
    1656              : 
    1657           52 : void NeuralNetwork::addWithReferenceLayers(
    1658              :   const std::vector<std::shared_ptr<LayerNode>> &reference,
    1659              :   const std::string &scope, const std::vector<std::string> &input_layers,
    1660              :   const std::vector<std::string> &start_layers,
    1661              :   const std::vector<std::string> &end_layers,
    1662              :   ml::train::ReferenceLayersType type,
    1663              :   const std::vector<std::string> &type_properties) {
     1664              :   /// @todo the configuration below should be extracted into a free function
     1665              :   /// to make it more testable, and reused inside the graph interpreter
    1666              : 
     1667              :   /// @note connections could be exploited at a finer connection-to-connection
     1668              :   /// granularity; this is not supported yet but would be straightforward to add
    1669              :   std::vector<std::shared_ptr<LayerNode>> nodes;
    1670           52 :   nodes.reserve(reference.size());
    1671          316 :   for (auto &node : reference) {
    1672          528 :     nodes.push_back(node->cloneConfiguration());
    1673              :   }
    1674              : 
    1675              :   auto start_conns =
    1676           52 :     std::vector<Connection>(start_layers.begin(), start_layers.end());
    1677              :   auto input_conns =
    1678           52 :     std::vector<Connection>(input_layers.begin(), input_layers.end());
    1679              :   auto end_conns =
    1680           52 :     std::vector<Connection>(end_layers.begin(), end_layers.end());
    1681              : 
    1682              :   std::vector<std::unique_ptr<GraphRealizer>> realizers;
    1683              : 
    1684           52 :   realizers.emplace_back(new PreviousInputRealizer(start_conns));
    1685           52 :   realizers.emplace_back(new SliceRealizer(start_conns, end_conns));
    1686              : 
    1687           52 :   if (!input_conns.empty()) {
    1688           51 :     realizers.emplace_back(new InputRealizer(start_conns, input_conns));
    1689              :   }
    1690              : 
    1691           52 :   if (type == ml::train::ReferenceLayersType::RECURRENT) {
    1692           51 :     realizers.emplace_back(
    1693           51 :       new RecurrentRealizer(type_properties, input_conns, end_conns));
    1694              :   }
    1695              : 
    1696           52 :   if (!scope.empty()) {
    1697           52 :     realizers.emplace_back(
    1698           52 :       new RemapRealizer([&scope, &input_conns](std::string &name) {
    1699         4394 :         for (auto &i : input_conns) {
    1700         3578 :           if (i.getName() == name) {
    1701              :             return;
    1702              :           }
    1703              :         }
    1704         1632 :         name = scope + "/" + name;
    1705          104 :       }));
    1706              :   }
    1707              : 
    1708          310 :   for (auto &realizer : realizers) {
    1709          258 :     nodes = realizer->realize(nodes);
    1710              :   }
    1711              : 
    1712          314 :   for (auto &node : nodes) {
    1713          524 :     addLayer(node);
    1714              :   }
    1715           52 : }
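
A hedged sketch of the recurrent path (illustrative only; the realizer property keys shown here, "unroll_for", "recurrent_input", and "recurrent_output", are assumptions about RecurrentRealizer's accepted keys and should be verified against its implementation):

// Minimal sketch: clone a one-layer cell and unroll it for two timesteps.
#include <layer.h>
#include <neuralnet.h>
#include <vector>

void add_unrolled(nntrainer::NeuralNetwork &net) {
  std::vector<std::shared_ptr<ml::train::Layer>> cell = {
    ml::train::createLayer("fully_connected", {"name=fc", "unit=4"})};

  net.addWithReferenceLayers(
    cell, /*scope=*/"decoder", /*input_layers=*/{"fc"},
    /*start_layers=*/{"fc"}, /*end_layers=*/{"fc"},
    ml::train::ReferenceLayersType::RECURRENT,
    {"unroll_for=2", "recurrent_input=fc", "recurrent_output=fc"});
  // Each clone ends up renamed to "decoder/<name>" by the RemapRealizer.
}
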
    1716              : 
    1717          240 : void NeuralNetwork::exportTo(Exporter &exporter,
    1718              :                              const ml::train::ExportMethods &method) const {
    1719          240 :   exporter.saveResult(model_props, method, this);
    1720          240 :   exporter.saveResult(model_flex_props, method, this);
    1721          240 : }
    1722              : 
    1723            7 : void NeuralNetwork::print(std::ostream &out, unsigned int flags,
    1724              :                           LayerNode::PrintPreset layerPrintPreset) {
    1725              :   if (flags & PRINT_INST_INFO) {
    1726              :     /// @todo uncomment this after implement getProperty (#1875)
    1727              :     // out << "===================";
    1728              :     // printInstance(out, this);
    1729              :   }
    1730              : 
    1731            7 :   if (flags & PRINT_GRAPH_INFO) {
    1732              :     unsigned int total_col_size = 80;
    1733            7 :     std::vector<unsigned int> column_size = {20, 20, 20, 20};
    1734              :     auto print_graph_layer_info =
    1735            7 :       [column_size](std::ostream &out, std::vector<std::string> layer_info) {
    1736          140 :         const auto &trim_string = [](std::string str,
    1737              :                                      unsigned int column_width) {
    1738          140 :           return str.size() < column_width ? str
    1739           14 :                                            : str.substr(0, column_width - 1);
    1740              :         };
    1741              : 
    1742          175 :         for (unsigned int i = 0; i < column_size.size(); ++i) {
    1743              :           out << std::setw(column_size[i])
    1744          280 :               << trim_string(layer_info[i], column_size[i]);
    1745              :         }
    1746           35 :         out << "\n";
    1747           42 :       };
    1748              : 
    1749            7 :     out << std::string(total_col_size, '=') << '\n';
    1750            7 :     print_graph_layer_info(
    1751              :       out, {"Layer name", "Layer type", "Output dimension", "Input layer"});
    1752            7 :     out << std::string(total_col_size, '=') << '\n';
    1753            7 :     if (compiled) {
    1754              :       props::GenericShape dim_property;
    1755              : 
    1756           25 :       for (auto iter = model_graph.cbegin(); iter != model_graph.cend();
    1757              :            iter++) {
    1758              :         std::string first_dim;
    1759           44 :         if (iter->getOutputDimensions().empty()) {
    1760              :           first_dim = "";
    1761              :         } else {
    1762           44 :           dim_property.set(iter->getOutputDimensions()[0]);
    1763           44 :           first_dim = to_string(dim_property);
    1764              :         }
    1765              :         const std::vector<std::string> &input_layer_names =
    1766           22 :           iter->getInputConnections();
    1767              :         std::string first_input_name =
    1768           41 :           input_layer_names.empty() ? "" : input_layer_names[0];
    1769          110 :         print_graph_layer_info(
    1770              :           out, {iter->getName(), iter->getType(), first_dim, first_input_name});
    1771           22 :         for (unsigned int i = 1; i < input_layer_names.size(); ++i) {
    1772            0 :           dim_property.set(iter->getInputDimensions()[i]);
    1773            0 :           print_graph_layer_info(out, {"", "", "", input_layer_names[i]});
    1774              :         }
    1775           44 :         out << std::string(total_col_size,
    1776              :                            iter == model_graph.cend() - 1 ? '=' : '-')
    1777           22 :             << '\n';
    1778           22 :       }
    1779              :     } else {
    1780              :       auto &input_connection =
    1781              :         std::get<std::vector<props::InputConnection>>(model_props);
    1782              :       auto model_input = std::vector<Connection>(input_connection.begin(),
    1783            4 :                                                  input_connection.end());
    1784              :       auto is_actually_an_input_node =
    1785            4 :         [model_input](graph_const_iterator<LayerNode> node) {
    1786              :           return node->hasInputShapeProperty() or
    1787              :                  std::any_of(model_input.begin(), model_input.end(),
    1788              :                              [node](auto &conn) {
    1789              :                                return node->getName() == conn.getName();
    1790              :                              });
    1791            4 :         };
    1792              : 
    1793           10 :       for (auto iter = model_graph.cbegin(); iter != model_graph.cend();
    1794              :            iter++) {
    1795              :         const std::vector<std::string> &input_layer_names =
    1796            6 :           iter->getInputConnections();
    1797              : 
    1798              :         /// @brief connection information.
     1799              :         // Intentionally left commented out.
    1800              :         // std::string first_input_name =
    1801              :         //   input_layer_names.empty()
    1802              :         //     ? (is_actually_an_input_node(iter) || iter ==
    1803              :         //     model_graph.cbegin()
    1804              :         //          ? ""
    1805              :         //          : (iter - 1)->getName())
    1806              :         //     : input_layer_names[0];
    1807           30 :         print_graph_layer_info(out, {iter->getName(), iter->getType(), "", ""});
    1808            6 :         for (unsigned int i = 1; i < input_layer_names.size(); ++i) {
    1809            0 :           print_graph_layer_info(out, {"", "", "", ""});
    1810              :         }
    1811            6 :         out << std::string(total_col_size,
    1812              :                            iter == model_graph.cend() - 1 ? '=' : '-')
    1813            6 :             << '\n';
    1814            6 :       }
    1815            4 :     }
    1816            7 :   }
    1817              : 
    1818              :   if (flags & PRINT_PROP) {
    1819              :     /// @todo print neuralnet property
    1820              :     /// @todo print mode (if it is eval or training)
    1821              :   }
    1822              : 
    1823              :   if (flags & PRINT_OPTIMIZER) {
    1824              :     /// @todo print optimizer (with print optimizer prop)
    1825              :   }
    1826              : 
    1827              :   if (flags & PRINT_METRIC) {
    1828              :     /// @todo print metric (currently it is done at printPreset as a
    1829              :     /// workaround)
    1830              :     /// @todo print loss function when it is not initialized. (if it is
    1831              :     /// initialized, loss layer will be printed)
    1832              :   }
    1833              : 
    1834            7 :   if (model_graph.empty()) {
    1835              :     out << "model is empty!" << std::endl;
    1836            0 :     return;
    1837              :   }
    1838              : 
    1839              :   /** print layer properties */
    1840           35 :   for (auto iter = model_graph.cbegin(); iter != model_graph.cend(); iter++)
    1841           56 :     (*iter)->printPreset(out, layerPrintPreset);
    1842              : 
    1843              :   /// @todo Add status to check neuralnet has been run. #290
    1844          118 : }
    1845              : 
    1846            0 : void NeuralNetwork::forEachLayer(
    1847              :   std::function<void(ml::train::Layer &, RunLayerContext &, void *)> fn,
    1848              :   void *user_data) {
    1849            0 :   for (auto iter = model_graph.cbegin(); iter != model_graph.cend(); iter++) {
    1850            0 :     auto ln = std::static_pointer_cast<LayerNode>(*iter).get();
    1851            0 :     fn(*ln, std::forward<RunLayerContext &>(ln->getRunContext()), user_data);
     1852              :   }
    1853            0 : }
    1854              : 
    1855            4 : void NeuralNetwork::exports(const ml::train::ExportMethods &method,
    1856              :                             const std::string file_path) {
    1857            4 :   switch (method) {
    1858            4 :   case ml::train::ExportMethods::METHOD_TFLITE: {
    1859              : #ifdef ENABLE_TFLITE_INTERPRETER
    1860            4 :     nntrainer::TfliteInterpreter interpreter;
    1861              : 
    1862              :     /// We will call "serialize" method for the model which is already trained
    1863              :     /// or allocated. So, we need to call deallocateTensors first to make sure
    1864              :     /// `dealloc_weights == false`
    1865              :     model_graph.deallocateTensors();
    1866            4 :     model_graph.allocateTensors(ExecutionMode::INFERENCE);
     1867            4 :     model_graph.setBatchSize(1); // For now, fix the inference batch size to 1
    1868            4 :     interpreter.serialize(graph_representation, file_path);
    1869              :     model_graph.deallocateTensors();
    1870              : #else
    1871              :     throw std::runtime_error{
     1872              :       "Export method METHOD_TFLITE is not supported. Please enable the "
     1873              :       "tflite interpreter by setting ENABLE_TFLITE_INTERPRETER=1"};
    1874              : #endif
    1875              :     break;
    1876              :   }
    1877            0 :   case ml::train::ExportMethods::METHOD_FLATBUFFER: {
    1878              : 
    1879              :     /**
    1880              :      * @todo The current FLATBUFFER exporter only supports TRAIN execution mode.
     1881              :      * It should be updated to support both train and inference modes.
     1882              :      * It would be more natural to support inference by default, since tflite
     1883              :      * is typically used solely for inference.
    1884              :      */
    1885              :     model_graph.deallocateTensors();
    1886            0 :     model_graph.allocateTensors(ExecutionMode::TRAIN);
    1887            0 :     break;
    1888              :   }
    1889            0 :   default:
    1890            0 :     throw std::runtime_error{"Unsupported export method"};
    1891              :   }
    1892            4 : }
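
A sketch of the tflite path (illustrative; it requires a build with ENABLE_TFLITE_INTERPRETER, and "model.tflite" is a hypothetical output path):

// Minimal sketch: export a trained model to a .tflite file.
#include <neuralnet.h>

void export_tflite(nntrainer::NeuralNetwork &net) {
  // Internally reallocates tensors for inference at batch size 1, then
  // serializes the graph via TfliteInterpreter (see the branch above).
  net.exports(ml::train::ExportMethods::METHOD_TFLITE, "model.tflite");
}
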
    1893              : } /* namespace nntrainer */
        

Generated by: LCOV version 2.0-1