// SPDX-License-Identifier: Apache-2.0
/**
 * Copyright (C) 2020 Parichay Kapoor <pk.kapoor@samsung.com>
 *
 * @file dynamic_training_optimization.h
 * @date 4 January 2021
 * @see https://github.com/nnstreamer/nntrainer
 * @author Parichay Kapoor <pk.kapoor@samsung.com>
 * @bug No known bugs except for NYI items
 * @brief This is Dynamic Training Optimization for Neural Networks
 *
 * Dynamic training aims to optimize the cost of applying the gradient.
 * The cost of applying the gradient includes the cost of the optimizer (Adam,
 * etc.), where the optimizer variables are updated, and the cost of actually
 * updating the weights (which can be non-trivial with bigger weights and
 * distributed training).
 *
 * There are two supported modes:
 * 1. Gradient Mode: The already calculated gradient is used to estimate
 * whether this gradient should be used to update the weight or whether the
 * update should be skipped.
 *
 * 2. Derivative Mode: This mode estimates an approximate gradient at low cost
 * in order to save the cost of calculating the full gradient, which would be
 * wasted if the gradient is not going to be applied.
 *
 * There are two supported reduction operations which reduce the gradient and
 * the weight to a single value in order to compare it with a threshold.
 * If the reduced value is less than the threshold, the update is performed
 * with a probability proportional to the reduced value. If the reduced value
 * is higher than the threshold, the update is always performed.
 */
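
/*
 * Putting the description above together, the per-weight decision roughly
 * follows this flow (a sketch only; the names are illustrative and the exact
 * ratio computation for each mode lives in the implementation file):
 *
 *   Tensor ratio = ...;                  // from the gradient (gradient mode)
 *                                        // or approximated from the input and
 *                                        // the derivative (derivative mode)
 *   float reduced = reduce_op(ratio);    // "max" or "norm" reduction
 *   bool apply = (reduced >= threshold)  // large update: always apply
 *                || chance(reduced);     // small update: apply with probability
 *                                        // proportional to the reduced value
 */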

#ifndef __DYNAMIC_TRAINING_OPT_H__
#define __DYNAMIC_TRAINING_OPT_H__
#ifdef __cplusplus

#include <functional>
#include <memory>
#include <random>
#include <stdexcept>
#include <string>
#include <vector>

#include <layer_devel.h>
#include <tensor.h>

namespace nntrainer {

class Weight;
class Var_Grad;
class OptimizerWrapped;

/**
 * @class DynamicTrainingOptimization
 * @brief Dynamic Training Optimization
 */
class DynamicTrainingOptimization {
public:
  /**
   * @brief Constructor of DynamicTrainingOptimization
   */
  DynamicTrainingOptimization(float threshold_ = 1.0f, int skip_n_iter = 1);

  /**
   * @brief Set threshold for optimization
   */
  void setThreshold(float threshold_) {
    if (threshold_ < epsilon)
      throw std::invalid_argument("Threshold is too small or negative");

    threshold = threshold_;
  }

  /**
   * @brief Set the reduce operation for dynamic optimization
   */
  void setOp(const std::string &op) {
    if (op == dft_opt_max)
      reduce_op = reduceByMax;
    else if (op == dft_opt_norm)
      reduce_op = reduceByNorm;
    else
      throw std::invalid_argument(
        "Unsupported reduction op in dynamic training");
  }
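
  /*
   * For intuition, given an update-ratio tensor r = {0.1, -0.4, 0.2}:
   *   "max"  : reduceByMax(r)  = max(|r_i|) = 0.4
   *   "norm" : reduceByNorm(r) = l2 norm averaged by the tensor size
   *            (assuming division by the element count: sqrt(0.21) / 3 ~ 0.15)
   * The reduced scalar is then compared against the configured threshold.
   */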

  /**
   * @brief Enable the optimization
   */
  void enable() { enabled = true; }

  /**
   * @brief Disable the optimization
   */
  void disable() { enabled = false; }

  /**
   * @brief Set the mode for optimization
   */
  void setMode(const std::string &mode_) {
    if (mode_ == dft_opt_mode_derivative)
      calc_ratio_op = ratioUsingDerivative;
    else if (mode_ == dft_opt_mode_gradient)
      calc_ratio_op = ratioUsingGradient;
    else
      throw std::invalid_argument("Unsupported mode in dynamic training");
    calc_ratio_mode = mode_;
  }
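
  /*
   * A minimal configuration sketch (the object name and the chosen values are
   * illustrative only):
   *
   *   DynamicTrainingOptimization dyn_opt(0.7f, 10);
   *   dyn_opt.setMode(DynamicTrainingOptimization::dft_opt_mode_derivative);
   *   dyn_opt.setOp(DynamicTrainingOptimization::dft_opt_max);
   *   dyn_opt.enable();
   */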

  /**
   * @brief Check if the derivative mode is used for optimization
   * @note Uses the derivative to calculate an approximate gradient and
   * estimate whether the actual gradient needs to be applied
   */
  bool isDerivativeMode() {
    if (enabled && calc_ratio_mode == dft_opt_mode_derivative)
      return true;
    return false;
  }

  /**
   * @brief Check if the gradient mode is used for optimization
   * @note Uses the gradient to estimate whether this gradient needs to be
   * applied
   */
  bool isGradientMode() {
    if (enabled && calc_ratio_mode == dft_opt_mode_gradient)
      return true;
    return false;
  }

  /**
   * @brief Set the number of initial iterations to skip dynamic training
   * optimization
   * @note If the current iteration is less than skip_n_iterations, the weights
   * will be updated and dynamic training optimization will not be performed.
   */
  void setSkipIterations(int skip_n_iter) { skip_n_iterations = skip_n_iter; }
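
  /*
   * For example (the value is illustrative), apply every update for the first
   * 100 iterations before dynamic skipping is considered:
   *
   *   dyn_opt.setSkipIterations(100);
   */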

  /**
   * @brief Check if the given weights can skip updating
   * @param[in] weights All the weight tensors for a layer
   * @param[in] input Input tensor for a layer
   * @param[in] output Output tensor for a layer, from forward operation
   * @param[in] opt Optimizer used to update the layer weights
   * @param[in] iteration Current iteration number in training
   * @retval true if the update should be applied
   * @retval false if the update can be skipped
   */
  bool checkIfApply(const std::vector<Weight> &weights,
                    const std::shared_ptr<Var_Grad> &input,
                    const std::shared_ptr<Var_Grad> &output,
                    const std::shared_ptr<OptimizerWrapped> &opt,
                    int iteration);

  /**
   * @brief Check if the given weight can skip updating
   * @param[in] weight Weight tensor for a layer
   * @param[in] input Input tensor for a layer
   * @param[in] output Output tensor for a layer, from forward operation
   * @param[in] opt Optimizer used to update the layer weights
   * @param[in] iteration Current iteration number in training
   * @retval true if the update should be applied
   * @retval false if the update can be skipped
   */
  bool checkIfApply(const Weight &weight,
                    const std::shared_ptr<Var_Grad> &input,
                    const std::shared_ptr<Var_Grad> &output,
                    const std::shared_ptr<OptimizerWrapped> &opt,
                    int iteration);
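
  /*
   * A hedged sketch of gating a layer's update with the vector overload;
   * layer_weights, layer_input, layer_output, opt and applyGradients() are
   * illustrative placeholders, not part of this header:
   *
   *   if (dyn_opt.checkIfApply(layer_weights, layer_input, layer_output, opt,
   *                            iteration))
   *     applyGradients(layer_weights, opt); // possibly costly optimizer step
   *   // else: skip the optimizer step and the weight update this iteration
   */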

  /**< Different types of reduce operations */
  static constexpr const char *dft_opt_max = "max";
  static constexpr const char *dft_opt_norm = "norm";

  /**< Different types of optimization modes */
  static constexpr const char *dft_opt_mode_gradient = "gradient";
  static constexpr const char *dft_opt_mode_derivative = "derivative";

private:
  std::mt19937 rng; /**< random number generator */
  std::uniform_real_distribution<float>
    dist;                      /**< uniform random distribution */
  float threshold;             /**< threshold to decide when to skip updating */
  bool enabled;                /**< if optimization is enabled */
  float epsilon;               /**< epsilon to skip overflow */
  int skip_n_iterations;       /**< skip initial iterations from optimization */
  std::string calc_ratio_mode; /**< the mode to calc the ratio */

  std::function<float(Tensor const &)>
    reduce_op; /**< operation to reduce update ratio to value */
  std::function<float(const Weight &, const std::shared_ptr<Var_Grad> &,
                      const std::shared_ptr<Var_Grad> &,
                      std::function<float(Tensor const &)> reduce_op)>
    calc_ratio_op; /**< calculate the ratio of update to the weight */

  /**
   * @brief Calculate the ratio of update to the weight using derivative
   * @param[in] weight Weight tensor for a layer
   * @param[in] input Input tensor for a layer
   * @param[in] output Output tensor for a layer, from forward operation
   * @param[in] reduce_op Operation to reduce the ratio
   */
  static float
  ratioUsingDerivative(const Weight &weight,
                       const std::shared_ptr<Var_Grad> &input,
                       const std::shared_ptr<Var_Grad> &output,
                       std::function<float(Tensor const &)> reduce_op);

  /**
   * @brief Calculate the ratio of update to the weight using gradient
   * @param[in] weight Weight tensor for a layer
   * @param[in] input Input tensor for a layer
   * @param[in] output Output tensor for a layer, from forward operation
   * @param[in] reduce_op Operation to reduce the ratio
   */
  static float
  ratioUsingGradient(const Weight &weight,
                     const std::shared_ptr<Var_Grad> &input,
                     const std::shared_ptr<Var_Grad> &output,
                     std::function<float(Tensor const &)> reduce_op);

  /**
   * @brief Check if the update should be applied or skipped
   * @param[in] reduced_ratio Reduced value of the update ratio
   * @param[in] learning_rate Learning rate used by the optimizer
   * @retval true if the update should be applied
   * @retval false if the update can be skipped
   */
  bool checkIfApply(float reduced_ratio, float learning_rate);
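
  /*
   * One plausible shape for this decision, consistent with the file-level
   * description; exactly how learning_rate scales the probability is an
   * assumption:
   *
   *   if (reduced_ratio >= threshold)
   *     return true;                       // large update: always apply
   *   return dist(rng) < reduced_ratio * learning_rate / threshold;
   */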

  /**
   * @brief Reduce operation used to decide if the update should be skipped
   * @note Calculate the maximum absolute value (infinity norm) of the tensor
   */
  static float reduceByMax(Tensor const &ratio);

  /**
   * @brief Reduce operation used to decide if the update should be skipped
   * @note Calculate the l2 norm of the tensor averaged by its size
   */
  static float reduceByNorm(Tensor const &ratio);
};

} /* namespace nntrainer */

#endif /* __cplusplus */
#endif /* __DYNAMIC_TRAINING_OPT_H__ */