// SPDX-License-Identifier: Apache-2.0
/**
 * Copyright (C) 2020 Parichay Kapoor <pk.kapoor@samsung.com>
 *
 * @file dynamic_training_optimization.h
 * @date 4 January 2021
 * @see https://github.com/nnstreamer/nntrainer
 * @author Parichay Kapoor <pk.kapoor@samsung.com>
 * @bug No known bugs except for NYI items
 * @brief This is Dynamic Training Optimization for Neural Networks
 *
 * Dynamic training aims to optimize the cost of applying the gradient.
 * The cost of applying the gradient includes the cost of the optimizer (Adam,
 * etc.), where the optimizer variables are updated, and the cost of actually
 * updating the weights (which can be non-trivial with bigger weights and
 * distributed training).
 *
 * There are two supported modes:
 * 1. Gradient Mode: The already calculated gradient is used to estimate
 * whether this gradient should be used to update the weight or whether the
 * update should be skipped.
 *
 * 2. Derivative Mode: This mode estimates an approximate gradient at low cost
 * in order to save the cost of calculating the full gradient, which would be
 * wasted if the gradient is not going to be applied.
 *
 * There are two supported reduction operations which reduce the gradient and
 * the weight to a single value in order to compare it with a threshold.
 * If the reduced value is less than the threshold, the update is performed
 * with a probability proportional to the reduced value. If the reduced value
 * is higher than the threshold, the update is always performed.
 */
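
/*
 * Putting the description above together, the per-weight decision roughly
 * follows this flow (a sketch only; the names are illustrative and the exact
 * ratio computation for each mode lives in the implementation file):
 *
 *   Tensor ratio = ...;                  // from the gradient (gradient mode)
 *                                        // or approximated from the input and
 *                                        // the derivative (derivative mode)
 *   float reduced = reduce_op(ratio);    // "max" or "norm" reduction
 *   bool apply = (reduced >= threshold)  // large update: always apply
 *                || chance(reduced);     // small update: apply with probability
 *                                        // proportional to the reduced value
 */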

#ifndef __DYNAMIC_TRAINING_OPT_H__
#define __DYNAMIC_TRAINING_OPT_H__
#ifdef __cplusplus

#include <functional>
#include <memory>
#include <random>
#include <stdexcept>
#include <string>
#include <vector>

#include <layer_devel.h>
#include <tensor.h>

namespace nntrainer {

class Weight;
class Var_Grad;
class OptimizerWrapped;

/**
 * @class DynamicTrainingOptimization
 * @brief Dynamic Training Optimization
 */
class DynamicTrainingOptimization {
public:
  /**
   * @brief Constructor of DynamicTrainingOptimization
   */
  DynamicTrainingOptimization(float threshold_ = 1.0f, int skip_n_iter = 1);

  /**
   * @brief Set threshold for optimization
   */
  void setThreshold(float threshold_) {
    if (threshold_ < epsilon)
      throw std::invalid_argument("Threshold is too small or negative");

    threshold = threshold_;
  }

  /**
   * @brief Set the reduce operation for dynamic optimization
   */
  void setOp(const std::string &op) {
    if (op == dft_opt_max)
      reduce_op = reduceByMax;
    else if (op == dft_opt_norm)
      reduce_op = reduceByNorm;
    else
      throw std::invalid_argument(
        "Unsupported reduction op in dynamic training");
  }
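
  /*
   * For intuition, given an update-ratio tensor r = {0.1, -0.4, 0.2}:
   *   "max"  : reduceByMax(r)  = max(|r_i|) = 0.4
   *   "norm" : reduceByNorm(r) = l2 norm averaged by the tensor size
   *            (assuming division by the element count: sqrt(0.21) / 3 ~ 0.15)
   * The reduced scalar is then compared against the configured threshold.
   */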

  /**
   * @brief Enable the optimization
   */
  void enable() { enabled = true; }

  /**
   * @brief Disable the optimization
   */
  void disable() { enabled = false; }

  /**
   * @brief Set the mode for optimization
   */
  void setMode(const std::string &mode_) {
    if (mode_ == dft_opt_mode_derivative)
      calc_ratio_op = ratioUsingDerivative;
    else if (mode_ == dft_opt_mode_gradient)
      calc_ratio_op = ratioUsingGradient;
    else
      throw std::invalid_argument("Unsupported mode in dynamic training");
    calc_ratio_mode = mode_;
  }
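
  /*
   * A minimal configuration sketch (the object name and the chosen values are
   * illustrative only):
   *
   *   DynamicTrainingOptimization dyn_opt(0.7f, 10);
   *   dyn_opt.setMode(DynamicTrainingOptimization::dft_opt_mode_derivative);
   *   dyn_opt.setOp(DynamicTrainingOptimization::dft_opt_max);
   *   dyn_opt.enable();
   */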

  /**
   * @brief Check if the derivative mode is used for optimization
   * @note Uses the derivative to calculate an approximate gradient and
   * estimate whether the actual gradient needs to be applied
   */
  bool isDerivativeMode() {
    if (enabled && calc_ratio_mode == dft_opt_mode_derivative)
      return true;
    return false;
  }

  /**
   * @brief Check if the gradient mode is used for optimization
   * @note Uses the gradient to estimate whether this gradient needs to be
   * applied
   */
  bool isGradientMode() {
    if (enabled && calc_ratio_mode == dft_opt_mode_gradient)
      return true;
    return false;
  }

  /**
   * @brief Set the number of initial iterations to skip dynamic training
   * optimization
   * @note If the current iteration is less than skip_n_iterations, the weights
   * will be updated and dynamic training optimization will not be performed.
   */
  void setSkipIterations(int skip_n_iter) { skip_n_iterations = skip_n_iter; }
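
  /*
   * For example (the value is illustrative), apply every update for the first
   * 100 iterations before dynamic skipping is considered:
   *
   *   dyn_opt.setSkipIterations(100);
   */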

  /**
   * @brief Check if the given weights can skip updating
   * @param[in] weights All the weight tensors for a layer
   * @param[in] input Input tensor for a layer
   * @param[in] output Output tensor for a layer, from forward operation
   * @param[in] opt Optimizer used to update the layer weights
   * @param[in] iteration Current iteration number in training
   * @retval true if the update should be applied
   * @retval false if the update can be skipped
   */
  bool checkIfApply(const std::vector<Weight> &weights,
                    const std::shared_ptr<Var_Grad> &input,
                    const std::shared_ptr<Var_Grad> &output,
                    const std::shared_ptr<OptimizerWrapped> &opt,
                    int iteration);

  /**
   * @brief Check if the given weight can skip updating
   * @param[in] weight Weight tensor for a layer
   * @param[in] input Input tensor for a layer
   * @param[in] output Output tensor for a layer, from forward operation
   * @param[in] opt Optimizer used to update the layer weights
   * @param[in] iteration Current iteration number in training
   * @retval true if the update should be applied
   * @retval false if the update can be skipped
   */
  bool checkIfApply(const Weight &weight,
                    const std::shared_ptr<Var_Grad> &input,
                    const std::shared_ptr<Var_Grad> &output,
                    const std::shared_ptr<OptimizerWrapped> &opt,
                    int iteration);
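
  /*
   * A hedged sketch of gating a layer's update with the vector overload;
   * layer_weights, layer_input, layer_output, opt and applyGradients() are
   * illustrative placeholders, not part of this header:
   *
   *   if (dyn_opt.checkIfApply(layer_weights, layer_input, layer_output, opt,
   *                            iteration))
   *     applyGradients(layer_weights, opt); // possibly costly optimizer step
   *   // else: skip the optimizer step and the weight update this iteration
   */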

  /**< Different types of reduce operations */
  static constexpr const char *dft_opt_max = "max";
  static constexpr const char *dft_opt_norm = "norm";

  /**< Different types of optimization modes */
  static constexpr const char *dft_opt_mode_gradient = "gradient";
  static constexpr const char *dft_opt_mode_derivative = "derivative";

private:
  std::mt19937 rng; /**< random number generator */
  std::uniform_real_distribution<float>
    dist;                      /**< uniform random distribution */
  float threshold;             /**< threshold to decide when to skip updating */
  bool enabled;                /**< if optimization is enabled */
  float epsilon;               /**< epsilon to skip overflow */
  int skip_n_iterations;       /**< skip initial iterations from optimization */
  std::string calc_ratio_mode; /**< the mode to calc the ratio */

  std::function<float(Tensor const &)>
    reduce_op; /**< operation to reduce update ratio to value */
  std::function<float(const Weight &, const std::shared_ptr<Var_Grad> &,
                      const std::shared_ptr<Var_Grad> &,
                      std::function<float(Tensor const &)> reduce_op)>
    calc_ratio_op; /**< calculate the ratio of update to the weight */

  /**
   * @brief Calculate the ratio of update to the weight using derivative
   * @param[in] weight Weight tensor for a layer
   * @param[in] input Input tensor for a layer
   * @param[in] output Output tensor for a layer, from forward operation
   * @param[in] reduce_op Operation to reduce the ratio
   */
  static float
  ratioUsingDerivative(const Weight &weight,
                       const std::shared_ptr<Var_Grad> &input,
                       const std::shared_ptr<Var_Grad> &output,
                       std::function<float(Tensor const &)> reduce_op);

  /**
   * @brief Calculate the ratio of update to the weight using gradient
   * @param[in] weight Weight tensor for a layer
   * @param[in] input Input tensor for a layer
   * @param[in] output Output tensor for a layer, from forward operation
   * @param[in] reduce_op Operation to reduce the ratio
   */
  static float
  ratioUsingGradient(const Weight &weight,
                     const std::shared_ptr<Var_Grad> &input,
                     const std::shared_ptr<Var_Grad> &output,
                     std::function<float(Tensor const &)> reduce_op);

  /**
   * @brief Check if the update should be applied or skipped
   * @param[in] reduced_ratio Reduced value of the update ratio
   * @param[in] learning_rate Learning rate used by the optimizer
   * @retval true if the update should be applied
   * @retval false if the update can be skipped
   */
  bool checkIfApply(float reduced_ratio, float learning_rate);
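
  /*
   * One plausible shape for this decision, consistent with the file-level
   * description; exactly how learning_rate scales the probability is an
   * assumption:
   *
   *   if (reduced_ratio >= threshold)
   *     return true;                       // large update: always apply
   *   return dist(rng) < reduced_ratio * learning_rate / threshold;
   */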

  /**
   * @brief Reduce operation used to decide if the update should be skipped
   * @note Calculate the maximum absolute value (infinity norm) of the tensor
   */
  static float reduceByMax(Tensor const &ratio);

  /**
   * @brief Reduce operation used to decide if the update should be skipped
   * @note Calculate the l2 norm of the tensor averaged by its size
   */
  static float reduceByNorm(Tensor const &ratio);
};

} /* namespace nntrainer */

#endif /* __cplusplus */
#endif /* __DYNAMIC_TRAINING_OPT_H__ */