// SPDX-License-Identifier: Apache-2.0
/**
 * Copyright (C) 2022 hyeonseok Lee <hs89.lee@samsung.com>
 *
 * @file layer_normalization_layer.cpp
 * @date 25 July 2022
 * @see https://github.com/nnstreamer/nntrainer
 *      https://arxiv.org/abs/1607.06450
 * @author hyeonseok Lee <hs89.lee@samsung.com>
 * @bug No known bugs except for NYI items
 * @brief This is the Layer Normalization Layer class for neural networks
 *
 */

#include <algorithm>
#include <numeric>

#include <layer_context.h>
#include <layer_normalization_layer.h>
#include <nntrainer_error.h>
#include <nntrainer_log.h>
#include <node_exporter.h>
#include <util_func.h>

namespace nntrainer {

static constexpr size_t SINGLE_INOUT_IDX = 0;

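/**
 * @brief Indices of the weights and tensors requested by this layer
 *
 * gamma and beta are the trainable scale and shift weights. deviation,
 * variance and inv_std_dev cache intermediate results of the forward pass
 * for reuse in the backward pass. temp_origin_size and temp_normalized_size
 * are scratch tensors sized like the input and like the reduced statistics,
 * respectively.
 */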
enum LNParams {
  gamma,
  beta,
  deviation,
  variance,
  inv_std_dev,
  temp_origin_size,
  temp_normalized_size,
};

LayerNormalizationLayer::LayerNormalizationLayer() :
  Layer(),
  layer_normalization_props(std::vector<props::Axis>(), props::Epsilon(),
                            props::GammaInitializer(), props::BetaInitializer(),
                            props::WeightDecay(), props::BiasDecay()) {
  wt_idx.fill(std::numeric_limits<unsigned>::max());
}

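/**
 * @brief Finalize: derive tensor shapes and request weights / tensors
 *
 * The layer accepts exactly one input. The axes listed in the axis property
 * define the normalized dimensions: gamma and beta are requested with that
 * shape, while the cached statistics (variance, inv_std_dev) live on the
 * remaining axes. The deviation tensor keeps the full input shape, and two
 * scratch tensors are requested for the derivative computation.
 */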
void LayerNormalizationLayer::finalize(InitLayerContext &context) {
  if (context.getNumInputs() != 1) {
    throw std::invalid_argument(
      "Only one input is allowed for layer normalization layer");
  }

  auto gamma_initializer =
    std::get<props::GammaInitializer>(layer_normalization_props).get();
  auto beta_initializer =
    std::get<props::BetaInitializer>(layer_normalization_props).get();
  auto weight_decay = std::get<props::WeightDecay>(layer_normalization_props);
  auto bias_decay = std::get<props::BiasDecay>(layer_normalization_props);

  auto const &input_dim = context.getInputDimensions()[0];
  context.setOutputDimensions({input_dim});

  std::vector<props::Axis> axes_prop =
    std::get<std::vector<props::Axis>>(layer_normalization_props);

  NNTR_THROW_IF(axes_prop.empty(), std::invalid_argument)
    << "[Layer normalization] axis property is empty";

  normalize_axes.insert(normalize_axes.end(), axes_prop.begin(),
                        axes_prop.end());
  std::sort(normalize_axes.begin(), normalize_axes.end());
  normalize_axes.erase(
    std::unique(normalize_axes.begin(), normalize_axes.end()),
    normalize_axes.end());

  TensorDim normalize_dim(context.getFormat(), context.getWeightDataType());
  for (unsigned int axis : normalize_axes) {
    normalize_dim.setTensorDim(axis, input_dim.getTensorDim(axis));
  }

  wt_idx[LNParams::gamma] = context.requestWeight(
    normalize_dim, gamma_initializer, WeightRegularizer::NONE, 1.0f,
    weight_decay, "gamma", true);
  wt_idx[LNParams::beta] = context.requestWeight(
    normalize_dim, beta_initializer, WeightRegularizer::NONE, 1.0f, bias_decay,
    "beta", true);

  TensorDim remain_dim(context.getFormat(), context.getWeightDataType());
  std::vector<unsigned int> total_axes;
  total_axes.resize(ml::train::TensorDim::MAXDIM);
  std::iota(total_axes.begin(), total_axes.end(), 0u);
  std::set_difference(total_axes.begin(), total_axes.end(),
                      normalize_axes.begin(), normalize_axes.end(),
                      std::back_inserter(remain_axes));
  for (unsigned int axis : remain_axes) {
    remain_dim.setTensorDim(axis, input_dim.getTensorDim(axis));
  }

  /** caches the deviation -> input - avg(input) */
  wt_idx[LNParams::deviation] =
    context.requestTensor(input_dim, "deviation", Initializer::NONE, false,
                          TensorLifespan::ITERATION_LIFESPAN);
  /** caches the variance (epsilon is added in place during forwarding) */
  wt_idx[LNParams::variance] =
    context.requestTensor(remain_dim, "variance", Initializer::NONE, false,
                          TensorLifespan::ITERATION_LIFESPAN);
  /** caches the inverse standard deviation */
  wt_idx[LNParams::inv_std_dev] =
    context.requestTensor(remain_dim, "inv_std_dev", Initializer::NONE, false,
                          TensorLifespan::ITERATION_LIFESPAN);

  /** temporary tensor (origin size) */
  wt_idx[LNParams::temp_origin_size] =
    context.requestTensor(input_dim, "temp_origin_size", Initializer::NONE,
                          false, TensorLifespan::CALC_DERIV_LIFESPAN);
  /** temporary tensor (normalized size) */
  wt_idx[LNParams::temp_normalized_size] =
    context.requestTensor(remain_dim, "temp_normalized_size", Initializer::NONE,
                          false, TensorLifespan::CALC_DERIV_LIFESPAN);
}

void LayerNormalizationLayer::setProperty(
  const std::vector<std::string> &values) {
  auto remain_props = loadProperties(values, layer_normalization_props);
  NNTR_THROW_IF(!remain_props.empty(), std::invalid_argument)
    << "[Layer Normalization Layer] Unknown Layer Properties count " +
         std::to_string(values.size());
}

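/**
 * @brief Forward pass of layer normalization
 *
 * Over the normalized axes this computes
 *   mu  = E[x]
 *   dev = x - mu
 *   var = E[dev^2] + epsilon   (stored in `variance`)
 *   y   = gamma * dev / sqrt(var) + beta
 * The output tensor and inv_std_dev double as scratch buffers
 * (temp_full_size / temp_norm_size) before their final values are written.
 */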
void LayerNormalizationLayer::forwarding(RunLayerContext &context,
                                         bool training) {
  const float epsilon =
    std::get<props::Epsilon>(layer_normalization_props).get();

  const Tensor &input = context.getInput(SINGLE_INOUT_IDX);
  Tensor &output = context.getOutput(SINGLE_INOUT_IDX);

  Tensor &gamma = context.getWeight(wt_idx[LNParams::gamma]);
  Tensor &beta = context.getWeight(wt_idx[LNParams::beta]);

  Tensor &deviation = context.getTensor(wt_idx[LNParams::deviation]);
  Tensor &variance = context.getTensor(wt_idx[LNParams::variance]);
  Tensor &inv_std_dev = context.getTensor(wt_idx[LNParams::inv_std_dev]);

  Tensor &temp_full_size = output;
  Tensor &temp_norm_size = inv_std_dev;

  input.average(normalize_axes, temp_norm_size);
  input.subtract(temp_norm_size, deviation);

  deviation.pow(2.0, temp_full_size);
  temp_full_size.average(normalize_axes, variance);

  variance.add_i(epsilon);
  variance.pow(-0.5, inv_std_dev);

  deviation.multiply(inv_std_dev, output);
  output.multiply_i(gamma);
  output.add_i(beta);
}

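/**
 * @brief Incremental forward pass over the [from, to) height range
 *
 * Applies the same normalization as forwarding(). The step dimensions
 * describe the [from, to) slice of each tensor, depending on whether the
 * height axis is among the normalized axes. When FP16 is enabled, the sum of
 * squared deviations is accumulated in float before taking the inverse
 * square root.
 */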
void LayerNormalizationLayer::incremental_forwarding(RunLayerContext &context,
                                                      unsigned int from,
                                                      unsigned int to,
                                                      bool training) {
  const float epsilon =
    std::get<props::Epsilon>(layer_normalization_props).get();

  const Tensor &input = context.getInput(SINGLE_INOUT_IDX);
  Tensor &output = context.getOutput(SINGLE_INOUT_IDX);

  Tensor &gamma = context.getWeight(wt_idx[LNParams::gamma]);
  Tensor &beta = context.getWeight(wt_idx[LNParams::beta]);

  Tensor &deviation = context.getTensor(wt_idx[LNParams::deviation]);
  Tensor &variance = context.getTensor(wt_idx[LNParams::variance]);
  Tensor &inv_std_dev = context.getTensor(wt_idx[LNParams::inv_std_dev]);

  // @todo: consider NHWC format
  bool is_height_normalize =
    std::find(normalize_axes.begin(), normalize_axes.end(), 1) !=
    normalize_axes.end();

  TensorDim input_dim = input.getDim();
  TensorDim output_dim = output.getDim();
  TensorDim normalize_dim = gamma.getDim();
  TensorDim remain_dim = variance.getDim();

  TensorDim input_step_dim = input_dim;
  TensorDim output_step_dim = output_dim;
  TensorDim normalize_step_dim = normalize_dim;
  TensorDim remain_step_dim = remain_dim;

  input_step_dim.height(to - from);
  output_step_dim.height(to - from);
  normalize_step_dim.height(is_height_normalize ? to - from : 1);
  remain_step_dim.height(is_height_normalize ? 1 : to - from);

  Tensor &temp_full_size = output;
  Tensor &temp_norm_size = inv_std_dev;

  input.average(normalize_axes, temp_norm_size);
  input.subtract(temp_norm_size, deviation);

#ifndef ENABLE_FP16
  deviation.pow(2.0f, temp_full_size);
  temp_full_size.average(normalize_axes, variance);

  variance.add_i(epsilon);
  variance.pow(-0.5f, inv_std_dev);
#else
  unsigned int axis_dim = deviation.getDim()[normalize_axes[0]];
  for (unsigned int i = 0; i < deviation.getDim()[normalize_axes[0] - 1]; ++i) {
    float sum = 0.0f;

    _FP16 *data = deviation.getAddress<_FP16>(0, 0, i, 0);

    for (unsigned int j = 0; j < axis_dim; ++j) {
      sum += powf(static_cast<float>(data[j]), 2.0f);
    }
    /* inv_std_dev = 1 / sqrt(variance + epsilon), matching the float path */
    inv_std_dev.setValue(0, 0, i, 0, 1.0 / sqrt(sum / axis_dim + epsilon));
  }
#endif

  deviation.multiply(inv_std_dev, output);
  output.multiply_i(gamma);
  output.add_i(beta);
}

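/**
 * @brief Backward pass: derivative w.r.t. the input (and gamma)
 *
 * With x_hat = dev * inv_std_dev and incoming derivative dy, this computes
 *   d_gamma = sum over the remaining axes of (dy * x_hat)
 *   dx      = gamma * inv_std_dev *
 *             (dy - E[dy] - dev * E[dy * dev] / (var + epsilon))
 * where the expectations are taken over the normalized axes. Note that
 * deviation and inv_std_dev are modified in place while computing dx.
 */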
void LayerNormalizationLayer::calcDerivative(RunLayerContext &context) {
  const bool trainable = context.getTrainable();

  TensorDim::TensorType weight_tensor_type =
    context.getWeight(wt_idx[LNParams::gamma]).getTensorType();

  Tensor empty =
    Tensor("empty", weight_tensor_type.format, weight_tensor_type.data_type);

  Tensor &outgoing_derivative = context.getOutgoingDerivative(SINGLE_INOUT_IDX);
  const Tensor &incoming_derivative =
    context.getIncomingDerivative(SINGLE_INOUT_IDX);

  const Tensor &gamma = context.getWeight(wt_idx[LNParams::gamma]);
  Tensor &d_gamma =
    trainable ? context.getWeightGrad(wt_idx[LNParams::gamma]) : empty;

  Tensor &deviation = context.getTensor(wt_idx[LNParams::deviation]);
  Tensor &variance = context.getTensor(wt_idx[LNParams::variance]);
  Tensor &inv_std_dev = context.getTensor(wt_idx[LNParams::inv_std_dev]);

  Tensor &temp_origin_size =
    context.getTensor(wt_idx[LNParams::temp_origin_size]);
  Tensor &temp_normalized_size =
    context.getTensor(wt_idx[LNParams::temp_normalized_size]);

  incoming_derivative.multiply(deviation, temp_origin_size);
  temp_origin_size.average(normalize_axes, temp_normalized_size);
  temp_normalized_size.divide_i(variance);
  deviation.multiply_i(temp_normalized_size);

  if (trainable) {
    /** calculate d_gamma */
    temp_origin_size.multiply_i(inv_std_dev);
    temp_origin_size.sum(remain_axes, d_gamma);
  }
  incoming_derivative.average(normalize_axes, temp_normalized_size);
  incoming_derivative.subtract(temp_normalized_size, outgoing_derivative);
  outgoing_derivative.subtract_i(deviation);

  inv_std_dev.multiply_i(gamma);
  outgoing_derivative.multiply_i(inv_std_dev);
}

void LayerNormalizationLayer::calcGradient(RunLayerContext &context) {
  /** d_gamma is calculated in calcDerivative. d_beta is calculated here */
  const Tensor &incoming_derivative =
    context.getIncomingDerivative(SINGLE_INOUT_IDX);
  Tensor &d_beta = context.getWeightGrad(wt_idx[LNParams::beta]);

  incoming_derivative.sum(remain_axes, d_beta);
}

void LayerNormalizationLayer::exportTo(
  Exporter &exporter, const ml::train::ExportMethods &method) const {
  exporter.saveResult(layer_normalization_props, method, this);
}

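/**
 * @brief Propagate a batch size change to the per-iteration tensors
 *
 * The cached statistics and scratch tensors were requested with the original
 * batch size, so they are resized whenever the effective batch size changes.
 */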
void LayerNormalizationLayer::setBatch(RunLayerContext &context,
                                       unsigned int batch) {
  context.updateTensor(wt_idx[LNParams::deviation], batch);
  context.updateTensor(wt_idx[LNParams::variance], batch);
  context.updateTensor(wt_idx[LNParams::inv_std_dev], batch);
  context.updateTensor(wt_idx[LNParams::temp_origin_size], batch);
  context.updateTensor(wt_idx[LNParams::temp_normalized_size], batch);
}

} /* namespace nntrainer */