// SPDX-License-Identifier: Apache-2.0
/**
 * Copyright (C) 2022 hyeonseok Lee <hs89.lee@samsung.com>
 *
 * @file layer_normalization_layer.cpp
 * @date 25 July 2022
 * @see https://github.com/nnstreamer/nntrainer
 *      https://arxiv.org/abs/1607.06450
 * @author hyeonseok Lee <hs89.lee@samsung.com>
 * @bug No known bugs except for NYI items
 * @brief This is the Layer Normalization Layer class for neural networks
 *
 */

#include <algorithm>
#include <numeric>

#include <layer_context.h>
#include <layer_normalization_layer.h>
#include <nntrainer_error.h>
#include <nntrainer_log.h>
#include <node_exporter.h>
#include <util_func.h>

namespace nntrainer {

static constexpr size_t SINGLE_INOUT_IDX = 0;

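/**
 * @brief Indices of the weights and tensors requested by this layer
 *
 * gamma and beta are the trainable scale and shift weights. deviation,
 * variance and inv_std_dev cache intermediate results of the forward pass
 * for reuse in the backward pass. temp_origin_size and temp_normalized_size
 * are scratch tensors sized like the input and like the reduced statistics,
 * respectively.
 */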
enum LNParams {
  gamma,
  beta,
  deviation,
  variance,
  inv_std_dev,
  temp_origin_size,
  temp_normalized_size,
};

LayerNormalizationLayer::LayerNormalizationLayer() :
  Layer(),
  layer_normalization_props(std::vector<props::Axis>(), props::Epsilon(),
                            props::GammaInitializer(), props::BetaInitializer(),
                            props::WeightDecay(), props::BiasDecay()) {
  wt_idx.fill(std::numeric_limits<unsigned>::max());
}

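/**
 * @brief Finalize: derive tensor shapes and request weights / tensors
 *
 * The layer accepts exactly one input. The axes listed in the axis property
 * define the normalized dimensions: gamma and beta are requested with that
 * shape, while the cached statistics (variance, inv_std_dev) live on the
 * remaining axes. The deviation tensor keeps the full input shape, and two
 * scratch tensors are requested for the derivative computation.
 */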
void LayerNormalizationLayer::finalize(InitLayerContext &context) {
  if (context.getNumInputs() != 1) {
    throw std::invalid_argument(
      "Only one input is allowed for layer normalization layer");
  }

  auto gamma_initializer =
    std::get<props::GammaInitializer>(layer_normalization_props).get();
  auto beta_initializer =
    std::get<props::BetaInitializer>(layer_normalization_props).get();
  auto weight_decay = std::get<props::WeightDecay>(layer_normalization_props);
  auto bias_decay = std::get<props::BiasDecay>(layer_normalization_props);

  auto const &input_dim = context.getInputDimensions()[0];
  context.setOutputDimensions({input_dim});

  std::vector<props::Axis> axes_prop =
    std::get<std::vector<props::Axis>>(layer_normalization_props);

  NNTR_THROW_IF(axes_prop.empty(), std::invalid_argument)
    << "[Layer normalization] axis property is empty";

  normalize_axes.insert(normalize_axes.end(), axes_prop.begin(),
                        axes_prop.end());
  std::sort(normalize_axes.begin(), normalize_axes.end());
  normalize_axes.erase(
    std::unique(normalize_axes.begin(), normalize_axes.end()),
    normalize_axes.end());

  TensorDim normalize_dim(context.getFormat(), context.getWeightDataType());
  for (unsigned int axis : normalize_axes) {
    normalize_dim.setTensorDim(axis, input_dim.getTensorDim(axis));
  }

  wt_idx[LNParams::gamma] = context.requestWeight(
    normalize_dim, gamma_initializer, WeightRegularizer::NONE, 1.0f,
    weight_decay, "gamma", true);
  wt_idx[LNParams::beta] = context.requestWeight(
    normalize_dim, beta_initializer, WeightRegularizer::NONE, 1.0f, bias_decay,
    "beta", true);

  TensorDim remain_dim(context.getFormat(), context.getWeightDataType());
  std::vector<unsigned int> total_axes;
  total_axes.resize(ml::train::TensorDim::MAXDIM);
  std::iota(total_axes.begin(), total_axes.end(), 0u);
  std::set_difference(total_axes.begin(), total_axes.end(),
                      normalize_axes.begin(), normalize_axes.end(),
                      std::back_inserter(remain_axes));
  for (unsigned int axis : remain_axes) {
    remain_dim.setTensorDim(axis, input_dim.getTensorDim(axis));
  }

  /** caches the deviation -> input - avg(input) */
  wt_idx[LNParams::deviation] =
    context.requestTensor(input_dim, "deviation", Initializer::NONE, false,
                          TensorLifespan::ITERATION_LIFESPAN);
  /** caches the variance (epsilon is added in place during forwarding) */
  wt_idx[LNParams::variance] =
    context.requestTensor(remain_dim, "variance", Initializer::NONE, false,
                          TensorLifespan::ITERATION_LIFESPAN);
  /** caches the inverse standard deviation */
  wt_idx[LNParams::inv_std_dev] =
    context.requestTensor(remain_dim, "inv_std_dev", Initializer::NONE, false,
                          TensorLifespan::ITERATION_LIFESPAN);

  /** temporary tensor (origin size) */
  wt_idx[LNParams::temp_origin_size] =
    context.requestTensor(input_dim, "temp_origin_size", Initializer::NONE,
                          false, TensorLifespan::CALC_DERIV_LIFESPAN);
  /** temporary tensor (normalized size) */
  wt_idx[LNParams::temp_normalized_size] =
    context.requestTensor(remain_dim, "temp_normalized_size", Initializer::NONE,
                          false, TensorLifespan::CALC_DERIV_LIFESPAN);
}

void LayerNormalizationLayer::setProperty(
  const std::vector<std::string> &values) {
  auto remain_props = loadProperties(values, layer_normalization_props);
  NNTR_THROW_IF(!remain_props.empty(), std::invalid_argument)
    << "[Layer Normalization Layer] Unknown Layer Properties count " +
         std::to_string(values.size());
}

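/**
 * @brief Forward pass of layer normalization
 *
 * Over the normalized axes this computes
 *   mu  = E[x]
 *   dev = x - mu
 *   var = E[dev^2] + epsilon   (stored in `variance`)
 *   y   = gamma * dev / sqrt(var) + beta
 * The output tensor and inv_std_dev double as scratch buffers
 * (temp_full_size / temp_norm_size) before their final values are written.
 */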
void LayerNormalizationLayer::forwarding(RunLayerContext &context,
                                         bool training) {
  const float epsilon =
    std::get<props::Epsilon>(layer_normalization_props).get();

  const Tensor &input = context.getInput(SINGLE_INOUT_IDX);
  Tensor &output = context.getOutput(SINGLE_INOUT_IDX);

  Tensor &gamma = context.getWeight(wt_idx[LNParams::gamma]);
  Tensor &beta = context.getWeight(wt_idx[LNParams::beta]);

  Tensor &deviation = context.getTensor(wt_idx[LNParams::deviation]);
  Tensor &variance = context.getTensor(wt_idx[LNParams::variance]);
  Tensor &inv_std_dev = context.getTensor(wt_idx[LNParams::inv_std_dev]);

  Tensor &temp_full_size = output;
  Tensor &temp_norm_size = inv_std_dev;

  input.average(normalize_axes, temp_norm_size);
  input.subtract(temp_norm_size, deviation);

  deviation.pow(2.0, temp_full_size);
  temp_full_size.average(normalize_axes, variance);

  variance.add_i(epsilon);
  variance.pow(-0.5, inv_std_dev);

  deviation.multiply(inv_std_dev, output);
  output.multiply_i(gamma);
  output.add_i(beta);
}

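/**
 * @brief Incremental forward pass over the [from, to) height range
 *
 * Applies the same normalization as forwarding(). The step dimensions
 * describe the [from, to) slice of each tensor, depending on whether the
 * height axis is among the normalized axes. When FP16 is enabled, the sum of
 * squared deviations is accumulated in float before taking the inverse
 * square root.
 */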
void LayerNormalizationLayer::incremental_forwarding(RunLayerContext &context,
                                                      unsigned int from,
                                                      unsigned int to,
                                                      bool training) {
  const float epsilon =
    std::get<props::Epsilon>(layer_normalization_props).get();

  const Tensor &input = context.getInput(SINGLE_INOUT_IDX);
  Tensor &output = context.getOutput(SINGLE_INOUT_IDX);

  Tensor &gamma = context.getWeight(wt_idx[LNParams::gamma]);
  Tensor &beta = context.getWeight(wt_idx[LNParams::beta]);

  Tensor &deviation = context.getTensor(wt_idx[LNParams::deviation]);
  Tensor &variance = context.getTensor(wt_idx[LNParams::variance]);
  Tensor &inv_std_dev = context.getTensor(wt_idx[LNParams::inv_std_dev]);

  // @todo: consider NHWC format
  bool is_height_normalize =
    std::find(normalize_axes.begin(), normalize_axes.end(), 1) !=
    normalize_axes.end();

  TensorDim input_dim = input.getDim();
  TensorDim output_dim = output.getDim();
  TensorDim normalize_dim = gamma.getDim();
  TensorDim remain_dim = variance.getDim();

  TensorDim input_step_dim = input_dim;
  TensorDim output_step_dim = output_dim;
  TensorDim normalize_step_dim = normalize_dim;
  TensorDim remain_step_dim = remain_dim;

  input_step_dim.height(to - from);
  output_step_dim.height(to - from);
  normalize_step_dim.height(is_height_normalize ? to - from : 1);
  remain_step_dim.height(is_height_normalize ? 1 : to - from);

  Tensor &temp_full_size = output;
  Tensor &temp_norm_size = inv_std_dev;

  input.average(normalize_axes, temp_norm_size);
  input.subtract(temp_norm_size, deviation);

#ifndef ENABLE_FP16
  deviation.pow(2.0f, temp_full_size);
  temp_full_size.average(normalize_axes, variance);

  variance.add_i(epsilon);
  variance.pow(-0.5f, inv_std_dev);
#else
  unsigned int axis_dim = deviation.getDim()[normalize_axes[0]];
  for (unsigned int i = 0; i < deviation.getDim()[normalize_axes[0] - 1]; ++i) {
    float sum = 0.0f;

    _FP16 *data = deviation.getAddress<_FP16>(0, 0, i, 0);

    for (unsigned int j = 0; j < axis_dim; ++j) {
      sum += powf(static_cast<float>(data[j]), 2.0f);
    }
    /* inv_std_dev = 1 / sqrt(variance + epsilon), matching the float path */
    inv_std_dev.setValue(0, 0, i, 0, 1.0 / sqrt(sum / axis_dim + epsilon));
  }
#endif

  deviation.multiply(inv_std_dev, output);
  output.multiply_i(gamma);
  output.add_i(beta);
}

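/**
 * @brief Backward pass: derivative w.r.t. the input (and gamma)
 *
 * With x_hat = dev * inv_std_dev and incoming derivative dy, this computes
 *   d_gamma = sum over the remaining axes of (dy * x_hat)
 *   dx      = gamma * inv_std_dev *
 *             (dy - E[dy] - dev * E[dy * dev] / (var + epsilon))
 * where the expectations are taken over the normalized axes. Note that
 * deviation and inv_std_dev are modified in place while computing dx.
 */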
void LayerNormalizationLayer::calcDerivative(RunLayerContext &context) {
  const bool trainable = context.getTrainable();

  TensorDim::TensorType weight_tensor_type =
    context.getWeight(wt_idx[LNParams::gamma]).getTensorType();

  Tensor empty =
    Tensor("empty", weight_tensor_type.format, weight_tensor_type.data_type);

  Tensor &outgoing_derivative = context.getOutgoingDerivative(SINGLE_INOUT_IDX);
  const Tensor &incoming_derivative =
    context.getIncomingDerivative(SINGLE_INOUT_IDX);

  const Tensor &gamma = context.getWeight(wt_idx[LNParams::gamma]);
  Tensor &d_gamma =
    trainable ? context.getWeightGrad(wt_idx[LNParams::gamma]) : empty;

  Tensor &deviation = context.getTensor(wt_idx[LNParams::deviation]);
  Tensor &variance = context.getTensor(wt_idx[LNParams::variance]);
  Tensor &inv_std_dev = context.getTensor(wt_idx[LNParams::inv_std_dev]);

  Tensor &temp_origin_size =
    context.getTensor(wt_idx[LNParams::temp_origin_size]);
  Tensor &temp_normalized_size =
    context.getTensor(wt_idx[LNParams::temp_normalized_size]);

  incoming_derivative.multiply(deviation, temp_origin_size);
  temp_origin_size.average(normalize_axes, temp_normalized_size);
  temp_normalized_size.divide_i(variance);
  deviation.multiply_i(temp_normalized_size);

  if (trainable) {
    /** calculate d_gamma */
    temp_origin_size.multiply_i(inv_std_dev);
    temp_origin_size.sum(remain_axes, d_gamma);
  }
  incoming_derivative.average(normalize_axes, temp_normalized_size);
  incoming_derivative.subtract(temp_normalized_size, outgoing_derivative);
  outgoing_derivative.subtract_i(deviation);

  inv_std_dev.multiply_i(gamma);
  outgoing_derivative.multiply_i(inv_std_dev);
}

void LayerNormalizationLayer::calcGradient(RunLayerContext &context) {
  /** d_gamma is calculated in calcDerivative. d_beta is calculated here */
  const Tensor &incoming_derivative =
    context.getIncomingDerivative(SINGLE_INOUT_IDX);
  Tensor &d_beta = context.getWeightGrad(wt_idx[LNParams::beta]);

  incoming_derivative.sum(remain_axes, d_beta);
}

void LayerNormalizationLayer::exportTo(
  Exporter &exporter, const ml::train::ExportMethods &method) const {
  exporter.saveResult(layer_normalization_props, method, this);
}

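/**
 * @brief Propagate a batch size change to the per-iteration tensors
 *
 * The cached statistics and scratch tensors were requested with the original
 * batch size, so they are resized whenever the effective batch size changes.
 */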
void LayerNormalizationLayer::setBatch(RunLayerContext &context,
                                       unsigned int batch) {
  context.updateTensor(wt_idx[LNParams::deviation], batch);
  context.updateTensor(wt_idx[LNParams::variance], batch);
  context.updateTensor(wt_idx[LNParams::inv_std_dev], batch);
  context.updateTensor(wt_idx[LNParams::temp_origin_size], batch);
  context.updateTensor(wt_idx[LNParams::temp_normalized_size], batch);
}

} /* namespace nntrainer */