/**
 * Copyright (C) 2020 Samsung Electronics Co., Ltd. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * @file   fc_layer.cpp
 * @date   14 May 2020
 * @brief  This is the Fully Connected Layer class for the neural network
 * @see    https://github.com/nnstreamer/nntrainer
 * @author Jijoong Moon <jijoong.moon@samsung.com>
 * @bug    No known bugs except for NYI items
 */

#include <common_properties.h>
#include <fc_layer.h>
#include <layer_context.h>
#include <lazy_tensor.h>
#include <nntrainer_error.h>
#include <nntrainer_log.h>
#include <node_exporter.h>
#include <util_func.h>

#include <iostream>

namespace nntrainer {

static constexpr size_t SINGLE_INOUT_IDX = 0;

enum FCParams { weight, bias };
enum LORAParams { loraA, loraB, loraTmp, loraOut };

FullyConnectedLayer::FullyConnectedLayer() :
  LayerImpl(),
  lora_scaling(1.0f),
  fc_props(props::Unit(), props::LoraRank(), props::LoraAlpha()),
  quantizer(nullptr) {
  weight_idx.fill(std::numeric_limits<unsigned>::max());
  lora_idx.fill(std::numeric_limits<unsigned>::max());
}

void FullyConnectedLayer::finalize(InitLayerContext &context) {
  auto &weight_regularizer =
    std::get<props::WeightRegularizer>(*layer_impl_props);
  auto &weight_regularizer_constant =
    std::get<props::WeightRegularizerConstant>(*layer_impl_props);
  auto &weight_initializer =
    std::get<props::WeightInitializer>(*layer_impl_props);
  auto &weight_decay = std::get<props::WeightDecay>(*layer_impl_props);
  auto &bias_decay = std::get<props::BiasDecay>(*layer_impl_props);
  auto &bias_initializer = std::get<props::BiasInitializer>(*layer_impl_props);
  auto &disable_bias = std::get<props::DisableBias>(*layer_impl_props);

  const auto &unit = std::get<props::Unit>(fc_props).get();
  const auto &lora_rank = (std::get<props::LoraRank>(fc_props).empty())
                            ? 0
                            : std::get<props::LoraRank>(fc_props).get();
  lora_scaling = (lora_rank && !std::get<props::LoraAlpha>(fc_props).empty())
                   ? (float)std::get<props::LoraAlpha>(fc_props) / lora_rank
                   : 1;

  NNTR_THROW_IF(context.getNumInputs() != 1, std::invalid_argument)
    << "Fully connected layer takes only one input";

  std::vector<TensorDim> output_dims(1);

  /// @todo fc actually supports multi-dimensional inputs; EffDimFlag shouldn't
  /// be fixed like this.
  context.setEffDimFlagInputDimension(0, 0b1001);
  context.setDynDimFlagInputDimension(0, 0b1000);

  bool is_nchw = (context.getFormat() == Tformat::NCHW);
  /** set output dimensions */
  auto const &in_dim = context.getInputDimensions()[0];
  output_dims[0] = in_dim;
  is_nchw ? output_dims[0].width(unit) : output_dims[0].channel(unit);

  output_dims[0].setTensorType(
    {context.getFormat(), context.getActivationDataType()});

  context.setOutputDimensions(output_dims);

  /** set weight specifications */
  // @todo This NCHW format setting is just temporary; it needs to be set by
  // the global configuration.

  /** Bias Dimension : (1, 1, 1, unit) */
  /// @note The bias is added directly to the activation. Since we have no
  /// dequantizer for the add operation, its data type has to be the same as
  /// the activation's. This should be updated once a dequantizer is supported.
  TensorDim bias_dim(
    1, is_nchw ? 1 : unit, 1, is_nchw ? unit : 1,
    TensorDim::TensorType(context.getFormat(), context.getActivationDataType()),
    is_nchw ? 0b0001 : 0b0100);

  /** Weight Dimension : (1, 1, in_dim.width(), unit) */
  TensorDim weight_dim(
    1, is_nchw ? 1 : unit, is_nchw ? in_dim.width() : 1,
    is_nchw ? unit : in_dim.channel(),
    TensorDim::TensorType(context.getFormat(), context.getWeightDataType()),
    is_nchw ? 0b0011 : 0b0101);

  weight_idx[FCParams::weight] = context.requestWeight(
    weight_dim, weight_initializer, weight_regularizer,
    weight_regularizer_constant, weight_decay, "weight", true);

  if (disable_bias.empty() || disable_bias.get() == false) {
    weight_idx[FCParams::bias] =
      context.requestWeight(bias_dim, bias_initializer, WeightRegularizer::NONE,
                            1.0f, bias_decay, "bias", true);
  }

  /** create weights for LoRA */
  if (lora_rank) {

    /** loraA Dimension : (1, 1, in_dim.width(), lora_rank) */
    TensorDim loraA_dim(
      1, is_nchw ? 1 : lora_rank, is_nchw ? in_dim.width() : 1,
      is_nchw ? lora_rank : in_dim.channel(),
      TensorDim::TensorType(context.getFormat(), context.getWeightDataType()),
      is_nchw ? 0b0011 : 0b0101);

    /** loraB Dimension : (1, 1, lora_rank, unit) */
    TensorDim loraB_dim(
      1, is_nchw ? 1 : unit, is_nchw ? lora_rank : 1,
      is_nchw ? unit : lora_rank,
      TensorDim::TensorType(context.getFormat(), context.getWeightDataType()),
      is_nchw ? 0b0011 : 0b0101);

    /** loraTmp Dimension : (B, 1, in_dim.height(), lora_rank) */
    TensorDim loraTmp_dim(
      in_dim.batch(), is_nchw ? 1 : lora_rank, is_nchw ? in_dim.height() : 1,
      is_nchw ? lora_rank : in_dim.width(),
      TensorDim::TensorType(context.getFormat(),
                            context.getActivationDataType()),
      is_nchw ? 0b1011 : 0b1101);

    /** loraOut Dimension : (B, 1, in_dim.height(), unit) */
    TensorDim loraOut_dim(
      in_dim.batch(), is_nchw ? 1 : unit, is_nchw ? in_dim.height() : 1,
      is_nchw ? unit : in_dim.width(),
      TensorDim::TensorType(context.getFormat(),
                            context.getActivationDataType()),
      is_nchw ? 0b1011 : 0b1101);

    lora_idx[LORAParams::loraA] = context.requestWeight(
      loraA_dim, Initializer::ZEROS, weight_regularizer,
      weight_regularizer_constant, weight_decay, "loraA", true);

    lora_idx[LORAParams::loraB] = context.requestWeight(
      loraB_dim, Initializer::LECUN_NORMAL, weight_regularizer,
      weight_regularizer_constant, weight_decay, "loraB", true);

    lora_idx[LORAParams::loraTmp] =
      context.requestTensor(loraTmp_dim, "hidden_tmp_lora", Initializer::NONE,
                            true, TensorLifespan::FORWARD_GRAD_LIFESPAN);

    lora_idx[LORAParams::loraOut] =
      context.requestTensor(loraOut_dim, "hidden_lora", Initializer::NONE, true,
                            TensorLifespan::FORWARD_FUNC_LIFESPAN);
  }

  /// @todo This quantizer should be moved to the tensor, not the layer!
  switch (context.getWeightDataType()) {
  case ml::train::TensorDim::DataType::QINT4:
  case ml::train::TensorDim::DataType::QINT8:
  case ml::train::TensorDim::DataType::QINT16:
    quantizer =
      Quantization::createQuantizer(nntrainer::QScheme::PER_TENSOR_AFFINE);
    break;
  default:
    quantizer = nullptr;
    break;
  }
}
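
/**
 * @note A quick sketch of the tensors requested in finalize() above, assuming
 * the NCHW branch and writing B = in_dim.batch(), H = in_dim.height(),
 * I = in_dim.width(), U = unit and r = lora_rank (the NHWC branch swaps the
 * channel and width axes):
 *   weight  : (1, 1, I, U)     bias    : (1, 1, 1, U)
 *   loraA   : (1, 1, I, r)     loraB   : (1, 1, r, U)
 *   loraTmp : (B, 1, H, r)     loraOut : (B, 1, H, U)
 * With lora_scaling = lora_alpha / lora_rank, the layer effectively applies
 * W + lora_scaling * (A . B) as its weight; forwarding() below realizes this
 * without ever materializing the product A . B.
 */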

void FullyConnectedLayer::exportTo(
  Exporter &exporter, const ml::train::ExportMethods &method) const {
  LayerImpl::exportTo(exporter, method);
  exporter.saveResult(fc_props, method, this);
}

void FullyConnectedLayer::setProperty(const std::vector<std::string> &values) {
  auto remain_props = loadProperties(values, fc_props);
  LayerImpl::setProperty(remain_props);
}

void FullyConnectedLayer::setBatch(nntrainer::RunLayerContext &context,
                                   unsigned int batch) {
  if (!std::get<props::LoraRank>(fc_props).empty()) {
    // update Lora Tensor's batch info.
    context.updateTensor(lora_idx[LORAParams::loraTmp], batch);
    context.updateTensor(lora_idx[LORAParams::loraOut], batch);
  }
}

void FullyConnectedLayer::forwarding(RunLayerContext &context, bool training) {
  Tensor &weight = context.getWeight(weight_idx[FCParams::weight]);
  Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX);
  Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);

  /// @todo This dequantization action should be moved to tensor.dot()
  if (quantizer != nullptr) {
    Tensor weight_ = quantizer->dequantize(weight, input_.getDataType());
    input_.dot(weight_, hidden_, false, false);
  } else {
    input_.dot(weight, hidden_, false, false);
  }

  if (!std::get<props::LoraRank>(fc_props).empty()) {
    Tensor &loraA = context.getWeight(lora_idx[LORAParams::loraA]);
    Tensor &loraB = context.getWeight(lora_idx[LORAParams::loraB]);
    Tensor &hidden_tmp_lora = context.getTensor(lora_idx[LORAParams::loraTmp]);
    Tensor &hidden_out_lora = context.getTensor(lora_idx[LORAParams::loraOut]);

    input_.dot(loraA, hidden_tmp_lora, false, false);
    hidden_tmp_lora.dot(loraB, hidden_out_lora, false, false);
    hidden_out_lora.multiply_i(lora_scaling);
    hidden_.add_i(hidden_out_lora);
  }

  if (auto &disable_bias = std::get<props::DisableBias>(*layer_impl_props);
      disable_bias.empty() || disable_bias.get() == false) {
    Tensor &bias = context.getWeight(weight_idx[FCParams::bias]);
    hidden_.add_i(bias);
  }
}
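
/**
 * @note In effect, forwarding() computes (a sketch; the quantized-weight path
 * only differs in that W is dequantized first):
 *   hidden  = input . W
 *   hidden += lora_scaling * ((input . A) . B)  // only when lora_rank is set
 *   hidden += bias                              // unless bias is disabled
 * where "." denotes Tensor::dot with no transpose on either operand.
 */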

void FullyConnectedLayer::incremental_forwarding(RunLayerContext &context,
                                                 unsigned int from,
                                                 unsigned int to,
                                                 bool training) {
  Tensor &weight = context.getWeight(weight_idx[FCParams::weight]);
  Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
  Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX);
  Tensor loraA, loraB, hidden_tmp_lora, hidden_out_lora;

  if (!std::get<props::LoraRank>(fc_props).empty()) {
    loraA = context.getWeight(lora_idx[LORAParams::loraA]);
    loraB = context.getWeight(lora_idx[LORAParams::loraB]);
    hidden_tmp_lora = context.getTensor(lora_idx[LORAParams::loraTmp]);
    hidden_out_lora = context.getTensor(lora_idx[LORAParams::loraOut]);
  }

  TensorDim input_dim = input_.getDim();
  TensorDim hidden_dim = hidden_.getDim();

  TensorDim input_step_dim = input_dim;
  TensorDim hidden_step_dim = hidden_dim;

  input_step_dim.batch(1);
  input_step_dim.height(to - from);
  hidden_step_dim.batch(1);
  hidden_step_dim.height(to - from);

  // @todo make it parallelized with batch axis
  for (unsigned int b = 0; b < hidden_.batch(); ++b) {
    Tensor input_step = input_.getSharedDataTensor(
      input_step_dim, b * input_dim.getFeatureLen(), true);
    Tensor hidden_step = hidden_.getSharedDataTensor(
      hidden_step_dim, b * hidden_dim.getFeatureLen(), true);

    input_step.dot(weight, hidden_step, false, false);

    if (!std::get<props::LoraRank>(fc_props).empty()) {
      nntrainer::TensorDim hidden_tmp_lora_step_dim = hidden_tmp_lora.getDim();
      hidden_tmp_lora_step_dim.batch(1);
      hidden_tmp_lora_step_dim.height(to - from);
      nntrainer::TensorDim hidden_out_lora_step_dim = hidden_out_lora.getDim();
      hidden_out_lora_step_dim.batch(1);
      hidden_out_lora_step_dim.height(to - from);

      nntrainer::Tensor hidden_tmp_lora_step =
        hidden_tmp_lora.getSharedDataTensor(
          hidden_tmp_lora_step_dim,
          b * hidden_tmp_lora.height() * hidden_tmp_lora.width(), true);
      nntrainer::Tensor hidden_out_lora_step =
        hidden_out_lora.getSharedDataTensor(
          hidden_out_lora_step_dim,
          b * hidden_out_lora.height() * hidden_out_lora.width(), true);

      input_step.dot(loraA, hidden_tmp_lora_step, false, false);
      hidden_tmp_lora_step.dot(loraB, hidden_out_lora_step, false, false);
      hidden_out_lora_step.multiply_i(lora_scaling);
      hidden_step.add_i(hidden_out_lora_step);
    }

    if (auto &disable_bias = std::get<props::DisableBias>(*layer_impl_props);
        disable_bias.empty() || disable_bias.get() == false) {
      Tensor &bias = context.getWeight(weight_idx[FCParams::bias]);
      hidden_step.add_i(bias);
    }
  }
}
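
/**
 * @note incremental_forwarding() repeats the forwarding() computation on a
 * (to - from)-row slice of each batch sample. The slices obtained through
 * getSharedDataTensor() are views over the original buffers rather than
 * copies, so only the requested rows are touched.
 */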

void FullyConnectedLayer::calcDerivative(RunLayerContext &context) {
  Tensor &weight = context.getWeight(weight_idx[FCParams::weight]);

  const Tensor &derivative_ = context.getIncomingDerivative(SINGLE_INOUT_IDX);
  Tensor &ret_ = context.getOutgoingDerivative(SINGLE_INOUT_IDX);

  if (!std::get<props::LoraRank>(fc_props).empty()) {
    Tensor &lora_A = context.getWeight(lora_idx[LORAParams::loraA]);
    Tensor &lora_B = context.getWeight(lora_idx[LORAParams::loraB]);
    ret_.dot_deriv_wrt_1(weight.add(lora_A.dot(lora_B).multiply(lora_scaling)),
                         derivative_, false, false);
  } else {
    ret_.dot_deriv_wrt_1(weight, derivative_, false, false);
  }
}
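
/**
 * @note Conceptually, calcDerivative() computes the gradient w.r.t. the input
 * as dL/dx = dL/dy . W_eff^T, where W_eff = W without LoRA and
 * W_eff = W + lora_scaling * (A . B) with LoRA enabled; dot_deriv_wrt_1()
 * handles the required transposes.
 */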

void FullyConnectedLayer::calcGradient(RunLayerContext &context) {

  /** (default) calcGradient - compute gradient of weight and bias */
  if (std::get<props::LoraRank>(fc_props).empty()) {
    Tensor &djdw = context.getWeightGrad(weight_idx[FCParams::weight]);
    djdw.setZero();

    const Tensor &derivative_ = context.getIncomingDerivative(SINGLE_INOUT_IDX);
    Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);

    if (auto &disable_bias = std::get<props::DisableBias>(*layer_impl_props);
        disable_bias.empty() || disable_bias.get() == false) {
      Tensor &djdb = context.getWeightGrad(weight_idx[FCParams::bias]);
      djdb.setZero();

      if (context.isGradientFirstAccess(weight_idx[FCParams::bias])) {
        derivative_.sum({0, 1, 2}, djdb);
      } else {
        /// @todo optimize below by adding beta to Tensor::sum
        Tensor t = derivative_.sum({0, 1, 2});
        djdb.add_i(t);
      }
    }

    input_.dot_deriv_wrt_2(
      djdw, derivative_, false, false,
      !context.isGradientFirstAccess(weight_idx[FCParams::weight]));
  } else {
    /** (lora) calcGradient - compute gradients of LoRA params only */
    Tensor &djdla = context.getWeightGrad(lora_idx[LORAParams::loraA]);
    Tensor &djdlb = context.getWeightGrad(lora_idx[LORAParams::loraB]);
    Tensor &djdtmp = context.getTensorGrad(lora_idx[LORAParams::loraTmp]);

    const Tensor &derivative_ = context.getIncomingDerivative(SINGLE_INOUT_IDX);
    Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
    Tensor &loraA = context.getWeight(lora_idx[LORAParams::loraA]);
    Tensor &loraB = context.getWeight(lora_idx[LORAParams::loraB]);
    Tensor &loraTmp = context.getTensor(lora_idx[LORAParams::loraTmp]);
    const auto &lora_derivative_ = derivative_.multiply(lora_scaling);

    loraTmp.dot_deriv_wrt_2(
      djdlb, lora_derivative_, false, false,
      !context.isGradientFirstAccess(lora_idx[LORAParams::loraB]));
    djdtmp.dot_deriv_wrt_1(
      loraB, lora_derivative_, false, false,
      !context.isGradientFirstAccess(lora_idx[LORAParams::loraTmp]));
    input_.dot_deriv_wrt_2(
      djdla, djdtmp, false, false,
      !context.isGradientFirstAccess(lora_idx[LORAParams::loraA]));
  }
}
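
/**
 * @note A sketch of the LoRA branch above, writing s = lora_scaling,
 * t = input . A (loraTmp) and the LoRA output as s * (t . B):
 *   dL/dB = t^T . (s * dL/dy)
 *   dL/dt = (s * dL/dy) . B^T
 *   dL/dA = input^T . dL/dt
 * The base weight and bias gradients are not computed in this branch, i.e.
 * LoRA fine-tuning updates only A and B.
 */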

} /* namespace nntrainer */