Line data Source code
1 : // SPDX-License-Identifier: Apache-2.0
2 : /**
3 : * @file float_tensor.cpp
4 : * @date 01 December 2023
5 : * @brief This is FloatTensor class for 32-bit floating point calculation
6 : * @see https://github.com/nntrainer/nntrainer
7 : * @author Jijoong Moon <jijoong.moon@samsung.com>
8 : * @author Donghyeon Jeong <dhyeon.jeong@samsung.com>
9 : * @bug No known bugs except for NYI items
10 : */
11 :
12 : #include <iomanip>
13 : #include <iostream>
14 : #include <numeric>
15 :
16 : #include <chrono>
17 : #include <cpu_backend.h>
18 : #include <float_tensor.h>
19 : #include <int4_tensor.h>
20 : #include <q4_0_utils.h>
21 :
22 : #include <tensor.h>
23 : #include <util_func.h>
24 :
25 : #ifdef ENABLE_OPENCL
26 : #include "blas_kernels.h"
27 : #endif
28 :
29 : namespace nntrainer {
30 :
31 282558 : FloatTensor::FloatTensor(std::string name_, Tformat fm) :
32 565116 : TensorBase(name_, fm, Tdatatype::FP32) {}
33 :
34 375564 : FloatTensor::FloatTensor(const TensorDim &d, bool alloc_now, Initializer init,
35 375564 : std::string name) :
36 375564 : TensorBase(d, alloc_now, init, name) {
37 375564 : if (alloc_now)
38 341393 : allocate();
39 375564 : }
40 :
41 335982 : FloatTensor::FloatTensor(const TensorDim &d, const void *buf) :
42 335982 : FloatTensor(d, true) {
43 335982 : if (d.getDataLen() != 0) {
44 335982 : if (buf != nullptr)
45 13863 : copy(buf);
46 : }
47 335982 : }
48 :
49 10794 : bool FloatTensor::operator==(const FloatTensor &rhs) const {
50 10794 : const float *_data = (float *)getData();
51 10794 : const float *_rdata = (float *)rhs.getData();
52 1403019 : for (size_t i = 0; i < size(); ++i) {
53 : /** not checking sign change is intentional to avoid float calculation
54 : * errors around 0 */
55 1392265 : if (std::isnan(_data[i]) || std::isnan(_rdata[i]) ||
56 1392264 : std::fabs(_data[i] - _rdata[i]) > epsilon)
57 : return false;
58 : }
59 :
60 : return true;
61 : }
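// Editor's note on operator== above (illustrative, not part of the original
// source): equality is element-wise within an absolute tolerance `epsilon`,
// and a NaN on either side makes the tensors compare unequal. For example,
// 1.0f vs. 1.0f + 0.5f * epsilon compares equal, while 1.0f vs.
// 1.0f + 2.0f * epsilon does not.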
62 :
63 : /// @todo support allocation by src_tensor
64 643260 : void FloatTensor::allocate() {
65 643260 : if (empty() || data)
66 : return;
67 :
68 643250 : if (src_tensor) {
69 : /// allocate data based on the source tensor
70 301850 : allocateSrcTensor();
71 : /** as this memory is shared, do NOT initialize */
72 : } else {
73 : /// allocate new memory for the tensor data
74 : MemoryData *mem_data;
75 :
76 4522423560 : mem_data = new MemoryData((void *)(new float[dim.getDataLen()]{}));
77 341400 : data = std::shared_ptr<MemoryData>(mem_data, [](auto *mem_data) {
78 341400 : delete[] mem_data->template getAddr<float>();
79 341400 : delete mem_data;
80 : });
81 :
82 341400 : offset = 0;
83 341400 : initialize();
84 : }
85 : }
86 :
87 906 : void FloatTensor::deallocate() {
88 : data = nullptr;
89 906 : offset = 0;
90 906 : }
91 :
92 136172133 : void *FloatTensor::getData() const {
93 136172133 : if (!data)
94 : return nullptr;
95 :
96 : data->validate();
97 136170514 : return data->getAddr<float>() + offset;
98 : }
99 :
100 0 : void *FloatTensor::getData(size_t idx) const {
101 0 : if (!data)
102 : return nullptr;
103 :
104 : data->validate();
105 0 : return data->getAddr<float>() + offset + idx;
106 : }
107 :
108 5427307 : void *FloatTensor::getAddress(unsigned int i) {
109 5427307 : size_t index = getIndex(batch(), channel(), height(), width());
110 5427307 : if (i > index) {
111 : return nullptr;
112 : }
113 5427307 : return &((float *)getData())[i];
114 : }
115 :
116 148414 : const void *FloatTensor::getAddress(unsigned int i) const {
117 148414 : size_t index = getIndex(batch(), channel(), height(), width());
118 148414 : if (i > index) {
119 : return nullptr;
120 : }
121 148414 : return &((float *)getData())[i];
122 : }
123 :
124 4584 : const float &FloatTensor::getValue(unsigned int i) const {
125 4584 : return ((float *)getData())[i];
126 : }
127 :
128 1291 : float &FloatTensor::getValue(unsigned int i) { return ((float *)getData())[i]; }
129 :
130 4584 : const float &FloatTensor::getValue(unsigned int b, unsigned int c,
131 : unsigned int h, unsigned int w) const {
132 4584 : return getValue(getIndex(b, c, h, w));
133 : }
134 :
135 1260 : float &FloatTensor::getValue(unsigned int b, unsigned int c, unsigned int h,
136 : unsigned int w) {
137 1260 : return getValue(getIndex(b, c, h, w));
138 : }
139 :
140 81317 : void FloatTensor::setValue(float value) {
141 81317 : float *data = (float *)getData();
142 81317 : std::fill(data, data + size(), value);
143 81317 : }
144 :
145 35425636 : void FloatTensor::setValue(unsigned int b, unsigned int c, unsigned int h,
146 : unsigned int w, float value) {
147 35425636 : ((float *)getData())[getIndex(b, c, h, w)] = value;
148 35425636 : }
149 :
150 10913 : void FloatTensor::addValue(unsigned int b, unsigned int c, unsigned int h,
151 : unsigned int w, float value, float beta) {
152 10913 : auto const &idx = getIndex(b, c, h, w);
153 10913 : ((float *)getData())[idx] *= beta;
154 10913 : ((float *)getData())[idx] += value;
155 10913 : }
156 :
157 42378 : void FloatTensor::setZero() {
158 42378 : if (contiguous) {
159 : // sscal(size(), 0, getData<float>(), 1);
160 : /// @note we cannot use sscal to set zero: if the data contains inf or
161 : /// NaN, the inf or NaN would still remain after scaling by zero.
162 42378 : memset((float *)getData(), 0, sizeof(float) * size());
163 : } else {
164 : /// @todo implement apply_i
165 : // apply_i<float>([](float val) -> float { return 0; });
166 0 : setValue(0);
167 : }
168 42378 : }
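// Editor's note on setZero() above (sketch, not part of the original source):
// scaling by zero cannot be used for clearing because it does not erase
// non-finite values under IEEE-754 arithmetic, e.g.
//   float x = std::numeric_limits<float>::infinity();
//   x *= 0.0f;   // x becomes NaN, not 0.0f
// whereas memset / std::fill unconditionally write the bit pattern of +0.0f.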
169 :
170 110 : void FloatTensor::setRandNormal(float mean, float stddev) {
171 110 : setDist<std::normal_distribution<float>>(
172 : std::normal_distribution<float>(mean, stddev));
173 110 : }
174 :
175 18101 : void FloatTensor::setRandUniform(float min, float max) {
176 18101 : setDist<std::uniform_real_distribution<float>>(
177 : std::uniform_real_distribution<float>(min, max));
178 18101 : }
179 :
180 3 : void FloatTensor::setRandBernoulli(float probability) {
181 3 : setDist<std::bernoulli_distribution>(
182 : std::bernoulli_distribution(probability));
183 3 : }
184 :
185 360589 : void FloatTensor::initialize() {
186 360589 : if (empty() || !isAllocated())
187 : return;
188 :
189 : unsigned int fan_in, fan_out;
190 :
191 : /// @fixme: when unit is equal to one, this does not work; we need to rely on
192 : /// the effective dimension rather than the actual numbers here. For now, some
193 : /// heuristics are added to infer what fan_in/fan_out would be
194 360588 : if (dim.batch() * dim.channel() * dim.height() == 1) {
195 233471 : fan_out = fan_in = dim.width();
196 127117 : } else if (dim.batch() * dim.channel() == 1) { /// fc layer - 2-D tensor
197 12222 : fan_in = dim.height();
198 12222 : fan_out = dim.width();
199 : } else { /// conv2d filters - 4d tensor, @todo extend this to > 4
200 114895 : auto field_size = dim.height() * dim.width();
201 :
202 : // this also handles the cases below:
203 : // 1. fan_in = fan_out = 1
204 : // 2. batch == 1, channel == 1 and height == 1, i.e. a theoretical rank of 1
205 114895 : fan_in = dim.channel() * field_size;
206 114895 : fan_out = dim.batch() * field_size;
207 : }
208 :
209 360588 : switch (initializer) {
210 5666 : case Initializer::ZEROS:
211 5666 : setZero();
212 5666 : break;
213 242 : case Initializer::ONES:
214 242 : setValue(1.0f);
215 242 : break;
216 0 : case Initializer::LECUN_NORMAL:
217 0 : setRandNormal(0.0f, sqrtFloat(1.0f / fan_in));
218 0 : break;
219 0 : case Initializer::XAVIER_NORMAL:
220 0 : setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in + fan_out)));
221 0 : break;
222 2 : case Initializer::HE_NORMAL:
223 2 : setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in)));
224 2 : break;
225 0 : case Initializer::LECUN_UNIFORM:
226 0 : setRandUniform(-1.0f * sqrtFloat(1.0f / fan_in), sqrtFloat(1.0f / fan_in));
227 0 : break;
228 2294 : case Initializer::XAVIER_UNIFORM:
229 2294 : setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in + fan_out)),
230 2294 : sqrtFloat(6.0 / (fan_in + fan_out)));
231 2294 : break;
232 0 : case Initializer::HE_UNIFORM:
233 0 : setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in)),
234 0 : sqrtFloat(6.0 / (fan_in)));
235 : default:
236 : break;
237 : }
238 :
239 360588 : putData();
240 : }
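// Editor's note on initialize() above (worked example, not part of the
// original source): for a conv2d-style weight of dim {batch=8, channel=3,
// height=5, width=5} the heuristic yields
//   field_size = 5 * 5  = 25
//   fan_in     = 3 * 25 = 75   (channel * field_size)
//   fan_out    = 8 * 25 = 200  (batch * field_size)
// so XAVIER_UNIFORM samples from U(-sqrt(6/275), +sqrt(6/275)).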
241 :
242 7 : void FloatTensor::initialize(Initializer init) {
243 7 : initializer = init;
244 7 : initialize();
245 7 : }
246 :
247 70014 : Tensor &FloatTensor::apply(std::function<float(float)> f,
248 : Tensor &output) const {
249 83169 : CREATE_IF_EMPTY_DIMS(output, dim, nullptr);
250 :
251 70014 : if (contiguous && output.getContiguous()) {
252 64137 : const float *data = (float *)getData();
253 : float *rdata = output.getData<float>();
254 :
255 128274 : std::transform(data, data + size(), rdata, f);
256 5877 : } else if (strides[3] == 1 && output.getStrides()[3] == 1) {
257 : /** @todo optimize this with combining these loops where stride is 1 */
258 12768 : for (unsigned int b = 0; b < batch(); ++b) {
259 13782 : for (unsigned int c = 0; c < channel(); ++c) {
260 13800 : for (unsigned int h = 0; h < height(); ++h) {
261 6909 : float *out_data = output.getAddress<float>(b, c, h, 0);
262 6909 : const float *in_data = (float *)getAddress(getIndex(b, c, h, 0));
263 20727 : std::transform(in_data, in_data + width(), out_data, f);
264 : }
265 : }
266 : }
267 : } else {
268 0 : for (unsigned int b = 0; b < batch(); ++b) {
269 0 : for (unsigned int c = 0; c < channel(); ++c) {
270 0 : for (unsigned int h = 0; h < height(); ++h) {
271 0 : for (unsigned int w = 0; w < width(); ++w) {
272 0 : output.setValue(b, c, h, w, f(getValue(b, c, h, w)));
273 : }
274 : }
275 : }
276 : }
277 : }
278 :
279 70014 : return output;
280 : }
281 :
282 12608 : Tensor FloatTensor::multiply_strided(Tensor const &m, Tensor &output,
283 : const float beta) const {
284 12649 : CREATE_IF_EMPTY_DIMS(output, dim, nullptr);
285 :
286 12608 : if (size() != m.size() || size() != output.size())
287 : throw std::invalid_argument(
288 3 : "Strided multiplication does not support broadcasting");
289 :
290 12606 : NNTR_THROW_IF(getData() == nullptr, std::invalid_argument)
291 : << getName() << " is not allocated";
292 12605 : NNTR_THROW_IF(m.getData<float>() == nullptr, std::invalid_argument)
293 1 : << m.getName() << " is not allocated";
294 12604 : NNTR_THROW_IF(output.getData<float>() == nullptr, std::invalid_argument)
295 1 : << output.getName() << " is not allocated";
296 :
297 12602 : if (strides[3] != 1 || m.getStrides()[3] != 1 ||
298 37806 : output.getStrides()[3] != 1 || std::fpclassify(beta) != FP_ZERO) {
299 2834 : for (unsigned int b = 0; b < batch(); ++b) {
300 3048 : for (unsigned int c = 0; c < channel(); ++c) {
301 3054 : for (unsigned int h = 0; h < height(); ++h) {
302 5655 : for (unsigned int w = 0; w < width(); ++w) {
303 4125 : output.addValue(
304 4125 : b, c, h, w, getValue(b, c, h, w) * m.getValue<float>(b, c, h, w),
305 : beta);
306 : }
307 : }
308 : }
309 : }
310 : } else {
311 : /** @todo optimize by combining these loops where stride is 1 */
312 11292 : if (getFormat() == Tformat::NCHW) {
313 29292 : for (unsigned int b = 0; b < batch(); ++b) {
314 44925 : for (unsigned int c = 0; c < channel(); ++c) {
315 167899 : for (unsigned int h = 0; h < height(); ++h) {
316 140974 : float *out_data = output.getAddress<float>(b, c, h, 0);
317 140974 : const float *m_data = m.getAddress<float>(b, c, h, 0);
318 140974 : const float *in_data = (float *)getAddress(getIndex(b, c, h, 0));
319 140974 : std::transform(in_data, in_data + width(), m_data, out_data,
320 : std::multiplies<float>());
321 : }
322 : }
323 : }
324 : } else {
325 0 : for (unsigned int b = 0; b < batch(); ++b) {
326 0 : for (unsigned int h = 0; h < height(); ++h) {
327 0 : for (unsigned int w = 0; w < width(); ++w) {
328 0 : float *out_data = output.getAddress<float>(b, 0, h, w);
329 0 : const float *m_data = m.getAddress<float>(b, 0, h, w);
330 0 : const float *in_data = (float *)getAddress(getIndex(b, 0, h, w));
331 0 : std::transform(in_data, in_data + channel(), m_data, out_data,
332 : std::multiplies<float>());
333 : }
334 : }
335 : }
336 : }
337 : }
338 :
339 12602 : return output;
340 : }
341 :
342 2609 : int FloatTensor::multiply_i(float const &value) {
343 2609 : float *data = (float *)getData();
344 2609 : unsigned int len = size();
345 :
346 2609 : sscal(len, value, data, 1);
347 :
348 2609 : return ML_ERROR_NONE;
349 : }
350 :
351 7691 : Tensor &FloatTensor::multiply(float const &value, Tensor &out) const {
352 : auto f = std::bind(std::multiplies<float>(), std::placeholders::_1, value);
353 7691 : apply(f, out);
354 7691 : return out;
355 : }
356 :
357 17213 : Tensor &FloatTensor::multiply(Tensor const &m, Tensor &output,
358 : const float beta) const {
359 : auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
360 : float *out_buf) {
361 29863 : ele_mul(e.buffer_size, buf, m_buf, out_buf, 1, beta, e.strides[3],
362 29863 : strides[3]);
363 : };
364 :
365 17213 : NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument)
366 : << "Tensor Format of " << getName() << ":"
367 : << ((bool)(this->getFormat()) ? "NHWC" : "NCHW") << " is not match. ("
368 0 : << ((bool)(m.getFormat()) ? "NHWC" : "NCHW") << ")";
369 :
370 17213 : NNTR_THROW_IF(!contiguous || !m.getContiguous() || !output.getContiguous(),
371 : std::invalid_argument)
372 : << getName() << " is not contiguous, cannot multiply";
373 :
374 17213 : NNTR_THROW_IF(!contiguous || !m.getContiguous() || !output.getContiguous(),
375 : std::invalid_argument)
376 : << getName() << " is not contiguous, cannot multiply";
377 :
378 17213 : apply_broadcast(m, f, output);
379 17074 : return output;
380 : }
381 :
382 6273 : Tensor &FloatTensor::divide(float const &value, Tensor &output) const {
383 : auto f = std::bind(std::divides<float>(), std::placeholders::_1, value);
384 6273 : apply(f, output);
385 6273 : return output;
386 : }
387 :
388 198 : Tensor &FloatTensor::divide(Tensor const &m, Tensor &output) const {
389 : auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
390 : float *out_buf) {
391 333 : ele_div(e.buffer_size, buf, m_buf, out_buf, 1, 0, e.strides[3], strides[3]);
392 : };
393 :
394 198 : apply_broadcast(m, f, output);
395 190 : return output;
396 : }
397 :
398 211 : Tensor &FloatTensor::add_strided(Tensor const &input, Tensor &output,
399 : const float beta) const {
400 212 : NNTR_THROW_IF(getData() == nullptr, std::invalid_argument)
401 : << getName() << " is not allocated";
402 210 : NNTR_THROW_IF(input.getData<float>() == nullptr, std::invalid_argument)
403 0 : << input.getName() << " is not allocated";
404 211 : NNTR_THROW_IF(output.getData<float>() == nullptr, std::invalid_argument)
405 1 : << output.getName() << " is not allocated";
406 :
407 209 : if (strides[3] != 1 || input.getStrides()[3] != 1 ||
408 627 : output.getStrides()[3] != 1 || std::fpclassify(beta) != FP_ZERO) {
409 112 : for (unsigned int b = 0; b < batch(); ++b) {
410 174 : for (unsigned int c = 0; c < channel(); ++c) {
411 198 : for (unsigned int h = 0; h < height(); ++h) {
412 567 : for (unsigned int w = 0; w < width(); ++w) {
413 459 : output.setValue(b, c, h, w,
414 459 : getValue(b, c, h, w) +
415 459 : input.getValue<float>(b, c, h, w) * beta);
416 : }
417 : }
418 : }
419 : }
420 : } else {
421 : /** @todo optimize this with combining these loops where stride is 1 */
422 181 : if (this->getFormat() == Tformat::NCHW) {
423 616 : for (unsigned int b = 0; b < batch(); ++b) {
424 876 : for (unsigned int c = 0; c < channel(); ++c) {
425 972 : for (unsigned int h = 0; h < height(); ++h) {
426 531 : float *out_data = output.getAddress<float>(b, c, h, 0);
427 531 : const float *in_data = input.getAddress<float>(b, c, h, 0);
428 531 : const float *_data = (float *)getAddress(getIndex(b, c, h, 0));
429 531 : std::transform(_data, _data + width(), in_data, out_data,
430 : std::plus<float>());
431 : }
432 : }
433 : }
434 : } else {
435 0 : for (unsigned int b = 0; b < batch(); ++b) {
436 0 : for (unsigned int h = 0; h < height(); ++h) {
437 0 : for (unsigned int w = 0; w < width(); ++w) {
438 0 : float *out_data = output.getAddress<float>(b, 0, h, w);
439 0 : const float *in_data = input.getAddress<float>(b, 0, h, w);
440 0 : const float *_data = (float *)getAddress(getIndex(b, 0, h, w));
441 0 : std::transform(_data, _data + channel(), in_data, out_data,
442 : std::plus<float>());
443 : }
444 : }
445 : }
446 : }
447 : }
448 :
449 209 : return output;
450 : }
451 :
452 2698 : int FloatTensor::add_i_partial(unsigned int len, unsigned int addr_idx,
453 : Tensor &m, unsigned int incX, unsigned int incY,
454 : const Tensor alphas, unsigned int alpha_idx) {
455 2698 : saxpy(len, alphas.getValue<float>(alpha_idx), m.getData<float>(), incX,
456 2698 : (float *)getAddress(addr_idx), incY);
457 :
458 2698 : return ML_ERROR_NONE;
459 : }
460 :
461 7432 : Tensor &FloatTensor::add(float const &value, Tensor &output) const {
462 : auto f = std::bind(std::plus<float>(), std::placeholders::_1, value);
463 7432 : apply(f, output);
464 7432 : return output;
465 : }
466 :
467 59384 : Tensor &FloatTensor::add(Tensor const &m, Tensor &output,
468 : float const alpha) const {
469 : auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
470 : float *out_buf) {
471 112969 : ele_add(e.buffer_size, buf, m_buf, out_buf, alpha, 0, e.strides[3],
472 112969 : strides[3]);
473 : };
474 59384 : apply_broadcast(m, f, output);
475 59353 : return output;
476 : }
477 :
478 174 : Tensor &FloatTensor::subtract(float const &value, Tensor &output) const {
479 : auto f = std::bind(std::minus<float>(), std::placeholders::_1, value);
480 174 : apply(f, output);
481 174 : return output;
482 : }
483 :
484 453 : void FloatTensor::sum_by_batch(Tensor &output) const {
485 453 : size_t feat_len = dim.getFeatureLen();
486 453 : size_t batch = dim.batch();
487 :
488 453 : const float *data = (float *)getData();
489 : float *out_data = output.getData<float>();
490 :
491 453 : Tensor ones(1, 1, 1, feat_len, this->getFormat());
492 453 : ones.setValue(1.0);
493 453 : sgemv((unsigned int)dim.getStorageOrder(), false, (int)batch, (int)feat_len,
494 : 1, data, (int)feat_len, ones.getData<float>(), 1, 0.0, out_data, 1);
495 453 : }
496 :
497 126666 : Tensor &FloatTensor::sum(unsigned int axis, Tensor &output, float alpha,
498 : float beta) const {
499 126666 : const float *data = (float *)getData();
500 :
501 126666 : NNTR_THROW_IF(!contiguous, std::invalid_argument)
502 : << getName() << " is not contiguous, cannot sum";
503 :
504 126666 : if (axis >= 4)
505 3 : throw std::out_of_range("Error: axis is invalid");
506 :
507 126663 : if (dim.getDim()[axis] == 1 and alpha == 1.0 and !beta) {
508 111118 : CREATE_IF_EMPTY_DIMS(output, dim);
509 58861 : scopy(size(), (float *)getData(), 1, output.getData<float>(), 1);
510 58861 : return output;
511 : }
512 :
513 67802 : switch (axis) {
514 1654 : case 0: {
515 1939 : CREATE_IF_EMPTY_DIMS(output, 1, dim.channel(), dim.height(), dim.width(),
516 : getTensorType());
517 1654 : size_t feat_len = dim.getFeatureLen();
518 1654 : size_t batch = dim.batch();
519 1654 : Tensor ones(1, 1, 1, batch, getTensorType());
520 1654 : ones.setValue(alpha);
521 1654 : sgemv((unsigned int)dim.getStorageOrder(), true, (int)batch, (int)feat_len,
522 : 1, data, (int)feat_len, ones.getData<float>(), 1, beta,
523 : output.getData<float>(), 1);
524 1654 : } break;
525 64 : case 1: {
526 90 : CREATE_IF_EMPTY_DIMS(output, dim[0], 1, dim[2], dim[3], getTensorType());
527 64 : if (this->getFormat() == Tformat::NHWC) {
528 0 : unsigned int feat_len = output.getDim().getDataLen();
529 0 : unsigned int t_axis = dim[1];
530 0 : Tensor ones(1, 1, 1, t_axis, getTensorType());
531 0 : ones.setValue(alpha);
532 0 : sgemv((unsigned int)dim.getStorageOrder(), false, (int)feat_len,
533 : (int)t_axis, 1, data, (int)t_axis, ones.getData<float>(), 1, beta,
534 : output.getData<float>(), 1);
535 0 : } else {
536 64 : unsigned int feat_len = dim[2] * dim[3];
537 64 : unsigned int t_axis = dim[1];
538 64 : Tensor ones(1, 1, 1, t_axis, getTensorType());
539 64 : ones.setValue(alpha);
540 : float *rdata = output.getData<float>();
541 218 : for (unsigned int k = 0; k < dim[0]; ++k) {
542 154 : sgemv((unsigned int)dim.getStorageOrder(), true, (int)t_axis,
543 154 : (int)feat_len, 1, &data[k * dim.getFeatureLen()], (int)feat_len,
544 154 : ones.getData<float>(), 1, beta, &rdata[k * feat_len], 1);
545 : }
546 64 : }
547 : } break;
548 6658 : case 2: {
549 13211 : CREATE_IF_EMPTY_DIMS(output, dim[0], dim[1], 1, dim[3], getTensorType());
550 : if (this->getFormat() == Tformat::NHWC) {
551 0 : unsigned int feat_len = dim[1] * dim[3];
552 0 : unsigned int t_axis = dim[2];
553 0 : Tensor ones(1, 1, 1, t_axis, getTensorType());
554 0 : ones.setValue(alpha);
555 : float *rdata = output.getData<float>();
556 0 : for (unsigned int k = 0; k < dim[0]; ++k) {
557 0 : sgemv((unsigned int)dim.getStorageOrder(), true, (int)t_axis,
558 0 : (int)feat_len, 1, &data[k * dim.getFeatureLen()], (int)feat_len,
559 0 : ones.getData<float>(), 1, beta, &rdata[k * feat_len], 1);
560 : }
561 0 : } else {
562 6658 : unsigned int t_3 = dim[3];
563 6658 : unsigned int t_axis = dim[2];
564 6658 : Tensor ones(1, 1, 1, t_axis, getTensorType());
565 6658 : ones.setValue(alpha);
566 :
567 6658 : if (dim.getStorageOrder() == TStorageOrder::ROW_MAJOR) {
568 : float *rdata = output.getData<float>();
569 13364 : for (unsigned int k = 0; k < dim[0]; ++k) {
570 13516 : for (unsigned int c = 0; c < dim[1]; ++c) {
571 6810 : unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[2];
572 : unsigned int ridx =
573 6810 : k * output.getDim().getFeatureLen() + c * dim[3];
574 :
575 13620 : sgemv((unsigned int)dim.getStorageOrder(), true, (int)t_axis,
576 6810 : (int)t_3, 1, &data[idx], (int)t_3, ones.getData<float>(), 1,
577 6810 : beta, &rdata[ridx], 1);
578 : }
579 : }
580 : } else {
581 0 : sgemv((unsigned int)dim.getStorageOrder(), true, (int)t_axis,
582 0 : (int)output.getDim().getDataLen(), 1, data, (int)t_axis,
583 : ones.getData<float>(), 1, beta, output.getData<float>(), 1);
584 : }
585 6658 : }
586 : } break;
587 59426 : case 3: {
588 117785 : CREATE_IF_EMPTY_DIMS(output, dim[0], dim[1], dim[2], 1,
589 : this->getTensorType());
590 59426 : if (this->getFormat() == Tformat::NHWC) {
591 0 : unsigned int t_3 = dim[1];
592 0 : unsigned int t_axis = dim[3];
593 0 : Tensor ones(1, 1, 1, t_axis, getTensorType());
594 0 : ones.setValue(alpha);
595 : float *rdata = output.getData<float>();
596 0 : for (unsigned int k = 0; k < dim[0]; ++k) {
597 0 : for (unsigned int c = 0; c < dim[2]; ++c) {
598 0 : unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[1];
599 0 : unsigned int ridx = k * output.getDim().getFeatureLen() + c * dim[1];
600 0 : sgemv((unsigned int)dim.getStorageOrder(), true, (int)t_axis,
601 0 : (int)t_3, 1, &data[idx], (int)t_3, ones.getData<float>(), 1,
602 0 : beta, &rdata[ridx], 1);
603 : }
604 : }
605 0 : } else {
606 59426 : unsigned int m = output.getDim().getDataLen();
607 59426 : unsigned int n = dim[3];
608 59426 : Tensor ones(1, 1, 1, n, getTensorType());
609 59426 : ones.setValue(alpha);
610 :
611 59426 : if (dim.getStorageOrder() == TStorageOrder::ROW_MAJOR) {
612 59426 : sgemv((unsigned int)dim.getStorageOrder(), false, (int)m, (int)n, 1,
613 : data, (int)n, ones.getData<float>(), 1, beta,
614 : output.getData<float>(), 1);
615 : } else {
616 : float *rdata = output.getData<float>();
617 :
618 0 : for (unsigned int k = 0; k < dim[0]; ++k) {
619 0 : for (unsigned int c = 0; c < dim[1]; ++c) {
620 0 : unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[2];
621 0 : unsigned int ridx = k * dim[1] * dim[2] + c * dim[2];
622 :
623 0 : sgemv((unsigned int)dim.getStorageOrder(), false, (int)dim[2],
624 0 : (int)n, 1, &data[idx], (int)dim[2], ones.getData<float>(), 1,
625 0 : beta, &rdata[ridx], 1);
626 : }
627 : }
628 : }
629 59426 : }
630 : } break;
631 : default:
632 : throw std::out_of_range("Error: Dimension cannot exceed 3");
633 : }
634 :
635 : return output;
636 : }
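// Editor's note on sum() above (illustrative, not part of the original
// source): each axis-wise reduction is phrased as a GEMV with a vector of
// ones. For axis == 3 on a row-major tensor viewed as an (m x n) matrix,
// with m = output.getDim().getDataLen() and n = dim[3], the single sgemv
// call computes
//   out = alpha * A * 1_n + beta * out
// i.e. every length-n row of A is reduced to one scalar scaled by alpha.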
637 :
638 0 : Tensor &FloatTensor::abs(Tensor &output) const {
639 : auto f = [](float in) { return std::abs(in); };
640 0 : apply(f, output);
641 0 : return output;
642 : }
643 :
644 1931 : float FloatTensor::l2norm() const {
645 1931 : return snrm2(size(), (float *)getData(), 1);
646 : }
647 :
648 8 : void FloatTensor::normalization_i(unsigned int dim, float p, float epsilon) {
649 8 : NNTR_THROW_IF(!contiguous, std::invalid_argument)
650 : << getName() << " is not contiguous, cannot do normalization.";
651 :
652 10 : NNTR_THROW_IF(p != 2.0f, std::invalid_argument)
653 : << "Only L2 norm (p=2.0) is supported currently";
654 :
655 6 : float *data = (float *)getData();
656 6 : size_t dim_size = this->dim.getTensorDim(dim);
657 6 : size_t stride = strides[dim];
658 :
659 6 : if (dim == 3 && stride == 1) {
660 3 : size_t total_elements = size();
661 3 : int num_vectors = static_cast<int>(total_elements / dim_size);
662 :
663 3 : #pragma omp parallel for
664 : for (int i = 0; i < num_vectors; ++i) {
665 : float *vec_ptr = data + i * dim_size;
666 : float norm = snrm2(dim_size, vec_ptr, 1);
667 : float scale = 1.0f / std::max(norm, epsilon);
668 : sscal(dim_size, scale, vec_ptr, 1);
669 : }
670 : } else {
671 : throw nntrainer::exception::not_supported(
672 : "FloatTensor::normalization_i currently only optimizes for the last "
673 6 : "dimension (dim=3) with stride 1.");
674 : }
675 3 : }
676 :
677 790 : Tensor &FloatTensor::pow(float exponent, Tensor &output) const {
678 128994 : auto f = [exponent](float in) { return powf(in, exponent); };
679 790 : apply(f, output);
680 790 : return output;
681 : }
682 :
683 6 : Tensor &FloatTensor::sqrt(Tensor &output) const {
684 : auto f = [](float in) { return std::sqrt(in); };
685 6 : apply(f, output);
686 6 : return output;
687 : }
688 :
689 1 : Tensor &FloatTensor::erf(Tensor &output) const {
690 : auto f = [](float in) { return std::erf(in); };
691 1 : apply(f, output);
692 1 : return output;
693 : }
694 :
695 11 : void FloatTensor::sin(Tensor &out, float alpha) {
696 11 : if (!contiguous) {
697 90 : auto f = [alpha](float val) -> float { return std::sin(alpha * val); };
698 2 : apply(f, out);
699 : } else {
700 10 : sine(size(), (float *)getData(), out.getData<float>(), alpha);
701 : }
702 11 : }
703 :
704 14 : void FloatTensor::cos(Tensor &out, float alpha) {
705 14 : if (!contiguous) {
706 90 : auto f = [alpha](float val) -> float { return std::cos(alpha * val); };
707 2 : apply(f, out);
708 : } else {
709 13 : cosine(size(), (float *)getData(), out.getData<float>(), alpha);
710 : }
711 14 : }
712 :
713 6 : void FloatTensor::tan(Tensor &output, float alpha) {
714 12 : auto f = [alpha](float val) -> float { return std::tan(alpha * val); };
715 6 : apply(f, output);
716 6 : }
717 :
718 4 : void FloatTensor::inv_sqrt(Tensor &out) {
719 4 : apply([](float val) -> float { return 1 / std::sqrt(val); }, out);
720 4 : }
721 :
722 36830 : Tensor &FloatTensor::dot(Tensor const &input, Tensor &output, bool trans,
723 : bool trans_in, float beta) const {
724 : /**
725 : * @note FP32.dot(input);
726 : * the invoked kernel varies according to the input data type.
727 : */
728 36830 : switch (input.getDataType()) {
729 : /** applying sgemm/sgemv after type casting to FP32 */
730 36824 : case Tdatatype::FP32:
731 36824 : dotFloat(input, output, trans, trans_in, beta);
732 36821 : break;
733 0 : case Tdatatype::FP16:
734 0 : dotFloat32Float16(input, output, trans, trans_in, beta);
735 0 : break;
736 : /** applying gemm_q4_k / gemm_q6_k / gemm_q4_0 */
737 6 : case Tdatatype::Q4_K:
738 : case Tdatatype::Q6_K:
739 : case Tdatatype::Q4_0:
740 6 : dotQnK(input, output, trans, trans_in, beta, input.getDataType());
741 6 : break;
742 0 : case Tdatatype::QINT16:
743 : case Tdatatype::QINT8:
744 : case Tdatatype::QINT4:
745 0 : dotQInteger(input, output, trans, trans_in, beta, input.getDataType());
746 0 : break;
747 0 : default:
748 0 : throw std::invalid_argument("Error: unsupported datatype");
749 : }
750 36827 : return output;
751 : }
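// Editor's summary of dot() above (not part of the original source): the
// kernel is selected by the right-hand operand's data type:
//   FP32                    -> dotFloat()           (sdot / sgemv / sgemm)
//   FP16                    -> dotFloat32Float16()  (requires ENABLE_FP16)
//   Q4_K / Q6_K / Q4_0      -> dotQnK()             (gemm_q4_K / q6_K / q4_0)
//   QINT16 / QINT8 / QINT4  -> dotQInteger()
// and the result is always written into an FP32 output tensor.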
752 :
753 0 : void FloatTensor::dot(std::vector<Tensor *> input, std::vector<Tensor *> output,
754 : bool trans, bool trans_in, float beta) const {
755 0 : float *data = (float *)getData();
756 0 : unsigned int M = getDim().height();
757 0 : unsigned int K = getDim().width();
758 0 : Tdatatype input_dtype = input[0]->getDataType();
759 :
760 : // Handle standard inputs
761 0 : if (input_dtype != Tdatatype::Q4_0 && input_dtype != Tdatatype::QINT4) {
762 0 : for (unsigned int i = 0; i < input.size(); ++i) {
763 0 : dot(*input[i], *output[i], trans, trans_in, beta);
764 : }
765 0 : return;
766 : }
767 :
768 : std::vector<unsigned int> Ns;
769 : std::vector<void *> mdatas;
770 : std::vector<float *> rdatas;
771 :
772 0 : for (unsigned int i = 0; i < input.size(); ++i) {
773 0 : Ns.push_back(input[i]->getDim().width());
774 0 : mdatas.push_back((void *)input[i]->getData<uint8_t>());
775 0 : rdatas.push_back(output[i]->getData<float>());
776 : }
777 :
778 : #ifdef ENABLE_OPENCL
779 : if (input_dtype == Tdatatype::Q4_0) {
780 : if (M == 1) {
781 : for (unsigned int i = 0; i < input.size(); ++i) {
782 : gemm_q4_0(M, Ns[i], K, data, K, mdatas[i], Ns[i], rdatas[i], Ns[i]);
783 : }
784 : } else {
785 : gemm_q4_0_async_cl(mdatas, data, rdatas, M, Ns, K);
786 : }
787 : } else { // QINT4
789 : /// Run on GPU only when the memory is Shared Virtual Memory (SVM)
789 : if (input[0]->getMemoryData()->isSVM() &&
790 : output[0]->getMemoryData()->isSVM() && getMemoryData()->isSVM()) {
791 : std::vector<uint16_t *> scales;
792 : for (unsigned int i = 0; i < input.size(); ++i) {
793 : scales.push_back(input[i]->getScale<uint16_t>());
794 : }
795 : if (M == 1) {
796 : gemv_int4_async_cl(mdatas, scales, data, rdatas, K, Ns,
797 : Int4QTensor::getGroupSize());
798 : } else {
799 : gemm_int4_async_cl(data, mdatas, scales, rdatas, M, Ns, K,
800 : Int4QTensor::getGroupSize());
801 : }
802 : } else {
803 : /// @todo This should be replaced with standard CPU INT4 computation
804 : for (unsigned int i = 0; i < input.size(); ++i) {
805 : gemm_q4_0(M, Ns[i], K, data, K, (void *)input[i]->getData(), Ns[i],
806 : rdatas[i], Ns[i]);
807 : }
808 : }
809 : }
810 : #else
811 0 : if (input_dtype == Tdatatype::Q4_0) {
812 : /// @todo Support multi-weight q4_0 for x64
813 0 : for (unsigned int i = 0; i < input.size(); ++i) {
814 0 : gemm_q4_0(M, Ns[i], K, data, K, mdatas[i], Ns[i], rdatas[i], Ns[i]);
815 : }
816 : } else { // QINT4
817 : /// @note It is essential to understand that this section of the code
818 : /// requires the `input` data to be converted to Q4_0 type, not QINT4 type.
819 : /// This should be replaced with standard CPU INT4 computation instead of
820 : /// using Q4_0.
821 0 : for (unsigned int i = 0; i < input.size(); ++i) {
822 0 : gemm_q4_0(M, Ns[i], K, data, K, (void *)input[i]->getData(), Ns[i],
823 : rdatas[i], Ns[i]);
824 : }
825 : }
826 : #endif
827 0 : }
828 :
829 36824 : Tensor &FloatTensor::dotFloat(Tensor const &input, Tensor &output, bool trans,
830 : bool trans_in, float beta) const {
831 : // Commented out with the intention to support the calculation wrt. the batch
832 : // and height direction. It assumes this->dim is [ BxCxH,W ] and
833 : // input.dim is [BxCxH,W] as well if (input.dim.rank() > 2) {
834 : // throw exception::not_supported("Error: support only for rank of dot "
835 : // "matrix <= 2");
836 : // }
837 :
838 : // Commented out with the intention to support the calculation wrt. the batch
839 : // and height direction of this tensor. It is OK as long as the input is 2D
840 36824 : if (trans && dim.rank() > 2) {
841 932 : ml_logw("Warning: support only for rank of dot matrix <= 2 with trans");
842 : }
843 : unsigned int first_three_flat, last_axis, input_first_three_flat,
844 : input_last_axis, M, N, K, lda, ldb, ldc;
845 :
846 36824 : calculateFlattenDot(input, output, trans, trans_in, first_three_flat,
847 : last_axis, input_first_three_flat, input_last_axis, M, N,
848 : K, lda, ldb, ldc);
849 :
850 36821 : const float *data = (float *)getData();
851 : const float *mdata = input.getData<float>();
852 : float *rdata = output.getData<float>();
853 : const float alpha = 1.0f;
854 :
855 : /// shortcut handling in case of vector
856 : /// for vector, (1 * K) == (K * 1) in current memory layout...
857 : /// and please note that N, K, and M are fixed placeholders after considering
858 : /// transpose.
859 : /// For example, there is no case like (1 * K) X (1 * K), while
860 : /// (1 * K) X (1 * M) can be a case.
861 : /// case1: (1 * K) X (K * 1)
862 36821 : if (M == 1 && N == 1) {
863 182 : *rdata =
864 182 : sdot(K, data, 1, mdata, 1) + ((0.0f == beta) ? 0.0f : beta * *rdata);
865 : }
866 : /// case2: (M * K) X (K * 1)
867 36639 : else if (N == 1) {
868 10955 : sgemv((unsigned int)dim.getStorageOrder(), trans, first_three_flat,
869 : last_axis, alpha, data, lda, mdata, 1, beta, rdata, 1);
870 : }
871 : /// case3: (1 * K) X (K * N) = 1 * N = R
872 : /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
873 : /// Effectively a translation of sgemv
874 25684 : else if (M == 1) {
875 8736 : sgemv((unsigned int)dim.getStorageOrder(), !trans_in,
876 : input_first_three_flat, input_last_axis, alpha, mdata, ldb, data, 1,
877 : beta, rdata, 1);
878 : }
879 : /// case others: use gemm
880 : else {
881 16948 : sgemm((unsigned int)dim.getStorageOrder(), trans, trans_in, M, N, K, alpha,
882 : data, lda, mdata, ldb, beta, rdata, ldc);
883 : }
884 :
885 36821 : return output;
886 : }
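// Editor's note on dotFloat() above (worked example, not part of the original
// source): for C(M x N) = A(M x K) . B(K x N) the dispatch is
//   M == 1 && N == 1 -> sdot  (inner product of two length-K vectors)
//   N == 1           -> sgemv (matrix-vector product A * b)
//   M == 1           -> sgemv (computed as B^T * a by flipping trans_in)
//   otherwise        -> sgemm
// e.g. a (1 x 768) activation against a (768 x 3072) weight takes the
// M == 1 sgemv path instead of a full sgemm.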
887 :
888 0 : Tensor &FloatTensor::dotFloat32Float16(Tensor const &input, Tensor &output,
889 : bool trans, bool trans_in,
890 : float beta) const {
891 : /// @todo remove #ifdef ENABLE_FP16
892 : #ifdef ENABLE_FP16
893 :
894 : // Commented out with the intention to support the calculation wrt. the batch
895 : // and height direction. It assumes this->dim is [ BxCxH,W ] and
896 : // input.dim is [BxCxH,W] as well if (input.dim.rank() > 2) {
897 : // throw exception::not_supported("Error: support only for rank of dot "
898 : // "matrix <= 2");
899 : // }
900 :
901 : // Commented out with the intention to support the calculation wrt. the batch
902 : // and height direction of this tensor. It is OK as long as the input is 2D
903 : if (trans && dim.rank() > 2) {
904 : ml_logw("Warning: support only for rank of dot matrix <= 2 with trans");
905 : }
906 : unsigned int first_three_flat, last_axis, input_first_three_flat,
907 : input_last_axis, M, N, K, lda, ldb, ldc;
908 :
909 : calculateFlattenDot(input, output, trans, trans_in, first_three_flat,
910 : last_axis, input_first_three_flat, input_last_axis, M, N,
911 : K, lda, ldb, ldc);
912 :
913 : const float *data = (float *)getData();
914 : const _FP16 *mdata = input.getData<_FP16>();
915 : float *rdata = output.getData<float>();
916 : const float alpha = 1.0f;
917 :
918 : /// shortcut handling in case of vector
919 : /// for vector, (1 * K) == (K * 1) in current memory layout...
920 : /// and please note that N, K, and M are fixed placeholders after considering
921 : /// transpose.
922 : /// For example, there is no case like (1 * K) X (1 * K), while
923 : /// (1 * K) X (1 * M) can be a case.
924 : /// case1: (1 * K) X (K * 1)
925 : NNTR_THROW_IF((M == 1 && N == 1), std::invalid_argument)
926 : << "dotFloat32Float16 does not support the (1 * K) X (K * 1) case";
927 : /// case2: (M * K) X (K * 1)
928 : if (N == 1) {
929 : shgemv((unsigned int)dim.getStorageOrder(), trans, first_three_flat,
930 : last_axis, alpha, data, lda, mdata, 1, beta, rdata, 1);
931 : }
932 : /// case3: (1 * K) X (K * N) = 1 * N = R
933 : /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
934 : /// Effectively a translation of sgemv
935 : else if (M == 1) {
936 : hsgemv((unsigned int)dim.getStorageOrder(), !trans_in,
937 : input_first_three_flat, input_last_axis, alpha, mdata, ldb, data, 1,
938 : beta, rdata, 1);
939 : }
940 : /// case others: use gemm
941 : else {
942 : shgemm((unsigned int)dim.getStorageOrder(), trans, trans_in, M, N, K, alpha,
943 : data, lda, mdata, ldb, beta, rdata, ldc);
944 : }
945 :
946 : return output;
947 : #else
948 0 : throw std::invalid_argument("Error: enable-fp16 is not enabled");
949 : #endif
950 : }
951 :
952 6 : Tensor &FloatTensor::dotQnK(Tensor const &input, Tensor &output, bool trans,
953 : bool trans_in, float beta, Tdatatype dtype) const {
954 : ///@note Be cautious.
955 : /// Qn_K does not support transpose in principle.
956 : /// The trans option only applies to the tensor dimension,
957 : /// not to the data.
958 : ///@note trans is not yet applied
959 6 : NNTR_THROW_IF(trans, std::invalid_argument)
960 : << "dotQnK does not support trans";
961 :
962 6 : float *data = (float *)getData();
963 : uint8_t *mdata = input.getData<uint8_t>();
964 : float *rdata = output.getData<float>();
965 :
966 : unsigned int M, N, K;
967 6 : M = getDim().height();
968 6 : K = getDim().width();
969 6 : N = trans_in ? input.getDim().height() : input.getDim().width();
970 :
971 6 : switch (dtype) {
972 3 : case Tdatatype::Q4_K:
973 3 : gemm_q4_K(M, N, K, data, K, (void *)mdata, N, rdata, N);
974 3 : break;
975 2 : case Tdatatype::Q6_K:
976 2 : gemm_q6_K(M, N, K, data, K, (void *)mdata, N, rdata, N);
977 2 : break;
978 1 : case Tdatatype::Q4_0:
979 1 : M = getDim().height();
980 1 : K = getDim().width();
981 1 : N = input.getDim().width();
982 : #ifdef ENABLE_OPENCL
983 : if (M == 1) {
984 : gemm_q4_0(M, N, K, data, K, (void *)mdata, N, rdata, N);
985 : } else {
986 : gemm_q4_0_cl((void *)mdata, data, rdata, M, N, K);
987 : }
988 : #else
989 1 : gemm_q4_0(M, N, K, data, K, (void *)mdata, N, rdata, N);
990 : #endif
991 1 : break;
992 :
993 0 : default:
994 0 : throw std::invalid_argument("Error: unsupported datatype");
995 : }
996 :
997 6 : return output;
998 : }
999 :
1000 0 : Tensor &FloatTensor::dotQInteger(Tensor const &input, Tensor &output,
1001 : bool trans, bool trans_in, float beta,
1002 : Tdatatype dtype) const {
1003 :
1004 0 : float *data = (float *)getData();
1005 : char *mdata = input.getData<char>();
1006 : float *rdata = output.getData<float>();
1007 :
1008 0 : unsigned int M = getDim().height();
1009 0 : unsigned int K = getDim().width();
1010 0 : unsigned int N = output.getDim().width();
1011 :
1012 : #ifndef ENABLE_OPENCL
1013 : #ifdef ENABLE_FP16
1014 : if (input.q_scheme() == QScheme::PER_CHANNEL_AFFINE) {
1015 : uint32_t opt_kernel_idx = (M == 1) ? 1 : 5;
1016 : nntr_gemm_qai8dxp_qsi4cxp_packed(
1017 : M, N, K, (void *)data, (void *)mdata, rdata, opt_kernel_idx,
1018 : true); /// @todo kernel supports both trans / noTrans situation
1019 : } else {
1020 : throw std::runtime_error(
1021 : "Error: QINT4 Dot on CPU only supports PER_CHANNEL_AFFINE scheme");
1022 : }
1023 : #else
1024 : /// @note It is essential to understand that this section of the code requires
1025 : /// the `input` data to be converted to Q4_0 type, not QINT4 type. This should
1026 : /// be replaced with standard CPU INT4 computation instead of using Q4_0.
1027 0 : gemm_q4_0(M, N, K, data, K, (void *)input.getData(), N, rdata, N);
1028 : #endif
1029 : #else
1030 : if (input.getMemoryData()->isSVM() && output.getMemoryData()->isSVM() &&
1031 : getMemoryData()->isSVM()) {
1032 : if (M == 1) {
1033 : gemv_int4_cl(mdata, input.getScale<uint16_t>(), data, rdata, K, N,
1034 : Int4QTensor::getGroupSize());
1035 : } else {
1036 : sgemm_int4_cl(data, mdata, input.getScale<uint16_t>(), rdata, M, N, K,
1037 : Int4QTensor::getGroupSize());
1038 : }
1039 : } else {
1040 : /// @todo This should be replaced with standard CPU INT4 computation
1041 : gemm_q4_0(M, N, K, data, K, (void *)input.getData(), N, rdata, N);
1042 : }
1043 : #endif
1044 :
1045 0 : return output;
1046 : }
1047 :
1048 84504 : void FloatTensor::copy(const Tensor &from) {
1049 84504 : reshape(from.getDim());
1050 84504 : copy(from.getData<float>());
1051 84504 : }
1052 :
1053 13730 : void FloatTensor::copyData(const Tensor &from) {
1054 13730 : NNTR_THROW_IF(!contiguous, std::invalid_argument)
1055 : << getName() << " is not contiguous, cannot copy.";
1056 :
1057 13730 : NNTR_THROW_IF(size() != from.size(), std::invalid_argument)
1058 : << "Size of tensor to copy must match";
1059 :
1060 13730 : switch (from.getDataType()) {
1061 : case ml::train::TensorDim::DataType::FP32:
1062 13721 : copy(from.getData<float>());
1063 13721 : break;
1064 0 : case ml::train::TensorDim::DataType::FP16:
1065 : /// @todo remove #ifdef ENABLE_FP16
1066 : #ifdef ENABLE_FP16
1067 : scopy(size(), from.getData<_FP16>(), 1, (float *)getData(), 1);
1068 : #else
1069 0 : throw std::invalid_argument("Error: enable-fp16 is not enabled");
1070 : #endif
1071 : break;
1072 1 : case ml::train::TensorDim::DataType::QINT16:
1073 2 : copy_s16_fp32(from.size(), from.getData<int16_t>(), (float *)getData());
1074 1 : break;
1075 5 : case ml::train::TensorDim::DataType::QINT8:
1076 5 : scopy_int8_to_float32(from.size(), from.getData<int8_t>(), 1,
1077 5 : (float *)getData(), 1);
1078 5 : break;
1079 1 : case ml::train::TensorDim::DataType::UINT16:
1080 2 : copy_u16_fp32(from.size(), from.getData<uint16_t>(), (float *)getData());
1081 1 : break;
1082 2 : case ml::train::TensorDim::DataType::UINT8:
1083 2 : scopy_int8_to_float32(from.size(), from.getData<uint8_t>(), 1,
1084 2 : (float *)getData(), 1);
1085 2 : break;
1086 0 : default:
1087 : throw std::invalid_argument(
1088 0 : "[FloatTensor::copyData] Error: Unsupported data type");
1089 : break;
1090 : }
1091 13730 : }
1092 :
1093 3509 : void FloatTensor::copy_with_stride(const Tensor &input, Tensor &output) {
1094 7132 : for (unsigned int b = 0; b < output.batch(); ++b) {
1095 7246 : for (unsigned int c = 0; c < output.channel(); ++c) {
1096 16688 : for (unsigned int h = 0; h < output.height(); ++h) {
1097 91392 : for (unsigned int w = 0; w < output.width(); ++w) {
1098 78327 : output.setValue(b, c, h, w, input.getValue<float>(b, c, h, w));
1099 : }
1100 : }
1101 : }
1102 : }
1103 3509 : }
1104 :
1105 648 : std::vector<unsigned int> FloatTensor::argmax() const {
1106 : std::vector<unsigned int> result;
1107 648 : const float *data = (float *)getData();
1108 : size_t batch_size = batch();
1109 648 : size_t feature_len = dim.getFeatureLen();
1110 :
1111 648 : result.resize(batch_size);
1112 :
1113 8198 : for (unsigned int b = 0; b < batch_size; b++) {
1114 : auto max_iter =
1115 7550 : std::max_element(data + b * feature_len, data + (b + 1) * feature_len);
1116 7550 : result[b] = std::distance(data, max_iter) - (b * feature_len);
1117 : }
1118 648 : return result;
1119 0 : }
1120 :
1121 0 : std::vector<unsigned int> FloatTensor::argmin() const {
1122 : std::vector<unsigned int> result;
1123 0 : const float *data = (float *)getData();
1124 : size_t batch_size = batch();
1125 0 : size_t feature_len = dim.getFeatureLen();
1126 :
1127 0 : result.resize(batch_size);
1128 :
1129 0 : for (unsigned int b = 0; b < batch_size; b++) {
1130 : auto min_iter =
1131 0 : std::min_element(data + b * feature_len, data + (b + 1) * feature_len);
1132 0 : result[b] = std::distance(data, min_iter) - (b * feature_len);
1133 : }
1134 0 : return result;
1135 0 : }
1136 :
1137 6 : void FloatTensor::topK(unsigned int k, void *output_data,
1138 : uint32_t *indices_data) {
1139 : const auto &input_dim = getDim();
1140 : const Tformat format = input_dim.getFormat();
1141 6 : const auto batch = input_dim.batch();
1142 6 : const auto channel = input_dim.channel();
1143 6 : const auto height = input_dim.height();
1144 6 : const auto width = input_dim.width();
1145 :
1146 6 : if (k == 0 || k > width) {
1147 : throw std::invalid_argument(
1148 0 : "k must be greater than 0 and less than or equal to width");
1149 : }
1150 :
1151 : float *output_buffer = static_cast<float *>(output_data);
1152 :
1153 : // Calculate strides for input and output
1154 6 : const auto input_strides = input_dim.computeStrides();
1155 6 : TensorDim output_dim = input_dim;
1156 6 : output_dim.width(k);
1157 6 : const auto output_strides = output_dim.computeStrides();
1158 :
1159 : #ifdef _MSC_VER
1160 : #pragma warning(push)
1161 : #pragma warning(disable : 4849)
1162 : #endif
1163 6 : #pragma omp parallel for collapse(3)
1164 : #ifdef _MSC_VER
1165 : #pragma warning(pop)
1166 : #endif
1167 : for (int b = 0; b < static_cast<int>(batch); ++b) {
1168 : for (int c = 0; c < static_cast<int>(channel); ++c) {
1169 : for (int h = 0; h < static_cast<int>(height); ++h) {
1170 :
1171 : size_t offset;
1172 : if (format == Tformat::NCHW) {
1173 : // NCHW: [b][c][h][i]
1174 : offset =
1175 : b * input_strides[0] + c * input_strides[1] + h * input_strides[2];
1176 : } else {
1177 : // NHWC: [b][h][i][c]
1178 : offset = b * input_strides[0] + h * input_strides[1] + c;
1179 : }
1180 :
1181 : const unsigned int width_stride =
1182 : format == Tformat::NHWC ? input_strides[2] : 1;
1183 : const float *B = static_cast<const float *>(getData()) + offset;
1184 : std::vector<size_t> idx(width);
1185 : std::iota(idx.begin(), idx.end(), 0);
1186 : std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
1187 : [&B, width_stride](size_t i1, size_t i2) {
1188 103 : return B[i1 * width_stride] > B[i2 * width_stride];
1189 : });
1190 :
1191 : // write top-k values and their indices to output
1192 : for (unsigned int i = 0; i < k; ++i) {
1193 : size_t output_idx;
1194 : if (format == Tformat::NCHW) {
1195 : // NCHW: [b][c][h][i]
1196 : output_idx = b * output_strides[0] + c * output_strides[1] +
1197 : h * output_strides[2] + i;
1198 : } else {
1199 : // NHWC: [b][h][i][c]
1200 : output_idx = b * output_strides[0] + h * output_strides[1] +
1201 : i * output_strides[2] + c;
1202 : }
1203 : output_buffer[output_idx] = B[idx[i]];
1204 : indices_data[output_idx] = static_cast<uint32_t>(idx[i]);
1205 : }
1206 : }
1207 : }
1208 : }
1209 6 : }
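// Editor's note on topK() above (illustrative, not part of the original
// source): the per-row selection sorts index positions rather than values,
// comparing through the data pointer. For a row B = {0.1, 0.9, 0.4, 0.7}
// and k = 2, std::partial_sort leaves idx starting with {1, 3}, so the
// output row becomes {0.9, 0.7} with indices {1, 3}.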
1210 :
1211 3 : float FloatTensor::max_abs() const {
1212 3 : const float *data = (float *)getData();
1213 3 : unsigned int idx = isamax(size(), data, 1);
1214 3 : return *(data + idx);
1215 : }
1216 :
1217 169 : float FloatTensor::maxValue() const {
1218 169 : const float *data = (float *)getData();
1219 169 : return *std::max_element(data, data + size());
1220 : }
1221 :
1222 169 : float FloatTensor::minValue() const {
1223 169 : const float *data = (float *)getData();
1224 169 : return *std::min_element(data, data + size());
1225 : }
1226 :
1227 1114 : Tensor &FloatTensor::transpose(const std::string &direction,
1228 : Tensor &output) const {
1229 : unsigned int SL, SI, SJ, SK;
1230 :
1231 1114 : output.reshape(dim.transpose(direction));
1232 :
1233 1113 : int indexI = direction[0] - '0';
1234 1113 : int indexJ = direction[2] - '0';
1235 :
1236 1113 : SL = dim.batch(), SI = dim.channel(), SJ = dim.height(), SK = dim.width();
1237 :
1238 : bool is_format_nchw = (getFormat() == Tformat::NCHW);
1239 :
1240 1113 : const float *inptr = (float *)getData();
1241 : float *outptr = output.getData<float>();
1242 1113 : switch (indexI) {
1243 17 : case 0:
1244 17 : if (indexJ == 1) {
1245 2 : if (is_format_nchw) {
1246 308 : transposeloop(l, i, j, k, SL, SI, SJ, SK);
1247 : } else {
1248 0 : transposeloop_nhwc(l, j, k, i, SL, SJ, SK, SI);
1249 : }
1250 : } else {
1251 15 : if (is_format_nchw) {
1252 34 : for (unsigned int b = 0; b < batch(); ++b) {
1253 44 : for (unsigned int c = 0; c < channel(); ++c) {
1254 50 : transpose_matrix(
1255 25 : height(), width(), (float *)getData() + getIndex(b, c, 0, 0),
1256 25 : width(), (float *)output.getData() + output.getIndex(b, c, 0, 0),
1257 25 : output.width());
1258 : }
1259 : }
1260 : } else {
1261 0 : transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI);
1262 : }
1263 : }
1264 : break;
1265 1092 : case 1:
1266 1092 : if (indexJ == 0) {
1267 1086 : if (is_format_nchw) {
1268 118863 : transposeloop(l, j, i, k, SL, SJ, SI, SK);
1269 : } else {
1270 0 : transposeloop_nhwc(l, i, k, j, SL, SI, SK, SJ);
1271 : }
1272 : } else {
1273 6 : if (is_format_nchw) {
1274 2928 : transposeloop(l, j, k, i, SL, SJ, SK, SI);
1275 : } else {
1276 0 : transposeloop_nhwc(l, k, i, j, SL, SK, SI, SJ);
1277 : }
1278 : }
1279 : break;
1280 4 : case 2:
1281 4 : if (indexJ == 0) {
1282 2 : if (is_format_nchw) {
1283 338 : transposeloop(l, k, i, j, SL, SK, SI, SJ);
1284 : } else {
1285 0 : transposeloop_nhwc(l, i, j, k, SL, SI, SJ, SK);
1286 : }
1287 : } else {
1288 2 : if (is_format_nchw) {
1289 398 : transposeloop(l, k, j, i, SL, SK, SJ, SI);
1290 : } else {
1291 0 : transposeloop_nhwc(l, j, i, k, SL, SJ, SI, SK);
1292 : }
1293 : }
1294 : break;
1295 : }
1296 :
1297 1113 : return output;
1298 : }
1299 :
1300 10 : void FloatTensor::dropout_mask(float dropout) {
1301 10 : float scale = 1.0 / (1 - dropout);
1302 10 : float *data_ = (float *)getData();
1303 370 : for (unsigned int i = 0; i < size(); ++i) {
1304 360 : if (data_[i] >= dropout)
1305 148 : data_[i] = scale;
1306 : else
1307 212 : data_[i] = 0.0;
1308 : }
1309 10 : }
1310 :
1311 0 : void FloatTensor::filter_mask(const Tensor &mask_len, bool reverse) {
1312 : float fill_mask_val = 0.0;
1313 : float en_mask_val = 1.0 - fill_mask_val;
1314 :
1315 0 : if (reverse) {
1316 : fill_mask_val = 1.0;
1317 : en_mask_val = 1.0 - fill_mask_val;
1318 : }
1319 :
1320 0 : setValue(fill_mask_val);
1321 :
1322 0 : NNTR_THROW_IF(mask_len.batch() != batch(), std::invalid_argument)
1323 : << "Number of filter masks mismatched";
1324 :
1325 0 : for (unsigned int b = 0; b < batch(); b++) {
1326 0 : float *addr = (float *)getAddress(getIndex(b, 0, 0, 0));
1327 : const unsigned int *mask_len_val =
1328 0 : mask_len.getAddress<unsigned int>(b, 0, 0, 0);
1329 0 : std::fill(addr, addr + (*mask_len_val), en_mask_val);
1330 : }
1331 0 : }
1332 :
1333 3 : void FloatTensor::zoneout_mask(Tensor &opposite, float zoneout) {
1334 3 : opposite.setRandBernoulli(zoneout);
1335 :
1336 3 : float *data = (float *)getData();
1337 : float *opposite_data = opposite.getData<float>();
1338 :
1339 2010003 : for (unsigned int i = 0; i < size(); ++i) {
1340 2010000 : if (opposite_data[i] > epsilon) {
1341 603513 : data[i] = 0.0f;
1342 : } else {
1343 1406487 : data[i] = 1.0f;
1344 : }
1345 : }
1346 3 : }
1347 :
1348 13 : std::vector<Tensor> FloatTensor::split(std::vector<size_t> sizes, int axis) {
1349 : size_t num_size = sizes.size();
1350 :
1351 13 : if (axis == -1) {
1352 : axis = 3;
1353 : }
1354 :
1355 : size_t total_size =
1356 : std::accumulate(sizes.begin(), sizes.end(), static_cast<size_t>(0));
1357 14 : NNTR_THROW_IF(dim.getTensorDim(axis) != total_size, std::invalid_argument)
1358 : << "given sum of sizes did not match with origin tensor dim, tensor dim: "
1359 1 : << dim.getTensorDim(axis) << " total size: " << total_size;
1360 :
1361 12 : std::vector<TensorDim> ret_dims(num_size, dim);
1362 43 : for (unsigned int i = 0; i < num_size; ++i) {
1363 31 : ret_dims[i].setTensorDim(axis, sizes[i]);
1364 : }
1365 :
1366 12 : bool is_format_nchw = (dim.getFormat() == Tformat::NCHW) ? true : false;
1367 : std::vector<Tensor> ret;
1368 :
1369 1248 : auto iter_value = [this, is_format_nchw](
1370 : std::array<size_t, 4> &loc,
1371 : const std::array<size_t, 4> &end_loc,
1372 : const std::array<size_t, 4> &reset_dim_arr) -> float & {
1373 1248 : auto &value = (is_format_nchw) ? getValue(loc[0], loc[1], loc[2], loc[3])
1374 0 : : getValue(loc[0], loc[3], loc[1], loc[2]);
1375 1960 : for (int i = 3; i >= 0; --i) {
1376 1929 : loc[i]++;
1377 1929 : if (loc[i] == end_loc[i]) {
1378 712 : loc[i] -= reset_dim_arr[i];
1379 : continue;
1380 : }
1381 : break;
1382 : }
1383 1248 : return value;
1384 12 : };
1385 :
1386 : unsigned int accumulated_size = 0;
1387 43 : for (unsigned int i = 0; i < num_size; ++i) {
1388 31 : std::array<size_t, 4> loc = {0, 0, 0, 0};
1389 :
1390 31 : if (is_format_nchw) {
1391 31 : loc[axis] += accumulated_size;
1392 : } else {
1393 0 : if (axis == 0) {
1394 0 : loc[0] += accumulated_size;
1395 0 : } else if (axis == 1) {
1396 0 : loc[3] += accumulated_size;
1397 0 : } else if (axis == 2 || axis == 3) {
1398 0 : loc[axis - 1] += accumulated_size;
1399 : }
1400 : }
1401 :
1402 62 : ret.push_back(Tensor(ret_dims[i]));
1403 : auto &ret_t = ret.back();
1404 :
1405 : std::array<size_t, 4> end_loc;
1406 :
1407 31 : if (is_format_nchw) {
1408 62 : end_loc = {ret_dims[i].batch(), ret_dims[i].channel(),
1409 31 : ret_dims[i].height(), ret_dims[i].width()};
1410 : } else {
1411 0 : end_loc = {ret_dims[i].batch(), ret_dims[i].height(), ret_dims[i].width(),
1412 0 : ret_dims[i].channel()};
1413 : }
1414 :
1415 31 : accumulated_size += sizes[i];
1416 :
1417 31 : if (is_format_nchw) {
1418 31 : end_loc[axis] = accumulated_size;
1419 : } else {
1420 0 : if (axis == 0) {
1421 0 : end_loc[0] = accumulated_size;
1422 0 : } else if (axis == 1) {
1423 0 : end_loc[3] = accumulated_size;
1424 0 : } else if (axis == 2 || axis == 3) {
1425 0 : end_loc[axis - 1] = accumulated_size;
1426 : }
1427 : }
1428 :
1429 : std::array<size_t, 4> reset_dim_arr;
1430 : if (is_format_nchw) {
1431 62 : reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].channel(),
1432 31 : ret_dims[i].height(), ret_dims[i].width()};
1433 : } else {
1434 0 : reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].height(),
1435 0 : ret_dims[i].width(), ret_dims[i].channel()};
1436 : }
1437 :
1438 31 : ret_t.apply_i<float>(
1439 62 : [&iter_value, &loc, &end_loc, &reset_dim_arr](float _) {
1440 1248 : return iter_value(loc, end_loc, reset_dim_arr);
1441 : });
1442 : }
1443 :
1444 12 : return ret;
1445 12 : }
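// Editor's note on split() above (illustrative, not part of the original
// source): e.g. splitting a {2, 1, 1, 6} tensor along axis 3 with sizes
// {2, 4} yields outputs of dim {2, 1, 1, 2} and {2, 1, 1, 4}. For the i-th
// output, iter_value walks the i-th contiguous range of the split axis in
// the source (offset by the sizes already consumed) while apply_i fills the
// output in its own natural order.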
1446 :
1447 5 : Tensor FloatTensor::concat(const std::vector<Tensor> &tensors, int axis,
1448 : Tensor &output) {
1449 5 : bool is_format_nchw = (tensors.front().getDim().getFormat() == Tformat::NCHW);
1450 :
1451 : auto iter_value =
1452 746 : [is_format_nchw](std::array<unsigned, 4> &loc,
1453 : const std::array<unsigned, 4> &start_loc, Tensor &t,
1454 : const std::array<unsigned, 4> &ref_dim_arr) -> float & {
1455 746 : auto &value = is_format_nchw
1456 746 : ? t.getValue<float>(loc[0], loc[1], loc[2], loc[3])
1457 0 : : t.getValue<float>(loc[0], loc[3], loc[1], loc[2]);
1458 :
1459 1044 : for (int i = 3; i >= 0; --i) {
1460 1033 : loc[i]++;
1461 1033 : if (loc[i] - start_loc[i] == ref_dim_arr[i]) {
1462 298 : loc[i] = start_loc[i];
1463 : continue;
1464 : }
1465 : break;
1466 : }
1467 746 : return value;
1468 5 : };
1469 :
1470 5 : std::array<unsigned, 4> loc = {0, 0, 0, 0};
1471 16 : for (auto &t : tensors) {
1472 11 : std::array<unsigned, 4> start_loc = loc;
1473 : std::array<unsigned, 4> tensor_dim_arr;
1474 11 : TensorDim curr_dim = t.getDim();
1475 :
1476 11 : tensor_dim_arr[0] = curr_dim.getTensorDim(0);
1477 22 : tensor_dim_arr[1] =
1478 11 : is_format_nchw ? curr_dim.getTensorDim(1) : curr_dim.getTensorDim(2);
1479 22 : tensor_dim_arr[2] =
1480 11 : is_format_nchw ? curr_dim.getTensorDim(2) : curr_dim.getTensorDim(3);
1481 22 : tensor_dim_arr[3] =
1482 11 : is_format_nchw ? curr_dim.getTensorDim(3) : curr_dim.getTensorDim(1);
1483 :
1484 757 : for (size_t i = 0u, sz = t.size(); i < sz; ++i) {
1485 746 : iter_value(loc, start_loc, output, tensor_dim_arr) = t.getValue<float>(i);
1486 : }
1487 :
1488 11 : if (is_format_nchw) {
1489 11 : loc[axis] += curr_dim.getTensorDim(axis);
1490 : } else {
1491 0 : if (axis == 0) {
1492 0 : loc[0] += curr_dim.getTensorDim(axis);
1493 0 : } else if (axis == 1) {
1494 0 : loc[3] += curr_dim.getTensorDim(axis);
1495 0 : } else if (axis == 2 || axis == 3) {
1496 0 : loc[axis - 1] += curr_dim.getTensorDim(axis);
1497 : }
1498 : }
1499 : }
1500 :
1501 5 : return output;
1502 : }
1503 :
1504 10 : void FloatTensor::print(std::ostream &out) const {
1505 10 : const float *data = (float *)getData();
1506 10 : unsigned int len = size();
1507 10 : out << "data addr: " << data << '\n';
1508 10 : out << dim;
1509 :
1510 10 : if (len > 100) {
1511 6 : out << '[' << data[0] << ' ' << data[1] << ' ' << data[2] << " ... "
1512 6 : << data[len - 3] << ' ' << data[len - 2] << ' ' << data[len - 1] << ']'
1513 : << std::endl;
1514 6 : return;
1515 : }
1516 :
1517 4 : std::ios init(NULL);
1518 4 : init.copyfmt(out);
1519 :
1520 4 : if (getFormat() == Tformat::NCHW) {
1521 10 : for (unsigned int k = 0; k < batch(); k++) {
1522 12 : for (unsigned int l = 0; l < channel(); l++) {
1523 17 : for (unsigned int i = 0; i < height(); i++) {
1524 47 : for (unsigned int j = 0; j < width(); j++) {
1525 : out << std::setw(10) << std::setprecision(10)
1526 36 : << data[getIndex(k, l, i, j)] << " ";
1527 : }
1528 : out << std::endl;
1529 : }
1530 : out << std::endl;
1531 : }
1532 : out << "-------" << std::endl;
1533 : }
1534 : } else {
1535 0 : for (unsigned int k = 0; k < batch(); k++) {
1536 0 : for (unsigned int i = 0; i < height(); i++) {
1537 0 : for (unsigned int j = 0; j < width(); j++) {
1538 0 : for (unsigned int l = 0; l < channel(); l++) {
1539 : out << std::setw(10) << std::setprecision(10)
1540 0 : << data[getIndex(k, l, i, j)] << " ";
1541 : }
1542 : out << std::endl;
1543 : }
1544 : out << std::endl;
1545 : }
1546 : out << "-------" << std::endl;
1547 : }
1548 : }
1549 4 : out.copyfmt(init);
1550 : }
1551 :
1552 112088 : void FloatTensor::copy(const void *buf) {
1553 112088 : NNTR_THROW_IF(!contiguous, std::invalid_argument)
1554 : << getName() << " is not contiguous, cannot copy.";
1555 :
1556 112088 : if (buf == getData()) {
1557 : return;
1558 : }
1559 :
1560 111785 : scopy(size(), (float *)buf, 1, (float *)getData(), 1);
1561 : }
1562 :
1563 100882 : void FloatTensor::apply_broadcast_util(
1564 : Tensor const &m,
1565 : std::function<void(const BroadcastInfo &e, const float *, const float *,
1566 : float *)>
1567 : v_func,
1568 : Tensor &output, const BroadcastInfo &e, int cur_axis, size_t offset,
1569 : size_t m_offset) const {
1570 :
1571 100882 : const float *buf = (float *)this->getData();
1572 : const float *m_buf = m.getData<float>();
1573 : float *out_buf = output.getData<float>();
1574 :
1575 100882 : if (e.buffer_axis == cur_axis) {
1576 77211 : v_func(e, buf + offset, m_buf + m_offset, out_buf + offset);
1577 77211 : return;
1578 : }
1579 :
1580 23671 : cur_axis++;
1581 23671 : unsigned int continuity[4] = {0, 1, 2, 3};
1582 23671 : if (getFormat() == Tformat::NHWC) {
1583 10 : continuity[1] = 2;
1584 10 : continuity[2] = 3;
1585 10 : continuity[3] = 1;
1586 : }
1587 113890 : for (unsigned int i = 0; i < dim.getTensorDim(continuity[cur_axis]); ++i) {
1588 90219 : size_t next_offset = offset + i * strides[cur_axis];
1589 90219 : size_t next_m_offset = m_offset + i * e.strides[cur_axis];
1590 180438 : apply_broadcast_util(m, v_func, output, e, cur_axis, next_offset,
1591 : next_m_offset);
1592 : }
1593 : }
1594 :
1595 76795 : void FloatTensor::apply_broadcast(
1596 : Tensor const &m,
1597 : std::function<void(const BroadcastInfo &e, const float *, const float *,
1598 : float *)>
1599 : v_func,
1600 : Tensor &output) const {
1601 91891 : CREATE_IF_EMPTY_DIMS(output, dim);
1602 :
1603 76799 : NNTR_THROW_IF(getData() == nullptr, std::invalid_argument)
1604 : << getName() << " is not allocated";
1605 76795 : NNTR_THROW_IF(m.getData<float>() == nullptr, std::invalid_argument)
1606 4 : << m.getName() << " is not allocated";
1607 76791 : NNTR_THROW_IF(output.getData<float>() == nullptr, std::invalid_argument)
1608 4 : << output.getName() << " is not allocated";
1609 :
1610 : /// shortcut to cover when dimension matches
1611 : /// note that buffer_size, the last stride is only used in v_func but it
1612 : /// might be changed
1613 76783 : if (dim == m.getDim()) {
1614 : BroadcastInfo e;
1615 65954 : e.buffer_size = size();
1616 65954 : e.strides[3] = 1;
1617 65954 : e.tensor_type = getTensorType();
1618 131908 : v_func(e, (float *)getData(), m.getData<float>(), output.getData<float>());
1619 : return;
1620 : }
1621 :
1622 21492 : return apply_broadcast_util(m, v_func, output, this->computeBroadcastInfo(m));
1623 : }
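// Editor's note on apply_broadcast() above (sketch, not part of the original
// source): when the two shapes already match, the whole operation collapses
// to a single flat v_func call over size() elements; only when they differ
// does apply_broadcast_util recurse per axis, using the BroadcastInfo from
// computeBroadcastInfo() to describe how m's smaller buffer is re-read along
// the broadcast axes (e.g. adding a {1, 1, 1, W} bias to a {B, C, H, W}
// tensor).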
1624 :
1625 12 : bool FloatTensor::isValid() const {
1626 12 : return is_valid(dim.getDataLen(), (float *)getData());
1627 : }
1628 :
1629 : } // namespace nntrainer