Line data Source code
1 : // SPDX-License-Identifier: Apache-2.0
2 : /**
3 : * @file float_tensor.cpp
4 : * @date 01 December 2023
5 : * @brief This is FloatTensor class for 32-bit floating point calculation
6 : * @see https://github.com/nnstreamer/nntrainer
7 : * @author Jijoong Moon <jijoong.moon@samsung.com>
8 : * @author Donghyeon Jeong <dhyeon.jeong@samsung.com>
9 : * @bug No known bugs except for NYI items
10 : */
11 :
12 : #include <iomanip>
13 : #include <iostream>
14 : #include <numeric>
15 :
16 : #include <chrono>
17 : #include <cpu_backend.h>
18 : #include <float_tensor.h>
19 : #include <int4_tensor.h>
20 : #include <q4_0_utils.h>
21 :
22 : #include <tensor.h>
23 : #include <util_func.h>
24 :
25 : #ifdef ENABLE_OPENCL
26 : #include "blas_kernels.h"
27 : #endif
28 :
29 : namespace nntrainer {
30 :
31 282558 : FloatTensor::FloatTensor(std::string name_, Tformat fm) :
32 565116 : TensorBase(name_, fm, Tdatatype::FP32) {}
33 :
34 375559 : FloatTensor::FloatTensor(const TensorDim &d, bool alloc_now, Initializer init,
35 375559 : std::string name) :
36 375559 : TensorBase(d, alloc_now, init, name) {
37 375559 : if (alloc_now)
38 341388 : allocate();
39 375559 : }
40 :
41 335977 : FloatTensor::FloatTensor(const TensorDim &d, const void *buf) :
42 335977 : FloatTensor(d, true) {
43 335977 : if (d.getDataLen() != 0) {
44 335977 : if (buf != nullptr)
45 13863 : copy(buf);
46 : }
47 335977 : }
48 :
49 10794 : bool FloatTensor::operator==(const FloatTensor &rhs) const {
50 10794 : const float *_data = (float *)getData();
51 10794 : const float *_rdata = (float *)rhs.getData();
52 1403019 : for (size_t i = 0; i < size(); ++i) {
53 : /** not checking sign change is intentional to avoid float calculation
54 : * errors around 0 */
55 1392265 : if (std::isnan(_data[i]) || std::isnan(_rdata[i]) ||
56 1392264 : std::fabs(_data[i] - _rdata[i]) > epsilon)
57 : return false;
58 : }
59 :
60 : return true;
61 : }
62 :
63 : /// @todo support allocation by src_tensor
64 643255 : void FloatTensor::allocate() {
65 643255 : if (empty() || data)
66 : return;
67 :
68 643245 : if (src_tensor) {
69 : /// allocate data based on the source tensor
70 301850 : allocateSrcTensor();
71 : /** as this memory is shared, do NOT initialize */
72 : } else {
73 : /// allocate new memory for the tensor data
74 : MemoryData *mem_data;
75 :
76 4522423516 : mem_data = new MemoryData((void *)(new float[dim.getDataLen()]{}));
77 341395 : data = std::shared_ptr<MemoryData>(mem_data, [](auto *mem_data) {
78 341395 : delete[] mem_data->template getAddr<float>();
79 341395 : delete mem_data;
80 : });
81 :
82 341395 : offset = 0;
83 341395 : initialize();
84 : }
85 : }
86 :
87 906 : void FloatTensor::deallocate() {
88 : data = nullptr;
89 906 : offset = 0;
90 906 : }
91 :
92 136172122 : void *FloatTensor::getData() const {
93 136172122 : if (!data)
94 : return nullptr;
95 :
96 : data->validate();
97 136170503 : return data->getAddr<float>() + offset;
98 : }
99 :
100 0 : void *FloatTensor::getData(size_t idx) const {
101 0 : if (!data)
102 : return nullptr;
103 :
104 : data->validate();
105 0 : return data->getAddr<float>() + offset + idx;
106 : }
107 :
108 5427307 : void *FloatTensor::getAddress(unsigned int i) {
109 5427307 : size_t index = getIndex(batch(), channel(), height(), width());
110 5427307 : if (i > index) {
111 : return nullptr;
112 : }
113 5427307 : return &((float *)getData())[i];
114 : }
115 :
116 148414 : const void *FloatTensor::getAddress(unsigned int i) const {
117 148414 : size_t index = getIndex(batch(), channel(), height(), width());
118 148414 : if (i > index) {
119 : return nullptr;
120 : }
121 148414 : return &((float *)getData())[i];
122 : }
123 :
124 4584 : const float &FloatTensor::getValue(unsigned int i) const {
125 4584 : return ((float *)getData())[i];
126 : }
127 :
128 1291 : float &FloatTensor::getValue(unsigned int i) { return ((float *)getData())[i]; }
129 :
130 4584 : const float &FloatTensor::getValue(unsigned int b, unsigned int c,
131 : unsigned int h, unsigned int w) const {
132 4584 : return getValue(getIndex(b, c, h, w));
133 : }
134 :
135 1260 : float &FloatTensor::getValue(unsigned int b, unsigned int c, unsigned int h,
136 : unsigned int w) {
137 1260 : return getValue(getIndex(b, c, h, w));
138 : }
139 :
140 81317 : void FloatTensor::setValue(float value) {
141 81317 : float *data = (float *)getData();
142 81317 : std::fill(data, data + size(), value);
143 81317 : }
144 :
145 35425636 : void FloatTensor::setValue(unsigned int b, unsigned int c, unsigned int h,
146 : unsigned int w, float value) {
147 35425636 : ((float *)getData())[getIndex(b, c, h, w)] = value;
148 35425636 : }
149 :
150 10913 : void FloatTensor::addValue(unsigned int b, unsigned int c, unsigned int h,
151 : unsigned int w, float value, float beta) {
152 10913 : auto const &idx = getIndex(b, c, h, w);
153 10913 : ((float *)getData())[idx] *= beta;
154 10913 : ((float *)getData())[idx] += value;
155 10913 : }
156 :
157 42378 : void FloatTensor::setZero() {
158 42378 : if (contiguous) {
159 : // sscal(size(), 0, getData<float>(), 1);
160 :     /// @note we cannot use sscal to set zero: if the data contains inf or
161 :     /// NaN, scaling by zero does not clear it.
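         :     /// (In IEEE 754 arithmetic, 0.0f * INFINITY and 0.0f * NaN both evaluate
         :     /// to NaN, so scaling by zero cannot clear such values; memset reliably
         :     /// zeroes the buffer.)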
162 42378 : memset((float *)getData(), 0, sizeof(float) * size());
163 : } else {
164 : /// @todo implement apply_i
165 : // apply_i<float>([](float val) -> float { return 0; });
166 0 : setValue(0);
167 : }
168 42378 : }
169 :
170 110 : void FloatTensor::setRandNormal(float mean, float stddev) {
171 110 : setDist<std::normal_distribution<float>>(
172 : std::normal_distribution<float>(mean, stddev));
173 110 : }
174 :
175 18101 : void FloatTensor::setRandUniform(float min, float max) {
176 18101 : setDist<std::uniform_real_distribution<float>>(
177 : std::uniform_real_distribution<float>(min, max));
178 18101 : }
179 :
180 3 : void FloatTensor::setRandBernoulli(float probability) {
181 3 : setDist<std::bernoulli_distribution>(
182 : std::bernoulli_distribution(probability));
183 3 : }
184 :
185 360584 : void FloatTensor::initialize() {
186 360584 : if (empty() || !isAllocated())
187 : return;
188 :
189 : unsigned int fan_in, fan_out;
190 :
191 :   /// @fixme: when unit is equal to one, this does not work; we need to rely on
192 :   /// the effective dimension rather than the actual numbers here. For now, some
193 :   /// heuristics are added to infer fan_in/fan_out
194 360583 : if (dim.batch() * dim.channel() * dim.height() == 1) {
195 233468 : fan_out = fan_in = dim.width();
196 127115 : } else if (dim.batch() * dim.channel() == 1) { /// fc layer - 2-D tensor
197 12222 : fan_in = dim.height();
198 12222 : fan_out = dim.width();
199 : } else { /// conv2d filters - 4d tensor, @todo extend this to > 4
200 114893 : auto field_size = dim.height() * dim.width();
201 :
202 :     // this also handles the cases below:
203 :     // 1. fan_in = fan_out = 1
204 :     // 2. batch == 1, channel == 1 and height == 1 (theoretical rank of 1)
205 114893 : fan_in = dim.channel() * field_size;
206 114893 : fan_out = dim.batch() * field_size;
207 : }
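         :   // Worked example: for conv2d filters of dim [64, 3, 3, 3] in NCHW,
         :   // field_size = 3 * 3 = 9, so fan_in = 3 * 9 = 27 and fan_out = 64 * 9 = 576.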
208 :
209 360583 : switch (initializer) {
210 5666 : case Initializer::ZEROS:
211 5666 : setZero();
212 5666 : break;
213 242 : case Initializer::ONES:
214 242 : setValue(1.0f);
215 242 : break;
216 0 : case Initializer::LECUN_NORMAL:
217 0 : setRandNormal(0.0f, sqrtFloat(1.0f / fan_in));
218 0 : break;
219 0 : case Initializer::XAVIER_NORMAL:
220 0 : setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in + fan_out)));
221 0 : break;
222 2 : case Initializer::HE_NORMAL:
223 2 : setRandNormal(0.0f, sqrtFloat(2.0f / (fan_in)));
224 2 : break;
225 0 : case Initializer::LECUN_UNIFORM:
226 0 : setRandUniform(-1.0f * sqrtFloat(1.0f / fan_in), sqrtFloat(1.0f / fan_in));
227 0 : break;
228 2294 : case Initializer::XAVIER_UNIFORM:
229 2294 : setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in + fan_out)),
230 2294 : sqrtFloat(6.0 / (fan_in + fan_out)));
231 2294 : break;
232 0 : case Initializer::HE_UNIFORM:
233 0 : setRandUniform(-1.0f * sqrtFloat(6.0f / (fan_in)),
234 0 : sqrtFloat(6.0 / (fan_in)));
235 : default:
236 : break;
237 : }
238 :
239 360583 : putData();
240 : }
241 :
242 7 : void FloatTensor::initialize(Initializer init) {
243 7 : initializer = init;
244 7 : initialize();
245 7 : }
246 :
247 70014 : Tensor &FloatTensor::apply(std::function<float(float)> f,
248 : Tensor &output) const {
249 83169 : CREATE_IF_EMPTY_DIMS(output, dim, nullptr);
250 :
251 70014 : if (contiguous && output.getContiguous()) {
252 64137 : const float *data = (float *)getData();
253 : float *rdata = output.getData<float>();
254 :
255 128274 : std::transform(data, data + size(), rdata, f);
256 5877 : } else if (strides[3] == 1 && output.getStrides()[3] == 1) {
257 :     /** @todo optimize this by combining these loops where stride is 1 */
258 12768 : for (unsigned int b = 0; b < batch(); ++b) {
259 13782 : for (unsigned int c = 0; c < channel(); ++c) {
260 13800 : for (unsigned int h = 0; h < height(); ++h) {
261 6909 : float *out_data = output.getAddress<float>(b, c, h, 0);
262 6909 : const float *in_data = (float *)getAddress(getIndex(b, c, h, 0));
263 20727 : std::transform(in_data, in_data + width(), out_data, f);
264 : }
265 : }
266 : }
267 : } else {
268 0 : for (unsigned int b = 0; b < batch(); ++b) {
269 0 : for (unsigned int c = 0; c < channel(); ++c) {
270 0 : for (unsigned int h = 0; h < height(); ++h) {
271 0 : for (unsigned int w = 0; w < width(); ++w) {
272 0 : output.setValue(b, c, h, w, f(getValue(b, c, h, w)));
273 : }
274 : }
275 : }
276 : }
277 : }
278 :
279 70014 : return output;
280 : }
281 :
282 12608 : Tensor FloatTensor::multiply_strided(Tensor const &m, Tensor &output,
283 : const float beta) const {
284 12649 : CREATE_IF_EMPTY_DIMS(output, dim, nullptr);
285 :
286 12608 : if (size() != m.size() || size() != output.size())
287 : throw std::invalid_argument(
288 3 : "Strided multiplication does not support broadcasting");
289 :
290 12606 : NNTR_THROW_IF(getData() == nullptr, std::invalid_argument)
291 : << getName() << " is not allocated";
292 12605 : NNTR_THROW_IF(m.getData<float>() == nullptr, std::invalid_argument)
293 1 : << m.getName() << " is not allocated";
294 12604 : NNTR_THROW_IF(output.getData<float>() == nullptr, std::invalid_argument)
295 1 : << output.getName() << " is not allocated";
296 :
297 12602 : if (strides[3] != 1 || m.getStrides()[3] != 1 ||
298 37806 : output.getStrides()[3] != 1 || std::fpclassify(beta) != FP_ZERO) {
299 2834 : for (unsigned int b = 0; b < batch(); ++b) {
300 3048 : for (unsigned int c = 0; c < channel(); ++c) {
301 3054 : for (unsigned int h = 0; h < height(); ++h) {
302 5655 : for (unsigned int w = 0; w < width(); ++w) {
303 4125 : output.addValue(
304 4125 : b, c, h, w, getValue(b, c, h, w) * m.getValue<float>(b, c, h, w),
305 : beta);
306 : }
307 : }
308 : }
309 : }
310 : } else {
311 : /** @todo optimize by combining these loops where stride is 1 */
312 11292 : if (getFormat() == Tformat::NCHW) {
313 29292 : for (unsigned int b = 0; b < batch(); ++b) {
314 44925 : for (unsigned int c = 0; c < channel(); ++c) {
315 167899 : for (unsigned int h = 0; h < height(); ++h) {
316 140974 : float *out_data = output.getAddress<float>(b, c, h, 0);
317 140974 : const float *m_data = m.getAddress<float>(b, c, h, 0);
318 140974 : const float *in_data = (float *)getAddress(getIndex(b, c, h, 0));
319 140974 : std::transform(in_data, in_data + width(), m_data, out_data,
320 : std::multiplies<float>());
321 : }
322 : }
323 : }
324 : } else {
325 0 : for (unsigned int b = 0; b < batch(); ++b) {
326 0 : for (unsigned int h = 0; h < height(); ++h) {
327 0 : for (unsigned int w = 0; w < width(); ++w) {
328 0 : float *out_data = output.getAddress<float>(b, 0, h, w);
329 0 : const float *m_data = m.getAddress<float>(b, 0, h, w);
330 0 : const float *in_data = (float *)getAddress(getIndex(b, 0, h, w));
331 0 : std::transform(in_data, in_data + channel(), m_data, out_data,
332 : std::multiplies<float>());
333 : }
334 : }
335 : }
336 : }
337 : }
338 :
339 12602 : return output;
340 : }
341 :
342 2609 : int FloatTensor::multiply_i(float const &value) {
343 2609 : float *data = (float *)getData();
344 2609 : unsigned int len = size();
345 :
346 2609 : sscal(len, value, data, 1);
347 :
348 2609 : return ML_ERROR_NONE;
349 : }
350 :
351 7691 : Tensor &FloatTensor::multiply(float const &value, Tensor &out) const {
352 : auto f = std::bind(std::multiplies<float>(), std::placeholders::_1, value);
353 7691 : apply(f, out);
354 7691 : return out;
355 : }
356 :
357 17213 : Tensor &FloatTensor::multiply(Tensor const &m, Tensor &output,
358 : const float beta) const {
359 : auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
360 : float *out_buf) {
361 29863 : ele_mul(e.buffer_size, buf, m_buf, out_buf, 1, beta, e.strides[3],
362 29863 : strides[3]);
363 : };
364 :
365 17213 : NNTR_THROW_IF(m.getFormat() != this->getFormat(), std::invalid_argument)
366 : << "Tensor Format of " << getName() << ":"
367 :     << ((bool)(this->getFormat()) ? "NHWC" : "NCHW") << " does not match. ("
368 0 : << ((bool)(m.getFormat()) ? "NHWC" : "NCHW") << ")";
369 :
370 17213 : NNTR_THROW_IF(!contiguous || !m.getContiguous() || !output.getContiguous(),
371 : std::invalid_argument)
372 : << getName() << " is not contiguous, cannot multiply";
377 :
378 17213 : apply_broadcast(m, f, output);
379 17074 : return output;
380 : }
381 :
382 6273 : Tensor &FloatTensor::divide(float const &value, Tensor &output) const {
383 : auto f = std::bind(std::divides<float>(), std::placeholders::_1, value);
384 6273 : apply(f, output);
385 6273 : return output;
386 : }
387 :
388 198 : Tensor &FloatTensor::divide(Tensor const &m, Tensor &output) const {
389 : auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
390 : float *out_buf) {
391 333 : ele_div(e.buffer_size, buf, m_buf, out_buf, 1, 0, e.strides[3], strides[3]);
392 : };
393 :
394 198 : apply_broadcast(m, f, output);
395 190 : return output;
396 : }
397 :
398 211 : Tensor &FloatTensor::add_strided(Tensor const &input, Tensor &output,
399 : const float beta) const {
400 212 : NNTR_THROW_IF(getData() == nullptr, std::invalid_argument)
401 : << getName() << " is not allocated";
402 210 : NNTR_THROW_IF(input.getData<float>() == nullptr, std::invalid_argument)
403 0 : << input.getName() << " is not allocated";
404 211 : NNTR_THROW_IF(output.getData<float>() == nullptr, std::invalid_argument)
405 1 : << output.getName() << " is not allocated";
406 :
407 209 : if (strides[3] != 1 || input.getStrides()[3] != 1 ||
408 627 : output.getStrides()[3] != 1 || std::fpclassify(beta) != FP_ZERO) {
409 112 : for (unsigned int b = 0; b < batch(); ++b) {
410 174 : for (unsigned int c = 0; c < channel(); ++c) {
411 198 : for (unsigned int h = 0; h < height(); ++h) {
412 567 : for (unsigned int w = 0; w < width(); ++w) {
413 459 : output.setValue(b, c, h, w,
414 459 : getValue(b, c, h, w) +
415 459 : input.getValue<float>(b, c, h, w) * beta);
416 : }
417 : }
418 : }
419 : }
420 : } else {
421 :     /** @todo optimize this by combining these loops where stride is 1 */
422 181 : if (this->getFormat() == Tformat::NCHW) {
423 616 : for (unsigned int b = 0; b < batch(); ++b) {
424 876 : for (unsigned int c = 0; c < channel(); ++c) {
425 972 : for (unsigned int h = 0; h < height(); ++h) {
426 531 : float *out_data = output.getAddress<float>(b, c, h, 0);
427 531 : const float *in_data = input.getAddress<float>(b, c, h, 0);
428 531 : const float *_data = (float *)getAddress(getIndex(b, c, h, 0));
429 531 : std::transform(_data, _data + width(), in_data, out_data,
430 : std::plus<float>());
431 : }
432 : }
433 : }
434 : } else {
435 0 : for (unsigned int b = 0; b < batch(); ++b) {
436 0 : for (unsigned int h = 0; h < height(); ++h) {
437 0 : for (unsigned int w = 0; w < width(); ++w) {
438 0 : float *out_data = output.getAddress<float>(b, 0, h, w);
439 0 : const float *in_data = input.getAddress<float>(b, 0, h, w);
440 0 : const float *_data = (float *)getAddress(getIndex(b, 0, h, w));
441 0 : std::transform(_data, _data + channel(), in_data, out_data,
442 : std::plus<float>());
443 : }
444 : }
445 : }
446 : }
447 : }
448 :
449 209 : return output;
450 : }
451 :
452 2698 : int FloatTensor::add_i_partial(unsigned int len, unsigned int addr_idx,
453 : Tensor &m, unsigned int incX, unsigned int incY,
454 : const Tensor alphas, unsigned int alpha_idx) {
455 2698 : saxpy(len, alphas.getValue<float>(alpha_idx), m.getData<float>(), incX,
456 2698 : (float *)getAddress(addr_idx), incY);
457 :
458 2698 : return ML_ERROR_NONE;
459 : }
460 :
461 7432 : Tensor &FloatTensor::add(float const &value, Tensor &output) const {
462 : auto f = std::bind(std::plus<float>(), std::placeholders::_1, value);
463 7432 : apply(f, output);
464 7432 : return output;
465 : }
466 :
467 59384 : Tensor &FloatTensor::add(Tensor const &m, Tensor &output,
468 : float const alpha) const {
469 : auto f = [&](const BroadcastInfo &e, const float *buf, const float *m_buf,
470 : float *out_buf) {
471 112969 : ele_add(e.buffer_size, buf, m_buf, out_buf, alpha, 0, e.strides[3],
472 112969 : strides[3]);
473 : };
474 59384 : apply_broadcast(m, f, output);
475 59353 : return output;
476 : }
477 :
478 174 : Tensor &FloatTensor::subtract(float const &value, Tensor &output) const {
479 : auto f = std::bind(std::minus<float>(), std::placeholders::_1, value);
480 174 : apply(f, output);
481 174 : return output;
482 : }
483 :
484 453 : void FloatTensor::sum_by_batch(Tensor &output) const {
485 453 : size_t feat_len = dim.getFeatureLen();
486 453 : size_t batch = dim.batch();
487 :
488 453 : const float *data = (float *)getData();
489 : float *out_data = output.getData<float>();
490 :
491 453 : Tensor ones(1, 1, 1, feat_len, this->getFormat());
492 453 : ones.setValue(1.0);
493 453 : sgemv((unsigned int)dim.getStorageOrder(), false, (int)batch, (int)feat_len,
494 : 1, data, (int)feat_len, ones.getData<float>(), 1, 0.0, out_data, 1);
495 453 : }
496 :
497 126666 : Tensor &FloatTensor::sum(unsigned int axis, Tensor &output, float alpha,
498 : float beta) const {
499 126666 : const float *data = (float *)getData();
500 :
501 126666 : NNTR_THROW_IF(!contiguous, std::invalid_argument)
502 : << getName() << " is not contiguous, cannot sum";
503 :
504 126666 : if (axis >= 4)
505 3 : throw std::out_of_range("Error: axis is invalid");
506 :
507 126663 : if (dim.getDim()[axis] == 1 and alpha == 1.0 and !beta) {
508 111118 : CREATE_IF_EMPTY_DIMS(output, dim);
509 58861 : scopy(size(), (float *)getData(), 1, output.getData<float>(), 1);
510 58861 : return output;
511 : }
512 :
513 67802 : switch (axis) {
514 1654 : case 0: {
515 1939 : CREATE_IF_EMPTY_DIMS(output, 1, dim.channel(), dim.height(), dim.width(),
516 : getTensorType());
517 1654 : size_t feat_len = dim.getFeatureLen();
518 1654 : size_t batch = dim.batch();
519 1654 : Tensor ones(1, 1, 1, batch, getTensorType());
520 1654 : ones.setValue(alpha);
521 1654 : sgemv((unsigned int)dim.getStorageOrder(), true, (int)batch, (int)feat_len,
522 : 1, data, (int)feat_len, ones.getData<float>(), 1, beta,
523 : output.getData<float>(), 1);
524 1654 : } break;
525 64 : case 1: {
526 90 : CREATE_IF_EMPTY_DIMS(output, dim[0], 1, dim[2], dim[3], getTensorType());
527 64 : if (this->getFormat() == Tformat::NHWC) {
528 0 : unsigned int feat_len = output.getDim().getDataLen();
529 0 : unsigned int t_axis = dim[1];
530 0 : Tensor ones(1, 1, 1, t_axis, getTensorType());
531 0 : ones.setValue(alpha);
532 0 : sgemv((unsigned int)dim.getStorageOrder(), false, (int)feat_len,
533 : (int)t_axis, 1, data, (int)t_axis, ones.getData<float>(), 1, beta,
534 : output.getData<float>(), 1);
535 0 : } else {
536 64 : unsigned int feat_len = dim[2] * dim[3];
537 64 : unsigned int t_axis = dim[1];
538 64 : Tensor ones(1, 1, 1, t_axis, getTensorType());
539 64 : ones.setValue(alpha);
540 : float *rdata = output.getData<float>();
541 218 : for (unsigned int k = 0; k < dim[0]; ++k) {
542 154 : sgemv((unsigned int)dim.getStorageOrder(), true, (int)t_axis,
543 154 : (int)feat_len, 1, &data[k * dim.getFeatureLen()], (int)feat_len,
544 154 : ones.getData<float>(), 1, beta, &rdata[k * feat_len], 1);
545 : }
546 64 : }
547 : } break;
548 6658 : case 2: {
549 13211 : CREATE_IF_EMPTY_DIMS(output, dim[0], dim[1], 1, dim[3], getTensorType());
550 : if (this->getFormat() == Tformat::NHWC) {
551 0 : unsigned int feat_len = dim[1] * dim[3];
552 0 : unsigned int t_axis = dim[2];
553 0 : Tensor ones(1, 1, 1, t_axis, getTensorType());
554 0 : ones.setValue(alpha);
555 : float *rdata = output.getData<float>();
556 0 : for (unsigned int k = 0; k < dim[0]; ++k) {
557 0 : sgemv((unsigned int)dim.getStorageOrder(), true, (int)t_axis,
558 0 : (int)feat_len, 1, &data[k * dim.getFeatureLen()], (int)feat_len,
559 0 : ones.getData<float>(), 1, beta, &rdata[k * feat_len], 1);
560 : }
561 0 : } else {
562 6658 : unsigned int t_3 = dim[3];
563 6658 : unsigned int t_axis = dim[2];
564 6658 : Tensor ones(1, 1, 1, t_axis, getTensorType());
565 6658 : ones.setValue(alpha);
566 :
567 6658 : if (dim.getStorageOrder() == TStorageOrder::ROW_MAJOR) {
568 : float *rdata = output.getData<float>();
569 13364 : for (unsigned int k = 0; k < dim[0]; ++k) {
570 13516 : for (unsigned int c = 0; c < dim[1]; ++c) {
571 6810 : unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[2];
572 : unsigned int ridx =
573 6810 : k * output.getDim().getFeatureLen() + c * dim[3];
574 :
575 13620 : sgemv((unsigned int)dim.getStorageOrder(), true, (int)t_axis,
576 6810 : (int)t_3, 1, &data[idx], (int)t_3, ones.getData<float>(), 1,
577 6810 : beta, &rdata[ridx], 1);
578 : }
579 : }
580 : } else {
581 0 : sgemv((unsigned int)dim.getStorageOrder(), true, (int)t_axis,
582 0 : (int)output.getDim().getDataLen(), 1, data, (int)t_axis,
583 : ones.getData<float>(), 1, beta, output.getData<float>(), 1);
584 : }
585 6658 : }
586 : } break;
587 59426 : case 3: {
588 117785 : CREATE_IF_EMPTY_DIMS(output, dim[0], dim[1], dim[2], 1,
589 : this->getTensorType());
590 59426 : if (this->getFormat() == Tformat::NHWC) {
591 0 : unsigned int t_3 = dim[1];
592 0 : unsigned int t_axis = dim[3];
593 0 : Tensor ones(1, 1, 1, t_axis, getTensorType());
594 0 : ones.setValue(alpha);
595 : float *rdata = output.getData<float>();
596 0 : for (unsigned int k = 0; k < dim[0]; ++k) {
597 0 : for (unsigned int c = 0; c < dim[2]; ++c) {
598 0 : unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[1];
599 0 : unsigned int ridx = k * output.getDim().getFeatureLen() + c * dim[1];
600 0 : sgemv((unsigned int)dim.getStorageOrder(), true, (int)t_axis,
601 0 : (int)t_3, 1, &data[idx], (int)t_3, ones.getData<float>(), 1,
602 0 : beta, &rdata[ridx], 1);
603 : }
604 : }
605 0 : } else {
606 59426 : unsigned int m = output.getDim().getDataLen();
607 59426 : unsigned int n = dim[3];
608 59426 : Tensor ones(1, 1, 1, n, getTensorType());
609 59426 : ones.setValue(alpha);
610 :
611 59426 : if (dim.getStorageOrder() == TStorageOrder::ROW_MAJOR) {
612 59426 : sgemv((unsigned int)dim.getStorageOrder(), false, (int)m, (int)n, 1,
613 : data, (int)n, ones.getData<float>(), 1, beta,
614 : output.getData<float>(), 1);
615 : } else {
616 : float *rdata = output.getData<float>();
617 :
618 0 : for (unsigned int k = 0; k < dim[0]; ++k) {
619 0 : for (unsigned int c = 0; c < dim[1]; ++c) {
620 0 : unsigned int idx = k * dim.getFeatureLen() + c * dim[3] * dim[2];
621 0 : unsigned int ridx = k * dim[1] * dim[2] + c * dim[2];
622 :
623 0 : sgemv((unsigned int)dim.getStorageOrder(), false, (int)dim[2],
624 0 : (int)n, 1, &data[idx], (int)dim[2], ones.getData<float>(), 1,
625 0 : beta, &rdata[ridx], 1);
626 : }
627 : }
628 : }
629 59426 : }
630 : } break;
631 : default:
632 : throw std::out_of_range("Error: Dimension cannot exceed 3");
633 : }
634 :
635 : return output;
636 : }
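         : /*
         :  * Illustrative example (exposition only, not part of the build): summing an
         :  * NCHW tensor of dim [2, 3, 4, 5] over axis 1 with alpha = 1 yields a
         :  * [2, 1, 4, 5] tensor; each batch's (3 x 20) block is reduced over the
         :  * channel axis by an sgemv against a ones vector of length 3.
         :  */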
637 :
638 0 : Tensor &FloatTensor::abs(Tensor &output) const {
639 : auto f = [](float in) { return std::abs(in); };
640 0 : apply(f, output);
641 0 : return output;
642 : }
643 :
644 1931 : float FloatTensor::l2norm() const {
645 1931 : return snrm2(size(), (float *)getData(), 1);
646 : }
647 :
648 790 : Tensor &FloatTensor::pow(float exponent, Tensor &output) const {
649 128994 : auto f = [exponent](float in) { return powf(in, exponent); };
650 790 : apply(f, output);
651 790 : return output;
652 : }
653 :
654 6 : Tensor &FloatTensor::sqrt(Tensor &output) const {
655 : auto f = [](float in) { return std::sqrt(in); };
656 6 : apply(f, output);
657 6 : return output;
658 : }
659 :
660 1 : Tensor &FloatTensor::erf(Tensor &output) const {
661 : auto f = [](float in) { return std::erf(in); };
662 1 : apply(f, output);
663 1 : return output;
664 : }
665 :
666 11 : void FloatTensor::sin(Tensor &out, float alpha) {
667 11 : if (!contiguous) {
668 90 : auto f = [alpha](float val) -> float { return std::sin(alpha * val); };
669 2 : apply(f, out);
670 : } else {
671 10 : sine(size(), (float *)getData(), out.getData<float>(), alpha);
672 : }
673 11 : }
674 :
675 14 : void FloatTensor::cos(Tensor &out, float alpha) {
676 14 : if (!contiguous) {
677 90 : auto f = [alpha](float val) -> float { return std::cos(alpha * val); };
678 2 : apply(f, out);
679 : } else {
680 13 : cosine(size(), (float *)getData(), out.getData<float>(), alpha);
681 : }
682 14 : }
683 :
684 6 : void FloatTensor::tan(Tensor &output, float alpha) {
685 12 : auto f = [alpha](float val) -> float { return std::tan(alpha * val); };
686 6 : apply(f, output);
687 6 : }
688 :
689 4 : void FloatTensor::inv_sqrt(Tensor &out) {
690 4 : apply([](float val) -> float { return 1 / std::sqrt(val); }, out);
691 4 : }
692 :
693 36830 : Tensor &FloatTensor::dot(Tensor const &input, Tensor &output, bool trans,
694 : bool trans_in, float beta) const {
695 : /**
696 : * @note FP32.dot(input);
697 :    * the invoked kernel varies according to the input type.
698 : */
699 36830 : switch (input.getDataType()) {
700 : /** applying sgemm/sgemv after type casting to FP32 */
701 36824 : case Tdatatype::FP32:
702 36824 : dotFloat(input, output, trans, trans_in, beta);
703 36821 : break;
704 0 : case Tdatatype::FP16:
705 0 : dotFloat32Float16(input, output, trans, trans_in, beta);
706 0 : break;
707 : /** applying gemm_q4_k / gemm_q6_k / gemm_q4_0 */
708 6 : case Tdatatype::Q4_K:
709 : case Tdatatype::Q6_K:
710 : case Tdatatype::Q4_0:
711 6 : dotQnK(input, output, trans, trans_in, beta, input.getDataType());
712 6 : break;
713 0 : case Tdatatype::QINT16:
714 : case Tdatatype::QINT8:
715 : case Tdatatype::QINT4:
716 0 : dotQInteger(input, output, trans, trans_in, beta, input.getDataType());
717 0 : break;
718 0 : default:
719 0 : throw std::invalid_argument("Error: unsupported datatype");
720 : }
721 36827 : return output;
722 : }
723 :
724 0 : void FloatTensor::dot(std::vector<Tensor *> input, std::vector<Tensor *> output,
725 : bool trans, bool trans_in, float beta) const {
726 0 : float *data = (float *)getData();
727 0 : unsigned int M = getDim().height();
728 0 : unsigned int K = getDim().width();
729 0 : Tdatatype input_dtype = input[0]->getDataType();
730 :
731 : // Handle standard inputs
732 0 : if (input_dtype != Tdatatype::Q4_0 && input_dtype != Tdatatype::QINT4) {
733 0 : for (unsigned int i = 0; i < input.size(); ++i) {
734 0 : dot(*input[i], *output[i], trans, trans_in, beta);
735 : }
736 0 : return;
737 : }
738 :
739 : std::vector<unsigned int> Ns;
740 : std::vector<void *> mdatas;
741 : std::vector<float *> rdatas;
742 :
743 0 : for (unsigned int i = 0; i < input.size(); ++i) {
744 0 : Ns.push_back(input[i]->getDim().width());
745 0 : mdatas.push_back((void *)input[i]->getData<uint8_t>());
746 0 : rdatas.push_back(output[i]->getData<float>());
747 : }
748 :
749 : #ifdef ENABLE_OPENCL
750 : if (input_dtype == Tdatatype::Q4_0) {
751 : if (M == 1) {
752 : for (unsigned int i = 0; i < input.size(); ++i) {
753 : gemm_q4_0(M, Ns[i], K, data, K, mdatas[i], Ns[i], rdatas[i], Ns[i]);
754 : }
755 : } else {
756 : gemm_q4_0_async_cl(mdatas, data, rdatas, M, Ns, K);
757 : }
758 : } else { // QINT4
759 : /// Run on GPU only when memory is a Shared Virual Memory
760 :       /// Run on GPU only when the memory is Shared Virtual Memory (SVM)
761 : output[0]->getMemoryData()->isSVM() && getMemoryData()->isSVM()) {
762 : std::vector<uint16_t *> scales;
763 : for (unsigned int i = 0; i < input.size(); ++i) {
764 : scales.push_back(input[i]->getScale<uint16_t>());
765 : }
766 : if (M == 1) {
767 : gemv_int4_async_cl(mdatas, scales, data, rdatas, K, Ns,
768 : Int4QTensor::getGroupSize());
769 : } else {
770 : openvino_gemm_async_cl(data, mdatas, scales, rdatas, M, Ns, K,
771 : Int4QTensor::getGroupSize());
772 : }
773 : } else {
774 : /// @todo This should be replaced with standard CPU INT4 computation
775 : for (unsigned int i = 0; i < input.size(); ++i) {
776 : gemm_q4_0(M, Ns[i], K, data, K, (void *)input[i]->getData(), Ns[i],
777 : rdatas[i], Ns[i]);
778 : }
779 : }
780 : }
781 : #else
782 0 : if (input_dtype == Tdatatype::Q4_0) {
783 : /// @todo Support multi-weight q4_0 for x64
784 0 : for (unsigned int i = 0; i < input.size(); ++i) {
785 0 : gemm_q4_0(M, Ns[i], K, data, K, mdatas[i], Ns[i], rdatas[i], Ns[i]);
786 : }
787 : } else { // QINT4
788 : /// @note It is essential to understand that this section of the code
789 : /// requires the `input` data to be converted to Q4_0 type, not QINT4 type.
790 : /// This should be replaced with standard CPU INT4 computation instead of
791 : /// using Q4_0.
792 0 : for (unsigned int i = 0; i < input.size(); ++i) {
793 0 : gemm_q4_0(M, Ns[i], K, data, K, (void *)input[i]->getData(), Ns[i],
794 : rdatas[i], Ns[i]);
795 : }
796 : }
797 : #endif
798 0 : }
799 :
800 36824 : Tensor &FloatTensor::dotFloat(Tensor const &input, Tensor &output, bool trans,
801 : bool trans_in, float beta) const {
802 :   // Commented out with the intention to support calculation w.r.t. the batch
803 :   // and height directions. It assumes this->dim is [BxCxH, W] and
804 :   // input.dim is [BxCxH, W] as well. if (input.dim.rank() > 2) {
805 : // throw exception::not_supported("Error: support only for rank of dot "
806 : // "matrix <= 2");
807 : // }
808 :
809 :   // Commented out with the intention to support calculation w.r.t. the batch
810 :   // and height directions of this tensor. It is OK as long as input is 2D
811 36824 : if (trans && dim.rank() > 2) {
812 932 : ml_logw("Warning: support only for rank of dot matrix <= 2 with trans");
813 : }
814 : unsigned int first_three_flat, last_axis, input_first_three_flat,
815 : input_last_axis, M, N, K, lda, ldb, ldc;
816 :
817 36824 : calculateFlattenDot(input, output, trans, trans_in, first_three_flat,
818 : last_axis, input_first_three_flat, input_last_axis, M, N,
819 : K, lda, ldb, ldc);
820 :
821 36821 : const float *data = (float *)getData();
822 : const float *mdata = input.getData<float>();
823 : float *rdata = output.getData<float>();
824 : const float alpha = 1.0f;
825 :
826 :   /// shortcut handling in case of a vector
827 :   /// for a vector, (1 * K) == (K * 1) in the current memory layout...
828 :   /// note that N, K, M are fixed placeholders after considering
829 :   /// transpose.
830 :   /// For example, there is no case like (1 * K) X (1 * K), while
831 :   /// (1 * K) X (1 * M) can be a case
832 : /// case1: (1 * K) X (K * 1)
833 36821 : if (M == 1 && N == 1) {
834 182 : *rdata =
835 182 : sdot(K, data, 1, mdata, 1) + ((0.0f == beta) ? 0.0f : beta * *rdata);
836 : }
837 : /// case2: (M * K) X (K * 1)
838 36639 : else if (N == 1) {
839 10955 : sgemv((unsigned int)dim.getStorageOrder(), trans, first_three_flat,
840 : last_axis, alpha, data, lda, mdata, 1, beta, rdata, 1);
841 : }
842 : /// case3: (1 * K) X (K * N) = 1 * N = R
843 : /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
844 : /// Effectively a translation of sgemv
845 25684 : else if (M == 1) {
846 8736 : sgemv((unsigned int)dim.getStorageOrder(), !trans_in,
847 : input_first_three_flat, input_last_axis, alpha, mdata, ldb, data, 1,
848 : beta, rdata, 1);
849 : }
850 : /// case others: use gemm
851 : else {
852 16948 : sgemm((unsigned int)dim.getStorageOrder(), trans, trans_in, M, N, K, alpha,
853 : data, lda, mdata, ldb, beta, rdata, ldc);
854 : }
855 :
856 36821 : return output;
857 : }
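         : /*
         :  * Illustrative shape walk-through (exposition only, shapes taken from the case
         :  * comments above): with this tensor of dim [1, 1, 4, 8] and an input of dim
         :  * [1, 1, 8, 1] with no transpose, M = 4, K = 8, N = 1, so case2 applies and a
         :  * single sgemv computes the (4 x 8) * (8 x 1) product.
         :  */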
858 :
859 0 : Tensor &FloatTensor::dotFloat32Float16(Tensor const &input, Tensor &output,
860 : bool trans, bool trans_in,
861 : float beta) const {
862 : /// @todo remove #ifdef ENABLE_FP16
863 : #ifdef ENABLE_FP16
864 :
865 :   // Commented out with the intention to support calculation w.r.t. the batch
866 :   // and height directions. It assumes this->dim is [BxCxH, W] and
867 :   // input.dim is [BxCxH, W] as well. if (input.dim.rank() > 2) {
868 : // throw exception::not_supported("Error: support only for rank of dot "
869 : // "matrix <= 2");
870 : // }
871 :
872 :   // Commented out with the intention to support calculation w.r.t. the batch
873 :   // and height directions of this tensor. It is OK as long as input is 2D
874 : if (trans && dim.rank() > 2) {
875 : ml_logw("Warning: support only for rank of dot matrix <= 2 with trans");
876 : }
877 : unsigned int first_three_flat, last_axis, input_first_three_flat,
878 : input_last_axis, M, N, K, lda, ldb, ldc;
879 :
880 : calculateFlattenDot(input, output, trans, trans_in, first_three_flat,
881 : last_axis, input_first_three_flat, input_last_axis, M, N,
882 : K, lda, ldb, ldc);
883 :
884 : const float *data = (float *)getData();
885 : const _FP16 *mdata = input.getData<_FP16>();
886 : float *rdata = output.getData<float>();
887 : const float alpha = 1.0f;
888 :
889 :   /// shortcut handling in case of a vector
890 :   /// for a vector, (1 * K) == (K * 1) in the current memory layout...
891 :   /// note that N, K, M are fixed placeholders after considering
892 :   /// transpose.
893 :   /// For example, there is no case like (1 * K) X (1 * K), while
894 :   /// (1 * K) X (1 * M) can be a case
895 : /// case1: (1 * K) X (K * 1)
896 : NNTR_THROW_IF((M == 1 && N == 1), std::invalid_argument)
897 :     << "dotFloat32Float16 does not support the (1 * K) X (K * 1) case";
898 : /// case2: (M * K) X (K * 1)
899 : if (N == 1) {
900 : shgemv((unsigned int)dim.getStorageOrder(), trans, first_three_flat,
901 : last_axis, alpha, data, lda, mdata, 1, beta, rdata, 1);
902 : }
903 : /// case3: (1 * K) X (K * N) = 1 * N = R
904 : /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
905 : /// Effectively a translation of sgemv
906 : else if (M == 1) {
907 : hsgemv((unsigned int)dim.getStorageOrder(), !trans_in,
908 : input_first_three_flat, input_last_axis, alpha, mdata, ldb, data, 1,
909 : beta, rdata, 1);
910 : }
911 : /// case others: use gemm
912 : else {
913 : shgemm((unsigned int)dim.getStorageOrder(), trans, trans_in, M, N, K, alpha,
914 : data, lda, mdata, ldb, beta, rdata, ldc);
915 : }
916 :
917 : return output;
918 : #else
919 0 : throw std::invalid_argument("Error: enable-fp16 is not enabled");
920 : #endif
921 : }
922 :
923 6 : Tensor &FloatTensor::dotQnK(Tensor const &input, Tensor &output, bool trans,
924 : bool trans_in, float beta, Tdatatype dtype) const {
925 : ///@note Be cautious.
926 : /// Qn_K does not support transpose in principle.
927 :   /// The trans option is only meant to describe the tensor dimensions,
928 :   /// not the data layout.
929 : ///@note trans is not yet applied
930 6 : NNTR_THROW_IF(trans, std::invalid_argument)
931 : << "dotQnK does not support trans";
932 :
933 6 : float *data = (float *)getData();
934 : uint8_t *mdata = input.getData<uint8_t>();
935 : float *rdata = output.getData<float>();
936 :
937 : unsigned int M, N, K;
938 6 : M = getDim().height();
939 6 : K = getDim().width();
940 6 : N = trans_in ? input.getDim().height() : input.getDim().width();
941 :
942 6 : switch (dtype) {
943 3 : case Tdatatype::Q4_K:
944 3 : gemm_q4_K(M, N, K, data, K, (void *)mdata, N, rdata, N);
945 3 : break;
946 2 : case Tdatatype::Q6_K:
947 2 : gemm_q6_K(M, N, K, data, K, (void *)mdata, N, rdata, N);
948 2 : break;
949 1 : case Tdatatype::Q4_0:
950 1 : M = getDim().height();
951 1 : K = getDim().width();
952 1 : N = input.getDim().width();
953 : #ifdef ENABLE_OPENCL
954 : if (M == 1) {
955 : gemm_q4_0(M, N, K, data, K, (void *)mdata, N, rdata, N);
956 : } else {
957 : gemm_q4_0_cl((void *)mdata, data, rdata, M, N, K);
958 : }
959 : #else
960 1 : gemm_q4_0(M, N, K, data, K, (void *)mdata, N, rdata, N);
961 : #endif
962 1 : break;
963 :
964 0 : default:
965 0 : throw std::invalid_argument("Error: unsupported datatype");
966 : }
967 :
968 6 : return output;
969 : }
970 :
971 0 : Tensor &FloatTensor::dotQInteger(Tensor const &input, Tensor &output,
972 : bool trans, bool trans_in, float beta,
973 : Tdatatype dtype) const {
974 :
975 0 : float *data = (float *)getData();
976 : char *mdata = input.getData<char>();
977 : float *rdata = output.getData<float>();
978 :
979 0 : unsigned int M = getDim().height();
980 0 : unsigned int K = getDim().width();
981 0 : unsigned int N = output.getDim().width();
982 :
983 : #ifndef ENABLE_OPENCL
984 : #ifdef ENABLE_FP16
985 : if (input.q_scheme() == QScheme::PER_CHANNEL_AFFINE) {
986 : uint32_t opt_kernel_idx = (M == 1) ? 1 : 5;
987 : nntr_gemm_qai8dxp_qsi4cxp_packed(
988 : M, N, K, (void *)data, (void *)mdata, rdata, opt_kernel_idx,
989 : true); /// @todo kernel supports both trans / noTrans situation
990 : } else {
991 : throw std::runtime_error(
992 : "Error: QINT4 Dot on CPU only supports PER_CHANNEL_AFFINE scheme");
993 : }
994 : #else
995 : /// @note It is essential to understand that this section of the code requires
996 : /// the `input` data to be converted to Q4_0 type, not QINT4 type. This should
997 : /// be replaced with standard CPU INT4 computation instead of using Q4_0.
998 0 : gemm_q4_0(M, N, K, data, K, (void *)input.getData(), N, rdata, N);
999 : #endif
1000 : #else
1001 : if (input.getMemoryData()->isSVM() && output.getMemoryData()->isSVM() &&
1002 : getMemoryData()->isSVM()) {
1003 : if (M == 1) {
1004 : gemv_int4_cl(mdata, input.getScale<uint16_t>(), data, rdata, K, N,
1005 : Int4QTensor::getGroupSize());
1006 : } else {
1007 : openvino_sgemm_cl(data, mdata, input.getScale<uint16_t>(), rdata, M, N, K,
1008 : Int4QTensor::getGroupSize());
1009 : }
1010 : } else {
1011 : /// @todo This should be replaced with standard CPU INT4 computation
1012 : gemm_q4_0(M, N, K, data, K, (void *)input.getData(), N, rdata, N);
1013 : }
1014 : #endif
1015 :
1016 0 : return output;
1017 : }
1018 :
1019 84504 : void FloatTensor::copy(const Tensor &from) {
1020 84504 : reshape(from.getDim());
1021 84504 : copy(from.getData<float>());
1022 84504 : }
1023 :
1024 13730 : void FloatTensor::copyData(const Tensor &from) {
1025 13730 : NNTR_THROW_IF(!contiguous, std::invalid_argument)
1026 : << getName() << " is not contiguous, cannot copy.";
1027 :
1028 13730 : NNTR_THROW_IF(size() != from.size(), std::invalid_argument)
1029 : << "Size of tensor to copy must match";
1030 :
1031 13730 : switch (from.getDataType()) {
1032 : case ml::train::TensorDim::DataType::FP32:
1033 13721 : copy(from.getData<float>());
1034 13721 : break;
1035 0 : case ml::train::TensorDim::DataType::FP16:
1036 : /// @todo remove #ifdef ENABLE_FP16
1037 : #ifdef ENABLE_FP16
1038 : scopy(size(), from.getData<_FP16>(), 1, (float *)getData(), 1);
1039 : #else
1040 0 : throw std::invalid_argument("Error: enable-fp16 is not enabled");
1041 : #endif
1042 : break;
1043 1 : case ml::train::TensorDim::DataType::QINT16:
1044 2 : copy_s16_fp32(from.size(), from.getData<int16_t>(), (float *)getData());
1045 1 : break;
1046 5 : case ml::train::TensorDim::DataType::QINT8:
1047 5 : scopy_int8_to_float32(from.size(), from.getData<int8_t>(), 1,
1048 5 : (float *)getData(), 1);
1049 5 : break;
1050 1 : case ml::train::TensorDim::DataType::UINT16:
1051 2 : copy_u16_fp32(from.size(), from.getData<uint16_t>(), (float *)getData());
1052 1 : break;
1053 2 : case ml::train::TensorDim::DataType::UINT8:
1054 2 : scopy_int8_to_float32(from.size(), from.getData<uint8_t>(), 1,
1055 2 : (float *)getData(), 1);
1056 2 : break;
1057 0 : default:
1058 : throw std::invalid_argument(
1059 0 : "[FloatTensor::copyData] Error: Unsupported data type");
1060 : break;
1061 : }
1062 13730 : }
1063 :
1064 3509 : void FloatTensor::copy_with_stride(const Tensor &input, Tensor &output) {
1065 7132 : for (unsigned int b = 0; b < output.batch(); ++b) {
1066 7246 : for (unsigned int c = 0; c < output.channel(); ++c) {
1067 16688 : for (unsigned int h = 0; h < output.height(); ++h) {
1068 91392 : for (unsigned int w = 0; w < output.width(); ++w) {
1069 78327 : output.setValue(b, c, h, w, input.getValue<float>(b, c, h, w));
1070 : }
1071 : }
1072 : }
1073 : }
1074 3509 : }
1075 :
1076 648 : std::vector<unsigned int> FloatTensor::argmax() const {
1077 : std::vector<unsigned int> result;
1078 648 : const float *data = (float *)getData();
1079 : size_t batch_size = batch();
1080 648 : size_t feature_len = dim.getFeatureLen();
1081 :
1082 648 : result.resize(batch_size);
1083 :
1084 8198 : for (unsigned int b = 0; b < batch_size; b++) {
1085 : auto max_iter =
1086 7550 : std::max_element(data + b * feature_len, data + (b + 1) * feature_len);
1087 7550 : result[b] = std::distance(data, max_iter) - (b * feature_len);
1088 : }
1089 648 : return result;
1090 0 : }
1091 :
1092 0 : std::vector<unsigned int> FloatTensor::argmin() const {
1093 : std::vector<unsigned int> result;
1094 0 : const float *data = (float *)getData();
1095 : size_t batch_size = batch();
1096 0 : size_t feature_len = dim.getFeatureLen();
1097 :
1098 0 : result.resize(batch_size);
1099 :
1100 0 : for (unsigned int b = 0; b < batch_size; b++) {
1101 : auto min_iter =
1102 0 : std::min_element(data + b * feature_len, data + (b + 1) * feature_len);
1103 0 : result[b] = std::distance(data, min_iter) - (b * feature_len);
1104 : }
1105 0 : return result;
1106 0 : }
1107 :
1108 6 : void FloatTensor::topK(unsigned int k, void *output_data,
1109 : uint32_t *indices_data) {
1110 : const auto &input_dim = getDim();
1111 : const Tformat format = input_dim.getFormat();
1112 6 : const auto batch = input_dim.batch();
1113 6 : const auto channel = input_dim.channel();
1114 6 : const auto height = input_dim.height();
1115 6 : const auto width = input_dim.width();
1116 :
1117 6 : if (k == 0 || k > width) {
1118 : throw std::invalid_argument(
1119 0 : "k must be greater than 0 and less than or equal to width");
1120 : }
1121 :
1122 : float *output_buffer = static_cast<float *>(output_data);
1123 :
1124 : // Calculate strides for input and output
1125 6 : const auto input_strides = input_dim.computeStrides();
1126 6 : TensorDim output_dim = input_dim;
1127 6 : output_dim.width(k);
1128 6 : const auto output_strides = output_dim.computeStrides();
1129 :
1130 : #ifdef _MSC_VER
1131 : #pragma warning(push)
1132 : #pragma warning(disable : 4849)
1133 : #endif
1134 6 : #pragma omp parallel for collapse(3)
1135 : #ifdef _MSC_VER
1136 : #pragma warning(pop)
1137 : #endif
1138 : for (int b = 0; b < static_cast<int>(batch); ++b) {
1139 : for (int c = 0; c < static_cast<int>(channel); ++c) {
1140 : for (int h = 0; h < static_cast<int>(height); ++h) {
1141 :
1142 : size_t offset;
1143 : if (format == Tformat::NCHW) {
1144 : // NCHW: [b][c][h][i]
1145 : offset =
1146 : b * input_strides[0] + c * input_strides[1] + h * input_strides[2];
1147 : } else {
1148 : // NHWC: [b][h][i][c]
1149 : offset = b * input_strides[0] + h * input_strides[1] + c;
1150 : }
1151 :
1152 : const unsigned int width_stride =
1153 : format == Tformat::NHWC ? input_strides[2] : 1;
1154 : const float *B = static_cast<const float *>(getData()) + offset;
1155 : std::vector<size_t> idx(width);
1156 : std::iota(idx.begin(), idx.end(), 0);
1157 : std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
1158 : [&B, width_stride](size_t i1, size_t i2) {
1159 103 : return B[i1 * width_stride] > B[i2 * width_stride];
1160 : });
1161 :
1162 : // write top-k values and their indices to output
1163 : for (unsigned int i = 0; i < k; ++i) {
1164 : size_t output_idx;
1165 : if (format == Tformat::NCHW) {
1166 : // NCHW: [b][c][h][i]
1167 : output_idx = b * output_strides[0] + c * output_strides[1] +
1168 : h * output_strides[2] + i;
1169 : } else {
1170 : // NHWC: [b][h][i][c]
1171 : output_idx = b * output_strides[0] + h * output_strides[1] +
1172 : i * output_strides[2] + c;
1173 : }
1174 : output_buffer[output_idx] = B[idx[i]];
1175 : indices_data[output_idx] = static_cast<uint32_t>(idx[i]);
1176 : }
1177 : }
1178 : }
1179 : }
1180 6 : }
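         : /*
         :  * Example (exposition only): for a [1, 1, 1, 8] tensor and k = 3, partial_sort
         :  * orders only the three largest entries along the width axis; the output gets
         :  * width 3 and indices_data records their original positions within the row.
         :  */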
1181 :
1182 3 : float FloatTensor::max_abs() const {
1183 3 : const float *data = (float *)getData();
1184 3 : unsigned int idx = isamax(size(), data, 1);
1185 3 : return *(data + idx);
1186 : }
1187 :
1188 169 : float FloatTensor::maxValue() const {
1189 169 : const float *data = (float *)getData();
1190 169 : return *std::max_element(data, data + size());
1191 : }
1192 :
1193 169 : float FloatTensor::minValue() const {
1194 169 : const float *data = (float *)getData();
1195 169 : return *std::min_element(data, data + size());
1196 : }
1197 :
1198 1114 : Tensor &FloatTensor::transpose(const std::string &direction,
1199 : Tensor &output) const {
1200 : unsigned int SL, SI, SJ, SK;
1201 :
1202 1114 : output.reshape(dim.transpose(direction));
1203 :
1204 1113 : int indexI = direction[0] - '0';
1205 1113 : int indexJ = direction[2] - '0';
1206 :
1207 1113 : SL = dim.batch(), SI = dim.channel(), SJ = dim.height(), SK = dim.width();
1208 :
1209 : bool is_format_nchw = (getFormat() == Tformat::NCHW);
1210 :
1211 1113 : const float *inptr = (float *)getData();
1212 : float *outptr = output.getData<float>();
1213 1113 : switch (indexI) {
1214 17 : case 0:
1215 17 : if (indexJ == 1) {
1216 2 : if (is_format_nchw) {
1217 308 : transposeloop(l, i, j, k, SL, SI, SJ, SK);
1218 : } else {
1219 0 : transposeloop_nhwc(l, j, k, i, SL, SJ, SK, SI);
1220 : }
1221 : } else {
1222 15 : if (is_format_nchw) {
1223 34 : for (unsigned int b = 0; b < batch(); ++b) {
1224 44 : for (unsigned int c = 0; c < channel(); ++c) {
1225 50 : transpose_matrix(
1226 25 : height(), width(), (float *)getData() + getIndex(b, c, 0, 0),
1227 25 : width(), (float *)output.getData() + output.getIndex(b, c, 0, 0),
1228 25 : output.width());
1229 : }
1230 : }
1231 : } else {
1232 0 : transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI);
1233 : }
1234 : }
1235 : break;
1236 1092 : case 1:
1237 1092 : if (indexJ == 0) {
1238 1086 : if (is_format_nchw) {
1239 118863 : transposeloop(l, j, i, k, SL, SJ, SI, SK);
1240 : } else {
1241 0 : transposeloop_nhwc(l, i, k, j, SL, SI, SK, SJ);
1242 : }
1243 : } else {
1244 6 : if (is_format_nchw) {
1245 2928 : transposeloop(l, j, k, i, SL, SJ, SK, SI);
1246 : } else {
1247 0 : transposeloop_nhwc(l, k, i, j, SL, SK, SI, SJ);
1248 : }
1249 : }
1250 : break;
1251 4 : case 2:
1252 4 : if (indexJ == 0) {
1253 2 : if (is_format_nchw) {
1254 338 : transposeloop(l, k, i, j, SL, SK, SI, SJ);
1255 : } else {
1256 0 : transposeloop_nhwc(l, i, j, k, SL, SI, SJ, SK);
1257 : }
1258 : } else {
1259 2 : if (is_format_nchw) {
1260 398 : transposeloop(l, k, j, i, SL, SK, SJ, SI);
1261 : } else {
1262 0 : transposeloop_nhwc(l, j, i, k, SL, SJ, SI, SK);
1263 : }
1264 : }
1265 : break;
1266 : }
1267 :
1268 1113 : return output;
1269 : }
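         : /*
         :  * Illustrative example (assuming the usual "i:j:k" direction encoding): with
         :  * direction "1:0:2", an NCHW tensor of dim [B, C, H, W] is transposed to
         :  * [B, H, C, W]; indexI = 1 and indexJ = 0 select the transposeloop(l, j, i, k)
         :  * path above.
         :  */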
1270 :
1271 10 : void FloatTensor::dropout_mask(float dropout) {
1272 10 : float scale = 1.0 / (1 - dropout);
1273 10 : float *data_ = (float *)getData();
1274 370 : for (unsigned int i = 0; i < size(); ++i) {
1275 360 : if (data_[i] >= dropout)
1276 148 : data_[i] = scale;
1277 : else
1278 212 : data_[i] = 0.0;
1279 : }
1280 10 : }
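         : // Example (exposition only): with dropout = 0.2, scale = 1.0 / 0.8 = 1.25;
         : // entries >= 0.2 become 1.25 and the rest become 0, so surviving units are
         : // rescaled to keep the expected activation unchanged.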
1281 :
1282 0 : void FloatTensor::filter_mask(const Tensor &mask_len, bool reverse) {
1283 : float fill_mask_val = 0.0;
1284 : float en_mask_val = 1.0 - fill_mask_val;
1285 :
1286 0 : if (reverse) {
1287 : fill_mask_val = 1.0;
1288 : en_mask_val = 1.0 - fill_mask_val;
1289 : }
1290 :
1291 0 : setValue(fill_mask_val);
1292 :
1293 0 : NNTR_THROW_IF(mask_len.batch() != batch(), std::invalid_argument)
1294 : << "Number of filter masks mismatched";
1295 :
1296 0 : for (unsigned int b = 0; b < batch(); b++) {
1297 0 : float *addr = (float *)getAddress(getIndex(b, 0, 0, 0));
1298 : const unsigned int *mask_len_val =
1299 0 : mask_len.getAddress<unsigned int>(b, 0, 0, 0);
1300 0 : std::fill(addr, addr + (*mask_len_val), en_mask_val);
1301 : }
1302 0 : }
1303 :
1304 3 : void FloatTensor::zoneout_mask(Tensor &opposite, float zoneout) {
1305 3 : opposite.setRandBernoulli(zoneout);
1306 :
1307 3 : float *data = (float *)getData();
1308 : float *opposite_data = opposite.getData<float>();
1309 :
1310 2010003 : for (unsigned int i = 0; i < size(); ++i) {
1311 2010000 : if (opposite_data[i] > epsilon) {
1312 603513 : data[i] = 0.0f;
1313 : } else {
1314 1406487 : data[i] = 1.0f;
1315 : }
1316 : }
1317 3 : }
1318 :
1319 13 : std::vector<Tensor> FloatTensor::split(std::vector<size_t> sizes, int axis) {
1320 : size_t num_size = sizes.size();
1321 :
1322 13 : if (axis == -1) {
1323 : axis = 3;
1324 : }
1325 :
1326 : size_t total_size =
1327 : std::accumulate(sizes.begin(), sizes.end(), static_cast<size_t>(0));
1328 14 : NNTR_THROW_IF(dim.getTensorDim(axis) != total_size, std::invalid_argument)
1329 : << "given sum of sizes did not match with origin tensor dim, tensor dim: "
1330 1 : << dim.getTensorDim(axis) << " total size: " << total_size;
1331 :
1332 12 : std::vector<TensorDim> ret_dims(num_size, dim);
1333 43 : for (unsigned int i = 0; i < num_size; ++i) {
1334 31 : ret_dims[i].setTensorDim(axis, sizes[i]);
1335 : }
1336 :
1337 12 : bool is_format_nchw = (dim.getFormat() == Tformat::NCHW) ? true : false;
1338 : std::vector<Tensor> ret;
1339 :
1340 1248 : auto iter_value = [this, is_format_nchw](
1341 : std::array<size_t, 4> &loc,
1342 : const std::array<size_t, 4> &end_loc,
1343 : const std::array<size_t, 4> &reset_dim_arr) -> float & {
1344 1248 : auto &value = (is_format_nchw) ? getValue(loc[0], loc[1], loc[2], loc[3])
1345 0 : : getValue(loc[0], loc[3], loc[1], loc[2]);
1346 1960 : for (int i = 3; i >= 0; --i) {
1347 1929 : loc[i]++;
1348 1929 : if (loc[i] == end_loc[i]) {
1349 712 : loc[i] -= reset_dim_arr[i];
1350 : continue;
1351 : }
1352 : break;
1353 : }
1354 1248 : return value;
1355 12 : };
1356 :
1357 : unsigned int accumulated_size = 0;
1358 43 : for (unsigned int i = 0; i < num_size; ++i) {
1359 31 : std::array<size_t, 4> loc = {0, 0, 0, 0};
1360 :
1361 31 : if (is_format_nchw) {
1362 31 : loc[axis] += accumulated_size;
1363 : } else {
1364 0 : if (axis == 0) {
1365 0 : loc[0] += accumulated_size;
1366 0 : } else if (axis == 1) {
1367 0 : loc[3] += accumulated_size;
1368 0 : } else if (axis == 2 || axis == 3) {
1369 0 : loc[axis - 1] += accumulated_size;
1370 : }
1371 : }
1372 :
1373 62 : ret.push_back(Tensor(ret_dims[i]));
1374 : auto &ret_t = ret.back();
1375 :
1376 : std::array<size_t, 4> end_loc;
1377 :
1378 31 : if (is_format_nchw) {
1379 62 : end_loc = {ret_dims[i].batch(), ret_dims[i].channel(),
1380 31 : ret_dims[i].height(), ret_dims[i].width()};
1381 : } else {
1382 0 : end_loc = {ret_dims[i].batch(), ret_dims[i].height(), ret_dims[i].width(),
1383 0 : ret_dims[i].channel()};
1384 : }
1385 :
1386 31 : accumulated_size += sizes[i];
1387 :
1388 31 : if (is_format_nchw) {
1389 31 : end_loc[axis] = accumulated_size;
1390 : } else {
1391 0 : if (axis == 0) {
1392 0 : end_loc[0] = accumulated_size;
1393 0 : } else if (axis == 1) {
1394 0 : end_loc[3] = accumulated_size;
1395 0 : } else if (axis == 2 || axis == 3) {
1396 0 : end_loc[axis - 1] = accumulated_size;
1397 : }
1398 : }
1399 :
1400 : std::array<size_t, 4> reset_dim_arr;
1401 : if (is_format_nchw) {
1402 62 : reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].channel(),
1403 31 : ret_dims[i].height(), ret_dims[i].width()};
1404 : } else {
1405 0 : reset_dim_arr = {ret_dims[i].batch(), ret_dims[i].height(),
1406 0 : ret_dims[i].width(), ret_dims[i].channel()};
1407 : }
1408 :
1409 31 : ret_t.apply_i<float>(
1410 62 : [&iter_value, &loc, &end_loc, &reset_dim_arr](float _) {
1411 1248 : return iter_value(loc, end_loc, reset_dim_arr);
1412 : });
1413 : }
1414 :
1415 12 : return ret;
1416 12 : }
1417 :
1418 5 : Tensor FloatTensor::concat(const std::vector<Tensor> &tensors, int axis,
1419 : Tensor &output) {
1420 5 : bool is_format_nchw = (tensors.front().getDim().getFormat() == Tformat::NCHW);
1421 :
1422 : auto iter_value =
1423 746 : [is_format_nchw](std::array<unsigned, 4> &loc,
1424 : const std::array<unsigned, 4> &start_loc, Tensor &t,
1425 : const std::array<unsigned, 4> &ref_dim_arr) -> float & {
1426 746 : auto &value = is_format_nchw
1427 746 : ? t.getValue<float>(loc[0], loc[1], loc[2], loc[3])
1428 0 : : t.getValue<float>(loc[0], loc[3], loc[1], loc[2]);
1429 :
1430 1044 : for (int i = 3; i >= 0; --i) {
1431 1033 : loc[i]++;
1432 1033 : if (loc[i] - start_loc[i] == ref_dim_arr[i]) {
1433 298 : loc[i] = start_loc[i];
1434 : continue;
1435 : }
1436 : break;
1437 : }
1438 746 : return value;
1439 5 : };
1440 :
1441 5 : std::array<unsigned, 4> loc = {0, 0, 0, 0};
1442 16 : for (auto &t : tensors) {
1443 11 : std::array<unsigned, 4> start_loc = loc;
1444 : std::array<unsigned, 4> tensor_dim_arr;
1445 11 : TensorDim curr_dim = t.getDim();
1446 :
1447 11 : tensor_dim_arr[0] = curr_dim.getTensorDim(0);
1448 22 : tensor_dim_arr[1] =
1449 11 : is_format_nchw ? curr_dim.getTensorDim(1) : curr_dim.getTensorDim(2);
1450 22 : tensor_dim_arr[2] =
1451 11 : is_format_nchw ? curr_dim.getTensorDim(2) : curr_dim.getTensorDim(3);
1452 22 : tensor_dim_arr[3] =
1453 11 : is_format_nchw ? curr_dim.getTensorDim(3) : curr_dim.getTensorDim(1);
1454 :
1455 757 : for (size_t i = 0u, sz = t.size(); i < sz; ++i) {
1456 746 : iter_value(loc, start_loc, output, tensor_dim_arr) = t.getValue<float>(i);
1457 : }
1458 :
1459 11 : if (is_format_nchw) {
1460 11 : loc[axis] += curr_dim.getTensorDim(axis);
1461 : } else {
1462 0 : if (axis == 0) {
1463 0 : loc[0] += curr_dim.getTensorDim(axis);
1464 0 : } else if (axis == 1) {
1465 0 : loc[3] += curr_dim.getTensorDim(axis);
1466 0 : } else if (axis == 2 || axis == 3) {
1467 0 : loc[axis - 1] += curr_dim.getTensorDim(axis);
1468 : }
1469 : }
1470 : }
1471 :
1472 5 : return output;
1473 : }
1474 :
1475 10 : void FloatTensor::print(std::ostream &out) const {
1476 10 : const float *data = (float *)getData();
1477 10 : unsigned int len = size();
1478 10 : out << "data addr: " << data << '\n';
1479 10 : out << dim;
1480 :
1481 10 : if (len > 100) {
1482 6 : out << '[' << data[0] << ' ' << data[1] << ' ' << data[2] << " ... "
1483 6 : << data[len - 3] << ' ' << data[len - 2] << ' ' << data[len - 1] << ']'
1484 : << std::endl;
1485 6 : return;
1486 : }
1487 :
1488 4 : std::ios init(NULL);
1489 4 : init.copyfmt(out);
1490 :
1491 4 : if (getFormat() == Tformat::NCHW) {
1492 10 : for (unsigned int k = 0; k < batch(); k++) {
1493 12 : for (unsigned int l = 0; l < channel(); l++) {
1494 17 : for (unsigned int i = 0; i < height(); i++) {
1495 47 : for (unsigned int j = 0; j < width(); j++) {
1496 : out << std::setw(10) << std::setprecision(10)
1497 36 : << data[getIndex(k, l, i, j)] << " ";
1498 : }
1499 : out << std::endl;
1500 : }
1501 : out << std::endl;
1502 : }
1503 : out << "-------" << std::endl;
1504 : }
1505 : } else {
1506 0 : for (unsigned int k = 0; k < batch(); k++) {
1507 0 : for (unsigned int i = 0; i < height(); i++) {
1508 0 : for (unsigned int j = 0; j < width(); j++) {
1509 0 : for (unsigned int l = 0; l < channel(); l++) {
1510 : out << std::setw(10) << std::setprecision(10)
1511 0 : << data[getIndex(k, l, i, j)] << " ";
1512 : }
1513 : out << std::endl;
1514 : }
1515 : out << std::endl;
1516 : }
1517 : out << "-------" << std::endl;
1518 : }
1519 : }
1520 4 : out.copyfmt(init);
1521 : }
1522 :
1523 112088 : void FloatTensor::copy(const void *buf) {
1524 112088 : NNTR_THROW_IF(!contiguous, std::invalid_argument)
1525 : << getName() << " is not contiguous, cannot copy.";
1526 :
1527 112088 : if (buf == getData()) {
1528 : return;
1529 : }
1530 :
1531 111785 : scopy(size(), (float *)buf, 1, (float *)getData(), 1);
1532 : }
1533 :
1534 100882 : void FloatTensor::apply_broadcast_util(
1535 : Tensor const &m,
1536 : std::function<void(const BroadcastInfo &e, const float *, const float *,
1537 : float *)>
1538 : v_func,
1539 : Tensor &output, const BroadcastInfo &e, int cur_axis, size_t offset,
1540 : size_t m_offset) const {
1541 :
1542 100882 : const float *buf = (float *)this->getData();
1543 : const float *m_buf = m.getData<float>();
1544 : float *out_buf = output.getData<float>();
1545 :
1546 100882 : if (e.buffer_axis == cur_axis) {
1547 77211 : v_func(e, buf + offset, m_buf + m_offset, out_buf + offset);
1548 77211 : return;
1549 : }
1550 :
1551 23671 : cur_axis++;
1552 23671 : unsigned int continuity[4] = {0, 1, 2, 3};
1553 23671 : if (getFormat() == Tformat::NHWC) {
1554 10 : continuity[1] = 2;
1555 10 : continuity[2] = 3;
1556 10 : continuity[3] = 1;
1557 : }
1558 113890 : for (unsigned int i = 0; i < dim.getTensorDim(continuity[cur_axis]); ++i) {
1559 90219 : size_t next_offset = offset + i * strides[cur_axis];
1560 90219 : size_t next_m_offset = m_offset + i * e.strides[cur_axis];
1561 180438 : apply_broadcast_util(m, v_func, output, e, cur_axis, next_offset,
1562 : next_m_offset);
1563 : }
1564 : }
1565 :
1566 76795 : void FloatTensor::apply_broadcast(
1567 : Tensor const &m,
1568 : std::function<void(const BroadcastInfo &e, const float *, const float *,
1569 : float *)>
1570 : v_func,
1571 : Tensor &output) const {
1572 91891 : CREATE_IF_EMPTY_DIMS(output, dim);
1573 :
1574 76799 : NNTR_THROW_IF(getData() == nullptr, std::invalid_argument)
1575 : << getName() << " is not allocated";
1576 76795 : NNTR_THROW_IF(m.getData<float>() == nullptr, std::invalid_argument)
1577 4 : << m.getName() << " is not allocated";
1578 76791 : NNTR_THROW_IF(output.getData<float>() == nullptr, std::invalid_argument)
1579 4 : << output.getName() << " is not allocated";
1580 :
1581 :   /// shortcut to cover the case when the dimensions match
1582 :   /// note that buffer_size and the last stride are only used in v_func but
1583 :   /// might be changed
1584 76783 : if (dim == m.getDim()) {
1585 : BroadcastInfo e;
1586 65954 : e.buffer_size = size();
1587 65954 : e.strides[3] = 1;
1588 65954 : e.tensor_type = getTensorType();
1589 131908 : v_func(e, (float *)getData(), m.getData<float>(), output.getData<float>());
1590 : return;
1591 : }
1592 :
1593 21492 : return apply_broadcast_util(m, v_func, output, this->computeBroadcastInfo(m));
1594 : }
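         : /*
         :  * Illustrative broadcast example (exposition only): adding an m of dim
         :  * [1, 1, 1, W] to this tensor of dim [B, C, H, W] reuses m's single W-length
         :  * row for every (b, c, h) slice; the strides filled in by computeBroadcastInfo
         :  * encode that reuse so v_func can walk both buffers in lockstep.
         :  */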
1595 :
1596 12 : bool FloatTensor::isValid() const {
1597 12 : return is_valid(dim.getDataLen(), (float *)getData());
1598 : }
1599 :
1600 : } // namespace nntrainer
|